Commit 98d9ae51fc4e1a6967b52e7708f6ddc66c684276
1 parent 320245e0
Integrate JSONParser::decode_string into getToken
Showing 1 changed file with 30 additions and 87 deletions
libqpdf/JSON.cc
| ... | ... | @@ -616,12 +616,9 @@ namespace |
| 616 | 616 | void getToken(); |
| 617 | 617 | void handleToken(); |
| 618 | 618 | void numberError(); |
| 619 | - static std::string | |
| 620 | - decode_string(std::string const& json, qpdf_offset_t offset); | |
| 621 | 619 | static void handle_u_code( |
| 622 | - char const* s, | |
| 620 | + unsigned long codepoint, | |
| 623 | 621 | qpdf_offset_t offset, |
| 624 | - qpdf_offset_t i, | |
| 625 | 622 | unsigned long& high_surrogate, |
| 626 | 623 | qpdf_offset_t& high_offset, |
| 627 | 624 | std::string& result); |
| ... | ... | @@ -680,6 +677,7 @@ namespace |
| 680 | 677 | size_t bytes; |
| 681 | 678 | char const* p; |
| 682 | 679 | qpdf_offset_t u_count; |
| 680 | + unsigned long u_value{0}; | |
| 683 | 681 | qpdf_offset_t offset; |
| 684 | 682 | bool done; |
| 685 | 683 | std::string token; |
| ... | ... | @@ -693,22 +691,15 @@ namespace |
| 693 | 691 | |
| 694 | 692 | void |
| 695 | 693 | JSONParser::handle_u_code( |
| 696 | - char const* s, | |
| 694 | + unsigned long codepoint, | |
| 697 | 695 | qpdf_offset_t offset, |
| 698 | - qpdf_offset_t i, | |
| 699 | 696 | unsigned long& high_surrogate, |
| 700 | 697 | qpdf_offset_t& high_offset, |
| 701 | 698 | std::string& result) |
| 702 | 699 | { |
| 703 | - std::string hex = QUtil::hex_decode(std::string(s + i + 1, s + i + 5)); | |
| 704 | - unsigned char high = static_cast<unsigned char>(hex.at(0)); | |
| 705 | - unsigned char low = static_cast<unsigned char>(hex.at(1)); | |
| 706 | - unsigned long codepoint = high; | |
| 707 | - codepoint <<= 8; | |
| 708 | - codepoint += low; | |
| 709 | 700 | if ((codepoint & 0xFC00) == 0xD800) { |
| 710 | 701 | // high surrogate |
| 711 | - qpdf_offset_t new_high_offset = offset + i; | |
| 702 | + qpdf_offset_t new_high_offset = offset; | |
| 712 | 703 | if (high_offset) { |
| 713 | 704 | QTC::TC("libtests", "JSON 16 high high"); |
| 714 | 705 | throw std::runtime_error( |
| ... | ... | @@ -721,10 +712,10 @@ JSONParser::handle_u_code( |
| 721 | 712 | high_surrogate = codepoint; |
| 722 | 713 | } else if ((codepoint & 0xFC00) == 0xDC00) { |
| 723 | 714 | // low surrogate |
| 724 | - if (offset + i != (high_offset + 6)) { | |
| 715 | + if (offset != (high_offset + 6)) { | |
| 725 | 716 | QTC::TC("libtests", "JSON 16 low not after high"); |
| 726 | 717 | throw std::runtime_error( |
| 727 | - "JSON: offset " + std::to_string(offset + i) + | |
| 718 | + "JSON: offset " + std::to_string(offset) + | |
| 728 | 719 | ": UTF-16 low surrogate found not immediately after high" |
| 729 | 720 | " surrogate"); |
| 730 | 721 | } |
| ... | ... | @@ -737,74 +728,6 @@ JSONParser::handle_u_code( |
| 737 | 728 | } |
| 738 | 729 | } |
| 739 | 730 | |
| 740 | -std::string | |
| 741 | -JSONParser::decode_string(std::string const& str, qpdf_offset_t offset) | |
| 742 | -{ | |
| 743 | - // The string has already been validated when this private method | |
| 744 | - // is called, so errors are logic errors instead of runtime | |
| 745 | - // errors. | |
| 746 | - size_t len = str.length(); | |
| 747 | - char const* s = str.c_str(); | |
| 748 | - | |
| 749 | - // Keep track of UTF-16 surrogate pairs. | |
| 750 | - unsigned long high_surrogate = 0; | |
| 751 | - qpdf_offset_t high_offset = 0; | |
| 752 | - std::string result; | |
| 753 | - qpdf_offset_t olen = toO(len); | |
| 754 | - for (qpdf_offset_t i = 0; i < olen; ++i) { | |
| 755 | - if (s[i] == '\\') { | |
| 756 | - if (i + 1 >= olen) { | |
| 757 | - throw std::logic_error("JSON parse: nothing after \\"); | |
| 758 | - } | |
| 759 | - char ch = s[++i]; | |
| 760 | - switch (ch) { | |
| 761 | - case '\\': | |
| 762 | - case '\"': | |
| 763 | - case '/': | |
| 764 | - // \/ is allowed in json input, but so is /, so we | |
| 765 | - // don't map / to \/ in output. | |
| 766 | - result.append(1, ch); | |
| 767 | - break; | |
| 768 | - case 'b': | |
| 769 | - result.append(1, '\b'); | |
| 770 | - break; | |
| 771 | - case 'f': | |
| 772 | - result.append(1, '\f'); | |
| 773 | - break; | |
| 774 | - case 'n': | |
| 775 | - result.append(1, '\n'); | |
| 776 | - break; | |
| 777 | - case 'r': | |
| 778 | - result.append(1, '\r'); | |
| 779 | - break; | |
| 780 | - case 't': | |
| 781 | - result.append(1, '\t'); | |
| 782 | - break; | |
| 783 | - case 'u': | |
| 784 | - if (i + 4 >= olen) { | |
| 785 | - throw std::logic_error( | |
| 786 | - "JSON parse: not enough characters after \\u"); | |
| 787 | - } | |
| 788 | - handle_u_code( | |
| 789 | - s, offset, i, high_surrogate, high_offset, result); | |
| 790 | - i += 4; | |
| 791 | - break; | |
| 792 | - default: | |
| 793 | - break; | |
| 794 | - } | |
| 795 | - } else { | |
| 796 | - result.append(1, s[i]); | |
| 797 | - } | |
| 798 | - } | |
| 799 | - if (high_offset) { | |
| 800 | - QTC::TC("libtests", "JSON 16 dangling high"); | |
| 801 | - throw std::runtime_error( | |
| 802 | - "JSON: offset " + std::to_string(high_offset) + | |
| 803 | - ": UTF-16 high surrogate not followed by low surrogate"); | |
| 804 | - } | |
| 805 | - return result; | |
| 806 | -} | |
| 807 | - | |
| 808 | 731 | void |
| 809 | 732 | JSONParser::numberError() |
| 810 | 733 | { |
| ... | ... | @@ -850,6 +773,11 @@ JSONParser::getToken() |
| 850 | 773 | enum { append, ignore, reread } action = append; |
| 851 | 774 | bool ready = false; |
| 852 | 775 | token.clear(); |
| 776 | + | |
| 777 | + // Keep track of UTF-16 surrogate pairs. | |
| 778 | + unsigned long high_surrogate = 0; | |
| 779 | + qpdf_offset_t high_offset = 0; | |
| 780 | + | |
| 853 | 781 | while (!done) { |
| 854 | 782 | if (p == (buf + bytes)) { |
| 855 | 783 | p = buf; |
| ... | ... | @@ -1046,7 +974,13 @@ JSONParser::getToken() |
| 1046 | 974 | |
| 1047 | 975 | case ls_string: |
| 1048 | 976 | if (*p == '"') { |
| 1049 | - token = decode_string(token, token_start); | |
| 977 | + if (high_offset) { | |
| 978 | + QTC::TC("libtests", "JSON 16 dangling high"); | |
| 979 | + throw std::runtime_error( | |
| 980 | + "JSON: offset " + std::to_string(high_offset) + | |
| 981 | + ": UTF-16 high surrogate not followed by low " | |
| 982 | + "surrogate"); | |
| 983 | + } | |
| 1050 | 984 | action = ignore; |
| 1051 | 985 | ready = true; |
| 1052 | 986 | } else if (*p == '\\') { |
| ... | ... | @@ -1060,7 +994,6 @@ JSONParser::getToken() |
| 1060 | 994 | lex_state = ls_string; |
| 1061 | 995 | switch (*p) { |
| 1062 | 996 | case '\\': |
| 1063 | - token += "\\\\"; | |
| 1064 | 997 | case '\"': |
| 1065 | 998 | case '/': |
| 1066 | 999 | // \/ is allowed in json input, but so is /, so we |
| ... | ... | @@ -1083,9 +1016,9 @@ JSONParser::getToken() |
| 1083 | 1016 | token += '\t'; |
| 1084 | 1017 | break; |
| 1085 | 1018 | case 'u': |
| 1086 | - token += "\\u"; | |
| 1087 | 1019 | lex_state = ls_u4; |
| 1088 | 1020 | u_count = 0; |
| 1021 | + u_value = 0; | |
| 1089 | 1022 | break; |
| 1090 | 1023 | default: |
| 1091 | 1024 | QTC::TC("libtests", "JSON parse backslash bad character"); |
| ... | ... | @@ -1097,13 +1030,23 @@ JSONParser::getToken() |
| 1097 | 1030 | break; |
| 1098 | 1031 | |
| 1099 | 1032 | case ls_u4: |
| 1100 | - if (!QUtil::is_hex_digit(*p)) { | |
| 1033 | + using ui = unsigned int; | |
| 1034 | + action = ignore; | |
| 1035 | + if ('0' <= *p && *p <= '9') { | |
| 1036 | + u_value = 16 * u_value + (ui(*p) - ui('0')); | |
| 1037 | + } else if ('a' <= *p && *p <= 'f') { | |
| 1038 | + u_value = 16 * u_value + (10 + ui(*p) - ui('a')); | |
| 1039 | + } else if ('A' <= *p && *p <= 'F') { | |
| 1040 | + u_value = 16 * u_value + (10 + ui(*p) - ui('A')); | |
| 1041 | + } else { | |
| 1101 | 1042 | QTC::TC("libtests", "JSON parse bad hex after u"); |
| 1102 | 1043 | throw std::runtime_error( |
| 1103 | 1044 | "JSON: offset " + std::to_string(offset - u_count - 1) + |
| 1104 | 1045 | ": \\u must be followed by four hex digits"); |
| 1105 | 1046 | } |
| 1106 | 1047 | if (++u_count == 4) { |
| 1048 | + handle_u_code( | |
| 1049 | + u_value, offset - 5, high_surrogate, high_offset, token); | |
| 1107 | 1050 | lex_state = ls_string; |
| 1108 | 1051 | } |
| 1109 | 1052 | break; | ... | ... |