Commit 98d9ae51fc4e1a6967b52e7708f6ddc66c684276 (1 parent: 320245e0)
Integrate JSONParser::decode_string into getToken
Showing 1 changed file with 30 additions and 87 deletions
libqpdf/JSON.cc
| @@ -616,12 +616,9 @@ namespace | @@ -616,12 +616,9 @@ namespace | ||
| 616 | void getToken(); | 616 | void getToken(); |
| 617 | void handleToken(); | 617 | void handleToken(); |
| 618 | void numberError(); | 618 | void numberError(); |
| 619 | - static std::string | ||
| 620 | - decode_string(std::string const& json, qpdf_offset_t offset); | ||
| 621 | static void handle_u_code( | 619 | static void handle_u_code( |
| 622 | - char const* s, | 620 | + unsigned long codepoint, |
| 623 | qpdf_offset_t offset, | 621 | qpdf_offset_t offset, |
| 624 | - qpdf_offset_t i, | ||
| 625 | unsigned long& high_surrogate, | 622 | unsigned long& high_surrogate, |
| 626 | qpdf_offset_t& high_offset, | 623 | qpdf_offset_t& high_offset, |
| 627 | std::string& result); | 624 | std::string& result); |
| @@ -680,6 +677,7 @@ namespace | @@ -680,6 +677,7 @@ namespace | ||
| 680 | size_t bytes; | 677 | size_t bytes; |
| 681 | char const* p; | 678 | char const* p; |
| 682 | qpdf_offset_t u_count; | 679 | qpdf_offset_t u_count; |
| 680 | + unsigned long u_value{0}; | ||
| 683 | qpdf_offset_t offset; | 681 | qpdf_offset_t offset; |
| 684 | bool done; | 682 | bool done; |
| 685 | std::string token; | 683 | std::string token; |
| @@ -693,22 +691,15 @@ namespace | @@ -693,22 +691,15 @@ namespace | ||
| 693 | 691 | ||
| 694 | void | 692 | void |
| 695 | JSONParser::handle_u_code( | 693 | JSONParser::handle_u_code( |
| 696 | - char const* s, | 694 | + unsigned long codepoint, |
| 697 | qpdf_offset_t offset, | 695 | qpdf_offset_t offset, |
| 698 | - qpdf_offset_t i, | ||
| 699 | unsigned long& high_surrogate, | 696 | unsigned long& high_surrogate, |
| 700 | qpdf_offset_t& high_offset, | 697 | qpdf_offset_t& high_offset, |
| 701 | std::string& result) | 698 | std::string& result) |
| 702 | { | 699 | { |
| 703 | - std::string hex = QUtil::hex_decode(std::string(s + i + 1, s + i + 5)); | ||
| 704 | - unsigned char high = static_cast<unsigned char>(hex.at(0)); | ||
| 705 | - unsigned char low = static_cast<unsigned char>(hex.at(1)); | ||
| 706 | - unsigned long codepoint = high; | ||
| 707 | - codepoint <<= 8; | ||
| 708 | - codepoint += low; | ||
| 709 | if ((codepoint & 0xFC00) == 0xD800) { | 700 | if ((codepoint & 0xFC00) == 0xD800) { |
| 710 | // high surrogate | 701 | // high surrogate |
| 711 | - qpdf_offset_t new_high_offset = offset + i; | 702 | + qpdf_offset_t new_high_offset = offset; |
| 712 | if (high_offset) { | 703 | if (high_offset) { |
| 713 | QTC::TC("libtests", "JSON 16 high high"); | 704 | QTC::TC("libtests", "JSON 16 high high"); |
| 714 | throw std::runtime_error( | 705 | throw std::runtime_error( |
| @@ -721,10 +712,10 @@ JSONParser::handle_u_code( | @@ -721,10 +712,10 @@ JSONParser::handle_u_code( | ||
| 721 | high_surrogate = codepoint; | 712 | high_surrogate = codepoint; |
| 722 | } else if ((codepoint & 0xFC00) == 0xDC00) { | 713 | } else if ((codepoint & 0xFC00) == 0xDC00) { |
| 723 | // low surrogate | 714 | // low surrogate |
| 724 | - if (offset + i != (high_offset + 6)) { | 715 | + if (offset != (high_offset + 6)) { |
| 725 | QTC::TC("libtests", "JSON 16 low not after high"); | 716 | QTC::TC("libtests", "JSON 16 low not after high"); |
| 726 | throw std::runtime_error( | 717 | throw std::runtime_error( |
| 727 | - "JSON: offset " + std::to_string(offset + i) + | 718 | + "JSON: offset " + std::to_string(offset) + |
| 728 | ": UTF-16 low surrogate found not immediately after high" | 719 | ": UTF-16 low surrogate found not immediately after high" |
| 729 | " surrogate"); | 720 | " surrogate"); |
| 730 | } | 721 | } |
| @@ -737,74 +728,6 @@ JSONParser::handle_u_code( | @@ -737,74 +728,6 @@ JSONParser::handle_u_code( | ||
| 737 | } | 728 | } |
| 738 | } | 729 | } |
| 739 | 730 | ||
| 740 | -std::string | ||
| 741 | -JSONParser::decode_string(std::string const& str, qpdf_offset_t offset) | ||
| 742 | -{ | ||
| 743 | - // The string has already been validated when this private method | ||
| 744 | - // is called, so errors are logic errors instead of runtime | ||
| 745 | - // errors. | ||
| 746 | - size_t len = str.length(); | ||
| 747 | - char const* s = str.c_str(); | ||
| 748 | - | ||
| 749 | - // Keep track of UTF-16 surrogate pairs. | ||
| 750 | - unsigned long high_surrogate = 0; | ||
| 751 | - qpdf_offset_t high_offset = 0; | ||
| 752 | - std::string result; | ||
| 753 | - qpdf_offset_t olen = toO(len); | ||
| 754 | - for (qpdf_offset_t i = 0; i < olen; ++i) { | ||
| 755 | - if (s[i] == '\\') { | ||
| 756 | - if (i + 1 >= olen) { | ||
| 757 | - throw std::logic_error("JSON parse: nothing after \\"); | ||
| 758 | - } | ||
| 759 | - char ch = s[++i]; | ||
| 760 | - switch (ch) { | ||
| 761 | - case '\\': | ||
| 762 | - case '\"': | ||
| 763 | - case '/': | ||
| 764 | - // \/ is allowed in json input, but so is /, so we | ||
| 765 | - // don't map / to \/ in output. | ||
| 766 | - result.append(1, ch); | ||
| 767 | - break; | ||
| 768 | - case 'b': | ||
| 769 | - result.append(1, '\b'); | ||
| 770 | - break; | ||
| 771 | - case 'f': | ||
| 772 | - result.append(1, '\f'); | ||
| 773 | - break; | ||
| 774 | - case 'n': | ||
| 775 | - result.append(1, '\n'); | ||
| 776 | - break; | ||
| 777 | - case 'r': | ||
| 778 | - result.append(1, '\r'); | ||
| 779 | - break; | ||
| 780 | - case 't': | ||
| 781 | - result.append(1, '\t'); | ||
| 782 | - break; | ||
| 783 | - case 'u': | ||
| 784 | - if (i + 4 >= olen) { | ||
| 785 | - throw std::logic_error( | ||
| 786 | - "JSON parse: not enough characters after \\u"); | ||
| 787 | - } | ||
| 788 | - handle_u_code( | ||
| 789 | - s, offset, i, high_surrogate, high_offset, result); | ||
| 790 | - i += 4; | ||
| 791 | - break; | ||
| 792 | - default: | ||
| 793 | - break; | ||
| 794 | - } | ||
| 795 | - } else { | ||
| 796 | - result.append(1, s[i]); | ||
| 797 | - } | ||
| 798 | - } | ||
| 799 | - if (high_offset) { | ||
| 800 | - QTC::TC("libtests", "JSON 16 dangling high"); | ||
| 801 | - throw std::runtime_error( | ||
| 802 | - "JSON: offset " + std::to_string(high_offset) + | ||
| 803 | - ": UTF-16 high surrogate not followed by low surrogate"); | ||
| 804 | - } | ||
| 805 | - return result; | ||
| 806 | -} | ||
| 807 | - | ||
| 808 | void | 731 | void |
| 809 | JSONParser::numberError() | 732 | JSONParser::numberError() |
| 810 | { | 733 | { |
| @@ -850,6 +773,11 @@ JSONParser::getToken() | @@ -850,6 +773,11 @@ JSONParser::getToken() | ||
| 850 | enum { append, ignore, reread } action = append; | 773 | enum { append, ignore, reread } action = append; |
| 851 | bool ready = false; | 774 | bool ready = false; |
| 852 | token.clear(); | 775 | token.clear(); |
| 776 | + | ||
| 777 | + // Keep track of UTF-16 surrogate pairs. | ||
| 778 | + unsigned long high_surrogate = 0; | ||
| 779 | + qpdf_offset_t high_offset = 0; | ||
| 780 | + | ||
| 853 | while (!done) { | 781 | while (!done) { |
| 854 | if (p == (buf + bytes)) { | 782 | if (p == (buf + bytes)) { |
| 855 | p = buf; | 783 | p = buf; |
| @@ -1046,7 +974,13 @@ JSONParser::getToken() | @@ -1046,7 +974,13 @@ JSONParser::getToken() | ||
| 1046 | 974 | ||
| 1047 | case ls_string: | 975 | case ls_string: |
| 1048 | if (*p == '"') { | 976 | if (*p == '"') { |
| 1049 | - token = decode_string(token, token_start); | 977 | + if (high_offset) { |
| 978 | + QTC::TC("libtests", "JSON 16 dangling high"); | ||
| 979 | + throw std::runtime_error( | ||
| 980 | + "JSON: offset " + std::to_string(high_offset) + | ||
| 981 | + ": UTF-16 high surrogate not followed by low " | ||
| 982 | + "surrogate"); | ||
| 983 | + } | ||
| 1050 | action = ignore; | 984 | action = ignore; |
| 1051 | ready = true; | 985 | ready = true; |
| 1052 | } else if (*p == '\\') { | 986 | } else if (*p == '\\') { |
| @@ -1060,7 +994,6 @@ JSONParser::getToken() | @@ -1060,7 +994,6 @@ JSONParser::getToken() | ||
| 1060 | lex_state = ls_string; | 994 | lex_state = ls_string; |
| 1061 | switch (*p) { | 995 | switch (*p) { |
| 1062 | case '\\': | 996 | case '\\': |
| 1063 | - token += "\\\\"; | ||
| 1064 | case '\"': | 997 | case '\"': |
| 1065 | case '/': | 998 | case '/': |
| 1066 | // \/ is allowed in json input, but so is /, so we | 999 | // \/ is allowed in json input, but so is /, so we |
| @@ -1083,9 +1016,9 @@ JSONParser::getToken() | @@ -1083,9 +1016,9 @@ JSONParser::getToken() | ||
| 1083 | token += '\t'; | 1016 | token += '\t'; |
| 1084 | break; | 1017 | break; |
| 1085 | case 'u': | 1018 | case 'u': |
| 1086 | - token += "\\u"; | ||
| 1087 | lex_state = ls_u4; | 1019 | lex_state = ls_u4; |
| 1088 | u_count = 0; | 1020 | u_count = 0; |
| 1021 | + u_value = 0; | ||
| 1089 | break; | 1022 | break; |
| 1090 | default: | 1023 | default: |
| 1091 | QTC::TC("libtests", "JSON parse backslash bad character"); | 1024 | QTC::TC("libtests", "JSON parse backslash bad character"); |
| @@ -1097,13 +1030,23 @@ JSONParser::getToken() | @@ -1097,13 +1030,23 @@ JSONParser::getToken() | ||
| 1097 | break; | 1030 | break; |
| 1098 | 1031 | ||
| 1099 | case ls_u4: | 1032 | case ls_u4: |
| 1100 | - if (!QUtil::is_hex_digit(*p)) { | 1033 | + using ui = unsigned int; |
| 1034 | + action = ignore; | ||
| 1035 | + if ('0' <= *p && *p <= '9') { | ||
| 1036 | + u_value = 16 * u_value + (ui(*p) - ui('0')); | ||
| 1037 | + } else if ('a' <= *p && *p <= 'f') { | ||
| 1038 | + u_value = 16 * u_value + (10 + ui(*p) - ui('a')); | ||
| 1039 | + } else if ('A' <= *p && *p <= 'F') { | ||
| 1040 | + u_value = 16 * u_value + (10 + ui(*p) - ui('A')); | ||
| 1041 | + } else { | ||
| 1101 | QTC::TC("libtests", "JSON parse bad hex after u"); | 1042 | QTC::TC("libtests", "JSON parse bad hex after u"); |
| 1102 | throw std::runtime_error( | 1043 | throw std::runtime_error( |
| 1103 | "JSON: offset " + std::to_string(offset - u_count - 1) + | 1044 | "JSON: offset " + std::to_string(offset - u_count - 1) + |
| 1104 | ": \\u must be followed by four hex digits"); | 1045 | ": \\u must be followed by four hex digits"); |
| 1105 | } | 1046 | } |
| 1106 | if (++u_count == 4) { | 1047 | if (++u_count == 4) { |
| 1048 | + handle_u_code( | ||
| 1049 | + u_value, offset - 5, high_surrogate, high_offset, token); | ||
| 1107 | lex_state = ls_string; | 1050 | lex_state = ls_string; |
| 1108 | } | 1051 | } |
| 1109 | break; | 1052 | break; |