Commit 98d9ae51fc4e1a6967b52e7708f6ddc66c684276

Authored by m-holger
1 parent 320245e0

Integrate JSONParser::decode_string into getToken

Showing 1 changed file with 30 additions and 87 deletions
libqpdf/JSON.cc
... ... @@ -616,12 +616,9 @@ namespace
616 616 void getToken();
617 617 void handleToken();
618 618 void numberError();
619   - static std::string
620   - decode_string(std::string const& json, qpdf_offset_t offset);
621 619 static void handle_u_code(
622   - char const* s,
  620 + unsigned long codepoint,
623 621 qpdf_offset_t offset,
624   - qpdf_offset_t i,
625 622 unsigned long& high_surrogate,
626 623 qpdf_offset_t& high_offset,
627 624 std::string& result);
... ... @@ -680,6 +677,7 @@ namespace
680 677 size_t bytes;
681 678 char const* p;
682 679 qpdf_offset_t u_count;
  680 + unsigned long u_value{0};
683 681 qpdf_offset_t offset;
684 682 bool done;
685 683 std::string token;
... ... @@ -693,22 +691,15 @@ namespace
693 691  
694 692 void
695 693 JSONParser::handle_u_code(
696   - char const* s,
  694 + unsigned long codepoint,
697 695 qpdf_offset_t offset,
698   - qpdf_offset_t i,
699 696 unsigned long& high_surrogate,
700 697 qpdf_offset_t& high_offset,
701 698 std::string& result)
702 699 {
703   - std::string hex = QUtil::hex_decode(std::string(s + i + 1, s + i + 5));
704   - unsigned char high = static_cast<unsigned char>(hex.at(0));
705   - unsigned char low = static_cast<unsigned char>(hex.at(1));
706   - unsigned long codepoint = high;
707   - codepoint <<= 8;
708   - codepoint += low;
709 700 if ((codepoint & 0xFC00) == 0xD800) {
710 701 // high surrogate
711   - qpdf_offset_t new_high_offset = offset + i;
  702 + qpdf_offset_t new_high_offset = offset;
712 703 if (high_offset) {
713 704 QTC::TC("libtests", "JSON 16 high high");
714 705 throw std::runtime_error(
... ... @@ -721,10 +712,10 @@ JSONParser::handle_u_code(
721 712 high_surrogate = codepoint;
722 713 } else if ((codepoint & 0xFC00) == 0xDC00) {
723 714 // low surrogate
724   - if (offset + i != (high_offset + 6)) {
  715 + if (offset != (high_offset + 6)) {
725 716 QTC::TC("libtests", "JSON 16 low not after high");
726 717 throw std::runtime_error(
727   - "JSON: offset " + std::to_string(offset + i) +
  718 + "JSON: offset " + std::to_string(offset) +
728 719 ": UTF-16 low surrogate found not immediately after high"
729 720 " surrogate");
730 721 }
... ... @@ -737,74 +728,6 @@ JSONParser::handle_u_code(
737 728 }
738 729 }
739 730  
740   -std::string
741   -JSONParser::decode_string(std::string const& str, qpdf_offset_t offset)
742   -{
743   - // The string has already been validated when this private method
744   - // is called, so errors are logic errors instead of runtime
745   - // errors.
746   - size_t len = str.length();
747   - char const* s = str.c_str();
748   -
749   - // Keep track of UTF-16 surrogate pairs.
750   - unsigned long high_surrogate = 0;
751   - qpdf_offset_t high_offset = 0;
752   - std::string result;
753   - qpdf_offset_t olen = toO(len);
754   - for (qpdf_offset_t i = 0; i < olen; ++i) {
755   - if (s[i] == '\\') {
756   - if (i + 1 >= olen) {
757   - throw std::logic_error("JSON parse: nothing after \\");
758   - }
759   - char ch = s[++i];
760   - switch (ch) {
761   - case '\\':
762   - case '\"':
763   - case '/':
764   - // \/ is allowed in json input, but so is /, so we
765   - // don't map / to \/ in output.
766   - result.append(1, ch);
767   - break;
768   - case 'b':
769   - result.append(1, '\b');
770   - break;
771   - case 'f':
772   - result.append(1, '\f');
773   - break;
774   - case 'n':
775   - result.append(1, '\n');
776   - break;
777   - case 'r':
778   - result.append(1, '\r');
779   - break;
780   - case 't':
781   - result.append(1, '\t');
782   - break;
783   - case 'u':
784   - if (i + 4 >= olen) {
785   - throw std::logic_error(
786   - "JSON parse: not enough characters after \\u");
787   - }
788   - handle_u_code(
789   - s, offset, i, high_surrogate, high_offset, result);
790   - i += 4;
791   - break;
792   - default:
793   - break;
794   - }
795   - } else {
796   - result.append(1, s[i]);
797   - }
798   - }
799   - if (high_offset) {
800   - QTC::TC("libtests", "JSON 16 dangling high");
801   - throw std::runtime_error(
802   - "JSON: offset " + std::to_string(high_offset) +
803   - ": UTF-16 high surrogate not followed by low surrogate");
804   - }
805   - return result;
806   -}
807   -
808 731 void
809 732 JSONParser::numberError()
810 733 {
... ... @@ -850,6 +773,11 @@ JSONParser::getToken()
850 773 enum { append, ignore, reread } action = append;
851 774 bool ready = false;
852 775 token.clear();
  776 +
  777 + // Keep track of UTF-16 surrogate pairs.
  778 + unsigned long high_surrogate = 0;
  779 + qpdf_offset_t high_offset = 0;
  780 +
853 781 while (!done) {
854 782 if (p == (buf + bytes)) {
855 783 p = buf;
... ... @@ -1046,7 +974,13 @@ JSONParser::getToken()
1046 974  
1047 975 case ls_string:
1048 976 if (*p == '"') {
1049   - token = decode_string(token, token_start);
  977 + if (high_offset) {
  978 + QTC::TC("libtests", "JSON 16 dangling high");
  979 + throw std::runtime_error(
  980 + "JSON: offset " + std::to_string(high_offset) +
  981 + ": UTF-16 high surrogate not followed by low "
  982 + "surrogate");
  983 + }
1050 984 action = ignore;
1051 985 ready = true;
1052 986 } else if (*p == '\\') {
... ... @@ -1060,7 +994,6 @@ JSONParser::getToken()
1060 994 lex_state = ls_string;
1061 995 switch (*p) {
1062 996 case '\\':
1063   - token += "\\\\";
1064 997 case '\"':
1065 998 case '/':
1066 999 // \/ is allowed in json input, but so is /, so we
... ... @@ -1083,9 +1016,9 @@ JSONParser::getToken()
1083 1016 token += '\t';
1084 1017 break;
1085 1018 case 'u':
1086   - token += "\\u";
1087 1019 lex_state = ls_u4;
1088 1020 u_count = 0;
  1021 + u_value = 0;
1089 1022 break;
1090 1023 default:
1091 1024 QTC::TC("libtests", "JSON parse backslash bad character");
... ... @@ -1097,13 +1030,23 @@ JSONParser::getToken()
1097 1030 break;
1098 1031  
1099 1032 case ls_u4:
1100   - if (!QUtil::is_hex_digit(*p)) {
  1033 + using ui = unsigned int;
  1034 + action = ignore;
  1035 + if ('0' <= *p && *p <= '9') {
  1036 + u_value = 16 * u_value + (ui(*p) - ui('0'));
  1037 + } else if ('a' <= *p && *p <= 'f') {
  1038 + u_value = 16 * u_value + (10 + ui(*p) - ui('a'));
  1039 + } else if ('A' <= *p && *p <= 'F') {
  1040 + u_value = 16 * u_value + (10 + ui(*p) - ui('A'));
  1041 + } else {
1101 1042 QTC::TC("libtests", "JSON parse bad hex after u");
1102 1043 throw std::runtime_error(
1103 1044 "JSON: offset " + std::to_string(offset - u_count - 1) +
1104 1045 ": \\u must be followed by four hex digits");
1105 1046 }
1106 1047 if (++u_count == 4) {
  1048 + handle_u_code(
  1049 + u_value, offset - 5, high_surrogate, high_offset, token);
1107 1050 lex_state = ls_string;
1108 1051 }
1109 1052 break;
... ...