Commit ee32235f54884247f6117fc0fbdd462a4e38ac1f

Authored by m-holger
1 parent f5b7448a

In JSONParser::getToken, handle legal control characters early

Also reject them when they appear inside strings.
libqpdf/JSON.cc
... ... @@ -723,10 +723,11 @@ JSONParser::handle_u_code(
723 723 void
724 724 JSONParser::tokenError()
725 725 {
726   - if (bytes == 0) {
  726 + if (done) {
727 727 QTC::TC("libtests", "JSON parse ls premature end of input");
728 728 throw std::runtime_error("JSON: premature end of input");
729 729 }
  730 +
730 731 if (lex_state == ls_u4) {
731 732 QTC::TC("libtests", "JSON parse bad hex after u");
732 733 throw std::runtime_error(
... ... @@ -737,6 +738,11 @@ JSONParser::tokenError()
737 738 throw std::runtime_error(
738 739 "JSON: offset " + std::to_string(offset) +
739 740 ": keyword: unexpected character " + std::string(p, 1));
  741 + } else if (lex_state == ls_string) {
  742 + QTC::TC("libtests", "JSON parse control char in string");
  743 + throw std::runtime_error(
  744 + "JSON: offset " + std::to_string(offset) +
  745 + ": control character in string (missing \"?)");
740 746 } else if (lex_state == ls_backslash) {
741 747 QTC::TC("libtests", "JSON parse backslash bad character");
742 748 throw std::runtime_error(
... ... @@ -779,6 +785,7 @@ JSONParser::tokenError()
779 785 "JSON: offset " + std::to_string(offset) +
780 786 ": numeric literal: unexpected character " + std::string(p, 1));
781 787 }
  788 + throw std::logic_error("JSON::tokenError : unhandled error");
782 789 }
783 790  
784 791 void
... ... @@ -792,7 +799,7 @@ JSONParser::getToken()
792 799 unsigned long high_surrogate = 0;
793 800 qpdf_offset_t high_offset = 0;
794 801  
795   - while (!done) {
  802 + while (true) {
796 803 if (p == (buf + bytes)) {
797 804 p = buf;
798 805 bytes = is.read(buf, sizeof(buf));
... ... @@ -808,307 +815,320 @@ JSONParser::getToken()
808 815 // end the current token (unless we are still before the start
809 816 // of the token).
810 817 if (lex_state == ls_top) {
811   - // Continue with token
  818 + ++p;
  819 + ++offset;
812 820 } else {
813   - // done
  821 + break;
814 822 }
  823 +
815 824 } else {
816 825 QTC::TC("libtests", "JSON parse null character");
817 826 throw std::runtime_error(
818 827 "JSON: control or null character at offset " +
819 828 std::to_string(offset));
820 829 }
821   - }
822   - action = append;
823   - switch (lex_state) {
824   - case ls_top:
825   - token_start = offset;
826   - if (*p == '"') {
827   - lex_state = ls_string;
828   - action = ignore;
829   - } else if (QUtil::is_space(*p)) {
830   - action = ignore;
831   - } else if (*p == ',') {
832   - lex_state = ls_comma;
833   - action = ignore;
834   - ready = true;
835   - } else if (*p == ':') {
836   - lex_state = ls_colon;
837   - action = ignore;
838   - ready = true;
839   - } else if (*p == '{') {
840   - lex_state = ls_begin_dict;
841   - action = ignore;
842   - ready = true;
843   - } else if (*p == '}') {
844   - lex_state = ls_end_dict;
845   - action = ignore;
846   - ready = true;
847   - } else if (*p == '[') {
848   - lex_state = ls_begin_array;
849   - action = ignore;
850   - ready = true;
851   - } else if (*p == ']') {
852   - lex_state = ls_end_array;
853   - action = ignore;
854   - ready = true;
855   - } else if ((*p >= 'a') && (*p <= 'z')) {
856   - lex_state = ls_alpha;
857   - } else if (*p == '-') {
858   - lex_state = ls_number_minus;
859   - } else if ((*p >= '1') && (*p <= '9')) {
860   - lex_state = ls_number_before_point;
861   - } else if (*p == '0') {
862   - lex_state = ls_number_leading_zero;
863   - } else {
864   - QTC::TC("libtests", "JSON parse bad character");
865   - throw std::runtime_error(
866   - "JSON: offset " + std::to_string(offset) +
867   - ": unexpected character " + std::string(p, 1));
868   - }
869   - break;
870   -
871   - case ls_number_minus:
872   - if ((*p >= '1') && (*p <= '9')) {
873   - lex_state = ls_number_before_point;
874   - } else if (*p == '0') {
875   - lex_state = ls_number_leading_zero;
876   - } else {
877   - QTC::TC("libtests", "JSON parse number minus no digits");
878   - throw std::runtime_error(
879   - "JSON: offset " + std::to_string(offset) +
880   - ": numeric literal: no digit after minus sign");
881   - }
882   - break;
883   -
884   - case ls_number_leading_zero:
885   - if (*p == '.') {
886   - lex_state = ls_number_point;
887   - } else if (QUtil::is_space(*p)) {
888   - lex_state = ls_number;
889   - action = ignore;
890   - ready = true;
891   - } else if (strchr("{}[]:,", *p)) {
892   - lex_state = ls_number;
893   - action = reread;
894   - ready = true;
895   - } else if (*p == 'e' || *p == 'E') {
896   - lex_state = ls_number_e;
897   - } else {
898   - QTC::TC("libtests", "JSON parse leading zero");
899   - throw std::runtime_error(
900   - "JSON: offset " + std::to_string(offset) +
901   - ": number with leading zero");
902   - }
903   - break;
904   -
905   - case ls_number_before_point:
906   - if ((*p >= '0') && (*p <= '9')) {
907   - // continue
908   - } else if (*p == '.') {
909   - lex_state = ls_number_point;
910   - } else if (QUtil::is_space(*p)) {
911   - lex_state = ls_number;
912   - action = ignore;
913   - ready = true;
914   - } else if (strchr("{}[]:,", *p)) {
915   - lex_state = ls_number;
916   - action = reread;
917   - ready = true;
918   - } else if (*p == 'e' || *p == 'E') {
919   - lex_state = ls_number_e;
920   - } else {
921   - tokenError();
922   - }
923   - break;
924   -
925   - case ls_number_point:
926   - if ((*p >= '0') && (*p <= '9')) {
927   - lex_state = ls_number_after_point;
928   - } else {
929   - tokenError();
930   - }
931   - break;
932   -
933   - case ls_number_after_point:
934   - if ((*p >= '0') && (*p <= '9')) {
935   - // continue
936   - } else if (QUtil::is_space(*p)) {
937   - lex_state = ls_number;
938   - action = ignore;
939   - ready = true;
940   - } else if (strchr("{}[]:,", *p)) {
941   - lex_state = ls_number;
942   - action = reread;
943   - ready = true;
944   - } else if (*p == 'e' || *p == 'E') {
945   - lex_state = ls_number_e;
946   - } else {
947   - tokenError();
948   - }
949   - break;
  830 + } else {
  831 + action = append;
  832 + switch (lex_state) {
  833 + case ls_top:
  834 + token_start = offset;
  835 + if (*p == '"') {
  836 + lex_state = ls_string;
  837 + action = ignore;
  838 + } else if (*p == ' ') {
  839 + action = ignore;
  840 + } else if (*p == ',') {
  841 + lex_state = ls_comma;
  842 + action = ignore;
  843 + ready = true;
  844 + } else if (*p == ',') {
  845 + lex_state = ls_comma;
  846 + action = ignore;
  847 + ready = true;
  848 + } else if (*p == ':') {
  849 + lex_state = ls_colon;
  850 + action = ignore;
  851 + ready = true;
  852 + } else if (*p == '{') {
  853 + lex_state = ls_begin_dict;
  854 + action = ignore;
  855 + ready = true;
  856 + } else if (*p == '}') {
  857 + lex_state = ls_end_dict;
  858 + action = ignore;
  859 + ready = true;
  860 + } else if (*p == '[') {
  861 + lex_state = ls_begin_array;
  862 + action = ignore;
  863 + ready = true;
  864 + } else if (*p == ']') {
  865 + lex_state = ls_end_array;
  866 + action = ignore;
  867 + ready = true;
  868 + } else if ((*p >= 'a') && (*p <= 'z')) {
  869 + lex_state = ls_alpha;
  870 + } else if (*p == '-') {
  871 + lex_state = ls_number_minus;
  872 + } else if ((*p >= '1') && (*p <= '9')) {
  873 + lex_state = ls_number_before_point;
  874 + } else if (*p == '0') {
  875 + lex_state = ls_number_leading_zero;
  876 + } else {
  877 + QTC::TC("libtests", "JSON parse bad character");
  878 + throw std::runtime_error(
  879 + "JSON: offset " + std::to_string(offset) +
  880 + ": unexpected character " + std::string(p, 1));
  881 + }
  882 + break;
950 883  
951   - case ls_number_e:
952   - if ((*p >= '0') && (*p <= '9')) {
953   - lex_state = ls_number;
954   - } else if ((*p == '+') || (*p == '-')) {
955   - lex_state = ls_number_e_sign;
956   - } else {
957   - tokenError();
958   - }
959   - break;
  884 + case ls_number_minus:
  885 + if ((*p >= '1') && (*p <= '9')) {
  886 + lex_state = ls_number_before_point;
  887 + } else if (*p == '0') {
  888 + lex_state = ls_number_leading_zero;
  889 + } else {
  890 + QTC::TC("libtests", "JSON parse number minus no digits");
  891 + throw std::runtime_error(
  892 + "JSON: offset " + std::to_string(offset) +
  893 + ": numeric literal: no digit after minus sign");
  894 + }
  895 + break;
960 896  
961   - case ls_number_e_sign:
962   - if ((*p >= '0') && (*p <= '9')) {
963   - lex_state = ls_number;
964   - } else {
965   - tokenError();
966   - }
967   - break;
  897 + case ls_number_leading_zero:
  898 + if (*p == '.') {
  899 + lex_state = ls_number_point;
  900 + } else if (*p == ' ') {
  901 + lex_state = ls_number;
  902 + action = ignore;
  903 + ready = true;
  904 + } else if (strchr("{}[]:,", *p)) {
  905 + lex_state = ls_number;
  906 + action = reread;
  907 + ready = true;
  908 + } else if (*p == 'e' || *p == 'E') {
  909 + lex_state = ls_number_e;
  910 + } else {
  911 + QTC::TC("libtests", "JSON parse leading zero");
  912 + throw std::runtime_error(
  913 + "JSON: offset " + std::to_string(offset) +
  914 + ": number with leading zero");
  915 + }
  916 + break;
968 917  
969   - case ls_number:
970   - // We only get here after we have seen an exponent.
971   - if ((*p >= '0') && (*p <= '9')) {
972   - // continue
973   - } else if (QUtil::is_space(*p)) {
974   - action = ignore;
975   - ready = true;
976   - } else if (strchr("{}[]:,", *p)) {
977   - action = reread;
978   - ready = true;
979   - } else {
980   - tokenError();
981   - }
982   - break;
  918 + case ls_number_before_point:
  919 + if ((*p >= '0') && (*p <= '9')) {
  920 + // continue
  921 + } else if (*p == '.') {
  922 + lex_state = ls_number_point;
  923 + } else if (*p == ' ') {
  924 + lex_state = ls_number;
  925 + action = ignore;
  926 + ready = true;
  927 + } else if (strchr("{}[]:,", *p)) {
  928 + lex_state = ls_number;
  929 + action = reread;
  930 + ready = true;
  931 + } else if (*p == 'e' || *p == 'E') {
  932 + lex_state = ls_number_e;
  933 + } else {
  934 + tokenError();
  935 + }
  936 + break;
983 937  
984   - case ls_alpha:
985   - if ((*p >= 'a') && (*p <= 'z')) {
986   - // okay
987   - } else if (QUtil::is_space(*p)) {
988   - action = ignore;
989   - ready = true;
990   - } else if (strchr("{}[]:,", *p)) {
991   - action = reread;
992   - ready = true;
993   - } else {
994   - tokenError();
995   - }
996   - break;
  938 + case ls_number_point:
  939 + if ((*p >= '0') && (*p <= '9')) {
  940 + lex_state = ls_number_after_point;
  941 + } else {
  942 + tokenError();
  943 + }
  944 + break;
997 945  
998   - case ls_string:
999   - if (*p == '"') {
1000   - if (high_offset) {
1001   - QTC::TC("libtests", "JSON 16 dangling high");
1002   - throw std::runtime_error(
1003   - "JSON: offset " + std::to_string(high_offset) +
1004   - ": UTF-16 high surrogate not followed by low "
1005   - "surrogate");
  946 + case ls_number_after_point:
  947 + if ((*p >= '0') && (*p <= '9')) {
  948 + // continue
  949 + } else if (*p == ' ') {
  950 + lex_state = ls_number;
  951 + action = ignore;
  952 + ready = true;
  953 + } else if (strchr("{}[]:,", *p)) {
  954 + lex_state = ls_number;
  955 + action = reread;
  956 + ready = true;
  957 + } else if (*p == 'e' || *p == 'E') {
  958 + lex_state = ls_number_e;
  959 + } else {
  960 + tokenError();
1006 961 }
1007   - action = ignore;
1008   - ready = true;
1009   - } else if (*p == '\\') {
1010   - lex_state = ls_backslash;
1011   - action = ignore;
1012   - }
1013   - break;
  962 + break;
1014 963  
1015   - case ls_backslash:
1016   - action = ignore;
1017   - lex_state = ls_string;
1018   - switch (*p) {
1019   - case '\\':
1020   - case '\"':
1021   - case '/':
1022   - // \/ is allowed in json input, but so is /, so we
1023   - // don't map / to \/ in output.
1024   - token += *p;
  964 + case ls_number_e:
  965 + if ((*p >= '0') && (*p <= '9')) {
  966 + lex_state = ls_number;
  967 + } else if ((*p == '+') || (*p == '-')) {
  968 + lex_state = ls_number_e_sign;
  969 + } else {
  970 + tokenError();
  971 + }
1025 972 break;
1026   - case 'b':
1027   - token += '\b';
  973 +
  974 + case ls_number_e_sign:
  975 + if ((*p >= '0') && (*p <= '9')) {
  976 + lex_state = ls_number;
  977 + } else {
  978 + tokenError();
  979 + }
1028 980 break;
1029   - case 'f':
1030   - token += '\f';
  981 +
  982 + case ls_number:
  983 + // We only get here after we have seen an exponent.
  984 + if ((*p >= '0') && (*p <= '9')) {
  985 + // continue
  986 + } else if (*p == ' ') {
  987 + action = ignore;
  988 + ready = true;
  989 + } else if (strchr("{}[]:,", *p)) {
  990 + action = reread;
  991 + ready = true;
  992 + } else {
  993 + tokenError();
  994 + }
1031 995 break;
1032   - case 'n':
1033   - token += '\n';
  996 +
  997 + case ls_alpha:
  998 + if ((*p >= 'a') && (*p <= 'z')) {
  999 + // okay
  1000 + } else if (*p == ' ') {
  1001 + action = ignore;
  1002 + ready = true;
  1003 + } else if (strchr("{}[]:,", *p)) {
  1004 + action = reread;
  1005 + ready = true;
  1006 + } else {
  1007 + tokenError();
  1008 + }
1034 1009 break;
1035   - case 'r':
1036   - token += '\r';
  1010 +
  1011 + case ls_string:
  1012 + if (*p == '"') {
  1013 + if (high_offset) {
  1014 + QTC::TC("libtests", "JSON 16 dangling high");
  1015 + throw std::runtime_error(
  1016 + "JSON: offset " + std::to_string(high_offset) +
  1017 + ": UTF-16 high surrogate not followed by low "
  1018 + "surrogate");
  1019 + }
  1020 + action = ignore;
  1021 + ready = true;
  1022 + } else if (*p == '\\') {
  1023 + lex_state = ls_backslash;
  1024 + action = ignore;
  1025 + }
1037 1026 break;
1038   - case 't':
1039   - token += '\t';
  1027 +
  1028 + case ls_backslash:
  1029 + action = ignore;
  1030 + lex_state = ls_string;
  1031 + switch (*p) {
  1032 + case '\\':
  1033 + case '\"':
  1034 + case '/':
  1035 + // \/ is allowed in json input, but so is /, so we
  1036 + // don't map / to \/ in output.
  1037 + token += *p;
  1038 + break;
  1039 + case 'b':
  1040 + token += '\b';
  1041 + break;
  1042 + case 'f':
  1043 + token += '\f';
  1044 + break;
  1045 + case 'n':
  1046 + token += '\n';
  1047 + break;
  1048 + case 'r':
  1049 + token += '\r';
  1050 + break;
  1051 + case 't':
  1052 + token += '\t';
  1053 + break;
  1054 + case 'u':
  1055 + lex_state = ls_u4;
  1056 + u_count = 0;
  1057 + u_value = 0;
  1058 + break;
  1059 + default:
  1060 + lex_state = ls_backslash;
  1061 + tokenError();
  1062 + }
1040 1063 break;
1041   - case 'u':
1042   - lex_state = ls_u4;
1043   - u_count = 0;
1044   - u_value = 0;
  1064 +
  1065 + case ls_u4:
  1066 + using ui = unsigned int;
  1067 + action = ignore;
  1068 + if ('0' <= *p && *p <= '9') {
  1069 + u_value = 16 * u_value + (ui(*p) - ui('0'));
  1070 + } else if ('a' <= *p && *p <= 'f') {
  1071 + u_value = 16 * u_value + (10 + ui(*p) - ui('a'));
  1072 + } else if ('A' <= *p && *p <= 'F') {
  1073 + u_value = 16 * u_value + (10 + ui(*p) - ui('A'));
  1074 + } else {
  1075 + tokenError();
  1076 + }
  1077 + if (++u_count == 4) {
  1078 + handle_u_code(
  1079 + u_value,
  1080 + offset - 5,
  1081 + high_surrogate,
  1082 + high_offset,
  1083 + token);
  1084 + lex_state = ls_string;
  1085 + }
1045 1086 break;
  1087 +
1046 1088 default:
1047   - lex_state = ls_backslash;
1048   - tokenError();
  1089 + throw std::logic_error(
  1090 + "JSONParser::getToken : trying to handle delimiter state");
1049 1091 }
1050   - break;
1051   -
1052   - case ls_u4:
1053   - using ui = unsigned int;
1054   - action = ignore;
1055   - if ('0' <= *p && *p <= '9') {
1056   - u_value = 16 * u_value + (ui(*p) - ui('0'));
1057   - } else if ('a' <= *p && *p <= 'f') {
1058   - u_value = 16 * u_value + (10 + ui(*p) - ui('a'));
1059   - } else if ('A' <= *p && *p <= 'F') {
1060   - u_value = 16 * u_value + (10 + ui(*p) - ui('A'));
1061   - } else {
1062   - tokenError();
  1092 + switch (action) {
  1093 + case reread:
  1094 + break;
  1095 + case append:
  1096 + token.append(1, *p);
  1097 + // fall through
  1098 + case ignore:
  1099 + ++p;
  1100 + ++offset;
  1101 + break;
1063 1102 }
1064   - if (++u_count == 4) {
1065   - handle_u_code(
1066   - u_value, offset - 5, high_surrogate, high_offset, token);
1067   - lex_state = ls_string;
  1103 + if (ready) {
  1104 + return;
1068 1105 }
1069   - break;
1070   -
1071   - default:
1072   - throw std::logic_error(
1073   - "JSONParser::getToken : trying to handle delimiter state");
1074   - }
1075   - switch (action) {
1076   - case reread:
1077   - break;
1078   - case append:
1079   - token.append(1, *p);
1080   - // fall through
1081   - case ignore:
1082   - ++p;
1083   - ++offset;
1084   - break;
1085   - }
1086   - if (ready) {
1087   - break;
1088 1106 }
1089 1107 }
1090   - if (done) {
1091   - if (!token.empty() && !ready) {
1092   - switch (lex_state) {
1093   - case ls_top:
1094   - // Can't happen
1095   - throw std::logic_error("tok_start set in ls_top while parsing");
1096   - break;
1097 1108  
1098   - case ls_number_leading_zero:
1099   - case ls_number_before_point:
1100   - case ls_number_after_point:
1101   - lex_state = ls_number;
1102   - break;
  1109 + // We only get here if on end of input or if the last character was a
  1110 + // control character.
1103 1111  
1104   - case ls_number:
1105   - case ls_alpha:
1106   - // terminal state
1107   - break;
  1112 + if (!token.empty()) {
  1113 + switch (lex_state) {
  1114 + case ls_top:
  1115 + // Can't happen
  1116 + throw std::logic_error("tok_start set in ls_top while parsing");
  1117 + break;
1108 1118  
1109   - default:
1110   - tokenError();
1111   - }
  1119 + case ls_number_leading_zero:
  1120 + case ls_number_before_point:
  1121 + case ls_number_after_point:
  1122 + lex_state = ls_number;
  1123 + break;
  1124 +
  1125 + case ls_number:
  1126 + case ls_alpha:
  1127 + // terminal state
  1128 + break;
  1129 +
  1130 + default:
  1131 + tokenError();
1112 1132 }
1113 1133 }
1114 1134 }
... ...
libtests/libtests.testcov
... ... @@ -79,6 +79,7 @@ JSON parse number minus no digits 0
79 79 JSON parse incomplete number 0
80 80 JSON parse keyword bad character 0
81 81 JSON parse backslash bad character 0
  82 +JSON parse control char in string 0
82 83 JSON parse leading zero 0
83 84 JSON parse ls premature end of input 0
84 85 JSON parse bad hex after u 0
... ...
libtests/qtest/json_parse.test
... ... @@ -125,10 +125,10 @@ my @bad = (
125 125 "e after minus", # 42
126 126 "missing digit after e", # 43
127 127 "missing digit after e+/-", # 44
128   - # "tab char in string", # 45
129   - # "cr char in string", # 46
130   - # "lf char in string", # 47
131   - # "bs char in string", # 48
  128 + "tab char in string", # 45
  129 + "cr char in string", # 46
  130 + "lf char in string", # 47
  131 + "bs char in string", # 48
132 132 );
133 133  
134 134 my $i = 0;
... ...
libtests/qtest/json_parse/bad-01.out
1   -exception: bad-01.json: JSON: offset 9: material follows end of object: junk
  1 +exception: bad-01.json: JSON: offset 8: material follows end of object: junk
... ...
libtests/qtest/json_parse/bad-02.out
1   -exception: bad-02.json: JSON: offset 11: material follows end of object: junk
  1 +exception: bad-02.json: JSON: offset 10: material follows end of object: junk
... ...
libtests/qtest/json_parse/bad-03.out
1   -exception: bad-03.json: JSON: offset 16: material follows end of object: junk
  1 +exception: bad-03.json: JSON: offset 15: material follows end of object: junk
... ...
libtests/qtest/json_parse/bad-27.out
1   -exception: bad-27.json: JSON: premature end of input
  1 +exception: bad-27.json: JSON: offset 5: control character in string (missing "?)
... ...
libtests/qtest/json_parse/bad-31.json
1   --
  1 +-
... ...
libtests/qtest/json_parse/bad-45.out
1   -"Tab in str\ting"
  1 +exception: bad-45.json: JSON: offset 11: control character in string (missing "?)
... ...
libtests/qtest/json_parse/bad-46.out
1   -"cr in str\ring"
  1 +exception: bad-46.json: JSON: offset 10: control character in string (missing "?)
... ...
libtests/qtest/json_parse/bad-47.out
1   -"lf in str\ning"
  1 +exception: bad-47.json: JSON: offset 10: control character in string (missing "?)
... ...