Commit ce5b864c53eee2dccb912b363099409497c08a9c
Committed by GitHub
Merge pull request #1201 from m-holger/xref_stream
QPDF::processXRefStream
Showing 2 changed files with 158 additions and 119 deletions
include/qpdf/QPDF.hh
| ... | ... | @@ -1028,6 +1028,14 @@ class QPDF |
| 1028 | 1028 | qpdf_offset_t read_xrefTable(qpdf_offset_t offset); |
| 1029 | 1029 | qpdf_offset_t read_xrefStream(qpdf_offset_t offset); |
| 1030 | 1030 | qpdf_offset_t processXRefStream(qpdf_offset_t offset, QPDFObjectHandle& xref_stream); |
| 1031 | + std::pair<int, std::array<int, 3>> | |
| 1032 | + processXRefW(QPDFObjectHandle& dict, std::function<QPDFExc(std::string_view)> damaged); | |
| 1033 | + int processXRefSize( | |
| 1034 | + QPDFObjectHandle& dict, int entry_size, std::function<QPDFExc(std::string_view)> damaged); | |
| 1035 | + std::pair<int, std::vector<std::pair<int, int>>> processXRefIndex( | |
| 1036 | + QPDFObjectHandle& dict, | |
| 1037 | + int max_num_entries, | |
| 1038 | + std::function<QPDFExc(std::string_view)> damaged); | |
| 1031 | 1039 | void insertXrefEntry(int obj, int f0, qpdf_offset_t f1, int f2); |
| 1032 | 1040 | void insertFreeXrefEntry(QPDFObjGen); |
| 1033 | 1041 | void insertReconstructedXrefEntry(int obj, qpdf_offset_t f1, int f2); | ... | ... |
libqpdf/QPDF.cc
| ... | ... | @@ -2,6 +2,7 @@ |
| 2 | 2 | |
| 3 | 3 | #include <qpdf/QPDF.hh> |
| 4 | 4 | |
| 5 | +#include <array> | |
| 5 | 6 | #include <atomic> |
| 6 | 7 | #include <cstring> |
| 7 | 8 | #include <limits> |
| ... | ... | @@ -968,95 +969,144 @@ QPDF::read_xrefStream(qpdf_offset_t xref_offset) |
| 968 | 969 | return 0; // unreachable |
| 969 | 970 | } |
| 970 | 971 | |
| 971 | -qpdf_offset_t | |
| 972 | -QPDF::processXRefStream(qpdf_offset_t xref_offset, QPDFObjectHandle& xref_obj) | |
| 972 | +// Return the entry size of the xref stream and the processed W array. | |
| 973 | +std::pair<int, std::array<int, 3>> | |
| 974 | +QPDF::processXRefW(QPDFObjectHandle& dict, std::function<QPDFExc(std::string_view)> damaged) | |
| 973 | 975 | { |
| 974 | - QPDFObjectHandle dict = xref_obj.getDict(); | |
| 975 | - QPDFObjectHandle W_obj = dict.getKey("/W"); | |
| 976 | - QPDFObjectHandle Index_obj = dict.getKey("/Index"); | |
| 976 | + auto W_obj = dict.getKey("/W"); | |
| 977 | 977 | if (!(W_obj.isArray() && (W_obj.getArrayNItems() >= 3) && W_obj.getArrayItem(0).isInteger() && |
| 978 | - W_obj.getArrayItem(1).isInteger() && W_obj.getArrayItem(2).isInteger() && | |
| 979 | - dict.getKey("/Size").isInteger() && (Index_obj.isArray() || Index_obj.isNull()))) { | |
| 980 | - throw damagedPDF( | |
| 981 | - "xref stream", | |
| 982 | - xref_offset, | |
| 983 | - "Cross-reference stream does not have proper /W and /Index keys"); | |
| 978 | + W_obj.getArrayItem(1).isInteger() && W_obj.getArrayItem(2).isInteger())) { | |
| 979 | + throw damaged("Cross-reference stream does not have a proper /W key"); | |
| 984 | 980 | } |
| 985 | 981 | |
| 986 | - int W[3]; | |
| 987 | - size_t entry_size = 0; | |
| 982 | + std::array<int, 3> W; | |
| 983 | + int entry_size = 0; | |
| 984 | + auto w_vector = W_obj.getArrayAsVector(); | |
| 988 | 985 | int max_bytes = sizeof(qpdf_offset_t); |
| 989 | - for (int i = 0; i < 3; ++i) { | |
| 990 | - W[i] = W_obj.getArrayItem(i).getIntValueAsInt(); | |
| 986 | + for (size_t i = 0; i < 3; ++i) { | |
| 987 | + W[i] = w_vector[i].getIntValueAsInt(); | |
| 991 | 988 | if (W[i] > max_bytes) { |
| 992 | - throw damagedPDF( | |
| 993 | - "xref stream", | |
| 994 | - xref_offset, | |
| 995 | - "Cross-reference stream's /W contains impossibly large values"); | |
| 989 | + throw damaged("Cross-reference stream's /W contains impossibly large values"); | |
| 990 | + } | |
| 991 | + if (W[i] < 0) { | |
| 992 | + throw damaged("Cross-reference stream's /W contains negative values"); | |
| 996 | 993 | } |
| 997 | - entry_size += toS(W[i]); | |
| 994 | + entry_size += W[i]; | |
| 998 | 995 | } |
| 999 | 996 | if (entry_size == 0) { |
| 1000 | - throw damagedPDF( | |
| 1001 | - "xref stream", xref_offset, "Cross-reference stream's /W indicates entry size of 0"); | |
| 997 | + throw damaged("Cross-reference stream's /W indicates entry size of 0"); | |
| 998 | + } | |
| 999 | + return {entry_size, W}; | |
| 1000 | +} | |
| 1001 | + | |
| 1002 | +// Validate Size key and return the maximum number of entries that the xref stream can contain. | |
| 1003 | +int | |
| 1004 | +QPDF::processXRefSize( | |
| 1005 | + QPDFObjectHandle& dict, int entry_size, std::function<QPDFExc(std::string_view)> damaged) | |
| 1006 | +{ | |
| 1007 | + // Number of entries is limited by the highest possible object id and stream size. | |
| 1008 | + auto max_num_entries = std::numeric_limits<int>::max(); | |
| 1009 | + if (max_num_entries > (std::numeric_limits<qpdf_offset_t>::max() / entry_size)) { | |
| 1010 | + max_num_entries = toI(std::numeric_limits<qpdf_offset_t>::max() / entry_size); | |
| 1011 | + } | |
| 1012 | + | |
| 1013 | + auto Size_obj = dict.getKey("/Size"); | |
| 1014 | + long long size; | |
| 1015 | + if (!dict.getKey("/Size").getValueAsInt(size)) { | |
| 1016 | + throw damaged("Cross-reference stream does not have a proper /Size key"); | |
| 1017 | + } else if (size < 0) { | |
| 1018 | + throw damaged("Cross-reference stream has a negative /Size key"); | |
| 1019 | + } else if (size >= max_num_entries) { | |
| 1020 | + throw damaged("Cross-reference stream has an impossibly large /Size key"); | |
| 1002 | 1021 | } |
| 1003 | - unsigned long long max_num_entries = static_cast<unsigned long long>(-1) / entry_size; | |
| 1022 | + // We are not validating that Size <= (Size key of parent xref / trailer). | |
| 1023 | + return max_num_entries; | |
| 1024 | +} | |
| 1025 | + | |
| 1026 | +// Return the number of entries of the xref stream and the processed Index array. | |
| 1027 | +std::pair<int, std::vector<std::pair<int, int>>> | |
| 1028 | +QPDF::processXRefIndex( | |
| 1029 | + QPDFObjectHandle& dict, int max_num_entries, std::function<QPDFExc(std::string_view)> damaged) | |
| 1030 | +{ | |
| 1031 | + auto size = dict.getKey("/Size").getIntValueAsInt(); | |
| 1032 | + auto Index_obj = dict.getKey("/Index"); | |
| 1004 | 1033 | |
| 1005 | - std::vector<long long> indx; | |
| 1006 | 1034 | if (Index_obj.isArray()) { |
| 1007 | - int n_index = Index_obj.getArrayNItems(); | |
| 1008 | - if ((n_index % 2) || (n_index < 2)) { | |
| 1009 | - throw damagedPDF( | |
| 1010 | - "xref stream", | |
| 1011 | - xref_offset, | |
| 1012 | - "Cross-reference stream's /Index has an invalid number of " | |
| 1013 | - "values"); | |
| 1014 | - } | |
| 1015 | - for (int i = 0; i < n_index; ++i) { | |
| 1016 | - if (Index_obj.getArrayItem(i).isInteger()) { | |
| 1017 | - indx.push_back(Index_obj.getArrayItem(i).getIntValue()); | |
| 1035 | + std::vector<std::pair<int, int>> indx; | |
| 1036 | + int num_entries = 0; | |
| 1037 | + auto index_vec = Index_obj.getArrayAsVector(); | |
| 1038 | + if ((index_vec.size() % 2) || index_vec.size() < 2) { | |
| 1039 | + throw damaged("Cross-reference stream's /Index has an invalid number of values"); | |
| 1040 | + } | |
| 1041 | + | |
| 1042 | + int i = 0; | |
| 1043 | + long long first = 0; | |
| 1044 | + for (auto& val: index_vec) { | |
| 1045 | + if (val.isInteger()) { | |
| 1046 | + if (i % 2) { | |
| 1047 | + auto count = val.getIntValue(); | |
| 1048 | + // We are guarding against the possibility of num_entries * entry_size | |
| 1049 | + // overflowing. We are not checking that entries are in ascending order as | |
| 1050 | + // required by the spec, which probably should generate a warning. We are also | |
| 1051 | + // not checking that for each subsection first object number + number of entries | |
| 1052 | + // <= /Size. The spec requires us to ignore object number > /Size. | |
| 1053 | + if (first > (max_num_entries - count) || | |
| 1054 | + count > (max_num_entries - num_entries)) { | |
| 1055 | + throw damaged( | |
| 1056 | + "Cross-reference stream claims to contain too many entries: " + | |
| 1057 | + std::to_string(first) + " " + std::to_string(max_num_entries) + " " + | |
| 1058 | + std::to_string(num_entries)); | |
| 1059 | + } | |
| 1060 | + indx.emplace_back(static_cast<int>(first), static_cast<int>(count)); | |
| 1061 | + num_entries += static_cast<int>(count); | |
| 1062 | + } else { | |
| 1063 | + first = val.getIntValue(); | |
| 1064 | + if (first < 0) { | |
| 1065 | + throw damaged( | |
| 1066 | + "Cross-reference stream's /Index contains a negative object id"); | |
| 1067 | + } else if (first > max_num_entries) { | |
| 1068 | + throw damaged("Cross-reference stream's /Index contains an impossibly " | |
| 1069 | + "large object id"); | |
| 1070 | + } | |
| 1071 | + } | |
| 1018 | 1072 | } else { |
| 1019 | - throw damagedPDF( | |
| 1020 | - "xref stream", | |
| 1021 | - xref_offset, | |
| 1022 | - ("Cross-reference stream's /Index's item " + std::to_string(i) + | |
| 1023 | - " is not an integer")); | |
| 1073 | + throw damaged( | |
| 1074 | + "Cross-reference stream's /Index's item " + std::to_string(i) + | |
| 1075 | + " is not an integer"); | |
| 1024 | 1076 | } |
| 1077 | + i++; | |
| 1025 | 1078 | } |
| 1026 | - QTC::TC("qpdf", "QPDF xref /Index is array", n_index == 2 ? 0 : 1); | |
| 1027 | - } else { | |
| 1079 | + QTC::TC("qpdf", "QPDF xref /Index is array", index_vec.size() == 2 ? 0 : 1); | |
| 1080 | + return {num_entries, indx}; | |
| 1081 | + } else if (Index_obj.isNull()) { | |
| 1028 | 1082 | QTC::TC("qpdf", "QPDF xref /Index is null"); |
| 1029 | - long long size = dict.getKey("/Size").getIntValue(); | |
| 1030 | - indx.push_back(0); | |
| 1031 | - indx.push_back(size); | |
| 1083 | + return {size, {{0, size}}}; | |
| 1084 | + } else { | |
| 1085 | + throw damaged("Cross-reference stream does not have a proper /Index key"); | |
| 1032 | 1086 | } |
| 1087 | +} | |
| 1033 | 1088 | |
| 1034 | - size_t num_entries = 0; | |
| 1035 | - for (size_t i = 1; i < indx.size(); i += 2) { | |
| 1036 | - if (indx.at(i) > QIntC::to_longlong(max_num_entries - num_entries)) { | |
| 1037 | - throw damagedPDF( | |
| 1038 | - "xref stream", | |
| 1039 | - xref_offset, | |
| 1040 | - ("Cross-reference stream claims to contain too many entries: " + | |
| 1041 | - std::to_string(indx.at(i)) + " " + std::to_string(max_num_entries) + " " + | |
| 1042 | - std::to_string(num_entries))); | |
| 1043 | - } | |
| 1044 | - num_entries += toS(indx.at(i)); | |
| 1045 | - } | |
| 1089 | +qpdf_offset_t | |
| 1090 | +QPDF::processXRefStream(qpdf_offset_t xref_offset, QPDFObjectHandle& xref_obj) | |
| 1091 | +{ | |
| 1092 | + auto damaged = [this, xref_offset](std::string_view msg) -> QPDFExc { | |
| 1093 | + return damagedPDF("xref stream", xref_offset, msg.data()); | |
| 1094 | + }; | |
| 1095 | + | |
| 1096 | + auto dict = xref_obj.getDict(); | |
| 1046 | 1097 | |
| 1047 | - // entry_size and num_entries have both been validated to ensure that this multiplication does | |
| 1048 | - // not cause an overflow. | |
| 1049 | - size_t expected_size = entry_size * num_entries; | |
| 1098 | + auto [entry_size, W] = processXRefW(dict, damaged); | |
| 1099 | + int max_num_entries = processXRefSize(dict, entry_size, damaged); | |
| 1100 | + auto [num_entries, indx] = processXRefIndex(dict, max_num_entries, damaged); | |
| 1050 | 1101 | |
| 1051 | 1102 | std::shared_ptr<Buffer> bp = xref_obj.getStreamData(qpdf_dl_specialized); |
| 1052 | 1103 | size_t actual_size = bp->getSize(); |
| 1104 | + auto expected_size = toS(entry_size) * toS(num_entries); | |
| 1053 | 1105 | |
| 1054 | 1106 | if (expected_size != actual_size) { |
| 1055 | - QPDFExc x = damagedPDF( | |
| 1056 | - "xref stream", | |
| 1057 | - xref_offset, | |
| 1058 | - ("Cross-reference stream data has the wrong size; expected = " + | |
| 1059 | - std::to_string(expected_size) + "; actual = " + std::to_string(actual_size))); | |
| 1107 | + QPDFExc x = damaged( | |
| 1108 | + "Cross-reference stream data has the wrong size; expected = " + | |
| 1109 | + std::to_string(expected_size) + "; actual = " + std::to_string(actual_size)); | |
| 1060 | 1110 | if (expected_size > actual_size) { |
| 1061 | 1111 | throw x; |
| 1062 | 1112 | } else { |
| ... | ... | @@ -1064,65 +1114,48 @@ QPDF::processXRefStream(qpdf_offset_t xref_offset, QPDFObjectHandle& xref_obj) |
| 1064 | 1114 | } |
| 1065 | 1115 | } |
| 1066 | 1116 | |
| 1067 | - size_t cur_chunk = 0; | |
| 1068 | - int chunk_count = 0; | |
| 1069 | - | |
| 1070 | 1117 | bool saw_first_compressed_object = false; |
| 1071 | 1118 | |
| 1072 | 1119 | // Actual size vs. expected size check above ensures that we will not overflow any buffers here. |
| 1073 | - // We know that entry_size * num_entries is equal to the size of the buffer. | |
| 1074 | - unsigned char const* data = bp->getBuffer(); | |
| 1075 | - for (size_t i = 0; i < num_entries; ++i) { | |
| 1076 | - // Read this entry | |
| 1077 | - unsigned char const* entry = data + (entry_size * i); | |
| 1078 | - qpdf_offset_t fields[3]; | |
| 1079 | - unsigned char const* p = entry; | |
| 1080 | - for (int j = 0; j < 3; ++j) { | |
| 1081 | - fields[j] = 0; | |
| 1082 | - if ((j == 0) && (W[0] == 0)) { | |
| 1120 | + // We know that entry_size * num_entries is less or equal to the size of the buffer. | |
| 1121 | + auto p = bp->getBuffer(); | |
| 1122 | + for (auto [obj, sec_entries]: indx) { | |
| 1123 | + // Process a subsection. | |
| 1124 | + for (int i = 0; i < sec_entries; ++i) { | |
| 1125 | + // Read this entry | |
| 1126 | + std::array<qpdf_offset_t, 3> fields{}; | |
| 1127 | + if (W[0] == 0) { | |
| 1083 | 1128 | QTC::TC("qpdf", "QPDF default for xref stream field 0"); |
| 1084 | 1129 | fields[0] = 1; |
| 1085 | 1130 | } |
| 1086 | - for (int k = 0; k < W[j]; ++k) { | |
| 1087 | - fields[j] <<= 8; | |
| 1088 | - fields[j] += toI(*p++); | |
| 1131 | + for (size_t j = 0; j < 3; ++j) { | |
| 1132 | + for (int k = 0; k < W[j]; ++k) { | |
| 1133 | + fields[j] <<= 8; | |
| 1134 | + fields[j] |= *p++; | |
| 1135 | + } | |
| 1089 | 1136 | } |
| 1090 | - } | |
| 1091 | 1137 | |
| 1092 | - // Get the object and generation number. The object number is based on /Index. The | |
| 1093 | - // generation number is 0 unless this is an uncompressed object record, in which case the | |
| 1094 | - // generation number appears as the third field. | |
| 1095 | - int obj = toI(indx.at(cur_chunk)); | |
| 1096 | - if ((obj < 0) || ((std::numeric_limits<int>::max() - obj) < chunk_count)) { | |
| 1097 | - std::ostringstream msg; | |
| 1098 | - msg.imbue(std::locale::classic()); | |
| 1099 | - msg << "adding " << chunk_count << " to " << obj | |
| 1100 | - << " while computing index in xref stream would cause an integer overflow"; | |
| 1101 | - throw std::range_error(msg.str()); | |
| 1102 | - } | |
| 1103 | - obj += chunk_count; | |
| 1104 | - ++chunk_count; | |
| 1105 | - if (chunk_count >= indx.at(cur_chunk + 1)) { | |
| 1106 | - cur_chunk += 2; | |
| 1107 | - chunk_count = 0; | |
| 1108 | - } | |
| 1109 | - | |
| 1110 | - if (saw_first_compressed_object) { | |
| 1111 | - if (fields[0] != 2) { | |
| 1112 | - m->uncompressed_after_compressed = true; | |
| 1138 | + // Get the generation number. The generation number is 0 unless this is an uncompressed | |
| 1139 | + // object record, in which case the generation number appears as the third field. | |
| 1140 | + if (saw_first_compressed_object) { | |
| 1141 | + if (fields[0] != 2) { | |
| 1142 | + m->uncompressed_after_compressed = true; | |
| 1143 | + } | |
| 1144 | + } else if (fields[0] == 2) { | |
| 1145 | + saw_first_compressed_object = true; | |
| 1113 | 1146 | } |
| 1114 | - } else if (fields[0] == 2) { | |
| 1115 | - saw_first_compressed_object = true; | |
| 1116 | - } | |
| 1117 | - if (obj == 0) { | |
| 1118 | - // This is needed by checkLinearization() | |
| 1119 | - m->first_xref_item_offset = xref_offset; | |
| 1120 | - } else if (fields[0] == 0) { | |
| 1121 | - // Ignore fields[2], which we don't care about in this case. This works around the issue | |
| 1122 | - // of some PDF files that put invalid values, like -1, here for deleted objects. | |
| 1123 | - insertFreeXrefEntry(QPDFObjGen(obj, 0)); | |
| 1124 | - } else { | |
| 1125 | - insertXrefEntry(obj, toI(fields[0]), fields[1], toI(fields[2])); | |
| 1147 | + if (obj == 0) { | |
| 1148 | + // This is needed by checkLinearization() | |
| 1149 | + m->first_xref_item_offset = xref_offset; | |
| 1150 | + } else if (fields[0] == 0) { | |
| 1151 | + // Ignore fields[2], which we don't care about in this case. This works around the | |
| 1152 | + // issue of some PDF files that put invalid values, like -1, here for deleted | |
| 1153 | + // objects. | |
| 1154 | + insertFreeXrefEntry(QPDFObjGen(obj, 0)); | |
| 1155 | + } else { | |
| 1156 | + insertXrefEntry(obj, toI(fields[0]), fields[1], toI(fields[2])); | |
| 1157 | + } | |
| 1158 | + ++obj; | |
| 1126 | 1159 | } |
| 1127 | 1160 | } |
| 1128 | 1161 | |
| ... | ... | @@ -1136,12 +1169,10 @@ QPDF::processXRefStream(qpdf_offset_t xref_offset, QPDFObjectHandle& xref_obj) |
| 1136 | 1169 | "xref stream", "/Prev key in xref stream dictionary is not an integer"); |
| 1137 | 1170 | } |
| 1138 | 1171 | QTC::TC("qpdf", "QPDF prev key in xref stream dictionary"); |
| 1139 | - xref_offset = dict.getKey("/Prev").getIntValue(); | |
| 1172 | + return dict.getKey("/Prev").getIntValue(); | |
| 1140 | 1173 | } else { |
| 1141 | - xref_offset = 0; | |
| 1174 | + return 0; | |
| 1142 | 1175 | } |
| 1143 | - | |
| 1144 | - return xref_offset; | |
| 1145 | 1176 | } |
| 1146 | 1177 | |
| 1147 | 1178 | void | ... | ... |