Commit ce5b864c53eee2dccb912b363099409497c08a9c
Committed by GitHub
Merge pull request #1201 from m-holger/xref_stream
QPDF::processXRefStream
Showing 2 changed files with 158 additions and 119 deletions
include/qpdf/QPDF.hh
| @@ -1028,6 +1028,14 @@ class QPDF | @@ -1028,6 +1028,14 @@ class QPDF | ||
| 1028 | qpdf_offset_t read_xrefTable(qpdf_offset_t offset); | 1028 | qpdf_offset_t read_xrefTable(qpdf_offset_t offset); |
| 1029 | qpdf_offset_t read_xrefStream(qpdf_offset_t offset); | 1029 | qpdf_offset_t read_xrefStream(qpdf_offset_t offset); |
| 1030 | qpdf_offset_t processXRefStream(qpdf_offset_t offset, QPDFObjectHandle& xref_stream); | 1030 | qpdf_offset_t processXRefStream(qpdf_offset_t offset, QPDFObjectHandle& xref_stream); |
| 1031 | + std::pair<int, std::array<int, 3>> | ||
| 1032 | + processXRefW(QPDFObjectHandle& dict, std::function<QPDFExc(std::string_view)> damaged); | ||
| 1033 | + int processXRefSize( | ||
| 1034 | + QPDFObjectHandle& dict, int entry_size, std::function<QPDFExc(std::string_view)> damaged); | ||
| 1035 | + std::pair<int, std::vector<std::pair<int, int>>> processXRefIndex( | ||
| 1036 | + QPDFObjectHandle& dict, | ||
| 1037 | + int max_num_entries, | ||
| 1038 | + std::function<QPDFExc(std::string_view)> damaged); | ||
| 1031 | void insertXrefEntry(int obj, int f0, qpdf_offset_t f1, int f2); | 1039 | void insertXrefEntry(int obj, int f0, qpdf_offset_t f1, int f2); |
| 1032 | void insertFreeXrefEntry(QPDFObjGen); | 1040 | void insertFreeXrefEntry(QPDFObjGen); |
| 1033 | void insertReconstructedXrefEntry(int obj, qpdf_offset_t f1, int f2); | 1041 | void insertReconstructedXrefEntry(int obj, qpdf_offset_t f1, int f2); |
libqpdf/QPDF.cc
| @@ -2,6 +2,7 @@ | @@ -2,6 +2,7 @@ | ||
| 2 | 2 | ||
| 3 | #include <qpdf/QPDF.hh> | 3 | #include <qpdf/QPDF.hh> |
| 4 | 4 | ||
| 5 | +#include <array> | ||
| 5 | #include <atomic> | 6 | #include <atomic> |
| 6 | #include <cstring> | 7 | #include <cstring> |
| 7 | #include <limits> | 8 | #include <limits> |
| @@ -968,95 +969,144 @@ QPDF::read_xrefStream(qpdf_offset_t xref_offset) | @@ -968,95 +969,144 @@ QPDF::read_xrefStream(qpdf_offset_t xref_offset) | ||
| 968 | return 0; // unreachable | 969 | return 0; // unreachable |
| 969 | } | 970 | } |
| 970 | 971 | ||
| 971 | -qpdf_offset_t | ||
| 972 | -QPDF::processXRefStream(qpdf_offset_t xref_offset, QPDFObjectHandle& xref_obj) | 972 | +// Return the entry size of the xref stream and the processed W array. |
| 973 | +std::pair<int, std::array<int, 3>> | ||
| 974 | +QPDF::processXRefW(QPDFObjectHandle& dict, std::function<QPDFExc(std::string_view)> damaged) | ||
| 973 | { | 975 | { |
| 974 | - QPDFObjectHandle dict = xref_obj.getDict(); | ||
| 975 | - QPDFObjectHandle W_obj = dict.getKey("/W"); | ||
| 976 | - QPDFObjectHandle Index_obj = dict.getKey("/Index"); | 976 | + auto W_obj = dict.getKey("/W"); |
| 977 | if (!(W_obj.isArray() && (W_obj.getArrayNItems() >= 3) && W_obj.getArrayItem(0).isInteger() && | 977 | if (!(W_obj.isArray() && (W_obj.getArrayNItems() >= 3) && W_obj.getArrayItem(0).isInteger() && |
| 978 | - W_obj.getArrayItem(1).isInteger() && W_obj.getArrayItem(2).isInteger() && | ||
| 979 | - dict.getKey("/Size").isInteger() && (Index_obj.isArray() || Index_obj.isNull()))) { | ||
| 980 | - throw damagedPDF( | ||
| 981 | - "xref stream", | ||
| 982 | - xref_offset, | ||
| 983 | - "Cross-reference stream does not have proper /W and /Index keys"); | 978 | + W_obj.getArrayItem(1).isInteger() && W_obj.getArrayItem(2).isInteger())) { |
| 979 | + throw damaged("Cross-reference stream does not have a proper /W key"); | ||
| 984 | } | 980 | } |
| 985 | 981 | ||
| 986 | - int W[3]; | ||
| 987 | - size_t entry_size = 0; | 982 | + std::array<int, 3> W; |
| 983 | + int entry_size = 0; | ||
| 984 | + auto w_vector = W_obj.getArrayAsVector(); | ||
| 988 | int max_bytes = sizeof(qpdf_offset_t); | 985 | int max_bytes = sizeof(qpdf_offset_t); |
| 989 | - for (int i = 0; i < 3; ++i) { | ||
| 990 | - W[i] = W_obj.getArrayItem(i).getIntValueAsInt(); | 986 | + for (size_t i = 0; i < 3; ++i) { |
| 987 | + W[i] = w_vector[i].getIntValueAsInt(); | ||
| 991 | if (W[i] > max_bytes) { | 988 | if (W[i] > max_bytes) { |
| 992 | - throw damagedPDF( | ||
| 993 | - "xref stream", | ||
| 994 | - xref_offset, | ||
| 995 | - "Cross-reference stream's /W contains impossibly large values"); | 989 | + throw damaged("Cross-reference stream's /W contains impossibly large values"); |
| 990 | + } | ||
| 991 | + if (W[i] < 0) { | ||
| 992 | + throw damaged("Cross-reference stream's /W contains negative values"); | ||
| 996 | } | 993 | } |
| 997 | - entry_size += toS(W[i]); | 994 | + entry_size += W[i]; |
| 998 | } | 995 | } |
| 999 | if (entry_size == 0) { | 996 | if (entry_size == 0) { |
| 1000 | - throw damagedPDF( | ||
| 1001 | - "xref stream", xref_offset, "Cross-reference stream's /W indicates entry size of 0"); | 997 | + throw damaged("Cross-reference stream's /W indicates entry size of 0"); |
| 998 | + } | ||
| 999 | + return {entry_size, W}; | ||
| 1000 | +} | ||
| 1001 | + | ||
| 1002 | +// Validate Size key and return the maximum number of entries that the xref stream can contain. | ||
| 1003 | +int | ||
| 1004 | +QPDF::processXRefSize( | ||
| 1005 | + QPDFObjectHandle& dict, int entry_size, std::function<QPDFExc(std::string_view)> damaged) | ||
| 1006 | +{ | ||
| 1007 | + // Number of entries is limited by the highest possible object id and stream size. | ||
| 1008 | + auto max_num_entries = std::numeric_limits<int>::max(); | ||
| 1009 | + if (max_num_entries > (std::numeric_limits<qpdf_offset_t>::max() / entry_size)) { | ||
| 1010 | + max_num_entries = toI(std::numeric_limits<qpdf_offset_t>::max() / entry_size); | ||
| 1011 | + } | ||
| 1012 | + | ||
| 1013 | + auto Size_obj = dict.getKey("/Size"); | ||
| 1014 | + long long size; | ||
| 1015 | + if (!dict.getKey("/Size").getValueAsInt(size)) { | ||
| 1016 | + throw damaged("Cross-reference stream does not have a proper /Size key"); | ||
| 1017 | + } else if (size < 0) { | ||
| 1018 | + throw damaged("Cross-reference stream has a negative /Size key"); | ||
| 1019 | + } else if (size >= max_num_entries) { | ||
| 1020 | + throw damaged("Cross-reference stream has an impossibly large /Size key"); | ||
| 1002 | } | 1021 | } |
| 1003 | - unsigned long long max_num_entries = static_cast<unsigned long long>(-1) / entry_size; | 1022 | + // We are not validating that Size <= (Size key of parent xref / trailer). |
| 1023 | + return max_num_entries; | ||
| 1024 | +} | ||
| 1025 | + | ||
| 1026 | +// Return the number of entries of the xref stream and the processed Index array. | ||
| 1027 | +std::pair<int, std::vector<std::pair<int, int>>> | ||
| 1028 | +QPDF::processXRefIndex( | ||
| 1029 | + QPDFObjectHandle& dict, int max_num_entries, std::function<QPDFExc(std::string_view)> damaged) | ||
| 1030 | +{ | ||
| 1031 | + auto size = dict.getKey("/Size").getIntValueAsInt(); | ||
| 1032 | + auto Index_obj = dict.getKey("/Index"); | ||
| 1004 | 1033 | ||
| 1005 | - std::vector<long long> indx; | ||
| 1006 | if (Index_obj.isArray()) { | 1034 | if (Index_obj.isArray()) { |
| 1007 | - int n_index = Index_obj.getArrayNItems(); | ||
| 1008 | - if ((n_index % 2) || (n_index < 2)) { | ||
| 1009 | - throw damagedPDF( | ||
| 1010 | - "xref stream", | ||
| 1011 | - xref_offset, | ||
| 1012 | - "Cross-reference stream's /Index has an invalid number of " | ||
| 1013 | - "values"); | ||
| 1014 | - } | ||
| 1015 | - for (int i = 0; i < n_index; ++i) { | ||
| 1016 | - if (Index_obj.getArrayItem(i).isInteger()) { | ||
| 1017 | - indx.push_back(Index_obj.getArrayItem(i).getIntValue()); | 1035 | + std::vector<std::pair<int, int>> indx; |
| 1036 | + int num_entries = 0; | ||
| 1037 | + auto index_vec = Index_obj.getArrayAsVector(); | ||
| 1038 | + if ((index_vec.size() % 2) || index_vec.size() < 2) { | ||
| 1039 | + throw damaged("Cross-reference stream's /Index has an invalid number of values"); | ||
| 1040 | + } | ||
| 1041 | + | ||
| 1042 | + int i = 0; | ||
| 1043 | + long long first = 0; | ||
| 1044 | + for (auto& val: index_vec) { | ||
| 1045 | + if (val.isInteger()) { | ||
| 1046 | + if (i % 2) { | ||
| 1047 | + auto count = val.getIntValue(); | ||
| 1048 | + // We are guarding against the possibility of num_entries * entry_size | ||
| 1049 | + // overflowing. We are not checking that entries are in ascending order as | ||
| 1050 | + // required by the spec, which probably should generate a warning. We are also | ||
| 1051 | + // not checking that for each subsection first object number + number of entries | ||
| 1052 | + // <= /Size. The spec requires us to ignore object number > /Size. | ||
| 1053 | + if (first > (max_num_entries - count) || | ||
| 1054 | + count > (max_num_entries - num_entries)) { | ||
| 1055 | + throw damaged( | ||
| 1056 | + "Cross-reference stream claims to contain too many entries: " + | ||
| 1057 | + std::to_string(first) + " " + std::to_string(max_num_entries) + " " + | ||
| 1058 | + std::to_string(num_entries)); | ||
| 1059 | + } | ||
| 1060 | + indx.emplace_back(static_cast<int>(first), static_cast<int>(count)); | ||
| 1061 | + num_entries += static_cast<int>(count); | ||
| 1062 | + } else { | ||
| 1063 | + first = val.getIntValue(); | ||
| 1064 | + if (first < 0) { | ||
| 1065 | + throw damaged( | ||
| 1066 | + "Cross-reference stream's /Index contains a negative object id"); | ||
| 1067 | + } else if (first > max_num_entries) { | ||
| 1068 | + throw damaged("Cross-reference stream's /Index contains an impossibly " | ||
| 1069 | + "large object id"); | ||
| 1070 | + } | ||
| 1071 | + } | ||
| 1018 | } else { | 1072 | } else { |
| 1019 | - throw damagedPDF( | ||
| 1020 | - "xref stream", | ||
| 1021 | - xref_offset, | ||
| 1022 | - ("Cross-reference stream's /Index's item " + std::to_string(i) + | ||
| 1023 | - " is not an integer")); | 1073 | + throw damaged( |
| 1074 | + "Cross-reference stream's /Index's item " + std::to_string(i) + | ||
| 1075 | + " is not an integer"); | ||
| 1024 | } | 1076 | } |
| 1077 | + i++; | ||
| 1025 | } | 1078 | } |
| 1026 | - QTC::TC("qpdf", "QPDF xref /Index is array", n_index == 2 ? 0 : 1); | ||
| 1027 | - } else { | 1079 | + QTC::TC("qpdf", "QPDF xref /Index is array", index_vec.size() == 2 ? 0 : 1); |
| 1080 | + return {num_entries, indx}; | ||
| 1081 | + } else if (Index_obj.isNull()) { | ||
| 1028 | QTC::TC("qpdf", "QPDF xref /Index is null"); | 1082 | QTC::TC("qpdf", "QPDF xref /Index is null"); |
| 1029 | - long long size = dict.getKey("/Size").getIntValue(); | ||
| 1030 | - indx.push_back(0); | ||
| 1031 | - indx.push_back(size); | 1083 | + return {size, {{0, size}}}; |
| 1084 | + } else { | ||
| 1085 | + throw damaged("Cross-reference stream does not have a proper /Index key"); | ||
| 1032 | } | 1086 | } |
| 1087 | +} | ||
| 1033 | 1088 | ||
| 1034 | - size_t num_entries = 0; | ||
| 1035 | - for (size_t i = 1; i < indx.size(); i += 2) { | ||
| 1036 | - if (indx.at(i) > QIntC::to_longlong(max_num_entries - num_entries)) { | ||
| 1037 | - throw damagedPDF( | ||
| 1038 | - "xref stream", | ||
| 1039 | - xref_offset, | ||
| 1040 | - ("Cross-reference stream claims to contain too many entries: " + | ||
| 1041 | - std::to_string(indx.at(i)) + " " + std::to_string(max_num_entries) + " " + | ||
| 1042 | - std::to_string(num_entries))); | ||
| 1043 | - } | ||
| 1044 | - num_entries += toS(indx.at(i)); | ||
| 1045 | - } | 1089 | +qpdf_offset_t |
| 1090 | +QPDF::processXRefStream(qpdf_offset_t xref_offset, QPDFObjectHandle& xref_obj) | ||
| 1091 | +{ | ||
| 1092 | + auto damaged = [this, xref_offset](std::string_view msg) -> QPDFExc { | ||
| 1093 | + return damagedPDF("xref stream", xref_offset, msg.data()); | ||
| 1094 | + }; | ||
| 1095 | + | ||
| 1096 | + auto dict = xref_obj.getDict(); | ||
| 1046 | 1097 | ||
| 1047 | - // entry_size and num_entries have both been validated to ensure that this multiplication does | ||
| 1048 | - // not cause an overflow. | ||
| 1049 | - size_t expected_size = entry_size * num_entries; | 1098 | + auto [entry_size, W] = processXRefW(dict, damaged); |
| 1099 | + int max_num_entries = processXRefSize(dict, entry_size, damaged); | ||
| 1100 | + auto [num_entries, indx] = processXRefIndex(dict, max_num_entries, damaged); | ||
| 1050 | 1101 | ||
| 1051 | std::shared_ptr<Buffer> bp = xref_obj.getStreamData(qpdf_dl_specialized); | 1102 | std::shared_ptr<Buffer> bp = xref_obj.getStreamData(qpdf_dl_specialized); |
| 1052 | size_t actual_size = bp->getSize(); | 1103 | size_t actual_size = bp->getSize(); |
| 1104 | + auto expected_size = toS(entry_size) * toS(num_entries); | ||
| 1053 | 1105 | ||
| 1054 | if (expected_size != actual_size) { | 1106 | if (expected_size != actual_size) { |
| 1055 | - QPDFExc x = damagedPDF( | ||
| 1056 | - "xref stream", | ||
| 1057 | - xref_offset, | ||
| 1058 | - ("Cross-reference stream data has the wrong size; expected = " + | ||
| 1059 | - std::to_string(expected_size) + "; actual = " + std::to_string(actual_size))); | 1107 | + QPDFExc x = damaged( |
| 1108 | + "Cross-reference stream data has the wrong size; expected = " + | ||
| 1109 | + std::to_string(expected_size) + "; actual = " + std::to_string(actual_size)); | ||
| 1060 | if (expected_size > actual_size) { | 1110 | if (expected_size > actual_size) { |
| 1061 | throw x; | 1111 | throw x; |
| 1062 | } else { | 1112 | } else { |
| @@ -1064,65 +1114,48 @@ QPDF::processXRefStream(qpdf_offset_t xref_offset, QPDFObjectHandle& xref_obj) | @@ -1064,65 +1114,48 @@ QPDF::processXRefStream(qpdf_offset_t xref_offset, QPDFObjectHandle& xref_obj) | ||
| 1064 | } | 1114 | } |
| 1065 | } | 1115 | } |
| 1066 | 1116 | ||
| 1067 | - size_t cur_chunk = 0; | ||
| 1068 | - int chunk_count = 0; | ||
| 1069 | - | ||
| 1070 | bool saw_first_compressed_object = false; | 1117 | bool saw_first_compressed_object = false; |
| 1071 | 1118 | ||
| 1072 | // Actual size vs. expected size check above ensures that we will not overflow any buffers here. | 1119 | // Actual size vs. expected size check above ensures that we will not overflow any buffers here. |
| 1073 | - // We know that entry_size * num_entries is equal to the size of the buffer. | ||
| 1074 | - unsigned char const* data = bp->getBuffer(); | ||
| 1075 | - for (size_t i = 0; i < num_entries; ++i) { | ||
| 1076 | - // Read this entry | ||
| 1077 | - unsigned char const* entry = data + (entry_size * i); | ||
| 1078 | - qpdf_offset_t fields[3]; | ||
| 1079 | - unsigned char const* p = entry; | ||
| 1080 | - for (int j = 0; j < 3; ++j) { | ||
| 1081 | - fields[j] = 0; | ||
| 1082 | - if ((j == 0) && (W[0] == 0)) { | 1120 | + // We know that entry_size * num_entries is less or equal to the size of the buffer. |
| 1121 | + auto p = bp->getBuffer(); | ||
| 1122 | + for (auto [obj, sec_entries]: indx) { | ||
| 1123 | + // Process a subsection. | ||
| 1124 | + for (int i = 0; i < sec_entries; ++i) { | ||
| 1125 | + // Read this entry | ||
| 1126 | + std::array<qpdf_offset_t, 3> fields{}; | ||
| 1127 | + if (W[0] == 0) { | ||
| 1083 | QTC::TC("qpdf", "QPDF default for xref stream field 0"); | 1128 | QTC::TC("qpdf", "QPDF default for xref stream field 0"); |
| 1084 | fields[0] = 1; | 1129 | fields[0] = 1; |
| 1085 | } | 1130 | } |
| 1086 | - for (int k = 0; k < W[j]; ++k) { | ||
| 1087 | - fields[j] <<= 8; | ||
| 1088 | - fields[j] += toI(*p++); | 1131 | + for (size_t j = 0; j < 3; ++j) { |
| 1132 | + for (int k = 0; k < W[j]; ++k) { | ||
| 1133 | + fields[j] <<= 8; | ||
| 1134 | + fields[j] |= *p++; | ||
| 1135 | + } | ||
| 1089 | } | 1136 | } |
| 1090 | - } | ||
| 1091 | 1137 | ||
| 1092 | - // Get the object and generation number. The object number is based on /Index. The | ||
| 1093 | - // generation number is 0 unless this is an uncompressed object record, in which case the | ||
| 1094 | - // generation number appears as the third field. | ||
| 1095 | - int obj = toI(indx.at(cur_chunk)); | ||
| 1096 | - if ((obj < 0) || ((std::numeric_limits<int>::max() - obj) < chunk_count)) { | ||
| 1097 | - std::ostringstream msg; | ||
| 1098 | - msg.imbue(std::locale::classic()); | ||
| 1099 | - msg << "adding " << chunk_count << " to " << obj | ||
| 1100 | - << " while computing index in xref stream would cause an integer overflow"; | ||
| 1101 | - throw std::range_error(msg.str()); | ||
| 1102 | - } | ||
| 1103 | - obj += chunk_count; | ||
| 1104 | - ++chunk_count; | ||
| 1105 | - if (chunk_count >= indx.at(cur_chunk + 1)) { | ||
| 1106 | - cur_chunk += 2; | ||
| 1107 | - chunk_count = 0; | ||
| 1108 | - } | ||
| 1109 | - | ||
| 1110 | - if (saw_first_compressed_object) { | ||
| 1111 | - if (fields[0] != 2) { | ||
| 1112 | - m->uncompressed_after_compressed = true; | 1138 | + // Get the generation number. The generation number is 0 unless this is an uncompressed |
| 1139 | + // object record, in which case the generation number appears as the third field. | ||
| 1140 | + if (saw_first_compressed_object) { | ||
| 1141 | + if (fields[0] != 2) { | ||
| 1142 | + m->uncompressed_after_compressed = true; | ||
| 1143 | + } | ||
| 1144 | + } else if (fields[0] == 2) { | ||
| 1145 | + saw_first_compressed_object = true; | ||
| 1113 | } | 1146 | } |
| 1114 | - } else if (fields[0] == 2) { | ||
| 1115 | - saw_first_compressed_object = true; | ||
| 1116 | - } | ||
| 1117 | - if (obj == 0) { | ||
| 1118 | - // This is needed by checkLinearization() | ||
| 1119 | - m->first_xref_item_offset = xref_offset; | ||
| 1120 | - } else if (fields[0] == 0) { | ||
| 1121 | - // Ignore fields[2], which we don't care about in this case. This works around the issue | ||
| 1122 | - // of some PDF files that put invalid values, like -1, here for deleted objects. | ||
| 1123 | - insertFreeXrefEntry(QPDFObjGen(obj, 0)); | ||
| 1124 | - } else { | ||
| 1125 | - insertXrefEntry(obj, toI(fields[0]), fields[1], toI(fields[2])); | 1147 | + if (obj == 0) { |
| 1148 | + // This is needed by checkLinearization() | ||
| 1149 | + m->first_xref_item_offset = xref_offset; | ||
| 1150 | + } else if (fields[0] == 0) { | ||
| 1151 | + // Ignore fields[2], which we don't care about in this case. This works around the | ||
| 1152 | + // issue of some PDF files that put invalid values, like -1, here for deleted | ||
| 1153 | + // objects. | ||
| 1154 | + insertFreeXrefEntry(QPDFObjGen(obj, 0)); | ||
| 1155 | + } else { | ||
| 1156 | + insertXrefEntry(obj, toI(fields[0]), fields[1], toI(fields[2])); | ||
| 1157 | + } | ||
| 1158 | + ++obj; | ||
| 1126 | } | 1159 | } |
| 1127 | } | 1160 | } |
| 1128 | 1161 | ||
| @@ -1136,12 +1169,10 @@ QPDF::processXRefStream(qpdf_offset_t xref_offset, QPDFObjectHandle& xref_obj) | @@ -1136,12 +1169,10 @@ QPDF::processXRefStream(qpdf_offset_t xref_offset, QPDFObjectHandle& xref_obj) | ||
| 1136 | "xref stream", "/Prev key in xref stream dictionary is not an integer"); | 1169 | "xref stream", "/Prev key in xref stream dictionary is not an integer"); |
| 1137 | } | 1170 | } |
| 1138 | QTC::TC("qpdf", "QPDF prev key in xref stream dictionary"); | 1171 | QTC::TC("qpdf", "QPDF prev key in xref stream dictionary"); |
| 1139 | - xref_offset = dict.getKey("/Prev").getIntValue(); | 1172 | + return dict.getKey("/Prev").getIntValue(); |
| 1140 | } else { | 1173 | } else { |
| 1141 | - xref_offset = 0; | 1174 | + return 0; |
| 1142 | } | 1175 | } |
| 1143 | - | ||
| 1144 | - return xref_offset; | ||
| 1145 | } | 1176 | } |
| 1146 | 1177 | ||
| 1147 | void | 1178 | void |