Commit ce5b864c53eee2dccb912b363099409497c08a9c

Authored by m-holger
Committed by GitHub
2 parents 95ef3552 2b0c2da7

Merge pull request #1201 from m-holger/xref_stream

QPDF::processXRefStream
include/qpdf/QPDF.hh
@@ -1028,6 +1028,14 @@ class QPDF @@ -1028,6 +1028,14 @@ class QPDF
1028 qpdf_offset_t read_xrefTable(qpdf_offset_t offset); 1028 qpdf_offset_t read_xrefTable(qpdf_offset_t offset);
1029 qpdf_offset_t read_xrefStream(qpdf_offset_t offset); 1029 qpdf_offset_t read_xrefStream(qpdf_offset_t offset);
1030 qpdf_offset_t processXRefStream(qpdf_offset_t offset, QPDFObjectHandle& xref_stream); 1030 qpdf_offset_t processXRefStream(qpdf_offset_t offset, QPDFObjectHandle& xref_stream);
  1031 + std::pair<int, std::array<int, 3>>
  1032 + processXRefW(QPDFObjectHandle& dict, std::function<QPDFExc(std::string_view)> damaged);
  1033 + int processXRefSize(
  1034 + QPDFObjectHandle& dict, int entry_size, std::function<QPDFExc(std::string_view)> damaged);
  1035 + std::pair<int, std::vector<std::pair<int, int>>> processXRefIndex(
  1036 + QPDFObjectHandle& dict,
  1037 + int max_num_entries,
  1038 + std::function<QPDFExc(std::string_view)> damaged);
1031 void insertXrefEntry(int obj, int f0, qpdf_offset_t f1, int f2); 1039 void insertXrefEntry(int obj, int f0, qpdf_offset_t f1, int f2);
1032 void insertFreeXrefEntry(QPDFObjGen); 1040 void insertFreeXrefEntry(QPDFObjGen);
1033 void insertReconstructedXrefEntry(int obj, qpdf_offset_t f1, int f2); 1041 void insertReconstructedXrefEntry(int obj, qpdf_offset_t f1, int f2);
libqpdf/QPDF.cc
@@ -2,6 +2,7 @@ @@ -2,6 +2,7 @@
2 2
3 #include <qpdf/QPDF.hh> 3 #include <qpdf/QPDF.hh>
4 4
  5 +#include <array>
5 #include <atomic> 6 #include <atomic>
6 #include <cstring> 7 #include <cstring>
7 #include <limits> 8 #include <limits>
@@ -968,95 +969,144 @@ QPDF::read_xrefStream(qpdf_offset_t xref_offset) @@ -968,95 +969,144 @@ QPDF::read_xrefStream(qpdf_offset_t xref_offset)
968 return 0; // unreachable 969 return 0; // unreachable
969 } 970 }
970 971
971 -qpdf_offset_t  
972 -QPDF::processXRefStream(qpdf_offset_t xref_offset, QPDFObjectHandle& xref_obj) 972 +// Return the entry size of the xref stream and the processed W array.
  973 +std::pair<int, std::array<int, 3>>
  974 +QPDF::processXRefW(QPDFObjectHandle& dict, std::function<QPDFExc(std::string_view)> damaged)
973 { 975 {
974 - QPDFObjectHandle dict = xref_obj.getDict();  
975 - QPDFObjectHandle W_obj = dict.getKey("/W");  
976 - QPDFObjectHandle Index_obj = dict.getKey("/Index"); 976 + auto W_obj = dict.getKey("/W");
977 if (!(W_obj.isArray() && (W_obj.getArrayNItems() >= 3) && W_obj.getArrayItem(0).isInteger() && 977 if (!(W_obj.isArray() && (W_obj.getArrayNItems() >= 3) && W_obj.getArrayItem(0).isInteger() &&
978 - W_obj.getArrayItem(1).isInteger() && W_obj.getArrayItem(2).isInteger() &&  
979 - dict.getKey("/Size").isInteger() && (Index_obj.isArray() || Index_obj.isNull()))) {  
980 - throw damagedPDF(  
981 - "xref stream",  
982 - xref_offset,  
983 - "Cross-reference stream does not have proper /W and /Index keys"); 978 + W_obj.getArrayItem(1).isInteger() && W_obj.getArrayItem(2).isInteger())) {
  979 + throw damaged("Cross-reference stream does not have a proper /W key");
984 } 980 }
985 981
986 - int W[3];  
987 - size_t entry_size = 0; 982 + std::array<int, 3> W;
  983 + int entry_size = 0;
  984 + auto w_vector = W_obj.getArrayAsVector();
988 int max_bytes = sizeof(qpdf_offset_t); 985 int max_bytes = sizeof(qpdf_offset_t);
989 - for (int i = 0; i < 3; ++i) {  
990 - W[i] = W_obj.getArrayItem(i).getIntValueAsInt(); 986 + for (size_t i = 0; i < 3; ++i) {
  987 + W[i] = w_vector[i].getIntValueAsInt();
991 if (W[i] > max_bytes) { 988 if (W[i] > max_bytes) {
992 - throw damagedPDF(  
993 - "xref stream",  
994 - xref_offset,  
995 - "Cross-reference stream's /W contains impossibly large values"); 989 + throw damaged("Cross-reference stream's /W contains impossibly large values");
  990 + }
  991 + if (W[i] < 0) {
  992 + throw damaged("Cross-reference stream's /W contains negative values");
996 } 993 }
997 - entry_size += toS(W[i]); 994 + entry_size += W[i];
998 } 995 }
999 if (entry_size == 0) { 996 if (entry_size == 0) {
1000 - throw damagedPDF(  
1001 - "xref stream", xref_offset, "Cross-reference stream's /W indicates entry size of 0"); 997 + throw damaged("Cross-reference stream's /W indicates entry size of 0");
  998 + }
  999 + return {entry_size, W};
  1000 +}
  1001 +
  1002 +// Validate Size key and return the maximum number of entries that the xref stream can contain.
  1003 +int
  1004 +QPDF::processXRefSize(
  1005 + QPDFObjectHandle& dict, int entry_size, std::function<QPDFExc(std::string_view)> damaged)
  1006 +{
  1007 + // Number of entries is limited by the highest possible object id and stream size.
  1008 + auto max_num_entries = std::numeric_limits<int>::max();
  1009 + if (max_num_entries > (std::numeric_limits<qpdf_offset_t>::max() / entry_size)) {
  1010 + max_num_entries = toI(std::numeric_limits<qpdf_offset_t>::max() / entry_size);
  1011 + }
  1012 +
  1013 + auto Size_obj = dict.getKey("/Size");
  1014 + long long size;
  1015 + if (!dict.getKey("/Size").getValueAsInt(size)) {
  1016 + throw damaged("Cross-reference stream does not have a proper /Size key");
  1017 + } else if (size < 0) {
  1018 + throw damaged("Cross-reference stream has a negative /Size key");
  1019 + } else if (size >= max_num_entries) {
  1020 + throw damaged("Cross-reference stream has an impossibly large /Size key");
1002 } 1021 }
1003 - unsigned long long max_num_entries = static_cast<unsigned long long>(-1) / entry_size; 1022 + // We are not validating that Size <= (Size key of parent xref / trailer).
  1023 + return max_num_entries;
  1024 +}
  1025 +
  1026 +// Return the number of entries of the xref stream and the processed Index array.
  1027 +std::pair<int, std::vector<std::pair<int, int>>>
  1028 +QPDF::processXRefIndex(
  1029 + QPDFObjectHandle& dict, int max_num_entries, std::function<QPDFExc(std::string_view)> damaged)
  1030 +{
  1031 + auto size = dict.getKey("/Size").getIntValueAsInt();
  1032 + auto Index_obj = dict.getKey("/Index");
1004 1033
1005 - std::vector<long long> indx;  
1006 if (Index_obj.isArray()) { 1034 if (Index_obj.isArray()) {
1007 - int n_index = Index_obj.getArrayNItems();  
1008 - if ((n_index % 2) || (n_index < 2)) {  
1009 - throw damagedPDF(  
1010 - "xref stream",  
1011 - xref_offset,  
1012 - "Cross-reference stream's /Index has an invalid number of "  
1013 - "values");  
1014 - }  
1015 - for (int i = 0; i < n_index; ++i) {  
1016 - if (Index_obj.getArrayItem(i).isInteger()) {  
1017 - indx.push_back(Index_obj.getArrayItem(i).getIntValue()); 1035 + std::vector<std::pair<int, int>> indx;
  1036 + int num_entries = 0;
  1037 + auto index_vec = Index_obj.getArrayAsVector();
  1038 + if ((index_vec.size() % 2) || index_vec.size() < 2) {
  1039 + throw damaged("Cross-reference stream's /Index has an invalid number of values");
  1040 + }
  1041 +
  1042 + int i = 0;
  1043 + long long first = 0;
  1044 + for (auto& val: index_vec) {
  1045 + if (val.isInteger()) {
  1046 + if (i % 2) {
  1047 + auto count = val.getIntValue();
  1048 + // We are guarding against the possibility of num_entries * entry_size
  1049 + // overflowing. We are not checking that entries are in ascending order as
  1050 + // required by the spec, which probably should generate a warning. We are also
  1051 + // not checking that for each subsection first object number + number of entries
  1052 + // <= /Size. The spec requires us to ignore object number > /Size.
  1053 + if (first > (max_num_entries - count) ||
  1054 + count > (max_num_entries - num_entries)) {
  1055 + throw damaged(
  1056 + "Cross-reference stream claims to contain too many entries: " +
  1057 + std::to_string(first) + " " + std::to_string(max_num_entries) + " " +
  1058 + std::to_string(num_entries));
  1059 + }
  1060 + indx.emplace_back(static_cast<int>(first), static_cast<int>(count));
  1061 + num_entries += static_cast<int>(count);
  1062 + } else {
  1063 + first = val.getIntValue();
  1064 + if (first < 0) {
  1065 + throw damaged(
  1066 + "Cross-reference stream's /Index contains a negative object id");
  1067 + } else if (first > max_num_entries) {
  1068 + throw damaged("Cross-reference stream's /Index contains an impossibly "
  1069 + "large object id");
  1070 + }
  1071 + }
1018 } else { 1072 } else {
1019 - throw damagedPDF(  
1020 - "xref stream",  
1021 - xref_offset,  
1022 - ("Cross-reference stream's /Index's item " + std::to_string(i) +  
1023 - " is not an integer")); 1073 + throw damaged(
  1074 + "Cross-reference stream's /Index's item " + std::to_string(i) +
  1075 + " is not an integer");
1024 } 1076 }
  1077 + i++;
1025 } 1078 }
1026 - QTC::TC("qpdf", "QPDF xref /Index is array", n_index == 2 ? 0 : 1);  
1027 - } else { 1079 + QTC::TC("qpdf", "QPDF xref /Index is array", index_vec.size() == 2 ? 0 : 1);
  1080 + return {num_entries, indx};
  1081 + } else if (Index_obj.isNull()) {
1028 QTC::TC("qpdf", "QPDF xref /Index is null"); 1082 QTC::TC("qpdf", "QPDF xref /Index is null");
1029 - long long size = dict.getKey("/Size").getIntValue();  
1030 - indx.push_back(0);  
1031 - indx.push_back(size); 1083 + return {size, {{0, size}}};
  1084 + } else {
  1085 + throw damaged("Cross-reference stream does not have a proper /Index key");
1032 } 1086 }
  1087 +}
1033 1088
1034 - size_t num_entries = 0;  
1035 - for (size_t i = 1; i < indx.size(); i += 2) {  
1036 - if (indx.at(i) > QIntC::to_longlong(max_num_entries - num_entries)) {  
1037 - throw damagedPDF(  
1038 - "xref stream",  
1039 - xref_offset,  
1040 - ("Cross-reference stream claims to contain too many entries: " +  
1041 - std::to_string(indx.at(i)) + " " + std::to_string(max_num_entries) + " " +  
1042 - std::to_string(num_entries)));  
1043 - }  
1044 - num_entries += toS(indx.at(i));  
1045 - } 1089 +qpdf_offset_t
  1090 +QPDF::processXRefStream(qpdf_offset_t xref_offset, QPDFObjectHandle& xref_obj)
  1091 +{
  1092 + auto damaged = [this, xref_offset](std::string_view msg) -> QPDFExc {
  1093 + return damagedPDF("xref stream", xref_offset, msg.data());
  1094 + };
  1095 +
  1096 + auto dict = xref_obj.getDict();
1046 1097
1047 - // entry_size and num_entries have both been validated to ensure that this multiplication does  
1048 - // not cause an overflow.  
1049 - size_t expected_size = entry_size * num_entries; 1098 + auto [entry_size, W] = processXRefW(dict, damaged);
  1099 + int max_num_entries = processXRefSize(dict, entry_size, damaged);
  1100 + auto [num_entries, indx] = processXRefIndex(dict, max_num_entries, damaged);
1050 1101
1051 std::shared_ptr<Buffer> bp = xref_obj.getStreamData(qpdf_dl_specialized); 1102 std::shared_ptr<Buffer> bp = xref_obj.getStreamData(qpdf_dl_specialized);
1052 size_t actual_size = bp->getSize(); 1103 size_t actual_size = bp->getSize();
  1104 + auto expected_size = toS(entry_size) * toS(num_entries);
1053 1105
1054 if (expected_size != actual_size) { 1106 if (expected_size != actual_size) {
1055 - QPDFExc x = damagedPDF(  
1056 - "xref stream",  
1057 - xref_offset,  
1058 - ("Cross-reference stream data has the wrong size; expected = " +  
1059 - std::to_string(expected_size) + "; actual = " + std::to_string(actual_size))); 1107 + QPDFExc x = damaged(
  1108 + "Cross-reference stream data has the wrong size; expected = " +
  1109 + std::to_string(expected_size) + "; actual = " + std::to_string(actual_size));
1060 if (expected_size > actual_size) { 1110 if (expected_size > actual_size) {
1061 throw x; 1111 throw x;
1062 } else { 1112 } else {
@@ -1064,65 +1114,48 @@ QPDF::processXRefStream(qpdf_offset_t xref_offset, QPDFObjectHandle& xref_obj) @@ -1064,65 +1114,48 @@ QPDF::processXRefStream(qpdf_offset_t xref_offset, QPDFObjectHandle& xref_obj)
1064 } 1114 }
1065 } 1115 }
1066 1116
1067 - size_t cur_chunk = 0;  
1068 - int chunk_count = 0;  
1069 -  
1070 bool saw_first_compressed_object = false; 1117 bool saw_first_compressed_object = false;
1071 1118
1072 // Actual size vs. expected size check above ensures that we will not overflow any buffers here. 1119 // Actual size vs. expected size check above ensures that we will not overflow any buffers here.
1073 - // We know that entry_size * num_entries is equal to the size of the buffer.  
1074 - unsigned char const* data = bp->getBuffer();  
1075 - for (size_t i = 0; i < num_entries; ++i) {  
1076 - // Read this entry  
1077 - unsigned char const* entry = data + (entry_size * i);  
1078 - qpdf_offset_t fields[3];  
1079 - unsigned char const* p = entry;  
1080 - for (int j = 0; j < 3; ++j) {  
1081 - fields[j] = 0;  
1082 - if ((j == 0) && (W[0] == 0)) { 1120 + // We know that entry_size * num_entries is less or equal to the size of the buffer.
  1121 + auto p = bp->getBuffer();
  1122 + for (auto [obj, sec_entries]: indx) {
  1123 + // Process a subsection.
  1124 + for (int i = 0; i < sec_entries; ++i) {
  1125 + // Read this entry
  1126 + std::array<qpdf_offset_t, 3> fields{};
  1127 + if (W[0] == 0) {
1083 QTC::TC("qpdf", "QPDF default for xref stream field 0"); 1128 QTC::TC("qpdf", "QPDF default for xref stream field 0");
1084 fields[0] = 1; 1129 fields[0] = 1;
1085 } 1130 }
1086 - for (int k = 0; k < W[j]; ++k) {  
1087 - fields[j] <<= 8;  
1088 - fields[j] += toI(*p++); 1131 + for (size_t j = 0; j < 3; ++j) {
  1132 + for (int k = 0; k < W[j]; ++k) {
  1133 + fields[j] <<= 8;
  1134 + fields[j] |= *p++;
  1135 + }
1089 } 1136 }
1090 - }  
1091 1137
1092 - // Get the object and generation number. The object number is based on /Index. The  
1093 - // generation number is 0 unless this is an uncompressed object record, in which case the  
1094 - // generation number appears as the third field.  
1095 - int obj = toI(indx.at(cur_chunk));  
1096 - if ((obj < 0) || ((std::numeric_limits<int>::max() - obj) < chunk_count)) {  
1097 - std::ostringstream msg;  
1098 - msg.imbue(std::locale::classic());  
1099 - msg << "adding " << chunk_count << " to " << obj  
1100 - << " while computing index in xref stream would cause an integer overflow";  
1101 - throw std::range_error(msg.str());  
1102 - }  
1103 - obj += chunk_count;  
1104 - ++chunk_count;  
1105 - if (chunk_count >= indx.at(cur_chunk + 1)) {  
1106 - cur_chunk += 2;  
1107 - chunk_count = 0;  
1108 - }  
1109 -  
1110 - if (saw_first_compressed_object) {  
1111 - if (fields[0] != 2) {  
1112 - m->uncompressed_after_compressed = true; 1138 + // Get the generation number. The generation number is 0 unless this is an uncompressed
  1139 + // object record, in which case the generation number appears as the third field.
  1140 + if (saw_first_compressed_object) {
  1141 + if (fields[0] != 2) {
  1142 + m->uncompressed_after_compressed = true;
  1143 + }
  1144 + } else if (fields[0] == 2) {
  1145 + saw_first_compressed_object = true;
1113 } 1146 }
1114 - } else if (fields[0] == 2) {  
1115 - saw_first_compressed_object = true;  
1116 - }  
1117 - if (obj == 0) {  
1118 - // This is needed by checkLinearization()  
1119 - m->first_xref_item_offset = xref_offset;  
1120 - } else if (fields[0] == 0) {  
1121 - // Ignore fields[2], which we don't care about in this case. This works around the issue  
1122 - // of some PDF files that put invalid values, like -1, here for deleted objects.  
1123 - insertFreeXrefEntry(QPDFObjGen(obj, 0));  
1124 - } else {  
1125 - insertXrefEntry(obj, toI(fields[0]), fields[1], toI(fields[2])); 1147 + if (obj == 0) {
  1148 + // This is needed by checkLinearization()
  1149 + m->first_xref_item_offset = xref_offset;
  1150 + } else if (fields[0] == 0) {
  1151 + // Ignore fields[2], which we don't care about in this case. This works around the
  1152 + // issue of some PDF files that put invalid values, like -1, here for deleted
  1153 + // objects.
  1154 + insertFreeXrefEntry(QPDFObjGen(obj, 0));
  1155 + } else {
  1156 + insertXrefEntry(obj, toI(fields[0]), fields[1], toI(fields[2]));
  1157 + }
  1158 + ++obj;
1126 } 1159 }
1127 } 1160 }
1128 1161
@@ -1136,12 +1169,10 @@ QPDF::processXRefStream(qpdf_offset_t xref_offset, QPDFObjectHandle& xref_obj) @@ -1136,12 +1169,10 @@ QPDF::processXRefStream(qpdf_offset_t xref_offset, QPDFObjectHandle& xref_obj)
1136 "xref stream", "/Prev key in xref stream dictionary is not an integer"); 1169 "xref stream", "/Prev key in xref stream dictionary is not an integer");
1137 } 1170 }
1138 QTC::TC("qpdf", "QPDF prev key in xref stream dictionary"); 1171 QTC::TC("qpdf", "QPDF prev key in xref stream dictionary");
1139 - xref_offset = dict.getKey("/Prev").getIntValue(); 1172 + return dict.getKey("/Prev").getIntValue();
1140 } else { 1173 } else {
1141 - xref_offset = 0; 1174 + return 0;
1142 } 1175 }
1143 -  
1144 - return xref_offset;  
1145 } 1176 }
1146 1177
1147 void 1178 void