diff --git a/libqpdf/CMakeLists.txt b/libqpdf/CMakeLists.txt index 0dcaf87..d8f15ca 100644 --- a/libqpdf/CMakeLists.txt +++ b/libqpdf/CMakeLists.txt @@ -93,7 +93,6 @@ set(libqpdf_SOURCES QPDF_json.cc QPDF_linearization.cc QPDF_objects.cc - QPDF_optimization.cc QPDF_pages.cc QTC.cc QUtil.cc diff --git a/libqpdf/QPDF_linearization.cc b/libqpdf/QPDF_linearization.cc index b549b37..9313d43 100644 --- a/libqpdf/QPDF_linearization.cc +++ b/libqpdf/QPDF_linearization.cc @@ -69,6 +69,293 @@ load_vector_vector( bit_stream.skipToNextByte(); } +QPDF::ObjUser::ObjUser(user_e type) : + ou_type(type) +{ + qpdf_expect(type == ou_root); +} + +QPDF::ObjUser::ObjUser(user_e type, size_t pageno) : + ou_type(type), + pageno(pageno) +{ + qpdf_expect(type == ou_page || type == ou_thumb); +} + +QPDF::ObjUser::ObjUser(user_e type, std::string const& key) : + ou_type(type), + key(key) +{ + qpdf_expect(type == ou_trailer_key || type == ou_root_key); +} + +bool +QPDF::ObjUser::operator<(ObjUser const& rhs) const +{ + if (ou_type < rhs.ou_type) { + return true; + } + if (ou_type == rhs.ou_type) { + if (pageno < rhs.pageno) { + return true; + } + if (pageno == rhs.pageno) { + return key < rhs.key; + } + } + return false; +} + +QPDF::UpdateObjectMapsFrame::UpdateObjectMapsFrame( + QPDF::ObjUser const& ou, QPDFObjectHandle oh, bool top) : + ou(ou), + oh(oh), + top(top) +{ +} + +void +QPDF::optimize( + std::map const& object_stream_data, + bool allow_changes, + std::function skip_stream_parameters) +{ + m->lin.optimize_internal(object_stream_data, allow_changes, skip_stream_parameters); +} + +void +Lin::optimize( + QPDFWriter::ObjTable const& obj, std::function skip_stream_parameters) +{ + optimize_internal(obj, true, skip_stream_parameters); +} + +template +void +Lin::optimize_internal( + T const& object_stream_data, + bool allow_changes, + std::function skip_stream_parameters) +{ + if (!m->obj_user_to_objects.empty()) { + // already optimized + return; + } + + // The PDF specification indicates that /Outlines is supposed to be an indirect reference. Force + // it to be so if it exists and is direct. (This has been seen in the wild.) + QPDFObjectHandle root = qpdf.getRoot(); + if (root.getKey("/Outlines").isDictionary()) { + QPDFObjectHandle outlines = root.getKey("/Outlines"); + if (!outlines.isIndirect()) { + root.replaceKey("/Outlines", qpdf.makeIndirectObject(outlines)); + } + } + + // Traverse pages tree pushing all inherited resources down to the page level. This also + // initializes m->all_pages. + m->pages.pushInheritedAttributesToPage(allow_changes, false); + // Traverse pages + + size_t n = 0; + for (auto const& page: m->pages) { + updateObjectMaps(ObjUser(ObjUser::ou_page, n), page, skip_stream_parameters); + ++n; + } + + // Traverse document-level items + for (auto const& [key, value]: m->trailer.as_dictionary()) { + if (key == "/Root") { + // handled separately + } else { + if (!value.null()) { + updateObjectMaps( + ObjUser(ObjUser::ou_trailer_key, key), value, skip_stream_parameters); + } + } + } + + for (auto const& [key, value]: root.as_dictionary()) { + // Technically, /I keys from /Thread dictionaries are supposed to be handled separately, but + // we are going to disregard that specification for now. There is loads of evidence that + // pdlin and Acrobat both disregard things like this from time to time, so this is almost + // certain not to cause any problems. + if (!value.null()) { + updateObjectMaps(ObjUser(ObjUser::ou_root_key, key), value, skip_stream_parameters); + } + } + + ObjUser root_ou = ObjUser(ObjUser::ou_root); + auto root_og = QPDFObjGen(root.getObjGen()); + m->obj_user_to_objects[root_ou].insert(root_og); + m->object_to_obj_users[root_og].insert(root_ou); + + filterCompressedObjects(object_stream_data); +} + +void +Lin::updateObjectMaps( + ObjUser const& first_ou, + QPDFObjectHandle first_oh, + std::function skip_stream_parameters) +{ + QPDFObjGen::set visited; + std::vector pending; + pending.emplace_back(first_ou, first_oh, true); + // Traverse the object tree from this point taking care to avoid crossing page boundaries. + std::unique_ptr thumb_ou; + while (!pending.empty()) { + auto cur = pending.back(); + pending.pop_back(); + + bool is_page_node = false; + + if (cur.oh.isDictionaryOfType("/Page")) { + is_page_node = true; + if (!cur.top) { + continue; + } + } + + if (cur.oh.isIndirect()) { + QPDFObjGen og(cur.oh.getObjGen()); + if (!visited.add(og)) { + QTC::TC("qpdf", "QPDF opt loop detected"); + continue; + } + m->obj_user_to_objects[cur.ou].insert(og); + m->object_to_obj_users[og].insert(cur.ou); + } + + if (cur.oh.isArray()) { + for (auto const& item: cur.oh.as_array()) { + pending.emplace_back(cur.ou, item, false); + } + } else if (cur.oh.isDictionary() || cur.oh.isStream()) { + QPDFObjectHandle dict = cur.oh; + bool is_stream = cur.oh.isStream(); + int ssp = 0; + if (is_stream) { + dict = cur.oh.getDict(); + if (skip_stream_parameters) { + ssp = skip_stream_parameters(cur.oh); + } + } + + for (auto& [key, value]: dict.as_dictionary()) { + if (value.null()) { + continue; + } + + if (is_page_node && (key == "/Thumb")) { + // Traverse page thumbnail dictionaries as a special case. There can only ever + // be one /Thumb key on a page, and we see at most one page node per call. + thumb_ou = std::make_unique(ObjUser::ou_thumb, cur.ou.pageno); + pending.emplace_back(*thumb_ou, dict.getKey(key), false); + } else if (is_page_node && (key == "/Parent")) { + // Don't traverse back up the page tree + } else if ( + ((ssp >= 1) && (key == "/Length")) || + ((ssp >= 2) && ((key == "/Filter") || (key == "/DecodeParms")))) { + // Don't traverse into stream parameters that we are not going to write. + } else { + pending.emplace_back(cur.ou, value, false); + } + } + } + } +} + +void +Lin::filterCompressedObjects(std::map const& object_stream_data) +{ + if (object_stream_data.empty()) { + return; + } + + // Transform object_to_obj_users and obj_user_to_objects so that they refer only to uncompressed + // objects. If something is a user of a compressed object, then it is really a user of the + // object stream that contains it. + + std::map> t_obj_user_to_objects; + std::map> t_object_to_obj_users; + + for (auto const& i1: m->obj_user_to_objects) { + ObjUser const& ou = i1.first; + // Loop over objects. + for (auto const& og: i1.second) { + auto i2 = object_stream_data.find(og.getObj()); + if (i2 == object_stream_data.end()) { + t_obj_user_to_objects[ou].insert(og); + } else { + t_obj_user_to_objects[ou].insert(QPDFObjGen(i2->second, 0)); + } + } + } + + for (auto const& i1: m->object_to_obj_users) { + QPDFObjGen const& og = i1.first; + // Loop over obj_users. + for (auto const& ou: i1.second) { + auto i2 = object_stream_data.find(og.getObj()); + if (i2 == object_stream_data.end()) { + t_object_to_obj_users[og].insert(ou); + } else { + t_object_to_obj_users[QPDFObjGen(i2->second, 0)].insert(ou); + } + } + } + + m->obj_user_to_objects = t_obj_user_to_objects; + m->object_to_obj_users = t_object_to_obj_users; +} + +void +Lin::filterCompressedObjects(QPDFWriter::ObjTable const& obj) +{ + if (obj.getStreamsEmpty()) { + return; + } + + // Transform object_to_obj_users and obj_user_to_objects so that they refer only to uncompressed + // objects. If something is a user of a compressed object, then it is really a user of the + // object stream that contains it. + + std::map> t_obj_user_to_objects; + std::map> t_object_to_obj_users; + + for (auto const& i1: m->obj_user_to_objects) { + ObjUser const& ou = i1.first; + // Loop over objects. + for (auto const& og: i1.second) { + if (obj.contains(og)) { + if (auto const& i2 = obj[og].object_stream; i2 <= 0) { + t_obj_user_to_objects[ou].insert(og); + } else { + t_obj_user_to_objects[ou].insert(QPDFObjGen(i2, 0)); + } + } + } + } + + for (auto const& i1: m->object_to_obj_users) { + QPDFObjGen const& og = i1.first; + if (obj.contains(og)) { + // Loop over obj_users. + for (auto const& ou: i1.second) { + if (auto i2 = obj[og].object_stream; i2 <= 0) { + t_object_to_obj_users[og].insert(ou); + } else { + t_object_to_obj_users[QPDFObjGen(i2, 0)].insert(ou); + } + } + } + } + + m->obj_user_to_objects = t_obj_user_to_objects; + m->object_to_obj_users = t_object_to_obj_users; +} + void Lin::linearizationWarning(std::string_view msg) { diff --git a/libqpdf/QPDF_optimization.cc b/libqpdf/QPDF_optimization.cc deleted file mode 100644 index b9c8bd7..0000000 --- a/libqpdf/QPDF_optimization.cc +++ /dev/null @@ -1,298 +0,0 @@ -// See the "Optimization" section of the manual. - -#include - -#include -#include -#include -#include - -using Lin = QPDF::Doc::Linearization; -using Pages = QPDF::Doc::Pages; - -QPDF::ObjUser::ObjUser(user_e type) : - ou_type(type) -{ - qpdf_assert_debug(type == ou_root); -} - -QPDF::ObjUser::ObjUser(user_e type, size_t pageno) : - ou_type(type), - pageno(pageno) -{ - qpdf_assert_debug((type == ou_page) || (type == ou_thumb)); -} - -QPDF::ObjUser::ObjUser(user_e type, std::string const& key) : - ou_type(type), - key(key) -{ - qpdf_assert_debug((type == ou_trailer_key) || (type == ou_root_key)); -} - -bool -QPDF::ObjUser::operator<(ObjUser const& rhs) const -{ - if (ou_type < rhs.ou_type) { - return true; - } - if (ou_type == rhs.ou_type) { - if (pageno < rhs.pageno) { - return true; - } - if (pageno == rhs.pageno) { - return key < rhs.key; - } - } - return false; -} - -QPDF::UpdateObjectMapsFrame::UpdateObjectMapsFrame( - QPDF::ObjUser const& ou, QPDFObjectHandle oh, bool top) : - ou(ou), - oh(oh), - top(top) -{ -} - -void -QPDF::optimize( - std::map const& object_stream_data, - bool allow_changes, - std::function skip_stream_parameters) -{ - m->lin.optimize_internal(object_stream_data, allow_changes, skip_stream_parameters); -} - -void -Lin::optimize( - QPDFWriter::ObjTable const& obj, std::function skip_stream_parameters) -{ - optimize_internal(obj, true, skip_stream_parameters); -} - -template -void -Lin::optimize_internal( - T const& object_stream_data, - bool allow_changes, - std::function skip_stream_parameters) -{ - if (!m->obj_user_to_objects.empty()) { - // already optimized - return; - } - - // The PDF specification indicates that /Outlines is supposed to be an indirect reference. Force - // it to be so if it exists and is direct. (This has been seen in the wild.) - QPDFObjectHandle root = qpdf.getRoot(); - if (root.getKey("/Outlines").isDictionary()) { - QPDFObjectHandle outlines = root.getKey("/Outlines"); - if (!outlines.isIndirect()) { - root.replaceKey("/Outlines", qpdf.makeIndirectObject(outlines)); - } - } - - // Traverse pages tree pushing all inherited resources down to the page level. This also - // initializes m->all_pages. - m->pages.pushInheritedAttributesToPage(allow_changes, false); - // Traverse pages - - size_t n = 0; - for (auto const& page: m->pages) { - updateObjectMaps(ObjUser(ObjUser::ou_page, n), page, skip_stream_parameters); - ++n; - } - - // Traverse document-level items - for (auto const& [key, value]: m->trailer.as_dictionary()) { - if (key == "/Root") { - // handled separately - } else { - if (!value.null()) { - updateObjectMaps( - ObjUser(ObjUser::ou_trailer_key, key), value, skip_stream_parameters); - } - } - } - - for (auto const& [key, value]: root.as_dictionary()) { - // Technically, /I keys from /Thread dictionaries are supposed to be handled separately, but - // we are going to disregard that specification for now. There is loads of evidence that - // pdlin and Acrobat both disregard things like this from time to time, so this is almost - // certain not to cause any problems. - if (!value.null()) { - updateObjectMaps(ObjUser(ObjUser::ou_root_key, key), value, skip_stream_parameters); - } - } - - ObjUser root_ou = ObjUser(ObjUser::ou_root); - auto root_og = QPDFObjGen(root.getObjGen()); - m->obj_user_to_objects[root_ou].insert(root_og); - m->object_to_obj_users[root_og].insert(root_ou); - - filterCompressedObjects(object_stream_data); -} - -void -Lin::updateObjectMaps( - ObjUser const& first_ou, - QPDFObjectHandle first_oh, - std::function skip_stream_parameters) -{ - QPDFObjGen::set visited; - std::vector pending; - pending.emplace_back(first_ou, first_oh, true); - // Traverse the object tree from this point taking care to avoid crossing page boundaries. - std::unique_ptr thumb_ou; - while (!pending.empty()) { - auto cur = pending.back(); - pending.pop_back(); - - bool is_page_node = false; - - if (cur.oh.isDictionaryOfType("/Page")) { - is_page_node = true; - if (!cur.top) { - continue; - } - } - - if (cur.oh.isIndirect()) { - QPDFObjGen og(cur.oh.getObjGen()); - if (!visited.add(og)) { - QTC::TC("qpdf", "QPDF opt loop detected"); - continue; - } - m->obj_user_to_objects[cur.ou].insert(og); - m->object_to_obj_users[og].insert(cur.ou); - } - - if (cur.oh.isArray()) { - for (auto const& item: cur.oh.as_array()) { - pending.emplace_back(cur.ou, item, false); - } - } else if (cur.oh.isDictionary() || cur.oh.isStream()) { - QPDFObjectHandle dict = cur.oh; - bool is_stream = cur.oh.isStream(); - int ssp = 0; - if (is_stream) { - dict = cur.oh.getDict(); - if (skip_stream_parameters) { - ssp = skip_stream_parameters(cur.oh); - } - } - - for (auto& [key, value]: dict.as_dictionary()) { - if (value.null()) { - continue; - } - - if (is_page_node && (key == "/Thumb")) { - // Traverse page thumbnail dictionaries as a special case. There can only ever - // be one /Thumb key on a page, and we see at most one page node per call. - thumb_ou = std::make_unique(ObjUser::ou_thumb, cur.ou.pageno); - pending.emplace_back(*thumb_ou, dict.getKey(key), false); - } else if (is_page_node && (key == "/Parent")) { - // Don't traverse back up the page tree - } else if ( - ((ssp >= 1) && (key == "/Length")) || - ((ssp >= 2) && ((key == "/Filter") || (key == "/DecodeParms")))) { - // Don't traverse into stream parameters that we are not going to write. - } else { - pending.emplace_back(cur.ou, value, false); - } - } - } - } -} - -void -Lin::filterCompressedObjects(std::map const& object_stream_data) -{ - if (object_stream_data.empty()) { - return; - } - - // Transform object_to_obj_users and obj_user_to_objects so that they refer only to uncompressed - // objects. If something is a user of a compressed object, then it is really a user of the - // object stream that contains it. - - std::map> t_obj_user_to_objects; - std::map> t_object_to_obj_users; - - for (auto const& i1: m->obj_user_to_objects) { - ObjUser const& ou = i1.first; - // Loop over objects. - for (auto const& og: i1.second) { - auto i2 = object_stream_data.find(og.getObj()); - if (i2 == object_stream_data.end()) { - t_obj_user_to_objects[ou].insert(og); - } else { - t_obj_user_to_objects[ou].insert(QPDFObjGen(i2->second, 0)); - } - } - } - - for (auto const& i1: m->object_to_obj_users) { - QPDFObjGen const& og = i1.first; - // Loop over obj_users. - for (auto const& ou: i1.second) { - auto i2 = object_stream_data.find(og.getObj()); - if (i2 == object_stream_data.end()) { - t_object_to_obj_users[og].insert(ou); - } else { - t_object_to_obj_users[QPDFObjGen(i2->second, 0)].insert(ou); - } - } - } - - m->obj_user_to_objects = t_obj_user_to_objects; - m->object_to_obj_users = t_object_to_obj_users; -} - -void -Lin::filterCompressedObjects(QPDFWriter::ObjTable const& obj) -{ - if (obj.getStreamsEmpty()) { - return; - } - - // Transform object_to_obj_users and obj_user_to_objects so that they refer only to uncompressed - // objects. If something is a user of a compressed object, then it is really a user of the - // object stream that contains it. - - std::map> t_obj_user_to_objects; - std::map> t_object_to_obj_users; - - for (auto const& i1: m->obj_user_to_objects) { - ObjUser const& ou = i1.first; - // Loop over objects. - for (auto const& og: i1.second) { - if (obj.contains(og)) { - if (auto const& i2 = obj[og].object_stream; i2 <= 0) { - t_obj_user_to_objects[ou].insert(og); - } else { - t_obj_user_to_objects[ou].insert(QPDFObjGen(i2, 0)); - } - } - } - } - - for (auto const& i1: m->object_to_obj_users) { - QPDFObjGen const& og = i1.first; - if (obj.contains(og)) { - // Loop over obj_users. - for (auto const& ou: i1.second) { - if (auto i2 = obj[og].object_stream; i2 <= 0) { - t_object_to_obj_users[og].insert(ou); - } else { - t_object_to_obj_users[QPDFObjGen(i2, 0)].insert(ou); - } - } - } - } - - m->obj_user_to_objects = t_obj_user_to_objects; - m->object_to_obj_users = t_object_to_obj_users; -}