Commit efb2e8f613511616744eb16aafc8f7138cd821a1

Authored by m-holger
1 parent 4bf09ff1

Revert "Merge pull request #1297 from m-holger/qpdf_objects"

This reverts commit c648b9a018105a3c30b3e7f3c5f8a058d3ddd92c, reversing
changes made to 12b67a3227df6b6df3a4f5f098e11cce173ff7d5.
include/qpdf/QPDF.hh
@@ -391,7 +391,7 @@ class QPDF @@ -391,7 +391,7 @@ class QPDF
391 void replaceObject(int objid, int generation, QPDFObjectHandle); 391 void replaceObject(int objid, int generation, QPDFObjectHandle);
392 392
393 // Swap two objects given by ID. Prior to qpdf 10.2.1, existing QPDFObjectHandle instances that 393 // Swap two objects given by ID. Prior to qpdf 10.2.1, existing QPDFObjectHandle instances that
394 - // reference the objects did not notice the swap, but this was fixed in 10.2.1. 394 + // reference them objects not notice the swap, but this was fixed in 10.2.1.
395 QPDF_DLL 395 QPDF_DLL
396 void swapObjects(QPDFObjGen const& og1, QPDFObjGen const& og2); 396 void swapObjects(QPDFObjGen const& og1, QPDFObjGen const& og2);
397 QPDF_DLL 397 QPDF_DLL
@@ -645,7 +645,7 @@ class QPDF @@ -645,7 +645,7 @@ class QPDF
645 QPDF_DLL 645 QPDF_DLL
646 void fixDanglingReferences(bool force = false); 646 void fixDanglingReferences(bool force = false);
647 647
648 - // Return the approximate number of indirect objects. It is approximate because not all objects 648 + // Return the approximate number of indirect objects. It is/ approximate because not all objects
649 // in the file are preserved in all cases, and gaps in object numbering are not preserved. 649 // in the file are preserved in all cases, and gaps in object numbering are not preserved.
650 QPDF_DLL 650 QPDF_DLL
651 size_t getObjectCount(); 651 size_t getObjectCount();
@@ -730,10 +730,10 @@ class QPDF @@ -730,10 +730,10 @@ class QPDF
730 class Writer; 730 class Writer;
731 class Resolver; 731 class Resolver;
732 class StreamCopier; 732 class StreamCopier;
733 - class Objects;  
734 class ParseGuard; 733 class ParseGuard;
735 class Pipe; 734 class Pipe;
736 class JobSetter; 735 class JobSetter;
  736 + class Xref_table;
737 737
738 // For testing only -- do not add to DLL 738 // For testing only -- do not add to DLL
739 static bool test_json_validators(); 739 static bool test_json_validators();
@@ -748,6 +748,7 @@ class QPDF @@ -748,6 +748,7 @@ class QPDF
748 748
749 static std::string const qpdf_version; 749 static std::string const qpdf_version;
750 750
  751 + class ObjCache;
751 class ObjCopier; 752 class ObjCopier;
752 class EncryptionParameters; 753 class EncryptionParameters;
753 class ForeignStreamData; 754 class ForeignStreamData;
@@ -756,15 +757,36 @@ class QPDF @@ -756,15 +757,36 @@ class QPDF
756 class ResolveRecorder; 757 class ResolveRecorder;
757 class JSONReactor; 758 class JSONReactor;
758 759
759 - inline Objects& objects() noexcept;  
760 - inline Objects const& objects() const noexcept;  
761 void parse(char const* password); 760 void parse(char const* password);
762 void inParse(bool); 761 void inParse(bool);
763 void setLastObjectDescription(std::string const& description, QPDFObjGen const& og); 762 void setLastObjectDescription(std::string const& description, QPDFObjGen const& og);
  763 + QPDFObjectHandle readObject(std::string const& description, QPDFObjGen og);
  764 + void readStream(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset);
  765 + void validateStreamLineEnd(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset);
  766 + QPDFObjectHandle readObjectInStream(std::shared_ptr<InputSource>& input, int obj);
  767 + size_t recoverStreamLength(
  768 + std::shared_ptr<InputSource> input, QPDFObjGen const& og, qpdf_offset_t stream_offset);
764 QPDFTokenizer::Token readToken(InputSource&, size_t max_len = 0); 769 QPDFTokenizer::Token readToken(InputSource&, size_t max_len = 0);
765 770
  771 + QPDFObjectHandle readObjectAtOffset(
  772 + bool attempt_recovery,
  773 + qpdf_offset_t offset,
  774 + std::string const& description,
  775 + QPDFObjGen exp_og,
  776 + QPDFObjGen& og,
  777 + bool skip_cache_if_in_xref);
  778 + QPDFObject* resolve(QPDFObjGen og);
  779 + void resolveObjectsInStream(int obj_stream_number);
766 void stopOnError(std::string const& message); 780 void stopOnError(std::string const& message);
  781 + QPDFObjGen nextObjGen();
767 QPDFObjectHandle newIndirect(QPDFObjGen const&, std::shared_ptr<QPDFObject> const&); 782 QPDFObjectHandle newIndirect(QPDFObjGen const&, std::shared_ptr<QPDFObject> const&);
  783 + QPDFObjectHandle makeIndirectFromQPDFObject(std::shared_ptr<QPDFObject> const& obj);
  784 + bool isCached(QPDFObjGen const& og);
  785 + bool isUnresolved(QPDFObjGen const& og);
  786 + std::shared_ptr<QPDFObject> getObjectForParser(int id, int gen, bool parse_pdf);
  787 + std::shared_ptr<QPDFObject> getObjectForJSON(int id, int gen);
  788 + void removeObject(QPDFObjGen og);
  789 + void updateCache(QPDFObjGen const& og, std::shared_ptr<QPDFObject> const& object);
768 static QPDFExc damagedPDF( 790 static QPDFExc damagedPDF(
769 InputSource& input, 791 InputSource& input,
770 std::string const& object, 792 std::string const& object,
@@ -809,7 +831,8 @@ class QPDF @@ -809,7 +831,8 @@ class QPDF
809 void optimize( 831 void optimize(
810 QPDFWriter::ObjTable const& obj, 832 QPDFWriter::ObjTable const& obj,
811 std::function<int(QPDFObjectHandle&)> skip_stream_parameters); 833 std::function<int(QPDFObjectHandle&)> skip_stream_parameters);
812 - void optimize(Objects const& obj); 834 + void optimize(Xref_table const& obj);
  835 + size_t tableSize();
813 836
814 // Get lists of all objects in order according to the part of a linearized file that they belong 837 // Get lists of all objects in order according to the part of a linearized file that they belong
815 // to. 838 // to.
@@ -829,6 +852,12 @@ class QPDF @@ -829,6 +852,12 @@ class QPDF
829 int& O, 852 int& O,
830 bool compressed); 853 bool compressed);
831 854
  855 + // Get a list of objects that would be permitted in an object stream.
  856 + template <typename T>
  857 + std::vector<T> getCompressibleObjGens();
  858 + std::vector<QPDFObjGen> getCompressibleObjVector();
  859 + std::vector<bool> getCompressibleObjSet();
  860 +
832 // methods to support page handling 861 // methods to support page handling
833 862
834 void getAllPagesInternal( 863 void getAllPagesInternal(
@@ -902,7 +931,7 @@ class QPDF @@ -902,7 +931,7 @@ class QPDF
902 QPDFObjectHandle 931 QPDFObjectHandle
903 getUncompressedObject(QPDFObjectHandle&, std::map<int, int> const& object_stream_data); 932 getUncompressedObject(QPDFObjectHandle&, std::map<int, int> const& object_stream_data);
904 QPDFObjectHandle getUncompressedObject(QPDFObjectHandle&, QPDFWriter::ObjTable const& obj); 933 QPDFObjectHandle getUncompressedObject(QPDFObjectHandle&, QPDFWriter::ObjTable const& obj);
905 - QPDFObjectHandle getUncompressedObject(QPDFObjectHandle&, Objects const& obj); 934 + QPDFObjectHandle getUncompressedObject(QPDFObjectHandle&, Xref_table const& obj);
906 int lengthNextN(int first_object, int n); 935 int lengthNextN(int first_object, int n);
907 void 936 void
908 checkHPageOffset(std::vector<QPDFObjectHandle> const& pages, std::map<int, int>& idx_to_obj); 937 checkHPageOffset(std::vector<QPDFObjectHandle> const& pages, std::map<int, int>& idx_to_obj);
@@ -948,7 +977,7 @@ class QPDF @@ -948,7 +977,7 @@ class QPDF
948 std::function<int(QPDFObjectHandle&)> skip_stream_parameters); 977 std::function<int(QPDFObjectHandle&)> skip_stream_parameters);
949 void filterCompressedObjects(std::map<int, int> const& object_stream_data); 978 void filterCompressedObjects(std::map<int, int> const& object_stream_data);
950 void filterCompressedObjects(QPDFWriter::ObjTable const& object_stream_data); 979 void filterCompressedObjects(QPDFWriter::ObjTable const& object_stream_data);
951 - void filterCompressedObjects(Objects const& object_stream_data); 980 + void filterCompressedObjects(Xref_table const& object_stream_data);
952 981
953 // JSON import 982 // JSON import
954 void importJSON(std::shared_ptr<InputSource>, bool must_be_complete); 983 void importJSON(std::shared_ptr<InputSource>, bool must_be_complete);
libqpdf/CMakeLists.txt
@@ -107,7 +107,6 @@ set(libqpdf_SOURCES @@ -107,7 +107,6 @@ set(libqpdf_SOURCES
107 QPDF_encryption.cc 107 QPDF_encryption.cc
108 QPDF_json.cc 108 QPDF_json.cc
109 QPDF_linearization.cc 109 QPDF_linearization.cc
110 - QPDF_objects.cc  
111 QPDF_optimization.cc 110 QPDF_optimization.cc
112 QPDF_pages.cc 111 QPDF_pages.cc
113 QTC.cc 112 QTC.cc
libqpdf/QPDF.cc
@@ -2,8 +2,10 @@ @@ -2,8 +2,10 @@
2 2
3 #include <qpdf/QPDF_private.hh> 3 #include <qpdf/QPDF_private.hh>
4 4
  5 +#include <array>
5 #include <atomic> 6 #include <atomic>
6 #include <cstring> 7 #include <cstring>
  8 +#include <limits>
7 #include <map> 9 #include <map>
8 #include <regex> 10 #include <regex>
9 #include <sstream> 11 #include <sstream>
@@ -185,7 +187,7 @@ QPDF::Members::Members(QPDF& qpdf) : @@ -185,7 +187,7 @@ QPDF::Members::Members(QPDF& qpdf) :
185 file_sp(new InvalidInputSource(no_input_name)), 187 file_sp(new InvalidInputSource(no_input_name)),
186 file(file_sp.get()), 188 file(file_sp.get()),
187 encp(new EncryptionParameters), 189 encp(new EncryptionParameters),
188 - objects(qpdf, this, file) 190 + xref_table(qpdf, file)
189 { 191 {
190 } 192 }
191 193
@@ -199,7 +201,25 @@ QPDF::QPDF() : @@ -199,7 +201,25 @@ QPDF::QPDF() :
199 m->unique_id = unique_id.fetch_add(1ULL); 201 m->unique_id = unique_id.fetch_add(1ULL);
200 } 202 }
201 203
202 -QPDF::~QPDF() = default; 204 +QPDF::~QPDF()
  205 +{
  206 + // If two objects are mutually referential (through each object having an array or dictionary
  207 + // that contains an indirect reference to the other), the circular references in the
  208 + // std::shared_ptr objects will prevent the objects from being deleted. Walk through all objects
  209 + // in the object cache, which is those objects that we read from the file, and break all
  210 + // resolved indirect references by replacing them with an internal object type representing that
  211 + // they have been destroyed. Note that we can't break references like this at any time when the
  212 + // QPDF object is active. The call to reset also causes all direct QPDFObjectHandle objects that
  213 + // are reachable from this object to release their association with this QPDF. Direct objects
  214 + // are not destroyed since they can be moved to other QPDF objects safely.
  215 +
  216 + for (auto const& iter: m->obj_cache) {
  217 + iter.second.object->disconnect();
  218 + if (iter.second.object->getTypeCode() != ::ot_null) {
  219 + iter.second.object->destroy();
  220 + }
  221 + }
  222 +}
203 223
204 std::shared_ptr<QPDF> 224 std::shared_ptr<QPDF>
205 QPDF::create() 225 QPDF::create()
@@ -260,7 +280,7 @@ QPDF::emptyPDF() @@ -260,7 +280,7 @@ QPDF::emptyPDF()
260 { 280 {
261 m->pdf_version = "1.3"; 281 m->pdf_version = "1.3";
262 m->no_input_name = "empty PDF"; 282 m->no_input_name = "empty PDF";
263 - m->objects.xref_table().initialize_empty(); 283 + m->xref_table.initialize_empty();
264 } 284 }
265 285
266 void 286 void
@@ -273,7 +293,7 @@ QPDF::registerStreamFilter( @@ -273,7 +293,7 @@ QPDF::registerStreamFilter(
273 void 293 void
274 QPDF::setIgnoreXRefStreams(bool val) 294 QPDF::setIgnoreXRefStreams(bool val)
275 { 295 {
276 - m->objects.xref_table().ignore_streams(val); 296 + m->xref_table.ignore_streams(val);
277 } 297 }
278 298
279 std::shared_ptr<QPDFLogger> 299 std::shared_ptr<QPDFLogger>
@@ -311,7 +331,7 @@ void @@ -311,7 +331,7 @@ void
311 QPDF::setAttemptRecovery(bool val) 331 QPDF::setAttemptRecovery(bool val)
312 { 332 {
313 m->attempt_recovery = val; 333 m->attempt_recovery = val;
314 - m->objects.xref_table().attempt_recovery(val); 334 + m->xref_table.attempt_recovery(val);
315 } 335 }
316 336
317 void 337 void
@@ -389,6 +409,17 @@ QPDF::findHeader() @@ -389,6 +409,17 @@ QPDF::findHeader()
389 return valid; 409 return valid;
390 } 410 }
391 411
  412 +bool
  413 +QPDF::findStartxref()
  414 +{
  415 + if (readToken(*m->file).isWord("startxref") && readToken(*m->file).isInteger()) {
  416 + // Position in front of offset token
  417 + m->file->seek(m->file->getLastOffset(), SEEK_SET);
  418 + return true;
  419 + }
  420 + return false;
  421 +}
  422 +
392 void 423 void
393 QPDF::parse(char const* password) 424 QPDF::parse(char const* password)
394 { 425 {
@@ -405,9 +436,9 @@ QPDF::parse(char const* password) @@ -405,9 +436,9 @@ QPDF::parse(char const* password)
405 m->pdf_version = "1.2"; 436 m->pdf_version = "1.2";
406 } 437 }
407 438
408 - m->objects.xref_table().initialize(); 439 + m->xref_table.initialize();
409 initializeEncryption(); 440 initializeEncryption();
410 - if (m->objects.xref_table().size() > 0 && !getRoot().getKey("/Pages").isDictionary()) { 441 + if (m->xref_table.size() > 0 && !getRoot().getKey("/Pages").isDictionary()) {
411 // QPDFs created from JSON have an empty xref table and no root object yet. 442 // QPDFs created from JSON have an empty xref table and no root object yet.
412 throw damagedPDF("", 0, "unable to find page tree"); 443 throw damagedPDF("", 0, "unable to find page tree");
413 } 444 }
@@ -422,35 +453,1084 @@ QPDF::inParse(bool v) @@ -422,35 +453,1084 @@ QPDF::inParse(bool v)
422 throw std::logic_error("QPDF: re-entrant parsing detected. This is a qpdf bug." 453 throw std::logic_error("QPDF: re-entrant parsing detected. This is a qpdf bug."
423 " Please report at https://github.com/qpdf/qpdf/issues."); 454 " Please report at https://github.com/qpdf/qpdf/issues.");
424 } 455 }
425 - m->in_parse = v; 456 + m->in_parse = v;
  457 +}
  458 +
  459 +void
  460 +QPDF::warn(QPDFExc const& e)
  461 +{
  462 + if (m->max_warnings > 0 && m->warnings.size() >= m->max_warnings) {
  463 + stopOnError("Too many warnings - file is too badly damaged");
  464 + }
  465 + m->warnings.push_back(e);
  466 + if (!m->suppress_warnings) {
  467 + *m->log->getWarn() << "WARNING: " << m->warnings.back().what() << "\n";
  468 + }
  469 +}
  470 +
  471 +void
  472 +QPDF::warn(
  473 + qpdf_error_code_e error_code,
  474 + std::string const& object,
  475 + qpdf_offset_t offset,
  476 + std::string const& message)
  477 +{
  478 + warn(QPDFExc(error_code, getFilename(), object, offset, message));
  479 +}
  480 +
  481 +void
  482 +QPDF::Xref_table::initialize_empty()
  483 +{
  484 + initialized_ = true;
  485 + trailer_ = QPDFObjectHandle::newDictionary();
  486 + auto rt = qpdf.makeIndirectObject(QPDFObjectHandle::newDictionary());
  487 + auto pgs = qpdf.makeIndirectObject(QPDFObjectHandle::newDictionary());
  488 + pgs.replaceKey("/Type", QPDFObjectHandle::newName("/Pages"));
  489 + pgs.replaceKey("/Kids", QPDFObjectHandle::newArray());
  490 + pgs.replaceKey("/Count", QPDFObjectHandle::newInteger(0));
  491 + rt.replaceKey("/Type", QPDFObjectHandle::newName("/Catalog"));
  492 + rt.replaceKey("/Pages", pgs);
  493 + trailer_.replaceKey("/Root", rt);
  494 + trailer_.replaceKey("/Size", QPDFObjectHandle::newInteger(3));
  495 +}
  496 +
  497 +void
  498 +QPDF::Xref_table::initialize_json()
  499 +{
  500 + initialized_ = true;
  501 + table.resize(1);
  502 + trailer_ = QPDFObjectHandle::newDictionary();
  503 + trailer_.replaceKey("/Size", QPDFObjectHandle::newInteger(1));
  504 +}
  505 +
  506 +void
  507 +QPDF::Xref_table::initialize()
  508 +{
  509 + // PDF spec says %%EOF must be found within the last 1024 bytes of the file. We add an extra
  510 + // 30 characters to leave room for the startxref stuff.
  511 + file->seek(0, SEEK_END);
  512 + qpdf_offset_t end_offset = file->tell();
  513 + // Sanity check on object ids. All objects must appear in xref table / stream. In all realistic
  514 + // scenarios at least 3 bytes are required.
  515 + if (max_id_ > end_offset / 3) {
  516 + max_id_ = static_cast<int>(end_offset / 3);
  517 + }
  518 + qpdf_offset_t start_offset = (end_offset > 1054 ? end_offset - 1054 : 0);
  519 + PatternFinder sf(qpdf, &QPDF::findStartxref);
  520 + qpdf_offset_t xref_offset = 0;
  521 + if (file->findLast("startxref", start_offset, 0, sf)) {
  522 + xref_offset = QUtil::string_to_ll(read_token().getValue().c_str());
  523 + }
  524 +
  525 + try {
  526 + if (xref_offset == 0) {
  527 + QTC::TC("qpdf", "QPDF can't find startxref");
  528 + throw damaged_pdf("can't find startxref");
  529 + }
  530 + try {
  531 + read(xref_offset);
  532 + } catch (QPDFExc&) {
  533 + throw;
  534 + } catch (std::exception& e) {
  535 + throw damaged_pdf(std::string("error reading xref: ") + e.what());
  536 + }
  537 + } catch (QPDFExc& e) {
  538 + if (attempt_recovery_) {
  539 + reconstruct(e);
  540 + QTC::TC("qpdf", "QPDF reconstructed xref table");
  541 + } else {
  542 + throw;
  543 + }
  544 + }
  545 +
  546 + initialized_ = true;
  547 +}
  548 +
  549 +void
  550 +QPDF::Xref_table::reconstruct(QPDFExc& e)
  551 +{
  552 + if (reconstructed_) {
  553 + // Avoid xref reconstruction infinite loops. This is getting very hard to reproduce because
  554 + // qpdf is throwing many fewer exceptions while parsing. Most situations are warnings now.
  555 + throw e;
  556 + }
  557 +
  558 + // If recovery generates more than 1000 warnings, the file is so severely damaged that there
  559 + // probably is no point trying to continue.
  560 + const auto max_warnings = qpdf.m->warnings.size() + 1000U;
  561 + auto check_warnings = [this, max_warnings]() {
  562 + if (qpdf.m->warnings.size() > max_warnings) {
  563 + throw damaged_pdf("too many errors while reconstructing cross-reference table");
  564 + }
  565 + };
  566 +
  567 + reconstructed_ = true;
  568 + // We may find more objects, which may contain dangling references.
  569 + qpdf.m->fixed_dangling_refs = false;
  570 +
  571 + warn_damaged("file is damaged");
  572 + qpdf.warn(e);
  573 + warn_damaged("Attempting to reconstruct cross-reference table");
  574 +
  575 + // Delete all references to type 1 (uncompressed) objects
  576 + for (auto& iter: table) {
  577 + if (iter.type() == 1) {
  578 + iter = {};
  579 + }
  580 + }
  581 +
  582 + std::vector<std::tuple<int, int, qpdf_offset_t>> objects;
  583 + std::vector<qpdf_offset_t> trailers;
  584 + int max_found = 0;
  585 +
  586 + file->seek(0, SEEK_END);
  587 + qpdf_offset_t eof = file->tell();
  588 + file->seek(0, SEEK_SET);
  589 + // Don't allow very long tokens here during recovery. All the interesting tokens are covered.
  590 + static size_t const MAX_LEN = 10;
  591 + while (file->tell() < eof) {
  592 + QPDFTokenizer::Token t1 = read_token(MAX_LEN);
  593 + qpdf_offset_t token_start = file->tell() - toO(t1.getValue().length());
  594 + if (t1.isInteger()) {
  595 + auto pos = file->tell();
  596 + QPDFTokenizer::Token t2 = read_token(MAX_LEN);
  597 + if (t2.isInteger() && read_token(MAX_LEN).isWord("obj")) {
  598 + int obj = QUtil::string_to_int(t1.getValue().c_str());
  599 + int gen = QUtil::string_to_int(t2.getValue().c_str());
  600 + if (obj <= max_id_) {
  601 + objects.emplace_back(obj, gen, token_start);
  602 + if (obj > max_found) {
  603 + max_found = obj;
  604 + }
  605 + } else {
  606 + warn_damaged("ignoring object with impossibly large id " + std::to_string(obj));
  607 + }
  608 + }
  609 + file->seek(pos, SEEK_SET);
  610 + } else if (!trailer_ && t1.isWord("trailer")) {
  611 + trailers.emplace_back(file->tell());
  612 + }
  613 + file->findAndSkipNextEOL();
  614 + }
  615 +
  616 + table.resize(toS(max_found) + 1);
  617 +
  618 + for (auto tr: trailers) {
  619 + file->seek(tr, SEEK_SET);
  620 + auto t = read_trailer();
  621 + if (!t.isDictionary()) {
  622 + // Oh well. It was worth a try.
  623 + } else {
  624 + trailer_ = t;
  625 + break;
  626 + }
  627 + check_warnings();
  628 + }
  629 +
  630 + auto rend = objects.rend();
  631 + for (auto it = objects.rbegin(); it != rend; it++) {
  632 + auto [obj, gen, token_start] = *it;
  633 + insert(obj, 1, token_start, gen);
  634 + check_warnings();
  635 + }
  636 +
  637 + if (!trailer_) {
  638 + qpdf_offset_t max_offset{0};
  639 + // If there are any xref streams, take the last one to appear.
  640 + int i = -1;
  641 + for (auto const& item: table) {
  642 + ++i;
  643 + if (item.type() != 1) {
  644 + continue;
  645 + }
  646 + auto oh = qpdf.getObject(i, item.gen());
  647 + try {
  648 + if (!oh.isStreamOfType("/XRef")) {
  649 + continue;
  650 + }
  651 + } catch (std::exception&) {
  652 + continue;
  653 + }
  654 + auto offset = item.offset();
  655 + if (offset > max_offset) {
  656 + max_offset = offset;
  657 + trailer_ = oh.getDict();
  658 + }
  659 + check_warnings();
  660 + }
  661 + if (max_offset > 0) {
  662 + try {
  663 + read(max_offset);
  664 + } catch (std::exception&) {
  665 + throw damaged_pdf(
  666 + "error decoding candidate xref stream while recovering damaged file");
  667 + }
  668 + QTC::TC("qpdf", "QPDF recover xref stream");
  669 + }
  670 + }
  671 +
  672 + if (!trailer_) {
  673 + // We could check the last encountered object to see if it was an xref stream. If so, we
  674 + // could try to get the trailer from there. This may make it possible to recover files with
  675 + // bad startxref pointers even when they have object streams.
  676 +
  677 + throw damaged_pdf("unable to find trailer dictionary while recovering damaged file");
  678 + }
  679 + if (table.empty()) {
  680 + // We cannot check for an empty xref table in parse because empty tables are valid when
  681 + // creating QPDF objects from JSON.
  682 + throw damaged_pdf("unable to find objects while recovering damaged file");
  683 + }
  684 + check_warnings();
  685 + if (!initialized_) {
  686 + initialized_ = true;
  687 + qpdf.getAllPages();
  688 + check_warnings();
  689 + if (qpdf.m->all_pages.empty()) {
  690 + initialized_ = false;
  691 + throw damaged_pdf("unable to find any pages while recovering damaged file");
  692 + }
  693 + }
  694 + // We could iterate through the objects looking for streams and try to find objects inside of
  695 + // them, but it's probably not worth the trouble. Acrobat can't recover files with any errors
  696 + // in an xref stream, and this would be a real long shot anyway. If we wanted to do anything
  697 + // that involved looking at stream contents, we'd also have to call initializeEncryption() here.
  698 + // It's safe to call it more than once.
  699 +}
  700 +
  701 +void
  702 +QPDF::Xref_table::read(qpdf_offset_t xref_offset)
  703 +{
  704 + std::map<int, int> free_table;
  705 + std::set<qpdf_offset_t> visited;
  706 + while (xref_offset) {
  707 + visited.insert(xref_offset);
  708 + char buf[7];
  709 + memset(buf, 0, sizeof(buf));
  710 + file->seek(xref_offset, SEEK_SET);
  711 + // Some files miss the mark a little with startxref. We could do a better job of searching
  712 + // in the neighborhood for something that looks like either an xref table or stream, but the
  713 + // simple heuristic of skipping whitespace can help with the xref table case and is harmless
  714 + // with the stream case.
  715 + bool done = false;
  716 + bool skipped_space = false;
  717 + while (!done) {
  718 + char ch;
  719 + if (1 == file->read(&ch, 1)) {
  720 + if (QUtil::is_space(ch)) {
  721 + skipped_space = true;
  722 + } else {
  723 + file->unreadCh(ch);
  724 + done = true;
  725 + }
  726 + } else {
  727 + QTC::TC("qpdf", "QPDF eof skipping spaces before xref", skipped_space ? 0 : 1);
  728 + done = true;
  729 + }
  730 + }
  731 +
  732 + file->read(buf, sizeof(buf) - 1);
  733 + // The PDF spec says xref must be followed by a line terminator, but files exist in the wild
  734 + // where it is terminated by arbitrary whitespace.
  735 + if ((strncmp(buf, "xref", 4) == 0) && QUtil::is_space(buf[4])) {
  736 + if (skipped_space) {
  737 + QTC::TC("qpdf", "QPDF xref skipped space");
  738 + warn_damaged("extraneous whitespace seen before xref");
  739 + }
  740 + QTC::TC(
  741 + "qpdf",
  742 + "QPDF xref space",
  743 + ((buf[4] == '\n') ? 0
  744 + : (buf[4] == '\r') ? 1
  745 + : (buf[4] == ' ') ? 2
  746 + : 9999));
  747 + int skip = 4;
  748 + // buf is null-terminated, and QUtil::is_space('\0') is false, so this won't overrun.
  749 + while (QUtil::is_space(buf[skip])) {
  750 + ++skip;
  751 + }
  752 + xref_offset = process_section(xref_offset + skip);
  753 + } else {
  754 + xref_offset = read_stream(xref_offset);
  755 + }
  756 + if (visited.count(xref_offset) != 0) {
  757 + QTC::TC("qpdf", "QPDF xref loop");
  758 + throw damaged_pdf("loop detected following xref tables");
  759 + }
  760 + }
  761 +
  762 + if (!trailer_) {
  763 + throw damaged_pdf("unable to find trailer while reading xref");
  764 + }
  765 + int size = trailer_.getKey("/Size").getIntValueAsInt();
  766 +
  767 + if (size < 3) {
  768 + throw damaged_pdf("too few objects - file can't have a page tree");
  769 + }
  770 +
  771 + // We are no longer reporting what the highest id in the xref table is. I don't think it adds
  772 + // anything. If we want to report more detail, we should report the total number of missing
  773 + // entries, including missing entries before the last actual entry.
  774 +}
  775 +
  776 +QPDF::Xref_table::Subsection
  777 +QPDF::Xref_table::subsection(std::string const& line)
  778 +{
  779 + auto terminate = [this]() -> void {
  780 + QTC::TC("qpdf", "QPDF invalid xref");
  781 + throw damaged_table("xref syntax invalid");
  782 + };
  783 +
  784 + // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated
  785 + // buffer.
  786 + char const* p = line.c_str();
  787 + char const* start = line.c_str();
  788 +
  789 + // Skip zero or more spaces
  790 + while (QUtil::is_space(*p)) {
  791 + ++p;
  792 + }
  793 + // Require digit
  794 + if (!QUtil::is_digit(*p)) {
  795 + terminate();
  796 + }
  797 + // Gather digits
  798 + std::string obj_str;
  799 + while (QUtil::is_digit(*p)) {
  800 + obj_str.append(1, *p++);
  801 + }
  802 + // Require space
  803 + if (!QUtil::is_space(*p)) {
  804 + terminate();
  805 + }
  806 + // Skip spaces
  807 + while (QUtil::is_space(*p)) {
  808 + ++p;
  809 + }
  810 + // Require digit
  811 + if (!QUtil::is_digit(*p)) {
  812 + terminate();
  813 + }
  814 + // Gather digits
  815 + std::string num_str;
  816 + while (QUtil::is_digit(*p)) {
  817 + num_str.append(1, *p++);
  818 + }
  819 + // Skip any space including line terminators
  820 + while (QUtil::is_space(*p)) {
  821 + ++p;
  822 + }
  823 + auto obj = QUtil::string_to_int(obj_str.c_str());
  824 + auto count = QUtil::string_to_int(num_str.c_str());
  825 + if (obj > max_id() || count > max_id() || (obj + count) > max_id()) {
  826 + throw damaged_table("xref table subsection header contains impossibly large entry");
  827 + }
  828 + return {obj, count, file->getLastOffset() + toI(p - start)};
  829 +}
  830 +
  831 +std::vector<QPDF::Xref_table::Subsection>
  832 +QPDF::Xref_table::bad_subsections(std::string& line, qpdf_offset_t start)
  833 +{
  834 + std::vector<QPDF::Xref_table::Subsection> result;
  835 + file->seek(start, SEEK_SET);
  836 +
  837 + while (true) {
  838 + line.assign(50, '\0');
  839 + file->read(line.data(), line.size());
  840 + auto [obj, num, offset] = result.emplace_back(subsection(line));
  841 + file->seek(offset, SEEK_SET);
  842 + for (qpdf_offset_t i = obj; i - num < obj; ++i) {
  843 + if (!std::get<0>(read_entry())) {
  844 + QTC::TC("qpdf", "QPDF invalid xref entry");
  845 + throw damaged_table("invalid xref entry (obj=" + std::to_string(i) + ")");
  846 + }
  847 + }
  848 + qpdf_offset_t pos = file->tell();
  849 + if (read_token().isWord("trailer")) {
  850 + return result;
  851 + } else {
  852 + file->seek(pos, SEEK_SET);
  853 + }
  854 + }
  855 +}
  856 +
  857 +// Optimistically read and parse all subsection headers. If an error is encountered return the
  858 +// result of bad_subsections.
  859 +std::vector<QPDF::Xref_table::Subsection>
  860 +QPDF::Xref_table::subsections(std::string& line)
  861 +{
  862 + auto recovery_offset = file->tell();
  863 + try {
  864 + std::vector<QPDF::Xref_table::Subsection> result;
  865 +
  866 + while (true) {
  867 + line.assign(50, '\0');
  868 + file->read(line.data(), line.size());
  869 + auto& sub = result.emplace_back(subsection(line));
  870 + auto count = std::get<1>(sub);
  871 + auto offset = std::get<2>(sub);
  872 + file->seek(offset + 20 * toO(count) - 1, SEEK_SET);
  873 + file->read(line.data(), 1);
  874 + if (!(line[0] == '\n' || line[0] == '\n')) {
  875 + return bad_subsections(line, recovery_offset);
  876 + }
  877 + qpdf_offset_t pos = file->tell();
  878 + if (read_token().isWord("trailer")) {
  879 + return result;
  880 + } else {
  881 + file->seek(pos, SEEK_SET);
  882 + }
  883 + }
  884 + } catch (...) {
  885 + return bad_subsections(line, recovery_offset);
  886 + }
  887 +}
  888 +
  889 +// Returns (success, f1, f2, type).
  890 +std::tuple<bool, qpdf_offset_t, int, char>
  891 +QPDF::Xref_table::read_bad_entry()
  892 +{
  893 + qpdf_offset_t f1{0};
  894 + int f2{0};
  895 + char type{'\0'};
  896 + // Reposition after initial read attempt and reread.
  897 + file->seek(file->getLastOffset(), SEEK_SET);
  898 + auto line = file->readLine(30);
  899 +
  900 + // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated
  901 + // buffer.
  902 + char const* p = line.data();
  903 +
  904 + // Skip zero or more spaces. There aren't supposed to be any.
  905 + bool invalid = false;
  906 + while (QUtil::is_space(*p)) {
  907 + ++p;
  908 + QTC::TC("qpdf", "QPDF ignore first space in xref entry");
  909 + invalid = true;
  910 + }
  911 + // Require digit
  912 + if (!QUtil::is_digit(*p)) {
  913 + return {false, 0, 0, '\0'};
  914 + }
  915 + // Gather digits
  916 + std::string f1_str;
  917 + while (QUtil::is_digit(*p)) {
  918 + f1_str.append(1, *p++);
  919 + }
  920 + // Require space
  921 + if (!QUtil::is_space(*p)) {
  922 + return {false, 0, 0, '\0'};
  923 + }
  924 + if (QUtil::is_space(*(p + 1))) {
  925 + QTC::TC("qpdf", "QPDF ignore first extra space in xref entry");
  926 + invalid = true;
  927 + }
  928 + // Skip spaces
  929 + while (QUtil::is_space(*p)) {
  930 + ++p;
  931 + }
  932 + // Require digit
  933 + if (!QUtil::is_digit(*p)) {
  934 + return {false, 0, 0, '\0'};
  935 + }
  936 + // Gather digits
  937 + std::string f2_str;
  938 + while (QUtil::is_digit(*p)) {
  939 + f2_str.append(1, *p++);
  940 + }
  941 + // Require space
  942 + if (!QUtil::is_space(*p)) {
  943 + return {false, 0, 0, '\0'};
  944 + }
  945 + if (QUtil::is_space(*(p + 1))) {
  946 + QTC::TC("qpdf", "QPDF ignore second extra space in xref entry");
  947 + invalid = true;
  948 + }
  949 + // Skip spaces
  950 + while (QUtil::is_space(*p)) {
  951 + ++p;
  952 + }
  953 + if ((*p == 'f') || (*p == 'n')) {
  954 + type = *p;
  955 + } else {
  956 + return {false, 0, 0, '\0'};
  957 + }
  958 + if ((f1_str.length() != 10) || (f2_str.length() != 5)) {
  959 + QTC::TC("qpdf", "QPDF ignore length error xref entry");
  960 + invalid = true;
  961 + }
  962 +
  963 + if (invalid) {
  964 + qpdf.warn(damaged_table("accepting invalid xref table entry"));
  965 + }
  966 +
  967 + f1 = QUtil::string_to_ll(f1_str.c_str());
  968 + f2 = QUtil::string_to_int(f2_str.c_str());
  969 +
  970 + return {true, f1, f2, type};
  971 +}
  972 +
  973 + // Optimistically read and parse xref entry. If entry is bad, call read_bad_entry and return
  974 +// result. Returns (success, f1, f2, type).
  975 +std::tuple<bool, qpdf_offset_t, int, char>
  976 +QPDF::Xref_table::read_entry()
  977 +{
  978 + qpdf_offset_t f1{0};
  979 + int f2{0};
  980 + char type{'\0'};
  981 + std::array<char, 21> line;
  982 + f1 = 0;
  983 + f2 = 0;
  984 + if (file->read(line.data(), 20) != 20) {
  985 + // C++20: [[unlikely]]
  986 + return {false, 0, 0, '\0'};
  987 + }
  988 + line[20] = '\0';
  989 + char const* p = line.data();
  990 +
  991 + int f1_len = 0;
  992 + int f2_len = 0;
  993 +
  994 + // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated
  995 + // buffer.
  996 +
  997 + // Gather f1 digits. NB No risk of overflow as 9'999'999'999 < max long long.
  998 + while (*p == '0') {
  999 + ++f1_len;
  1000 + ++p;
  1001 + }
  1002 + while (QUtil::is_digit(*p) && f1_len++ < 10) {
  1003 + f1 *= 10;
  1004 + f1 += *p++ - '0';
  1005 + }
  1006 + // Require space
  1007 + if (!QUtil::is_space(*p++)) {
  1008 + // Entry doesn't start with space or digit.
  1009 + // C++20: [[unlikely]]
  1010 + return {false, 0, 0, '\0'};
  1011 + }
  1012 + // Gather digits. NB No risk of overflow as 99'999 < max int.
  1013 + while (*p == '0') {
  1014 + ++f2_len;
  1015 + ++p;
  1016 + }
  1017 + while (QUtil::is_digit(*p) && f2_len++ < 5) {
  1018 + f2 *= 10;
  1019 + f2 += static_cast<int>(*p++ - '0');
  1020 + }
  1021 + if (QUtil::is_space(*p++) && (*p == 'f' || *p == 'n')) {
  1022 + // C++20: [[likely]]
  1023 + type = *p;
  1024 + // No test for valid line[19].
  1025 + if (*(++p) && *(++p) && (*p == '\n' || *p == '\r') && f1_len == 10 && f2_len == 5) {
  1026 + // C++20: [[likely]]
  1027 + return {true, f1, f2, type};
  1028 + }
  1029 + }
  1030 + return read_bad_entry();
  1031 +}
  1032 +
  1033 +// Read a single cross-reference table section and associated trailer.
  1034 +qpdf_offset_t
  1035 +QPDF::Xref_table::process_section(qpdf_offset_t xref_offset)
  1036 +{
  1037 + file->seek(xref_offset, SEEK_SET);
  1038 + std::string line;
  1039 + auto subs = subsections(line);
  1040 +
  1041 + auto cur_trailer_offset = file->tell();
  1042 + auto cur_trailer = read_trailer();
  1043 + if (!cur_trailer.isDictionary()) {
  1044 + QTC::TC("qpdf", "QPDF missing trailer");
  1045 + throw qpdf.damagedPDF("", "expected trailer dictionary");
  1046 + }
  1047 +
  1048 + if (!trailer_) {
  1049 + unsigned int sz;
  1050 + trailer_ = cur_trailer;
  1051 +
  1052 + if (!trailer_.hasKey("/Size")) {
  1053 + QTC::TC("qpdf", "QPDF trailer lacks size");
  1054 + throw qpdf.damagedPDF("trailer", "trailer dictionary lacks /Size key");
  1055 + }
  1056 + if (!trailer_.getKey("/Size").getValueAsUInt(sz)) {
  1057 + QTC::TC("qpdf", "QPDF trailer size not integer");
  1058 + throw qpdf.damagedPDF("trailer", "/Size key in trailer dictionary is not an integer");
  1059 + }
  1060 + if (sz >= static_cast<unsigned int>(max_id_)) {
  1061 + QTC::TC("qpdf", "QPDF trailer size impossibly large");
  1062 + throw qpdf.damagedPDF("trailer", "/Size key in trailer dictionary is impossibly large");
  1063 + }
  1064 + table.resize(sz);
  1065 + }
  1066 +
  1067 + for (auto [obj, num, offset]: subs) {
  1068 + file->seek(offset, SEEK_SET);
  1069 + for (qpdf_offset_t i = obj; i - num < obj; ++i) {
  1070 + if (i == 0) {
  1071 + // This is needed by checkLinearization()
  1072 + first_item_offset_ = file->tell();
  1073 + }
  1074 + // For xref_table, these will always be small enough to be ints
  1075 + auto [success, f1, f2, type] = read_entry();
  1076 + if (!success) {
  1077 + throw damaged_table("invalid xref entry (obj=" + std::to_string(i) + ")");
  1078 + }
  1079 + if (type == 'f') {
  1080 + insert_free(QPDFObjGen(toI(i), f2));
  1081 + } else {
  1082 + insert(toI(i), 1, f1, f2);
  1083 + }
  1084 + }
  1085 + qpdf_offset_t pos = file->tell();
  1086 + if (read_token().isWord("trailer")) {
  1087 + break;
  1088 + } else {
  1089 + file->seek(pos, SEEK_SET);
  1090 + }
  1091 + }
  1092 +
  1093 + if (cur_trailer.hasKey("/XRefStm")) {
  1094 + if (ignore_streams_) {
  1095 + QTC::TC("qpdf", "QPDF ignoring XRefStm in trailer");
  1096 + } else {
  1097 + if (cur_trailer.getKey("/XRefStm").isInteger()) {
  1098 + // Read the xref stream but disregard any return value -- we'll use our trailer's
  1099 + // /Prev key instead of the xref stream's.
  1100 + (void)read_stream(cur_trailer.getKey("/XRefStm").getIntValue());
  1101 + } else {
  1102 + throw qpdf.damagedPDF("xref stream", cur_trailer_offset, "invalid /XRefStm");
  1103 + }
  1104 + }
  1105 + }
  1106 +
  1107 + if (cur_trailer.hasKey("/Prev")) {
  1108 + if (!cur_trailer.getKey("/Prev").isInteger()) {
  1109 + QTC::TC("qpdf", "QPDF trailer prev not integer");
  1110 + throw qpdf.damagedPDF(
  1111 + "trailer", cur_trailer_offset, "/Prev key in trailer dictionary is not an integer");
  1112 + }
  1113 + QTC::TC("qpdf", "QPDF prev key in trailer dictionary");
  1114 + return cur_trailer.getKey("/Prev").getIntValue();
  1115 + }
  1116 +
  1117 + return 0;
  1118 +}
  1119 +
  1120 +// Read a single cross-reference stream.
  1121 +qpdf_offset_t
  1122 +QPDF::Xref_table::read_stream(qpdf_offset_t xref_offset)
  1123 +{
  1124 + if (!ignore_streams_) {
  1125 + QPDFObjGen x_og;
  1126 + QPDFObjectHandle xref_obj;
  1127 + try {
  1128 + xref_obj = qpdf.readObjectAtOffset(
  1129 + false, xref_offset, "xref stream", QPDFObjGen(0, 0), x_og, true);
  1130 + } catch (QPDFExc&) {
  1131 + // ignore -- report error below
  1132 + }
  1133 + if (xref_obj.isStreamOfType("/XRef")) {
  1134 + QTC::TC("qpdf", "QPDF found xref stream");
  1135 + return process_stream(xref_offset, xref_obj);
  1136 + }
  1137 + }
  1138 +
  1139 + QTC::TC("qpdf", "QPDF can't find xref");
  1140 + throw qpdf.damagedPDF("", xref_offset, "xref not found");
  1141 + return 0; // unreachable
  1142 +}
  1143 +
  1144 +// Return the entry size of the xref stream and the processed W array.
  1145 +std::pair<int, std::array<int, 3>>
  1146 +QPDF::Xref_table::process_W(
  1147 + QPDFObjectHandle& dict, std::function<QPDFExc(std::string_view)> damaged)
  1148 +{
  1149 + auto W_obj = dict.getKey("/W");
  1150 + if (!(W_obj.isArray() && W_obj.getArrayNItems() >= 3 && W_obj.getArrayItem(0).isInteger() &&
  1151 + W_obj.getArrayItem(1).isInteger() && W_obj.getArrayItem(2).isInteger())) {
  1152 + throw damaged("Cross-reference stream does not have a proper /W key");
  1153 + }
  1154 +
  1155 + std::array<int, 3> W;
  1156 + int entry_size = 0;
  1157 + auto w_vector = W_obj.getArrayAsVector();
  1158 + int max_bytes = sizeof(qpdf_offset_t);
  1159 + for (size_t i = 0; i < 3; ++i) {
  1160 + W[i] = w_vector[i].getIntValueAsInt();
  1161 + if (W[i] > max_bytes) {
  1162 + throw damaged("Cross-reference stream's /W contains impossibly large values");
  1163 + }
  1164 + if (W[i] < 0) {
  1165 + throw damaged("Cross-reference stream's /W contains negative values");
  1166 + }
  1167 + entry_size += W[i];
  1168 + }
  1169 + if (entry_size == 0) {
  1170 + throw damaged("Cross-reference stream's /W indicates entry size of 0");
  1171 + }
  1172 + return {entry_size, W};
  1173 +}
  1174 +
  1175 +// Validate Size entry and return the maximum number of entries that the xref stream can contain and
  1176 +// the value of the Size entry.
  1177 +std::pair<int, size_t>
  1178 +QPDF::Xref_table::process_Size(
  1179 + QPDFObjectHandle& dict, int entry_size, std::function<QPDFExc(std::string_view)> damaged)
  1180 +{
  1181 + // Number of entries is limited by the highest possible object id and stream size.
  1182 + auto max_num_entries = std::numeric_limits<int>::max();
  1183 + if (max_num_entries > (std::numeric_limits<qpdf_offset_t>::max() / entry_size)) {
  1184 + max_num_entries = toI(std::numeric_limits<qpdf_offset_t>::max() / entry_size);
  1185 + }
  1186 +
  1187 + auto Size_obj = dict.getKey("/Size");
  1188 + long long size;
  1189 + if (!dict.getKey("/Size").getValueAsInt(size)) {
  1190 + throw damaged("Cross-reference stream does not have a proper /Size key");
  1191 + } else if (size < 0) {
  1192 + throw damaged("Cross-reference stream has a negative /Size key");
  1193 + } else if (size >= max_num_entries) {
  1194 + throw damaged("Cross-reference stream has an impossibly large /Size key");
  1195 + }
  1196 + // We are not validating that Size <= (Size key of parent xref / trailer).
  1197 + return {max_num_entries, toS(size)};
  1198 +}
  1199 +
  1200 +// Return the number of entries of the xref stream and the processed Index array.
  1201 +std::pair<int, std::vector<std::pair<int, int>>>
  1202 +QPDF::Xref_table::process_Index(
  1203 + QPDFObjectHandle& dict, int max_num_entries, std::function<QPDFExc(std::string_view)> damaged)
  1204 +{
  1205 + auto size = dict.getKey("/Size").getIntValueAsInt();
  1206 + auto Index_obj = dict.getKey("/Index");
  1207 +
  1208 + if (Index_obj.isArray()) {
  1209 + std::vector<std::pair<int, int>> indx;
  1210 + int num_entries = 0;
  1211 + auto index_vec = Index_obj.getArrayAsVector();
  1212 + if ((index_vec.size() % 2) || index_vec.size() < 2) {
  1213 + throw damaged("Cross-reference stream's /Index has an invalid number of values");
  1214 + }
  1215 +
  1216 + int i = 0;
  1217 + long long first = 0;
  1218 + for (auto& val: index_vec) {
  1219 + if (val.isInteger()) {
  1220 + if (i % 2) {
  1221 + auto count = val.getIntValue();
  1222 + if (count <= 0) {
  1223 + throw damaged(
  1224 + "Cross-reference stream section claims to contain " +
  1225 + std::to_string(count) + " entries");
  1226 + }
  1227 + // We are guarding against the possibility of num_entries * entry_size
  1228 + // overflowing. We are not checking that entries are in ascending order as
  1229 + // required by the spec, which probably should generate a warning. We are also
  1230 + // not checking that for each subsection first object number + number of entries
  1231 + // <= /Size. The spec requires us to ignore object number > /Size.
  1232 + if (first > (max_num_entries - count) ||
  1233 + count > (max_num_entries - num_entries)) {
  1234 + throw damaged(
  1235 + "Cross-reference stream claims to contain too many entries: " +
  1236 + std::to_string(first) + " " + std::to_string(max_num_entries) + " " +
  1237 + std::to_string(num_entries));
  1238 + }
  1239 + indx.emplace_back(static_cast<int>(first), static_cast<int>(count));
  1240 + num_entries += static_cast<int>(count);
  1241 + } else {
  1242 + first = val.getIntValue();
  1243 + if (first < 0) {
  1244 + throw damaged(
  1245 + "Cross-reference stream's /Index contains a negative object id");
  1246 + } else if (first > max_num_entries) {
  1247 + throw damaged("Cross-reference stream's /Index contains an impossibly "
  1248 + "large object id");
  1249 + }
  1250 + }
  1251 + } else {
  1252 + throw damaged(
  1253 + "Cross-reference stream's /Index's item " + std::to_string(i) +
  1254 + " is not an integer");
  1255 + }
  1256 + i++;
  1257 + }
  1258 + QTC::TC("qpdf", "QPDF xref /Index is array", index_vec.size() == 2 ? 0 : 1);
  1259 + return {num_entries, indx};
  1260 + } else if (Index_obj.isNull()) {
  1261 + QTC::TC("qpdf", "QPDF xref /Index is null");
  1262 + return {size, {{0, size}}};
  1263 + } else {
  1264 + throw damaged("Cross-reference stream does not have a proper /Index key");
  1265 + }
  1266 +}
  1267 +
  1268 +qpdf_offset_t
  1269 +QPDF::Xref_table::process_stream(qpdf_offset_t xref_offset, QPDFObjectHandle& xref_obj)
  1270 +{
  1271 + auto damaged = [this, xref_offset](std::string_view msg) -> QPDFExc {
  1272 + return qpdf.damagedPDF("xref stream", xref_offset, msg.data());
  1273 + };
  1274 +
  1275 + auto dict = xref_obj.getDict();
  1276 +
  1277 + auto [entry_size, W] = process_W(dict, damaged);
  1278 + auto [max_num_entries, size] = process_Size(dict, entry_size, damaged);
  1279 + auto [num_entries, indx] = process_Index(dict, max_num_entries, damaged);
  1280 +
  1281 + std::shared_ptr<Buffer> bp = xref_obj.getStreamData(qpdf_dl_specialized);
  1282 + size_t actual_size = bp->getSize();
  1283 + auto expected_size = toS(entry_size) * toS(num_entries);
  1284 +
  1285 + if (expected_size != actual_size) {
  1286 + QPDFExc x = damaged(
  1287 + "Cross-reference stream data has the wrong size; expected = " +
  1288 + std::to_string(expected_size) + "; actual = " + std::to_string(actual_size));
  1289 + if (expected_size > actual_size) {
  1290 + throw x;
  1291 + } else {
  1292 + qpdf.warn(x);
  1293 + }
  1294 + }
  1295 +
  1296 + if (!trailer_) {
  1297 + trailer_ = dict;
  1298 + if (size > toS(max_id_)) {
  1299 + throw damaged("Cross-reference stream /Size entry is impossibly large");
  1300 + }
  1301 + table.resize(size);
  1302 + }
  1303 +
  1304 + bool saw_first_compressed_object = false;
  1305 +
  1306 + // Actual size vs. expected size check above ensures that we will not overflow any buffers here.
  1307 + // We know that entry_size * num_entries is less or equal to the size of the buffer.
  1308 + auto p = bp->getBuffer();
  1309 + for (auto [obj, sec_entries]: indx) {
  1310 + // Process a subsection.
  1311 + for (int i = 0; i < sec_entries; ++i) {
  1312 + // Read this entry
  1313 + std::array<qpdf_offset_t, 3> fields{};
  1314 + if (W[0] == 0) {
  1315 + QTC::TC("qpdf", "QPDF default for xref stream field 0");
  1316 + fields[0] = 1;
  1317 + }
  1318 + for (size_t j = 0; j < 3; ++j) {
  1319 + for (int k = 0; k < W[j]; ++k) {
  1320 + fields[j] <<= 8;
  1321 + fields[j] |= *p++;
  1322 + }
  1323 + }
  1324 +
  1325 + // Get the generation number. The generation number is 0 unless this is an uncompressed
  1326 + // object record, in which case the generation number appears as the third field.
  1327 + if (saw_first_compressed_object) {
  1328 + if (fields[0] != 2) {
  1329 + uncompressed_after_compressed_ = true;
  1330 + }
  1331 + } else if (fields[0] == 2) {
  1332 + saw_first_compressed_object = true;
  1333 + }
  1334 + if (obj == 0) {
  1335 + // This is needed by checkLinearization()
  1336 + first_item_offset_ = xref_offset;
  1337 + } else if (fields[0] == 0) {
  1338 + // Ignore fields[2], which we don't care about in this case. This works around the
  1339 + // issue of some PDF files that put invalid values, like -1, here for deleted
  1340 + // objects.
  1341 + insert_free(QPDFObjGen(obj, 0));
  1342 + } else {
  1343 + insert(obj, toI(fields[0]), fields[1], toI(fields[2]));
  1344 + }
  1345 + ++obj;
  1346 + }
  1347 + }
  1348 +
  1349 + if (dict.hasKey("/Prev")) {
  1350 + if (!dict.getKey("/Prev").isInteger()) {
  1351 + throw qpdf.damagedPDF(
  1352 + "xref stream", "/Prev key in xref stream dictionary is not an integer");
  1353 + }
  1354 + QTC::TC("qpdf", "QPDF prev key in xref stream dictionary");
  1355 + return dict.getKey("/Prev").getIntValue();
  1356 + } else {
  1357 + return 0;
  1358 + }
  1359 +}
  1360 +
  1361 +void
  1362 +QPDF::Xref_table::insert(int obj, int f0, qpdf_offset_t f1, int f2)
  1363 +{
  1364 + // Populate the xref table in such a way that the first reference to an object that we see,
  1365 + // which is the one in the latest xref table in which it appears, is the one that gets stored.
  1366 + // This works because we are reading more recent appends before older ones.
  1367 +
  1368 + // If there is already an entry for this object and generation in the table, it means that a
  1369 + // later xref table has registered this object. Disregard this one.
  1370 +
  1371 + int new_gen = f0 == 2 ? 0 : f2;
  1372 +
  1373 + if (!(obj > 0 && static_cast<size_t>(obj) < table.size() && 0 <= f2 && new_gen < 65535)) {
  1374 + // We are ignoring invalid objgens. Most will arrive here from xref reconstruction. There
  1375 + // is probably no point having another warning but we could count invalid items in order to
  1376 + // decide when to give up.
  1377 + QTC::TC("qpdf", "QPDF xref overwrite invalid objgen");
  1378 + return;
  1379 + }
  1380 +
  1381 + auto& entry = table[static_cast<size_t>(obj)];
  1382 + auto old_type = entry.type();
  1383 +
  1384 + if (!old_type && entry.gen() > 0) {
  1385 + // At the moment we are processing the updates last to first and therefore the gen doesn't
  1386 + // matter as long as it > 0 to distinguish it from an uninitialized entry. This will need
  1387 + // to be revisited when we want to support incremental updates or more comprhensive
  1388 + // checking.
  1389 + QTC::TC("qpdf", "QPDF xref deleted object");
  1390 + return;
  1391 + }
  1392 +
  1393 + if (f0 == 2 && static_cast<int>(f1) == obj) {
  1394 + qpdf.warn(qpdf.damagedPDF(
  1395 + "xref stream", "self-referential object stream " + std::to_string(obj)));
  1396 + return;
  1397 + }
  1398 +
  1399 + if (old_type && entry.gen() >= new_gen) {
  1400 + QTC::TC("qpdf", "QPDF xref reused object");
  1401 + return;
  1402 + }
  1403 +
  1404 + switch (f0) {
  1405 + case 1:
  1406 + // f2 is generation
  1407 + QTC::TC("qpdf", "QPDF xref gen > 0", (f2 > 0) ? 1 : 0);
  1408 + entry = {f2, Uncompressed(f1)};
  1409 + break;
  1410 +
  1411 + case 2:
  1412 + entry = {0, Compressed(toI(f1), f2)};
  1413 + object_streams_ = true;
  1414 + break;
  1415 +
  1416 + default:
  1417 + throw qpdf.damagedPDF(
  1418 + "xref stream", "unknown xref stream entry type " + std::to_string(f0));
  1419 + break;
  1420 + }
426 } 1421 }
427 1422
428 void 1423 void
429 -QPDF::warn(QPDFExc const& e) 1424 +QPDF::Xref_table::insert_free(QPDFObjGen og)
430 { 1425 {
431 - if (m->max_warnings > 0 && m->warnings.size() >= m->max_warnings) {  
432 - stopOnError("Too many warnings - file is too badly damaged"); 1426 + // At the moment we are processing the updates last to first and therefore the gen doesn't
  1427 + // matter as long as it > 0 to distinguish it from an uninitialized entry. This will need to be
  1428 + // revisited when we want to support incremental updates or more comprhensive checking.
  1429 + if (og.getObj() < 1) {
  1430 + return;
433 } 1431 }
434 - m->warnings.push_back(e);  
435 - if (!m->suppress_warnings) {  
436 - *m->log->getWarn() << "WARNING: " << m->warnings.back().what() << "\n"; 1432 + size_t id = static_cast<size_t>(og.getObj());
  1433 + if (id < table.size() && !type(id)) {
  1434 + table[id] = {1, {}};
437 } 1435 }
438 } 1436 }
439 1437
440 -void  
441 -QPDF::warn(  
442 - qpdf_error_code_e error_code,  
443 - std::string const& object,  
444 - qpdf_offset_t offset,  
445 - std::string const& message) 1438 +QPDFObjGen
  1439 +QPDF::Xref_table::at_offset(qpdf_offset_t offset) const noexcept
446 { 1440 {
447 - warn(QPDFExc(error_code, getFilename(), object, offset, message)); 1441 + int id = 0;
  1442 + int gen = 0;
  1443 + qpdf_offset_t start = 0;
  1444 +
  1445 + int i = 0;
  1446 + for (auto const& item: table) {
  1447 + auto o = item.offset();
  1448 + if (start < o && o <= offset) {
  1449 + start = o;
  1450 + id = i;
  1451 + gen = item.gen();
  1452 + }
  1453 + ++i;
  1454 + }
  1455 + return QPDFObjGen(id, gen);
  1456 +}
  1457 +
  1458 +std::map<QPDFObjGen, QPDFXRefEntry>
  1459 +QPDF::Xref_table::as_map() const
  1460 +{
  1461 + std::map<QPDFObjGen, QPDFXRefEntry> result;
  1462 + int i{0};
  1463 + for (auto const& item: table) {
  1464 + switch (item.type()) {
  1465 + case 0:
  1466 + break;
  1467 + case 1:
  1468 + result.emplace(QPDFObjGen(i, item.gen()), item.offset());
  1469 + break;
  1470 + case 2:
  1471 + result.emplace(
  1472 + QPDFObjGen(i, 0), QPDFXRefEntry(item.stream_number(), item.stream_index()));
  1473 + break;
  1474 + default:
  1475 + throw std::logic_error("Xref_table: invalid entry type");
  1476 + }
  1477 + ++i;
  1478 + }
  1479 + return result;
448 } 1480 }
449 1481
450 void 1482 void
451 QPDF::showXRefTable() 1483 QPDF::showXRefTable()
452 { 1484 {
453 - m->objects.xref_table().show(); 1485 + m->xref_table.show();
  1486 +}
  1487 +
  1488 +void
  1489 +QPDF::Xref_table::show()
  1490 +{
  1491 + auto& cout = *qpdf.m->log->getInfo();
  1492 + int i = -1;
  1493 + for (auto const& item: table) {
  1494 + ++i;
  1495 + if (item.type()) {
  1496 + cout << std::to_string(i) << "/" << std::to_string(item.gen()) << ": ";
  1497 + switch (item.type()) {
  1498 + case 1:
  1499 + cout << "uncompressed; offset = " << item.offset() << "\n";
  1500 + break;
  1501 +
  1502 + case 2:
  1503 + cout << "compressed; stream = " << item.stream_number()
  1504 + << ", index = " << item.stream_index() << "\n";
  1505 + break;
  1506 +
  1507 + default:
  1508 + throw std::logic_error(
  1509 + "unknown cross-reference table type while showing xref_table");
  1510 + }
  1511 + }
  1512 + }
  1513 +}
  1514 +
  1515 +// Resolve all objects in the xref table. If this triggers a xref table reconstruction abort and
  1516 +// return false. Otherwise return true.
  1517 +bool
  1518 +QPDF::Xref_table::resolve()
  1519 +{
  1520 + bool may_change = !reconstructed_;
  1521 + int i = -1;
  1522 + for (auto& item: table) {
  1523 + ++i;
  1524 + if (item.type()) {
  1525 + if (qpdf.isUnresolved(QPDFObjGen(i, item.gen()))) {
  1526 + qpdf.resolve(QPDFObjGen(i, item.gen()));
  1527 + if (may_change && reconstructed_) {
  1528 + return false;
  1529 + }
  1530 + }
  1531 + }
  1532 + }
  1533 + return true;
454 } 1534 }
455 1535
456 // Ensure all objects in the pdf file, including those in indirect references, appear in the object 1536 // Ensure all objects in the pdf file, including those in indirect references, appear in the object
@@ -461,9 +1541,9 @@ QPDF::fixDanglingReferences(bool force) @@ -461,9 +1541,9 @@ QPDF::fixDanglingReferences(bool force)
461 if (m->fixed_dangling_refs) { 1541 if (m->fixed_dangling_refs) {
462 return; 1542 return;
463 } 1543 }
464 - if (!m->objects.xref_table().resolve()) { 1544 + if (!m->xref_table.resolve()) {
465 QTC::TC("qpdf", "QPDF fix dangling triggered xref reconstruction"); 1545 QTC::TC("qpdf", "QPDF fix dangling triggered xref reconstruction");
466 - m->objects.xref_table().resolve(); 1546 + m->xref_table.resolve();
467 } 1547 }
468 m->fixed_dangling_refs = true; 1548 m->fixed_dangling_refs = true;
469 } 1549 }
@@ -474,13 +1554,24 @@ QPDF::getObjectCount() @@ -474,13 +1554,24 @@ QPDF::getObjectCount()
474 // This method returns the next available indirect object number. makeIndirectObject uses it for 1554 // This method returns the next available indirect object number. makeIndirectObject uses it for
475 // this purpose. After fixDanglingReferences is called, all objects in the xref table will also 1555 // this purpose. After fixDanglingReferences is called, all objects in the xref table will also
476 // be in obj_cache. 1556 // be in obj_cache.
477 - return toS(m->objects.next_id().getObj() - 1); 1557 + fixDanglingReferences();
  1558 + QPDFObjGen og;
  1559 + if (!m->obj_cache.empty()) {
  1560 + og = (*(m->obj_cache.rbegin())).first;
  1561 + }
  1562 + return toS(og.getObj());
478 } 1563 }
479 1564
480 std::vector<QPDFObjectHandle> 1565 std::vector<QPDFObjectHandle>
481 QPDF::getAllObjects() 1566 QPDF::getAllObjects()
482 { 1567 {
483 - return m->objects.all(); 1568 + // After fixDanglingReferences is called, all objects are in the object cache.
  1569 + fixDanglingReferences();
  1570 + std::vector<QPDFObjectHandle> result;
  1571 + for (auto const& iter: m->obj_cache) {
  1572 + result.push_back(newIndirect(iter.first, iter.second.object));
  1573 + }
  1574 + return result;
484 } 1575 }
485 1576
486 void 1577 void
@@ -498,6 +1589,220 @@ QPDF::setLastObjectDescription(std::string const&amp; description, QPDFObjGen const&amp; @@ -498,6 +1589,220 @@ QPDF::setLastObjectDescription(std::string const&amp; description, QPDFObjGen const&amp;
498 } 1589 }
499 } 1590 }
500 1591
  1592 +QPDFObjectHandle
  1593 +QPDF::Xref_table::read_trailer()
  1594 +{
  1595 + qpdf_offset_t offset = file->tell();
  1596 + bool empty = false;
  1597 + auto object = QPDFParser(*file, "trailer", tokenizer, nullptr, &qpdf, true).parse(empty, false);
  1598 + if (empty) {
  1599 + // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in
  1600 + // actual PDF files and Adobe Reader appears to ignore them.
  1601 + qpdf.warn(qpdf.damagedPDF("trailer", "empty object treated as null"));
  1602 + } else if (object.isDictionary() && read_token().isWord("stream")) {
  1603 + qpdf.warn(qpdf.damagedPDF("trailer", file->tell(), "stream keyword found in trailer"));
  1604 + }
  1605 + // Override last_offset so that it points to the beginning of the object we just read
  1606 + file->setLastOffset(offset);
  1607 + return object;
  1608 +}
  1609 +
  1610 +QPDFObjectHandle
  1611 +QPDF::readObject(std::string const& description, QPDFObjGen og)
  1612 +{
  1613 + setLastObjectDescription(description, og);
  1614 + qpdf_offset_t offset = m->file->tell();
  1615 + bool empty = false;
  1616 +
  1617 + StringDecrypter decrypter{this, og};
  1618 + StringDecrypter* decrypter_ptr = m->encp->encrypted ? &decrypter : nullptr;
  1619 + auto object =
  1620 + QPDFParser(*m->file, m->last_object_description, m->tokenizer, decrypter_ptr, this, true)
  1621 + .parse(empty, false);
  1622 + if (empty) {
  1623 + // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in
  1624 + // actual PDF files and Adobe Reader appears to ignore them.
  1625 + warn(damagedPDF(*m->file, m->file->getLastOffset(), "empty object treated as null"));
  1626 + return object;
  1627 + }
  1628 + auto token = readToken(*m->file);
  1629 + if (object.isDictionary() && token.isWord("stream")) {
  1630 + readStream(object, og, offset);
  1631 + token = readToken(*m->file);
  1632 + }
  1633 + if (!token.isWord("endobj")) {
  1634 + QTC::TC("qpdf", "QPDF err expected endobj");
  1635 + warn(damagedPDF("expected endobj"));
  1636 + }
  1637 + return object;
  1638 +}
  1639 +
  1640 +// After reading stream dictionary and stream keyword, read rest of stream.
  1641 +void
  1642 +QPDF::readStream(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset)
  1643 +{
  1644 + validateStreamLineEnd(object, og, offset);
  1645 +
  1646 + // Must get offset before accessing any additional objects since resolving a previously
  1647 + // unresolved indirect object will change file position.
  1648 + qpdf_offset_t stream_offset = m->file->tell();
  1649 + size_t length = 0;
  1650 +
  1651 + try {
  1652 + auto length_obj = object.getKey("/Length");
  1653 +
  1654 + if (!length_obj.isInteger()) {
  1655 + if (length_obj.isNull()) {
  1656 + QTC::TC("qpdf", "QPDF stream without length");
  1657 + throw damagedPDF(offset, "stream dictionary lacks /Length key");
  1658 + }
  1659 + QTC::TC("qpdf", "QPDF stream length not integer");
  1660 + throw damagedPDF(offset, "/Length key in stream dictionary is not an integer");
  1661 + }
  1662 +
  1663 + length = toS(length_obj.getUIntValue());
  1664 + // Seek in two steps to avoid potential integer overflow
  1665 + m->file->seek(stream_offset, SEEK_SET);
  1666 + m->file->seek(toO(length), SEEK_CUR);
  1667 + if (!readToken(*m->file).isWord("endstream")) {
  1668 + QTC::TC("qpdf", "QPDF missing endstream");
  1669 + throw damagedPDF("expected endstream");
  1670 + }
  1671 + } catch (QPDFExc& e) {
  1672 + if (m->attempt_recovery) {
  1673 + warn(e);
  1674 + length = recoverStreamLength(m->file_sp, og, stream_offset);
  1675 + } else {
  1676 + throw;
  1677 + }
  1678 + }
  1679 + object = {QPDF_Stream::create(this, og, object, stream_offset, length)};
  1680 +}
  1681 +
  1682 +void
  1683 +QPDF::validateStreamLineEnd(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset)
  1684 +{
  1685 + // The PDF specification states that the word "stream" should be followed by either a carriage
  1686 + // return and a newline or by a newline alone. It specifically disallowed following it by a
  1687 + // carriage return alone since, in that case, there would be no way to tell whether the NL in a
  1688 + // CR NL sequence was part of the stream data. However, some readers, including Adobe reader,
  1689 + // accept a carriage return by itself when followed by a non-newline character, so that's what
  1690 + // we do here. We have also seen files that have extraneous whitespace between the stream
  1691 + // keyword and the newline.
  1692 + while (true) {
  1693 + char ch;
  1694 + if (m->file->read(&ch, 1) == 0) {
  1695 + // A premature EOF here will result in some other problem that will get reported at
  1696 + // another time.
  1697 + return;
  1698 + }
  1699 + if (ch == '\n') {
  1700 + // ready to read stream data
  1701 + QTC::TC("qpdf", "QPDF stream with NL only");
  1702 + return;
  1703 + }
  1704 + if (ch == '\r') {
  1705 + // Read another character
  1706 + if (m->file->read(&ch, 1) != 0) {
  1707 + if (ch == '\n') {
  1708 + // Ready to read stream data
  1709 + QTC::TC("qpdf", "QPDF stream with CRNL");
  1710 + } else {
  1711 + // Treat the \r by itself as the whitespace after endstream and start reading
  1712 + // stream data in spite of not having seen a newline.
  1713 + QTC::TC("qpdf", "QPDF stream with CR only");
  1714 + m->file->unreadCh(ch);
  1715 + warn(damagedPDF(
  1716 + m->file->tell(), "stream keyword followed by carriage return only"));
  1717 + }
  1718 + }
  1719 + return;
  1720 + }
  1721 + if (!QUtil::is_space(ch)) {
  1722 + QTC::TC("qpdf", "QPDF stream without newline");
  1723 + m->file->unreadCh(ch);
  1724 + warn(damagedPDF(
  1725 + m->file->tell(), "stream keyword not followed by proper line terminator"));
  1726 + return;
  1727 + }
  1728 + warn(damagedPDF(m->file->tell(), "stream keyword followed by extraneous whitespace"));
  1729 + }
  1730 +}
  1731 +
  1732 +QPDFObjectHandle
  1733 +QPDF::readObjectInStream(std::shared_ptr<InputSource>& input, int obj)
  1734 +{
  1735 + m->last_object_description.erase(7); // last_object_description starts with "object "
  1736 + m->last_object_description += std::to_string(obj);
  1737 + m->last_object_description += " 0";
  1738 +
  1739 + bool empty = false;
  1740 + auto object = QPDFParser(*input, m->last_object_description, m->tokenizer, nullptr, this, true)
  1741 + .parse(empty, false);
  1742 + if (empty) {
  1743 + // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in
  1744 + // actual PDF files and Adobe Reader appears to ignore them.
  1745 + warn(damagedPDF(*input, input->getLastOffset(), "empty object treated as null"));
  1746 + }
  1747 + return object;
  1748 +}
  1749 +
  1750 +bool
  1751 +QPDF::findEndstream()
  1752 +{
  1753 + // Find endstream or endobj. Position the input at that token.
  1754 + auto t = readToken(*m->file, 20);
  1755 + if (t.isWord("endobj") || t.isWord("endstream")) {
  1756 + m->file->seek(m->file->getLastOffset(), SEEK_SET);
  1757 + return true;
  1758 + }
  1759 + return false;
  1760 +}
  1761 +
  1762 +size_t
  1763 +QPDF::recoverStreamLength(
  1764 + std::shared_ptr<InputSource> input, QPDFObjGen const& og, qpdf_offset_t stream_offset)
  1765 +{
  1766 + // Try to reconstruct stream length by looking for endstream or endobj
  1767 + warn(damagedPDF(*input, stream_offset, "attempting to recover stream length"));
  1768 +
  1769 + PatternFinder ef(*this, &QPDF::findEndstream);
  1770 + size_t length = 0;
  1771 + if (m->file->findFirst("end", stream_offset, 0, ef)) {
  1772 + length = toS(m->file->tell() - stream_offset);
  1773 + // Reread endstream but, if it was endobj, don't skip that.
  1774 + QPDFTokenizer::Token t = readToken(*m->file);
  1775 + if (t.getValue() == "endobj") {
  1776 + m->file->seek(m->file->getLastOffset(), SEEK_SET);
  1777 + }
  1778 + }
  1779 +
  1780 + if (length) {
  1781 + // Make sure this is inside this object
  1782 + auto found = m->xref_table.at_offset(stream_offset + toO(length));
  1783 + if (found == QPDFObjGen() || found == og) {
  1784 + // If we are trying to recover an XRef stream the xref table will not contain and
  1785 + // won't contain any entries, therefore we cannot check the found length. Otherwise we
  1786 + // found endstream\nendobj within the space allowed for this object, so we're probably
  1787 + // in good shape.
  1788 + } else {
  1789 + QTC::TC("qpdf", "QPDF found wrong endstream in recovery");
  1790 + length = 0;
  1791 + }
  1792 + }
  1793 +
  1794 + if (length == 0) {
  1795 + warn(damagedPDF(
  1796 + *input, stream_offset, "unable to recover stream data; treating stream as empty"));
  1797 + } else {
  1798 + warn(damagedPDF(
  1799 + *input, stream_offset, "recovered stream length: " + std::to_string(length)));
  1800 + }
  1801 +
  1802 + QTC::TC("qpdf", "QPDF recovered stream length");
  1803 + return length;
  1804 +}
  1805 +
501 QPDFTokenizer::Token 1806 QPDFTokenizer::Token
502 QPDF::readToken(InputSource& input, size_t max_len) 1807 QPDF::readToken(InputSource& input, size_t max_len)
503 { 1808 {
@@ -505,38 +1810,393 @@ QPDF::readToken(InputSource& input, size_t max_len) @@ -505,38 +1810,393 @@ QPDF::readToken(InputSource& input, size_t max_len)
505 } 1810 }
506 1811
507 QPDFObjectHandle 1812 QPDFObjectHandle
  1813 +QPDF::readObjectAtOffset(
  1814 + bool try_recovery,
  1815 + qpdf_offset_t offset,
  1816 + std::string const& description,
  1817 + QPDFObjGen exp_og,
  1818 + QPDFObjGen& og,
  1819 + bool skip_cache_if_in_xref)
  1820 +{
  1821 + bool check_og = true;
  1822 + if (exp_og.getObj() == 0) {
  1823 + // This method uses an expected object ID of 0 to indicate that we don't know or don't care
  1824 + // what the actual object ID is at this offset. This is true when we read the xref stream
  1825 + // and linearization hint streams. In this case, we don't verify the expected object
  1826 + // ID/generation against what was read from the file. There is also no reason to attempt
  1827 + // xref recovery if we get a failure in this case since the read attempt was not triggered
  1828 + // by an xref lookup.
  1829 + check_og = false;
  1830 + try_recovery = false;
  1831 + }
  1832 + setLastObjectDescription(description, exp_og);
  1833 +
  1834 + if (!m->attempt_recovery) {
  1835 + try_recovery = false;
  1836 + }
  1837 +
  1838 + // Special case: if offset is 0, just return null. Some PDF writers, in particular
  1839 + // "Mac OS X 10.7.5 Quartz PDFContext", may store deleted objects in the xref table as
  1840 + // "0000000000 00000 n", which is not correct, but it won't hurt anything for us to ignore
  1841 + // these.
  1842 + if (offset == 0) {
  1843 + QTC::TC("qpdf", "QPDF bogus 0 offset", 0);
  1844 + warn(damagedPDF(0, "object has offset 0"));
  1845 + return QPDFObjectHandle::newNull();
  1846 + }
  1847 +
  1848 + m->file->seek(offset, SEEK_SET);
  1849 + try {
  1850 + QPDFTokenizer::Token tobjid = readToken(*m->file);
  1851 + bool objidok = tobjid.isInteger();
  1852 + QTC::TC("qpdf", "QPDF check objid", objidok ? 1 : 0);
  1853 + if (!objidok) {
  1854 + QTC::TC("qpdf", "QPDF expected n n obj");
  1855 + throw damagedPDF(offset, "expected n n obj");
  1856 + }
  1857 + QPDFTokenizer::Token tgen = readToken(*m->file);
  1858 + bool genok = tgen.isInteger();
  1859 + QTC::TC("qpdf", "QPDF check generation", genok ? 1 : 0);
  1860 + if (!genok) {
  1861 + throw damagedPDF(offset, "expected n n obj");
  1862 + }
  1863 + QPDFTokenizer::Token tobj = readToken(*m->file);
  1864 +
  1865 + bool objok = tobj.isWord("obj");
  1866 + QTC::TC("qpdf", "QPDF check obj", objok ? 1 : 0);
  1867 +
  1868 + if (!objok) {
  1869 + throw damagedPDF(offset, "expected n n obj");
  1870 + }
  1871 + int objid = QUtil::string_to_int(tobjid.getValue().c_str());
  1872 + int generation = QUtil::string_to_int(tgen.getValue().c_str());
  1873 + og = QPDFObjGen(objid, generation);
  1874 + if (objid == 0) {
  1875 + QTC::TC("qpdf", "QPDF object id 0");
  1876 + throw damagedPDF(offset, "object with ID 0");
  1877 + }
  1878 + if (check_og && (exp_og != og)) {
  1879 + QTC::TC("qpdf", "QPDF err wrong objid/generation");
  1880 + QPDFExc e = damagedPDF(offset, "expected " + exp_og.unparse(' ') + " obj");
  1881 + if (try_recovery) {
  1882 + // Will be retried below
  1883 + throw e;
  1884 + } else {
  1885 + // We can try reading the object anyway even if the ID doesn't match.
  1886 + warn(e);
  1887 + }
  1888 + }
  1889 + } catch (QPDFExc& e) {
  1890 + if (try_recovery) {
  1891 + // Try again after reconstructing xref table
  1892 + m->xref_table.reconstruct(e);
  1893 + if (m->xref_table.type(exp_og) == 1) {
  1894 + QTC::TC("qpdf", "QPDF recovered in readObjectAtOffset");
  1895 + return readObjectAtOffset(
  1896 + false, m->xref_table.offset(exp_og), description, exp_og, og, false);
  1897 + } else {
  1898 + QTC::TC("qpdf", "QPDF object gone after xref reconstruction");
  1899 + warn(damagedPDF(
  1900 + "",
  1901 + 0,
  1902 + ("object " + exp_og.unparse(' ') +
  1903 + " not found in file after regenerating cross reference table")));
  1904 + return QPDFObjectHandle::newNull();
  1905 + }
  1906 + } else {
  1907 + throw;
  1908 + }
  1909 + }
  1910 +
  1911 + QPDFObjectHandle oh = readObject(description, og);
  1912 +
  1913 + if (isUnresolved(og)) {
  1914 + // Store the object in the cache here so it gets cached whether we first know the offset or
  1915 + // whether we first know the object ID and generation (in which case we would get here
  1916 + // through resolve).
  1917 +
  1918 + // Determine the end offset of this object before and after white space. We use these
  1919 + // numbers to validate linearization hint tables. Offsets and lengths of objects may imply
  1920 + // the end of an object to be anywhere between these values.
  1921 + qpdf_offset_t end_before_space = m->file->tell();
  1922 +
  1923 + // skip over spaces
  1924 + while (true) {
  1925 + char ch;
  1926 + if (m->file->read(&ch, 1)) {
  1927 + if (!isspace(static_cast<unsigned char>(ch))) {
  1928 + m->file->seek(-1, SEEK_CUR);
  1929 + break;
  1930 + }
  1931 + } else {
  1932 + throw damagedPDF(m->file->tell(), "EOF after endobj");
  1933 + }
  1934 + }
  1935 + qpdf_offset_t end_after_space = m->file->tell();
  1936 + if (skip_cache_if_in_xref && m->xref_table.type(og)) {
  1937 + // Ordinarily, an object gets read here when resolved through xref table or stream. In
  1938 + // the special case of the xref stream and linearization hint tables, the offset comes
  1939 + // from another source. For the specific case of xref streams, the xref stream is read
  1940 + // and loaded into the object cache very early in parsing. Ordinarily, when a file is
  1941 + // updated by appending, items inserted into the xref table in later updates take
  1942 + // precedence over earlier items. In the special case of reusing the object number
  1943 + // previously used as the xref stream, we have the following order of events:
  1944 + //
  1945 + // * reused object gets loaded into the xref table
  1946 + // * old object is read here while reading xref streams
  1947 + // * original xref entry is ignored (since already in xref table)
  1948 + //
  1949 + // It is the second step that causes a problem. Even though the xref table is correct in
  1950 + // this case, the old object is already in the cache and so effectively prevails over
  1951 + // the reused object. To work around this issue, we have a special case for the xref
  1952 + // stream (via skip_cache_if_in_xref): if the object is already in the xref table,
  1953 + // don't cache what we read here.
  1954 + //
  1955 + // It is likely that the same bug may exist for linearization hint tables, but the
  1956 + // existing code uses end_before_space and end_after_space from the cache, so fixing
  1957 + // that would require more significant rework. The chances of a linearization hint
  1958 + // stream being reused seems smaller because the xref stream is probably the highest
  1959 + // object in the file and the linearization hint stream would be some random place in
  1960 + // the middle, so I'm leaving that bug unfixed for now. If the bug were to be fixed, we
  1961 + // could use !check_og in place of skip_cache_if_in_xref.
  1962 + QTC::TC("qpdf", "QPDF skipping cache for known unchecked object");
  1963 + } else {
  1964 + m->xref_table.linearization_offsets(
  1965 + toS(og.getObj()), end_before_space, end_after_space);
  1966 + updateCache(og, oh.getObj());
  1967 + }
  1968 + }
  1969 +
  1970 + return oh;
  1971 +}
  1972 +
  1973 +QPDFObject*
  1974 +QPDF::resolve(QPDFObjGen og)
  1975 +{
  1976 + if (!isUnresolved(og)) {
  1977 + return m->obj_cache[og].object.get();
  1978 + }
  1979 +
  1980 + if (m->resolving.count(og)) {
  1981 + // This can happen if an object references itself directly or indirectly in some key that
  1982 + // has to be resolved during object parsing, such as stream length.
  1983 + QTC::TC("qpdf", "QPDF recursion loop in resolve");
  1984 + warn(damagedPDF("", "loop detected resolving object " + og.unparse(' ')));
  1985 + updateCache(og, QPDF_Null::create());
  1986 + return m->obj_cache[og].object.get();
  1987 + }
  1988 + ResolveRecorder rr(this, og);
  1989 +
  1990 + try {
  1991 + switch (m->xref_table.type(og)) {
  1992 + case 0:
  1993 + break;
  1994 + case 1:
  1995 + {
  1996 + // Object stored in cache by readObjectAtOffset
  1997 + QPDFObjGen a_og;
  1998 + QPDFObjectHandle oh =
  1999 + readObjectAtOffset(true, m->xref_table.offset(og), "", og, a_og, false);
  2000 + }
  2001 + break;
  2002 +
  2003 + case 2:
  2004 + resolveObjectsInStream(m->xref_table.stream_number(og.getObj()));
  2005 + break;
  2006 +
  2007 + default:
  2008 + throw damagedPDF(
  2009 + "", 0, ("object " + og.unparse('/') + " has unexpected xref entry type"));
  2010 + }
  2011 + } catch (QPDFExc& e) {
  2012 + warn(e);
  2013 + } catch (std::exception& e) {
  2014 + warn(damagedPDF(
  2015 + "", 0, ("object " + og.unparse('/') + ": error reading object: " + e.what())));
  2016 + }
  2017 +
  2018 + if (isUnresolved(og)) {
  2019 + // PDF spec says unknown objects resolve to the null object.
  2020 + QTC::TC("qpdf", "QPDF resolve failure to null");
  2021 + updateCache(og, QPDF_Null::create());
  2022 + }
  2023 +
  2024 + auto result(m->obj_cache[og].object);
  2025 + result->setDefaultDescription(this, og);
  2026 + return result.get();
  2027 +}
  2028 +
  2029 +void
  2030 +QPDF::resolveObjectsInStream(int obj_stream_number)
  2031 +{
  2032 + if (m->resolved_object_streams.count(obj_stream_number)) {
  2033 + return;
  2034 + }
  2035 + m->resolved_object_streams.insert(obj_stream_number);
  2036 + // Force resolution of object stream
  2037 + QPDFObjectHandle obj_stream = getObjectByID(obj_stream_number, 0);
  2038 + if (!obj_stream.isStream()) {
  2039 + throw damagedPDF(
  2040 + "supposed object stream " + std::to_string(obj_stream_number) + " is not a stream");
  2041 + }
  2042 +
  2043 + QPDFObjectHandle dict = obj_stream.getDict();
  2044 + if (!dict.isDictionaryOfType("/ObjStm")) {
  2045 + QTC::TC("qpdf", "QPDF ERR object stream with wrong type");
  2046 + warn(damagedPDF(
  2047 + "supposed object stream " + std::to_string(obj_stream_number) + " has wrong type"));
  2048 + }
  2049 +
  2050 + if (!(dict.getKey("/N").isInteger() && dict.getKey("/First").isInteger())) {
  2051 + throw damagedPDF(
  2052 + ("object stream " + std::to_string(obj_stream_number) + " has incorrect keys"));
  2053 + }
  2054 +
  2055 + int n = dict.getKey("/N").getIntValueAsInt();
  2056 + int first = dict.getKey("/First").getIntValueAsInt();
  2057 +
  2058 + std::map<int, int> offsets;
  2059 +
  2060 + std::shared_ptr<Buffer> bp = obj_stream.getStreamData(qpdf_dl_specialized);
  2061 + auto input = std::shared_ptr<InputSource>(
  2062 + // line-break
  2063 + new BufferInputSource(
  2064 + (m->file->getName() + " object stream " + std::to_string(obj_stream_number)),
  2065 + bp.get()));
  2066 +
  2067 + qpdf_offset_t last_offset = -1;
  2068 + for (int i = 0; i < n; ++i) {
  2069 + QPDFTokenizer::Token tnum = readToken(*input);
  2070 + QPDFTokenizer::Token toffset = readToken(*input);
  2071 + if (!(tnum.isInteger() && toffset.isInteger())) {
  2072 + throw damagedPDF(
  2073 + *input,
  2074 + m->last_object_description,
  2075 + input->getLastOffset(),
  2076 + "expected integer in object stream header");
  2077 + }
  2078 +
  2079 + int num = QUtil::string_to_int(tnum.getValue().c_str());
  2080 + long long offset = QUtil::string_to_int(toffset.getValue().c_str());
  2081 + if (num > m->xref_table.max_id()) {
  2082 + continue;
  2083 + }
  2084 + if (num == obj_stream_number) {
  2085 + QTC::TC("qpdf", "QPDF ignore self-referential object stream");
  2086 + warn(damagedPDF(
  2087 + *input,
  2088 + m->last_object_description,
  2089 + input->getLastOffset(),
  2090 + "object stream claims to contain itself"));
  2091 + continue;
  2092 + }
  2093 + if (offset <= last_offset) {
  2094 + throw damagedPDF(
  2095 + *input,
  2096 + m->last_object_description,
  2097 + input->getLastOffset(),
  2098 + "expected offsets in object stream to be increasing");
  2099 + }
  2100 + last_offset = offset;
  2101 +
  2102 + offsets[num] = toI(offset + first);
  2103 + }
  2104 +
  2105 + // To avoid having to read the object stream multiple times, store all objects that would be
  2106 + // found here in the cache. Remember that some objects stored here might have been overridden
  2107 + // by new objects appended to the file, so it is necessary to recheck the xref table and only
  2108 + // cache what would actually be resolved here.
  2109 + m->last_object_description.clear();
  2110 + m->last_object_description += "object ";
  2111 + for (auto const& iter: offsets) {
  2112 + QPDFObjGen og(iter.first, 0);
  2113 + if (m->xref_table.type(og) == 2 &&
  2114 + m->xref_table.stream_number(og.getObj()) == obj_stream_number) {
  2115 + int offset = iter.second;
  2116 + input->seek(offset, SEEK_SET);
  2117 + QPDFObjectHandle oh = readObjectInStream(input, iter.first);
  2118 + updateCache(og, oh.getObj());
  2119 + } else {
  2120 + QTC::TC("qpdf", "QPDF not caching overridden objstm object");
  2121 + }
  2122 + }
  2123 +}
  2124 +
  2125 +QPDFObjectHandle
508 QPDF::newIndirect(QPDFObjGen const& og, std::shared_ptr<QPDFObject> const& obj) 2126 QPDF::newIndirect(QPDFObjGen const& og, std::shared_ptr<QPDFObject> const& obj)
509 { 2127 {
510 obj->setDefaultDescription(this, og); 2128 obj->setDefaultDescription(this, og);
511 return {obj}; 2129 return {obj};
512 } 2130 }
513 2131
  2132 +void
  2133 +QPDF::updateCache(QPDFObjGen const& og, std::shared_ptr<QPDFObject> const& object)
  2134 +{
  2135 + object->setObjGen(this, og);
  2136 + if (isCached(og)) {
  2137 + auto& cache = m->obj_cache[og];
  2138 + cache.object->assign(object);
  2139 + } else {
  2140 + m->obj_cache[og] = ObjCache(object);
  2141 + }
  2142 +}
  2143 +
  2144 +bool
  2145 +QPDF::isCached(QPDFObjGen const& og)
  2146 +{
  2147 + return m->obj_cache.count(og) != 0;
  2148 +}
  2149 +
  2150 +bool
  2151 +QPDF::isUnresolved(QPDFObjGen const& og)
  2152 +{
  2153 + return !isCached(og) || m->obj_cache[og].object->isUnresolved();
  2154 +}
  2155 +
  2156 +QPDFObjGen
  2157 +QPDF::nextObjGen()
  2158 +{
  2159 + int max_objid = toI(getObjectCount());
  2160 + if (max_objid == std::numeric_limits<int>::max()) {
  2161 + throw std::range_error("max object id is too high to create new objects");
  2162 + }
  2163 + return QPDFObjGen(max_objid + 1, 0);
  2164 +}
  2165 +
  2166 +QPDFObjectHandle
  2167 +QPDF::makeIndirectFromQPDFObject(std::shared_ptr<QPDFObject> const& obj)
  2168 +{
  2169 + QPDFObjGen next{nextObjGen()};
  2170 + m->obj_cache[next] = ObjCache(obj);
  2171 + return newIndirect(next, m->obj_cache[next].object);
  2172 +}
  2173 +
514 QPDFObjectHandle 2174 QPDFObjectHandle
515 QPDF::makeIndirectObject(QPDFObjectHandle oh) 2175 QPDF::makeIndirectObject(QPDFObjectHandle oh)
516 { 2176 {
517 if (!oh) { 2177 if (!oh) {
518 throw std::logic_error("attempted to make an uninitialized QPDFObjectHandle indirect"); 2178 throw std::logic_error("attempted to make an uninitialized QPDFObjectHandle indirect");
519 } 2179 }
520 - return m->objects.make_indirect(oh.getObj()); 2180 + return makeIndirectFromQPDFObject(oh.getObj());
521 } 2181 }
522 2182
523 QPDFObjectHandle 2183 QPDFObjectHandle
524 QPDF::newReserved() 2184 QPDF::newReserved()
525 { 2185 {
526 - return m->objects.make_indirect(QPDF_Reserved::create()); 2186 + return makeIndirectFromQPDFObject(QPDF_Reserved::create());
527 } 2187 }
528 2188
529 QPDFObjectHandle 2189 QPDFObjectHandle
530 QPDF::newIndirectNull() 2190 QPDF::newIndirectNull()
531 { 2191 {
532 - return m->objects.make_indirect(QPDF_Null::create()); 2192 + return makeIndirectFromQPDFObject(QPDF_Null::create());
533 } 2193 }
534 2194
535 QPDFObjectHandle 2195 QPDFObjectHandle
536 QPDF::newStream() 2196 QPDF::newStream()
537 { 2197 {
538 - return m->objects.make_indirect(  
539 - QPDF_Stream::create(this, m->objects.next_id(), QPDFObjectHandle::newDictionary(), 0, 0)); 2198 + return makeIndirectFromQPDFObject(
  2199 + QPDF_Stream::create(this, nextObjGen(), QPDFObjectHandle::newDictionary(), 0, 0));
540 } 2200 }
541 2201
542 QPDFObjectHandle 2202 QPDFObjectHandle
@@ -555,40 +2215,93 @@ QPDF::newStream(std::string const& data) @@ -555,40 +2215,93 @@ QPDF::newStream(std::string const& data)
555 return result; 2215 return result;
556 } 2216 }
557 2217
  2218 +std::shared_ptr<QPDFObject>
  2219 +QPDF::getObjectForParser(int id, int gen, bool parse_pdf)
  2220 +{
  2221 + // This method is called by the parser and therefore must not resolve any objects.
  2222 + auto og = QPDFObjGen(id, gen);
  2223 + if (auto iter = m->obj_cache.find(og); iter != m->obj_cache.end()) {
  2224 + return iter->second.object;
  2225 + }
  2226 + if (m->xref_table.type(og) || !m->xref_table.initialized()) {
  2227 + return m->obj_cache.insert({og, QPDF_Unresolved::create(this, og)}).first->second.object;
  2228 + }
  2229 + if (parse_pdf) {
  2230 + return QPDF_Null::create();
  2231 + }
  2232 + return m->obj_cache.insert({og, QPDF_Null::create(this, og)}).first->second.object;
  2233 +}
  2234 +
  2235 +std::shared_ptr<QPDFObject>
  2236 +QPDF::getObjectForJSON(int id, int gen)
  2237 +{
  2238 + auto og = QPDFObjGen(id, gen);
  2239 + auto [it, inserted] = m->obj_cache.try_emplace(og);
  2240 + auto& obj = it->second.object;
  2241 + if (inserted) {
  2242 + obj = (m->xref_table.initialized() && !m->xref_table.type(og))
  2243 + ? QPDF_Null::create(this, og)
  2244 + : QPDF_Unresolved::create(this, og);
  2245 + }
  2246 + return obj;
  2247 +}
  2248 +
558 QPDFObjectHandle 2249 QPDFObjectHandle
559 QPDF::getObject(QPDFObjGen const& og) 2250 QPDF::getObject(QPDFObjGen const& og)
560 { 2251 {
561 - return m->objects.get(og); 2252 + if (auto it = m->obj_cache.find(og); it != m->obj_cache.end()) {
  2253 + return {it->second.object};
  2254 + } else if (m->xref_table.initialized() && !m->xref_table.type(og)) {
  2255 + return QPDF_Null::create();
  2256 + } else {
  2257 + auto result = m->obj_cache.try_emplace(og, QPDF_Unresolved::create(this, og));
  2258 + return {result.first->second.object};
  2259 + }
562 } 2260 }
563 2261
564 QPDFObjectHandle 2262 QPDFObjectHandle
565 -QPDF::getObject(int id, int gen) 2263 +QPDF::getObject(int objid, int generation)
566 { 2264 {
567 - return m->objects.get(id, gen); 2265 + return getObject(QPDFObjGen(objid, generation));
568 } 2266 }
569 2267
570 QPDFObjectHandle 2268 QPDFObjectHandle
571 QPDF::getObjectByObjGen(QPDFObjGen const& og) 2269 QPDF::getObjectByObjGen(QPDFObjGen const& og)
572 { 2270 {
573 - return m->objects.get(og); 2271 + return getObject(og);
574 } 2272 }
575 2273
576 QPDFObjectHandle 2274 QPDFObjectHandle
577 -QPDF::getObjectByID(int id, int gen) 2275 +QPDF::getObjectByID(int objid, int generation)
  2276 +{
  2277 + return getObject(QPDFObjGen(objid, generation));
  2278 +}
  2279 +
  2280 +void
  2281 +QPDF::replaceObject(int objid, int generation, QPDFObjectHandle oh)
578 { 2282 {
579 - return m->objects.get(id, gen); 2283 + replaceObject(QPDFObjGen(objid, generation), oh);
580 } 2284 }
581 2285
582 void 2286 void
583 -QPDF::replaceObject(int id, int gen, QPDFObjectHandle replacement) 2287 +QPDF::replaceObject(QPDFObjGen const& og, QPDFObjectHandle oh)
584 { 2288 {
585 - m->objects.replace(QPDFObjGen(id, gen), replacement); 2289 + if (!oh || (oh.isIndirect() && !(oh.isStream() && oh.getObjGen() == og))) {
  2290 + QTC::TC("qpdf", "QPDF replaceObject called with indirect object");
  2291 + throw std::logic_error("QPDF::replaceObject called with indirect object handle");
  2292 + }
  2293 + updateCache(og, oh.getObj());
586 } 2294 }
587 2295
588 void 2296 void
589 -QPDF::replaceObject(QPDFObjGen const& og, QPDFObjectHandle replacement) 2297 +QPDF::removeObject(QPDFObjGen og)
590 { 2298 {
591 - m->objects.replace(og, replacement); 2299 + if (auto cached = m->obj_cache.find(og); cached != m->obj_cache.end()) {
  2300 + // Take care of any object handles that may be floating around.
  2301 + cached->second.object->assign(QPDF_Null::create());
  2302 + cached->second.object->setObjGen(nullptr, QPDFObjGen());
  2303 + m->obj_cache.erase(cached);
  2304 + }
592 } 2305 }
593 2306
594 void 2307 void
@@ -599,7 +2312,7 @@ QPDF::replaceReserved(QPDFObjectHandle reserved, QPDFObjectHandle replacement) @@ -599,7 +2312,7 @@ QPDF::replaceReserved(QPDFObjectHandle reserved, QPDFObjectHandle replacement)
599 if (!(tc == ::ot_reserved || tc == ::ot_null)) { 2312 if (!(tc == ::ot_reserved || tc == ::ot_null)) {
600 throw std::logic_error("replaceReserved called with non-reserved object"); 2313 throw std::logic_error("replaceReserved called with non-reserved object");
601 } 2314 }
602 - m->objects.replace(reserved.getObjGen(), replacement); 2315 + replaceObject(reserved.getObjGen(), replacement);
603 } 2316 }
604 2317
605 QPDFObjectHandle 2318 QPDFObjectHandle
@@ -865,13 +2578,16 @@ QPDF::copyStreamData(QPDFObjectHandle result, QPDFObjectHandle foreign) @@ -865,13 +2578,16 @@ QPDF::copyStreamData(QPDFObjectHandle result, QPDFObjectHandle foreign)
865 void 2578 void
866 QPDF::swapObjects(int objid1, int generation1, int objid2, int generation2) 2579 QPDF::swapObjects(int objid1, int generation1, int objid2, int generation2)
867 { 2580 {
868 - m->objects.swap(QPDFObjGen(objid1, generation1), QPDFObjGen(objid2, generation2)); 2581 + swapObjects(QPDFObjGen(objid1, generation1), QPDFObjGen(objid2, generation2));
869 } 2582 }
870 2583
871 void 2584 void
872 QPDF::swapObjects(QPDFObjGen const& og1, QPDFObjGen const& og2) 2585 QPDF::swapObjects(QPDFObjGen const& og1, QPDFObjGen const& og2)
873 { 2586 {
874 - m->objects.swap(og1, og2); 2587 + // Force objects to be read from the input source if needed, then swap them in the cache.
  2588 + resolve(og1);
  2589 + resolve(og2);
  2590 + m->obj_cache[og1].object->swapWith(m->obj_cache[og2].object);
875 } 2591 }
876 2592
877 unsigned long long 2593 unsigned long long
@@ -932,13 +2648,13 @@ QPDF::getExtensionLevel() @@ -932,13 +2648,13 @@ QPDF::getExtensionLevel()
932 QPDFObjectHandle 2648 QPDFObjectHandle
933 QPDF::getTrailer() 2649 QPDF::getTrailer()
934 { 2650 {
935 - return m->objects.trailer(); 2651 + return m->xref_table.trailer();
936 } 2652 }
937 2653
938 QPDFObjectHandle 2654 QPDFObjectHandle
939 QPDF::getRoot() 2655 QPDF::getRoot()
940 { 2656 {
941 - auto root = m->objects.trailer().getKey("/Root"); 2657 + QPDFObjectHandle root = m->xref_table.trailer().getKey("/Root");
942 if (!root.isDictionary()) { 2658 if (!root.isDictionary()) {
943 throw damagedPDF("", 0, "unable to find /Root dictionary"); 2659 throw damagedPDF("", 0, "unable to find /Root dictionary");
944 } else if ( 2660 } else if (
@@ -954,10 +2670,141 @@ QPDF::getRoot() @@ -954,10 +2670,141 @@ QPDF::getRoot()
954 std::map<QPDFObjGen, QPDFXRefEntry> 2670 std::map<QPDFObjGen, QPDFXRefEntry>
955 QPDF::getXRefTable() 2671 QPDF::getXRefTable()
956 { 2672 {
957 - if (!m->objects.xref_table().initialized()) { 2673 + if (!m->xref_table.initialized()) {
958 throw std::logic_error("QPDF::getXRefTable called before parsing."); 2674 throw std::logic_error("QPDF::getXRefTable called before parsing.");
959 } 2675 }
960 - return m->objects.xref_table().as_map(); 2676 + return m->xref_table.as_map();
  2677 +}
  2678 +
  2679 +size_t
  2680 +QPDF::tableSize()
  2681 +{
  2682 + // If obj_cache is dense, accommodate all objects in tables, else accommodate only original
  2683 + // objects.
  2684 + auto max_xref = toI(m->xref_table.size());
  2685 + if (max_xref > 0) {
  2686 + --max_xref;
  2687 + }
  2688 + auto max_obj = m->obj_cache.size() ? m->obj_cache.crbegin()->first.getObj() : 0;
  2689 + auto max_id = std::numeric_limits<int>::max() - 1;
  2690 + if (max_obj >= max_id || max_xref >= max_id) {
  2691 + // Temporary fix. Long-term solution is
  2692 + // - QPDFObjGen to enforce objgens are valid and sensible
  2693 + // - xref table and obj cache to protect against insertion of impossibly large obj ids
  2694 + stopOnError("Impossibly large object id encountered.");
  2695 + }
  2696 + if (max_obj < 1.1 * std::max(toI(m->obj_cache.size()), max_xref)) {
  2697 + return toS(++max_obj);
  2698 + }
  2699 + return toS(++max_xref);
  2700 +}
  2701 +
  2702 +std::vector<QPDFObjGen>
  2703 +QPDF::getCompressibleObjVector()
  2704 +{
  2705 + return getCompressibleObjGens<QPDFObjGen>();
  2706 +}
  2707 +
  2708 +std::vector<bool>
  2709 +QPDF::getCompressibleObjSet()
  2710 +{
  2711 + return getCompressibleObjGens<bool>();
  2712 +}
  2713 +
  2714 +template <typename T>
  2715 +std::vector<T>
  2716 +QPDF::getCompressibleObjGens()
  2717 +{
  2718 + // Return a list of objects that are allowed to be in object streams. Walk through the objects
  2719 + // by traversing the document from the root, including a traversal of the pages tree. This
  2720 + // makes objects that are on the same page more likely to be in the same object stream,
  2721 + // which is slightly more efficient, particularly with linearized files. This is better than
  2722 + // iterating through the xref table since it avoids preserving orphaned items.
  2723 +
  2724 + // Exclude encryption dictionary, if any
  2725 + QPDFObjectHandle encryption_dict = m->xref_table.trailer().getKey("/Encrypt");
  2726 + QPDFObjGen encryption_dict_og = encryption_dict.getObjGen();
  2727 +
  2728 + const size_t max_obj = getObjectCount();
  2729 + std::vector<bool> visited(max_obj, false);
  2730 + std::vector<QPDFObjectHandle> queue;
  2731 + queue.reserve(512);
  2732 + queue.push_back(m->xref_table.trailer());
  2733 + std::vector<T> result;
  2734 + if constexpr (std::is_same_v<T, QPDFObjGen>) {
  2735 + result.reserve(m->obj_cache.size());
  2736 + } else if constexpr (std::is_same_v<T, bool>) {
  2737 + result.resize(max_obj + 1U, false);
  2738 + } else {
  2739 + throw std::logic_error("Unsupported type in QPDF::getCompressibleObjGens");
  2740 + }
  2741 + while (!queue.empty()) {
  2742 + auto obj = queue.back();
  2743 + queue.pop_back();
  2744 + if (obj.getObjectID() > 0) {
  2745 + QPDFObjGen og = obj.getObjGen();
  2746 + const size_t id = toS(og.getObj() - 1);
  2747 + if (id >= max_obj) {
  2748 + throw std::logic_error(
  2749 + "unexpected object id encountered in getCompressibleObjGens");
  2750 + }
  2751 + if (visited[id]) {
  2752 + QTC::TC("qpdf", "QPDF loop detected traversing objects");
  2753 + continue;
  2754 + }
  2755 +
  2756 + // Check whether this is the current object. If not, remove it (which changes it into a
  2757 + // direct null and therefore stops us from revisiting it) and move on to the next object
  2758 + // in the queue.
  2759 + auto upper = m->obj_cache.upper_bound(og);
  2760 + if (upper != m->obj_cache.end() && upper->first.getObj() == og.getObj()) {
  2761 + removeObject(og);
  2762 + continue;
  2763 + }
  2764 +
  2765 + visited[id] = true;
  2766 +
  2767 + if (og == encryption_dict_og) {
  2768 + QTC::TC("qpdf", "QPDF exclude encryption dictionary");
  2769 + } else if (!(obj.isStream() ||
  2770 + (obj.isDictionaryOfType("/Sig") && obj.hasKey("/ByteRange") &&
  2771 + obj.hasKey("/Contents")))) {
  2772 + if constexpr (std::is_same_v<T, QPDFObjGen>) {
  2773 + result.push_back(og);
  2774 + } else if constexpr (std::is_same_v<T, bool>) {
  2775 + result[id + 1U] = true;
  2776 + }
  2777 + }
  2778 + }
  2779 + if (obj.isStream()) {
  2780 + QPDFObjectHandle dict = obj.getDict();
  2781 + std::set<std::string> keys = dict.getKeys();
  2782 + for (auto iter = keys.rbegin(); iter != keys.rend(); ++iter) {
  2783 + std::string const& key = *iter;
  2784 + QPDFObjectHandle value = dict.getKey(key);
  2785 + if (key == "/Length") {
  2786 + // omit stream lengths
  2787 + if (value.isIndirect()) {
  2788 + QTC::TC("qpdf", "QPDF exclude indirect length");
  2789 + }
  2790 + } else {
  2791 + queue.push_back(value);
  2792 + }
  2793 + }
  2794 + } else if (obj.isDictionary()) {
  2795 + std::set<std::string> keys = obj.getKeys();
  2796 + for (auto iter = keys.rbegin(); iter != keys.rend(); ++iter) {
  2797 + queue.push_back(obj.getKey(*iter));
  2798 + }
  2799 + } else if (obj.isArray()) {
  2800 + int n = obj.getArrayNItems();
  2801 + for (int i = 1; i <= n; ++i) {
  2802 + queue.push_back(obj.getArrayItem(n - i));
  2803 + }
  2804 + }
  2805 + }
  2806 +
  2807 + return result;
961 } 2808 }
962 2809
963 bool 2810 bool
libqpdf/QPDF_encryption.cc
@@ -727,7 +727,7 @@ QPDF::initializeEncryption() @@ -727,7 +727,7 @@ QPDF::initializeEncryption()
727 // at /Encrypt again. Otherwise, things could go wrong if someone mutates the encryption 727 // at /Encrypt again. Otherwise, things could go wrong if someone mutates the encryption
728 // dictionary. 728 // dictionary.
729 729
730 - if (!m->objects.trailer().hasKey("/Encrypt")) { 730 + if (!m->xref_table.trailer().hasKey("/Encrypt")) {
731 return; 731 return;
732 } 732 }
733 733
@@ -736,7 +736,7 @@ QPDF::initializeEncryption() @@ -736,7 +736,7 @@ QPDF::initializeEncryption()
736 m->encp->encrypted = true; 736 m->encp->encrypted = true;
737 737
738 std::string id1; 738 std::string id1;
739 - QPDFObjectHandle id_obj = m->objects.trailer().getKey("/ID"); 739 + QPDFObjectHandle id_obj = m->xref_table.trailer().getKey("/ID");
740 if ((id_obj.isArray() && (id_obj.getArrayNItems() == 2) && id_obj.getArrayItem(0).isString())) { 740 if ((id_obj.isArray() && (id_obj.getArrayNItems() == 2) && id_obj.getArrayItem(0).isString())) {
741 id1 = id_obj.getArrayItem(0).getStringValue(); 741 id1 = id_obj.getArrayItem(0).getStringValue();
742 } else { 742 } else {
@@ -745,7 +745,7 @@ QPDF::initializeEncryption() @@ -745,7 +745,7 @@ QPDF::initializeEncryption()
745 warn(damagedPDF("trailer", "invalid /ID in trailer dictionary")); 745 warn(damagedPDF("trailer", "invalid /ID in trailer dictionary"));
746 } 746 }
747 747
748 - QPDFObjectHandle encryption_dict = m->objects.trailer().getKey("/Encrypt"); 748 + QPDFObjectHandle encryption_dict = m->xref_table.trailer().getKey("/Encrypt");
749 if (!encryption_dict.isDictionary()) { 749 if (!encryption_dict.isDictionary()) {
750 throw damagedPDF("/Encrypt in trailer dictionary is not a dictionary"); 750 throw damagedPDF("/Encrypt in trailer dictionary is not a dictionary");
751 } 751 }
libqpdf/QPDF_json.cc
@@ -536,7 +536,7 @@ QPDF::JSONReactor::dictionaryItem(std::string const& key, JSON const& value) @@ -536,7 +536,7 @@ QPDF::JSONReactor::dictionaryItem(std::string const& key, JSON const& value)
536 } else if (is_obj_key(key, obj, gen)) { 536 } else if (is_obj_key(key, obj, gen)) {
537 this->cur_object = key; 537 this->cur_object = key;
538 if (setNextStateIfDictionary(key, value, st_object_top)) { 538 if (setNextStateIfDictionary(key, value, st_object_top)) {
539 - next_obj = pdf.objects().get_for_json(obj, gen); 539 + next_obj = pdf.getObjectForJSON(obj, gen);
540 } 540 }
541 } else { 541 } else {
542 QTC::TC("qpdf", "QPDF_json bad object key"); 542 QTC::TC("qpdf", "QPDF_json bad object key");
@@ -582,7 +582,7 @@ QPDF::JSONReactor::dictionaryItem(std::string const& key, JSON const& value) @@ -582,7 +582,7 @@ QPDF::JSONReactor::dictionaryItem(std::string const& key, JSON const& value)
582 this->saw_value = true; 582 this->saw_value = true;
583 // The trailer must be a dictionary, so we can use setNextStateIfDictionary. 583 // The trailer must be a dictionary, so we can use setNextStateIfDictionary.
584 if (setNextStateIfDictionary("trailer.value", value, st_object)) { 584 if (setNextStateIfDictionary("trailer.value", value, st_object)) {
585 - pdf.m->objects.xref_table().trailer(makeObject(value)); 585 + pdf.m->xref_table.trailer(makeObject(value));
586 } 586 }
587 } else if (key == "stream") { 587 } else if (key == "stream") {
588 // Don't need to set saw_stream here since there's already an error. 588 // Don't need to set saw_stream here since there's already an error.
@@ -740,7 +740,7 @@ QPDF::JSONReactor::makeObject(JSON const& value) @@ -740,7 +740,7 @@ QPDF::JSONReactor::makeObject(JSON const& value)
740 int gen = 0; 740 int gen = 0;
741 std::string str; 741 std::string str;
742 if (is_indirect_object(str_v, obj, gen)) { 742 if (is_indirect_object(str_v, obj, gen)) {
743 - result = pdf.objects().get_for_json(obj, gen); 743 + result = pdf.getObjectForJSON(obj, gen);
744 } else if (is_unicode_string(str_v, str)) { 744 } else if (is_unicode_string(str_v, str)) {
745 result = QPDFObjectHandle::newUnicodeString(str); 745 result = QPDFObjectHandle::newUnicodeString(str);
746 } else if (is_binary_string(str_v, str)) { 746 } else if (is_binary_string(str_v, str)) {
@@ -776,7 +776,7 @@ QPDF::createFromJSON(std::shared_ptr<InputSource> is) @@ -776,7 +776,7 @@ QPDF::createFromJSON(std::shared_ptr<InputSource> is)
776 { 776 {
777 m->pdf_version = "1.3"; 777 m->pdf_version = "1.3";
778 m->no_input_name = is->getName(); 778 m->no_input_name = is->getName();
779 - m->objects.xref_table().initialize_json(); 779 + m->xref_table.initialize_json();
780 importJSON(is, true); 780 importJSON(is, true);
781 } 781 }
782 782
libqpdf/QPDF_linearization.cc
@@ -130,7 +130,7 @@ QPDF::isLinearized() @@ -130,7 +130,7 @@ QPDF::isLinearized()
130 return false; 130 return false;
131 } 131 }
132 132
133 - auto candidate = m->objects.get(lindict_obj, 0); 133 + auto candidate = getObjectByID(lindict_obj, 0);
134 if (!candidate.isDictionary()) { 134 if (!candidate.isDictionary()) {
135 return false; 135 return false;
136 } 136 }
@@ -287,9 +287,9 @@ QPDF::readHintStream(Pipeline& pl, qpdf_offset_t offset, size_t length) @@ -287,9 +287,9 @@ QPDF::readHintStream(Pipeline& pl, qpdf_offset_t offset, size_t length)
287 { 287 {
288 QPDFObjGen og; 288 QPDFObjGen og;
289 QPDFObjectHandle H = 289 QPDFObjectHandle H =
290 - objects().read(false, offset, "linearization hint stream", QPDFObjGen(0, 0), og, false);  
291 - qpdf_offset_t min_end_offset = m->objects.xref_table().end_before_space(og);  
292 - qpdf_offset_t max_end_offset = m->objects.xref_table().end_after_space(og); 290 + readObjectAtOffset(false, offset, "linearization hint stream", QPDFObjGen(0, 0), og, false);
  291 + qpdf_offset_t min_end_offset = m->xref_table.end_before_space(og);
  292 + qpdf_offset_t max_end_offset = m->xref_table.end_after_space(og);
293 if (!H.isStream()) { 293 if (!H.isStream()) {
294 throw damagedPDF("linearization dictionary", "hint table is not a stream"); 294 throw damagedPDF("linearization dictionary", "hint table is not a stream");
295 } 295 }
@@ -303,8 +303,8 @@ QPDF::readHintStream(Pipeline& pl, qpdf_offset_t offset, size_t length) @@ -303,8 +303,8 @@ QPDF::readHintStream(Pipeline& pl, qpdf_offset_t offset, size_t length)
303 auto length_og = Hdict.getKey("/Length").getObjGen(); 303 auto length_og = Hdict.getKey("/Length").getObjGen();
304 if (length_og.isIndirect()) { 304 if (length_og.isIndirect()) {
305 QTC::TC("qpdf", "QPDF hint table length indirect"); 305 QTC::TC("qpdf", "QPDF hint table length indirect");
306 - min_end_offset = m->objects.xref_table().end_before_space(length_og);  
307 - max_end_offset = m->objects.xref_table().end_after_space(length_og); 306 + min_end_offset = m->xref_table.end_before_space(length_og);
  307 + max_end_offset = m->xref_table.end_after_space(length_og);
308 } else { 308 } else {
309 QTC::TC("qpdf", "QPDF hint table length direct"); 309 QTC::TC("qpdf", "QPDF hint table length direct");
310 } 310 }
@@ -441,7 +441,7 @@ QPDF::checkLinearizationInternal() @@ -441,7 +441,7 @@ QPDF::checkLinearizationInternal()
441 for (size_t i = 0; i < toS(npages); ++i) { 441 for (size_t i = 0; i < toS(npages); ++i) {
442 QPDFObjectHandle const& page = pages.at(i); 442 QPDFObjectHandle const& page = pages.at(i);
443 QPDFObjGen og(page.getObjGen()); 443 QPDFObjGen og(page.getObjGen());
444 - if (m->objects.xref_table().type(og) == 2) { 444 + if (m->xref_table.type(og) == 2) {
445 linearizationWarning( 445 linearizationWarning(
446 "page dictionary for page " + std::to_string(i) + " is compressed"); 446 "page dictionary for page " + std::to_string(i) + " is compressed");
447 } 447 }
@@ -457,11 +457,11 @@ QPDF::checkLinearizationInternal() @@ -457,11 +457,11 @@ QPDF::checkLinearizationInternal()
457 break; 457 break;
458 } 458 }
459 } 459 }
460 - if (m->file->tell() != m->objects.xref_table().first_item_offset()) { 460 + if (m->file->tell() != m->xref_table.first_item_offset()) {
461 QTC::TC("qpdf", "QPDF err /T mismatch"); 461 QTC::TC("qpdf", "QPDF err /T mismatch");
462 linearizationWarning( 462 linearizationWarning(
463 "space before first xref item (/T) mismatch (computed = " + 463 "space before first xref item (/T) mismatch (computed = " +
464 - std::to_string(m->objects.xref_table().first_item_offset()) + 464 + std::to_string(m->xref_table.first_item_offset()) +
465 "; file = " + std::to_string(m->file->tell())); 465 "; file = " + std::to_string(m->file->tell()));
466 } 466 }
467 467
@@ -472,7 +472,7 @@ QPDF::checkLinearizationInternal() @@ -472,7 +472,7 @@ QPDF::checkLinearizationInternal()
472 // compressed objects are supposed to be at the end of the containing xref section if any object 472 // compressed objects are supposed to be at the end of the containing xref section if any object
473 // streams are in use. 473 // streams are in use.
474 474
475 - if (m->objects.xref_table().uncompressed_after_compressed()) { 475 + if (m->xref_table.uncompressed_after_compressed()) {
476 linearizationWarning("linearized file contains an uncompressed object after a compressed " 476 linearizationWarning("linearized file contains an uncompressed object after a compressed "
477 "one in a cross-reference stream"); 477 "one in a cross-reference stream");
478 } 478 }
@@ -481,8 +481,8 @@ QPDF::checkLinearizationInternal() @@ -481,8 +481,8 @@ QPDF::checkLinearizationInternal()
481 // make changes. If it has to, then the file is not properly linearized. We use the xref table 481 // make changes. If it has to, then the file is not properly linearized. We use the xref table
482 // to figure out which objects are compressed and which are uncompressed. 482 // to figure out which objects are compressed and which are uncompressed.
483 483
484 - optimize(m->objects);  
485 - calculateLinearizationData(m->objects); 484 + optimize(m->xref_table);
  485 + calculateLinearizationData(m->xref_table);
486 486
487 // E: offset of end of first page -- Implementation note 123 says Acrobat includes one extra 487 // E: offset of end of first page -- Implementation note 123 says Acrobat includes one extra
488 // object here by mistake. pdlin fails to place thumbnail images in section 9, so when 488 // object here by mistake. pdlin fails to place thumbnail images in section 9, so when
@@ -499,8 +499,8 @@ QPDF::checkLinearizationInternal() @@ -499,8 +499,8 @@ QPDF::checkLinearizationInternal()
499 qpdf_offset_t max_E = -1; 499 qpdf_offset_t max_E = -1;
500 for (auto const& oh: m->part6) { 500 for (auto const& oh: m->part6) {
501 QPDFObjGen og(oh.getObjGen()); 501 QPDFObjGen og(oh.getObjGen());
502 - auto before = m->objects.xref_table().end_before_space(og);  
503 - auto after = m->objects.xref_table().end_after_space(og); 502 + auto before = m->xref_table.end_before_space(og);
  503 + auto after = m->xref_table.end_after_space(og);
504 if (before <= 0) { 504 if (before <= 0) {
505 // All objects have to have been dereferenced to be classified. 505 // All objects have to have been dereferenced to be classified.
506 throw std::logic_error("linearization part6 object not in cache"); 506 throw std::logic_error("linearization part6 object not in cache");
@@ -533,7 +533,7 @@ QPDF::maxEnd(ObjUser const& ou) @@ -533,7 +533,7 @@ QPDF::maxEnd(ObjUser const& ou)
533 } 533 }
534 qpdf_offset_t end = 0; 534 qpdf_offset_t end = 0;
535 for (auto const& og: m->obj_user_to_objects[ou]) { 535 for (auto const& og: m->obj_user_to_objects[ou]) {
536 - auto e = m->objects.xref_table().end_after_space(og); 536 + auto e = m->xref_table.end_after_space(og);
537 if (e <= 0) { 537 if (e <= 0) {
538 stopOnError("unknown object referenced in object user table"); 538 stopOnError("unknown object referenced in object user table");
539 } 539 }
@@ -545,14 +545,13 @@ QPDF::maxEnd(ObjUser const& ou) @@ -545,14 +545,13 @@ QPDF::maxEnd(ObjUser const& ou)
545 qpdf_offset_t 545 qpdf_offset_t
546 QPDF::getLinearizationOffset(QPDFObjGen const& og) 546 QPDF::getLinearizationOffset(QPDFObjGen const& og)
547 { 547 {
548 - switch (m->objects.xref_table().type(og)) { 548 + switch (m->xref_table.type(og)) {
549 case 1: 549 case 1:
550 - return m->objects.xref_table().offset(og); 550 + return m->xref_table.offset(og);
551 551
552 case 2: 552 case 2:
553 // For compressed objects, return the offset of the object stream that contains them. 553 // For compressed objects, return the offset of the object stream that contains them.
554 - return getLinearizationOffset(  
555 - QPDFObjGen(m->objects.xref_table().stream_number(og.getObj()), 0)); 554 + return getLinearizationOffset(QPDFObjGen(m->xref_table.stream_number(og.getObj()), 0));
556 555
557 default: 556 default:
558 stopOnError("getLinearizationOffset called for xref entry not of type 1 or 2"); 557 stopOnError("getLinearizationOffset called for xref entry not of type 1 or 2");
@@ -563,22 +562,22 @@ QPDF::getLinearizationOffset(QPDFObjGen const& og) @@ -563,22 +562,22 @@ QPDF::getLinearizationOffset(QPDFObjGen const& og)
563 QPDFObjectHandle 562 QPDFObjectHandle
564 QPDF::getUncompressedObject(QPDFObjectHandle& obj, std::map<int, int> const& object_stream_data) 563 QPDF::getUncompressedObject(QPDFObjectHandle& obj, std::map<int, int> const& object_stream_data)
565 { 564 {
566 - if (obj.isNull() || !object_stream_data.count(obj.getObjectID())) { 565 + if (obj.isNull() || (object_stream_data.count(obj.getObjectID()) == 0)) {
567 return obj; 566 return obj;
568 } else { 567 } else {
569 int repl = (*(object_stream_data.find(obj.getObjectID()))).second; 568 int repl = (*(object_stream_data.find(obj.getObjectID()))).second;
570 - return m->objects.get(repl, 0); 569 + return getObject(repl, 0);
571 } 570 }
572 } 571 }
573 572
574 QPDFObjectHandle 573 QPDFObjectHandle
575 -QPDF::getUncompressedObject(QPDFObjectHandle& obj, Objects const& objects) 574 +QPDF::getUncompressedObject(QPDFObjectHandle& obj, Xref_table const& xref)
576 { 575 {
577 auto og = obj.getObjGen(); 576 auto og = obj.getObjGen();
578 - if (obj.isNull() || objects.xref_table().type(og) != 2) { 577 + if (obj.isNull() || xref.type(og) != 2) {
579 return obj; 578 return obj;
580 } 579 }
581 - return m->objects.get(objects.xref_table().stream_number(og.getObj()), 0); 580 + return getObject(xref.stream_number(og.getObj()), 0);
582 } 581 }
583 582
584 QPDFObjectHandle 583 QPDFObjectHandle
@@ -586,7 +585,7 @@ QPDF::getUncompressedObject(QPDFObjectHandle& oh, QPDFWriter::ObjTable const& ob @@ -586,7 +585,7 @@ QPDF::getUncompressedObject(QPDFObjectHandle& oh, QPDFWriter::ObjTable const& ob
586 { 585 {
587 if (obj.contains(oh)) { 586 if (obj.contains(oh)) {
588 if (auto id = obj[oh].object_stream; id > 0) { 587 if (auto id = obj[oh].object_stream; id > 0) {
589 - return oh.isNull() ? oh : m->objects.get(id, 0); 588 + return oh.isNull() ? oh : getObject(id, 0);
590 } 589 }
591 } 590 }
592 return oh; 591 return oh;
@@ -598,7 +597,7 @@ QPDF::lengthNextN(int first_object, int n) @@ -598,7 +597,7 @@ QPDF::lengthNextN(int first_object, int n)
598 int length = 0; 597 int length = 0;
599 for (int i = 0; i < n; ++i) { 598 for (int i = 0; i < n; ++i) {
600 QPDFObjGen og(first_object + i, 0); 599 QPDFObjGen og(first_object + i, 0);
601 - auto end = m->objects.xref_table().end_after_space(og); 600 + auto end = m->xref_table.end_after_space(og);
602 if (end <= 0) { 601 if (end <= 0) {
603 linearizationWarning( 602 linearizationWarning(
604 "no xref table entry for " + std::to_string(first_object + i) + " 0"); 603 "no xref table entry for " + std::to_string(first_object + i) + " 0");
@@ -628,7 +627,7 @@ QPDF::checkHPageOffset( @@ -628,7 +627,7 @@ QPDF::checkHPageOffset(
628 int npages = toI(pages.size()); 627 int npages = toI(pages.size());
629 qpdf_offset_t table_offset = adjusted_offset(m->page_offset_hints.first_page_offset); 628 qpdf_offset_t table_offset = adjusted_offset(m->page_offset_hints.first_page_offset);
630 QPDFObjGen first_page_og(pages.at(0).getObjGen()); 629 QPDFObjGen first_page_og(pages.at(0).getObjGen());
631 - if (m->objects.xref_table().type(first_page_og) == 0) { 630 + if (m->xref_table.type(first_page_og) == 0) {
632 stopOnError("supposed first page object is not known"); 631 stopOnError("supposed first page object is not known");
633 } 632 }
634 qpdf_offset_t offset = getLinearizationOffset(first_page_og); 633 qpdf_offset_t offset = getLinearizationOffset(first_page_og);
@@ -639,7 +638,7 @@ QPDF::checkHPageOffset( @@ -639,7 +638,7 @@ QPDF::checkHPageOffset(
639 for (int pageno = 0; pageno < npages; ++pageno) { 638 for (int pageno = 0; pageno < npages; ++pageno) {
640 QPDFObjGen page_og(pages.at(toS(pageno)).getObjGen()); 639 QPDFObjGen page_og(pages.at(toS(pageno)).getObjGen());
641 int first_object = page_og.getObj(); 640 int first_object = page_og.getObj();
642 - if (m->objects.xref_table().type(page_og) == 0) { 641 + if (m->xref_table.type(page_og) == 0) {
643 stopOnError("unknown object in page offset hint table"); 642 stopOnError("unknown object in page offset hint table");
644 } 643 }
645 offset = getLinearizationOffset(page_og); 644 offset = getLinearizationOffset(page_og);
@@ -761,7 +760,7 @@ QPDF::checkHSharedObject(std::vector<QPDFObjectHandle> const& pages, std::map<in @@ -761,7 +760,7 @@ QPDF::checkHSharedObject(std::vector<QPDFObjectHandle> const& pages, std::map<in
761 cur_object = so.first_shared_obj; 760 cur_object = so.first_shared_obj;
762 761
763 QPDFObjGen og(cur_object, 0); 762 QPDFObjGen og(cur_object, 0);
764 - if (m->objects.xref_table().type(og) == 0) { 763 + if (m->xref_table.type(og) == 0) {
765 stopOnError("unknown object in shared object hint table"); 764 stopOnError("unknown object in shared object hint table");
766 } 765 }
767 qpdf_offset_t offset = getLinearizationOffset(og); 766 qpdf_offset_t offset = getLinearizationOffset(og);
@@ -812,7 +811,7 @@ QPDF::checkHOutlines() @@ -812,7 +811,7 @@ QPDF::checkHOutlines()
812 return; 811 return;
813 } 812 }
814 QPDFObjGen og(outlines.getObjGen()); 813 QPDFObjGen og(outlines.getObjGen());
815 - if (m->objects.xref_table().type(og) == 0) { 814 + if (m->xref_table.type(og) == 0) {
816 stopOnError("unknown object in outlines hint table"); 815 stopOnError("unknown object in outlines hint table");
817 } 816 }
818 qpdf_offset_t offset = getLinearizationOffset(og); 817 qpdf_offset_t offset = getLinearizationOffset(og);
@@ -1159,7 +1158,7 @@ QPDF::calculateLinearizationData(T const& object_stream_data) @@ -1159,7 +1158,7 @@ QPDF::calculateLinearizationData(T const& object_stream_data)
1159 // Map all page objects to the containing object stream. This should be a no-op in a 1158 // Map all page objects to the containing object stream. This should be a no-op in a
1160 // properly linearized file. 1159 // properly linearized file.
1161 for (auto oh: getAllPages()) { 1160 for (auto oh: getAllPages()) {
1162 - pages.emplace_back(getUncompressedObject(oh, object_stream_data)); 1161 + pages.push_back(getUncompressedObject(oh, object_stream_data));
1163 } 1162 }
1164 } 1163 }
1165 int npages = toI(pages.size()); 1164 int npages = toI(pages.size());
@@ -1430,9 +1429,9 @@ QPDF::pushOutlinesToPart( @@ -1430,9 +1429,9 @@ QPDF::pushOutlinesToPart(
1430 m->c_outline_data.first_object = outlines_og.getObj(); 1429 m->c_outline_data.first_object = outlines_og.getObj();
1431 m->c_outline_data.nobjects = 1; 1430 m->c_outline_data.nobjects = 1;
1432 lc_outlines.erase(outlines_og); 1431 lc_outlines.erase(outlines_og);
1433 - part.emplace_back(outlines); 1432 + part.push_back(outlines);
1434 for (auto const& og: lc_outlines) { 1433 for (auto const& og: lc_outlines) {
1435 - part.emplace_back(m->objects.get(og)); 1434 + part.push_back(getObject(og));
1436 ++m->c_outline_data.nobjects; 1435 ++m->c_outline_data.nobjects;
1437 } 1436 }
1438 } 1437 }
libqpdf/QPDF_objects.cc deleted
1 -#include <qpdf/qpdf-config.h> // include first for large file support  
2 -  
3 -#include <qpdf/QPDF_private.hh>  
4 -  
5 -#include <array>  
6 -#include <cstring>  
7 -#include <limits>  
8 -#include <map>  
9 -#include <vector>  
10 -  
11 -#include <qpdf/BufferInputSource.hh>  
12 -#include <qpdf/OffsetInputSource.hh>  
13 -#include <qpdf/Pipeline.hh>  
14 -#include <qpdf/QPDFExc.hh>  
15 -#include <qpdf/QPDFLogger.hh>  
16 -#include <qpdf/QPDFObject_private.hh>  
17 -#include <qpdf/QPDFParser.hh>  
18 -#include <qpdf/QPDF_Array.hh>  
19 -#include <qpdf/QPDF_Dictionary.hh>  
20 -#include <qpdf/QPDF_Null.hh>  
21 -#include <qpdf/QPDF_Reserved.hh>  
22 -#include <qpdf/QPDF_Stream.hh>  
23 -#include <qpdf/QPDF_Unresolved.hh>  
24 -#include <qpdf/QTC.hh>  
25 -#include <qpdf/QUtil.hh>  
26 -  
27 -using Objects = QPDF::Objects;  
28 -using Xref_table = Objects::Xref_table;  
29 -  
30 -namespace  
31 -{  
32 - class InvalidInputSource final: public InputSource  
33 - {  
34 - public:  
35 - InvalidInputSource(std::string const& name) :  
36 - name(name)  
37 - {  
38 - }  
39 - ~InvalidInputSource() final = default;  
40 - qpdf_offset_t  
41 - findAndSkipNextEOL() final  
42 - {  
43 - throwException();  
44 - return 0;  
45 - }  
46 - std::string const&  
47 - getName() const final  
48 - {  
49 - return name;  
50 - }  
51 - qpdf_offset_t  
52 - tell() final  
53 - {  
54 - throwException();  
55 - return 0;  
56 - }  
57 - void  
58 - seek(qpdf_offset_t offset, int whence) final  
59 - {  
60 - throwException();  
61 - }  
62 - void  
63 - rewind() final  
64 - {  
65 - throwException();  
66 - }  
67 - size_t  
68 - read(char* buffer, size_t length) final  
69 - {  
70 - throwException();  
71 - return 0;  
72 - }  
73 - void  
74 - unreadCh(char ch) final  
75 - {  
76 - throwException();  
77 - }  
78 -  
79 - private:  
80 - void  
81 - throwException()  
82 - {  
83 - throw std::logic_error("QPDF operation attempted on a QPDF object with no input "  
84 - "source. QPDF operations are invalid before processFile (or "  
85 - "another process method) or after closeInputSource");  
86 - }  
87 -  
88 - std::string const& name;  
89 - };  
90 -} // namespace  
91 -  
92 -bool  
93 -QPDF::findStartxref()  
94 -{  
95 - if (readToken(*m->file).isWord("startxref") && readToken(*m->file).isInteger()) {  
96 - // Position in front of offset token  
97 - m->file->seek(m->file->getLastOffset(), SEEK_SET);  
98 - return true;  
99 - }  
100 - return false;  
101 -}  
102 -  
103 -void  
104 -Xref_table::initialize_empty()  
105 -{  
106 - initialized_ = true;  
107 - trailer_ = QPDFObjectHandle::newDictionary();  
108 - auto rt = qpdf.makeIndirectObject(QPDFObjectHandle::newDictionary());  
109 - auto pgs = qpdf.makeIndirectObject(QPDFObjectHandle::newDictionary());  
110 - pgs.replaceKey("/Type", QPDFObjectHandle::newName("/Pages"));  
111 - pgs.replaceKey("/Kids", QPDFObjectHandle::newArray());  
112 - pgs.replaceKey("/Count", QPDFObjectHandle::newInteger(0));  
113 - rt.replaceKey("/Type", QPDFObjectHandle::newName("/Catalog"));  
114 - rt.replaceKey("/Pages", pgs);  
115 - trailer_.replaceKey("/Root", rt);  
116 - trailer_.replaceKey("/Size", QPDFObjectHandle::newInteger(3));  
117 -}  
118 -  
119 -void  
120 -Xref_table::initialize_json()  
121 -{  
122 - initialized_ = true;  
123 - table.resize(1);  
124 - trailer_ = QPDFObjectHandle::newDictionary();  
125 - trailer_.replaceKey("/Size", QPDFObjectHandle::newInteger(1));  
126 -}  
127 -  
128 -void  
129 -Xref_table::initialize()  
130 -{  
131 - // PDF spec says %%EOF must be found within the last 1024 bytes of the file. We add an extra  
132 - // 30 characters to leave room for the startxref stuff.  
133 - file->seek(0, SEEK_END);  
134 - qpdf_offset_t end_offset = file->tell();  
135 - // Sanity check on object ids. All objects must appear in xref table / stream. In all realistic  
136 - // scenarios at least 3 bytes are required.  
137 - if (max_id_ > end_offset / 3) {  
138 - max_id_ = static_cast<int>(end_offset / 3);  
139 - }  
140 - qpdf_offset_t start_offset = (end_offset > 1054 ? end_offset - 1054 : 0);  
141 - PatternFinder sf(qpdf, &QPDF::findStartxref);  
142 - qpdf_offset_t xref_offset = 0;  
143 - if (file->findLast("startxref", start_offset, 0, sf)) {  
144 - xref_offset = QUtil::string_to_ll(read_token().getValue().c_str());  
145 - }  
146 -  
147 - try {  
148 - if (xref_offset == 0) {  
149 - QTC::TC("qpdf", "QPDF can't find startxref");  
150 - throw damaged_pdf("can't find startxref");  
151 - }  
152 - try {  
153 - read(xref_offset);  
154 - } catch (QPDFExc&) {  
155 - throw;  
156 - } catch (std::exception& e) {  
157 - throw damaged_pdf(std::string("error reading xref: ") + e.what());  
158 - }  
159 - } catch (QPDFExc& e) {  
160 - if (attempt_recovery_) {  
161 - reconstruct(e);  
162 - QTC::TC("qpdf", "QPDF reconstructed xref table");  
163 - } else {  
164 - throw;  
165 - }  
166 - }  
167 -  
168 - initialized_ = true;  
169 -}  
170 -  
171 -void  
172 -Xref_table::reconstruct(QPDFExc& e)  
173 -{  
174 - if (reconstructed_) {  
175 - // Avoid xref reconstruction infinite loops. This is getting very hard to reproduce because  
176 - // qpdf is throwing many fewer exceptions while parsing. Most situations are warnings now.  
177 - throw e;  
178 - }  
179 -  
180 - // If recovery generates more than 1000 warnings, the file is so severely damaged that there  
181 - // probably is no point trying to continue.  
182 - const auto max_warnings = qpdf.m->warnings.size() + 1000U;  
183 - auto check_warnings = [this, max_warnings]() {  
184 - if (qpdf.m->warnings.size() > max_warnings) {  
185 - throw damaged_pdf("too many errors while reconstructing cross-reference table");  
186 - }  
187 - };  
188 -  
189 - reconstructed_ = true;  
190 - // We may find more objects, which may contain dangling references.  
191 - qpdf.m->fixed_dangling_refs = false;  
192 -  
193 - warn_damaged("file is damaged");  
194 - qpdf.warn(e);  
195 - warn_damaged("Attempting to reconstruct cross-reference table");  
196 -  
197 - // Delete all references to type 1 (uncompressed) objects  
198 - for (auto& iter: table) {  
199 - if (iter.type() == 1) {  
200 - iter = {};  
201 - }  
202 - }  
203 -  
204 - std::vector<std::tuple<int, int, qpdf_offset_t>> found_objects;  
205 - std::vector<qpdf_offset_t> trailers;  
206 - int max_found = 0;  
207 -  
208 - file->seek(0, SEEK_END);  
209 - qpdf_offset_t eof = file->tell();  
210 - file->seek(0, SEEK_SET);  
211 - // Don't allow very long tokens here during recovery. All the interesting tokens are covered.  
212 - static size_t const MAX_LEN = 10;  
213 - while (file->tell() < eof) {  
214 - QPDFTokenizer::Token t1 = read_token(MAX_LEN);  
215 - qpdf_offset_t token_start = file->tell() - toO(t1.getValue().length());  
216 - if (t1.isInteger()) {  
217 - auto pos = file->tell();  
218 - QPDFTokenizer::Token t2 = read_token(MAX_LEN);  
219 - if (t2.isInteger() && read_token(MAX_LEN).isWord("obj")) {  
220 - int obj = QUtil::string_to_int(t1.getValue().c_str());  
221 - int gen = QUtil::string_to_int(t2.getValue().c_str());  
222 - if (obj <= max_id_) {  
223 - found_objects.emplace_back(obj, gen, token_start);  
224 - if (obj > max_found) {  
225 - max_found = obj;  
226 - }  
227 - } else {  
228 - warn_damaged("ignoring object with impossibly large id " + std::to_string(obj));  
229 - }  
230 - }  
231 - file->seek(pos, SEEK_SET);  
232 - } else if (!trailer_ && t1.isWord("trailer")) {  
233 - trailers.emplace_back(file->tell());  
234 - }  
235 - file->findAndSkipNextEOL();  
236 - }  
237 -  
238 - table.resize(toS(max_found) + 1);  
239 -  
240 - for (auto tr: trailers) {  
241 - file->seek(tr, SEEK_SET);  
242 - auto t = read_trailer();  
243 - if (!t.isDictionary()) {  
244 - // Oh well. It was worth a try.  
245 - } else {  
246 - trailer_ = t;  
247 - break;  
248 - }  
249 - check_warnings();  
250 - }  
251 -  
252 - auto rend = found_objects.rend();  
253 - for (auto it = found_objects.rbegin(); it != rend; it++) {  
254 - auto [obj, gen, token_start] = *it;  
255 - insert(obj, 1, token_start, gen);  
256 - check_warnings();  
257 - }  
258 -  
259 - if (!trailer_) {  
260 - qpdf_offset_t max_offset{0};  
261 - // If there are any xref streams, take the last one to appear.  
262 - int i = -1;  
263 - for (auto const& item: table) {  
264 - ++i;  
265 - if (item.type() != 1) {  
266 - continue;  
267 - }  
268 - auto oh = objects.get(i, item.gen());  
269 - try {  
270 - if (!oh.isStreamOfType("/XRef")) {  
271 - continue;  
272 - }  
273 - } catch (std::exception&) {  
274 - continue;  
275 - }  
276 - auto offset = item.offset();  
277 - if (offset > max_offset) {  
278 - max_offset = offset;  
279 - trailer_ = oh.getDict();  
280 - }  
281 - check_warnings();  
282 - }  
283 - if (max_offset > 0) {  
284 - try {  
285 - read(max_offset);  
286 - } catch (std::exception&) {  
287 - throw damaged_pdf(  
288 - "error decoding candidate xref stream while recovering damaged file");  
289 - }  
290 - QTC::TC("qpdf", "QPDF recover xref stream");  
291 - }  
292 - }  
293 -  
294 - if (!trailer_) {  
295 - // We could check the last encountered object to see if it was an xref stream. If so, we  
296 - // could try to get the trailer from there. This may make it possible to recover files with  
297 - // bad startxref pointers even when they have object streams.  
298 -  
299 - throw damaged_pdf("unable to find trailer dictionary while recovering damaged file");  
300 - }  
301 - if (table.empty()) {  
302 - // We cannot check for an empty xref table in parse because empty tables are valid when  
303 - // creating QPDF objects from JSON.  
304 - throw damaged_pdf("unable to find objects while recovering damaged file");  
305 - }  
306 - check_warnings();  
307 - if (!initialized_) {  
308 - initialized_ = true;  
309 - qpdf.getAllPages();  
310 - check_warnings();  
311 - if (qpdf.m->all_pages.empty()) {  
312 - initialized_ = false;  
313 - throw damaged_pdf("unable to find any pages while recovering damaged file");  
314 - }  
315 - }  
316 - // We could iterate through the objects looking for streams and try to find objects inside of  
317 - // them, but it's probably not worth the trouble. Acrobat can't recover files with any errors  
318 - // in an xref stream, and this would be a real long shot anyway. If we wanted to do anything  
319 - // that involved looking at stream contents, we'd also have to call initializeEncryption() here.  
320 - // It's safe to call it more than once.  
321 -}  
322 -  
323 -void  
324 -Xref_table::read(qpdf_offset_t xref_offset)  
325 -{  
326 - std::map<int, int> free_table;  
327 - std::set<qpdf_offset_t> visited;  
328 - while (xref_offset) {  
329 - visited.insert(xref_offset);  
330 - char buf[7];  
331 - memset(buf, 0, sizeof(buf));  
332 - file->seek(xref_offset, SEEK_SET);  
333 - // Some files miss the mark a little with startxref. We could do a better job of searching  
334 - // in the neighborhood for something that looks like either an xref table or stream, but the  
335 - // simple heuristic of skipping whitespace can help with the xref table case and is harmless  
336 - // with the stream case.  
337 - bool done = false;  
338 - bool skipped_space = false;  
339 - while (!done) {  
340 - char ch;  
341 - if (1 == file->read(&ch, 1)) {  
342 - if (QUtil::is_space(ch)) {  
343 - skipped_space = true;  
344 - } else {  
345 - file->unreadCh(ch);  
346 - done = true;  
347 - }  
348 - } else {  
349 - QTC::TC("qpdf", "QPDF eof skipping spaces before xref", skipped_space ? 0 : 1);  
350 - done = true;  
351 - }  
352 - }  
353 -  
354 - file->read(buf, sizeof(buf) - 1);  
355 - // The PDF spec says xref must be followed by a line terminator, but files exist in the wild  
356 - // where it is terminated by arbitrary whitespace.  
357 - if ((strncmp(buf, "xref", 4) == 0) && QUtil::is_space(buf[4])) {  
358 - if (skipped_space) {  
359 - QTC::TC("qpdf", "QPDF xref skipped space");  
360 - warn_damaged("extraneous whitespace seen before xref");  
361 - }  
362 - QTC::TC(  
363 - "qpdf",  
364 - "QPDF xref space",  
365 - ((buf[4] == '\n') ? 0  
366 - : (buf[4] == '\r') ? 1  
367 - : (buf[4] == ' ') ? 2  
368 - : 9999));  
369 - int skip = 4;  
370 - // buf is null-terminated, and QUtil::is_space('\0') is false, so this won't overrun.  
371 - while (QUtil::is_space(buf[skip])) {  
372 - ++skip;  
373 - }  
374 - xref_offset = process_section(xref_offset + skip);  
375 - } else {  
376 - xref_offset = read_stream(xref_offset);  
377 - }  
378 - if (visited.count(xref_offset) != 0) {  
379 - QTC::TC("qpdf", "QPDF xref loop");  
380 - throw damaged_pdf("loop detected following xref tables");  
381 - }  
382 - }  
383 -  
384 - if (!trailer_) {  
385 - throw damaged_pdf("unable to find trailer while reading xref");  
386 - }  
387 - int size = trailer_.getKey("/Size").getIntValueAsInt();  
388 -  
389 - if (size < 3) {  
390 - throw damaged_pdf("too few objects - file can't have a page tree");  
391 - }  
392 -  
393 - // We are no longer reporting what the highest id in the xref table is. I don't think it adds  
394 - // anything. If we want to report more detail, we should report the total number of missing  
395 - // entries, including missing entries before the last actual entry.  
396 -}  
397 -  
398 -Xref_table::Subsection  
399 -Xref_table::subsection(std::string const& line)  
400 -{  
401 - auto terminate = [this]() -> void {  
402 - QTC::TC("qpdf", "QPDF invalid xref");  
403 - throw damaged_table("xref syntax invalid");  
404 - };  
405 -  
406 - // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated  
407 - // buffer.  
408 - char const* p = line.c_str();  
409 - char const* start = line.c_str();  
410 -  
411 - // Skip zero or more spaces  
412 - while (QUtil::is_space(*p)) {  
413 - ++p;  
414 - }  
415 - // Require digit  
416 - if (!QUtil::is_digit(*p)) {  
417 - terminate();  
418 - }  
419 - // Gather digits  
420 - std::string obj_str;  
421 - while (QUtil::is_digit(*p)) {  
422 - obj_str.append(1, *p++);  
423 - }  
424 - // Require space  
425 - if (!QUtil::is_space(*p)) {  
426 - terminate();  
427 - }  
428 - // Skip spaces  
429 - while (QUtil::is_space(*p)) {  
430 - ++p;  
431 - }  
432 - // Require digit  
433 - if (!QUtil::is_digit(*p)) {  
434 - terminate();  
435 - }  
436 - // Gather digits  
437 - std::string num_str;  
438 - while (QUtil::is_digit(*p)) {  
439 - num_str.append(1, *p++);  
440 - }  
441 - // Skip any space including line terminators  
442 - while (QUtil::is_space(*p)) {  
443 - ++p;  
444 - }  
445 - auto obj = QUtil::string_to_int(obj_str.c_str());  
446 - auto count = QUtil::string_to_int(num_str.c_str());  
447 - if (obj > max_id() || count > max_id() || (obj + count) > max_id()) {  
448 - throw damaged_table("xref table subsection header contains impossibly large entry");  
449 - }  
450 - return {obj, count, file->getLastOffset() + toI(p - start)};  
451 -}  
452 -  
453 -std::vector<Xref_table::Subsection>  
454 -Xref_table::bad_subsections(std::string& line, qpdf_offset_t start)  
455 -{  
456 - std::vector<Xref_table::Subsection> result;  
457 - file->seek(start, SEEK_SET);  
458 -  
459 - while (true) {  
460 - line.assign(50, '\0');  
461 - file->read(line.data(), line.size());  
462 - auto [obj, num, offset] = result.emplace_back(subsection(line));  
463 - file->seek(offset, SEEK_SET);  
464 - for (qpdf_offset_t i = obj; i - num < obj; ++i) {  
465 - if (!std::get<0>(read_entry())) {  
466 - QTC::TC("qpdf", "QPDF invalid xref entry");  
467 - throw damaged_table("invalid xref entry (obj=" + std::to_string(i) + ")");  
468 - }  
469 - }  
470 - qpdf_offset_t pos = file->tell();  
471 - if (read_token().isWord("trailer")) {  
472 - return result;  
473 - } else {  
474 - file->seek(pos, SEEK_SET);  
475 - }  
476 - }  
477 -}  
478 -  
479 -// Optimistically read and parse all subsection headers. If an error is encountered return the  
480 -// result of bad_subsections.  
481 -std::vector<Xref_table::Subsection>  
482 -Xref_table::subsections(std::string& line)  
483 -{  
484 - auto recovery_offset = file->tell();  
485 - try {  
486 - std::vector<Xref_table::Subsection> result;  
487 -  
488 - while (true) {  
489 - line.assign(50, '\0');  
490 - file->read(line.data(), line.size());  
491 - auto& sub = result.emplace_back(subsection(line));  
492 - auto count = std::get<1>(sub);  
493 - auto offset = std::get<2>(sub);  
494 - file->seek(offset + 20 * toO(count) - 1, SEEK_SET);  
495 - file->read(line.data(), 1);  
496 - if (!(line[0] == '\n' || line[0] == '\n')) {  
497 - return bad_subsections(line, recovery_offset);  
498 - }  
499 - qpdf_offset_t pos = file->tell();  
500 - if (read_token().isWord("trailer")) {  
501 - return result;  
502 - } else {  
503 - file->seek(pos, SEEK_SET);  
504 - }  
505 - }  
506 - } catch (...) {  
507 - return bad_subsections(line, recovery_offset);  
508 - }  
509 -}  
510 -  
511 -// Returns (success, f1, f2, type).  
512 -std::tuple<bool, qpdf_offset_t, int, char>  
513 -Xref_table::read_bad_entry()  
514 -{  
515 - qpdf_offset_t f1{0};  
516 - int f2{0};  
517 - char type{'\0'};  
518 - // Reposition after initial read attempt and reread.  
519 - file->seek(file->getLastOffset(), SEEK_SET);  
520 - auto line = file->readLine(30);  
521 -  
522 - // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated  
523 - // buffer.  
524 - char const* p = line.data();  
525 -  
526 - // Skip zero or more spaces. There aren't supposed to be any.  
527 - bool invalid = false;  
528 - while (QUtil::is_space(*p)) {  
529 - ++p;  
530 - QTC::TC("qpdf", "QPDF ignore first space in xref entry");  
531 - invalid = true;  
532 - }  
533 - // Require digit  
534 - if (!QUtil::is_digit(*p)) {  
535 - return {false, 0, 0, '\0'};  
536 - }  
537 - // Gather digits  
538 - std::string f1_str;  
539 - while (QUtil::is_digit(*p)) {  
540 - f1_str.append(1, *p++);  
541 - }  
542 - // Require space  
543 - if (!QUtil::is_space(*p)) {  
544 - return {false, 0, 0, '\0'};  
545 - }  
546 - if (QUtil::is_space(*(p + 1))) {  
547 - QTC::TC("qpdf", "QPDF ignore first extra space in xref entry");  
548 - invalid = true;  
549 - }  
550 - // Skip spaces  
551 - while (QUtil::is_space(*p)) {  
552 - ++p;  
553 - }  
554 - // Require digit  
555 - if (!QUtil::is_digit(*p)) {  
556 - return {false, 0, 0, '\0'};  
557 - }  
558 - // Gather digits  
559 - std::string f2_str;  
560 - while (QUtil::is_digit(*p)) {  
561 - f2_str.append(1, *p++);  
562 - }  
563 - // Require space  
564 - if (!QUtil::is_space(*p)) {  
565 - return {false, 0, 0, '\0'};  
566 - }  
567 - if (QUtil::is_space(*(p + 1))) {  
568 - QTC::TC("qpdf", "QPDF ignore second extra space in xref entry");  
569 - invalid = true;  
570 - }  
571 - // Skip spaces  
572 - while (QUtil::is_space(*p)) {  
573 - ++p;  
574 - }  
575 - if ((*p == 'f') || (*p == 'n')) {  
576 - type = *p;  
577 - } else {  
578 - return {false, 0, 0, '\0'};  
579 - }  
580 - if ((f1_str.length() != 10) || (f2_str.length() != 5)) {  
581 - QTC::TC("qpdf", "QPDF ignore length error xref entry");  
582 - invalid = true;  
583 - }  
584 -  
585 - if (invalid) {  
586 - qpdf.warn(damaged_table("accepting invalid xref table entry"));  
587 - }  
588 -  
589 - f1 = QUtil::string_to_ll(f1_str.c_str());  
590 - f2 = QUtil::string_to_int(f2_str.c_str());  
591 -  
592 - return {true, f1, f2, type};  
593 -}  
594 -  
595 -// Optimistically read and parse xref entry. If entry is bad, call read_bad_xrefEntry and return  
596 -// result. Returns (success, f1, f2, type).  
597 -std::tuple<bool, qpdf_offset_t, int, char>  
598 -Xref_table::read_entry()  
599 -{  
600 - qpdf_offset_t f1{0};  
601 - int f2{0};  
602 - char type{'\0'};  
603 - std::array<char, 21> line;  
604 - f1 = 0;  
605 - f2 = 0;  
606 - if (file->read(line.data(), 20) != 20) {  
607 - // C++20: [[unlikely]]  
608 - return {false, 0, 0, '\0'};  
609 - }  
610 - line[20] = '\0';  
611 - char const* p = line.data();  
612 -  
613 - int f1_len = 0;  
614 - int f2_len = 0;  
615 -  
616 - // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated  
617 - // buffer.  
618 -  
619 - // Gather f1 digits. NB No risk of overflow as 9'999'999'999 < max long long.  
620 - while (*p == '0') {  
621 - ++f1_len;  
622 - ++p;  
623 - }  
624 - while (QUtil::is_digit(*p) && f1_len++ < 10) {  
625 - f1 *= 10;  
626 - f1 += *p++ - '0';  
627 - }  
628 - // Require space  
629 - if (!QUtil::is_space(*p++)) {  
630 - // Entry doesn't start with space or digit.  
631 - // C++20: [[unlikely]]  
632 - return {false, 0, 0, '\0'};  
633 - }  
634 - // Gather digits. NB No risk of overflow as 99'999 < max int.  
635 - while (*p == '0') {  
636 - ++f2_len;  
637 - ++p;  
638 - }  
639 - while (QUtil::is_digit(*p) && f2_len++ < 5) {  
640 - f2 *= 10;  
641 - f2 += static_cast<int>(*p++ - '0');  
642 - }  
643 - if (QUtil::is_space(*p++) && (*p == 'f' || *p == 'n')) {  
644 - // C++20: [[likely]]  
645 - type = *p;  
646 - // No test for valid line[19].  
647 - if (*(++p) && *(++p) && (*p == '\n' || *p == '\r') && f1_len == 10 && f2_len == 5) {  
648 - // C++20: [[likely]]  
649 - return {true, f1, f2, type};  
650 - }  
651 - }  
652 - return read_bad_entry();  
653 -}  
654 -  
655 -// Read a single cross-reference table section and associated trailer.  
656 -qpdf_offset_t  
657 -Xref_table::process_section(qpdf_offset_t xref_offset)  
658 -{  
659 - file->seek(xref_offset, SEEK_SET);  
660 - std::string line;  
661 - auto subs = subsections(line);  
662 -  
663 - auto cur_trailer_offset = file->tell();  
664 - auto cur_trailer = read_trailer();  
665 - if (!cur_trailer.isDictionary()) {  
666 - QTC::TC("qpdf", "QPDF missing trailer");  
667 - throw qpdf.damagedPDF("", "expected trailer dictionary");  
668 - }  
669 -  
670 - if (!trailer_) {  
671 - unsigned int sz;  
672 - trailer_ = cur_trailer;  
673 -  
674 - if (!trailer_.hasKey("/Size")) {  
675 - QTC::TC("qpdf", "QPDF trailer lacks size");  
676 - throw qpdf.damagedPDF("trailer", "trailer dictionary lacks /Size key");  
677 - }  
678 - if (!trailer_.getKey("/Size").getValueAsUInt(sz)) {  
679 - QTC::TC("qpdf", "QPDF trailer size not integer");  
680 - throw qpdf.damagedPDF("trailer", "/Size key in trailer dictionary is not an integer");  
681 - }  
682 - if (sz >= static_cast<unsigned int>(max_id_)) {  
683 - QTC::TC("qpdf", "QPDF trailer size impossibly large");  
684 - throw qpdf.damagedPDF("trailer", "/Size key in trailer dictionary is impossibly large");  
685 - }  
686 - table.resize(sz);  
687 - }  
688 -  
689 - for (auto [obj, num, offset]: subs) {  
690 - file->seek(offset, SEEK_SET);  
691 - for (qpdf_offset_t i = obj; i - num < obj; ++i) {  
692 - if (i == 0) {  
693 - // This is needed by checkLinearization()  
694 - first_item_offset_ = file->tell();  
695 - }  
696 - // For xref_table, these will always be small enough to be ints  
697 - auto [success, f1, f2, type] = read_entry();  
698 - if (!success) {  
699 - throw damaged_table("invalid xref entry (obj=" + std::to_string(i) + ")");  
700 - }  
701 - if (type == 'f') {  
702 - insert_free(QPDFObjGen(toI(i), f2));  
703 - } else {  
704 - insert(toI(i), 1, f1, f2);  
705 - }  
706 - }  
707 - qpdf_offset_t pos = file->tell();  
708 - if (read_token().isWord("trailer")) {  
709 - break;  
710 - } else {  
711 - file->seek(pos, SEEK_SET);  
712 - }  
713 - }  
714 -  
715 - if (cur_trailer.hasKey("/XRefStm")) {  
716 - if (ignore_streams_) {  
717 - QTC::TC("qpdf", "QPDF ignoring XRefStm in trailer");  
718 - } else {  
719 - if (cur_trailer.getKey("/XRefStm").isInteger()) {  
720 - // Read the xref stream but disregard any return value -- we'll use our trailer's  
721 - // /Prev key instead of the xref stream's.  
722 - (void)read_stream(cur_trailer.getKey("/XRefStm").getIntValue());  
723 - } else {  
724 - throw qpdf.damagedPDF("xref stream", cur_trailer_offset, "invalid /XRefStm");  
725 - }  
726 - }  
727 - }  
728 -  
729 - if (cur_trailer.hasKey("/Prev")) {  
730 - if (!cur_trailer.getKey("/Prev").isInteger()) {  
731 - QTC::TC("qpdf", "QPDF trailer prev not integer");  
732 - throw qpdf.damagedPDF(  
733 - "trailer", cur_trailer_offset, "/Prev key in trailer dictionary is not an integer");  
734 - }  
735 - QTC::TC("qpdf", "QPDF prev key in trailer dictionary");  
736 - return cur_trailer.getKey("/Prev").getIntValue();  
737 - }  
738 -  
739 - return 0;  
740 -}  
741 -  
742 -// Read a single cross-reference stream.  
743 -qpdf_offset_t  
744 -Xref_table::read_stream(qpdf_offset_t xref_offset)  
745 -{  
746 - if (!ignore_streams_) {  
747 - QPDFObjGen x_og;  
748 - QPDFObjectHandle xref_obj;  
749 - try {  
750 - xref_obj =  
751 - objects.read(false, xref_offset, "xref stream", QPDFObjGen(0, 0), x_og, true);  
752 - } catch (QPDFExc&) {  
753 - // ignore -- report error below  
754 - }  
755 - if (xref_obj.isStreamOfType("/XRef")) {  
756 - QTC::TC("qpdf", "QPDF found xref stream");  
757 - return process_stream(xref_offset, xref_obj);  
758 - }  
759 - }  
760 -  
761 - QTC::TC("qpdf", "QPDF can't find xref");  
762 - throw qpdf.damagedPDF("", xref_offset, "xref not found");  
763 - return 0; // unreachable  
764 -}  
765 -  
766 -// Return the entry size of the xref stream and the processed W array.  
767 -std::pair<int, std::array<int, 3>>  
768 -Xref_table::process_W(QPDFObjectHandle& dict, std::function<QPDFExc(std::string_view)> damaged)  
769 -{  
770 - auto W_obj = dict.getKey("/W");  
771 - if (!(W_obj.isArray() && W_obj.getArrayNItems() >= 3 && W_obj.getArrayItem(0).isInteger() &&  
772 - W_obj.getArrayItem(1).isInteger() && W_obj.getArrayItem(2).isInteger())) {  
773 - throw damaged("Cross-reference stream does not have a proper /W key");  
774 - }  
775 -  
776 - std::array<int, 3> W;  
777 - int entry_size = 0;  
778 - auto w_vector = W_obj.getArrayAsVector();  
779 - int max_bytes = sizeof(qpdf_offset_t);  
780 - for (size_t i = 0; i < 3; ++i) {  
781 - W[i] = w_vector[i].getIntValueAsInt();  
782 - if (W[i] > max_bytes) {  
783 - throw damaged("Cross-reference stream's /W contains impossibly large values");  
784 - }  
785 - if (W[i] < 0) {  
786 - throw damaged("Cross-reference stream's /W contains negative values");  
787 - }  
788 - entry_size += W[i];  
789 - }  
790 - if (entry_size == 0) {  
791 - throw damaged("Cross-reference stream's /W indicates entry size of 0");  
792 - }  
793 - return {entry_size, W};  
794 -}  
795 -  
796 -// Validate Size entry and return the maximum number of entries that the xref stream can contain and  
797 -// the value of the Size entry.  
798 -std::pair<int, size_t>  
799 -Xref_table::process_Size(  
800 - QPDFObjectHandle& dict, int entry_size, std::function<QPDFExc(std::string_view)> damaged)  
801 -{  
802 - // Number of entries is limited by the highest possible object id and stream size.  
803 - auto max_num_entries = std::numeric_limits<int>::max();  
804 - if (max_num_entries > (std::numeric_limits<qpdf_offset_t>::max() / entry_size)) {  
805 - max_num_entries = toI(std::numeric_limits<qpdf_offset_t>::max() / entry_size);  
806 - }  
807 -  
808 - auto Size_obj = dict.getKey("/Size");  
809 - long long size;  
810 - if (!dict.getKey("/Size").getValueAsInt(size)) {  
811 - throw damaged("Cross-reference stream does not have a proper /Size key");  
812 - } else if (size < 0) {  
813 - throw damaged("Cross-reference stream has a negative /Size key");  
814 - } else if (size >= max_num_entries) {  
815 - throw damaged("Cross-reference stream has an impossibly large /Size key");  
816 - }  
817 - // We are not validating that Size <= (Size key of parent xref / trailer).  
818 - return {max_num_entries, toS(size)};  
819 -}  
820 -  
821 -// Return the number of entries of the xref stream and the processed Index array.  
822 -std::pair<int, std::vector<std::pair<int, int>>>  
823 -Xref_table::process_Index(  
824 - QPDFObjectHandle& dict, int max_num_entries, std::function<QPDFExc(std::string_view)> damaged)  
825 -{  
826 - auto size = dict.getKey("/Size").getIntValueAsInt();  
827 - auto Index_obj = dict.getKey("/Index");  
828 -  
829 - if (Index_obj.isArray()) {  
830 - std::vector<std::pair<int, int>> indx;  
831 - int num_entries = 0;  
832 - auto index_vec = Index_obj.getArrayAsVector();  
833 - if ((index_vec.size() % 2) || index_vec.size() < 2) {  
834 - throw damaged("Cross-reference stream's /Index has an invalid number of values");  
835 - }  
836 -  
837 - int i = 0;  
838 - long long first = 0;  
839 - for (auto& val: index_vec) {  
840 - if (val.isInteger()) {  
841 - if (i % 2) {  
842 - auto count = val.getIntValue();  
843 - if (count <= 0) {  
844 - throw damaged(  
845 - "Cross-reference stream section claims to contain " +  
846 - std::to_string(count) + " entries");  
847 - }  
848 - // We are guarding against the possibility of num_entries * entry_size  
849 - // overflowing. We are not checking that entries are in ascending order as  
850 - // required by the spec, which probably should generate a warning. We are also  
851 - // not checking that for each subsection first object number + number of entries  
852 - // <= /Size. The spec requires us to ignore object number > /Size.  
853 - if (first > (max_num_entries - count) ||  
854 - count > (max_num_entries - num_entries)) {  
855 - throw damaged(  
856 - "Cross-reference stream claims to contain too many entries: " +  
857 - std::to_string(first) + " " + std::to_string(max_num_entries) + " " +  
858 - std::to_string(num_entries));  
859 - }  
860 - indx.emplace_back(static_cast<int>(first), static_cast<int>(count));  
861 - num_entries += static_cast<int>(count);  
862 - } else {  
863 - first = val.getIntValue();  
864 - if (first < 0) {  
865 - throw damaged(  
866 - "Cross-reference stream's /Index contains a negative object id");  
867 - } else if (first > max_num_entries) {  
868 - throw damaged("Cross-reference stream's /Index contains an impossibly "  
869 - "large object id");  
870 - }  
871 - }  
872 - } else {  
873 - throw damaged(  
874 - "Cross-reference stream's /Index's item " + std::to_string(i) +  
875 - " is not an integer");  
876 - }  
877 - i++;  
878 - }  
879 - QTC::TC("qpdf", "QPDF xref /Index is array", index_vec.size() == 2 ? 0 : 1);  
880 - return {num_entries, indx};  
881 - } else if (Index_obj.isNull()) {  
882 - QTC::TC("qpdf", "QPDF xref /Index is null");  
883 - return {size, {{0, size}}};  
884 - } else {  
885 - throw damaged("Cross-reference stream does not have a proper /Index key");  
886 - }  
887 -}  
888 -  
889 -qpdf_offset_t  
890 -Xref_table::process_stream(qpdf_offset_t xref_offset, QPDFObjectHandle& xref_obj)  
891 -{  
892 - auto damaged = [this, xref_offset](std::string_view msg) -> QPDFExc {  
893 - return qpdf.damagedPDF("xref stream", xref_offset, msg.data());  
894 - };  
895 -  
896 - auto dict = xref_obj.getDict();  
897 -  
898 - auto [entry_size, W] = process_W(dict, damaged);  
899 - auto [max_num_entries, size] = process_Size(dict, entry_size, damaged);  
900 - auto [num_entries, indx] = process_Index(dict, max_num_entries, damaged);  
901 -  
902 - std::shared_ptr<Buffer> bp = xref_obj.getStreamData(qpdf_dl_specialized);  
903 - size_t actual_size = bp->getSize();  
904 - auto expected_size = toS(entry_size) * toS(num_entries);  
905 -  
906 - if (expected_size != actual_size) {  
907 - QPDFExc x = damaged(  
908 - "Cross-reference stream data has the wrong size; expected = " +  
909 - std::to_string(expected_size) + "; actual = " + std::to_string(actual_size));  
910 - if (expected_size > actual_size) {  
911 - throw x;  
912 - } else {  
913 - qpdf.warn(x);  
914 - }  
915 - }  
916 -  
917 - if (!trailer_) {  
918 - trailer_ = dict;  
919 - if (size > toS(max_id_)) {  
920 - throw damaged("Cross-reference stream /Size entry is impossibly large");  
921 - }  
922 - table.resize(size);  
923 - }  
924 -  
925 - bool saw_first_compressed_object = false;  
926 -  
927 - // Actual size vs. expected size check above ensures that we will not overflow any buffers here.  
928 - // We know that entry_size * num_entries is less or equal to the size of the buffer.  
929 - auto p = bp->getBuffer();  
930 - for (auto [obj, sec_entries]: indx) {  
931 - // Process a subsection.  
932 - for (int i = 0; i < sec_entries; ++i) {  
933 - // Read this entry  
934 - std::array<qpdf_offset_t, 3> fields{};  
935 - if (W[0] == 0) {  
936 - QTC::TC("qpdf", "QPDF default for xref stream field 0");  
937 - fields[0] = 1;  
938 - }  
939 - for (size_t j = 0; j < 3; ++j) {  
940 - for (int k = 0; k < W[j]; ++k) {  
941 - fields[j] <<= 8;  
942 - fields[j] |= *p++;  
943 - }  
944 - }  
945 -  
946 - // Get the generation number. The generation number is 0 unless this is an uncompressed  
947 - // object record, in which case the generation number appears as the third field.  
948 - if (saw_first_compressed_object) {  
949 - if (fields[0] != 2) {  
950 - uncompressed_after_compressed_ = true;  
951 - }  
952 - } else if (fields[0] == 2) {  
953 - saw_first_compressed_object = true;  
954 - }  
955 - if (obj == 0) {  
956 - // This is needed by checkLinearization()  
957 - first_item_offset_ = xref_offset;  
958 - } else if (fields[0] == 0) {  
959 - // Ignore fields[2], which we don't care about in this case. This works around the  
960 - // issue of some PDF files that put invalid values, like -1, here for deleted  
961 - // objects.  
962 - insert_free(QPDFObjGen(obj, 0));  
963 - } else {  
964 - insert(obj, toI(fields[0]), fields[1], toI(fields[2]));  
965 - }  
966 - ++obj;  
967 - }  
968 - }  
969 -  
970 - if (dict.hasKey("/Prev")) {  
971 - if (!dict.getKey("/Prev").isInteger()) {  
972 - throw qpdf.damagedPDF(  
973 - "xref stream", "/Prev key in xref stream dictionary is not an integer");  
974 - }  
975 - QTC::TC("qpdf", "QPDF prev key in xref stream dictionary");  
976 - return dict.getKey("/Prev").getIntValue();  
977 - } else {  
978 - return 0;  
979 - }  
980 -}  
981 -  
982 -void  
983 -Xref_table::insert(int obj, int f0, qpdf_offset_t f1, int f2)  
984 -{  
985 - // Populate the xref table in such a way that the first reference to an object that we see,  
986 - // which is the one in the latest xref table in which it appears, is the one that gets stored.  
987 - // This works because we are reading more recent appends before older ones.  
988 -  
989 - // If there is already an entry for this object and generation in the table, it means that a  
990 - // later xref table has registered this object. Disregard this one.  
991 -  
992 - int new_gen = f0 == 2 ? 0 : f2;  
993 -  
994 - if (!(obj > 0 && static_cast<size_t>(obj) < table.size() && 0 <= f2 && new_gen < 65535)) {  
995 - // We are ignoring invalid objgens. Most will arrive here from xref reconstruction. There  
996 - // is probably no point having another warning but we could count invalid items in order to  
997 - // decide when to give up.  
998 - QTC::TC("qpdf", "QPDF xref overwrite invalid objgen");  
999 - return;  
1000 - }  
1001 -  
1002 - auto& entry = table[static_cast<size_t>(obj)];  
1003 - auto old_type = entry.type();  
1004 -  
1005 - if (!old_type && entry.gen() > 0) {  
1006 - // At the moment we are processing the updates last to first and therefore the gen doesn't  
1007 - // matter as long as it > 0 to distinguish it from an uninitialized entry. This will need  
1008 - // to be revisited when we want to support incremental updates or more comprehensive  
1009 - // checking.  
1010 - QTC::TC("qpdf", "QPDF xref deleted object");  
1011 - return;  
1012 - }  
1013 -  
1014 - if (f0 == 2 && static_cast<int>(f1) == obj) {  
1015 - qpdf.warn(qpdf.damagedPDF(  
1016 - "xref stream", "self-referential object stream " + std::to_string(obj)));  
1017 - return;  
1018 - }  
1019 -  
1020 - if (old_type && entry.gen() >= new_gen) {  
1021 - QTC::TC("qpdf", "QPDF xref reused object");  
1022 - return;  
1023 - }  
1024 -  
1025 - switch (f0) {  
1026 - case 1:  
1027 - // f2 is generation  
1028 - QTC::TC("qpdf", "QPDF xref gen > 0", (f2 > 0) ? 1 : 0);  
1029 - entry = {f2, Uncompressed(f1)};  
1030 - break;  
1031 -  
1032 - case 2:  
1033 - entry = {0, Compressed(toI(f1), f2)};  
1034 - object_streams_ = true;  
1035 - break;  
1036 -  
1037 - default:  
1038 - throw qpdf.damagedPDF(  
1039 - "xref stream", "unknown xref stream entry type " + std::to_string(f0));  
1040 - break;  
1041 - }  
1042 -}  
1043 -  
1044 -void  
1045 -Xref_table::insert_free(QPDFObjGen og)  
1046 -{  
1047 - // At the moment we are processing the updates last to first and therefore the gen doesn't  
1048 - // matter as long as it > 0 to distinguish it from an uninitialized entry. This will need to be  
1049 - // revisited when we want to support incremental updates or more comprehensive checking.  
1050 - if (og.getObj() < 1) {  
1051 - return;  
1052 - }  
1053 - size_t id = static_cast<size_t>(og.getObj());  
1054 - if (id < table.size() && !type(id)) {  
1055 - table[id] = {1, {}};  
1056 - }  
1057 -}  
1058 -  
1059 -QPDFObjGen  
1060 -Xref_table::at_offset(qpdf_offset_t offset) const noexcept  
1061 -{  
1062 - int id = 0;  
1063 - int gen = 0;  
1064 - qpdf_offset_t start = 0;  
1065 -  
1066 - int i = 0;  
1067 - for (auto const& item: table) {  
1068 - auto o = item.offset();  
1069 - if (start < o && o <= offset) {  
1070 - start = o;  
1071 - id = i;  
1072 - gen = item.gen();  
1073 - }  
1074 - ++i;  
1075 - }  
1076 - return QPDFObjGen(id, gen);  
1077 -}  
1078 -  
1079 -std::map<QPDFObjGen, QPDFXRefEntry>  
1080 -Xref_table::as_map() const  
1081 -{  
1082 - std::map<QPDFObjGen, QPDFXRefEntry> result;  
1083 - int i{0};  
1084 - for (auto const& item: table) {  
1085 - switch (item.type()) {  
1086 - case 0:  
1087 - break;  
1088 - case 1:  
1089 - result.emplace(QPDFObjGen(i, item.gen()), item.offset());  
1090 - break;  
1091 - case 2:  
1092 - result.emplace(  
1093 - QPDFObjGen(i, 0), QPDFXRefEntry(item.stream_number(), item.stream_index()));  
1094 - break;  
1095 - default:  
1096 - throw std::logic_error("Xref_table: invalid entry type");  
1097 - }  
1098 - ++i;  
1099 - }  
1100 - return result;  
1101 -}  
1102 -  
1103 -void  
1104 -Xref_table::show()  
1105 -{  
1106 - auto& cout = *qpdf.m->log->getInfo();  
1107 - int i = -1;  
1108 - for (auto const& item: table) {  
1109 - ++i;  
1110 - if (item.type()) {  
1111 - cout << std::to_string(i) << "/" << std::to_string(item.gen()) << ": ";  
1112 - switch (item.type()) {  
1113 - case 1:  
1114 - cout << "uncompressed; offset = " << item.offset() << "\n";  
1115 - break;  
1116 -  
1117 - case 2:  
1118 - cout << "compressed; stream = " << item.stream_number()  
1119 - << ", index = " << item.stream_index() << "\n";  
1120 - break;  
1121 -  
1122 - default:  
1123 - throw std::logic_error(  
1124 - "unknown cross-reference table type while showing xref_table");  
1125 - }  
1126 - }  
1127 - }  
1128 -}  
1129 -  
1130 -// Resolve all objects in the xref table. If this triggers a xref table reconstruction abort and  
1131 -// return false. Otherwise return true.  
1132 -bool  
1133 -Xref_table::resolve()  
1134 -{  
1135 - bool may_change = !reconstructed_;  
1136 - int i = -1;  
1137 - for (auto& item: table) {  
1138 - ++i;  
1139 - if (item.type()) {  
1140 - if (objects.unresolved(QPDFObjGen(i, item.gen()))) {  
1141 - objects.resolve(QPDFObjGen(i, item.gen()));  
1142 - if (may_change && reconstructed_) {  
1143 - return false;  
1144 - }  
1145 - }  
1146 - }  
1147 - }  
1148 - return true;  
1149 -}  
1150 -  
1151 -std::vector<QPDFObjectHandle>  
1152 -Objects ::all()  
1153 -{  
1154 - // After fixDanglingReferences is called, all objects are in the object cache.  
1155 - qpdf.fixDanglingReferences();  
1156 - std::vector<QPDFObjectHandle> result;  
1157 - for (auto const& iter: table) {  
1158 - result.emplace_back(iter.second.object);  
1159 - }  
1160 - return result;  
1161 -}  
1162 -  
1163 -QPDFObjectHandle  
1164 -Xref_table::read_trailer()  
1165 -{  
1166 - qpdf_offset_t offset = file->tell();  
1167 - bool empty = false;  
1168 - auto object = QPDFParser(*file, "trailer", tokenizer, nullptr, &qpdf, true).parse(empty, false);  
1169 - if (empty) {  
1170 - // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in  
1171 - // actual PDF files and Adobe Reader appears to ignore them.  
1172 - qpdf.warn(qpdf.damagedPDF("trailer", "empty object treated as null"));  
1173 - } else if (object.isDictionary() && read_token().isWord("stream")) {  
1174 - qpdf.warn(qpdf.damagedPDF("trailer", file->tell(), "stream keyword found in trailer"));  
1175 - }  
1176 - // Override last_offset so that it points to the beginning of the object we just read  
1177 - file->setLastOffset(offset);  
1178 - return object;  
1179 -}  
1180 -  
1181 -QPDFObjectHandle  
1182 -Objects::read_object(std::string const& description, QPDFObjGen og)  
1183 -{  
1184 - qpdf.setLastObjectDescription(description, og);  
1185 - qpdf_offset_t offset = m->file->tell();  
1186 - bool empty = false;  
1187 -  
1188 - StringDecrypter decrypter{&qpdf, og};  
1189 - StringDecrypter* decrypter_ptr = m->encp->encrypted ? &decrypter : nullptr;  
1190 - auto object =  
1191 - QPDFParser(*m->file, m->last_object_description, m->tokenizer, decrypter_ptr, &qpdf, true)  
1192 - .parse(empty, false);  
1193 - if (empty) {  
1194 - // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in  
1195 - // actual PDF files and Adobe Reader appears to ignore them.  
1196 - qpdf.warn(  
1197 - qpdf.damagedPDF(*m->file, m->file->getLastOffset(), "empty object treated as null"));  
1198 - return object;  
1199 - }  
1200 - auto token = qpdf.readToken(*m->file);  
1201 - if (object.isDictionary() && token.isWord("stream")) {  
1202 - read_stream(object, og, offset);  
1203 - token = qpdf.readToken(*m->file);  
1204 - }  
1205 - if (!token.isWord("endobj")) {  
1206 - QTC::TC("qpdf", "QPDF err expected endobj");  
1207 - qpdf.warn(qpdf.damagedPDF("expected endobj"));  
1208 - }  
1209 - return object;  
1210 -}  
1211 -  
1212 -// After reading stream dictionary and stream keyword, read rest of stream.  
1213 -void  
1214 -Objects::read_stream(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset)  
1215 -{  
1216 - validate_stream_line_end(object, og, offset);  
1217 -  
1218 - // Must get offset before accessing any additional objects since resolving a previously  
1219 - // unresolved indirect object will change file position.  
1220 - qpdf_offset_t stream_offset = m->file->tell();  
1221 - size_t length = 0;  
1222 -  
1223 - try {  
1224 - auto length_obj = object.getKey("/Length");  
1225 -  
1226 - if (!length_obj.isInteger()) {  
1227 - if (length_obj.isNull()) {  
1228 - QTC::TC("qpdf", "QPDF stream without length");  
1229 - throw qpdf.damagedPDF(offset, "stream dictionary lacks /Length key");  
1230 - }  
1231 - QTC::TC("qpdf", "QPDF stream length not integer");  
1232 - throw qpdf.damagedPDF(offset, "/Length key in stream dictionary is not an integer");  
1233 - }  
1234 -  
1235 - length = toS(length_obj.getUIntValue());  
1236 - // Seek in two steps to avoid potential integer overflow  
1237 - m->file->seek(stream_offset, SEEK_SET);  
1238 - m->file->seek(toO(length), SEEK_CUR);  
1239 - if (!qpdf.readToken(*m->file).isWord("endstream")) {  
1240 - QTC::TC("qpdf", "QPDF missing endstream");  
1241 - throw qpdf.damagedPDF("expected endstream");  
1242 - }  
1243 - } catch (QPDFExc& e) {  
1244 - if (m->attempt_recovery) {  
1245 - qpdf.warn(e);  
1246 - length = recover_stream_length(m->file_sp, og, stream_offset);  
1247 - } else {  
1248 - throw;  
1249 - }  
1250 - }  
1251 - object = {QPDF_Stream::create(&qpdf, og, object, stream_offset, length)};  
1252 -}  
1253 -  
1254 -void  
1255 -Objects::validate_stream_line_end(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset)  
1256 -{  
1257 - // The PDF specification states that the word "stream" should be followed by either a carriage  
1258 - // return and a newline or by a newline alone. It specifically disallowed following it by a  
1259 - // carriage return alone since, in that case, there would be no way to tell whether the NL in a  
1260 - // CR NL sequence was part of the stream data. However, some readers, including Adobe reader,  
1261 - // accept a carriage return by itself when followed by a non-newline character, so that's what  
1262 - // we do here. We have also seen files that have extraneous whitespace between the stream  
1263 - // keyword and the newline.  
1264 - while (true) {  
1265 - char ch;  
1266 - if (m->file->read(&ch, 1) == 0) {  
1267 - // A premature EOF here will result in some other problem that will get reported at  
1268 - // another time.  
1269 - return;  
1270 - }  
1271 - if (ch == '\n') {  
1272 - // ready to read stream data  
1273 - QTC::TC("qpdf", "QPDF stream with NL only");  
1274 - return;  
1275 - }  
1276 - if (ch == '\r') {  
1277 - // Read another character  
1278 - if (m->file->read(&ch, 1) != 0) {  
1279 - if (ch == '\n') {  
1280 - // Ready to read stream data  
1281 - QTC::TC("qpdf", "QPDF stream with CRNL");  
1282 - } else {  
1283 - // Treat the \r by itself as the whitespace after endstream and start reading  
1284 - // stream data in spite of not having seen a newline.  
1285 - QTC::TC("qpdf", "QPDF stream with CR only");  
1286 - m->file->unreadCh(ch);  
1287 - qpdf.warn(qpdf.damagedPDF(  
1288 - m->file->tell(), "stream keyword followed by carriage return only"));  
1289 - }  
1290 - }  
1291 - return;  
1292 - }  
1293 - if (!QUtil::is_space(ch)) {  
1294 - QTC::TC("qpdf", "QPDF stream without newline");  
1295 - m->file->unreadCh(ch);  
1296 - qpdf.warn(qpdf.damagedPDF(  
1297 - m->file->tell(), "stream keyword not followed by proper line terminator"));  
1298 - return;  
1299 - }  
1300 - qpdf.warn(  
1301 - qpdf.damagedPDF(m->file->tell(), "stream keyword followed by extraneous whitespace"));  
1302 - }  
1303 -}  
1304 -  
1305 -QPDFObjectHandle  
1306 -Objects::readObjectInStream(std::shared_ptr<InputSource>& input, int obj)  
1307 -{  
1308 - m->last_object_description.erase(7); // last_object_description starts with "object "  
1309 - m->last_object_description += std::to_string(obj);  
1310 - m->last_object_description += " 0";  
1311 -  
1312 - bool empty = false;  
1313 - auto object = QPDFParser(*input, m->last_object_description, m->tokenizer, nullptr, &qpdf, true)  
1314 - .parse(empty, false);  
1315 - if (empty) {  
1316 - // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in  
1317 - // actual PDF files and Adobe Reader appears to ignore them.  
1318 - qpdf.warn(qpdf.damagedPDF(*input, input->getLastOffset(), "empty object treated as null"));  
1319 - }  
1320 - return object;  
1321 -}  
1322 -  
1323 -bool  
1324 -QPDF::findEndstream()  
1325 -{  
1326 - // Find endstream or endobj. Position the input at that token.  
1327 - auto t = readToken(*m->file, 20);  
1328 - if (t.isWord("endobj") || t.isWord("endstream")) {  
1329 - m->file->seek(m->file->getLastOffset(), SEEK_SET);  
1330 - return true;  
1331 - }  
1332 - return false;  
1333 -}  
1334 -  
1335 -size_t  
1336 -Objects::recover_stream_length(  
1337 - std::shared_ptr<InputSource> input, QPDFObjGen og, qpdf_offset_t stream_offset)  
1338 -{  
1339 - // Try to reconstruct stream length by looking for endstream or endobj  
1340 - qpdf.warn(qpdf.damagedPDF(*input, stream_offset, "attempting to recover stream length"));  
1341 -  
1342 - PatternFinder ef(qpdf, &QPDF::findEndstream);  
1343 - size_t length = 0;  
1344 - if (m->file->findFirst("end", stream_offset, 0, ef)) {  
1345 - length = toS(m->file->tell() - stream_offset);  
1346 - // Reread endstream but, if it was endobj, don't skip that.  
1347 - QPDFTokenizer::Token t = qpdf.readToken(*m->file);  
1348 - if (t.getValue() == "endobj") {  
1349 - m->file->seek(m->file->getLastOffset(), SEEK_SET);  
1350 - }  
1351 - }  
1352 -  
1353 - if (length) {  
1354 - // Make sure this is inside this object  
1355 - auto found = xref.at_offset(stream_offset + toO(length));  
1356 - if (found == QPDFObjGen() || found == og) {  
1357 - // If we are trying to recover an XRef stream, the xref table will not contain any  
1358 - // entries, therefore we cannot check the found length. Otherwise we  
1359 - // found endstream\endobj within the space allowed for this object, so we're probably  
1360 - // in good shape.  
1361 - } else {  
1362 - QTC::TC("qpdf", "QPDF found wrong endstream in recovery");  
1363 - length = 0;  
1364 - }  
1365 - }  
1366 -  
1367 - if (length == 0) {  
1368 - qpdf.warn(qpdf.damagedPDF(  
1369 - *input, stream_offset, "unable to recover stream data; treating stream as empty"));  
1370 - } else {  
1371 - qpdf.warn(qpdf.damagedPDF(  
1372 - *input, stream_offset, "recovered stream length: " + std::to_string(length)));  
1373 - }  
1374 -  
1375 - QTC::TC("qpdf", "QPDF recovered stream length");  
1376 - return length;  
1377 -}  
1378 -  
1379 -QPDFObjectHandle  
1380 -Objects::read(  
1381 - bool try_recovery,  
1382 - qpdf_offset_t offset,  
1383 - std::string const& description,  
1384 - QPDFObjGen exp_og,  
1385 - QPDFObjGen& og,  
1386 - bool skip_cache_if_in_xref)  
1387 -{  
1388 - bool check_og = true;  
1389 - if (exp_og.getObj() == 0) {  
1390 - // This method uses an expected object ID of 0 to indicate that we don't know or don't care  
1391 - // what the actual object ID is at this offset. This is true when we read the xref stream  
1392 - // and linearization hint streams. In this case, we don't verify the expected object  
1393 - // ID/generation against what was read from the file. There is also no reason to attempt  
1394 - // xref recovery if we get a failure in this case since the read attempt was not triggered  
1395 - // by an xref lookup.  
1396 - check_og = false;  
1397 - try_recovery = false;  
1398 - }  
1399 - qpdf.setLastObjectDescription(description, exp_og);  
1400 -  
1401 - if (!m->attempt_recovery) {  
1402 - try_recovery = false;  
1403 - }  
1404 -  
1405 - // Special case: if offset is 0, just return null. Some PDF writers, in particular  
1406 - // "Mac OS X 10.7.5 Quartz PDFContext", may store deleted objects in the xref table as  
1407 - // "0000000000 00000 n", which is not correct, but it won't hurt anything for us to ignore  
1408 - // these.  
1409 - if (offset == 0) {  
1410 - QTC::TC("qpdf", "QPDF bogus 0 offset", 0);  
1411 - qpdf.warn(qpdf.damagedPDF(0, "object has offset 0"));  
1412 - return QPDFObjectHandle::newNull();  
1413 - }  
1414 -  
1415 - m->file->seek(offset, SEEK_SET);  
1416 - try {  
1417 - QPDFTokenizer::Token tobjid = qpdf.readToken(*m->file);  
1418 - bool objidok = tobjid.isInteger();  
1419 - QTC::TC("qpdf", "QPDF check objid", objidok ? 1 : 0);  
1420 - if (!objidok) {  
1421 - QTC::TC("qpdf", "QPDF expected n n obj");  
1422 - throw qpdf.damagedPDF(offset, "expected n n obj");  
1423 - }  
1424 - QPDFTokenizer::Token tgen = qpdf.readToken(*m->file);  
1425 - bool genok = tgen.isInteger();  
1426 - QTC::TC("qpdf", "QPDF check generation", genok ? 1 : 0);  
1427 - if (!genok) {  
1428 - throw qpdf.damagedPDF(offset, "expected n n obj");  
1429 - }  
1430 - QPDFTokenizer::Token tobj = qpdf.readToken(*m->file);  
1431 -  
1432 - bool objok = tobj.isWord("obj");  
1433 - QTC::TC("qpdf", "QPDF check obj", objok ? 1 : 0);  
1434 -  
1435 - if (!objok) {  
1436 - throw qpdf.damagedPDF(offset, "expected n n obj");  
1437 - }  
1438 - int objid = QUtil::string_to_int(tobjid.getValue().c_str());  
1439 - int generation = QUtil::string_to_int(tgen.getValue().c_str());  
1440 - og = QPDFObjGen(objid, generation);  
1441 - if (objid == 0) {  
1442 - QTC::TC("qpdf", "QPDF object id 0");  
1443 - throw qpdf.damagedPDF(offset, "object with ID 0");  
1444 - }  
1445 - if (check_og && (exp_og != og)) {  
1446 - QTC::TC("qpdf", "QPDF err wrong objid/generation");  
1447 - QPDFExc e = qpdf.damagedPDF(offset, "expected " + exp_og.unparse(' ') + " obj");  
1448 - if (try_recovery) {  
1449 - // Will be retried below  
1450 - throw e;  
1451 - } else {  
1452 - // We can try reading the object anyway even if the ID doesn't match.  
1453 - qpdf.warn(e);  
1454 - }  
1455 - }  
1456 - } catch (QPDFExc& e) {  
1457 - if (try_recovery) {  
1458 - // Try again after reconstructing xref table  
1459 - xref.reconstruct(e);  
1460 - if (xref.type(exp_og) == 1) {  
1461 - QTC::TC("qpdf", "QPDF recovered in readObjectAtOffset");  
1462 - return read(false, xref.offset(exp_og), description, exp_og, og, false);  
1463 - } else {  
1464 - QTC::TC("qpdf", "QPDF object gone after xref reconstruction");  
1465 - qpdf.warn(qpdf.damagedPDF(  
1466 - "",  
1467 - 0,  
1468 - ("object " + exp_og.unparse(' ') +  
1469 - " not found in file after regenerating cross reference table")));  
1470 - return QPDFObjectHandle::newNull();  
1471 - }  
1472 - } else {  
1473 - throw;  
1474 - }  
1475 - }  
1476 -  
1477 - QPDFObjectHandle oh = read_object(description, og);  
1478 -  
1479 - if (unresolved(og)) {  
1480 - // Store the object in the cache here so it gets cached whether we first know the offset or  
1481 - // whether we first know the object ID and generation (in which case we would get here  
1482 - // through resolve).  
1483 -  
1484 - // Determine the end offset of this object before and after white space. We use these  
1485 - // numbers to validate linearization hint tables. Offsets and lengths of objects may imply  
1486 - // the end of an object to be anywhere between these values.  
1487 - qpdf_offset_t end_before_space = m->file->tell();  
1488 -  
1489 - // skip over spaces  
1490 - while (true) {  
1491 - char ch;  
1492 - if (m->file->read(&ch, 1)) {  
1493 - if (!isspace(static_cast<unsigned char>(ch))) {  
1494 - m->file->seek(-1, SEEK_CUR);  
1495 - break;  
1496 - }  
1497 - } else {  
1498 - throw qpdf.damagedPDF(m->file->tell(), "EOF after endobj");  
1499 - }  
1500 - }  
1501 - qpdf_offset_t end_after_space = m->file->tell();  
1502 - if (skip_cache_if_in_xref && xref.type(og)) {  
1503 - // Ordinarily, an object gets read here when resolved through xref table or stream. In  
1504 - // the special case of the xref stream and linearization hint tables, the offset comes  
1505 - // from another source. For the specific case of xref streams, the xref stream is read  
1506 - // and loaded into the object cache very early in parsing. Ordinarily, when a file is  
1507 - // updated by appending, items inserted into the xref table in later updates take  
1508 - // precedence over earlier items. In the special case of reusing the object number  
1509 - // previously used as the xref stream, we have the following order of events:  
1510 - //  
1511 - // * reused object gets loaded into the xref table  
1512 - // * old object is read here while reading xref streams  
1513 - // * original xref entry is ignored (since already in xref table)  
1514 - //  
1515 - // It is the second step that causes a problem. Even though the xref table is correct in  
1516 - // this case, the old object is already in the cache and so effectively prevails over  
1517 - // the reused object. To work around this issue, we have a special case for the xref  
1518 - // stream (via the skip_cache_if_in_xref): if the object is already in the xref stream,  
1519 - // don't cache what we read here.  
1520 - //  
1521 - // It is likely that the same bug may exist for linearization hint tables, but the  
1522 - // existing code uses end_before_space and end_after_space from the cache, so fixing  
1523 - // that would require more significant rework. The chances of a linearization hint  
1524 - // stream being reused seems smaller because the xref stream is probably the highest  
1525 - // object in the file and the linearization hint stream would be some random place in  
1526 - // the middle, so I'm leaving that bug unfixed for now. If the bug were to be fixed, we  
1527 - // could use !check_og in place of skip_cache_if_in_xref.  
1528 - QTC::TC("qpdf", "QPDF skipping cache for known unchecked object");  
1529 - } else {  
1530 - xref.linearization_offsets(toS(og.getObj()), end_before_space, end_after_space);  
1531 - update_table(og, oh.getObj());  
1532 - }  
1533 - }  
1534 -  
1535 - return oh;  
1536 -}  
1537 -  
1538 -QPDFObject*  
1539 -Objects::resolve(QPDFObjGen og)  
1540 -{  
1541 - if (!unresolved(og)) {  
1542 - return table[og].object.get();  
1543 - }  
1544 -  
1545 - if (m->resolving.count(og)) {  
1546 - // This can happen if an object references itself directly or indirectly in some key that  
1547 - // has to be resolved during object parsing, such as stream length.  
1548 - QTC::TC("qpdf", "QPDF recursion loop in resolve");  
1549 - qpdf.warn(qpdf.damagedPDF("", "loop detected resolving object " + og.unparse(' ')));  
1550 - update_table(og, QPDF_Null::create());  
1551 - return table[og].object.get();  
1552 - }  
1553 - ResolveRecorder rr(&qpdf, og);  
1554 -  
1555 - try {  
1556 - switch (xref.type(og)) {  
1557 - case 0:  
1558 - break;  
1559 - case 1:  
1560 - {  
1561 - // Object stored in cache by readObjectAtOffset  
1562 - QPDFObjGen a_og;  
1563 - QPDFObjectHandle oh = read(true, xref.offset(og), "", og, a_og, false);  
1564 - }  
1565 - break;  
1566 -  
1567 - case 2:  
1568 - resolveObjectsInStream(xref.stream_number(og.getObj()));  
1569 - break;  
1570 -  
1571 - default:  
1572 - throw qpdf.damagedPDF(  
1573 - "", 0, ("object " + og.unparse('/') + " has unexpected xref entry type"));  
1574 - }  
1575 - } catch (QPDFExc& e) {  
1576 - qpdf.warn(e);  
1577 - } catch (std::exception& e) {  
1578 - qpdf.warn(qpdf.damagedPDF(  
1579 - "", 0, ("object " + og.unparse('/') + ": error reading object: " + e.what())));  
1580 - }  
1581 -  
1582 - if (unresolved(og)) {  
1583 - // PDF spec says unknown objects resolve to the null object.  
1584 - QTC::TC("qpdf", "QPDF resolve failure to null");  
1585 - update_table(og, QPDF_Null::create());  
1586 - }  
1587 -  
1588 - auto result(table[og].object);  
1589 - result->setDefaultDescription(&qpdf, og);  
1590 - return result.get();  
1591 -}  
1592 -  
1593 -void  
1594 -Objects::resolveObjectsInStream(int obj_stream_number)  
1595 -{  
1596 - if (m->resolved_object_streams.count(obj_stream_number)) {  
1597 - return;  
1598 - }  
1599 - m->resolved_object_streams.insert(obj_stream_number);  
1600 - // Force resolution of object stream  
1601 - QPDFObjectHandle obj_stream = get(obj_stream_number, 0);  
1602 - if (!obj_stream.isStream()) {  
1603 - throw qpdf.damagedPDF(  
1604 - "supposed object stream " + std::to_string(obj_stream_number) + " is not a stream");  
1605 - }  
1606 -  
1607 - QPDFObjectHandle dict = obj_stream.getDict();  
1608 - if (!dict.isDictionaryOfType("/ObjStm")) {  
1609 - QTC::TC("qpdf", "QPDF ERR object stream with wrong type");  
1610 - qpdf.warn(qpdf.damagedPDF(  
1611 - "supposed object stream " + std::to_string(obj_stream_number) + " has wrong type"));  
1612 - }  
1613 -  
1614 - if (!(dict.getKey("/N").isInteger() && dict.getKey("/First").isInteger())) {  
1615 - throw qpdf.damagedPDF(  
1616 - ("object stream " + std::to_string(obj_stream_number) + " has incorrect keys"));  
1617 - }  
1618 -  
1619 - int n = dict.getKey("/N").getIntValueAsInt();  
1620 - int first = dict.getKey("/First").getIntValueAsInt();  
1621 -  
1622 - std::map<int, int> offsets;  
1623 -  
1624 - std::shared_ptr<Buffer> bp = obj_stream.getStreamData(qpdf_dl_specialized);  
1625 - auto input = std::shared_ptr<InputSource>(  
1626 - // line-break  
1627 - new BufferInputSource(  
1628 - (m->file->getName() + " object stream " + std::to_string(obj_stream_number)),  
1629 - bp.get()));  
1630 -  
1631 - qpdf_offset_t last_offset = -1;  
1632 - for (int i = 0; i < n; ++i) {  
1633 - QPDFTokenizer::Token tnum = qpdf.readToken(*input);  
1634 - QPDFTokenizer::Token toffset = qpdf.readToken(*input);  
1635 - if (!(tnum.isInteger() && toffset.isInteger())) {  
1636 - throw damagedPDF(  
1637 - *input,  
1638 - m->last_object_description,  
1639 - input->getLastOffset(),  
1640 - "expected integer in object stream header");  
1641 - }  
1642 -  
1643 - int num = QUtil::string_to_int(tnum.getValue().c_str());  
1644 - long long offset = QUtil::string_to_int(toffset.getValue().c_str());  
1645 - if (num > xref.max_id()) {  
1646 - continue;  
1647 - }  
1648 - if (num == obj_stream_number) {  
1649 - QTC::TC("qpdf", "QPDF ignore self-referential object stream");  
1650 - qpdf.warn(damagedPDF(  
1651 - *input,  
1652 - m->last_object_description,  
1653 - input->getLastOffset(),  
1654 - "object stream claims to contain itself"));  
1655 - continue;  
1656 - }  
1657 - if (offset <= last_offset) {  
1658 - throw damagedPDF(  
1659 - *input,  
1660 - m->last_object_description,  
1661 - input->getLastOffset(),  
1662 - "expected offsets in object stream to be increasing");  
1663 - }  
1664 - last_offset = offset;  
1665 -  
1666 - offsets[num] = toI(offset + first);  
1667 - }  
1668 -  
1669 - // To avoid having to read the object stream multiple times, store all objects that would be  
1670 - // found here in the cache. Remember that some objects stored here might have been overridden  
1671 - // by new objects appended to the file, so it is necessary to recheck the xref table and only  
1672 - // cache what would actually be resolved here.  
1673 - m->last_object_description.clear();  
1674 - m->last_object_description += "object ";  
1675 - for (auto const& iter: offsets) {  
1676 - QPDFObjGen og(iter.first, 0);  
1677 - if (xref.type(og) == 2 && xref.stream_number(og.getObj()) == obj_stream_number) {  
1678 - int offset = iter.second;  
1679 - input->seek(offset, SEEK_SET);  
1680 - QPDFObjectHandle oh = readObjectInStream(input, iter.first);  
1681 - update_table(og, oh.getObj());  
1682 - } else {  
1683 - QTC::TC("qpdf", "QPDF not caching overridden objstm object");  
1684 - }  
1685 - }  
1686 -}  
1687 -  
1688 -Objects::~Objects()  
1689 -{  
1690 - // If two objects are mutually referential (through each object having an array or dictionary  
1691 - // that contains an indirect reference to the other), the circular references in the  
1692 - // std::shared_ptr objects will prevent the objects from being deleted. Walk through all objects  
1693 - // in the object cache, which is those objects that we read from the file, and break all  
1694 - // resolved indirect references by replacing them with an internal object type representing that  
1695 - // they have been destroyed. Note that we can't break references like this at any time when the  
1696 - // QPDF object is active. The call to reset also causes all direct QPDFObjectHandle objects that  
1697 - // are reachable from this object to release their association with this QPDF. Direct objects  
1698 - // are not destroyed since they can be moved to other QPDF objects safely.  
1699 -  
1700 - for (auto const& iter: table) {  
1701 - iter.second.object->disconnect();  
1702 - if (iter.second.object->getTypeCode() != ::ot_null) {  
1703 - iter.second.object->destroy();  
1704 - }  
1705 - }  
1706 -}  
1707 -  
1708 -void  
1709 -Objects::update_table(QPDFObjGen og, const std::shared_ptr<QPDFObject>& object)  
1710 -{  
1711 - object->setObjGen(&qpdf, og);  
1712 - if (cached(og)) {  
1713 - auto& cache = table[og];  
1714 - cache.object->assign(object);  
1715 - } else {  
1716 - table[og] = Entry(object);  
1717 - }  
1718 -}  
1719 -  
1720 -bool  
1721 -Objects::cached(QPDFObjGen og)  
1722 -{  
1723 - return table.count(og) != 0;  
1724 -}  
1725 -  
1726 -bool  
1727 -Objects::unresolved(QPDFObjGen og)  
1728 -{  
1729 - return !cached(og) || table[og].object->isUnresolved();  
1730 -}  
1731 -  
1732 -QPDFObjGen  
1733 -Objects::next_id()  
1734 -{  
1735 - qpdf.fixDanglingReferences();  
1736 - QPDFObjGen og;  
1737 - if (!table.empty()) {  
1738 - og = (*(m->objects.table.rbegin())).first;  
1739 - }  
1740 - int max_objid = og.getObj();  
1741 - if (max_objid == std::numeric_limits<int>::max()) {  
1742 - throw std::range_error("max object id is too high to create new objects");  
1743 - }  
1744 - return QPDFObjGen(max_objid + 1, 0);  
1745 -}  
1746 -  
1747 -QPDFObjectHandle  
1748 -Objects::make_indirect(std::shared_ptr<QPDFObject> const& obj)  
1749 -{  
1750 - QPDFObjGen next{next_id()};  
1751 - table[next] = Entry(obj);  
1752 - return qpdf.newIndirect(next, table[next].object);  
1753 -}  
1754 -  
1755 -std::shared_ptr<QPDFObject>  
1756 -Objects::get_for_parser(int id, int gen, bool parse_pdf)  
1757 -{  
1758 - // This method is called by the parser and therefore must not resolve any objects.  
1759 - auto og = QPDFObjGen(id, gen);  
1760 - if (auto iter = table.find(og); iter != table.end()) {  
1761 - return iter->second.object;  
1762 - }  
1763 - if (xref.type(og) || !xref.initialized()) {  
1764 - return table.insert({og, QPDF_Unresolved::create(&qpdf, og)}).first->second.object;  
1765 - }  
1766 - if (parse_pdf) {  
1767 - return QPDF_Null::create();  
1768 - }  
1769 - return table.insert({og, QPDF_Null::create(&qpdf, og)}).first->second.object;  
1770 -}  
1771 -  
1772 -std::shared_ptr<QPDFObject>  
1773 -Objects::get_for_json(int id, int gen)  
1774 -{  
1775 - auto og = QPDFObjGen(id, gen);  
1776 - auto [it, inserted] = table.try_emplace(og);  
1777 - auto& obj = it->second.object;  
1778 - if (inserted) {  
1779 - obj = (xref.initialized() && !xref.type(og)) ? QPDF_Null::create(&qpdf, og)  
1780 - : QPDF_Unresolved::create(&qpdf, og);  
1781 - }  
1782 - return obj;  
1783 -}  
1784 -  
1785 -void  
1786 -Objects::replace(QPDFObjGen og, QPDFObjectHandle oh)  
1787 -{  
1788 - if (!oh || (oh.isIndirect() && !(oh.isStream() && oh.getObjGen() == og))) {  
1789 - QTC::TC("qpdf", "QPDF replaceObject called with indirect object");  
1790 - throw std::logic_error("QPDF::replaceObject called with indirect object handle");  
1791 - }  
1792 - update_table(og, oh.getObj());  
1793 -}  
1794 -  
1795 -void  
1796 -Objects::erase(QPDFObjGen og)  
1797 -{  
1798 - if (auto cached = table.find(og); cached != table.end()) {  
1799 - // Take care of any object handles that may be floating around.  
1800 - cached->second.object->assign(QPDF_Null::create());  
1801 - cached->second.object->setObjGen(nullptr, QPDFObjGen());  
1802 - table.erase(cached);  
1803 - }  
1804 -}  
1805 -  
1806 -void  
1807 -Objects::swap(QPDFObjGen og1, QPDFObjGen og2)  
1808 -{  
1809 - // Force objects to be read from the input source if needed, then swap them in the cache.  
1810 - resolve(og1);  
1811 - resolve(og2);  
1812 - table[og1].object->swapWith(table[og2].object);  
1813 -}  
1814 -  
1815 -size_t  
1816 -Objects::table_size()  
1817 -{  
1818 - // If table is dense, accommodate all objects in the table, else accommodate only original  
1819 - // objects.  
1820 - auto max_xref = toI(xref.size());  
1821 - if (max_xref > 0) {  
1822 - --max_xref;  
1823 - }  
1824 - auto max_obj = table.size() ? table.crbegin()->first.getObj() : 0;  
1825 - auto max_id = std::numeric_limits<int>::max() - 1;  
1826 - if (max_obj >= max_id || max_xref >= max_id) {  
1827 - // Temporary fix. Long-term solution is  
1828 - // - QPDFObjGen to enforce objgens are valid and sensible  
1829 - // - xref table and obj cache to protect against insertion of impossibly large obj ids  
1830 - qpdf.stopOnError("Impossibly large object id encountered.");  
1831 - }  
1832 - if (max_obj < 1.1 * std::max(toI(table.size()), max_xref)) {  
1833 - return toS(++max_obj);  
1834 - }  
1835 - return toS(++max_xref);  
1836 -}  
1837 -  
1838 -std::vector<QPDFObjGen>  
1839 -Objects::compressible_vector()  
1840 -{  
1841 - return compressible<QPDFObjGen>();  
1842 -}  
1843 -  
1844 -std::vector<bool>  
1845 -Objects::compressible_set()  
1846 -{  
1847 - return compressible<bool>();  
1848 -}  
1849 -  
1850 -template <typename T>  
1851 -std::vector<T>  
1852 -Objects::compressible()  
1853 -{  
1854 - // Return a list of objects that are allowed to be in object streams. Walk through the objects  
1855 - // by traversing the document from the root, including a traversal of the pages tree. This  
1856 - // means that objects that are on the same page are more likely to be in the same object stream,  
1857 - // which is slightly more efficient, particularly with linearized files. This is better than  
1858 - // iterating through the xref table since it avoids preserving orphaned items.  
1859 -  
1860 - // Exclude encryption dictionary, if any  
1861 - QPDFObjectHandle encryption_dict = trailer().getKey("/Encrypt");  
1862 - QPDFObjGen encryption_dict_og = encryption_dict.getObjGen();  
1863 -  
1864 - const size_t max_obj = qpdf.getObjectCount();  
1865 - std::vector<bool> visited(max_obj, false);  
1866 - std::vector<QPDFObjectHandle> queue;  
1867 - queue.reserve(512);  
1868 - queue.emplace_back(trailer());  
1869 - std::vector<T> result;  
1870 - if constexpr (std::is_same_v<T, QPDFObjGen>) {  
1871 - result.reserve(table.size());  
1872 - } else if constexpr (std::is_same_v<T, bool>) {  
1873 - result.resize(max_obj + 1U, false);  
1874 - } else {  
1875 - throw std::logic_error("Unsupported type in QPDF::getCompressibleObjGens");  
1876 - }  
1877 - while (!queue.empty()) {  
1878 - auto obj = queue.back();  
1879 - queue.pop_back();  
1880 - if (obj.getObjectID() > 0) {  
1881 - QPDFObjGen og = obj.getObjGen();  
1882 - const size_t id = toS(og.getObj() - 1);  
1883 - if (id >= max_obj) {  
1884 - throw std::logic_error(  
1885 - "unexpected object id encountered in getCompressibleObjGens");  
1886 - }  
1887 - if (visited[id]) {  
1888 - QTC::TC("qpdf", "QPDF loop detected traversing objects");  
1889 - continue;  
1890 - }  
1891 -  
1892 - // Check whether this is the current object. If not, remove it (which changes it into a  
1893 - // direct null and therefore stops us from revisiting it) and move on to the next object  
1894 - // in the queue.  
1895 - auto upper = table.upper_bound(og);  
1896 - if (upper != table.end() && upper->first.getObj() == og.getObj()) {  
1897 - erase(og);  
1898 - continue;  
1899 - }  
1900 -  
1901 - visited[id] = true;  
1902 -  
1903 - if (og == encryption_dict_og) {  
1904 - QTC::TC("qpdf", "QPDF exclude encryption dictionary");  
1905 - } else if (!(obj.isStream() ||  
1906 - (obj.isDictionaryOfType("/Sig") && obj.hasKey("/ByteRange") &&  
1907 - obj.hasKey("/Contents")))) {  
1908 - if constexpr (std::is_same_v<T, QPDFObjGen>) {  
1909 - result.push_back(og);  
1910 - } else if constexpr (std::is_same_v<T, bool>) {  
1911 - result[id + 1U] = true;  
1912 - }  
1913 - }  
1914 - }  
1915 - if (obj.isStream()) {  
1916 - QPDFObjectHandle dict = obj.getDict();  
1917 - std::set<std::string> keys = dict.getKeys();  
1918 - for (auto iter = keys.rbegin(); iter != keys.rend(); ++iter) {  
1919 - std::string const& key = *iter;  
1920 - QPDFObjectHandle value = dict.getKey(key);  
1921 - if (key == "/Length") {  
1922 - // omit stream lengths  
1923 - if (value.isIndirect()) {  
1924 - QTC::TC("qpdf", "QPDF exclude indirect length");  
1925 - }  
1926 - } else {  
1927 - queue.push_back(value);  
1928 - }  
1929 - }  
1930 - } else if (obj.isDictionary()) {  
1931 - std::set<std::string> keys = obj.getKeys();  
1932 - for (auto iter = keys.rbegin(); iter != keys.rend(); ++iter) {  
1933 - queue.push_back(obj.getKey(*iter));  
1934 - }  
1935 - } else if (obj.isArray()) {  
1936 - int n = obj.getArrayNItems();  
1937 - for (int i = 1; i <= n; ++i) {  
1938 - queue.push_back(obj.getArrayItem(n - i));  
1939 - }  
1940 - }  
1941 - }  
1942 -  
1943 - return result;  
1944 -}  
libqpdf/QPDF_optimization.cc
@@ -79,9 +79,9 @@ QPDF::optimize( @@ -79,9 +79,9 @@ QPDF::optimize(
79 } 79 }
80 80
81 void 81 void
82 -QPDF::optimize(QPDF::Objects const& objects) 82 +QPDF::optimize(QPDF::Xref_table const& xref)
83 { 83 {
84 - optimize_internal(objects, false, nullptr); 84 + optimize_internal(xref, false, nullptr);
85 } 85 }
86 86
87 template <typename T> 87 template <typename T>
@@ -121,13 +121,13 @@ QPDF::optimize_internal( @@ -121,13 +121,13 @@ QPDF::optimize_internal(
121 } 121 }
122 122
123 // Traverse document-level items 123 // Traverse document-level items
124 - for (auto const& key: m->objects.trailer().getKeys()) { 124 + for (auto const& key: m->xref_table.trailer().getKeys()) {
125 if (key == "/Root") { 125 if (key == "/Root") {
126 // handled separately 126 // handled separately
127 } else { 127 } else {
128 updateObjectMaps( 128 updateObjectMaps(
129 ObjUser(ObjUser::ou_trailer_key, key), 129 ObjUser(ObjUser::ou_trailer_key, key),
130 - m->objects.trailer().getKey(key), 130 + m->xref_table.trailer().getKey(key),
131 skip_stream_parameters); 131 skip_stream_parameters);
132 } 132 }
133 } 133 }
@@ -175,7 +175,7 @@ QPDF::pushInheritedAttributesToPage(bool allow_changes, bool warn_skipped_keys) @@ -175,7 +175,7 @@ QPDF::pushInheritedAttributesToPage(bool allow_changes, bool warn_skipped_keys)
175 // values for them. 175 // values for them.
176 std::map<std::string, std::vector<QPDFObjectHandle>> key_ancestors; 176 std::map<std::string, std::vector<QPDFObjectHandle>> key_ancestors;
177 pushInheritedAttributesToPageInternal( 177 pushInheritedAttributesToPageInternal(
178 - m->objects.trailer().getKey("/Root").getKey("/Pages"), 178 + m->xref_table.trailer().getKey("/Root").getKey("/Pages"),
179 key_ancestors, 179 key_ancestors,
180 allow_changes, 180 allow_changes,
181 warn_skipped_keys); 181 warn_skipped_keys);
@@ -450,9 +450,8 @@ QPDF::filterCompressedObjects(QPDFWriter::ObjTable const& obj) @@ -450,9 +450,8 @@ QPDF::filterCompressedObjects(QPDFWriter::ObjTable const& obj)
450 } 450 }
451 451
452 void 452 void
453 -QPDF::filterCompressedObjects(QPDF::Objects const& objects) 453 +QPDF::filterCompressedObjects(QPDF::Xref_table const& xref)
454 { 454 {
455 - auto const& xref = objects.xref_table();  
456 if (!xref.object_streams()) { 455 if (!xref.object_streams()) {
457 return; 456 return;
458 } 457 }
libqpdf/qpdf-c.cc
@@ -905,7 +905,7 @@ qpdf_oh @@ -905,7 +905,7 @@ qpdf_oh
905 qpdf_get_object_by_id(qpdf_data qpdf, int objid, int generation) 905 qpdf_get_object_by_id(qpdf_data qpdf, int objid, int generation)
906 { 906 {
907 QTC::TC("qpdf", "qpdf-c called qpdf_get_object_by_id"); 907 QTC::TC("qpdf", "qpdf-c called qpdf_get_object_by_id");
908 - return new_object(qpdf, qpdf->qpdf->getObject(objid, generation)); 908 + return new_object(qpdf, qpdf->qpdf->getObjectByID(objid, generation));
909 } 909 }
910 910
911 template <class RET> 911 template <class RET>
libqpdf/qpdf/QPDF_objects.hh deleted
1 -#ifndef QPDF_OBJECTS_HH  
2 -#define QPDF_OBJECTS_HH  
3 -  
4 -#include <qpdf/QPDF.hh>  
5 -  
6 -#include <qpdf/QPDF_Null.hh>  
7 -#include <qpdf/QPDF_Unresolved.hh>  
8 -  
9 -#include <variant>  
10 -  
11 -// The Objects class is responsible for keeping track of all objects belonging to a QPDF instance,  
12 -// including loading it from an input source when required.  
13 -class QPDF::Objects  
14 -{  
15 - public:  
16 - // Xref_table encapsulates the pdf's xref table and trailer.  
17 - class Xref_table  
18 - {  
19 - public:  
20 - Xref_table(Objects& objects) :  
21 - qpdf(objects.qpdf),  
22 - objects(objects),  
23 - file(objects.file)  
24 - {  
25 - tokenizer.allowEOF();  
26 - }  
27 -  
28 - void initialize();  
29 - void initialize_empty();  
30 - void initialize_json();  
31 - void reconstruct(QPDFExc& e);  
32 - void show();  
33 - bool resolve();  
34 -  
35 - QPDFObjectHandle  
36 - trailer() noexcept  
37 - {  
38 - return trailer_;  
39 - }  
40 -  
41 - QPDFObjectHandle const&  
42 - trailer() const noexcept  
43 - {  
44 - return trailer_;  
45 - }  
46 -  
47 - void  
48 - trailer(QPDFObjectHandle&& oh)  
49 - {  
50 - trailer_ = std::move(oh);  
51 - }  
52 -  
53 - // Returns 0 if og is not in table.  
54 - size_t  
55 - type(QPDFObjGen og) const  
56 - {  
57 - int id = og.getObj();  
58 - if (id < 1 || static_cast<size_t>(id) >= table.size()) {  
59 - return 0;  
60 - }  
61 - auto& e = table[static_cast<size_t>(id)];  
62 - return e.gen() == og.getGen() ? e.type() : 0;  
63 - }  
64 -  
65 - // Returns 0 if id is not in table.  
66 - size_t  
67 - type(size_t id) const noexcept  
68 - {  
69 - if (id >= table.size()) {  
70 - return 0;  
71 - }  
72 - return table[id].type();  
73 - }  
74 -  
75 - // Returns 0 if og is not in table.  
76 - qpdf_offset_t  
77 - offset(QPDFObjGen og) const noexcept  
78 - {  
79 - int id = og.getObj();  
80 - if (id < 1 || static_cast<size_t>(id) >= table.size()) {  
81 - return 0;  
82 - }  
83 - return table[static_cast<size_t>(id)].offset();  
84 - }  
85 -  
86 - // Returns 0 if id is not in table.  
87 - int  
88 - stream_number(int id) const noexcept  
89 - {  
90 - if (id < 1 || static_cast<size_t>(id) >= table.size()) {  
91 - return 0;  
92 - }  
93 - return table[static_cast<size_t>(id)].stream_number();  
94 - }  
95 -  
96 - int  
97 - stream_index(int id) const noexcept  
98 - {  
99 - if (id < 1 || static_cast<size_t>(id) >= table.size()) {  
100 - return 0;  
101 - }  
102 - return table[static_cast<size_t>(id)].stream_index();  
103 - }  
104 -  
105 - QPDFObjGen at_offset(qpdf_offset_t offset) const noexcept;  
106 -  
107 - std::map<QPDFObjGen, QPDFXRefEntry> as_map() const;  
108 -  
109 - bool  
110 - object_streams() const noexcept  
111 - {  
112 - return object_streams_;  
113 - }  
114 -  
115 - // Return a vector of object id and stream number for each compressed object.  
116 - std::vector<std::pair<unsigned int, int>>  
117 - compressed_objects() const  
118 - {  
119 - if (!initialized()) {  
120 - throw std::logic_error("Xref_table::compressed_objects called before parsing.");  
121 - }  
122 -  
123 - std::vector<std::pair<unsigned int, int>> result;  
124 - result.reserve(table.size());  
125 -  
126 - unsigned int i{0};  
127 - for (auto const& item: table) {  
128 - if (item.type() == 2) {  
129 - result.emplace_back(i, item.stream_number());  
130 - }  
131 - ++i;  
132 - }  
133 - return result;  
134 - }  
135 -  
136 - // Temporary access to underlying table size  
137 - size_t  
138 - size() const noexcept  
139 - {  
140 - return table.size();  
141 - }  
142 -  
143 - void  
144 - ignore_streams(bool val) noexcept  
145 - {  
146 - ignore_streams_ = val;  
147 - }  
148 -  
149 - bool  
150 - initialized() const noexcept  
151 - {  
152 - return initialized_;  
153 - }  
154 -  
155 - void  
156 - attempt_recovery(bool val) noexcept  
157 - {  
158 - attempt_recovery_ = val;  
159 - }  
160 -  
161 - int  
162 - max_id() const noexcept  
163 - {  
164 - return max_id_;  
165 - }  
166 -  
167 - // For Linearization  
168 -  
169 - qpdf_offset_t  
170 - end_after_space(QPDFObjGen og)  
171 - {  
172 - auto& e = entry(toS(og.getObj()));  
173 - switch (e.type()) {  
174 - case 1:  
175 - return e.end_after_space_;  
176 - case 2:  
177 - {  
178 - auto es = entry(toS(e.stream_number()));  
179 - return es.type() == 1 ? es.end_after_space_ : 0;  
180 - }  
181 - default:  
182 - return 0;  
183 - }  
184 - }  
185 -  
186 - qpdf_offset_t  
187 - end_before_space(QPDFObjGen og)  
188 - {  
189 - auto& e = entry(toS(og.getObj()));  
190 - switch (e.type()) {  
191 - case 1:  
192 - return e.end_before_space_;  
193 - case 2:  
194 - {  
195 - auto es = entry(toS(e.stream_number()));  
196 - return es.type() == 1 ? es.end_before_space_ : 0;  
197 - }  
198 - default:  
199 - return 0;  
200 - }  
201 - }  
202 -  
203 - void  
204 - linearization_offsets(size_t id, qpdf_offset_t before, qpdf_offset_t after)  
205 - {  
206 - if (type(id)) {  
207 - table[id].end_before_space_ = before;  
208 - table[id].end_after_space_ = after;  
209 - }  
210 - }  
211 -  
212 - bool  
213 - uncompressed_after_compressed() const noexcept  
214 - {  
215 - return uncompressed_after_compressed_;  
216 - }  
217 -  
218 - // Actual value from file  
219 - qpdf_offset_t  
220 - first_item_offset() const noexcept  
221 - {  
222 - return first_item_offset_;  
223 - }  
224 -  
225 - private:  
226 - // Object, count, offset of first entry  
227 - typedef std::tuple<int, int, qpdf_offset_t> Subsection;  
228 -  
229 - struct Uncompressed  
230 - {  
231 - Uncompressed(qpdf_offset_t offset) :  
232 - offset(offset)  
233 - {  
234 - }  
235 - qpdf_offset_t offset;  
236 - };  
237 -  
238 - struct Compressed  
239 - {  
240 - Compressed(int stream_number, int stream_index) :  
241 - stream_number(stream_number),  
242 - stream_index(stream_index)  
243 - {  
244 - }  
245 - int stream_number{0};  
246 - int stream_index{0};  
247 - };  
248 -  
249 - typedef std::variant<std::monostate, Uncompressed, Compressed> Xref;  
250 -  
251 - struct Entry  
252 - {  
253 - Entry() = default;  
254 -  
255 - Entry(int gen, Xref entry) :  
256 - gen_(gen),  
257 - entry(entry)  
258 - {  
259 - }  
260 -  
261 - int  
262 - gen() const noexcept  
263 - {  
264 - return gen_;  
265 - }  
266 -  
267 - size_t  
268 - type() const noexcept  
269 - {  
270 - return entry.index();  
271 - }  
272 -  
273 - qpdf_offset_t  
274 - offset() const noexcept  
275 - {  
276 - return type() == 1 ? std::get<1>(entry).offset : 0;  
277 - }  
278 -  
279 - int  
280 - stream_number() const noexcept  
281 - {  
282 - return type() == 2 ? std::get<2>(entry).stream_number : 0;  
283 - }  
284 -  
285 - int  
286 - stream_index() const noexcept  
287 - {  
288 - return type() == 2 ? std::get<2>(entry).stream_index : 0;  
289 - }  
290 -  
291 - int gen_{0};  
292 - Xref entry;  
293 - qpdf_offset_t end_before_space_{0};  
294 - qpdf_offset_t end_after_space_{0};  
295 - };  
296 -  
297 - Entry&  
298 - entry(size_t id)  
299 - {  
300 - return id < table.size() ? table[id] : table[0];  
301 - }  
302 -  
303 - void read(qpdf_offset_t offset);  
304 -  
305 - // Methods to parse tables  
306 - qpdf_offset_t process_section(qpdf_offset_t offset);  
307 - std::vector<Subsection> subsections(std::string& line);  
308 - std::vector<Subsection> bad_subsections(std::string& line, qpdf_offset_t offset);  
309 - Subsection subsection(std::string const& line);  
310 - std::tuple<bool, qpdf_offset_t, int, char> read_entry();  
311 - std::tuple<bool, qpdf_offset_t, int, char> read_bad_entry();  
312 -  
313 - // Methods to parse streams  
314 - qpdf_offset_t read_stream(qpdf_offset_t offset);  
315 - qpdf_offset_t process_stream(qpdf_offset_t offset, QPDFObjectHandle& xref_stream);  
316 - std::pair<int, std::array<int, 3>>  
317 - process_W(QPDFObjectHandle& dict, std::function<QPDFExc(std::string_view)> damaged);  
318 - std::pair<int, size_t> process_Size(  
319 - QPDFObjectHandle& dict,  
320 - int entry_size,  
321 - std::function<QPDFExc(std::string_view)> damaged);  
322 - std::pair<int, std::vector<std::pair<int, int>>> process_Index(  
323 - QPDFObjectHandle& dict,  
324 - int max_num_entries,  
325 - std::function<QPDFExc(std::string_view)> damaged);  
326 -  
327 - QPDFObjectHandle read_trailer();  
328 -  
329 - QPDFTokenizer::Token  
330 - read_token(size_t max_len = 0)  
331 - {  
332 - return tokenizer.readToken(*file, "", true, max_len);  
333 - }  
334 -  
335 - // Methods to insert table entries  
336 - void insert(int obj, int f0, qpdf_offset_t f1, int f2);  
337 - void insert_free(QPDFObjGen);  
338 -  
339 - QPDFExc  
340 - damaged_pdf(std::string const& msg)  
341 - {  
342 - return qpdf.damagedPDF("", 0, msg);  
343 - }  
344 -  
345 - QPDFExc  
346 - damaged_table(std::string const& msg)  
347 - {  
348 - return qpdf.damagedPDF("xref table", msg);  
349 - }  
350 -  
351 - void  
352 - warn_damaged(std::string const& msg)  
353 - {  
354 - qpdf.warn(damaged_pdf(msg));  
355 - }  
356 -  
357 - QPDF& qpdf;  
358 - QPDF::Objects& objects;  
359 - InputSource* const& file;  
360 - QPDFTokenizer tokenizer;  
361 -  
362 - std::vector<Entry> table;  
363 - QPDFObjectHandle trailer_;  
364 -  
365 - bool attempt_recovery_{true};  
366 - bool initialized_{false};  
367 - bool ignore_streams_{false};  
368 - bool reconstructed_{false};  
369 - bool object_streams_{false};  
370 - // Before the xref table is initialized, max_id_ is an upper bound on the possible object  
371 - // ids that could be present in the PDF file. Once the trailer has been read, max_id_ is set  
372 - // to the value of /Size. If the file is damaged, max_id_ becomes the maximum object id in  
373 - // the xref table after reconstruction.  
374 - int max_id_{std::numeric_limits<int>::max() - 1};  
375 -  
376 - // Linearization data  
377 - bool uncompressed_after_compressed_{false};  
378 - qpdf_offset_t first_item_offset_{0}; // actual value from file  
379 - }; // Xref_table;  
380 -  
381 - ~Objects();  
382 -  
383 - Objects(QPDF& qpdf, QPDF::Members* m, InputSource* const& file) :  
384 - qpdf(qpdf),  
385 - file(file),  
386 - m(m),  
387 - xref(*this)  
388 - {  
389 - }  
390 -  
391 - Xref_table&  
392 - xref_table() noexcept  
393 - {  
394 - return xref;  
395 - }  
396 -  
397 - Xref_table const&  
398 - xref_table() const noexcept  
399 - {  
400 - return xref;  
401 - }  
402 -  
403 - QPDFObjectHandle  
404 - trailer() noexcept  
405 - {  
406 - return xref.trailer();  
407 - }  
408 -  
409 - QPDFObjectHandle const&  
410 - trailer() const noexcept  
411 - {  
412 - return xref.trailer();  
413 - }  
414 -  
415 - QPDFObjectHandle  
416 - get(QPDFObjGen og)  
417 - {  
418 - if (auto it = table.find(og); it != table.end()) {  
419 - return {it->second.object};  
420 - } else if (xref.initialized() && !xref.type(og)) {  
421 - return QPDF_Null::create();  
422 - } else {  
423 - auto result = table.try_emplace(og, QPDF_Unresolved::create(&qpdf, og));  
424 - return {result.first->second.object};  
425 - }  
426 - }  
427 -  
428 - QPDFObjectHandle  
429 - get(int id, int gen)  
430 - {  
431 - return get(QPDFObjGen(id, gen));  
432 - }  
433 -  
434 - std::vector<QPDFObjectHandle> all();  
435 -  
436 - void erase(QPDFObjGen og);  
437 -  
438 - void replace(QPDFObjGen og, QPDFObjectHandle oh);  
439 -  
440 - void swap(QPDFObjGen og1, QPDFObjGen og2);  
441 -  
442 - QPDFObjectHandle read(  
443 - bool attempt_recovery,  
444 - qpdf_offset_t offset,  
445 - std::string const& description,  
446 - QPDFObjGen exp_og,  
447 - QPDFObjGen& og,  
448 - bool skip_cache_if_in_xref);  
449 - QPDFObject* resolve(QPDFObjGen og);  
450 - void update_table(QPDFObjGen og, std::shared_ptr<QPDFObject> const& object);  
451 - QPDFObjGen next_id();  
452 - QPDFObjectHandle make_indirect(std::shared_ptr<QPDFObject> const& obj);  
453 - std::shared_ptr<QPDFObject> get_for_parser(int id, int gen, bool parse_pdf);  
454 - std::shared_ptr<QPDFObject> get_for_json(int id, int gen);  
455 -  
456 - // Get a list of objects that would be permitted in an object stream.  
457 - template <typename T>  
458 - std::vector<T> compressible();  
459 - std::vector<QPDFObjGen> compressible_vector();  
460 - std::vector<bool> compressible_set();  
461 -  
462 - // Used by QPDFWriter to determine the vector part of its object tables.  
463 - size_t table_size();  
464 -  
465 - private:  
466 - struct Entry  
467 - {  
468 - Entry() = default;  
469 -  
470 - Entry(std::shared_ptr<QPDFObject> object) :  
471 - object(object)  
472 - {  
473 - }  
474 -  
475 - std::shared_ptr<QPDFObject> object;  
476 - };  
477 -  
478 - bool cached(QPDFObjGen og);  
479 - bool unresolved(QPDFObjGen og);  
480 -  
481 - QPDFObjectHandle readObjectInStream(std::shared_ptr<InputSource>& input, int obj);  
482 - void resolveObjectsInStream(int obj_stream_number);  
483 - QPDFObjectHandle read_object(std::string const& description, QPDFObjGen og);  
484 - void read_stream(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset);  
485 - void validate_stream_line_end(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset);  
486 - size_t recover_stream_length(  
487 - std::shared_ptr<InputSource> input, QPDFObjGen og, qpdf_offset_t stream_offset);  
488 -  
489 - QPDF& qpdf;  
490 - InputSource* const& file;  
491 - QPDF::Members* m;  
492 - Xref_table xref;  
493 -  
494 - std::map<QPDFObjGen, Entry> table;  
495 -}; // Objects  
496 -  
497 -#endif // QPDF_OBJECTS_HH  
libqpdf/qpdf/QPDF_private.hh
@@ -3,10 +3,378 @@ @@ -3,10 +3,378 @@
3 3
4 #include <qpdf/QPDF.hh> 4 #include <qpdf/QPDF.hh>
5 5
6 -#include <qpdf/QPDF_objects.hh>  
7 -  
8 #include <variant> 6 #include <variant>
9 7
  8 +// Xref_table encapsulates the pdf's xref table and trailer.
  9 +class QPDF::Xref_table
  10 +{
  11 + public:
  12 + Xref_table(QPDF& qpdf, InputSource* const& file) :
  13 + qpdf(qpdf),
  14 + file(file)
  15 + {
  16 + tokenizer.allowEOF();
  17 + }
  18 +
  19 + void initialize();
  20 + void initialize_empty();
  21 + void initialize_json();
  22 + void reconstruct(QPDFExc& e);
  23 + void show();
  24 + bool resolve();
  25 +
  26 + QPDFObjectHandle
  27 + trailer() const
  28 + {
  29 + return trailer_;
  30 + }
  31 +
  32 + void
  33 + trailer(QPDFObjectHandle&& oh)
  34 + {
  35 + trailer_ = std::move(oh);
  36 + }
  37 +
  38 + // Returns 0 if og is not in table.
  39 + size_t
  40 + type(QPDFObjGen og) const
  41 + {
  42 + int id = og.getObj();
  43 + if (id < 1 || static_cast<size_t>(id) >= table.size()) {
  44 + return 0;
  45 + }
  46 + auto& e = table[static_cast<size_t>(id)];
  47 + return e.gen() == og.getGen() ? e.type() : 0;
  48 + }
  49 +
  50 + // Returns 0 if id is not in table.
  51 + size_t
  52 + type(size_t id) const noexcept
  53 + {
  54 + if (id >= table.size()) {
  55 + return 0;
  56 + }
  57 + return table[id].type();
  58 + }
  59 +
  60 + // Returns 0 if og is not in table.
  61 + qpdf_offset_t
  62 + offset(QPDFObjGen og) const noexcept
  63 + {
  64 + int id = og.getObj();
  65 + if (id < 1 || static_cast<size_t>(id) >= table.size()) {
  66 + return 0;
  67 + }
  68 + return table[static_cast<size_t>(id)].offset();
  69 + }
  70 +
  71 + // Returns 0 if id is not in table.
  72 + int
  73 + stream_number(int id) const noexcept
  74 + {
  75 + if (id < 1 || static_cast<size_t>(id) >= table.size()) {
  76 + return 0;
  77 + }
  78 + return table[static_cast<size_t>(id)].stream_number();
  79 + }
  80 +
  81 + int
  82 + stream_index(int id) const noexcept
  83 + {
  84 + if (id < 1 || static_cast<size_t>(id) >= table.size()) {
  85 + return 0;
  86 + }
  87 + return table[static_cast<size_t>(id)].stream_index();
  88 + }
  89 +
  90 + QPDFObjGen at_offset(qpdf_offset_t offset) const noexcept;
  91 +
  92 + std::map<QPDFObjGen, QPDFXRefEntry> as_map() const;
  93 +
  94 + bool
  95 + object_streams() const noexcept
  96 + {
  97 + return object_streams_;
  98 + }
  99 +
  100 + // Return a vector of object id and stream number for each compressed object.
  101 + std::vector<std::pair<unsigned int, int>>
  102 + compressed_objects() const
  103 + {
  104 + if (!initialized()) {
  105 + throw std::logic_error("Xref_table::compressed_objects called before parsing.");
  106 + }
  107 +
  108 + std::vector<std::pair<unsigned int, int>> result;
  109 + result.reserve(table.size());
  110 +
  111 + unsigned int i{0};
  112 + for (auto const& item: table) {
  113 + if (item.type() == 2) {
  114 + result.emplace_back(i, item.stream_number());
  115 + }
  116 + ++i;
  117 + }
  118 + return result;
  119 + }
  120 +
  121 + // Temporary access to underlying table size
  122 + size_t
  123 + size() const noexcept
  124 + {
  125 + return table.size();
  126 + }
  127 +
  128 + void
  129 + ignore_streams(bool val) noexcept
  130 + {
  131 + ignore_streams_ = val;
  132 + }
  133 +
  134 + bool
  135 + initialized() const noexcept
  136 + {
  137 + return initialized_;
  138 + }
  139 +
  140 + void
  141 + attempt_recovery(bool val) noexcept
  142 + {
  143 + attempt_recovery_ = val;
  144 + }
  145 +
  146 + int
  147 + max_id() const noexcept
  148 + {
  149 + return max_id_;
  150 + }
  151 +
  152 + // For Linearization
  153 +
  154 + qpdf_offset_t
  155 + end_after_space(QPDFObjGen og)
  156 + {
  157 + auto& e = entry(toS(og.getObj()));
  158 + switch (e.type()) {
  159 + case 1:
  160 + return e.end_after_space_;
  161 + case 2:
  162 + {
  163 + auto es = entry(toS(e.stream_number()));
  164 + return es.type() == 1 ? es.end_after_space_ : 0;
  165 + }
  166 + default:
  167 + return 0;
  168 + }
  169 + }
  170 +
  171 + qpdf_offset_t
  172 + end_before_space(QPDFObjGen og)
  173 + {
  174 + auto& e = entry(toS(og.getObj()));
  175 + switch (e.type()) {
  176 + case 1:
  177 + return e.end_before_space_;
  178 + case 2:
  179 + {
  180 + auto es = entry(toS(e.stream_number()));
  181 + return es.type() == 1 ? es.end_before_space_ : 0;
  182 + }
  183 + default:
  184 + return 0;
  185 + }
  186 + }
  187 +
  188 + void
  189 + linearization_offsets(size_t id, qpdf_offset_t before, qpdf_offset_t after)
  190 + {
  191 + if (type(id)) {
  192 + table[id].end_before_space_ = before;
  193 + table[id].end_after_space_ = after;
  194 + }
  195 + }
  196 +
  197 + bool
  198 + uncompressed_after_compressed() const noexcept
  199 + {
  200 + return uncompressed_after_compressed_;
  201 + }
  202 +
  203 + // Actual value from file
  204 + qpdf_offset_t
  205 + first_item_offset() const noexcept
  206 + {
  207 + return first_item_offset_;
  208 + }
  209 +
  210 + private:
  211 + // Object, count, offset of first entry
  212 + typedef std::tuple<int, int, qpdf_offset_t> Subsection;
  213 +
  214 + struct Uncompressed
  215 + {
  216 + Uncompressed(qpdf_offset_t offset) :
  217 + offset(offset)
  218 + {
  219 + }
  220 + qpdf_offset_t offset;
  221 + };
  222 +
  223 + struct Compressed
  224 + {
  225 + Compressed(int stream_number, int stream_index) :
  226 + stream_number(stream_number),
  227 + stream_index(stream_index)
  228 + {
  229 + }
  230 + int stream_number{0};
  231 + int stream_index{0};
  232 + };
  233 +
  234 + typedef std::variant<std::monostate, Uncompressed, Compressed> Xref;
  235 +
  236 + struct Entry
  237 + {
  238 + Entry() = default;
  239 +
  240 + Entry(int gen, Xref entry) :
  241 + gen_(gen),
  242 + entry(entry)
  243 + {
  244 + }
  245 +
  246 + int
  247 + gen() const noexcept
  248 + {
  249 + return gen_;
  250 + }
  251 +
  252 + size_t
  253 + type() const noexcept
  254 + {
  255 + return entry.index();
  256 + }
  257 +
  258 + qpdf_offset_t
  259 + offset() const noexcept
  260 + {
  261 + return type() == 1 ? std::get<1>(entry).offset : 0;
  262 + }
  263 +
  264 + int
  265 + stream_number() const noexcept
  266 + {
  267 + return type() == 2 ? std::get<2>(entry).stream_number : 0;
  268 + }
  269 +
  270 + int
  271 + stream_index() const noexcept
  272 + {
  273 + return type() == 2 ? std::get<2>(entry).stream_index : 0;
  274 + }
  275 +
  276 + int gen_{0};
  277 + Xref entry;
  278 + qpdf_offset_t end_before_space_{0};
  279 + qpdf_offset_t end_after_space_{0};
  280 + };
  281 +
  282 + Entry&
  283 + entry(size_t id)
  284 + {
  285 + return id < table.size() ? table[id] : table[0];
  286 + }
  287 +
  288 + void read(qpdf_offset_t offset);
  289 +
  290 + // Methods to parse tables
  291 + qpdf_offset_t process_section(qpdf_offset_t offset);
  292 + std::vector<Subsection> subsections(std::string& line);
  293 + std::vector<Subsection> bad_subsections(std::string& line, qpdf_offset_t offset);
  294 + Subsection subsection(std::string const& line);
  295 + std::tuple<bool, qpdf_offset_t, int, char> read_entry();
  296 + std::tuple<bool, qpdf_offset_t, int, char> read_bad_entry();
  297 +
  298 + // Methods to parse streams
  299 + qpdf_offset_t read_stream(qpdf_offset_t offset);
  300 + qpdf_offset_t process_stream(qpdf_offset_t offset, QPDFObjectHandle& xref_stream);
  301 + std::pair<int, std::array<int, 3>>
  302 + process_W(QPDFObjectHandle& dict, std::function<QPDFExc(std::string_view)> damaged);
  303 + std::pair<int, size_t> process_Size(
  304 + QPDFObjectHandle& dict, int entry_size, std::function<QPDFExc(std::string_view)> damaged);
  305 + std::pair<int, std::vector<std::pair<int, int>>> process_Index(
  306 + QPDFObjectHandle& dict,
  307 + int max_num_entries,
  308 + std::function<QPDFExc(std::string_view)> damaged);
  309 +
  310 + QPDFObjectHandle read_trailer();
  311 +
  312 + QPDFTokenizer::Token
  313 + read_token(size_t max_len = 0)
  314 + {
  315 + return tokenizer.readToken(*file, "", true, max_len);
  316 + }
  317 +
  318 + // Methods to insert table entries
  319 + void insert(int obj, int f0, qpdf_offset_t f1, int f2);
  320 + void insert_free(QPDFObjGen);
  321 +
  322 + QPDFExc
  323 + damaged_pdf(std::string const& msg)
  324 + {
  325 + return qpdf.damagedPDF("", 0, msg);
  326 + }
  327 +
  328 + QPDFExc
  329 + damaged_table(std::string const& msg)
  330 + {
  331 + return qpdf.damagedPDF("xref table", msg);
  332 + }
  333 +
  334 + void
  335 + warn_damaged(std::string const& msg)
  336 + {
  337 + qpdf.warn(damaged_pdf(msg));
  338 + }
  339 +
  340 + QPDF& qpdf;
  341 + InputSource* const& file;
  342 + QPDFTokenizer tokenizer;
  343 +
  344 + std::vector<Entry> table;
  345 + QPDFObjectHandle trailer_;
  346 +
  347 + bool attempt_recovery_{true};
  348 + bool initialized_{false};
  349 + bool ignore_streams_{false};
  350 + bool reconstructed_{false};
  351 + bool object_streams_{false};
  352 + // Before the xref table is initialized, max_id_ is an upper bound on the possible object ids
  353 + // that could be present in the PDF file. Once the trailer has been read, max_id_ is set to the
  354 + // value of /Size. If the file is damaged, max_id_ becomes the maximum object id in the xref
  355 + // table after reconstruction.
  356 + int max_id_{std::numeric_limits<int>::max() - 1};
  357 +
  358 + // Linearization data
  359 + bool uncompressed_after_compressed_{false};
  360 + qpdf_offset_t first_item_offset_{0}; // actual value from file
  361 +};
  362 +
  363 +// The Resolver class is restricted to QPDFObject so that only it can resolve indirect
  364 +// references.
  365 +class QPDF::Resolver
  366 +{
  367 + friend class QPDFObject;
  368 + friend class QPDF_Unresolved;
  369 +
  370 + private:
  371 + static QPDFObject*
  372 + resolved(QPDF* qpdf, QPDFObjGen og)
  373 + {
  374 + return qpdf->resolve(og);
  375 + }
  376 +};
  377 +
10 // StreamCopier class is restricted to QPDFObjectHandle so it can copy stream data. 378 // StreamCopier class is restricted to QPDFObjectHandle so it can copy stream data.
11 class QPDF::StreamCopier 379 class QPDF::StreamCopier
12 { 380 {
@@ -38,7 +406,7 @@ class QPDF::ParseGuard @@ -38,7 +406,7 @@ class QPDF::ParseGuard
38 static std::shared_ptr<QPDFObject> 406 static std::shared_ptr<QPDFObject>
39 getObject(QPDF* qpdf, int id, int gen, bool parse_pdf) 407 getObject(QPDF* qpdf, int id, int gen, bool parse_pdf)
40 { 408 {
41 - return qpdf->objects().get_for_parser(id, gen, parse_pdf); 409 + return qpdf->getObjectForParser(id, gen, parse_pdf);
42 } 410 }
43 411
44 ~ParseGuard() 412 ~ParseGuard()
@@ -72,6 +440,19 @@ class QPDF::Pipe @@ -72,6 +440,19 @@ class QPDF::Pipe
72 } 440 }
73 }; 441 };
74 442
  443 +class QPDF::ObjCache
  444 +{
  445 + public:
  446 + ObjCache() = default;
  447 +
  448 + ObjCache(std::shared_ptr<QPDFObject> object) :
  449 + object(object)
  450 + {
  451 + }
  452 +
  453 + std::shared_ptr<QPDFObject> object;
  454 +};
  455 +
75 class QPDF::ObjCopier 456 class QPDF::ObjCopier
76 { 457 {
77 public: 458 public:
@@ -369,7 +750,8 @@ class QPDF::Members @@ -369,7 +750,8 @@ class QPDF::Members
369 bool check_mode{false}; 750 bool check_mode{false};
370 std::shared_ptr<EncryptionParameters> encp; 751 std::shared_ptr<EncryptionParameters> encp;
371 std::string pdf_version; 752 std::string pdf_version;
372 - Objects objects; 753 + Xref_table xref_table;
  754 + std::map<QPDFObjGen, ObjCache> obj_cache;
373 std::set<QPDFObjGen> resolving; 755 std::set<QPDFObjGen> resolving;
374 std::vector<QPDFObjectHandle> all_pages; 756 std::vector<QPDFObjectHandle> all_pages;
375 bool invalid_page_found{false}; 757 bool invalid_page_found{false};
@@ -418,33 +800,6 @@ class QPDF::Members @@ -418,33 +800,6 @@ class QPDF::Members
418 std::map<QPDFObjGen, std::set<ObjUser>> object_to_obj_users; 800 std::map<QPDFObjGen, std::set<ObjUser>> object_to_obj_users;
419 }; 801 };
420 802
421 -inline QPDF::Objects&  
422 -QPDF::objects() noexcept  
423 -{  
424 - return m->objects;  
425 -}  
426 -  
427 -inline QPDF::Objects const&  
428 -QPDF::objects() const noexcept  
429 -{  
430 - return m->objects;  
431 -}  
432 -  
433 -// The Resolver class is restricted to QPDFObject so that only it can resolve indirect  
434 -// references.  
435 -class QPDF::Resolver  
436 -{  
437 - friend class QPDFObject;  
438 - friend class QPDF_Unresolved;  
439 -  
440 - private:  
441 - static QPDFObject*  
442 - resolved(QPDF* qpdf, QPDFObjGen og)  
443 - {  
444 - return qpdf->m->objects.resolve(og);  
445 - }  
446 -};  
447 -  
448 // JobSetter class is restricted to QPDFJob. 803 // JobSetter class is restricted to QPDFJob.
449 class QPDF::JobSetter 804 class QPDF::JobSetter
450 { 805 {
@@ -521,25 +876,25 @@ class QPDF::Writer @@ -521,25 +876,25 @@ class QPDF::Writer
521 static std::vector<QPDFObjGen> 876 static std::vector<QPDFObjGen>
522 getCompressibleObjGens(QPDF& qpdf) 877 getCompressibleObjGens(QPDF& qpdf)
523 { 878 {
524 - return qpdf.objects().compressible_vector(); 879 + return qpdf.getCompressibleObjVector();
525 } 880 }
526 881
527 static std::vector<bool> 882 static std::vector<bool>
528 getCompressibleObjSet(QPDF& qpdf) 883 getCompressibleObjSet(QPDF& qpdf)
529 { 884 {
530 - return qpdf.objects().compressible_set(); 885 + return qpdf.getCompressibleObjSet();
531 } 886 }
532 887
533 - static Objects::Xref_table const& 888 + static Xref_table const&
534 getXRefTable(QPDF& qpdf) 889 getXRefTable(QPDF& qpdf)
535 { 890 {
536 - return qpdf.objects().xref_table(); 891 + return qpdf.m->xref_table;
537 } 892 }
538 893
539 static size_t 894 static size_t
540 tableSize(QPDF& qpdf) 895 tableSize(QPDF& qpdf)
541 { 896 {
542 - return qpdf.objects().table_size(); 897 + return qpdf.tableSize();
543 } 898 }
544 }; 899 };
545 900