Commit 57da88747e02c9d6bb66b7ff38b4db963fd388c9

Authored by m-holger
1 parent fba542f9

Revert "Split QPDF.cc into QPDF.cc and QPDF_objects.cc"

This reverts commit bb045907a043b5c6de9fb804ff11087333747329.
libqpdf/CMakeLists.txt
@@ -95,7 +95,6 @@ set(libqpdf_SOURCES @@ -95,7 +95,6 @@ set(libqpdf_SOURCES
95 QPDF_encryption.cc 95 QPDF_encryption.cc
96 QPDF_json.cc 96 QPDF_json.cc
97 QPDF_linearization.cc 97 QPDF_linearization.cc
98 - QPDF_objects.cc  
99 QPDF_optimization.cc 98 QPDF_optimization.cc
100 QPDF_pages.cc 99 QPDF_pages.cc
101 QTC.cc 100 QTC.cc
libqpdf/QPDF.cc
@@ -413,26 +413,1726 @@ QPDF::findHeader() @@ -413,26 +413,1726 @@ QPDF::findHeader()
413 return valid; 413 return valid;
414 } 414 }
415 415
  416 +bool
  417 +QPDF::findStartxref()
  418 +{
  419 + if (readToken(*m->file).isWord("startxref") && readToken(*m->file).isInteger()) {
  420 + // Position in front of offset token
  421 + m->file->seek(m->file->getLastOffset(), SEEK_SET);
  422 + return true;
  423 + }
  424 + return false;
  425 +}
  426 +
  427 +void
  428 +QPDF::parse(char const* password)
  429 +{
  430 + if (password) {
  431 + m->encp->provided_password = password;
  432 + }
  433 +
  434 + // Find the header anywhere in the first 1024 bytes of the file.
  435 + PatternFinder hf(*this, &QPDF::findHeader);
  436 + if (!m->file->findFirst("%PDF-", 0, 1024, hf)) {
  437 + QTC::TC("qpdf", "QPDF not a pdf file");
  438 + warn(damagedPDF("", 0, "can't find PDF header"));
  439 + // QPDFWriter writes files that usually require at least version 1.2 for /FlateDecode
  440 + m->pdf_version = "1.2";
  441 + }
  442 +
  443 + // PDF spec says %%EOF must be found within the last 1024 bytes of/ the file. We add an extra
  444 + // 30 characters to leave room for the startxref stuff.
  445 + m->file->seek(0, SEEK_END);
  446 + qpdf_offset_t end_offset = m->file->tell();
  447 + m->xref_table_max_offset = end_offset;
  448 + // Sanity check on object ids. All objects must appear in xref table / stream. In all realistic
  449 + // scenarios at least 3 bytes are required.
  450 + if (m->xref_table_max_id > m->xref_table_max_offset / 3) {
  451 + m->xref_table_max_id = static_cast<int>(m->xref_table_max_offset / 3);
  452 + }
  453 + qpdf_offset_t start_offset = (end_offset > 1054 ? end_offset - 1054 : 0);
  454 + PatternFinder sf(*this, &QPDF::findStartxref);
  455 + qpdf_offset_t xref_offset = 0;
  456 + if (m->file->findLast("startxref", start_offset, 0, sf)) {
  457 + xref_offset = QUtil::string_to_ll(readToken(*m->file).getValue().c_str());
  458 + }
  459 +
  460 + try {
  461 + if (xref_offset == 0) {
  462 + QTC::TC("qpdf", "QPDF can't find startxref");
  463 + throw damagedPDF("", 0, "can't find startxref");
  464 + }
  465 + try {
  466 + read_xref(xref_offset);
  467 + } catch (QPDFExc&) {
  468 + throw;
  469 + } catch (std::exception& e) {
  470 + throw damagedPDF("", 0, std::string("error reading xref: ") + e.what());
  471 + }
  472 + } catch (QPDFExc& e) {
  473 + if (m->attempt_recovery) {
  474 + reconstruct_xref(e, xref_offset > 0);
  475 + QTC::TC("qpdf", "QPDF reconstructed xref table");
  476 + } else {
  477 + throw;
  478 + }
  479 + }
  480 +
  481 + initializeEncryption();
  482 + m->parsed = true;
  483 + if (m->xref_table.size() > 0 && !getRoot().getKey("/Pages").isDictionary()) {
  484 + // QPDFs created from JSON have an empty xref table and no root object yet.
  485 + throw damagedPDF("", 0, "unable to find page tree");
  486 + }
  487 +}
  488 +
  489 +void
  490 +QPDF::inParse(bool v)
  491 +{
  492 + if (m->in_parse == v) {
  493 + // This happens if QPDFParser::parse tries to resolve an indirect object while it is
  494 + // parsing.
  495 + throw std::logic_error(
  496 + "QPDF: re-entrant parsing detected. This is a qpdf bug."
  497 + " Please report at https://github.com/qpdf/qpdf/issues.");
  498 + }
  499 + m->in_parse = v;
  500 +}
  501 +
  502 +void
  503 +QPDF::warn(QPDFExc const& e)
  504 +{
  505 + if (m->max_warnings > 0 && m->warnings.size() >= m->max_warnings) {
  506 + stopOnError("Too many warnings - file is too badly damaged");
  507 + }
  508 + m->warnings.push_back(e);
  509 + if (!m->suppress_warnings) {
  510 + *m->log->getWarn() << "WARNING: " << m->warnings.back().what() << "\n";
  511 + }
  512 +}
  513 +
  514 +void
  515 +QPDF::warn(
  516 + qpdf_error_code_e error_code,
  517 + std::string const& object,
  518 + qpdf_offset_t offset,
  519 + std::string const& message)
  520 +{
  521 + warn(QPDFExc(error_code, getFilename(), object, offset, message));
  522 +}
  523 +
  524 +void
  525 +QPDF::setTrailer(QPDFObjectHandle obj)
  526 +{
  527 + if (m->trailer) {
  528 + return;
  529 + }
  530 + m->trailer = obj;
  531 +}
  532 +
  533 +void
  534 +QPDF::reconstruct_xref(QPDFExc& e, bool found_startxref)
  535 +{
  536 + if (m->reconstructed_xref) {
  537 + // Avoid xref reconstruction infinite loops. This is getting very hard to reproduce because
  538 + // qpdf is throwing many fewer exceptions while parsing. Most situations are warnings now.
  539 + throw e;
  540 + }
  541 +
  542 + // If recovery generates more than 1000 warnings, the file is so severely damaged that there
  543 + // probably is no point trying to continue.
  544 + const auto max_warnings = m->warnings.size() + 1000U;
  545 + auto check_warnings = [this, max_warnings]() {
  546 + if (m->warnings.size() > max_warnings) {
  547 + throw damagedPDF("", 0, "too many errors while reconstructing cross-reference table");
  548 + }
  549 + };
  550 +
  551 + m->reconstructed_xref = true;
  552 + // We may find more objects, which may contain dangling references.
  553 + m->fixed_dangling_refs = false;
  554 +
  555 + warn(damagedPDF("", 0, "file is damaged"));
  556 + warn(e);
  557 + warn(damagedPDF("", 0, "Attempting to reconstruct cross-reference table"));
  558 +
  559 + // Delete all references to type 1 (uncompressed) objects
  560 + std::vector<QPDFObjGen> to_delete;
  561 + for (auto const& iter: m->xref_table) {
  562 + if (iter.second.getType() == 1) {
  563 + to_delete.emplace_back(iter.first);
  564 + }
  565 + }
  566 + for (auto const& iter: to_delete) {
  567 + m->xref_table.erase(iter);
  568 + }
  569 +
  570 + std::vector<std::tuple<int, int, qpdf_offset_t>> found_objects;
  571 + std::vector<qpdf_offset_t> trailers;
  572 + std::vector<qpdf_offset_t> startxrefs;
  573 +
  574 + m->file->seek(0, SEEK_END);
  575 + qpdf_offset_t eof = m->file->tell();
  576 + m->file->seek(0, SEEK_SET);
  577 + // Don't allow very long tokens here during recovery. All the interesting tokens are covered.
  578 + static size_t const MAX_LEN = 10;
  579 + while (m->file->tell() < eof) {
  580 + QPDFTokenizer::Token t1 = readToken(*m->file, MAX_LEN);
  581 + qpdf_offset_t token_start = m->file->tell() - toO(t1.getValue().length());
  582 + if (t1.isInteger()) {
  583 + auto pos = m->file->tell();
  584 + auto t2 = readToken(*m->file, MAX_LEN);
  585 + if (t2.isInteger() && readToken(*m->file, MAX_LEN).isWord("obj")) {
  586 + int obj = QUtil::string_to_int(t1.getValue().c_str());
  587 + int gen = QUtil::string_to_int(t2.getValue().c_str());
  588 + if (obj <= m->xref_table_max_id) {
  589 + found_objects.emplace_back(obj, gen, token_start);
  590 + } else {
  591 + warn(damagedPDF(
  592 + "", 0, "ignoring object with impossibly large id " + std::to_string(obj)));
  593 + }
  594 + }
  595 + m->file->seek(pos, SEEK_SET);
  596 + } else if (!m->trailer && t1.isWord("trailer")) {
  597 + trailers.emplace_back(m->file->tell());
  598 + } else if (!found_startxref && t1.isWord("startxref")) {
  599 + startxrefs.emplace_back(m->file->tell());
  600 + }
  601 + check_warnings();
  602 + m->file->findAndSkipNextEOL();
  603 + }
  604 +
  605 + if (!found_startxref && !startxrefs.empty() && !found_objects.empty() &&
  606 + startxrefs.back() > std::get<2>(found_objects.back())) {
  607 + try {
  608 + m->file->seek(startxrefs.back(), SEEK_SET);
  609 + if (auto offset = QUtil::string_to_ll(readToken(*m->file).getValue().data())) {
  610 + read_xref(offset);
  611 + if (getRoot().getKey("/Pages").isDictionary()) {
  612 + QTC::TC("qpdf", "QPDF startxref more than 1024 before end");
  613 + warn(
  614 + damagedPDF("", 0, "startxref was more than 1024 bytes before end of file"));
  615 + initializeEncryption();
  616 + m->parsed = true;
  617 + m->reconstructed_xref = false;
  618 + return;
  619 + }
  620 + }
  621 + } catch (...) {
  622 + // ok, bad luck. Do recovery.
  623 + }
  624 + }
  625 +
  626 + auto rend = found_objects.rend();
  627 + for (auto it = found_objects.rbegin(); it != rend; it++) {
  628 + auto [obj, gen, token_start] = *it;
  629 + insertXrefEntry(obj, 1, token_start, gen);
  630 + check_warnings();
  631 + }
  632 + m->deleted_objects.clear();
  633 +
  634 + for (auto it = trailers.rbegin(); it != trailers.rend(); it++) {
  635 + m->file->seek(*it, SEEK_SET);
  636 + auto t = readTrailer();
  637 + if (!t.isDictionary()) {
  638 + // Oh well. It was worth a try.
  639 + } else {
  640 + if (t.hasKey("/Root")) {
  641 + m->trailer = t;
  642 + break;
  643 + }
  644 + warn(damagedPDF("trailer", *it, "recovered trailer has no /Root entry"));
  645 + }
  646 + check_warnings();
  647 + }
  648 +
  649 + if (!m->trailer) {
  650 + qpdf_offset_t max_offset{0};
  651 + size_t max_size{0};
  652 + // If there are any xref streams, take the last one to appear.
  653 + for (auto const& iter: m->xref_table) {
  654 + auto entry = iter.second;
  655 + if (entry.getType() != 1) {
  656 + continue;
  657 + }
  658 + auto oh = getObject(iter.first);
  659 + try {
  660 + if (!oh.isStreamOfType("/XRef")) {
  661 + continue;
  662 + }
  663 + } catch (std::exception&) {
  664 + continue;
  665 + }
  666 + auto offset = entry.getOffset();
  667 + auto size = oh.getDict().getKey("/Size").getUIntValueAsUInt();
  668 + if (size > max_size || (size == max_size && offset > max_offset)) {
  669 + max_offset = offset;
  670 + setTrailer(oh.getDict());
  671 + }
  672 + check_warnings();
  673 + }
  674 + if (max_offset > 0) {
  675 + try {
  676 + read_xref(max_offset);
  677 + } catch (std::exception&) {
  678 + warn(damagedPDF(
  679 + "", 0, "error decoding candidate xref stream while recovering damaged file"));
  680 + }
  681 + QTC::TC("qpdf", "QPDF recover xref stream");
  682 + }
  683 + }
  684 +
  685 + if (!m->trailer || (!m->parsed && !m->trailer.getKey("/Root").isDictionary())) {
  686 + // Try to find a Root dictionary. As a quick fix try the one with the highest object id.
  687 + QPDFObjectHandle root;
  688 + for (auto const& iter: m->obj_cache) {
  689 + try {
  690 + if (QPDFObjectHandle(iter.second.object).isDictionaryOfType("/Catalog")) {
  691 + root = iter.second.object;
  692 + }
  693 + } catch (std::exception&) {
  694 + continue;
  695 + }
  696 + }
  697 + if (root) {
  698 + if (!m->trailer) {
  699 + warn(damagedPDF(
  700 + "", 0, "unable to find trailer dictionary while recovering damaged file"));
  701 + m->trailer = QPDFObjectHandle::newDictionary();
  702 + }
  703 + m->trailer.replaceKey("/Root", root);
  704 + }
  705 + }
  706 +
  707 + if (!m->trailer) {
  708 + // We could check the last encountered object to see if it was an xref stream. If so, we
  709 + // could try to get the trailer from there. This may make it possible to recover files with
  710 + // bad startxref pointers even when they have object streams.
  711 +
  712 + throw damagedPDF("", 0, "unable to find trailer dictionary while recovering damaged file");
  713 + }
  714 + if (m->xref_table.empty()) {
  715 + // We cannot check for an empty xref table in parse because empty tables are valid when
  716 + // creating QPDF objects from JSON.
  717 + throw damagedPDF("", 0, "unable to find objects while recovering damaged file");
  718 + }
  719 + check_warnings();
  720 + if (!m->parsed) {
  721 + m->parsed = true;
  722 + getAllPages();
  723 + check_warnings();
  724 + if (m->all_pages.empty()) {
  725 + m->parsed = false;
  726 + throw damagedPDF("", 0, "unable to find any pages while recovering damaged file");
  727 + }
  728 + }
  729 + // We could iterate through the objects looking for streams and try to find objects inside of
  730 + // them, but it's probably not worth the trouble. Acrobat can't recover files with any errors
  731 + // in an xref stream, and this would be a real long shot anyway. If we wanted to do anything
  732 + // that involved looking at stream contents, we'd also have to call initializeEncryption() here.
  733 + // It's safe to call it more than once.
  734 +}
  735 +
  736 +void
  737 +QPDF::read_xref(qpdf_offset_t xref_offset)
  738 +{
  739 + std::map<int, int> free_table;
  740 + std::set<qpdf_offset_t> visited;
  741 + while (xref_offset) {
  742 + visited.insert(xref_offset);
  743 + char buf[7];
  744 + memset(buf, 0, sizeof(buf));
  745 + m->file->seek(xref_offset, SEEK_SET);
  746 + // Some files miss the mark a little with startxref. We could do a better job of searching
  747 + // in the neighborhood for something that looks like either an xref table or stream, but the
  748 + // simple heuristic of skipping whitespace can help with the xref table case and is harmless
  749 + // with the stream case.
  750 + bool done = false;
  751 + bool skipped_space = false;
  752 + while (!done) {
  753 + char ch;
  754 + if (1 == m->file->read(&ch, 1)) {
  755 + if (util::is_space(ch)) {
  756 + skipped_space = true;
  757 + } else {
  758 + m->file->unreadCh(ch);
  759 + done = true;
  760 + }
  761 + } else {
  762 + QTC::TC("qpdf", "QPDF eof skipping spaces before xref", skipped_space ? 0 : 1);
  763 + done = true;
  764 + }
  765 + }
  766 +
  767 + m->file->read(buf, sizeof(buf) - 1);
  768 + // The PDF spec says xref must be followed by a line terminator, but files exist in the wild
  769 + // where it is terminated by arbitrary whitespace.
  770 + if ((strncmp(buf, "xref", 4) == 0) && util::is_space(buf[4])) {
  771 + if (skipped_space) {
  772 + QTC::TC("qpdf", "QPDF xref skipped space");
  773 + warn(damagedPDF("", 0, "extraneous whitespace seen before xref"));
  774 + }
  775 + QTC::TC(
  776 + "qpdf",
  777 + "QPDF xref space",
  778 + ((buf[4] == '\n') ? 0
  779 + : (buf[4] == '\r') ? 1
  780 + : (buf[4] == ' ') ? 2
  781 + : 9999));
  782 + int skip = 4;
  783 + // buf is null-terminated, and util::is_space('\0') is false, so this won't overrun.
  784 + while (util::is_space(buf[skip])) {
  785 + ++skip;
  786 + }
  787 + xref_offset = read_xrefTable(xref_offset + skip);
  788 + } else {
  789 + xref_offset = read_xrefStream(xref_offset);
  790 + }
  791 + if (visited.count(xref_offset) != 0) {
  792 + QTC::TC("qpdf", "QPDF xref loop");
  793 + throw damagedPDF("", 0, "loop detected following xref tables");
  794 + }
  795 + }
  796 +
  797 + if (!m->trailer) {
  798 + throw damagedPDF("", 0, "unable to find trailer while reading xref");
  799 + }
  800 + int size = m->trailer.getKey("/Size").getIntValueAsInt();
  801 + int max_obj = 0;
  802 + if (!m->xref_table.empty()) {
  803 + max_obj = m->xref_table.rbegin()->first.getObj();
  804 + }
  805 + if (!m->deleted_objects.empty()) {
  806 + max_obj = std::max(max_obj, *(m->deleted_objects.rbegin()));
  807 + }
  808 + if ((size < 1) || (size - 1 != max_obj)) {
  809 + QTC::TC("qpdf", "QPDF xref size mismatch");
  810 + warn(damagedPDF(
  811 + "",
  812 + 0,
  813 + ("reported number of objects (" + std::to_string(size) +
  814 + ") is not one plus the highest object number (" + std::to_string(max_obj) + ")")));
  815 + }
  816 +
  817 + // We no longer need the deleted_objects table, so go ahead and clear it out to make sure we
  818 + // never depend on its being set.
  819 + m->deleted_objects.clear();
  820 +
  821 + // Make sure we keep only the highest generation for any object.
  822 + QPDFObjGen last_og{-1, 0};
  823 + for (auto const& item: m->xref_table) {
  824 + auto id = item.first.getObj();
  825 + if (id == last_og.getObj() && id > 0) {
  826 + removeObject(last_og);
  827 + }
  828 + last_og = item.first;
  829 + }
  830 +}
  831 +
  832 +bool
  833 +QPDF::parse_xrefFirst(std::string const& line, int& obj, int& num, int& bytes)
  834 +{
  835 + // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated
  836 + // buffer.
  837 + char const* p = line.c_str();
  838 + char const* start = line.c_str();
  839 +
  840 + // Skip zero or more spaces
  841 + while (util::is_space(*p)) {
  842 + ++p;
  843 + }
  844 + // Require digit
  845 + if (!util::is_digit(*p)) {
  846 + return false;
  847 + }
  848 + // Gather digits
  849 + std::string obj_str;
  850 + while (util::is_digit(*p)) {
  851 + obj_str.append(1, *p++);
  852 + }
  853 + // Require space
  854 + if (!util::is_space(*p)) {
  855 + return false;
  856 + }
  857 + // Skip spaces
  858 + while (util::is_space(*p)) {
  859 + ++p;
  860 + }
  861 + // Require digit
  862 + if (!util::is_digit(*p)) {
  863 + return false;
  864 + }
  865 + // Gather digits
  866 + std::string num_str;
  867 + while (util::is_digit(*p)) {
  868 + num_str.append(1, *p++);
  869 + }
  870 + // Skip any space including line terminators
  871 + while (util::is_space(*p)) {
  872 + ++p;
  873 + }
  874 + bytes = toI(p - start);
  875 + obj = QUtil::string_to_int(obj_str.c_str());
  876 + num = QUtil::string_to_int(num_str.c_str());
  877 + return true;
  878 +}
  879 +
  880 +bool
  881 +QPDF::read_bad_xrefEntry(qpdf_offset_t& f1, int& f2, char& type)
  882 +{
  883 + // Reposition after initial read attempt and reread.
  884 + m->file->seek(m->file->getLastOffset(), SEEK_SET);
  885 + auto line = m->file->readLine(30);
  886 +
  887 + // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated
  888 + // buffer.
  889 + char const* p = line.data();
  890 +
  891 + // Skip zero or more spaces. There aren't supposed to be any.
  892 + bool invalid = false;
  893 + while (util::is_space(*p)) {
  894 + ++p;
  895 + QTC::TC("qpdf", "QPDF ignore first space in xref entry");
  896 + invalid = true;
  897 + }
  898 + // Require digit
  899 + if (!util::is_digit(*p)) {
  900 + return false;
  901 + }
  902 + // Gather digits
  903 + std::string f1_str;
  904 + while (util::is_digit(*p)) {
  905 + f1_str.append(1, *p++);
  906 + }
  907 + // Require space
  908 + if (!util::is_space(*p)) {
  909 + return false;
  910 + }
  911 + if (util::is_space(*(p + 1))) {
  912 + QTC::TC("qpdf", "QPDF ignore first extra space in xref entry");
  913 + invalid = true;
  914 + }
  915 + // Skip spaces
  916 + while (util::is_space(*p)) {
  917 + ++p;
  918 + }
  919 + // Require digit
  920 + if (!util::is_digit(*p)) {
  921 + return false;
  922 + }
  923 + // Gather digits
  924 + std::string f2_str;
  925 + while (util::is_digit(*p)) {
  926 + f2_str.append(1, *p++);
  927 + }
  928 + // Require space
  929 + if (!util::is_space(*p)) {
  930 + return false;
  931 + }
  932 + if (util::is_space(*(p + 1))) {
  933 + QTC::TC("qpdf", "QPDF ignore second extra space in xref entry");
  934 + invalid = true;
  935 + }
  936 + // Skip spaces
  937 + while (util::is_space(*p)) {
  938 + ++p;
  939 + }
  940 + if ((*p == 'f') || (*p == 'n')) {
  941 + type = *p;
  942 + } else {
  943 + return false;
  944 + }
  945 + if ((f1_str.length() != 10) || (f2_str.length() != 5)) {
  946 + QTC::TC("qpdf", "QPDF ignore length error xref entry");
  947 + invalid = true;
  948 + }
  949 +
  950 + if (invalid) {
  951 + warn(damagedPDF("xref table", "accepting invalid xref table entry"));
  952 + }
  953 +
  954 + f1 = QUtil::string_to_ll(f1_str.c_str());
  955 + f2 = QUtil::string_to_int(f2_str.c_str());
  956 +
  957 + return true;
  958 +}
  959 +
  960 +// Optimistically read and parse xref entry. If entry is bad, call read_bad_xrefEntry and return
  961 +// result.
  962 +bool
  963 +QPDF::read_xrefEntry(qpdf_offset_t& f1, int& f2, char& type)
  964 +{
  965 + std::array<char, 21> line;
  966 + if (m->file->read(line.data(), 20) != 20) {
  967 + // C++20: [[unlikely]]
  968 + return false;
  969 + }
  970 + line[20] = '\0';
  971 + char const* p = line.data();
  972 +
  973 + int f1_len = 0;
  974 + int f2_len = 0;
  975 +
  976 + // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated
  977 + // buffer.
  978 +
  979 + // Gather f1 digits. NB No risk of overflow as 9'999'999'999 < max long long.
  980 + while (*p == '0') {
  981 + ++f1_len;
  982 + ++p;
  983 + }
  984 + while (util::is_digit(*p) && f1_len++ < 10) {
  985 + f1 *= 10;
  986 + f1 += *p++ - '0';
  987 + }
  988 + // Require space
  989 + if (!util::is_space(*p++)) {
  990 + // Entry doesn't start with space or digit.
  991 + // C++20: [[unlikely]]
  992 + return false;
  993 + }
  994 + // Gather digits. NB No risk of overflow as 99'999 < max int.
  995 + while (*p == '0') {
  996 + ++f2_len;
  997 + ++p;
  998 + }
  999 + while (util::is_digit(*p) && f2_len++ < 5) {
  1000 + f2 *= 10;
  1001 + f2 += static_cast<int>(*p++ - '0');
  1002 + }
  1003 + if (util::is_space(*p++) && (*p == 'f' || *p == 'n')) {
  1004 + // C++20: [[likely]]
  1005 + type = *p;
  1006 + // No test for valid line[19].
  1007 + if (*(++p) && *(++p) && (*p == '\n' || *p == '\r') && f1_len == 10 && f2_len == 5) {
  1008 + // C++20: [[likely]]
  1009 + return true;
  1010 + }
  1011 + }
  1012 + return read_bad_xrefEntry(f1, f2, type);
  1013 +}
  1014 +
  1015 +// Read a single cross-reference table section and associated trailer.
  1016 +qpdf_offset_t
  1017 +QPDF::read_xrefTable(qpdf_offset_t xref_offset)
  1018 +{
  1019 + m->file->seek(xref_offset, SEEK_SET);
  1020 + std::string line;
  1021 + while (true) {
  1022 + line.assign(50, '\0');
  1023 + m->file->read(line.data(), line.size());
  1024 + int obj = 0;
  1025 + int num = 0;
  1026 + int bytes = 0;
  1027 + if (!parse_xrefFirst(line, obj, num, bytes)) {
  1028 + QTC::TC("qpdf", "QPDF invalid xref");
  1029 + throw damagedPDF("xref table", "xref syntax invalid");
  1030 + }
  1031 + m->file->seek(m->file->getLastOffset() + bytes, SEEK_SET);
  1032 + for (qpdf_offset_t i = obj; i - num < obj; ++i) {
  1033 + if (i == 0) {
  1034 + // This is needed by checkLinearization()
  1035 + m->first_xref_item_offset = m->file->tell();
  1036 + }
  1037 + // For xref_table, these will always be small enough to be ints
  1038 + qpdf_offset_t f1 = 0;
  1039 + int f2 = 0;
  1040 + char type = '\0';
  1041 + if (!read_xrefEntry(f1, f2, type)) {
  1042 + QTC::TC("qpdf", "QPDF invalid xref entry");
  1043 + throw damagedPDF(
  1044 + "xref table", "invalid xref entry (obj=" + std::to_string(i) + ")");
  1045 + }
  1046 + if (type == 'f') {
  1047 + insertFreeXrefEntry(QPDFObjGen(toI(i), f2));
  1048 + } else {
  1049 + insertXrefEntry(toI(i), 1, f1, f2);
  1050 + }
  1051 + }
  1052 + qpdf_offset_t pos = m->file->tell();
  1053 + if (readToken(*m->file).isWord("trailer")) {
  1054 + break;
  1055 + } else {
  1056 + m->file->seek(pos, SEEK_SET);
  1057 + }
  1058 + }
  1059 +
  1060 + // Set offset to previous xref table if any
  1061 + QPDFObjectHandle cur_trailer = readTrailer();
  1062 + if (!cur_trailer.isDictionary()) {
  1063 + QTC::TC("qpdf", "QPDF missing trailer");
  1064 + throw damagedPDF("", "expected trailer dictionary");
  1065 + }
  1066 +
  1067 + if (!m->trailer) {
  1068 + setTrailer(cur_trailer);
  1069 +
  1070 + if (!m->trailer.hasKey("/Size")) {
  1071 + QTC::TC("qpdf", "QPDF trailer lacks size");
  1072 + throw damagedPDF("trailer", "trailer dictionary lacks /Size key");
  1073 + }
  1074 + if (!m->trailer.getKey("/Size").isInteger()) {
  1075 + QTC::TC("qpdf", "QPDF trailer size not integer");
  1076 + throw damagedPDF("trailer", "/Size key in trailer dictionary is not an integer");
  1077 + }
  1078 + }
  1079 +
  1080 + if (cur_trailer.hasKey("/XRefStm")) {
  1081 + if (m->ignore_xref_streams) {
  1082 + QTC::TC("qpdf", "QPDF ignoring XRefStm in trailer");
  1083 + } else {
  1084 + if (cur_trailer.getKey("/XRefStm").isInteger()) {
  1085 + // Read the xref stream but disregard any return value -- we'll use our trailer's
  1086 + // /Prev key instead of the xref stream's.
  1087 + (void)read_xrefStream(cur_trailer.getKey("/XRefStm").getIntValue());
  1088 + } else {
  1089 + throw damagedPDF("xref stream", xref_offset, "invalid /XRefStm");
  1090 + }
  1091 + }
  1092 + }
  1093 +
  1094 + if (cur_trailer.hasKey("/Prev")) {
  1095 + if (!cur_trailer.getKey("/Prev").isInteger()) {
  1096 + QTC::TC("qpdf", "QPDF trailer prev not integer");
  1097 + throw damagedPDF("trailer", "/Prev key in trailer dictionary is not an integer");
  1098 + }
  1099 + QTC::TC("qpdf", "QPDF prev key in trailer dictionary");
  1100 + return cur_trailer.getKey("/Prev").getIntValue();
  1101 + }
  1102 +
  1103 + return 0;
  1104 +}
  1105 +
  1106 +// Read a single cross-reference stream.
  1107 +qpdf_offset_t
  1108 +QPDF::read_xrefStream(qpdf_offset_t xref_offset)
  1109 +{
  1110 + if (!m->ignore_xref_streams) {
  1111 + QPDFObjGen x_og;
  1112 + QPDFObjectHandle xref_obj;
  1113 + try {
  1114 + xref_obj =
  1115 + readObjectAtOffset(false, xref_offset, "xref stream", QPDFObjGen(0, 0), x_og, true);
  1116 + } catch (QPDFExc&) {
  1117 + // ignore -- report error below
  1118 + }
  1119 + if (xref_obj.isStreamOfType("/XRef")) {
  1120 + QTC::TC("qpdf", "QPDF found xref stream");
  1121 + return processXRefStream(xref_offset, xref_obj);
  1122 + }
  1123 + }
  1124 +
  1125 + QTC::TC("qpdf", "QPDF can't find xref");
  1126 + throw damagedPDF("", xref_offset, "xref not found");
  1127 + return 0; // unreachable
  1128 +}
  1129 +
  1130 +// Return the entry size of the xref stream and the processed W array.
  1131 +std::pair<int, std::array<int, 3>>
  1132 +QPDF::processXRefW(QPDFObjectHandle& dict, std::function<QPDFExc(std::string_view)> damaged)
  1133 +{
  1134 + auto W_obj = dict.getKey("/W");
  1135 + if (!(W_obj.isArray() && (W_obj.getArrayNItems() >= 3) && W_obj.getArrayItem(0).isInteger() &&
  1136 + W_obj.getArrayItem(1).isInteger() && W_obj.getArrayItem(2).isInteger())) {
  1137 + throw damaged("Cross-reference stream does not have a proper /W key");
  1138 + }
  1139 +
  1140 + std::array<int, 3> W;
  1141 + int entry_size = 0;
  1142 + auto w_vector = W_obj.getArrayAsVector();
  1143 + int max_bytes = sizeof(qpdf_offset_t);
  1144 + for (size_t i = 0; i < 3; ++i) {
  1145 + W[i] = w_vector[i].getIntValueAsInt();
  1146 + if (W[i] > max_bytes) {
  1147 + throw damaged("Cross-reference stream's /W contains impossibly large values");
  1148 + }
  1149 + if (W[i] < 0) {
  1150 + throw damaged("Cross-reference stream's /W contains negative values");
  1151 + }
  1152 + entry_size += W[i];
  1153 + }
  1154 + if (entry_size == 0) {
  1155 + throw damaged("Cross-reference stream's /W indicates entry size of 0");
  1156 + }
  1157 + return {entry_size, W};
  1158 +}
  1159 +
  1160 +// Validate Size key and return the maximum number of entries that the xref stream can contain.
  1161 +int
  1162 +QPDF::processXRefSize(
  1163 + QPDFObjectHandle& dict, int entry_size, std::function<QPDFExc(std::string_view)> damaged)
  1164 +{
  1165 + // Number of entries is limited by the highest possible object id and stream size.
  1166 + auto max_num_entries = std::numeric_limits<int>::max();
  1167 + if (max_num_entries > (std::numeric_limits<qpdf_offset_t>::max() / entry_size)) {
  1168 + max_num_entries = toI(std::numeric_limits<qpdf_offset_t>::max() / entry_size);
  1169 + }
  1170 +
  1171 + auto Size_obj = dict.getKey("/Size");
  1172 + long long size;
  1173 + if (!dict.getKey("/Size").getValueAsInt(size)) {
  1174 + throw damaged("Cross-reference stream does not have a proper /Size key");
  1175 + } else if (size < 0) {
  1176 + throw damaged("Cross-reference stream has a negative /Size key");
  1177 + } else if (size >= max_num_entries) {
  1178 + throw damaged("Cross-reference stream has an impossibly large /Size key");
  1179 + }
  1180 + // We are not validating that Size <= (Size key of parent xref / trailer).
  1181 + return max_num_entries;
  1182 +}
  1183 +
  1184 +// Return the number of entries of the xref stream and the processed Index array.
  1185 +std::pair<int, std::vector<std::pair<int, int>>>
  1186 +QPDF::processXRefIndex(
  1187 + QPDFObjectHandle& dict, int max_num_entries, std::function<QPDFExc(std::string_view)> damaged)
  1188 +{
  1189 + auto size = dict.getKey("/Size").getIntValueAsInt();
  1190 + auto Index_obj = dict.getKey("/Index");
  1191 +
  1192 + if (Index_obj.isArray()) {
  1193 + std::vector<std::pair<int, int>> indx;
  1194 + int num_entries = 0;
  1195 + auto index_vec = Index_obj.getArrayAsVector();
  1196 + if ((index_vec.size() % 2) || index_vec.size() < 2) {
  1197 + throw damaged("Cross-reference stream's /Index has an invalid number of values");
  1198 + }
  1199 +
  1200 + int i = 0;
  1201 + long long first = 0;
  1202 + for (auto& val: index_vec) {
  1203 + if (val.isInteger()) {
  1204 + if (i % 2) {
  1205 + auto count = val.getIntValue();
  1206 + if (count <= 0) {
  1207 + throw damaged(
  1208 + "Cross-reference stream section claims to contain " +
  1209 + std::to_string(count) + " entries");
  1210 + }
  1211 + // We are guarding against the possibility of num_entries * entry_size
  1212 + // overflowing. We are not checking that entries are in ascending order as
  1213 + // required by the spec, which probably should generate a warning. We are also
  1214 + // not checking that for each subsection first object number + number of entries
  1215 + // <= /Size. The spec requires us to ignore object number > /Size.
  1216 + if (first > (max_num_entries - count) ||
  1217 + count > (max_num_entries - num_entries)) {
  1218 + throw damaged(
  1219 + "Cross-reference stream claims to contain too many entries: " +
  1220 + std::to_string(first) + " " + std::to_string(max_num_entries) + " " +
  1221 + std::to_string(num_entries));
  1222 + }
  1223 + indx.emplace_back(static_cast<int>(first), static_cast<int>(count));
  1224 + num_entries += static_cast<int>(count);
  1225 + } else {
  1226 + first = val.getIntValue();
  1227 + if (first < 0) {
  1228 + throw damaged(
  1229 + "Cross-reference stream's /Index contains a negative object id");
  1230 + } else if (first > max_num_entries) {
  1231 + throw damaged(
  1232 + "Cross-reference stream's /Index contains an impossibly "
  1233 + "large object id");
  1234 + }
  1235 + }
  1236 + } else {
  1237 + throw damaged(
  1238 + "Cross-reference stream's /Index's item " + std::to_string(i) +
  1239 + " is not an integer");
  1240 + }
  1241 + i++;
  1242 + }
  1243 + QTC::TC("qpdf", "QPDF xref /Index is array", index_vec.size() == 2 ? 0 : 1);
  1244 + return {num_entries, indx};
  1245 + } else if (Index_obj.isNull()) {
  1246 + QTC::TC("qpdf", "QPDF xref /Index is null");
  1247 + return {size, {{0, size}}};
  1248 + } else {
  1249 + throw damaged("Cross-reference stream does not have a proper /Index key");
  1250 + }
  1251 +}
  1252 +
  1253 +qpdf_offset_t
  1254 +QPDF::processXRefStream(qpdf_offset_t xref_offset, QPDFObjectHandle& xref_obj)
  1255 +{
  1256 + auto damaged = [this, xref_offset](std::string_view msg) -> QPDFExc {
  1257 + return damagedPDF("xref stream", xref_offset, msg.data());
  1258 + };
  1259 +
  1260 + auto dict = xref_obj.getDict();
  1261 +
  1262 + auto [entry_size, W] = processXRefW(dict, damaged);
  1263 + int max_num_entries = processXRefSize(dict, entry_size, damaged);
  1264 + auto [num_entries, indx] = processXRefIndex(dict, max_num_entries, damaged);
  1265 +
  1266 + std::shared_ptr<Buffer> bp = xref_obj.getStreamData(qpdf_dl_specialized);
  1267 + size_t actual_size = bp->getSize();
  1268 + auto expected_size = toS(entry_size) * toS(num_entries);
  1269 +
  1270 + if (expected_size != actual_size) {
  1271 + QPDFExc x = damaged(
  1272 + "Cross-reference stream data has the wrong size; expected = " +
  1273 + std::to_string(expected_size) + "; actual = " + std::to_string(actual_size));
  1274 + if (expected_size > actual_size) {
  1275 + throw x;
  1276 + } else {
  1277 + warn(x);
  1278 + }
  1279 + }
  1280 +
  1281 + bool saw_first_compressed_object = false;
  1282 +
  1283 + // Actual size vs. expected size check above ensures that we will not overflow any buffers here.
  1284 + // We know that entry_size * num_entries is less or equal to the size of the buffer.
  1285 + auto p = bp->getBuffer();
  1286 + for (auto [obj, sec_entries]: indx) {
  1287 + // Process a subsection.
  1288 + for (int i = 0; i < sec_entries; ++i) {
  1289 + // Read this entry
  1290 + std::array<qpdf_offset_t, 3> fields{};
  1291 + if (W[0] == 0) {
  1292 + QTC::TC("qpdf", "QPDF default for xref stream field 0");
  1293 + fields[0] = 1;
  1294 + }
  1295 + for (size_t j = 0; j < 3; ++j) {
  1296 + for (int k = 0; k < W[j]; ++k) {
  1297 + fields[j] <<= 8;
  1298 + fields[j] |= *p++;
  1299 + }
  1300 + }
  1301 +
  1302 + // Get the generation number. The generation number is 0 unless this is an uncompressed
  1303 + // object record, in which case the generation number appears as the third field.
  1304 + if (saw_first_compressed_object) {
  1305 + if (fields[0] != 2) {
  1306 + m->uncompressed_after_compressed = true;
  1307 + }
  1308 + } else if (fields[0] == 2) {
  1309 + saw_first_compressed_object = true;
  1310 + }
  1311 + if (obj == 0) {
  1312 + // This is needed by checkLinearization()
  1313 + m->first_xref_item_offset = xref_offset;
  1314 + } else if (fields[0] == 0) {
  1315 + // Ignore fields[2], which we don't care about in this case. This works around the
  1316 + // issue of some PDF files that put invalid values, like -1, here for deleted
  1317 + // objects.
  1318 + insertFreeXrefEntry(QPDFObjGen(obj, 0));
  1319 + } else {
  1320 + insertXrefEntry(obj, toI(fields[0]), fields[1], toI(fields[2]));
  1321 + }
  1322 + ++obj;
  1323 + }
  1324 + }
  1325 +
  1326 + if (!m->trailer) {
  1327 + setTrailer(dict);
  1328 + }
  1329 +
  1330 + if (dict.hasKey("/Prev")) {
  1331 + if (!dict.getKey("/Prev").isInteger()) {
  1332 + throw damagedPDF(
  1333 + "xref stream", "/Prev key in xref stream dictionary is not an integer");
  1334 + }
  1335 + QTC::TC("qpdf", "QPDF prev key in xref stream dictionary");
  1336 + return dict.getKey("/Prev").getIntValue();
  1337 + } else {
  1338 + return 0;
  1339 + }
  1340 +}
  1341 +
  1342 +void
  1343 +QPDF::insertXrefEntry(int obj, int f0, qpdf_offset_t f1, int f2)
  1344 +{
  1345 + // Populate the xref table in such a way that the first reference to an object that we see,
  1346 + // which is the one in the latest xref table in which it appears, is the one that gets stored.
  1347 + // This works because we are reading more recent appends before older ones.
  1348 +
  1349 + // If there is already an entry for this object and generation in the table, it means that a
  1350 + // later xref table has registered this object. Disregard this one.
  1351 + int new_gen = f0 == 2 ? 0 : f2;
  1352 +
  1353 + if (!(obj > 0 && obj <= m->xref_table_max_id && 0 <= f2 && new_gen < 65535)) {
  1354 + // We are ignoring invalid objgens. Most will arrive here from xref reconstruction. There
  1355 + // is probably no point having another warning but we could count invalid items in order to
  1356 + // decide when to give up.
  1357 + QTC::TC("qpdf", "QPDF xref overwrite invalid objgen");
  1358 + // ignore impossibly large object ids or object ids > Size.
  1359 + return;
  1360 + }
  1361 +
  1362 + if (m->deleted_objects.count(obj)) {
  1363 + QTC::TC("qpdf", "QPDF xref deleted object");
  1364 + return;
  1365 + }
  1366 +
  1367 + if (f0 == 2 && static_cast<int>(f1) == obj) {
  1368 + warn(damagedPDF("xref stream", "self-referential object stream " + std::to_string(obj)));
  1369 + return;
  1370 + }
  1371 +
  1372 + auto [iter, created] = m->xref_table.try_emplace(QPDFObjGen(obj, (f0 == 2 ? 0 : f2)));
  1373 + if (!created) {
  1374 + QTC::TC("qpdf", "QPDF xref reused object");
  1375 + return;
  1376 + }
  1377 +
  1378 + switch (f0) {
  1379 + case 1:
  1380 + // f2 is generation
  1381 + QTC::TC("qpdf", "QPDF xref gen > 0", ((f2 > 0) ? 1 : 0));
  1382 + iter->second = QPDFXRefEntry(f1);
  1383 + break;
  1384 +
  1385 + case 2:
  1386 + iter->second = QPDFXRefEntry(toI(f1), f2);
  1387 + break;
  1388 +
  1389 + default:
  1390 + throw damagedPDF("xref stream", "unknown xref stream entry type " + std::to_string(f0));
  1391 + break;
  1392 + }
  1393 +}
  1394 +
  1395 +void
  1396 +QPDF::insertFreeXrefEntry(QPDFObjGen og)
  1397 +{
  1398 + if (!m->xref_table.count(og)) {
  1399 + m->deleted_objects.insert(og.getObj());
  1400 + }
  1401 +}
  1402 +
  1403 +void
  1404 +QPDF::showXRefTable()
  1405 +{
  1406 + auto& cout = *m->log->getInfo();
  1407 + for (auto const& iter: m->xref_table) {
  1408 + QPDFObjGen const& og = iter.first;
  1409 + QPDFXRefEntry const& entry = iter.second;
  1410 + cout << og.unparse('/') << ": ";
  1411 + switch (entry.getType()) {
  1412 + case 1:
  1413 + cout << "uncompressed; offset = " << entry.getOffset();
  1414 + break;
  1415 +
  1416 + case 2:
  1417 + *m->log->getInfo() << "compressed; stream = " << entry.getObjStreamNumber()
  1418 + << ", index = " << entry.getObjStreamIndex();
  1419 + break;
  1420 +
  1421 + default:
  1422 + throw std::logic_error("unknown cross-reference table type while showing xref_table");
  1423 + break;
  1424 + }
  1425 + m->log->info("\n");
  1426 + }
  1427 +}
  1428 +
  1429 +// Resolve all objects in the xref table. If this triggers a xref table reconstruction abort and
  1430 +// return false. Otherwise return true.
  1431 +bool
  1432 +QPDF::resolveXRefTable()
  1433 +{
  1434 + bool may_change = !m->reconstructed_xref;
  1435 + for (auto& iter: m->xref_table) {
  1436 + if (isUnresolved(iter.first)) {
  1437 + resolve(iter.first);
  1438 + if (may_change && m->reconstructed_xref) {
  1439 + return false;
  1440 + }
  1441 + }
  1442 + }
  1443 + return true;
  1444 +}
  1445 +
  1446 +// Ensure all objects in the pdf file, including those in indirect references, appear in the object
  1447 +// cache.
  1448 +void
  1449 +QPDF::fixDanglingReferences(bool force)
  1450 +{
  1451 + if (m->fixed_dangling_refs) {
  1452 + return;
  1453 + }
  1454 + if (!resolveXRefTable()) {
  1455 + QTC::TC("qpdf", "QPDF fix dangling triggered xref reconstruction");
  1456 + resolveXRefTable();
  1457 + }
  1458 + m->fixed_dangling_refs = true;
  1459 +}
  1460 +
  1461 +size_t
  1462 +QPDF::getObjectCount()
  1463 +{
  1464 + // This method returns the next available indirect object number. makeIndirectObject uses it for
  1465 + // this purpose. After fixDanglingReferences is called, all objects in the xref table will also
  1466 + // be in obj_cache.
  1467 + fixDanglingReferences();
  1468 + QPDFObjGen og;
  1469 + if (!m->obj_cache.empty()) {
  1470 + og = (*(m->obj_cache.rbegin())).first;
  1471 + }
  1472 + return toS(og.getObj());
  1473 +}
  1474 +
  1475 +std::vector<QPDFObjectHandle>
  1476 +QPDF::getAllObjects()
  1477 +{
  1478 + // After fixDanglingReferences is called, all objects are in the object cache.
  1479 + fixDanglingReferences();
  1480 + std::vector<QPDFObjectHandle> result;
  1481 + for (auto const& iter: m->obj_cache) {
  1482 + result.push_back(newIndirect(iter.first, iter.second.object));
  1483 + }
  1484 + return result;
  1485 +}
  1486 +
  1487 +void
  1488 +QPDF::setLastObjectDescription(std::string const& description, QPDFObjGen og)
  1489 +{
  1490 + m->last_object_description.clear();
  1491 + if (!description.empty()) {
  1492 + m->last_object_description += description;
  1493 + if (og.isIndirect()) {
  1494 + m->last_object_description += ": ";
  1495 + }
  1496 + }
  1497 + if (og.isIndirect()) {
  1498 + m->last_object_description += "object " + og.unparse(' ');
  1499 + }
  1500 +}
  1501 +
  1502 +QPDFObjectHandle
  1503 +QPDF::readTrailer()
  1504 +{
  1505 + qpdf_offset_t offset = m->file->tell();
  1506 + bool empty = false;
  1507 + auto object =
  1508 + QPDFParser(*m->file, "trailer", m->tokenizer, nullptr, this, true).parse(empty, false);
  1509 + if (empty) {
  1510 + // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in
  1511 + // actual PDF files and Adobe Reader appears to ignore them.
  1512 + warn(damagedPDF("trailer", "empty object treated as null"));
  1513 + } else if (object.isDictionary() && readToken(*m->file).isWord("stream")) {
  1514 + warn(damagedPDF("trailer", m->file->tell(), "stream keyword found in trailer"));
  1515 + }
  1516 + // Override last_offset so that it points to the beginning of the object we just read
  1517 + m->file->setLastOffset(offset);
  1518 + return object;
  1519 +}
  1520 +
  1521 +QPDFObjectHandle
  1522 +QPDF::readObject(std::string const& description, QPDFObjGen og)
  1523 +{
  1524 + setLastObjectDescription(description, og);
  1525 + qpdf_offset_t offset = m->file->tell();
  1526 + bool empty = false;
  1527 +
  1528 + StringDecrypter decrypter{this, og};
  1529 + StringDecrypter* decrypter_ptr = m->encp->encrypted ? &decrypter : nullptr;
  1530 + auto object =
  1531 + QPDFParser(*m->file, m->last_object_description, m->tokenizer, decrypter_ptr, this, true)
  1532 + .parse(empty, false);
  1533 + if (empty) {
  1534 + // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in
  1535 + // actual PDF files and Adobe Reader appears to ignore them.
  1536 + warn(damagedPDF(*m->file, m->file->getLastOffset(), "empty object treated as null"));
  1537 + return object;
  1538 + }
  1539 + auto token = readToken(*m->file);
  1540 + if (object.isDictionary() && token.isWord("stream")) {
  1541 + readStream(object, og, offset);
  1542 + token = readToken(*m->file);
  1543 + }
  1544 + if (!token.isWord("endobj")) {
  1545 + QTC::TC("qpdf", "QPDF err expected endobj");
  1546 + warn(damagedPDF("expected endobj"));
  1547 + }
  1548 + return object;
  1549 +}
  1550 +
  1551 +// After reading stream dictionary and stream keyword, read rest of stream.
416 void 1552 void
417 -QPDF::warn(QPDFExc const& e) 1553 +QPDF::readStream(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset)
418 { 1554 {
419 - if (m->max_warnings > 0 && m->warnings.size() >= m->max_warnings) {  
420 - stopOnError("Too many warnings - file is too badly damaged");  
421 - }  
422 - m->warnings.push_back(e);  
423 - if (!m->suppress_warnings) {  
424 - *m->log->getWarn() << "WARNING: " << m->warnings.back().what() << "\n"; 1555 + validateStreamLineEnd(object, og, offset);
  1556 +
  1557 + // Must get offset before accessing any additional objects since resolving a previously
  1558 + // unresolved indirect object will change file position.
  1559 + qpdf_offset_t stream_offset = m->file->tell();
  1560 + size_t length = 0;
  1561 +
  1562 + try {
  1563 + auto length_obj = object.getKey("/Length");
  1564 +
  1565 + if (!length_obj.isInteger()) {
  1566 + if (length_obj.isNull()) {
  1567 + QTC::TC("qpdf", "QPDF stream without length");
  1568 + throw damagedPDF(offset, "stream dictionary lacks /Length key");
  1569 + }
  1570 + QTC::TC("qpdf", "QPDF stream length not integer");
  1571 + throw damagedPDF(offset, "/Length key in stream dictionary is not an integer");
  1572 + }
  1573 +
  1574 + length = toS(length_obj.getUIntValue());
  1575 + // Seek in two steps to avoid potential integer overflow
  1576 + m->file->seek(stream_offset, SEEK_SET);
  1577 + m->file->seek(toO(length), SEEK_CUR);
  1578 + if (!readToken(*m->file).isWord("endstream")) {
  1579 + QTC::TC("qpdf", "QPDF missing endstream");
  1580 + throw damagedPDF("expected endstream");
  1581 + }
  1582 + } catch (QPDFExc& e) {
  1583 + if (m->attempt_recovery) {
  1584 + warn(e);
  1585 + length = recoverStreamLength(m->file, og, stream_offset);
  1586 + } else {
  1587 + throw;
  1588 + }
425 } 1589 }
  1590 + object = QPDFObjectHandle(qpdf::Stream(*this, og, object, stream_offset, length));
426 } 1591 }
427 1592
428 void 1593 void
429 -QPDF::warn(  
430 - qpdf_error_code_e error_code,  
431 - std::string const& object, 1594 +QPDF::validateStreamLineEnd(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset)
  1595 +{
  1596 + // The PDF specification states that the word "stream" should be followed by either a carriage
  1597 + // return and a newline or by a newline alone. It specifically disallowed following it by a
  1598 + // carriage return alone since, in that case, there would be no way to tell whether the NL in a
  1599 + // CR NL sequence was part of the stream data. However, some readers, including Adobe reader,
  1600 + // accept a carriage return by itself when followed by a non-newline character, so that's what
  1601 + // we do here. We have also seen files that have extraneous whitespace between the stream
  1602 + // keyword and the newline.
  1603 + while (true) {
  1604 + char ch;
  1605 + if (m->file->read(&ch, 1) == 0) {
  1606 + // A premature EOF here will result in some other problem that will get reported at
  1607 + // another time.
  1608 + return;
  1609 + }
  1610 + if (ch == '\n') {
  1611 + // ready to read stream data
  1612 + QTC::TC("qpdf", "QPDF stream with NL only");
  1613 + return;
  1614 + }
  1615 + if (ch == '\r') {
  1616 + // Read another character
  1617 + if (m->file->read(&ch, 1) != 0) {
  1618 + if (ch == '\n') {
  1619 + // Ready to read stream data
  1620 + QTC::TC("qpdf", "QPDF stream with CRNL");
  1621 + } else {
  1622 + // Treat the \r by itself as the whitespace after endstream and start reading
  1623 + // stream data in spite of not having seen a newline.
  1624 + QTC::TC("qpdf", "QPDF stream with CR only");
  1625 + m->file->unreadCh(ch);
  1626 + warn(damagedPDF(
  1627 + m->file->tell(), "stream keyword followed by carriage return only"));
  1628 + }
  1629 + }
  1630 + return;
  1631 + }
  1632 + if (!util::is_space(ch)) {
  1633 + QTC::TC("qpdf", "QPDF stream without newline");
  1634 + m->file->unreadCh(ch);
  1635 + warn(damagedPDF(
  1636 + m->file->tell(), "stream keyword not followed by proper line terminator"));
  1637 + return;
  1638 + }
  1639 + warn(damagedPDF(m->file->tell(), "stream keyword followed by extraneous whitespace"));
  1640 + }
  1641 +}
  1642 +
  1643 +QPDFObjectHandle
  1644 +QPDF::readObjectInStream(std::shared_ptr<InputSource>& input, int obj)
  1645 +{
  1646 + m->last_object_description.erase(7); // last_object_description starts with "object "
  1647 + m->last_object_description += std::to_string(obj);
  1648 + m->last_object_description += " 0";
  1649 +
  1650 + bool empty = false;
  1651 + auto object = QPDFParser(*input, m->last_object_description, m->tokenizer, nullptr, this, true)
  1652 + .parse(empty, false);
  1653 + if (empty) {
  1654 + // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in
  1655 + // actual PDF files and Adobe Reader appears to ignore them.
  1656 + warn(damagedPDF(*input, input->getLastOffset(), "empty object treated as null"));
  1657 + }
  1658 + return object;
  1659 +}
  1660 +
  1661 +bool
  1662 +QPDF::findEndstream()
  1663 +{
  1664 + // Find endstream or endobj. Position the input at that token.
  1665 + auto t = readToken(*m->file, 20);
  1666 + if (t.isWord("endobj") || t.isWord("endstream")) {
  1667 + m->file->seek(m->file->getLastOffset(), SEEK_SET);
  1668 + return true;
  1669 + }
  1670 + return false;
  1671 +}
  1672 +
  1673 +size_t
  1674 +QPDF::recoverStreamLength(
  1675 + std::shared_ptr<InputSource> input, QPDFObjGen og, qpdf_offset_t stream_offset)
  1676 +{
  1677 + // Try to reconstruct stream length by looking for endstream or endobj
  1678 + warn(damagedPDF(*input, stream_offset, "attempting to recover stream length"));
  1679 +
  1680 + PatternFinder ef(*this, &QPDF::findEndstream);
  1681 + size_t length = 0;
  1682 + if (m->file->findFirst("end", stream_offset, 0, ef)) {
  1683 + length = toS(m->file->tell() - stream_offset);
  1684 + // Reread endstream but, if it was endobj, don't skip that.
  1685 + QPDFTokenizer::Token t = readToken(*m->file);
  1686 + if (t.getValue() == "endobj") {
  1687 + m->file->seek(m->file->getLastOffset(), SEEK_SET);
  1688 + }
  1689 + }
  1690 +
  1691 + if (length) {
  1692 + auto end = stream_offset + toO(length);
  1693 + qpdf_offset_t found_offset = 0;
  1694 + QPDFObjGen found_og;
  1695 +
  1696 + // Make sure this is inside this object
  1697 + for (auto const& [current_og, entry]: m->xref_table) {
  1698 + if (entry.getType() == 1) {
  1699 + qpdf_offset_t obj_offset = entry.getOffset();
  1700 + if (found_offset < obj_offset && obj_offset < end) {
  1701 + found_offset = obj_offset;
  1702 + found_og = current_og;
  1703 + }
  1704 + }
  1705 + }
  1706 + if (!found_offset || found_og == og) {
  1707 + // If we are trying to recover an XRef stream the xref table will not contain and
  1708 + // won't contain any entries, therefore we cannot check the found length. Otherwise we
  1709 + // found endstream\nendobj within the space allowed for this object, so we're probably
  1710 + // in good shape.
  1711 + } else {
  1712 + QTC::TC("qpdf", "QPDF found wrong endstream in recovery");
  1713 + length = 0;
  1714 + }
  1715 + }
  1716 +
  1717 + if (length == 0) {
  1718 + warn(damagedPDF(
  1719 + *input, stream_offset, "unable to recover stream data; treating stream as empty"));
  1720 + } else {
  1721 + warn(damagedPDF(
  1722 + *input, stream_offset, "recovered stream length: " + std::to_string(length)));
  1723 + }
  1724 +
  1725 + QTC::TC("qpdf", "QPDF recovered stream length");
  1726 + return length;
  1727 +}
  1728 +
  1729 +QPDFTokenizer::Token
  1730 +QPDF::readToken(InputSource& input, size_t max_len)
  1731 +{
  1732 + return m->tokenizer.readToken(input, m->last_object_description, true, max_len);
  1733 +}
  1734 +
  // Read the indirect object that starts at the given file offset and return it.
  //
  // try_recovery: when true (and recovery is enabled and exp_og's object number is non-zero), a
  //   failure to read the "n n obj" header, or a mismatch with exp_og, triggers xref
  //   reconstruction followed by one retry at the reconstructed offset.
  // offset: file position of the object; offset 0 yields a warning and a null object.
  // description: passed to setLastObjectDescription and readObject.
  // exp_og: expected object id/generation; an object number of 0 disables both the check and
  //   recovery.
  // og: output; receives the id/generation actually read from the file.
  // skip_cache_if_in_xref: when true and og already has an xref entry, the object read here is not
  //   stored in the object cache.
  //
  // Returns a null object if the object cannot be found after xref reconstruction. Without
  // recovery, the damaged-PDF exception propagates; a damaged-PDF exception is also thrown if EOF
  // is reached while skipping whitespace after the object.
  1735 +QPDFObjectHandle
  1736 +QPDF::readObjectAtOffset(
  1737 + bool try_recovery,
432 qpdf_offset_t offset, 1738 qpdf_offset_t offset,
433 - std::string const& message) 1739 + std::string const& description,
  1740 + QPDFObjGen exp_og,
  1741 + QPDFObjGen& og,
  1742 + bool skip_cache_if_in_xref)
  1743 +{
  1744 + bool check_og = true;
  1745 + if (exp_og.getObj() == 0) {
  1746 + // This method uses an expect object ID of 0 to indicate that we don't know or don't care
  1747 + // what the actual object ID is at this offset. This is true when we read the xref stream
  1748 + // and linearization hint streams. In this case, we don't verify the expect object
  1749 + // ID/generation against what was read from the file. There is also no reason to attempt
  1750 + // xref recovery if we get a failure in this case since the read attempt was not triggered
  1751 + // by an xref lookup.
  1752 + check_og = false;
  1753 + try_recovery = false;
  1754 + }
  1755 + setLastObjectDescription(description, exp_og);
  1756 +
  1757 + if (!m->attempt_recovery) {
  1758 + try_recovery = false;
  1759 + }
  1760 +
  1761 + // Special case: if offset is 0, just return null. Some PDF writers, in particular
  1762 + // "Mac OS X 10.7.5 Quartz PDFContext", may store deleted objects in the xref table as
  1763 + // "0000000000 00000 n", which is not correct, but it won't hurt anything for us to ignore
  1764 + // these.
  1765 + if (offset == 0) {
  1766 + QTC::TC("qpdf", "QPDF bogus 0 offset", 0);
  1767 + warn(damagedPDF(0, "object has offset 0"));
  1768 + return QPDFObjectHandle::newNull();
  1769 + }
  1770 +
  1771 + m->file->seek(offset, SEEK_SET);
  // Read and validate the "objid generation obj" header. Any QPDFExc thrown inside this try block
  // is either turned into an xref-reconstruction retry (try_recovery) or rethrown.
  1772 + try {
  1773 + QPDFTokenizer::Token tobjid = readToken(*m->file);
  1774 + bool objidok = tobjid.isInteger();
  1775 + QTC::TC("qpdf", "QPDF check objid", objidok ? 1 : 0);
  1776 + if (!objidok) {
  1777 + QTC::TC("qpdf", "QPDF expected n n obj");
  1778 + throw damagedPDF(offset, "expected n n obj");
  1779 + }
  1780 + QPDFTokenizer::Token tgen = readToken(*m->file);
  1781 + bool genok = tgen.isInteger();
  1782 + QTC::TC("qpdf", "QPDF check generation", genok ? 1 : 0);
  1783 + if (!genok) {
  1784 + throw damagedPDF(offset, "expected n n obj");
  1785 + }
  1786 + QPDFTokenizer::Token tobj = readToken(*m->file);
  1787 +
  1788 + bool objok = tobj.isWord("obj");
  1789 + QTC::TC("qpdf", "QPDF check obj", objok ? 1 : 0);
  1790 +
  1791 + if (!objok) {
  1792 + throw damagedPDF(offset, "expected n n obj");
  1793 + }
  1794 + int objid = QUtil::string_to_int(tobjid.getValue().c_str());
  1795 + int generation = QUtil::string_to_int(tgen.getValue().c_str());
  1796 + og = QPDFObjGen(objid, generation);
  1797 + if (objid == 0) {
  1798 + QTC::TC("qpdf", "QPDF object id 0");
  1799 + throw damagedPDF(offset, "object with ID 0");
  1800 + }
  1801 + if (check_og && (exp_og != og)) {
  1802 + QTC::TC("qpdf", "QPDF err wrong objid/generation");
  1803 + QPDFExc e = damagedPDF(offset, "expected " + exp_og.unparse(' ') + " obj");
  1804 + if (try_recovery) {
  1805 + // Will be retried below
  1806 + throw e;
  1807 + } else {
  1808 + // We can try reading the object anyway even if the ID doesn't match.
  1809 + warn(e);
  1810 + }
  1811 + }
  1812 + } catch (QPDFExc& e) {
  1813 + if (try_recovery) {
  1814 + // Try again after reconstructing xref table
  1815 + reconstruct_xref(e);
  // The retry passes try_recovery = false, so a second failure is not recovered again here.
  1816 + if (m->xref_table.count(exp_og) && (m->xref_table[exp_og].getType() == 1)) {
  1817 + qpdf_offset_t new_offset = m->xref_table[exp_og].getOffset();
  1818 + QPDFObjectHandle result =
  1819 + readObjectAtOffset(false, new_offset, description, exp_og, og, false);
  1820 + QTC::TC("qpdf", "QPDF recovered in readObjectAtOffset");
  1821 + return result;
  1822 + } else {
  1823 + QTC::TC("qpdf", "QPDF object gone after xref reconstruction");
  1824 + warn(damagedPDF(
  1825 + "",
  1826 + 0,
  1827 + ("object " + exp_og.unparse(' ') +
  1828 + " not found in file after regenerating cross reference "
  1829 + "table")));
  1830 + return QPDFObjectHandle::newNull();
  1831 + }
  1832 + } else {
  1833 + throw;
  1834 + }
  1835 + }
  1836 +
  1837 + QPDFObjectHandle oh = readObject(description, og);
  1838 +
  1839 + if (isUnresolved(og)) {
  1840 + // Store the object in the cache here so it gets cached whether we first know the offset or
  1841 + // whether we first know the object ID and generation (in which we case we would get here
  1842 + // through resolve).
  1843 +
  1844 + // Determine the end offset of this object before and after white space. We use these
  1845 + // numbers to validate linearization hint tables. Offsets and lengths of objects may imply
  1846 + // the end of an object to be anywhere between these values.
  1847 + qpdf_offset_t end_before_space = m->file->tell();
  1848 +
  1849 + // skip over spaces
  1850 + while (true) {
  1851 + char ch;
  1852 + if (m->file->read(&ch, 1)) {
  1853 + if (!isspace(static_cast<unsigned char>(ch))) {
  1854 + m->file->seek(-1, SEEK_CUR);
  1855 + break;
  1856 + }
  1857 + } else {
  1858 + throw damagedPDF(m->file->tell(), "EOF after endobj");
  1859 + }
  1860 + }
  1861 + qpdf_offset_t end_after_space = m->file->tell();
  1862 + if (skip_cache_if_in_xref && m->xref_table.count(og)) {
  1863 + // Ordinarily, an object gets read here when resolved through xref table or stream. In
  1864 + // the special case of the xref stream and linearization hint tables, the offset comes
  1865 + // from another source. For the specific case of xref streams, the xref stream is read
  1866 + // and loaded into the object cache very early in parsing. Ordinarily, when a file is
  1867 + // updated by appending, items inserted into the xref table in later updates take
  1868 + // precedence over earlier items. In the special case of reusing the object number
  1869 + // previously used as the xref stream, we have the following order of events:
  1870 + //
  1871 + // * reused object gets loaded into the xref table
  1872 + // * old object is read here while reading xref streams
  1873 + // * original xref entry is ignored (since already in xref table)
  1874 + //
  1875 + // It is the second step that causes a problem. Even though the xref table is correct in
  1876 + // this case, the old object is already in the cache and so effectively prevails over
  1877 + // the reused object. To work around this issue, we have a special case for the xref
  1878 + // stream (via the skip_cache_if_in_xref): if the object is already in the xref stream,
  1879 + // don't cache what we read here.
  1880 + //
  1881 + // It is likely that the same bug may exist for linearization hint tables, but the
  1882 + // existing code uses end_before_space and end_after_space from the cache, so fixing
  1883 + // that would require more significant rework. The chances of a linearization hint
  1884 + // stream being reused seems smaller because the xref stream is probably the highest
  1885 + // object in the file and the linearization hint stream would be some random place in
  1886 + // the middle, so I'm leaving that bug unfixed for now. If the bug were to be fixed, we
  1887 + // could use !check_og in place of skip_cache_if_in_xref.
  1888 + QTC::TC("qpdf", "QPDF skipping cache for known unchecked object");
  1889 + } else {
  1890 + updateCache(og, oh.getObj(), end_before_space, end_after_space);
  1891 + }
  1892 + }
  1893 +
  1894 + return oh;
  1895 +}
  1896 +
  1897 +std::shared_ptr<QPDFObject> const&
  1898 +QPDF::resolve(QPDFObjGen og)
434 { 1899 {
435 - warn(QPDFExc(error_code, getFilename(), object, offset, message)); 1900 + if (!isUnresolved(og)) {
  1901 + return m->obj_cache[og].object;
  1902 + }
  1903 +
  1904 + if (m->resolving.count(og)) {
  1905 + // This can happen if an object references itself directly or indirectly in some key that
  1906 + // has to be resolved during object parsing, such as stream length.
  1907 + QTC::TC("qpdf", "QPDF recursion loop in resolve");
  1908 + warn(damagedPDF("", "loop detected resolving object " + og.unparse(' ')));
  1909 + updateCache(og, QPDFObject::create<QPDF_Null>(), -1, -1);
  1910 + return m->obj_cache[og].object;
  1911 + }
  1912 + ResolveRecorder rr(this, og);
  1913 +
  1914 + if (m->xref_table.count(og) != 0) {
  1915 + QPDFXRefEntry const& entry = m->xref_table[og];
  1916 + try {
  1917 + switch (entry.getType()) {
  1918 + case 1:
  1919 + {
  1920 + qpdf_offset_t offset = entry.getOffset();
  1921 + // Object stored in cache by readObjectAtOffset
  1922 + QPDFObjGen a_og;
  1923 + QPDFObjectHandle oh = readObjectAtOffset(true, offset, "", og, a_og, false);
  1924 + }
  1925 + break;
  1926 +
  1927 + case 2:
  1928 + resolveObjectsInStream(entry.getObjStreamNumber());
  1929 + break;
  1930 +
  1931 + default:
  1932 + throw damagedPDF(
  1933 + "", 0, ("object " + og.unparse('/') + " has unexpected xref entry type"));
  1934 + }
  1935 + } catch (QPDFExc& e) {
  1936 + warn(e);
  1937 + } catch (std::exception& e) {
  1938 + warn(damagedPDF(
  1939 + "", 0, ("object " + og.unparse('/') + ": error reading object: " + e.what())));
  1940 + }
  1941 + }
  1942 +
  1943 + if (isUnresolved(og)) {
  1944 + // PDF spec says unknown objects resolve to the null object.
  1945 + QTC::TC("qpdf", "QPDF resolve failure to null");
  1946 + updateCache(og, QPDFObject::create<QPDF_Null>(), -1, -1);
  1947 + }
  1948 +
  1949 + auto& result(m->obj_cache[og].object);
  1950 + result->setDefaultDescription(this, og);
  1951 + return result;
  1952 +}
  1953 +
  // Read object stream obj_stream_number and store in the object cache every object it contains
  // whose xref entry is a type 2 entry pointing at this stream. Each stream is processed at most
  // once (tracked in m->resolved_object_streams).
  //
  // Throws a damaged-PDF exception if the object is not a stream, if /N or /First is not an
  // integer, or if the stream header does not consist of integer pairs. Header entries that name
  // the stream itself, have an object id below 1, or have a non-increasing offset are skipped with
  // a warning; entries with an object id above m->xref_table_max_id are skipped silently.
  1954 +void
  1955 +QPDF::resolveObjectsInStream(int obj_stream_number)
  1956 +{
  1957 + if (m->resolved_object_streams.count(obj_stream_number)) {
  1958 + return;
  1959 + }
  1960 + m->resolved_object_streams.insert(obj_stream_number);
  1961 + // Force resolution of object stream
  1962 + QPDFObjectHandle obj_stream = getObjectByID(obj_stream_number, 0);
  1963 + if (!obj_stream.isStream()) {
  1964 + throw damagedPDF(
  1965 + "supposed object stream " + std::to_string(obj_stream_number) + " is not a stream");
  1966 + }
  1967 +
  1968 + // For linearization data in the object, use the data from the object stream for the objects in
  1969 + // the stream.
  1970 + QPDFObjGen stream_og(obj_stream_number, 0);
  1971 + qpdf_offset_t end_before_space = m->obj_cache[stream_og].end_before_space;
  1972 + qpdf_offset_t end_after_space = m->obj_cache[stream_og].end_after_space;
  1973 +
  1974 + QPDFObjectHandle dict = obj_stream.getDict();
  // A wrong /Type only produces a warning; processing continues.
  1975 + if (!dict.isDictionaryOfType("/ObjStm")) {
  1976 + QTC::TC("qpdf", "QPDF ERR object stream with wrong type");
  1977 + warn(damagedPDF(
  1978 + "supposed object stream " + std::to_string(obj_stream_number) + " has wrong type"));
  1979 + }
  1980 +
  1981 + if (!(dict.getKey("/N").isInteger() && dict.getKey("/First").isInteger())) {
  1982 + throw damagedPDF(
  1983 + ("object stream " + std::to_string(obj_stream_number) + " has incorrect keys"));
  1984 + }
  1985 +
  1986 + int n = dict.getKey("/N").getIntValueAsInt();
  1987 + int first = dict.getKey("/First").getIntValueAsInt();
  1988 +
  // Maps object number -> offset within the decoded stream data (header offset plus /First).
  1989 + std::map<int, int> offsets;
  1990 +
  1991 + std::shared_ptr<Buffer> bp = obj_stream.getStreamData(qpdf_dl_specialized);
  1992 + auto input = std::shared_ptr<InputSource>(
  1993 + // line-break
  1994 + new BufferInputSource(
  1995 + (m->file->getName() + " object stream " + std::to_string(obj_stream_number)),
  1996 + bp.get()));
  1997 +
  // First pass: read the n (object number, offset) pairs of the stream header.
  1998 + long long last_offset = -1;
  1999 + for (int i = 0; i < n; ++i) {
  2000 + QPDFTokenizer::Token tnum = readToken(*input);
  2001 + QPDFTokenizer::Token toffset = readToken(*input);
  2002 + if (!(tnum.isInteger() && toffset.isInteger())) {
  2003 + throw damagedPDF(
  2004 + *input,
  2005 + m->last_object_description,
  2006 + input->getLastOffset(),
  2007 + "expected integer in object stream header");
  2008 + }
  2009 +
  2010 + int num = QUtil::string_to_int(tnum.getValue().c_str());
  2011 + long long offset = QUtil::string_to_int(toffset.getValue().c_str());
  2012 +
  2013 + if (num == obj_stream_number) {
  2014 + QTC::TC("qpdf", "QPDF ignore self-referential object stream");
  2015 + warn(damagedPDF(
  2016 + *input,
  2017 + m->last_object_description,
  2018 + input->getLastOffset(),
  2019 + "object stream claims to contain itself"));
  2020 + continue;
  2021 + }
  2022 +
  2023 + if (num < 1) {
  2024 + QTC::TC("qpdf", "QPDF object stream contains id < 1");
  2025 + warn(damagedPDF(
  2026 + *input,
  2027 + "object "s + std::to_string(num) + " 0, offset " + std::to_string(offset),
  2028 + 0,
  2029 + "object id is invalid"s));
  2030 + continue;
  2031 + }
  2032 +
  2033 + if (offset <= last_offset) {
  2034 + QTC::TC("qpdf", "QPDF object stream offsets not increasing");
  2035 + warn(damagedPDF(
  2036 + *input,
  2037 + "object "s + std::to_string(num) + " 0, offset " + std::to_string(offset),
  2038 + 0,
  2039 + "offset is invalid (must be larger than previous offset " +
  2040 + std::to_string(last_offset) + ")"));
  2041 + continue;
  2042 + }
  2043 + last_offset = offset;
  2044 +
  2045 + if (num > m->xref_table_max_id) {
  2046 + continue;
  2047 + }
  2048 +
  2049 + offsets[num] = toI(offset + first);
  2050 + }
  2051 +
  2052 + // To avoid having to read the object stream multiple times, store all objects that would be
  2053 + // found here in the cache. Remember that some objects stored here might have been overridden
  2054 + // by new objects appended to the file, so it is necessary to recheck the xref table and only
  2055 + // cache what would actually be resolved here.
  // readObjectInStream expects last_object_description to start with "object ".
  2056 + m->last_object_description.clear();
  2057 + m->last_object_description += "object ";
  2058 + for (auto const& iter: offsets) {
  2059 + QPDFObjGen og(iter.first, 0);
  2060 + auto entry = m->xref_table.find(og);
  2061 + if (entry != m->xref_table.end() && entry->second.getType() == 2 &&
  2062 + entry->second.getObjStreamNumber() == obj_stream_number) {
  2063 + int offset = iter.second;
  2064 + input->seek(offset, SEEK_SET);
  2065 + QPDFObjectHandle oh = readObjectInStream(input, iter.first);
  2066 + updateCache(og, oh.getObj(), end_before_space, end_after_space);
  2067 + } else {
  2068 + QTC::TC("qpdf", "QPDF not caching overridden objstm object");
  2069 + }
  2070 + }
  2071 +}
  2072 +
  2073 +QPDFObjectHandle
  2074 +QPDF::newIndirect(QPDFObjGen og, std::shared_ptr<QPDFObject> const& obj)
  2075 +{
  2076 + obj->setDefaultDescription(this, og);
  2077 + return {obj};
  2078 +}
  2079 +
  2080 +void
  2081 +QPDF::updateCache(
  2082 + QPDFObjGen og,
  2083 + std::shared_ptr<QPDFObject> const& object,
  2084 + qpdf_offset_t end_before_space,
  2085 + qpdf_offset_t end_after_space,
  2086 + bool destroy)
  2087 +{
  2088 + object->setObjGen(this, og);
  2089 + if (isCached(og)) {
  2090 + auto& cache = m->obj_cache[og];
  2091 + object->move_to(cache.object, destroy);
  2092 + cache.end_before_space = end_before_space;
  2093 + cache.end_after_space = end_after_space;
  2094 + } else {
  2095 + m->obj_cache[og] = ObjCache(object, end_before_space, end_after_space);
  2096 + }
  2097 +}
  2098 +
  2099 +bool
  2100 +QPDF::isCached(QPDFObjGen og)
  2101 +{
  2102 + return m->obj_cache.count(og) != 0;
  2103 +}
  2104 +
  2105 +bool
  2106 +QPDF::isUnresolved(QPDFObjGen og)
  2107 +{
  2108 + return !isCached(og) || m->obj_cache[og].object->isUnresolved();
  2109 +}
  2110 +
  2111 +QPDFObjGen
  2112 +QPDF::nextObjGen()
  2113 +{
  2114 + int max_objid = toI(getObjectCount());
  2115 + if (max_objid == std::numeric_limits<int>::max()) {
  2116 + throw std::range_error("max object id is too high to create new objects");
  2117 + }
  2118 + return QPDFObjGen(max_objid + 1, 0);
  2119 +}
  2120 +
  2121 +QPDFObjectHandle
  2122 +QPDF::makeIndirectFromQPDFObject(std::shared_ptr<QPDFObject> const& obj)
  2123 +{
  2124 + QPDFObjGen next{nextObjGen()};
  2125 + m->obj_cache[next] = ObjCache(obj, -1, -1);
  2126 + return newIndirect(next, m->obj_cache[next].object);
  2127 +}
  2128 +
  2129 +QPDFObjectHandle
  2130 +QPDF::makeIndirectObject(QPDFObjectHandle oh)
  2131 +{
  2132 + if (!oh) {
  2133 + throw std::logic_error("attempted to make an uninitialized QPDFObjectHandle indirect");
  2134 + }
  2135 + return makeIndirectFromQPDFObject(oh.getObj());
436 } 2136 }
437 2137
438 QPDFObjectHandle 2138 QPDFObjectHandle
@@ -470,6 +2170,52 @@ QPDF::newStream(std::string const&amp; data) @@ -470,6 +2170,52 @@ QPDF::newStream(std::string const&amp; data)
470 return result; 2170 return result;
471 } 2171 }
472 2172
  2173 +std::shared_ptr<QPDFObject>
  2174 +QPDF::getObjectForParser(int id, int gen, bool parse_pdf)
  2175 +{
  2176 + // This method is called by the parser and therefore must not resolve any objects.
  2177 + auto og = QPDFObjGen(id, gen);
  2178 + if (auto iter = m->obj_cache.find(og); iter != m->obj_cache.end()) {
  2179 + return iter->second.object;
  2180 + }
  2181 + if (m->xref_table.count(og) || !m->parsed) {
  2182 + return m->obj_cache.insert({og, QPDFObject::create<QPDF_Unresolved>(this, og)})
  2183 + .first->second.object;
  2184 + }
  2185 + if (parse_pdf) {
  2186 + return QPDFObject::create<QPDF_Null>();
  2187 + }
  2188 + return m->obj_cache.insert({og, QPDFObject::create<QPDF_Null>(this, og)}).first->second.object;
  2189 +}
  2190 +
  2191 +std::shared_ptr<QPDFObject>
  2192 +QPDF::getObjectForJSON(int id, int gen)
  2193 +{
  2194 + auto og = QPDFObjGen(id, gen);
  2195 + auto [it, inserted] = m->obj_cache.try_emplace(og);
  2196 + auto& obj = it->second.object;
  2197 + if (inserted) {
  2198 + obj = (m->parsed && !m->xref_table.count(og))
  2199 + ? QPDFObject::create<QPDF_Null>(this, og)
  2200 + : QPDFObject::create<QPDF_Unresolved>(this, og);
  2201 + }
  2202 + return obj;
  2203 +}
  2204 +
  2205 +QPDFObjectHandle
  2206 +QPDF::getObject(QPDFObjGen og)
  2207 +{
  2208 + if (auto it = m->obj_cache.find(og); it != m->obj_cache.end()) {
  2209 + return {it->second.object};
  2210 + } else if (m->parsed && !m->xref_table.count(og)) {
  2211 + return QPDFObject::create<QPDF_Null>();
  2212 + } else {
  2213 + auto result =
  2214 + m->obj_cache.try_emplace(og, QPDFObject::create<QPDF_Unresolved>(this, og), -1, -1);
  2215 + return {result.first->second.object};
  2216 + }
  2217 +}
  2218 +
473 QPDFObjectHandle 2219 QPDFObjectHandle
474 QPDF::getObject(int objid, int generation) 2220 QPDF::getObject(int objid, int generation)
475 { 2221 {
@@ -488,6 +2234,45 @@ QPDF::getObjectByID(int objid, int generation) @@ -488,6 +2234,45 @@ QPDF::getObjectByID(int objid, int generation)
488 return getObject(QPDFObjGen(objid, generation)); 2234 return getObject(QPDFObjGen(objid, generation));
489 } 2235 }
490 2236
  2237 +void
  2238 +QPDF::replaceObject(int objid, int generation, QPDFObjectHandle oh)
  2239 +{
  2240 + replaceObject(QPDFObjGen(objid, generation), oh);
  2241 +}
  2242 +
  2243 +void
  2244 +QPDF::replaceObject(QPDFObjGen og, QPDFObjectHandle oh)
  2245 +{
  2246 + if (!oh || (oh.isIndirect() && !(oh.isStream() && oh.getObjGen() == og))) {
  2247 + QTC::TC("qpdf", "QPDF replaceObject called with indirect object");
  2248 + throw std::logic_error("QPDF::replaceObject called with indirect object handle");
  2249 + }
  2250 + updateCache(og, oh.getObj(), -1, -1, false);
  2251 +}
  2252 +
  2253 +void
  2254 +QPDF::removeObject(QPDFObjGen og)
  2255 +{
  2256 + m->xref_table.erase(og);
  2257 + if (auto cached = m->obj_cache.find(og); cached != m->obj_cache.end()) {
  2258 + // Take care of any object handles that may be floating around.
  2259 + cached->second.object->assign_null();
  2260 + cached->second.object->setObjGen(nullptr, QPDFObjGen());
  2261 + m->obj_cache.erase(cached);
  2262 + }
  2263 +}
  2264 +
  2265 +void
  2266 +QPDF::replaceReserved(QPDFObjectHandle reserved, QPDFObjectHandle replacement)
  2267 +{
  2268 + QTC::TC("qpdf", "QPDF replaceReserved");
  2269 + auto tc = reserved.getTypeCode();
  2270 + if (!(tc == ::ot_reserved || tc == ::ot_null)) {
  2271 + throw std::logic_error("replaceReserved called with non-reserved object");
  2272 + }
  2273 + replaceObject(reserved.getObjGen(), replacement);
  2274 +}
  2275 +
491 QPDFObjectHandle 2276 QPDFObjectHandle
492 QPDF::copyForeignObject(QPDFObjectHandle foreign) 2277 QPDF::copyForeignObject(QPDFObjectHandle foreign)
493 { 2278 {
@@ -747,6 +2532,21 @@ QPDF::copyStreamData(QPDFObjectHandle result, QPDFObjectHandle foreign) @@ -747,6 +2532,21 @@ QPDF::copyStreamData(QPDFObjectHandle result, QPDFObjectHandle foreign)
747 } 2532 }
748 } 2533 }
749 2534
  2535 +void
  2536 +QPDF::swapObjects(int objid1, int generation1, int objid2, int generation2)
  2537 +{
  2538 + swapObjects(QPDFObjGen(objid1, generation1), QPDFObjGen(objid2, generation2));
  2539 +}
  2540 +
  2541 +void
  2542 +QPDF::swapObjects(QPDFObjGen og1, QPDFObjGen og2)
  2543 +{
  2544 + // Force objects to be read from the input source if needed, then swap them in the cache.
  2545 + resolve(og1);
  2546 + resolve(og2);
  2547 + m->obj_cache[og1].object->swapWith(m->obj_cache[og2].object);
  2548 +}
  2549 +
750 unsigned long long 2550 unsigned long long
751 QPDF::getUniqueId() const 2551 QPDF::getUniqueId() const
752 { 2552 {
@@ -840,6 +2640,136 @@ QPDF::getXRefTableInternal() @@ -840,6 +2640,136 @@ QPDF::getXRefTableInternal()
840 return m->xref_table; 2640 return m->xref_table;
841 } 2641 }
842 2642
  2643 +size_t
  2644 +QPDF::tableSize()
  2645 +{
  2646 + // If obj_cache is dense, accommodate all object in tables,else accommodate only original
  2647 + // objects.
  2648 + auto max_xref = m->xref_table.size() ? m->xref_table.crbegin()->first.getObj() : 0;
  2649 + auto max_obj = m->obj_cache.size() ? m->obj_cache.crbegin()->first.getObj() : 0;
  2650 + auto max_id = std::numeric_limits<int>::max() - 1;
  2651 + if (max_obj >= max_id || max_xref >= max_id) {
  2652 + // Temporary fix. Long-term solution is
  2653 + // - QPDFObjGen to enforce objgens are valid and sensible
  2654 + // - xref table and obj cache to protect against insertion of impossibly large obj ids
  2655 + stopOnError("Impossibly large object id encountered.");
  2656 + }
  2657 + if (max_obj < 1.1 * std::max(toI(m->obj_cache.size()), max_xref)) {
  2658 + return toS(++max_obj);
  2659 + }
  2660 + return toS(++max_xref);
  2661 +}
  2662 +
  2663 +std::vector<QPDFObjGen>
  2664 +QPDF::getCompressibleObjVector()
  2665 +{
  2666 + return getCompressibleObjGens<QPDFObjGen>();
  2667 +}
  2668 +
  2669 +std::vector<bool>
  2670 +QPDF::getCompressibleObjSet()
  2671 +{
  2672 + return getCompressibleObjGens<bool>();
  2673 +}
  2674 +
  // Collect the objects reachable from the trailer that may be placed in object streams.
  // T selects the shape of the result:
  //  - QPDFObjGen: a list of object/generation pairs in the order the objects are visited;
  //  - bool: a vector of getObjectCount() + 1 flags in which the entry at index <object id> is
  //    true for every compressible object.
  // Streams, the object the trailer's /Encrypt key refers to, and dictionaries of type /Sig that
  // have both /ByteRange and /Contents are traversed but not reported as compressible.
  // Throws std::logic_error if an object id greater than getObjectCount() is encountered.
  2675 +template <typename T>
  2676 +std::vector<T>
  2677 +QPDF::getCompressibleObjGens()
  2678 +{
  2679 + // Return a list of objects that are allowed to be in object streams. Walk through the objects
  2680 + // by traversing the document from the root, including a traversal of the pages tree. This
  2681 + // makes that objects that are on the same page are more likely to be in the same object stream,
  2682 + // which is slightly more efficient, particularly with linearized files. This is better than
  2683 + // iterating through the xref table since it avoids preserving orphaned items.
  2684 +
  2685 + // Exclude encryption dictionary, if any
  2686 + QPDFObjectHandle encryption_dict = m->trailer.getKey("/Encrypt");
  2687 + QPDFObjGen encryption_dict_og = encryption_dict.getObjGen();
  2688 +
  2689 + const size_t max_obj = getObjectCount();
  // visited is indexed by object id - 1.
  2690 + std::vector<bool> visited(max_obj, false);
  // queue is used as a stack: entries are appended at the back and taken from the back.
  2691 + std::vector<QPDFObjectHandle> queue;
  2692 + queue.reserve(512);
  2693 + queue.push_back(m->trailer);
  2694 + std::vector<T> result;
  2695 + if constexpr (std::is_same_v<T, QPDFObjGen>) {
  2696 + result.reserve(m->obj_cache.size());
  2697 + } else if constexpr (std::is_same_v<T, bool>) {
  2698 + result.resize(max_obj + 1U, false);
  2699 + } else {
  2700 + throw std::logic_error("Unsupported type in QPDF::getCompressibleObjGens");
  2701 + }
  2702 + while (!queue.empty()) {
  2703 + auto obj = queue.back();
  2704 + queue.pop_back();
  // Only objects with a positive object id are tracked; others are just traversed below.
  2705 + if (obj.getObjectID() > 0) {
  2706 + QPDFObjGen og = obj.getObjGen();
  2707 + const size_t id = toS(og.getObj() - 1);
  2708 + if (id >= max_obj) {
  2709 + throw std::logic_error(
  2710 + "unexpected object id encountered in getCompressibleObjGens");
  2711 + }
  2712 + if (visited[id]) {
  2713 + QTC::TC("qpdf", "QPDF loop detected traversing objects");
  2714 + continue;
  2715 + }
  2716 +
  2717 + // Check whether this is the current object. If not, remove it (which changes it into a
  2718 + // direct null and therefore stops us from revisiting it) and move on to the next object
  2719 + // in the queue.
  // NOTE(review): presumably relies on obj_cache being ordered by object id and then generation,
  // so that a following entry with the same object id is a newer generation - confirm.
  2720 + auto upper = m->obj_cache.upper_bound(og);
  2721 + if (upper != m->obj_cache.end() && upper->first.getObj() == og.getObj()) {
  2722 + removeObject(og);
  2723 + continue;
  2724 + }
  2725 +
  2726 + visited[id] = true;
  2727 +
  2728 + if (og == encryption_dict_og) {
  2729 + QTC::TC("qpdf", "QPDF exclude encryption dictionary");
  2730 + } else if (!(obj.isStream() ||
  2731 + (obj.isDictionaryOfType("/Sig") && obj.hasKey("/ByteRange") &&
  2732 + obj.hasKey("/Contents")))) {
  2733 + if constexpr (std::is_same_v<T, QPDFObjGen>) {
  2734 + result.push_back(og);
  2735 + } else if constexpr (std::is_same_v<T, bool>) {
  // id + 1 is the object id itself.
  2736 + result[id + 1U] = true;
  2737 + }
  2738 + }
  2739 + }
  // Queue the children. They are appended in reverse iteration order; because the queue is
  // consumed from the back, they are then visited in forward order.
  2740 + if (obj.isStream()) {
  2741 + auto dict = obj.getDict().as_dictionary();
  2742 + auto end = dict.crend();
  2743 + for (auto iter = dict.crbegin(); iter != end; ++iter) {
  2744 + std::string const& key = iter->first;
  2745 + QPDFObjectHandle const& value = iter->second;
  2746 + if (!value.null()) {
  2747 + if (key == "/Length") {
  2748 + // omit stream lengths
  2749 + if (value.isIndirect()) {
  2750 + QTC::TC("qpdf", "QPDF exclude indirect length");
  2751 + }
  2752 + } else {
  2753 + queue.emplace_back(value);
  2754 + }
  2755 + }
  2756 + }
  2757 + } else if (obj.isDictionary()) {
  2758 + auto dict = obj.as_dictionary();
  2759 + auto end = dict.crend();
  2760 + for (auto iter = dict.crbegin(); iter != end; ++iter) {
  2761 + if (!iter->second.null()) {
  2762 + queue.emplace_back(iter->second);
  2763 + }
  2764 + }
  2765 + } else if (auto items = obj.as_array()) {
  2766 + queue.insert(queue.end(), items.crbegin(), items.crend());
  2767 + }
  2768 + }
  2769 +
  2770 + return result;
  2771 +}
  2772 +
843 bool 2773 bool
844 QPDF::pipeStreamData( 2774 QPDF::pipeStreamData(
845 std::shared_ptr<EncryptionParameters> encp, 2775 std::shared_ptr<EncryptionParameters> encp,
libqpdf/QPDF_objects.cc deleted
1 -#include <qpdf/qpdf-config.h> // include first for large file support  
2 -  
3 -#include <qpdf/QPDF_private.hh>  
4 -  
5 -#include <array>  
6 -#include <atomic>  
7 -#include <cstring>  
8 -#include <limits>  
9 -#include <map>  
10 -#include <regex>  
11 -#include <sstream>  
12 -#include <vector>  
13 -  
14 -#include <qpdf/BufferInputSource.hh>  
15 -#include <qpdf/FileInputSource.hh>  
16 -#include <qpdf/InputSource_private.hh>  
17 -#include <qpdf/OffsetInputSource.hh>  
18 -#include <qpdf/Pipeline.hh>  
19 -#include <qpdf/QPDFExc.hh>  
20 -#include <qpdf/QPDFLogger.hh>  
21 -#include <qpdf/QPDFObjectHandle_private.hh>  
22 -#include <qpdf/QPDFObject_private.hh>  
23 -#include <qpdf/QPDFParser.hh>  
24 -#include <qpdf/QTC.hh>  
25 -#include <qpdf/QUtil.hh>  
26 -#include <qpdf/Util.hh>  
27 -  
28 -using namespace qpdf;  
29 -using namespace std::literals;  
30 -  
31 -namespace  
32 -{  
33 - class InvalidInputSource: public InputSource  
34 - {  
35 - public:  
36 - ~InvalidInputSource() override = default;  
37 - qpdf_offset_t  
38 - findAndSkipNextEOL() override  
39 - {  
40 - throwException();  
41 - return 0;  
42 - }  
43 - std::string const&  
44 - getName() const override  
45 - {  
46 - static std::string name("closed input source");  
47 - return name;  
48 - }  
49 - qpdf_offset_t  
50 - tell() override  
51 - {  
52 - throwException();  
53 - return 0;  
54 - }  
55 - void  
56 - seek(qpdf_offset_t offset, int whence) override  
57 - {  
58 - throwException();  
59 - }  
60 - void  
61 - rewind() override  
62 - {  
63 - throwException();  
64 - }  
65 - size_t  
66 - read(char* buffer, size_t length) override  
67 - {  
68 - throwException();  
69 - return 0;  
70 - }  
71 - void  
72 - unreadCh(char ch) override  
73 - {  
74 - throwException();  
75 - }  
76 -  
77 - private:  
78 - void  
79 - throwException()  
80 - {  
81 - throw std::logic_error(  
82 - "QPDF operation attempted on a QPDF object with no input "  
83 - "source. QPDF operations are invalid before processFile (or "  
84 - "another process method) or after closeInputSource");  
85 - }  
86 - };  
87 -} // namespace  
88 -  
89 -bool  
90 -QPDF::findStartxref()  
91 -{  
92 - if (readToken(*m->file).isWord("startxref") && readToken(*m->file).isInteger()) {  
93 - // Position in front of offset token  
94 - m->file->seek(m->file->getLastOffset(), SEEK_SET);  
95 - return true;  
96 - }  
97 - return false;  
98 -}  
99 -  
100 -void  
101 -QPDF::parse(char const* password)  
102 -{  
103 - if (password) {  
104 - m->encp->provided_password = password;  
105 - }  
106 -  
107 - // Find the header anywhere in the first 1024 bytes of the file.  
108 - PatternFinder hf(*this, &QPDF::findHeader);  
109 - if (!m->file->findFirst("%PDF-", 0, 1024, hf)) {  
110 - QTC::TC("qpdf", "QPDF not a pdf file");  
111 - warn(damagedPDF("", 0, "can't find PDF header"));  
112 - // QPDFWriter writes files that usually require at least version 1.2 for /FlateDecode  
113 - m->pdf_version = "1.2";  
114 - }  
115 -  
116 - // PDF spec says %%EOF must be found within the last 1024 bytes of/ the file. We add an extra  
117 - // 30 characters to leave room for the startxref stuff.  
118 - m->file->seek(0, SEEK_END);  
119 - qpdf_offset_t end_offset = m->file->tell();  
120 - m->xref_table_max_offset = end_offset;  
121 - // Sanity check on object ids. All objects must appear in xref table / stream. In all realistic  
122 - // scenarios at least 3 bytes are required.  
123 - if (m->xref_table_max_id > m->xref_table_max_offset / 3) {  
124 - m->xref_table_max_id = static_cast<int>(m->xref_table_max_offset / 3);  
125 - }  
126 - qpdf_offset_t start_offset = (end_offset > 1054 ? end_offset - 1054 : 0);  
127 - PatternFinder sf(*this, &QPDF::findStartxref);  
128 - qpdf_offset_t xref_offset = 0;  
129 - if (m->file->findLast("startxref", start_offset, 0, sf)) {  
130 - xref_offset = QUtil::string_to_ll(readToken(*m->file).getValue().c_str());  
131 - }  
132 -  
133 - try {  
134 - if (xref_offset == 0) {  
135 - QTC::TC("qpdf", "QPDF can't find startxref");  
136 - throw damagedPDF("", 0, "can't find startxref");  
137 - }  
138 - try {  
139 - read_xref(xref_offset);  
140 - } catch (QPDFExc&) {  
141 - throw;  
142 - } catch (std::exception& e) {  
143 - throw damagedPDF("", 0, std::string("error reading xref: ") + e.what());  
144 - }  
145 - } catch (QPDFExc& e) {  
146 - if (m->attempt_recovery) {  
147 - reconstruct_xref(e, xref_offset > 0);  
148 - QTC::TC("qpdf", "QPDF reconstructed xref table");  
149 - } else {  
150 - throw;  
151 - }  
152 - }  
153 -  
154 - initializeEncryption();  
155 - m->parsed = true;  
156 - if (m->xref_table.size() > 0 && !getRoot().getKey("/Pages").isDictionary()) {  
157 - // QPDFs created from JSON have an empty xref table and no root object yet.  
158 - throw damagedPDF("", 0, "unable to find page tree");  
159 - }  
160 -}  
161 -  
162 -void  
163 -QPDF::inParse(bool v)  
164 -{  
165 - if (m->in_parse == v) {  
166 - // This happens if QPDFParser::parse tries to resolve an indirect object while it is  
167 - // parsing.  
168 - throw std::logic_error(  
169 - "QPDF: re-entrant parsing detected. This is a qpdf bug."  
170 - " Please report at https://github.com/qpdf/qpdf/issues.");  
171 - }  
172 - m->in_parse = v;  
173 -}  
174 -  
175 -void  
176 -QPDF::setTrailer(QPDFObjectHandle obj)  
177 -{  
178 - if (m->trailer) {  
179 - return;  
180 - }  
181 - m->trailer = obj;  
182 -}  
183 -  
// Rebuild the cross-reference table after reading it failed with `e`. The whole input is scanned
// for "<id> <gen> obj" headers, for "trailer" keywords (only while no trailer is set) and, when
// found_startxref is false, for "startxref" keywords.
// Rethrows `e` if a reconstruction has already been attempted. Throws a damaged-PDF exception if
// more than 1000 further warnings accumulate, if no trailer or no objects can be found, or if the
// document has not been parsed yet and no pages are found.
184 -void  
185 -QPDF::reconstruct_xref(QPDFExc& e, bool found_startxref)  
186 -{  
187 - if (m->reconstructed_xref) {  
188 - // Avoid xref reconstruction infinite loops. This is getting very hard to reproduce because  
189 - // qpdf is throwing many fewer exceptions while parsing. Most situations are warnings now.  
190 - throw e;  
191 - }  
192 -  
193 - // If recovery generates more than 1000 warnings, the file is so severely damaged that there  
194 - // probably is no point trying to continue.  
195 - const auto max_warnings = m->warnings.size() + 1000U;  
196 - auto check_warnings = [this, max_warnings]() {  
197 - if (m->warnings.size() > max_warnings) {  
198 - throw damagedPDF("", 0, "too many errors while reconstructing cross-reference table");  
199 - }  
200 - };  
201 -  
202 - m->reconstructed_xref = true;  
203 - // We may find more objects, which may contain dangling references.  
204 - m->fixed_dangling_refs = false;  
205 -  
206 - warn(damagedPDF("", 0, "file is damaged"));  
207 - warn(e);  
208 - warn(damagedPDF("", 0, "Attempting to reconstruct cross-reference table"));  
209 -  
210 - // Delete all references to type 1 (uncompressed) objects  
211 - std::vector<QPDFObjGen> to_delete;  
212 - for (auto const& iter: m->xref_table) {  
213 - if (iter.second.getType() == 1) {  
214 - to_delete.emplace_back(iter.first);  
215 - }  
216 - }  
217 - for (auto const& iter: to_delete) {  
218 - m->xref_table.erase(iter);  
219 - }  
220 -  
// Results of the scan: (object id, generation, offset of the id token) for each object header,
// and the input positions just after each "trailer" / "startxref" keyword.
221 - std::vector<std::tuple<int, int, qpdf_offset_t>> found_objects;  
222 - std::vector<qpdf_offset_t> trailers;  
223 - std::vector<qpdf_offset_t> startxrefs;  
224 -  
225 - m->file->seek(0, SEEK_END);  
226 - qpdf_offset_t eof = m->file->tell();  
227 - m->file->seek(0, SEEK_SET);  
228 - // Don't allow very long tokens here during recovery. All the interesting tokens are covered.  
229 - static size_t const MAX_LEN = 10;  
// Each iteration examines the token(s) at the current position and then skips to the next end
// of line.
230 - while (m->file->tell() < eof) {  
231 - QPDFTokenizer::Token t1 = readToken(*m->file, MAX_LEN);  
232 - qpdf_offset_t token_start = m->file->tell() - toO(t1.getValue().length());  
233 - if (t1.isInteger()) {  
234 - auto pos = m->file->tell();  
235 - auto t2 = readToken(*m->file, MAX_LEN);  
236 - if (t2.isInteger() && readToken(*m->file, MAX_LEN).isWord("obj")) {  
237 - int obj = QUtil::string_to_int(t1.getValue().c_str());  
238 - int gen = QUtil::string_to_int(t2.getValue().c_str());  
239 - if (obj <= m->xref_table_max_id) {  
240 - found_objects.emplace_back(obj, gen, token_start);  
241 - } else {  
242 - warn(damagedPDF(  
243 - "", 0, "ignoring object with impossibly large id " + std::to_string(obj)));  
244 - }  
245 - }  
// Return to the position just after the first integer before skipping the rest of the line.
246 - m->file->seek(pos, SEEK_SET);  
247 - } else if (!m->trailer && t1.isWord("trailer")) {  
248 - trailers.emplace_back(m->file->tell());  
249 - } else if (!found_startxref && t1.isWord("startxref")) {  
250 - startxrefs.emplace_back(m->file->tell());  
251 - }  
252 - check_warnings();  
253 - m->file->findAndSkipNextEOL();  
254 - }  
255 -  
// If the scan found a startxref keyword located after the last object header, try reading the
// xref data it points to; when that yields a usable page tree, reconstruction is abandoned.
256 - if (!found_startxref && !startxrefs.empty() && !found_objects.empty() &&  
257 - startxrefs.back() > std::get<2>(found_objects.back())) {  
258 - try {  
259 - m->file->seek(startxrefs.back(), SEEK_SET);  
260 - if (auto offset = QUtil::string_to_ll(readToken(*m->file).getValue().data())) {  
261 - read_xref(offset);  
262 - if (getRoot().getKey("/Pages").isDictionary()) {  
263 - QTC::TC("qpdf", "QPDF startxref more than 1024 before end");  
264 - warn(  
265 - damagedPDF("", 0, "startxref was more than 1024 bytes before end of file"));  
266 - initializeEncryption();  
267 - m->parsed = true;  
268 - m->reconstructed_xref = false;  
269 - return;  
270 - }  
271 - }  
272 - } catch (...) {  
273 - // ok, bad luck. Do recovery.  
274 - }  
275 - }  
276 -  
// NOTE(review): objects are inserted last-found first; presumably insertXrefEntry keeps the first
// entry it receives for an object, so that later definitions in the file win - confirm.
277 - auto rend = found_objects.rend();  
278 - for (auto it = found_objects.rbegin(); it != rend; it++) {  
279 - auto [obj, gen, token_start] = *it;  
280 - insertXrefEntry(obj, 1, token_start, gen);  
281 - check_warnings();  
282 - }  
283 - m->deleted_objects.clear();  
284 -  
// Try the trailer dictionaries found by the scan, last one first, and take the first that has a
// /Root key.
285 - for (auto it = trailers.rbegin(); it != trailers.rend(); it++) {  
286 - m->file->seek(*it, SEEK_SET);  
287 - auto t = readTrailer();  
288 - if (!t.isDictionary()) {  
289 - // Oh well. It was worth a try.  
290 - } else {  
291 - if (t.hasKey("/Root")) {  
292 - m->trailer = t;  
293 - break;  
294 - }  
295 - warn(damagedPDF("trailer", *it, "recovered trailer has no /Root entry"));  
296 - }  
297 - check_warnings();  
298 - }  
299 -  
300 - if (!m->trailer) {  
301 - qpdf_offset_t max_offset{0};  
302 - size_t max_size{0};  
303 - // If there are any xref streams, take the last one to appear.  
304 - for (auto const& iter: m->xref_table) {  
305 - auto entry = iter.second;  
306 - if (entry.getType() != 1) {  
307 - continue;  
308 - }  
309 - auto oh = getObject(iter.first);  
310 - try {  
311 - if (!oh.isStreamOfType("/XRef")) {  
312 - continue;  
313 - }  
314 - } catch (std::exception&) {  
315 - continue;  
316 - }  
317 - auto offset = entry.getOffset();  
318 - auto size = oh.getDict().getKey("/Size").getUIntValueAsUInt();  
// NOTE(review): max_size is never assigned after its initialization to 0 in this function, so
// the first clause holds for every non-zero /Size - confirm whether it was meant to be updated
// together with max_offset.
319 - if (size > max_size || (size == max_size && offset > max_offset)) {  
320 - max_offset = offset;  
321 - setTrailer(oh.getDict());  
322 - }  
323 - check_warnings();  
324 - }  
325 - if (max_offset > 0) {  
326 - try {  
327 - read_xref(max_offset);  
328 - } catch (std::exception&) {  
329 - warn(damagedPDF(  
330 - "", 0, "error decoding candidate xref stream while recovering damaged file"));  
331 - }  
332 - QTC::TC("qpdf", "QPDF recover xref stream");  
333 - }  
334 - }  
335 -  
336 - if (!m->trailer || (!m->parsed && !m->trailer.getKey("/Root").isDictionary())) {  
337 - // Try to find a Root dictionary. As a quick fix try the one with the highest object id.  
338 - QPDFObjectHandle root;  
339 - for (auto const& iter: m->obj_cache) {  
340 - try {  
341 - if (QPDFObjectHandle(iter.second.object).isDictionaryOfType("/Catalog")) {  
342 - root = iter.second.object;  
343 - }  
344 - } catch (std::exception&) {  
345 - continue;  
346 - }  
347 - }  
348 - if (root) {  
349 - if (!m->trailer) {  
350 - warn(damagedPDF(  
351 - "", 0, "unable to find trailer dictionary while recovering damaged file"));  
352 - m->trailer = QPDFObjectHandle::newDictionary();  
353 - }  
354 - m->trailer.replaceKey("/Root", root);  
355 - }  
356 - }  
357 -  
358 - if (!m->trailer) {  
359 - // We could check the last encountered object to see if it was an xref stream. If so, we  
360 - // could try to get the trailer from there. This may make it possible to recover files with  
361 - // bad startxref pointers even when they have object streams.  
362 -  
363 - throw damagedPDF("", 0, "unable to find trailer dictionary while recovering damaged file");  
364 - }  
365 - if (m->xref_table.empty()) {  
366 - // We cannot check for an empty xref table in parse because empty tables are valid when  
367 - // creating QPDF objects from JSON.  
368 - throw damagedPDF("", 0, "unable to find objects while recovering damaged file");  
369 - }  
370 - check_warnings();  
// On the first parse, verify that at least one page can be found; the parsed flag is reset
// before throwing if none is.
371 - if (!m->parsed) {  
372 - m->parsed = true;  
373 - getAllPages();  
374 - check_warnings();  
375 - if (m->all_pages.empty()) {  
376 - m->parsed = false;  
377 - throw damagedPDF("", 0, "unable to find any pages while recovering damaged file");  
378 - }  
379 - }  
380 - // We could iterate through the objects looking for streams and try to find objects inside of  
381 - // them, but it's probably not worth the trouble. Acrobat can't recover files with any errors  
382 - // in an xref stream, and this would be a real long shot anyway. If we wanted to do anything  
383 - // that involved looking at stream contents, we'd also have to call initializeEncryption() here.  
384 - // It's safe to call it more than once.  
385 -}  
386 -  
387 -void  
388 -QPDF::read_xref(qpdf_offset_t xref_offset)  
389 -{  
390 - std::map<int, int> free_table;  
391 - std::set<qpdf_offset_t> visited;  
392 - while (xref_offset) {  
393 - visited.insert(xref_offset);  
394 - char buf[7];  
395 - memset(buf, 0, sizeof(buf));  
396 - m->file->seek(xref_offset, SEEK_SET);  
397 - // Some files miss the mark a little with startxref. We could do a better job of searching  
398 - // in the neighborhood for something that looks like either an xref table or stream, but the  
399 - // simple heuristic of skipping whitespace can help with the xref table case and is harmless  
400 - // with the stream case.  
401 - bool done = false;  
402 - bool skipped_space = false;  
403 - while (!done) {  
404 - char ch;  
405 - if (1 == m->file->read(&ch, 1)) {  
406 - if (util::is_space(ch)) {  
407 - skipped_space = true;  
408 - } else {  
409 - m->file->unreadCh(ch);  
410 - done = true;  
411 - }  
412 - } else {  
413 - QTC::TC("qpdf", "QPDF eof skipping spaces before xref", skipped_space ? 0 : 1);  
414 - done = true;  
415 - }  
416 - }  
417 -  
418 - m->file->read(buf, sizeof(buf) - 1);  
419 - // The PDF spec says xref must be followed by a line terminator, but files exist in the wild  
420 - // where it is terminated by arbitrary whitespace.  
421 - if ((strncmp(buf, "xref", 4) == 0) && util::is_space(buf[4])) {  
422 - if (skipped_space) {  
423 - QTC::TC("qpdf", "QPDF xref skipped space");  
424 - warn(damagedPDF("", 0, "extraneous whitespace seen before xref"));  
425 - }  
426 - QTC::TC(  
427 - "qpdf",  
428 - "QPDF xref space",  
429 - ((buf[4] == '\n') ? 0  
430 - : (buf[4] == '\r') ? 1  
431 - : (buf[4] == ' ') ? 2  
432 - : 9999));  
433 - int skip = 4;  
434 - // buf is null-terminated, and util::is_space('\0') is false, so this won't overrun.  
435 - while (util::is_space(buf[skip])) {  
436 - ++skip;  
437 - }  
438 - xref_offset = read_xrefTable(xref_offset + skip);  
439 - } else {  
440 - xref_offset = read_xrefStream(xref_offset);  
441 - }  
442 - if (visited.count(xref_offset) != 0) {  
443 - QTC::TC("qpdf", "QPDF xref loop");  
444 - throw damagedPDF("", 0, "loop detected following xref tables");  
445 - }  
446 - }  
447 -  
448 - if (!m->trailer) {  
449 - throw damagedPDF("", 0, "unable to find trailer while reading xref");  
450 - }  
451 - int size = m->trailer.getKey("/Size").getIntValueAsInt();  
452 - int max_obj = 0;  
453 - if (!m->xref_table.empty()) {  
454 - max_obj = m->xref_table.rbegin()->first.getObj();  
455 - }  
456 - if (!m->deleted_objects.empty()) {  
457 - max_obj = std::max(max_obj, *(m->deleted_objects.rbegin()));  
458 - }  
459 - if ((size < 1) || (size - 1 != max_obj)) {  
460 - QTC::TC("qpdf", "QPDF xref size mismatch");  
461 - warn(damagedPDF(  
462 - "",  
463 - 0,  
464 - ("reported number of objects (" + std::to_string(size) +  
465 - ") is not one plus the highest object number (" + std::to_string(max_obj) + ")")));  
466 - }  
467 -  
468 - // We no longer need the deleted_objects table, so go ahead and clear it out to make sure we  
469 - // never depend on its being set.  
470 - m->deleted_objects.clear();  
471 -  
472 - // Make sure we keep only the highest generation for any object.  
473 - QPDFObjGen last_og{-1, 0};  
474 - for (auto const& item: m->xref_table) {  
475 - auto id = item.first.getObj();  
476 - if (id == last_og.getObj() && id > 0) {  
477 - removeObject(last_og);  
478 - }  
479 - last_og = item.first;  
480 - }  
481 -}  
482 -  
483 -bool  
484 -QPDF::parse_xrefFirst(std::string const& line, int& obj, int& num, int& bytes)  
485 -{  
486 - // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated  
487 - // buffer.  
488 - char const* p = line.c_str();  
489 - char const* start = line.c_str();  
490 -  
491 - // Skip zero or more spaces  
492 - while (util::is_space(*p)) {  
493 - ++p;  
494 - }  
495 - // Require digit  
496 - if (!util::is_digit(*p)) {  
497 - return false;  
498 - }  
499 - // Gather digits  
500 - std::string obj_str;  
501 - while (util::is_digit(*p)) {  
502 - obj_str.append(1, *p++);  
503 - }  
504 - // Require space  
505 - if (!util::is_space(*p)) {  
506 - return false;  
507 - }  
508 - // Skip spaces  
509 - while (util::is_space(*p)) {  
510 - ++p;  
511 - }  
512 - // Require digit  
513 - if (!util::is_digit(*p)) {  
514 - return false;  
515 - }  
516 - // Gather digits  
517 - std::string num_str;  
518 - while (util::is_digit(*p)) {  
519 - num_str.append(1, *p++);  
520 - }  
521 - // Skip any space including line terminators  
522 - while (util::is_space(*p)) {  
523 - ++p;  
524 - }  
525 - bytes = toI(p - start);  
526 - obj = QUtil::string_to_int(obj_str.c_str());  
527 - num = QUtil::string_to_int(num_str.c_str());  
528 - return true;  
529 -}  
530 -  
// Lenient parser for a cross-reference table entry. The input is repositioned at the input
// source's last offset and the entry is re-read with readLine(30). Leading whitespace, extra
// whitespace between fields and field widths other than 10 and 5 digits are accepted, but cause
// a warning.
// On success the first number is stored in f1, the second in f2 and the entry type ('f' or 'n')
// in type, and true is returned. Returns false if the text does not have the form
// "<digits> <digits> f|n".
531 -bool  
532 -QPDF::read_bad_xrefEntry(qpdf_offset_t& f1, int& f2, char& type)  
533 -{  
534 - // Reposition after initial read attempt and reread.  
535 - m->file->seek(m->file->getLastOffset(), SEEK_SET);  
536 - auto line = m->file->readLine(30);  
537 -  
538 - // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated  
539 - // buffer.  
540 - char const* p = line.data();  
541 -  
542 - // Skip zero or more spaces. There aren't supposed to be any.  
// invalid records that the entry deviates from the strict format; it triggers the warning below.
543 - bool invalid = false;  
544 - while (util::is_space(*p)) {  
545 - ++p;  
546 - QTC::TC("qpdf", "QPDF ignore first space in xref entry");  
547 - invalid = true;  
548 - }  
549 - // Require digit  
550 - if (!util::is_digit(*p)) {  
551 - return false;  
552 - }  
553 - // Gather digits  
554 - std::string f1_str;  
555 - while (util::is_digit(*p)) {  
556 - f1_str.append(1, *p++);  
557 - }  
558 - // Require space  
559 - if (!util::is_space(*p)) {  
560 - return false;  
561 - }  
// More than one space between the fields is tolerated but flagged.
562 - if (util::is_space(*(p + 1))) {  
563 - QTC::TC("qpdf", "QPDF ignore first extra space in xref entry");  
564 - invalid = true;  
565 - }  
566 - // Skip spaces  
567 - while (util::is_space(*p)) {  
568 - ++p;  
569 - }  
570 - // Require digit  
571 - if (!util::is_digit(*p)) {  
572 - return false;  
573 - }  
574 - // Gather digits  
575 - std::string f2_str;  
576 - while (util::is_digit(*p)) {  
577 - f2_str.append(1, *p++);  
578 - }  
579 - // Require space  
580 - if (!util::is_space(*p)) {  
581 - return false;  
582 - }  
583 - if (util::is_space(*(p + 1))) {  
584 - QTC::TC("qpdf", "QPDF ignore second extra space in xref entry");  
585 - invalid = true;  
586 - }  
587 - // Skip spaces  
588 - while (util::is_space(*p)) {  
589 - ++p;  
590 - }  
591 - if ((*p == 'f') || (*p == 'n')) {  
592 - type = *p;  
593 - } else {  
594 - return false;  
595 - }  
// The strict format has exactly 10 digits in the first field and 5 in the second.
596 - if ((f1_str.length() != 10) || (f2_str.length() != 5)) {  
597 - QTC::TC("qpdf", "QPDF ignore length error xref entry");  
598 - invalid = true;  
599 - }  
600 -  
601 - if (invalid) {  
602 - warn(damagedPDF("xref table", "accepting invalid xref table entry"));  
603 - }  
604 -  
605 - f1 = QUtil::string_to_ll(f1_str.c_str());  
606 - f2 = QUtil::string_to_int(f2_str.c_str());  
607 -  
608 - return true;  
609 -}  
610 -  
611 -// Optimistically read and parse xref entry. If entry is bad, call read_bad_xrefEntry and return  
612 -// result.  
// Exactly 20 bytes are read; false is returned if fewer are available. The digits of the two
// numeric fields are accumulated onto the incoming values of f1 and f2 (the caller visible in
// this file passes both in as 0). When the entry has the strict layout (a 10-digit field, a
// 5-digit field, a type character 'f' or 'n', and a line terminator two characters after the
// type) the type is stored and true is returned without further work.
613 -bool  
614 -QPDF::read_xrefEntry(qpdf_offset_t& f1, int& f2, char& type)  
615 -{  
616 - std::array<char, 21> line;  
617 - if (m->file->read(line.data(), 20) != 20) {  
618 - // C++20: [[unlikely]]  
619 - return false;  
620 - }  
// Null-terminate so that the character scans below stop at the end of the buffer.
621 - line[20] = '\0';  
622 - char const* p = line.data();  
623 -  
624 - int f1_len = 0;  
625 - int f2_len = 0;  
626 -  
627 - // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated  
628 - // buffer.  
629 -  
630 - // Gather f1 digits. NB No risk of overflow as 9'999'999'999 < max long long.  
// Leading zeros only count towards the field width.
631 - while (*p == '0') {  
632 - ++f1_len;  
633 - ++p;  
634 - }  
635 - while (util::is_digit(*p) && f1_len++ < 10) {  
636 - f1 *= 10;  
637 - f1 += *p++ - '0';  
638 - }  
639 - // Require space  
640 - if (!util::is_space(*p++)) {  
641 - // Entry doesn't start with space or digit.  
642 - // C++20: [[unlikely]]  
643 - return false;  
644 - }  
645 - // Gather digits. NB No risk of overflow as 99'999 < max int.  
646 - while (*p == '0') {  
647 - ++f2_len;  
648 - ++p;  
649 - }  
650 - while (util::is_digit(*p) && f2_len++ < 5) {  
651 - f2 *= 10;  
652 - f2 += static_cast<int>(*p++ - '0');  
653 - }  
654 - if (util::is_space(*p++) && (*p == 'f' || *p == 'n')) {  
655 - // C++20: [[likely]]  
656 - type = *p;  
657 - // No test for valid line[19].  
// Accept only if the next character is not NUL, the one after it is '\n' or '\r', and both
// fields have their exact widths.
658 - if (*(++p) && *(++p) && (*p == '\n' || *p == '\r') && f1_len == 10 && f2_len == 5) {  
659 - // C++20: [[likely]]  
660 - return true;  
661 - }  
662 - }  
// Anything else is handed to the lenient parser, which re-reads the entry from the input.
663 - return read_bad_xrefEntry(f1, f2, type);  
664 -}  
665 -  
666 -// Read a single cross-reference table section and associated trailer.  
667 -qpdf_offset_t  
668 -QPDF::read_xrefTable(qpdf_offset_t xref_offset)  
669 -{  
670 - m->file->seek(xref_offset, SEEK_SET);  
671 - std::string line;  
672 - while (true) {  
673 - line.assign(50, '\0');  
674 - m->file->read(line.data(), line.size());  
675 - int obj = 0;  
676 - int num = 0;  
677 - int bytes = 0;  
678 - if (!parse_xrefFirst(line, obj, num, bytes)) {  
679 - QTC::TC("qpdf", "QPDF invalid xref");  
680 - throw damagedPDF("xref table", "xref syntax invalid");  
681 - }  
682 - m->file->seek(m->file->getLastOffset() + bytes, SEEK_SET);  
683 - for (qpdf_offset_t i = obj; i - num < obj; ++i) {  
684 - if (i == 0) {  
685 - // This is needed by checkLinearization()  
686 - m->first_xref_item_offset = m->file->tell();  
687 - }  
688 - // For xref_table, these will always be small enough to be ints  
689 - qpdf_offset_t f1 = 0;  
690 - int f2 = 0;  
691 - char type = '\0';  
692 - if (!read_xrefEntry(f1, f2, type)) {  
693 - QTC::TC("qpdf", "QPDF invalid xref entry");  
694 - throw damagedPDF(  
695 - "xref table", "invalid xref entry (obj=" + std::to_string(i) + ")");  
696 - }  
697 - if (type == 'f') {  
698 - insertFreeXrefEntry(QPDFObjGen(toI(i), f2));  
699 - } else {  
700 - insertXrefEntry(toI(i), 1, f1, f2);  
701 - }  
702 - }  
703 - qpdf_offset_t pos = m->file->tell();  
704 - if (readToken(*m->file).isWord("trailer")) {  
705 - break;  
706 - } else {  
707 - m->file->seek(pos, SEEK_SET);  
708 - }  
709 - }  
710 -  
711 - // Set offset to previous xref table if any  
712 - QPDFObjectHandle cur_trailer = readTrailer();  
713 - if (!cur_trailer.isDictionary()) {  
714 - QTC::TC("qpdf", "QPDF missing trailer");  
715 - throw damagedPDF("", "expected trailer dictionary");  
716 - }  
717 -  
718 - if (!m->trailer) {  
719 - setTrailer(cur_trailer);  
720 -  
721 - if (!m->trailer.hasKey("/Size")) {  
722 - QTC::TC("qpdf", "QPDF trailer lacks size");  
723 - throw damagedPDF("trailer", "trailer dictionary lacks /Size key");  
724 - }  
725 - if (!m->trailer.getKey("/Size").isInteger()) {  
726 - QTC::TC("qpdf", "QPDF trailer size not integer");  
727 - throw damagedPDF("trailer", "/Size key in trailer dictionary is not an integer");  
728 - }  
729 - }  
730 -  
731 - if (cur_trailer.hasKey("/XRefStm")) {  
732 - if (m->ignore_xref_streams) {  
733 - QTC::TC("qpdf", "QPDF ignoring XRefStm in trailer");  
734 - } else {  
735 - if (cur_trailer.getKey("/XRefStm").isInteger()) {  
736 - // Read the xref stream but disregard any return value -- we'll use our trailer's  
737 - // /Prev key instead of the xref stream's.  
738 - (void)read_xrefStream(cur_trailer.getKey("/XRefStm").getIntValue());  
739 - } else {  
740 - throw damagedPDF("xref stream", xref_offset, "invalid /XRefStm");  
741 - }  
742 - }  
743 - }  
744 -  
745 - if (cur_trailer.hasKey("/Prev")) {  
746 - if (!cur_trailer.getKey("/Prev").isInteger()) {  
747 - QTC::TC("qpdf", "QPDF trailer prev not integer");  
748 - throw damagedPDF("trailer", "/Prev key in trailer dictionary is not an integer");  
749 - }  
750 - QTC::TC("qpdf", "QPDF prev key in trailer dictionary");  
751 - return cur_trailer.getKey("/Prev").getIntValue();  
752 - }  
753 -  
754 - return 0;  
755 -}  
756 -  
757 -// Read a single cross-reference stream.  
758 -qpdf_offset_t  
759 -QPDF::read_xrefStream(qpdf_offset_t xref_offset)  
760 -{  
761 - if (!m->ignore_xref_streams) {  
762 - QPDFObjGen x_og;  
763 - QPDFObjectHandle xref_obj;  
764 - try {  
765 - xref_obj =  
766 - readObjectAtOffset(false, xref_offset, "xref stream", QPDFObjGen(0, 0), x_og, true);  
767 - } catch (QPDFExc&) {  
768 - // ignore -- report error below  
769 - }  
770 - if (xref_obj.isStreamOfType("/XRef")) {  
771 - QTC::TC("qpdf", "QPDF found xref stream");  
772 - return processXRefStream(xref_offset, xref_obj);  
773 - }  
774 - }  
775 -  
776 - QTC::TC("qpdf", "QPDF can't find xref");  
777 - throw damagedPDF("", xref_offset, "xref not found");  
778 - return 0; // unreachable  
779 -}  
780 -  
781 -// Return the entry size of the xref stream and the processed W array.  
782 -std::pair<int, std::array<int, 3>>  
783 -QPDF::processXRefW(QPDFObjectHandle& dict, std::function<QPDFExc(std::string_view)> damaged)  
784 -{  
785 - auto W_obj = dict.getKey("/W");  
786 - if (!(W_obj.isArray() && (W_obj.getArrayNItems() >= 3) && W_obj.getArrayItem(0).isInteger() &&  
787 - W_obj.getArrayItem(1).isInteger() && W_obj.getArrayItem(2).isInteger())) {  
788 - throw damaged("Cross-reference stream does not have a proper /W key");  
789 - }  
790 -  
791 - std::array<int, 3> W;  
792 - int entry_size = 0;  
793 - auto w_vector = W_obj.getArrayAsVector();  
794 - int max_bytes = sizeof(qpdf_offset_t);  
795 - for (size_t i = 0; i < 3; ++i) {  
796 - W[i] = w_vector[i].getIntValueAsInt();  
797 - if (W[i] > max_bytes) {  
798 - throw damaged("Cross-reference stream's /W contains impossibly large values");  
799 - }  
800 - if (W[i] < 0) {  
801 - throw damaged("Cross-reference stream's /W contains negative values");  
802 - }  
803 - entry_size += W[i];  
804 - }  
805 - if (entry_size == 0) {  
806 - throw damaged("Cross-reference stream's /W indicates entry size of 0");  
807 - }  
808 - return {entry_size, W};  
809 -}  
810 -  
811 -// Validate Size key and return the maximum number of entries that the xref stream can contain.  
812 -int  
813 -QPDF::processXRefSize(  
814 - QPDFObjectHandle& dict, int entry_size, std::function<QPDFExc(std::string_view)> damaged)  
815 -{  
816 - // Number of entries is limited by the highest possible object id and stream size.  
817 - auto max_num_entries = std::numeric_limits<int>::max();  
818 - if (max_num_entries > (std::numeric_limits<qpdf_offset_t>::max() / entry_size)) {  
819 - max_num_entries = toI(std::numeric_limits<qpdf_offset_t>::max() / entry_size);  
820 - }  
821 -  
822 - auto Size_obj = dict.getKey("/Size");  
823 - long long size;  
824 - if (!dict.getKey("/Size").getValueAsInt(size)) {  
825 - throw damaged("Cross-reference stream does not have a proper /Size key");  
826 - } else if (size < 0) {  
827 - throw damaged("Cross-reference stream has a negative /Size key");  
828 - } else if (size >= max_num_entries) {  
829 - throw damaged("Cross-reference stream has an impossibly large /Size key");  
830 - }  
831 - // We are not validating that Size <= (Size key of parent xref / trailer).  
832 - return max_num_entries;  
833 -}  
834 -  
835 -// Return the number of entries of the xref stream and the processed Index array.  
836 -std::pair<int, std::vector<std::pair<int, int>>>  
837 -QPDF::processXRefIndex(  
838 - QPDFObjectHandle& dict, int max_num_entries, std::function<QPDFExc(std::string_view)> damaged)  
839 -{  
840 - auto size = dict.getKey("/Size").getIntValueAsInt();  
841 - auto Index_obj = dict.getKey("/Index");  
842 -  
843 - if (Index_obj.isArray()) {  
844 - std::vector<std::pair<int, int>> indx;  
845 - int num_entries = 0;  
846 - auto index_vec = Index_obj.getArrayAsVector();  
847 - if ((index_vec.size() % 2) || index_vec.size() < 2) {  
848 - throw damaged("Cross-reference stream's /Index has an invalid number of values");  
849 - }  
850 -  
851 - int i = 0;  
852 - long long first = 0;  
853 - for (auto& val: index_vec) {  
854 - if (val.isInteger()) {  
855 - if (i % 2) {  
856 - auto count = val.getIntValue();  
857 - if (count <= 0) {  
858 - throw damaged(  
859 - "Cross-reference stream section claims to contain " +  
860 - std::to_string(count) + " entries");  
861 - }  
862 - // We are guarding against the possibility of num_entries * entry_size  
863 - // overflowing. We are not checking that entries are in ascending order as  
864 - // required by the spec, which probably should generate a warning. We are also  
865 - // not checking that for each subsection first object number + number of entries  
866 - // <= /Size. The spec requires us to ignore object number > /Size.  
867 - if (first > (max_num_entries - count) ||  
868 - count > (max_num_entries - num_entries)) {  
869 - throw damaged(  
870 - "Cross-reference stream claims to contain too many entries: " +  
871 - std::to_string(first) + " " + std::to_string(max_num_entries) + " " +  
872 - std::to_string(num_entries));  
873 - }  
874 - indx.emplace_back(static_cast<int>(first), static_cast<int>(count));  
875 - num_entries += static_cast<int>(count);  
876 - } else {  
877 - first = val.getIntValue();  
878 - if (first < 0) {  
879 - throw damaged(  
880 - "Cross-reference stream's /Index contains a negative object id");  
881 - } else if (first > max_num_entries) {  
882 - throw damaged(  
883 - "Cross-reference stream's /Index contains an impossibly "  
884 - "large object id");  
885 - }  
886 - }  
887 - } else {  
888 - throw damaged(  
889 - "Cross-reference stream's /Index's item " + std::to_string(i) +  
890 - " is not an integer");  
891 - }  
892 - i++;  
893 - }  
894 - QTC::TC("qpdf", "QPDF xref /Index is array", index_vec.size() == 2 ? 0 : 1);  
895 - return {num_entries, indx};  
896 - } else if (Index_obj.isNull()) {  
897 - QTC::TC("qpdf", "QPDF xref /Index is null");  
898 - return {size, {{0, size}}};  
899 - } else {  
900 - throw damaged("Cross-reference stream does not have a proper /Index key");  
901 - }  
902 -}  
903 -  
904 -qpdf_offset_t  
905 -QPDF::processXRefStream(qpdf_offset_t xref_offset, QPDFObjectHandle& xref_obj)  
906 -{  
907 - auto damaged = [this, xref_offset](std::string_view msg) -> QPDFExc {  
908 - return damagedPDF("xref stream", xref_offset, msg.data());  
909 - };  
910 -  
911 - auto dict = xref_obj.getDict();  
912 -  
913 - auto [entry_size, W] = processXRefW(dict, damaged);  
914 - int max_num_entries = processXRefSize(dict, entry_size, damaged);  
915 - auto [num_entries, indx] = processXRefIndex(dict, max_num_entries, damaged);  
916 -  
917 - std::shared_ptr<Buffer> bp = xref_obj.getStreamData(qpdf_dl_specialized);  
918 - size_t actual_size = bp->getSize();  
919 - auto expected_size = toS(entry_size) * toS(num_entries);  
920 -  
921 - if (expected_size != actual_size) {  
922 - QPDFExc x = damaged(  
923 - "Cross-reference stream data has the wrong size; expected = " +  
924 - std::to_string(expected_size) + "; actual = " + std::to_string(actual_size));  
925 - if (expected_size > actual_size) {  
926 - throw x;  
927 - } else {  
928 - warn(x);  
929 - }  
930 - }  
931 -  
932 - bool saw_first_compressed_object = false;  
933 -  
934 - // Actual size vs. expected size check above ensures that we will not overflow any buffers here.  
935 - // We know that entry_size * num_entries is less or equal to the size of the buffer.  
936 - auto p = bp->getBuffer();  
937 - for (auto [obj, sec_entries]: indx) {  
938 - // Process a subsection.  
939 - for (int i = 0; i < sec_entries; ++i) {  
940 - // Read this entry  
941 - std::array<qpdf_offset_t, 3> fields{};  
942 - if (W[0] == 0) {  
943 - QTC::TC("qpdf", "QPDF default for xref stream field 0");  
944 - fields[0] = 1;  
945 - }  
946 - for (size_t j = 0; j < 3; ++j) {  
947 - for (int k = 0; k < W[j]; ++k) {  
948 - fields[j] <<= 8;  
949 - fields[j] |= *p++;  
950 - }  
951 - }  
952 -  
953 - // Get the generation number. The generation number is 0 unless this is an uncompressed  
954 - // object record, in which case the generation number appears as the third field.  
955 - if (saw_first_compressed_object) {  
956 - if (fields[0] != 2) {  
957 - m->uncompressed_after_compressed = true;  
958 - }  
959 - } else if (fields[0] == 2) {  
960 - saw_first_compressed_object = true;  
961 - }  
962 - if (obj == 0) {  
963 - // This is needed by checkLinearization()  
964 - m->first_xref_item_offset = xref_offset;  
965 - } else if (fields[0] == 0) {  
966 - // Ignore fields[2], which we don't care about in this case. This works around the  
967 - // issue of some PDF files that put invalid values, like -1, here for deleted  
968 - // objects.  
969 - insertFreeXrefEntry(QPDFObjGen(obj, 0));  
970 - } else {  
971 - insertXrefEntry(obj, toI(fields[0]), fields[1], toI(fields[2]));  
972 - }  
973 - ++obj;  
974 - }  
975 - }  
976 -  
977 - if (!m->trailer) {  
978 - setTrailer(dict);  
979 - }  
980 -  
981 - if (dict.hasKey("/Prev")) {  
982 - if (!dict.getKey("/Prev").isInteger()) {  
983 - throw damagedPDF(  
984 - "xref stream", "/Prev key in xref stream dictionary is not an integer");  
985 - }  
986 - QTC::TC("qpdf", "QPDF prev key in xref stream dictionary");  
987 - return dict.getKey("/Prev").getIntValue();  
988 - } else {  
989 - return 0;  
990 - }  
991 -}  
992 -  
993 -void  
994 -QPDF::insertXrefEntry(int obj, int f0, qpdf_offset_t f1, int f2)  
995 -{  
996 - // Populate the xref table in such a way that the first reference to an object that we see,  
997 - // which is the one in the latest xref table in which it appears, is the one that gets stored.  
998 - // This works because we are reading more recent appends before older ones.  
999 -  
1000 - // If there is already an entry for this object and generation in the table, it means that a  
1001 - // later xref table has registered this object. Disregard this one.  
1002 - int new_gen = f0 == 2 ? 0 : f2;  
1003 -  
1004 - if (!(obj > 0 && obj <= m->xref_table_max_id && 0 <= f2 && new_gen < 65535)) {  
1005 - // We are ignoring invalid objgens. Most will arrive here from xref reconstruction. There  
1006 - // is probably no point having another warning but we could count invalid items in order to  
1007 - // decide when to give up.  
1008 - QTC::TC("qpdf", "QPDF xref overwrite invalid objgen");  
1009 - // ignore impossibly large object ids or object ids > Size.  
1010 - return;  
1011 - }  
1012 -  
1013 - if (m->deleted_objects.count(obj)) {  
1014 - QTC::TC("qpdf", "QPDF xref deleted object");  
1015 - return;  
1016 - }  
1017 -  
1018 - if (f0 == 2 && static_cast<int>(f1) == obj) {  
1019 - warn(damagedPDF("xref stream", "self-referential object stream " + std::to_string(obj)));  
1020 - return;  
1021 - }  
1022 -  
1023 - auto [iter, created] = m->xref_table.try_emplace(QPDFObjGen(obj, (f0 == 2 ? 0 : f2)));  
1024 - if (!created) {  
1025 - QTC::TC("qpdf", "QPDF xref reused object");  
1026 - return;  
1027 - }  
1028 -  
1029 - switch (f0) {  
1030 - case 1:  
1031 - // f2 is generation  
1032 - QTC::TC("qpdf", "QPDF xref gen > 0", ((f2 > 0) ? 1 : 0));  
1033 - iter->second = QPDFXRefEntry(f1);  
1034 - break;  
1035 -  
1036 - case 2:  
1037 - iter->second = QPDFXRefEntry(toI(f1), f2);  
1038 - break;  
1039 -  
1040 - default:  
1041 - throw damagedPDF("xref stream", "unknown xref stream entry type " + std::to_string(f0));  
1042 - break;  
1043 - }  
1044 -}  
1045 -  
1046 -void  
1047 -QPDF::insertFreeXrefEntry(QPDFObjGen og)  
1048 -{  
1049 - if (!m->xref_table.count(og)) {  
1050 - m->deleted_objects.insert(og.getObj());  
1051 - }  
1052 -}  
1053 -  
1054 -void  
1055 -QPDF::showXRefTable()  
1056 -{  
1057 - auto& cout = *m->log->getInfo();  
1058 - for (auto const& iter: m->xref_table) {  
1059 - QPDFObjGen const& og = iter.first;  
1060 - QPDFXRefEntry const& entry = iter.second;  
1061 - cout << og.unparse('/') << ": ";  
1062 - switch (entry.getType()) {  
1063 - case 1:  
1064 - cout << "uncompressed; offset = " << entry.getOffset();  
1065 - break;  
1066 -  
1067 - case 2:  
1068 - *m->log->getInfo() << "compressed; stream = " << entry.getObjStreamNumber()  
1069 - << ", index = " << entry.getObjStreamIndex();  
1070 - break;  
1071 -  
1072 - default:  
1073 - throw std::logic_error("unknown cross-reference table type while showing xref_table");  
1074 - break;  
1075 - }  
1076 - m->log->info("\n");  
1077 - }  
1078 -}  
1079 -  
1080 -// Resolve all objects in the xref table. If this triggers a xref table reconstruction abort and  
1081 -// return false. Otherwise return true.  
1082 -bool  
1083 -QPDF::resolveXRefTable()  
1084 -{  
1085 - bool may_change = !m->reconstructed_xref;  
1086 - for (auto& iter: m->xref_table) {  
1087 - if (isUnresolved(iter.first)) {  
1088 - resolve(iter.first);  
1089 - if (may_change && m->reconstructed_xref) {  
1090 - return false;  
1091 - }  
1092 - }  
1093 - }  
1094 - return true;  
1095 -}  
1096 -  
1097 -// Ensure all objects in the pdf file, including those in indirect references, appear in the object  
1098 -// cache.  
1099 -void  
1100 -QPDF::fixDanglingReferences(bool force)  
1101 -{  
1102 - if (m->fixed_dangling_refs) {  
1103 - return;  
1104 - }  
1105 - if (!resolveXRefTable()) {  
1106 - QTC::TC("qpdf", "QPDF fix dangling triggered xref reconstruction");  
1107 - resolveXRefTable();  
1108 - }  
1109 - m->fixed_dangling_refs = true;  
1110 -}  
1111 -  
1112 -size_t  
1113 -QPDF::getObjectCount()  
1114 -{  
1115 - // This method returns the next available indirect object number. makeIndirectObject uses it for  
1116 - // this purpose. After fixDanglingReferences is called, all objects in the xref table will also  
1117 - // be in obj_cache.  
1118 - fixDanglingReferences();  
1119 - QPDFObjGen og;  
1120 - if (!m->obj_cache.empty()) {  
1121 - og = (*(m->obj_cache.rbegin())).first;  
1122 - }  
1123 - return toS(og.getObj());  
1124 -}  
1125 -  
1126 -std::vector<QPDFObjectHandle>  
1127 -QPDF::getAllObjects()  
1128 -{  
1129 - // After fixDanglingReferences is called, all objects are in the object cache.  
1130 - fixDanglingReferences();  
1131 - std::vector<QPDFObjectHandle> result;  
1132 - for (auto const& iter: m->obj_cache) {  
1133 - result.push_back(newIndirect(iter.first, iter.second.object));  
1134 - }  
1135 - return result;  
1136 -}  
1137 -  
1138 -void  
1139 -QPDF::setLastObjectDescription(std::string const& description, QPDFObjGen og)  
1140 -{  
1141 - m->last_object_description.clear();  
1142 - if (!description.empty()) {  
1143 - m->last_object_description += description;  
1144 - if (og.isIndirect()) {  
1145 - m->last_object_description += ": ";  
1146 - }  
1147 - }  
1148 - if (og.isIndirect()) {  
1149 - m->last_object_description += "object " + og.unparse(' ');  
1150 - }  
1151 -}  
1152 -  
1153 -QPDFObjectHandle  
1154 -QPDF::readTrailer()  
1155 -{  
1156 - qpdf_offset_t offset = m->file->tell();  
1157 - bool empty = false;  
1158 - auto object =  
1159 - QPDFParser(*m->file, "trailer", m->tokenizer, nullptr, this, true).parse(empty, false);  
1160 - if (empty) {  
1161 - // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in  
1162 - // actual PDF files and Adobe Reader appears to ignore them.  
1163 - warn(damagedPDF("trailer", "empty object treated as null"));  
1164 - } else if (object.isDictionary() && readToken(*m->file).isWord("stream")) {  
1165 - warn(damagedPDF("trailer", m->file->tell(), "stream keyword found in trailer"));  
1166 - }  
1167 - // Override last_offset so that it points to the beginning of the object we just read  
1168 - m->file->setLastOffset(offset);  
1169 - return object;  
1170 -}  
1171 -  
1172 -QPDFObjectHandle  
1173 -QPDF::readObject(std::string const& description, QPDFObjGen og)  
1174 -{  
1175 - setLastObjectDescription(description, og);  
1176 - qpdf_offset_t offset = m->file->tell();  
1177 - bool empty = false;  
1178 -  
1179 - StringDecrypter decrypter{this, og};  
1180 - StringDecrypter* decrypter_ptr = m->encp->encrypted ? &decrypter : nullptr;  
1181 - auto object =  
1182 - QPDFParser(*m->file, m->last_object_description, m->tokenizer, decrypter_ptr, this, true)  
1183 - .parse(empty, false);  
1184 - if (empty) {  
1185 - // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in  
1186 - // actual PDF files and Adobe Reader appears to ignore them.  
1187 - warn(damagedPDF(*m->file, m->file->getLastOffset(), "empty object treated as null"));  
1188 - return object;  
1189 - }  
1190 - auto token = readToken(*m->file);  
1191 - if (object.isDictionary() && token.isWord("stream")) {  
1192 - readStream(object, og, offset);  
1193 - token = readToken(*m->file);  
1194 - }  
1195 - if (!token.isWord("endobj")) {  
1196 - QTC::TC("qpdf", "QPDF err expected endobj");  
1197 - warn(damagedPDF("expected endobj"));  
1198 - }  
1199 - return object;  
1200 -}  
1201 -  
1202 -// After reading stream dictionary and stream keyword, read rest of stream.  
1203 -void  
1204 -QPDF::readStream(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset)  
1205 -{  
1206 - validateStreamLineEnd(object, og, offset);  
1207 -  
1208 - // Must get offset before accessing any additional objects since resolving a previously  
1209 - // unresolved indirect object will change file position.  
1210 - qpdf_offset_t stream_offset = m->file->tell();  
1211 - size_t length = 0;  
1212 -  
1213 - try {  
1214 - auto length_obj = object.getKey("/Length");  
1215 -  
1216 - if (!length_obj.isInteger()) {  
1217 - if (length_obj.isNull()) {  
1218 - QTC::TC("qpdf", "QPDF stream without length");  
1219 - throw damagedPDF(offset, "stream dictionary lacks /Length key");  
1220 - }  
1221 - QTC::TC("qpdf", "QPDF stream length not integer");  
1222 - throw damagedPDF(offset, "/Length key in stream dictionary is not an integer");  
1223 - }  
1224 -  
1225 - length = toS(length_obj.getUIntValue());  
1226 - // Seek in two steps to avoid potential integer overflow  
1227 - m->file->seek(stream_offset, SEEK_SET);  
1228 - m->file->seek(toO(length), SEEK_CUR);  
1229 - if (!readToken(*m->file).isWord("endstream")) {  
1230 - QTC::TC("qpdf", "QPDF missing endstream");  
1231 - throw damagedPDF("expected endstream");  
1232 - }  
1233 - } catch (QPDFExc& e) {  
1234 - if (m->attempt_recovery) {  
1235 - warn(e);  
1236 - length = recoverStreamLength(m->file, og, stream_offset);  
1237 - } else {  
1238 - throw;  
1239 - }  
1240 - }  
1241 - object = QPDFObjectHandle(qpdf::Stream(*this, og, object, stream_offset, length));  
1242 -}  
1243 -  
1244 -void  
1245 -QPDF::validateStreamLineEnd(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset)  
1246 -{  
1247 - // The PDF specification states that the word "stream" should be followed by either a carriage  
1248 - // return and a newline or by a newline alone. It specifically disallowed following it by a  
1249 - // carriage return alone since, in that case, there would be no way to tell whether the NL in a  
1250 - // CR NL sequence was part of the stream data. However, some readers, including Adobe reader,  
1251 - // accept a carriage return by itself when followed by a non-newline character, so that's what  
1252 - // we do here. We have also seen files that have extraneous whitespace between the stream  
1253 - // keyword and the newline.  
1254 - while (true) {  
1255 - char ch;  
1256 - if (m->file->read(&ch, 1) == 0) {  
1257 - // A premature EOF here will result in some other problem that will get reported at  
1258 - // another time.  
1259 - return;  
1260 - }  
1261 - if (ch == '\n') {  
1262 - // ready to read stream data  
1263 - QTC::TC("qpdf", "QPDF stream with NL only");  
1264 - return;  
1265 - }  
1266 - if (ch == '\r') {  
1267 - // Read another character  
1268 - if (m->file->read(&ch, 1) != 0) {  
1269 - if (ch == '\n') {  
1270 - // Ready to read stream data  
1271 - QTC::TC("qpdf", "QPDF stream with CRNL");  
1272 - } else {  
1273 - // Treat the \r by itself as the whitespace after endstream and start reading  
1274 - // stream data in spite of not having seen a newline.  
1275 - QTC::TC("qpdf", "QPDF stream with CR only");  
1276 - m->file->unreadCh(ch);  
1277 - warn(damagedPDF(  
1278 - m->file->tell(), "stream keyword followed by carriage return only"));  
1279 - }  
1280 - }  
1281 - return;  
1282 - }  
1283 - if (!util::is_space(ch)) {  
1284 - QTC::TC("qpdf", "QPDF stream without newline");  
1285 - m->file->unreadCh(ch);  
1286 - warn(damagedPDF(  
1287 - m->file->tell(), "stream keyword not followed by proper line terminator"));  
1288 - return;  
1289 - }  
1290 - warn(damagedPDF(m->file->tell(), "stream keyword followed by extraneous whitespace"));  
1291 - }  
1292 -}  
1293 -  
1294 -QPDFObjectHandle  
1295 -QPDF::readObjectInStream(std::shared_ptr<InputSource>& input, int obj)  
1296 -{  
1297 - m->last_object_description.erase(7); // last_object_description starts with "object "  
1298 - m->last_object_description += std::to_string(obj);  
1299 - m->last_object_description += " 0";  
1300 -  
1301 - bool empty = false;  
1302 - auto object = QPDFParser(*input, m->last_object_description, m->tokenizer, nullptr, this, true)  
1303 - .parse(empty, false);  
1304 - if (empty) {  
1305 - // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in  
1306 - // actual PDF files and Adobe Reader appears to ignore them.  
1307 - warn(damagedPDF(*input, input->getLastOffset(), "empty object treated as null"));  
1308 - }  
1309 - return object;  
1310 -}  
1311 -  
1312 -bool  
1313 -QPDF::findEndstream()  
1314 -{  
1315 - // Find endstream or endobj. Position the input at that token.  
1316 - auto t = readToken(*m->file, 20);  
1317 - if (t.isWord("endobj") || t.isWord("endstream")) {  
1318 - m->file->seek(m->file->getLastOffset(), SEEK_SET);  
1319 - return true;  
1320 - }  
1321 - return false;  
1322 -}  
1323 -  
1324 -size_t  
1325 -QPDF::recoverStreamLength(  
1326 - std::shared_ptr<InputSource> input, QPDFObjGen og, qpdf_offset_t stream_offset)  
1327 -{  
1328 - // Try to reconstruct stream length by looking for endstream or endobj  
1329 - warn(damagedPDF(*input, stream_offset, "attempting to recover stream length"));  
1330 -  
1331 - PatternFinder ef(*this, &QPDF::findEndstream);  
1332 - size_t length = 0;  
1333 - if (m->file->findFirst("end", stream_offset, 0, ef)) {  
1334 - length = toS(m->file->tell() - stream_offset);  
1335 - // Reread endstream but, if it was endobj, don't skip that.  
1336 - QPDFTokenizer::Token t = readToken(*m->file);  
1337 - if (t.getValue() == "endobj") {  
1338 - m->file->seek(m->file->getLastOffset(), SEEK_SET);  
1339 - }  
1340 - }  
1341 -  
1342 - if (length) {  
1343 - auto end = stream_offset + toO(length);  
1344 - qpdf_offset_t found_offset = 0;  
1345 - QPDFObjGen found_og;  
1346 -  
1347 - // Make sure this is inside this object  
1348 - for (auto const& [current_og, entry]: m->xref_table) {  
1349 - if (entry.getType() == 1) {  
1350 - qpdf_offset_t obj_offset = entry.getOffset();  
1351 - if (found_offset < obj_offset && obj_offset < end) {  
1352 - found_offset = obj_offset;  
1353 - found_og = current_og;  
1354 - }  
1355 - }  
1356 - }  
1357 - if (!found_offset || found_og == og) {  
1358 - // If we are trying to recover an XRef stream the xref table will not contain and  
1359 - // won't contain any entries, therefore we cannot check the found length. Otherwise we  
1360 - // found endstream\nendobj within the space allowed for this object, so we're probably  
1361 - // in good shape.  
1362 - } else {  
1363 - QTC::TC("qpdf", "QPDF found wrong endstream in recovery");  
1364 - length = 0;  
1365 - }  
1366 - }  
1367 -  
1368 - if (length == 0) {  
1369 - warn(damagedPDF(  
1370 - *input, stream_offset, "unable to recover stream data; treating stream as empty"));  
1371 - } else {  
1372 - warn(damagedPDF(  
1373 - *input, stream_offset, "recovered stream length: " + std::to_string(length)));  
1374 - }  
1375 -  
1376 - QTC::TC("qpdf", "QPDF recovered stream length");  
1377 - return length;  
1378 -}  
1379 -  
1380 -QPDFTokenizer::Token  
1381 -QPDF::readToken(InputSource& input, size_t max_len)  
1382 -{  
1383 - return m->tokenizer.readToken(input, m->last_object_description, true, max_len);  
1384 -}  
1385 -  
1386 -QPDFObjectHandle  
1387 -QPDF::readObjectAtOffset(  
1388 - bool try_recovery,  
1389 - qpdf_offset_t offset,  
1390 - std::string const& description,  
1391 - QPDFObjGen exp_og,  
1392 - QPDFObjGen& og,  
1393 - bool skip_cache_if_in_xref)  
1394 -{  
1395 - bool check_og = true;  
1396 - if (exp_og.getObj() == 0) {  
1397 - // This method uses an expect object ID of 0 to indicate that we don't know or don't care  
1398 - // what the actual object ID is at this offset. This is true when we read the xref stream  
1399 - // and linearization hint streams. In this case, we don't verify the expect object  
1400 - // ID/generation against what was read from the file. There is also no reason to attempt  
1401 - // xref recovery if we get a failure in this case since the read attempt was not triggered  
1402 - // by an xref lookup.  
1403 - check_og = false;  
1404 - try_recovery = false;  
1405 - }  
1406 - setLastObjectDescription(description, exp_og);  
1407 -  
1408 - if (!m->attempt_recovery) {  
1409 - try_recovery = false;  
1410 - }  
1411 -  
1412 - // Special case: if offset is 0, just return null. Some PDF writers, in particular  
1413 - // "Mac OS X 10.7.5 Quartz PDFContext", may store deleted objects in the xref table as  
1414 - // "0000000000 00000 n", which is not correct, but it won't hurt anything for us to ignore  
1415 - // these.  
1416 - if (offset == 0) {  
1417 - QTC::TC("qpdf", "QPDF bogus 0 offset", 0);  
1418 - warn(damagedPDF(0, "object has offset 0"));  
1419 - return QPDFObjectHandle::newNull();  
1420 - }  
1421 -  
1422 - m->file->seek(offset, SEEK_SET);  
1423 - try {  
1424 - QPDFTokenizer::Token tobjid = readToken(*m->file);  
1425 - bool objidok = tobjid.isInteger();  
1426 - QTC::TC("qpdf", "QPDF check objid", objidok ? 1 : 0);  
1427 - if (!objidok) {  
1428 - QTC::TC("qpdf", "QPDF expected n n obj");  
1429 - throw damagedPDF(offset, "expected n n obj");  
1430 - }  
1431 - QPDFTokenizer::Token tgen = readToken(*m->file);  
1432 - bool genok = tgen.isInteger();  
1433 - QTC::TC("qpdf", "QPDF check generation", genok ? 1 : 0);  
1434 - if (!genok) {  
1435 - throw damagedPDF(offset, "expected n n obj");  
1436 - }  
1437 - QPDFTokenizer::Token tobj = readToken(*m->file);  
1438 -  
1439 - bool objok = tobj.isWord("obj");  
1440 - QTC::TC("qpdf", "QPDF check obj", objok ? 1 : 0);  
1441 -  
1442 - if (!objok) {  
1443 - throw damagedPDF(offset, "expected n n obj");  
1444 - }  
1445 - int objid = QUtil::string_to_int(tobjid.getValue().c_str());  
1446 - int generation = QUtil::string_to_int(tgen.getValue().c_str());  
1447 - og = QPDFObjGen(objid, generation);  
1448 - if (objid == 0) {  
1449 - QTC::TC("qpdf", "QPDF object id 0");  
1450 - throw damagedPDF(offset, "object with ID 0");  
1451 - }  
1452 - if (check_og && (exp_og != og)) {  
1453 - QTC::TC("qpdf", "QPDF err wrong objid/generation");  
1454 - QPDFExc e = damagedPDF(offset, "expected " + exp_og.unparse(' ') + " obj");  
1455 - if (try_recovery) {  
1456 - // Will be retried below  
1457 - throw e;  
1458 - } else {  
1459 - // We can try reading the object anyway even if the ID doesn't match.  
1460 - warn(e);  
1461 - }  
1462 - }  
1463 - } catch (QPDFExc& e) {  
1464 - if (try_recovery) {  
1465 - // Try again after reconstructing xref table  
1466 - reconstruct_xref(e);  
1467 - if (m->xref_table.count(exp_og) && (m->xref_table[exp_og].getType() == 1)) {  
1468 - qpdf_offset_t new_offset = m->xref_table[exp_og].getOffset();  
1469 - QPDFObjectHandle result =  
1470 - readObjectAtOffset(false, new_offset, description, exp_og, og, false);  
1471 - QTC::TC("qpdf", "QPDF recovered in readObjectAtOffset");  
1472 - return result;  
1473 - } else {  
1474 - QTC::TC("qpdf", "QPDF object gone after xref reconstruction");  
1475 - warn(damagedPDF(  
1476 - "",  
1477 - 0,  
1478 - ("object " + exp_og.unparse(' ') +  
1479 - " not found in file after regenerating cross reference "  
1480 - "table")));  
1481 - return QPDFObjectHandle::newNull();  
1482 - }  
1483 - } else {  
1484 - throw;  
1485 - }  
1486 - }  
1487 -  
1488 - QPDFObjectHandle oh = readObject(description, og);  
1489 -  
1490 - if (isUnresolved(og)) {  
1491 - // Store the object in the cache here so it gets cached whether we first know the offset or  
1492 - // whether we first know the object ID and generation (in which we case we would get here  
1493 - // through resolve).  
1494 -  
1495 - // Determine the end offset of this object before and after white space. We use these  
1496 - // numbers to validate linearization hint tables. Offsets and lengths of objects may imply  
1497 - // the end of an object to be anywhere between these values.  
1498 - qpdf_offset_t end_before_space = m->file->tell();  
1499 -  
1500 - // skip over spaces  
1501 - while (true) {  
1502 - char ch;  
1503 - if (m->file->read(&ch, 1)) {  
1504 - if (!isspace(static_cast<unsigned char>(ch))) {  
1505 - m->file->seek(-1, SEEK_CUR);  
1506 - break;  
1507 - }  
1508 - } else {  
1509 - throw damagedPDF(m->file->tell(), "EOF after endobj");  
1510 - }  
1511 - }  
1512 - qpdf_offset_t end_after_space = m->file->tell();  
1513 - if (skip_cache_if_in_xref && m->xref_table.count(og)) {  
1514 - // Ordinarily, an object gets read here when resolved through xref table or stream. In  
1515 - // the special case of the xref stream and linearization hint tables, the offset comes  
1516 - // from another source. For the specific case of xref streams, the xref stream is read  
1517 - // and loaded into the object cache very early in parsing. Ordinarily, when a file is  
1518 - // updated by appending, items inserted into the xref table in later updates take  
1519 - // precedence over earlier items. In the special case of reusing the object number  
1520 - // previously used as the xref stream, we have the following order of events:  
1521 - //  
1522 - // * reused object gets loaded into the xref table  
1523 - // * old object is read here while reading xref streams  
1524 - // * original xref entry is ignored (since already in xref table)  
1525 - //  
1526 - // It is the second step that causes a problem. Even though the xref table is correct in  
1527 - // this case, the old object is already in the cache and so effectively prevails over  
1528 - // the reused object. To work around this issue, we have a special case for the xref  
1529 - // stream (via the skip_cache_if_in_xref): if the object is already in the xref stream,  
1530 - // don't cache what we read here.  
1531 - //  
1532 - // It is likely that the same bug may exist for linearization hint tables, but the  
1533 - // existing code uses end_before_space and end_after_space from the cache, so fixing  
1534 - // that would require more significant rework. The chances of a linearization hint  
1535 - // stream being reused seems smaller because the xref stream is probably the highest  
1536 - // object in the file and the linearization hint stream would be some random place in  
1537 - // the middle, so I'm leaving that bug unfixed for now. If the bug were to be fixed, we  
1538 - // could use !check_og in place of skip_cache_if_in_xref.  
1539 - QTC::TC("qpdf", "QPDF skipping cache for known unchecked object");  
1540 - } else {  
1541 - updateCache(og, oh.getObj(), end_before_space, end_after_space);  
1542 - }  
1543 - }  
1544 -  
1545 - return oh;  
1546 -}  
1547 -  
1548 -std::shared_ptr<QPDFObject> const&  
1549 -QPDF::resolve(QPDFObjGen og)  
1550 -{  
1551 - if (!isUnresolved(og)) {  
1552 - return m->obj_cache[og].object;  
1553 - }  
1554 -  
1555 - if (m->resolving.count(og)) {  
1556 - // This can happen if an object references itself directly or indirectly in some key that  
1557 - // has to be resolved during object parsing, such as stream length.  
1558 - QTC::TC("qpdf", "QPDF recursion loop in resolve");  
1559 - warn(damagedPDF("", "loop detected resolving object " + og.unparse(' ')));  
1560 - updateCache(og, QPDFObject::create<QPDF_Null>(), -1, -1);  
1561 - return m->obj_cache[og].object;  
1562 - }  
1563 - ResolveRecorder rr(this, og);  
1564 -  
1565 - if (m->xref_table.count(og) != 0) {  
1566 - QPDFXRefEntry const& entry = m->xref_table[og];  
1567 - try {  
1568 - switch (entry.getType()) {  
1569 - case 1:  
1570 - {  
1571 - qpdf_offset_t offset = entry.getOffset();  
1572 - // Object stored in cache by readObjectAtOffset  
1573 - QPDFObjGen a_og;  
1574 - QPDFObjectHandle oh = readObjectAtOffset(true, offset, "", og, a_og, false);  
1575 - }  
1576 - break;  
1577 -  
1578 - case 2:  
1579 - resolveObjectsInStream(entry.getObjStreamNumber());  
1580 - break;  
1581 -  
1582 - default:  
1583 - throw damagedPDF(  
1584 - "", 0, ("object " + og.unparse('/') + " has unexpected xref entry type"));  
1585 - }  
1586 - } catch (QPDFExc& e) {  
1587 - warn(e);  
1588 - } catch (std::exception& e) {  
1589 - warn(damagedPDF(  
1590 - "", 0, ("object " + og.unparse('/') + ": error reading object: " + e.what())));  
1591 - }  
1592 - }  
1593 -  
1594 - if (isUnresolved(og)) {  
1595 - // PDF spec says unknown objects resolve to the null object.  
1596 - QTC::TC("qpdf", "QPDF resolve failure to null");  
1597 - updateCache(og, QPDFObject::create<QPDF_Null>(), -1, -1);  
1598 - }  
1599 -  
1600 - auto& result(m->obj_cache[og].object);  
1601 - result->setDefaultDescription(this, og);  
1602 - return result;  
1603 -}  
1604 -  
1605 -void  
1606 -QPDF::resolveObjectsInStream(int obj_stream_number)  
1607 -{  
1608 - if (m->resolved_object_streams.count(obj_stream_number)) {  
1609 - return;  
1610 - }  
1611 - m->resolved_object_streams.insert(obj_stream_number);  
1612 - // Force resolution of object stream  
1613 - QPDFObjectHandle obj_stream = getObjectByID(obj_stream_number, 0);  
1614 - if (!obj_stream.isStream()) {  
1615 - throw damagedPDF(  
1616 - "supposed object stream " + std::to_string(obj_stream_number) + " is not a stream");  
1617 - }  
1618 -  
1619 - // For linearization data in the object, use the data from the object stream for the objects in  
1620 - // the stream.  
1621 - QPDFObjGen stream_og(obj_stream_number, 0);  
1622 - qpdf_offset_t end_before_space = m->obj_cache[stream_og].end_before_space;  
1623 - qpdf_offset_t end_after_space = m->obj_cache[stream_og].end_after_space;  
1624 -  
1625 - QPDFObjectHandle dict = obj_stream.getDict();  
1626 - if (!dict.isDictionaryOfType("/ObjStm")) {  
1627 - QTC::TC("qpdf", "QPDF ERR object stream with wrong type");  
1628 - warn(damagedPDF(  
1629 - "supposed object stream " + std::to_string(obj_stream_number) + " has wrong type"));  
1630 - }  
1631 -  
1632 - if (!(dict.getKey("/N").isInteger() && dict.getKey("/First").isInteger())) {  
1633 - throw damagedPDF(  
1634 - ("object stream " + std::to_string(obj_stream_number) + " has incorrect keys"));  
1635 - }  
1636 -  
1637 - int n = dict.getKey("/N").getIntValueAsInt();  
1638 - int first = dict.getKey("/First").getIntValueAsInt();  
1639 -  
1640 - std::map<int, int> offsets;  
1641 -  
1642 - std::shared_ptr<Buffer> bp = obj_stream.getStreamData(qpdf_dl_specialized);  
1643 - auto input = std::shared_ptr<InputSource>(  
1644 - // line-break  
1645 - new BufferInputSource(  
1646 - (m->file->getName() + " object stream " + std::to_string(obj_stream_number)),  
1647 - bp.get()));  
1648 -  
1649 - long long last_offset = -1;  
1650 - for (int i = 0; i < n; ++i) {  
1651 - QPDFTokenizer::Token tnum = readToken(*input);  
1652 - QPDFTokenizer::Token toffset = readToken(*input);  
1653 - if (!(tnum.isInteger() && toffset.isInteger())) {  
1654 - throw damagedPDF(  
1655 - *input,  
1656 - m->last_object_description,  
1657 - input->getLastOffset(),  
1658 - "expected integer in object stream header");  
1659 - }  
1660 -  
1661 - int num = QUtil::string_to_int(tnum.getValue().c_str());  
1662 - long long offset = QUtil::string_to_int(toffset.getValue().c_str());  
1663 -  
1664 - if (num == obj_stream_number) {  
1665 - QTC::TC("qpdf", "QPDF ignore self-referential object stream");  
1666 - warn(damagedPDF(  
1667 - *input,  
1668 - m->last_object_description,  
1669 - input->getLastOffset(),  
1670 - "object stream claims to contain itself"));  
1671 - continue;  
1672 - }  
1673 -  
1674 - if (num < 1) {  
1675 - QTC::TC("qpdf", "QPDF object stream contains id < 1");  
1676 - warn(damagedPDF(  
1677 - *input,  
1678 - "object "s + std::to_string(num) + " 0, offset " + std::to_string(offset),  
1679 - 0,  
1680 - "object id is invalid"s));  
1681 - continue;  
1682 - }  
1683 -  
1684 - if (offset <= last_offset) {  
1685 - QTC::TC("qpdf", "QPDF object stream offsets not increasing");  
1686 - warn(damagedPDF(  
1687 - *input,  
1688 - "object "s + std::to_string(num) + " 0, offset " + std::to_string(offset),  
1689 - 0,  
1690 - "offset is invalid (must be larger than previous offset " +  
1691 - std::to_string(last_offset) + ")"));  
1692 - continue;  
1693 - }  
1694 - last_offset = offset;  
1695 -  
1696 - if (num > m->xref_table_max_id) {  
1697 - continue;  
1698 - }  
1699 -  
1700 - offsets[num] = toI(offset + first);  
1701 - }  
1702 -  
1703 - // To avoid having to read the object stream multiple times, store all objects that would be  
1704 - // found here in the cache. Remember that some objects stored here might have been overridden  
1705 - // by new objects appended to the file, so it is necessary to recheck the xref table and only  
1706 - // cache what would actually be resolved here.  
1707 - m->last_object_description.clear();  
1708 - m->last_object_description += "object ";  
1709 - for (auto const& iter: offsets) {  
1710 - QPDFObjGen og(iter.first, 0);  
1711 - auto entry = m->xref_table.find(og);  
1712 - if (entry != m->xref_table.end() && entry->second.getType() == 2 &&  
1713 - entry->second.getObjStreamNumber() == obj_stream_number) {  
1714 - int offset = iter.second;  
1715 - input->seek(offset, SEEK_SET);  
1716 - QPDFObjectHandle oh = readObjectInStream(input, iter.first);  
1717 - updateCache(og, oh.getObj(), end_before_space, end_after_space);  
1718 - } else {  
1719 - QTC::TC("qpdf", "QPDF not caching overridden objstm object");  
1720 - }  
1721 - }  
1722 -}  
1723 -  
1724 -QPDFObjectHandle  
1725 -QPDF::newIndirect(QPDFObjGen og, std::shared_ptr<QPDFObject> const& obj)  
1726 -{  
1727 - obj->setDefaultDescription(this, og);  
1728 - return {obj};  
1729 -}  
1730 -  
1731 -void  
1732 -QPDF::updateCache(  
1733 - QPDFObjGen og,  
1734 - std::shared_ptr<QPDFObject> const& object,  
1735 - qpdf_offset_t end_before_space,  
1736 - qpdf_offset_t end_after_space,  
1737 - bool destroy)  
1738 -{  
1739 - object->setObjGen(this, og);  
1740 - if (isCached(og)) {  
1741 - auto& cache = m->obj_cache[og];  
1742 - object->move_to(cache.object, destroy);  
1743 - cache.end_before_space = end_before_space;  
1744 - cache.end_after_space = end_after_space;  
1745 - } else {  
1746 - m->obj_cache[og] = ObjCache(object, end_before_space, end_after_space);  
1747 - }  
1748 -}  
1749 -  
1750 -bool  
1751 -QPDF::isCached(QPDFObjGen og)  
1752 -{  
1753 - return m->obj_cache.count(og) != 0;  
1754 -}  
1755 -  
1756 -bool  
1757 -QPDF::isUnresolved(QPDFObjGen og)  
1758 -{  
1759 - return !isCached(og) || m->obj_cache[og].object->isUnresolved();  
1760 -}  
1761 -  
1762 -QPDFObjGen  
1763 -QPDF::nextObjGen()  
1764 -{  
1765 - int max_objid = toI(getObjectCount());  
1766 - if (max_objid == std::numeric_limits<int>::max()) {  
1767 - throw std::range_error("max object id is too high to create new objects");  
1768 - }  
1769 - return QPDFObjGen(max_objid + 1, 0);  
1770 -}  
1771 -  
1772 -QPDFObjectHandle  
1773 -QPDF::makeIndirectFromQPDFObject(std::shared_ptr<QPDFObject> const& obj)  
1774 -{  
1775 - QPDFObjGen next{nextObjGen()};  
1776 - m->obj_cache[next] = ObjCache(obj, -1, -1);  
1777 - return newIndirect(next, m->obj_cache[next].object);  
1778 -}  
1779 -  
1780 -QPDFObjectHandle  
1781 -QPDF::makeIndirectObject(QPDFObjectHandle oh)  
1782 -{  
1783 - if (!oh) {  
1784 - throw std::logic_error("attempted to make an uninitialized QPDFObjectHandle indirect");  
1785 - }  
1786 - return makeIndirectFromQPDFObject(oh.getObj());  
1787 -}  
1788 -  
1789 -std::shared_ptr<QPDFObject>  
1790 -QPDF::getObjectForParser(int id, int gen, bool parse_pdf)  
1791 -{  
1792 - // This method is called by the parser and therefore must not resolve any objects.  
1793 - auto og = QPDFObjGen(id, gen);  
1794 - if (auto iter = m->obj_cache.find(og); iter != m->obj_cache.end()) {  
1795 - return iter->second.object;  
1796 - }  
1797 - if (m->xref_table.count(og) || !m->parsed) {  
1798 - return m->obj_cache.insert({og, QPDFObject::create<QPDF_Unresolved>(this, og)})  
1799 - .first->second.object;  
1800 - }  
1801 - if (parse_pdf) {  
1802 - return QPDFObject::create<QPDF_Null>();  
1803 - }  
1804 - return m->obj_cache.insert({og, QPDFObject::create<QPDF_Null>(this, og)}).first->second.object;  
1805 -}  
1806 -  
1807 -std::shared_ptr<QPDFObject>  
1808 -QPDF::getObjectForJSON(int id, int gen)  
1809 -{  
1810 - auto og = QPDFObjGen(id, gen);  
1811 - auto [it, inserted] = m->obj_cache.try_emplace(og);  
1812 - auto& obj = it->second.object;  
1813 - if (inserted) {  
1814 - obj = (m->parsed && !m->xref_table.count(og))  
1815 - ? QPDFObject::create<QPDF_Null>(this, og)  
1816 - : QPDFObject::create<QPDF_Unresolved>(this, og);  
1817 - }  
1818 - return obj;  
1819 -}  
1820 -  
1821 -QPDFObjectHandle  
1822 -QPDF::getObject(QPDFObjGen og)  
1823 -{  
1824 - if (auto it = m->obj_cache.find(og); it != m->obj_cache.end()) {  
1825 - return {it->second.object};  
1826 - } else if (m->parsed && !m->xref_table.count(og)) {  
1827 - return QPDFObject::create<QPDF_Null>();  
1828 - } else {  
1829 - auto result =  
1830 - m->obj_cache.try_emplace(og, QPDFObject::create<QPDF_Unresolved>(this, og), -1, -1);  
1831 - return {result.first->second.object};  
1832 - }  
1833 -}  
1834 -  
1835 -void  
1836 -QPDF::replaceObject(int objid, int generation, QPDFObjectHandle oh)  
1837 -{  
1838 - replaceObject(QPDFObjGen(objid, generation), oh);  
1839 -}  
1840 -  
1841 -void  
1842 -QPDF::replaceObject(QPDFObjGen og, QPDFObjectHandle oh)  
1843 -{  
1844 - if (!oh || (oh.isIndirect() && !(oh.isStream() && oh.getObjGen() == og))) {  
1845 - QTC::TC("qpdf", "QPDF replaceObject called with indirect object");  
1846 - throw std::logic_error("QPDF::replaceObject called with indirect object handle");  
1847 - }  
1848 - updateCache(og, oh.getObj(), -1, -1, false);  
1849 -}  
1850 -  
1851 -void  
1852 -QPDF::removeObject(QPDFObjGen og)  
1853 -{  
1854 - m->xref_table.erase(og);  
1855 - if (auto cached = m->obj_cache.find(og); cached != m->obj_cache.end()) {  
1856 - // Take care of any object handles that may be floating around.  
1857 - cached->second.object->assign_null();  
1858 - cached->second.object->setObjGen(nullptr, QPDFObjGen());  
1859 - m->obj_cache.erase(cached);  
1860 - }  
1861 -}  
1862 -  
1863 -void  
1864 -QPDF::replaceReserved(QPDFObjectHandle reserved, QPDFObjectHandle replacement)  
1865 -{  
1866 - QTC::TC("qpdf", "QPDF replaceReserved");  
1867 - auto tc = reserved.getTypeCode();  
1868 - if (!(tc == ::ot_reserved || tc == ::ot_null)) {  
1869 - throw std::logic_error("replaceReserved called with non-reserved object");  
1870 - }  
1871 - replaceObject(reserved.getObjGen(), replacement);  
1872 -}  
1873 -  
1874 -void  
1875 -QPDF::swapObjects(int objid1, int generation1, int objid2, int generation2)  
1876 -{  
1877 - swapObjects(QPDFObjGen(objid1, generation1), QPDFObjGen(objid2, generation2));  
1878 -}  
1879 -  
1880 -void  
1881 -QPDF::swapObjects(QPDFObjGen og1, QPDFObjGen og2)  
1882 -{  
1883 - // Force objects to be read from the input source if needed, then swap them in the cache.  
1884 - resolve(og1);  
1885 - resolve(og2);  
1886 - m->obj_cache[og1].object->swapWith(m->obj_cache[og2].object);  
1887 -}  
1888 -  
1889 -size_t  
1890 -QPDF::tableSize()  
1891 -{  
1892 - // If obj_cache is dense, accommodate all object in tables,else accommodate only original  
1893 - // objects.  
1894 - auto max_xref = m->xref_table.size() ? m->xref_table.crbegin()->first.getObj() : 0;  
1895 - auto max_obj = m->obj_cache.size() ? m->obj_cache.crbegin()->first.getObj() : 0;  
1896 - auto max_id = std::numeric_limits<int>::max() - 1;  
1897 - if (max_obj >= max_id || max_xref >= max_id) {  
1898 - // Temporary fix. Long-term solution is  
1899 - // - QPDFObjGen to enforce objgens are valid and sensible  
1900 - // - xref table and obj cache to protect against insertion of impossibly large obj ids  
1901 - stopOnError("Impossibly large object id encountered.");  
1902 - }  
1903 - if (max_obj < 1.1 * std::max(toI(m->obj_cache.size()), max_xref)) {  
1904 - return toS(++max_obj);  
1905 - }  
1906 - return toS(++max_xref);  
1907 -}  
1908 -  
1909 -std::vector<QPDFObjGen>  
1910 -QPDF::getCompressibleObjVector()  
1911 -{  
1912 - return getCompressibleObjGens<QPDFObjGen>();  
1913 -}  
1914 -  
1915 -std::vector<bool>  
1916 -QPDF::getCompressibleObjSet()  
1917 -{  
1918 - return getCompressibleObjGens<bool>();  
1919 -}  
1920 -  
1921 -template <typename T>  
1922 -std::vector<T>  
1923 -QPDF::getCompressibleObjGens()  
1924 -{  
1925 - // Return a list of objects that are allowed to be in object streams. Walk through the objects  
1926 - // by traversing the document from the root, including a traversal of the pages tree. This  
1927 - // makes that objects that are on the same page are more likely to be in the same object stream,  
1928 - // which is slightly more efficient, particularly with linearized files. This is better than  
1929 - // iterating through the xref table since it avoids preserving orphaned items.  
1930 -  
1931 - // Exclude encryption dictionary, if any  
1932 - QPDFObjectHandle encryption_dict = m->trailer.getKey("/Encrypt");  
1933 - QPDFObjGen encryption_dict_og = encryption_dict.getObjGen();  
1934 -  
1935 - const size_t max_obj = getObjectCount();  
1936 - std::vector<bool> visited(max_obj, false);  
1937 - std::vector<QPDFObjectHandle> queue;  
1938 - queue.reserve(512);  
1939 - queue.push_back(m->trailer);  
1940 - std::vector<T> result;  
1941 - if constexpr (std::is_same_v<T, QPDFObjGen>) {  
1942 - result.reserve(m->obj_cache.size());  
1943 - } else if constexpr (std::is_same_v<T, bool>) {  
1944 - result.resize(max_obj + 1U, false);  
1945 - } else {  
1946 - throw std::logic_error("Unsupported type in QPDF::getCompressibleObjGens");  
1947 - }  
1948 - while (!queue.empty()) {  
1949 - auto obj = queue.back();  
1950 - queue.pop_back();  
1951 - if (obj.getObjectID() > 0) {  
1952 - QPDFObjGen og = obj.getObjGen();  
1953 - const size_t id = toS(og.getObj() - 1);  
1954 - if (id >= max_obj) {  
1955 - throw std::logic_error(  
1956 - "unexpected object id encountered in getCompressibleObjGens");  
1957 - }  
1958 - if (visited[id]) {  
1959 - QTC::TC("qpdf", "QPDF loop detected traversing objects");  
1960 - continue;  
1961 - }  
1962 -  
1963 - // Check whether this is the current object. If not, remove it (which changes it into a  
1964 - // direct null and therefore stops us from revisiting it) and move on to the next object  
1965 - // in the queue.  
1966 - auto upper = m->obj_cache.upper_bound(og);  
1967 - if (upper != m->obj_cache.end() && upper->first.getObj() == og.getObj()) {  
1968 - removeObject(og);  
1969 - continue;  
1970 - }  
1971 -  
1972 - visited[id] = true;  
1973 -  
1974 - if (og == encryption_dict_og) {  
1975 - QTC::TC("qpdf", "QPDF exclude encryption dictionary");  
1976 - } else if (!(obj.isStream() ||  
1977 - (obj.isDictionaryOfType("/Sig") && obj.hasKey("/ByteRange") &&  
1978 - obj.hasKey("/Contents")))) {  
1979 - if constexpr (std::is_same_v<T, QPDFObjGen>) {  
1980 - result.push_back(og);  
1981 - } else if constexpr (std::is_same_v<T, bool>) {  
1982 - result[id + 1U] = true;  
1983 - }  
1984 - }  
1985 - }  
1986 - if (obj.isStream()) {  
1987 - auto dict = obj.getDict().as_dictionary();  
1988 - auto end = dict.crend();  
1989 - for (auto iter = dict.crbegin(); iter != end; ++iter) {  
1990 - std::string const& key = iter->first;  
1991 - QPDFObjectHandle const& value = iter->second;  
1992 - if (!value.null()) {  
1993 - if (key == "/Length") {  
1994 - // omit stream lengths  
1995 - if (value.isIndirect()) {  
1996 - QTC::TC("qpdf", "QPDF exclude indirect length");  
1997 - }  
1998 - } else {  
1999 - queue.emplace_back(value);  
2000 - }  
2001 - }  
2002 - }  
2003 - } else if (obj.isDictionary()) {  
2004 - auto dict = obj.as_dictionary();  
2005 - auto end = dict.crend();  
2006 - for (auto iter = dict.crbegin(); iter != end; ++iter) {  
2007 - if (!iter->second.null()) {  
2008 - queue.emplace_back(iter->second);  
2009 - }  
2010 - }  
2011 - } else if (auto items = obj.as_array()) {  
2012 - queue.insert(queue.end(), items.crbegin(), items.crend());  
2013 - }  
2014 - }  
2015 -  
2016 - return result;  
2017 -}  
manual/release-notes.rst
@@ -21,15 +21,16 @@ more detail. @@ -21,15 +21,16 @@ more detail.
21 integer object. Previously the method returned false if the first 21 integer object. Previously the method returned false if the first
22 dictionary object was not a linearization parameter dictionary. 22 dictionary object was not a linearization parameter dictionary.
23 23
24 - - Other enhancements 24 +.. _r12-0-0:
  25 +
  26 +12.0.1: not yet released
  27 + - Other enhancements
25 28
26 - - There have been further enhancements to how files with damaged xref  
27 - tables are recovered. 29 + - There have been further enhancements to how files with damaged xref
  30 + tables are recovered.
28 31
29 .. cSpell:ignore substract 32 .. cSpell:ignore substract
30 33
31 -.. _r12-0-0:  
32 -  
33 12.0.0: March 9, 2025 34 12.0.0: March 9, 2025
34 - API breaking changes 35 - API breaking changes
35 36