Commit 57da88747e02c9d6bb66b7ff38b4db963fd388c9

Authored by m-holger
1 parent fba542f9

Revert "Split QPDF.cc into QPDF.cc and QPDF_objects.cc"

This reverts commit bb045907a043b5c6de9fb804ff11087333747329.
libqpdf/CMakeLists.txt
... ... @@ -95,7 +95,6 @@ set(libqpdf_SOURCES
95 95 QPDF_encryption.cc
96 96 QPDF_json.cc
97 97 QPDF_linearization.cc
98   - QPDF_objects.cc
99 98 QPDF_optimization.cc
100 99 QPDF_pages.cc
101 100 QTC.cc
... ...
libqpdf/QPDF.cc
... ... @@ -413,26 +413,1726 @@ QPDF::findHeader()
413 413 return valid;
414 414 }
415 415  
  416 +bool
  417 +QPDF::findStartxref()
  418 +{ // True only when the next two tokens in the input are the word "startxref" and an integer.
  419 + if (readToken(*m->file).isWord("startxref") && readToken(*m->file).isInteger()) {
  420 + // Position in front of offset token; parse() reads the offset with readToken() afterwards.
  421 + m->file->seek(m->file->getLastOffset(), SEEK_SET);
  422 + return true;
  423 + }
  424 + return false;
  425 +}
  426 +
  427 +void
  428 +QPDF::parse(char const* password)
  429 +{ // Locate header and startxref, read or reconstruct the xref data, then initialize encryption.
  430 + if (password) { // a null password leaves provided_password unchanged
  431 + m->encp->provided_password = password;
  432 + }
  433 +
  434 + // Find the header anywhere in the first 1024 bytes of the file.
  435 + PatternFinder hf(*this, &QPDF::findHeader);
  436 + if (!m->file->findFirst("%PDF-", 0, 1024, hf)) {
  437 + QTC::TC("qpdf", "QPDF not a pdf file");
  438 + warn(damagedPDF("", 0, "can't find PDF header"));
  439 + // QPDFWriter writes files that usually require at least version 1.2 for /FlateDecode
  440 + m->pdf_version = "1.2";
  441 + }
  442 +
  443 + // PDF spec says %%EOF must be found within the last 1024 bytes of the file. We add an extra
  444 + // 30 characters to leave room for the startxref stuff.
  445 + m->file->seek(0, SEEK_END);
  446 + qpdf_offset_t end_offset = m->file->tell();
  447 + m->xref_table_max_offset = end_offset;
  448 + // Sanity check on object ids. All objects must appear in xref table / stream. In all realistic
  449 + // scenarios at least 3 bytes are required.
  450 + if (m->xref_table_max_id > m->xref_table_max_offset / 3) {
  451 + m->xref_table_max_id = static_cast<int>(m->xref_table_max_offset / 3);
  452 + }
  453 + qpdf_offset_t start_offset = (end_offset > 1054 ? end_offset - 1054 : 0); // 1054 = 1024 + 30
  454 + PatternFinder sf(*this, &QPDF::findStartxref);
  455 + qpdf_offset_t xref_offset = 0;
  456 + if (m->file->findLast("startxref", start_offset, 0, sf)) {
  457 + xref_offset = QUtil::string_to_ll(readToken(*m->file).getValue().c_str());
  458 + }
  459 +
  460 + try {
  461 + if (xref_offset == 0) { // an offset of 0 is treated as "startxref not found"
  462 + QTC::TC("qpdf", "QPDF can't find startxref");
  463 + throw damagedPDF("", 0, "can't find startxref");
  464 + }
  465 + try {
  466 + read_xref(xref_offset);
  467 + } catch (QPDFExc&) { // QPDFExc passes through unchanged; other exceptions are wrapped below
  468 + throw;
  469 + } catch (std::exception& e) {
  470 + throw damagedPDF("", 0, std::string("error reading xref: ") + e.what());
  471 + }
  472 + } catch (QPDFExc& e) {
  473 + if (m->attempt_recovery) {
  474 + reconstruct_xref(e, xref_offset > 0); // second argument: whether a startxref offset was found
  475 + QTC::TC("qpdf", "QPDF reconstructed xref table");
  476 + } else {
  477 + throw;
  478 + }
  479 + }
  480 +
  481 + initializeEncryption();
  482 + m->parsed = true;
  483 + if (m->xref_table.size() > 0 && !getRoot().getKey("/Pages").isDictionary()) {
  484 + // QPDFs created from JSON have an empty xref table and no root object yet.
  485 + throw damagedPDF("", 0, "unable to find page tree");
  486 + }
  487 +}
  488 +
  489 +void
  490 +QPDF::inParse(bool v)
  491 +{ // Set m->in_parse to v; asking for the value already in effect throws std::logic_error.
  492 + if (m->in_parse == v) {
  493 + // This happens if QPDFParser::parse tries to resolve an indirect object while it is
  494 + // parsing.
  495 + throw std::logic_error(
  496 + "QPDF: re-entrant parsing detected. This is a qpdf bug."
  497 + " Please report at https://github.com/qpdf/qpdf/issues.");
  498 + }
  499 + m->in_parse = v;
  500 +}
  501 +
  502 +void
  503 +QPDF::warn(QPDFExc const& e)
  504 +{ // Record e in m->warnings and, unless warnings are suppressed, write it to the log's warn stream.
  505 + if (m->max_warnings > 0 && m->warnings.size() >= m->max_warnings) { // max_warnings == 0: no limit
  506 + stopOnError("Too many warnings - file is too badly damaged");
  507 + }
  508 + m->warnings.push_back(e);
  509 + if (!m->suppress_warnings) {
  510 + *m->log->getWarn() << "WARNING: " << m->warnings.back().what() << "\n";
  511 + }
  512 +}
  513 +
  514 +void
  515 +QPDF::warn(
  516 + qpdf_error_code_e error_code,
  517 + std::string const& object,
  518 + qpdf_offset_t offset,
  519 + std::string const& message)
  520 +{ // Convenience overload: builds a QPDFExc from the arguments plus getFilename() and warns with it.
  521 + warn(QPDFExc(error_code, getFilename(), object, offset, message));
  522 +}
  523 +
  524 +void
  525 +QPDF::setTrailer(QPDFObjectHandle obj)
  526 +{ // Store obj as the trailer unless a trailer is already set, in which case obj is ignored.
  527 + if (m->trailer) {
  528 + return;
  529 + }
  530 + m->trailer = obj;
  531 +}
  532 +
  533 +void
  534 +QPDF::reconstruct_xref(QPDFExc& e, bool found_startxref)
  535 +{ // Rebuild the xref table by scanning the file for "N G obj", "trailer" and "startxref" tokens; e is the error that triggered recovery.
  536 + if (m->reconstructed_xref) {
  537 + // Avoid xref reconstruction infinite loops. This is getting very hard to reproduce because
  538 + // qpdf is throwing many fewer exceptions while parsing. Most situations are warnings now.
  539 + throw e;
  540 + }
  541 +
  542 + // If recovery generates more than 1000 warnings, the file is so severely damaged that there
  543 + // probably is no point trying to continue.
  544 + const auto max_warnings = m->warnings.size() + 1000U; // budget relative to warnings already recorded
  545 + auto check_warnings = [this, max_warnings]() {
  546 + if (m->warnings.size() > max_warnings) {
  547 + throw damagedPDF("", 0, "too many errors while reconstructing cross-reference table");
  548 + }
  549 + };
  550 +
  551 + m->reconstructed_xref = true;
  552 + // We may find more objects, which may contain dangling references.
  553 + m->fixed_dangling_refs = false;
  554 +
  555 + warn(damagedPDF("", 0, "file is damaged"));
  556 + warn(e);
  557 + warn(damagedPDF("", 0, "Attempting to reconstruct cross-reference table"));
  558 +
  559 + // Delete all references to type 1 (uncompressed) objects
  560 + std::vector<QPDFObjGen> to_delete; // collected first so the table is not modified while iterating
  561 + for (auto const& iter: m->xref_table) {
  562 + if (iter.second.getType() == 1) {
  563 + to_delete.emplace_back(iter.first);
  564 + }
  565 + }
  566 + for (auto const& iter: to_delete) {
  567 + m->xref_table.erase(iter);
  568 + }
  569 +
  570 + std::vector<std::tuple<int, int, qpdf_offset_t>> found_objects; // (object id, generation, offset)
  571 + std::vector<qpdf_offset_t> trailers; // file positions just after each "trailer" word
  572 + std::vector<qpdf_offset_t> startxrefs; // file positions just after each "startxref" word
  573 +
  574 + m->file->seek(0, SEEK_END);
  575 + qpdf_offset_t eof = m->file->tell();
  576 + m->file->seek(0, SEEK_SET);
  577 + // Don't allow very long tokens here during recovery. All the interesting tokens are covered.
  578 + static size_t const MAX_LEN = 10;
  579 + while (m->file->tell() < eof) {
  580 + QPDFTokenizer::Token t1 = readToken(*m->file, MAX_LEN);
  581 + qpdf_offset_t token_start = m->file->tell() - toO(t1.getValue().length()); // current position minus token length
  582 + if (t1.isInteger()) {
  583 + auto pos = m->file->tell();
  584 + auto t2 = readToken(*m->file, MAX_LEN);
  585 + if (t2.isInteger() && readToken(*m->file, MAX_LEN).isWord("obj")) {
  586 + int obj = QUtil::string_to_int(t1.getValue().c_str());
  587 + int gen = QUtil::string_to_int(t2.getValue().c_str());
  588 + if (obj <= m->xref_table_max_id) {
  589 + found_objects.emplace_back(obj, gen, token_start);
  590 + } else {
  591 + warn(damagedPDF(
  592 + "", 0, "ignoring object with impossibly large id " + std::to_string(obj)));
  593 + }
  594 + }
  595 + m->file->seek(pos, SEEK_SET); // go back to the position right after t1
  596 + } else if (!m->trailer && t1.isWord("trailer")) { // only collected while no trailer is set
  597 + trailers.emplace_back(m->file->tell());
  598 + } else if (!found_startxref && t1.isWord("startxref")) { // only needed if parse() found none
  599 + startxrefs.emplace_back(m->file->tell());
  600 + }
  601 + check_warnings();
  602 + m->file->findAndSkipNextEOL();
  603 + }
  604 +
  605 + if (!found_startxref && !startxrefs.empty() && !found_objects.empty() &&
  606 + startxrefs.back() > std::get<2>(found_objects.back())) { // last startxref lies after the last object found
  607 + try {
  608 + m->file->seek(startxrefs.back(), SEEK_SET);
  609 + if (auto offset = QUtil::string_to_ll(readToken(*m->file).getValue().data())) {
  610 + read_xref(offset);
  611 + if (getRoot().getKey("/Pages").isDictionary()) {
  612 + QTC::TC("qpdf", "QPDF startxref more than 1024 before end");
  613 + warn(
  614 + damagedPDF("", 0, "startxref was more than 1024 bytes before end of file"));
  615 + initializeEncryption();
  616 + m->parsed = true;
  617 + m->reconstructed_xref = false; // the regular xref read worked, so nothing was reconstructed
  618 + return;
  619 + }
  620 + }
  621 + } catch (...) {
  622 + // ok, bad luck. Do recovery.
  623 + }
  624 + }
  625 +
  626 + auto rend = found_objects.rend();
  627 + for (auto it = found_objects.rbegin(); it != rend; it++) { // from the last object found to the first
  628 + auto [obj, gen, token_start] = *it;
  629 + insertXrefEntry(obj, 1, token_start, gen);
  630 + check_warnings();
  631 + }
  632 + m->deleted_objects.clear();
  633 +
  634 + for (auto it = trailers.rbegin(); it != trailers.rend(); it++) { // try trailers from last to first
  635 + m->file->seek(*it, SEEK_SET);
  636 + auto t = readTrailer();
  637 + if (!t.isDictionary()) {
  638 + // Oh well. It was worth a try.
  639 + } else {
  640 + if (t.hasKey("/Root")) {
  641 + m->trailer = t;
  642 + break;
  643 + }
  644 + warn(damagedPDF("trailer", *it, "recovered trailer has no /Root entry"));
  645 + }
  646 + check_warnings();
  647 + }
  648 +
  649 + if (!m->trailer) {
  650 + qpdf_offset_t max_offset{0};
  651 + size_t max_size{0}; // NOTE(review): never assigned below, so it stays 0 - confirm whether it should track size
  652 + // If there are any xref streams, take the last one to appear.
  653 + for (auto const& iter: m->xref_table) {
  654 + auto entry = iter.second;
  655 + if (entry.getType() != 1) { // only type 1 (uncompressed) entries are examined
  656 + continue;
  657 + }
  658 + auto oh = getObject(iter.first);
  659 + try {
  660 + if (!oh.isStreamOfType("/XRef")) {
  661 + continue;
  662 + }
  663 + } catch (std::exception&) {
  664 + continue;
  665 + }
  666 + auto offset = entry.getOffset();
  667 + auto size = oh.getDict().getKey("/Size").getUIntValueAsUInt();
  668 + if (size > max_size || (size == max_size && offset > max_offset)) {
  669 + max_offset = offset;
  670 + setTrailer(oh.getDict()); // NOTE(review): setTrailer ignores the call once a trailer is set, so only the first candidate sticks
  671 + }
  672 + check_warnings();
  673 + }
  674 + if (max_offset > 0) {
  675 + try {
  676 + read_xref(max_offset);
  677 + } catch (std::exception&) {
  678 + warn(damagedPDF(
  679 + "", 0, "error decoding candidate xref stream while recovering damaged file"));
  680 + }
  681 + QTC::TC("qpdf", "QPDF recover xref stream");
  682 + }
  683 + }
  684 +
  685 + if (!m->trailer || (!m->parsed && !m->trailer.getKey("/Root").isDictionary())) {
  686 + // Try to find a Root dictionary. As a quick fix try the one with the highest object id.
  687 + QPDFObjectHandle root;
  688 + for (auto const& iter: m->obj_cache) {
  689 + try {
  690 + if (QPDFObjectHandle(iter.second.object).isDictionaryOfType("/Catalog")) {
  691 + root = iter.second.object; // later matches overwrite earlier ones
  692 + }
  693 + } catch (std::exception&) {
  694 + continue;
  695 + }
  696 + }
  697 + if (root) {
  698 + if (!m->trailer) {
  699 + warn(damagedPDF(
  700 + "", 0, "unable to find trailer dictionary while recovering damaged file"));
  701 + m->trailer = QPDFObjectHandle::newDictionary();
  702 + }
  703 + m->trailer.replaceKey("/Root", root);
  704 + }
  705 + }
  706 +
  707 + if (!m->trailer) {
  708 + // We could check the last encountered object to see if it was an xref stream. If so, we
  709 + // could try to get the trailer from there. This may make it possible to recover files with
  710 + // bad startxref pointers even when they have object streams.
  711 +
  712 + throw damagedPDF("", 0, "unable to find trailer dictionary while recovering damaged file");
  713 + }
  714 + if (m->xref_table.empty()) {
  715 + // We cannot check for an empty xref table in parse because empty tables are valid when
  716 + // creating QPDF objects from JSON.
  717 + throw damagedPDF("", 0, "unable to find objects while recovering damaged file");
  718 + }
  719 + check_warnings();
  720 + if (!m->parsed) {
  721 + m->parsed = true; // set before getAllPages(); reset below if no pages are found
  722 + getAllPages();
  723 + check_warnings();
  724 + if (m->all_pages.empty()) {
  725 + m->parsed = false;
  726 + throw damagedPDF("", 0, "unable to find any pages while recovering damaged file");
  727 + }
  728 + }
  729 + // We could iterate through the objects looking for streams and try to find objects inside of
  730 + // them, but it's probably not worth the trouble. Acrobat can't recover files with any errors
  731 + // in an xref stream, and this would be a real long shot anyway. If we wanted to do anything
  732 + // that involved looking at stream contents, we'd also have to call initializeEncryption() here.
  733 + // It's safe to call it more than once.
  734 +}
  735 +
  736 +void
  737 +QPDF::read_xref(qpdf_offset_t xref_offset)
  738 +{ // Follow the chain of xref tables/streams starting at xref_offset until a section returns 0, then check /Size.
  739 + std::map<int, int> free_table;
  740 + std::set<qpdf_offset_t> visited;
  741 + while (xref_offset) {
  742 + visited.insert(xref_offset); // remembered for the loop check at the end of this iteration
  743 + char buf[7]; // at most 6 bytes are read into it below, so the last byte stays '\0'
  744 + memset(buf, 0, sizeof(buf));
  745 + m->file->seek(xref_offset, SEEK_SET);
  746 + // Some files miss the mark a little with startxref. We could do a better job of searching
  747 + // in the neighborhood for something that looks like either an xref table or stream, but the
  748 + // simple heuristic of skipping whitespace can help with the xref table case and is harmless
  749 + // with the stream case.
  750 + bool done = false;
  751 + bool skipped_space = false;
  752 + while (!done) {
  753 + char ch;
  754 + if (1 == m->file->read(&ch, 1)) {
  755 + if (util::is_space(ch)) {
  756 + skipped_space = true;
  757 + } else {
  758 + m->file->unreadCh(ch);
  759 + done = true;
  760 + }
  761 + } else {
  762 + QTC::TC("qpdf", "QPDF eof skipping spaces before xref", skipped_space ? 0 : 1);
  763 + done = true;
  764 + }
  765 + }
  766 +
  767 + m->file->read(buf, sizeof(buf) - 1);
  768 + // The PDF spec says xref must be followed by a line terminator, but files exist in the wild
  769 + // where it is terminated by arbitrary whitespace.
  770 + if ((strncmp(buf, "xref", 4) == 0) && util::is_space(buf[4])) {
  771 + if (skipped_space) {
  772 + QTC::TC("qpdf", "QPDF xref skipped space");
  773 + warn(damagedPDF("", 0, "extraneous whitespace seen before xref"));
  774 + }
  775 + QTC::TC(
  776 + "qpdf",
  777 + "QPDF xref space",
  778 + ((buf[4] == '\n') ? 0
  779 + : (buf[4] == '\r') ? 1
  780 + : (buf[4] == ' ') ? 2
  781 + : 9999));
  782 + int skip = 4;
  783 + // buf is null-terminated, and util::is_space('\0') is false, so this won't overrun.
  784 + while (util::is_space(buf[skip])) {
  785 + ++skip;
  786 + }
  787 + xref_offset = read_xrefTable(xref_offset + skip); // yields the trailer's /Prev offset, or 0 if none
  788 + } else {
  789 + xref_offset = read_xrefStream(xref_offset); // anything not starting with "xref" is tried as a stream
  790 + }
  791 + if (visited.count(xref_offset) != 0) {
  792 + QTC::TC("qpdf", "QPDF xref loop");
  793 + throw damagedPDF("", 0, "loop detected following xref tables");
  794 + }
  795 + }
  796 +
  797 + if (!m->trailer) {
  798 + throw damagedPDF("", 0, "unable to find trailer while reading xref");
  799 + }
  800 + int size = m->trailer.getKey("/Size").getIntValueAsInt();
  801 + int max_obj = 0;
  802 + if (!m->xref_table.empty()) {
  803 + max_obj = m->xref_table.rbegin()->first.getObj();
  804 + }
  805 + if (!m->deleted_objects.empty()) {
  806 + max_obj = std::max(max_obj, *(m->deleted_objects.rbegin()));
  807 + }
  808 + if ((size < 1) || (size - 1 != max_obj)) { // a mismatch only produces a warning
  809 + QTC::TC("qpdf", "QPDF xref size mismatch");
  810 + warn(damagedPDF(
  811 + "",
  812 + 0,
  813 + ("reported number of objects (" + std::to_string(size) +
  814 + ") is not one plus the highest object number (" + std::to_string(max_obj) + ")")));
  815 + }
  816 +
  817 + // We no longer need the deleted_objects table, so go ahead and clear it out to make sure we
  818 + // never depend on its being set.
  819 + m->deleted_objects.clear();
  820 +
  821 + // Make sure we keep only the highest generation for any object.
  822 + QPDFObjGen last_og{-1, 0};
  823 + for (auto const& item: m->xref_table) {
  824 + auto id = item.first.getObj();
  825 + if (id == last_og.getObj() && id > 0) { // same object id as the previous entry: remove the previous one
  826 + removeObject(last_og);
  827 + }
  828 + last_og = item.first;
  829 + }
  830 +}
  831 +
  832 +bool
  833 +QPDF::parse_xrefFirst(std::string const& line, int& obj, int& num, int& bytes)
  834 +{ // Parse "<digits> <digits>" at the start of line; on success set obj, num and bytes (characters consumed).
  835 + // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated
  836 + // buffer.
  837 + char const* p = line.c_str();
  838 + char const* start = line.c_str();
  839 +
  840 + // Skip zero or more spaces
  841 + while (util::is_space(*p)) {
  842 + ++p;
  843 + }
  844 + // Require digit
  845 + if (!util::is_digit(*p)) {
  846 + return false;
  847 + }
  848 + // Gather digits
  849 + std::string obj_str;
  850 + while (util::is_digit(*p)) {
  851 + obj_str.append(1, *p++);
  852 + }
  853 + // Require space
  854 + if (!util::is_space(*p)) {
  855 + return false;
  856 + }
  857 + // Skip spaces
  858 + while (util::is_space(*p)) {
  859 + ++p;
  860 + }
  861 + // Require digit
  862 + if (!util::is_digit(*p)) {
  863 + return false;
  864 + }
  865 + // Gather digits
  866 + std::string num_str;
  867 + while (util::is_digit(*p)) {
  868 + num_str.append(1, *p++);
  869 + }
  870 + // Skip any space including line terminators
  871 + while (util::is_space(*p)) {
  872 + ++p;
  873 + }
  874 + bytes = toI(p - start); // counts the leading and trailing whitespace skipped above as well
  875 + obj = QUtil::string_to_int(obj_str.c_str());
  876 + num = QUtil::string_to_int(num_str.c_str());
  877 + return true; // the output parameters are only written on this success path
  878 +}
  879 +
  880 +bool
  881 +QPDF::read_bad_xrefEntry(qpdf_offset_t& f1, int& f2, char& type)
  882 +{ // Lenient re-parse of an xref entry: extra spaces and wrong field widths are accepted with a warning.
  883 + // Reposition after initial read attempt and reread.
  884 + m->file->seek(m->file->getLastOffset(), SEEK_SET);
  885 + auto line = m->file->readLine(30);
  886 +
  887 + // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated
  888 + // buffer.
  889 + char const* p = line.data();
  890 +
  891 + // Skip zero or more spaces. There aren't supposed to be any.
  892 + bool invalid = false; // set when the entry is accepted despite a formatting problem
  893 + while (util::is_space(*p)) {
  894 + ++p;
  895 + QTC::TC("qpdf", "QPDF ignore first space in xref entry");
  896 + invalid = true;
  897 + }
  898 + // Require digit
  899 + if (!util::is_digit(*p)) {
  900 + return false;
  901 + }
  902 + // Gather digits
  903 + std::string f1_str;
  904 + while (util::is_digit(*p)) {
  905 + f1_str.append(1, *p++);
  906 + }
  907 + // Require space
  908 + if (!util::is_space(*p)) {
  909 + return false;
  910 + }
  911 + if (util::is_space(*(p + 1))) { // more than one space between the first and second field
  912 + QTC::TC("qpdf", "QPDF ignore first extra space in xref entry");
  913 + invalid = true;
  914 + }
  915 + // Skip spaces
  916 + while (util::is_space(*p)) {
  917 + ++p;
  918 + }
  919 + // Require digit
  920 + if (!util::is_digit(*p)) {
  921 + return false;
  922 + }
  923 + // Gather digits
  924 + std::string f2_str;
  925 + while (util::is_digit(*p)) {
  926 + f2_str.append(1, *p++);
  927 + }
  928 + // Require space
  929 + if (!util::is_space(*p)) {
  930 + return false;
  931 + }
  932 + if (util::is_space(*(p + 1))) { // more than one space between the second field and the type
  933 + QTC::TC("qpdf", "QPDF ignore second extra space in xref entry");
  934 + invalid = true;
  935 + }
  936 + // Skip spaces
  937 + while (util::is_space(*p)) {
  938 + ++p;
  939 + }
  940 + if ((*p == 'f') || (*p == 'n')) {
  941 + type = *p;
  942 + } else {
  943 + return false;
  944 + }
  945 + if ((f1_str.length() != 10) || (f2_str.length() != 5)) { // read_xrefEntry's strict path needs 10 and 5 digits
  946 + QTC::TC("qpdf", "QPDF ignore length error xref entry");
  947 + invalid = true;
  948 + }
  949 +
  950 + if (invalid) {
  951 + warn(damagedPDF("xref table", "accepting invalid xref table entry"));
  952 + }
  953 +
  954 + f1 = QUtil::string_to_ll(f1_str.c_str()); // assigned outright, replacing any partial value from the caller
  955 + f2 = QUtil::string_to_int(f2_str.c_str());
  956 +
  957 + return true;
  958 +}
  959 +
  960 +// Optimistically read and parse xref entry. If entry is bad, call read_bad_xrefEntry and return
  961 +// result. f1 and f2 are accumulated into, so callers pass them in as 0 (as read_xrefTable does).
  962 +bool
  963 +QPDF::read_xrefEntry(qpdf_offset_t& f1, int& f2, char& type)
  964 +{
  965 + std::array<char, 21> line; // 20 bytes of entry data plus a terminating '\0'
  966 + if (m->file->read(line.data(), 20) != 20) {
  967 + // C++20: [[unlikely]]
  968 + return false;
  969 + }
  970 + line[20] = '\0';
  971 + char const* p = line.data();
  972 +
  973 + int f1_len = 0;
  974 + int f2_len = 0;
  975 +
  976 + // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated
  977 + // buffer.
  978 +
  979 + // Gather f1 digits. NB No risk of overflow as 9'999'999'999 < max long long.
  980 + while (*p == '0') { // leading zeros count towards the length but need no arithmetic
  981 + ++f1_len;
  982 + ++p;
  983 + }
  984 + while (util::is_digit(*p) && f1_len++ < 10) {
  985 + f1 *= 10;
  986 + f1 += *p++ - '0';
  987 + }
  988 + // Require space
  989 + if (!util::is_space(*p++)) {
  990 + // Entry doesn't start with space or digit.
  991 + // C++20: [[unlikely]]
  992 + return false;
  993 + }
  994 + // Gather digits. NB No risk of overflow as 99'999 < max int.
  995 + while (*p == '0') {
  996 + ++f2_len;
  997 + ++p;
  998 + }
  999 + while (util::is_digit(*p) && f2_len++ < 5) {
  1000 + f2 *= 10;
  1001 + f2 += static_cast<int>(*p++ - '0');
  1002 + }
  1003 + if (util::is_space(*p++) && (*p == 'f' || *p == 'n')) {
  1004 + // C++20: [[likely]]
  1005 + type = *p;
  1006 + // No test for valid line[19].
  1007 + if (*(++p) && *(++p) && (*p == '\n' || *p == '\r') && f1_len == 10 && f2_len == 5) { // char after type: any non-zero; next: CR or LF
  1008 + // C++20: [[likely]]
  1009 + return true;
  1010 + }
  1011 + }
  1012 + return read_bad_xrefEntry(f1, f2, type); // anything not strictly well-formed gets the lenient parser
  1013 +}
  1014 +
  1015 +// Read a single cross-reference table section and associated trailer. Returns the trailer's /Prev offset, or 0 if it has none.
  1016 +qpdf_offset_t
  1017 +QPDF::read_xrefTable(qpdf_offset_t xref_offset)
  1018 +{
  1019 + m->file->seek(xref_offset, SEEK_SET);
  1020 + std::string line;
  1021 + while (true) { // one iteration per subsection; ends when the "trailer" keyword follows
  1022 + line.assign(50, '\0'); // fresh zero-filled 50-byte buffer for each subsection header
  1023 + m->file->read(line.data(), line.size());
  1024 + int obj = 0;
  1025 + int num = 0;
  1026 + int bytes = 0;
  1027 + if (!parse_xrefFirst(line, obj, num, bytes)) {
  1028 + QTC::TC("qpdf", "QPDF invalid xref");
  1029 + throw damagedPDF("xref table", "xref syntax invalid");
  1030 + }
  1031 + m->file->seek(m->file->getLastOffset() + bytes, SEEK_SET); // presumably getLastOffset() is where the read above began - confirm
  1032 + for (qpdf_offset_t i = obj; i - num < obj; ++i) { // i.e. i < obj + num, evaluated in qpdf_offset_t
  1033 + if (i == 0) {
  1034 + // This is needed by checkLinearization()
  1035 + m->first_xref_item_offset = m->file->tell();
  1036 + }
  1037 + // For xref_table, these will always be small enough to be ints
  1038 + qpdf_offset_t f1 = 0;
  1039 + int f2 = 0;
  1040 + char type = '\0';
  1041 + if (!read_xrefEntry(f1, f2, type)) {
  1042 + QTC::TC("qpdf", "QPDF invalid xref entry");
  1043 + throw damagedPDF(
  1044 + "xref table", "invalid xref entry (obj=" + std::to_string(i) + ")");
  1045 + }
  1046 + if (type == 'f') {
  1047 + insertFreeXrefEntry(QPDFObjGen(toI(i), f2));
  1048 + } else {
  1049 + insertXrefEntry(toI(i), 1, f1, f2);
  1050 + }
  1051 + }
  1052 + qpdf_offset_t pos = m->file->tell();
  1053 + if (readToken(*m->file).isWord("trailer")) {
  1054 + break;
  1055 + } else {
  1056 + m->file->seek(pos, SEEK_SET); // not the trailer: rewind and parse another subsection header
  1057 + }
  1058 + }
  1059 +
  1060 + // Set offset to previous xref table if any
  1061 + QPDFObjectHandle cur_trailer = readTrailer();
  1062 + if (!cur_trailer.isDictionary()) {
  1063 + QTC::TC("qpdf", "QPDF missing trailer");
  1064 + throw damagedPDF("", "expected trailer dictionary");
  1065 + }
  1066 +
  1067 + if (!m->trailer) { // /Size is only validated for the first trailer that gets stored
  1068 + setTrailer(cur_trailer);
  1069 +
  1070 + if (!m->trailer.hasKey("/Size")) {
  1071 + QTC::TC("qpdf", "QPDF trailer lacks size");
  1072 + throw damagedPDF("trailer", "trailer dictionary lacks /Size key");
  1073 + }
  1074 + if (!m->trailer.getKey("/Size").isInteger()) {
  1075 + QTC::TC("qpdf", "QPDF trailer size not integer");
  1076 + throw damagedPDF("trailer", "/Size key in trailer dictionary is not an integer");
  1077 + }
  1078 + }
  1079 +
  1080 + if (cur_trailer.hasKey("/XRefStm")) {
  1081 + if (m->ignore_xref_streams) {
  1082 + QTC::TC("qpdf", "QPDF ignoring XRefStm in trailer");
  1083 + } else {
  1084 + if (cur_trailer.getKey("/XRefStm").isInteger()) {
  1085 + // Read the xref stream but disregard any return value -- we'll use our trailer's
  1086 + // /Prev key instead of the xref stream's.
  1087 + (void)read_xrefStream(cur_trailer.getKey("/XRefStm").getIntValue());
  1088 + } else {
  1089 + throw damagedPDF("xref stream", xref_offset, "invalid /XRefStm");
  1090 + }
  1091 + }
  1092 + }
  1093 +
  1094 + if (cur_trailer.hasKey("/Prev")) {
  1095 + if (!cur_trailer.getKey("/Prev").isInteger()) {
  1096 + QTC::TC("qpdf", "QPDF trailer prev not integer");
  1097 + throw damagedPDF("trailer", "/Prev key in trailer dictionary is not an integer");
  1098 + }
  1099 + QTC::TC("qpdf", "QPDF prev key in trailer dictionary");
  1100 + return cur_trailer.getKey("/Prev").getIntValue();
  1101 + }
  1102 +
  1103 + return 0;
  1104 +}
  1105 +
  1106 +// Read a single cross-reference stream. Throws if xref streams are ignored or no /XRef stream is found at xref_offset.
  1107 +qpdf_offset_t
  1108 +QPDF::read_xrefStream(qpdf_offset_t xref_offset)
  1109 +{
  1110 + if (!m->ignore_xref_streams) {
  1111 + QPDFObjGen x_og;
  1112 + QPDFObjectHandle xref_obj;
  1113 + try {
  1114 + xref_obj =
  1115 + readObjectAtOffset(false, xref_offset, "xref stream", QPDFObjGen(0, 0), x_og, true);
  1116 + } catch (QPDFExc&) {
  1117 + // ignore -- report error below
  1118 + }
  1119 + if (xref_obj.isStreamOfType("/XRef")) {
  1120 + QTC::TC("qpdf", "QPDF found xref stream");
  1121 + return processXRefStream(xref_offset, xref_obj); // result is passed straight back to the caller
  1122 + }
  1123 + }
  1124 +
  1125 + QTC::TC("qpdf", "QPDF can't find xref");
  1126 + throw damagedPDF("", xref_offset, "xref not found");
  1127 + return 0; // unreachable
  1128 +}
  1129 +
  1130 +// Return the entry size of the xref stream and the processed W array. Throws damaged(...) when /W is malformed.
  1131 +std::pair<int, std::array<int, 3>>
  1132 +QPDF::processXRefW(QPDFObjectHandle& dict, std::function<QPDFExc(std::string_view)> damaged)
  1133 +{
  1134 + auto W_obj = dict.getKey("/W");
  1135 + if (!(W_obj.isArray() && (W_obj.getArrayNItems() >= 3) && W_obj.getArrayItem(0).isInteger() && // items past the third are not examined
  1136 + W_obj.getArrayItem(1).isInteger() && W_obj.getArrayItem(2).isInteger())) {
  1137 + throw damaged("Cross-reference stream does not have a proper /W key");
  1138 + }
  1139 +
  1140 + std::array<int, 3> W;
  1141 + int entry_size = 0; // sum of the three field widths
  1142 + auto w_vector = W_obj.getArrayAsVector();
  1143 + int max_bytes = sizeof(qpdf_offset_t); // upper bound applied to each field width
  1144 + for (size_t i = 0; i < 3; ++i) {
  1145 + W[i] = w_vector[i].getIntValueAsInt();
  1146 + if (W[i] > max_bytes) {
  1147 + throw damaged("Cross-reference stream's /W contains impossibly large values");
  1148 + }
  1149 + if (W[i] < 0) {
  1150 + throw damaged("Cross-reference stream's /W contains negative values");
  1151 + }
  1152 + entry_size += W[i];
  1153 + }
  1154 + if (entry_size == 0) {
  1155 + throw damaged("Cross-reference stream's /W indicates entry size of 0");
  1156 + }
  1157 + return {entry_size, W};
  1158 +}
  1159 +
  1160 +// Validate Size key and return the maximum number of entries that the xref stream can contain.
  1161 +int
  1162 +QPDF::processXRefSize(
  1163 + QPDFObjectHandle& dict, int entry_size, std::function<QPDFExc(std::string_view)> damaged)
  1164 +{
  1165 + // Number of entries is limited by the highest possible object id and stream size.
  1166 + auto max_num_entries = std::numeric_limits<int>::max();
  1167 + if (max_num_entries > (std::numeric_limits<qpdf_offset_t>::max() / entry_size)) { // callers pass entry_size > 0 (see processXRefW)
  1168 + max_num_entries = toI(std::numeric_limits<qpdf_offset_t>::max() / entry_size);
  1169 + }
  1170 +
  1171 + auto Size_obj = dict.getKey("/Size"); // NOTE(review): Size_obj is not used below; /Size is fetched again on the next statement
  1172 + long long size;
  1173 + if (!dict.getKey("/Size").getValueAsInt(size)) {
  1174 + throw damaged("Cross-reference stream does not have a proper /Size key");
  1175 + } else if (size < 0) {
  1176 + throw damaged("Cross-reference stream has a negative /Size key");
  1177 + } else if (size >= max_num_entries) {
  1178 + throw damaged("Cross-reference stream has an impossibly large /Size key");
  1179 + }
  1180 + // We are not validating that Size <= (Size key of parent xref / trailer).
  1181 + return max_num_entries; // the computed limit is returned, not the /Size value itself
  1182 +}
  1183 +
  1184 +// Return the number of entries of the xref stream and the processed Index array as (first object id, count) pairs.
  1185 +std::pair<int, std::vector<std::pair<int, int>>>
  1186 +QPDF::processXRefIndex(
  1187 + QPDFObjectHandle& dict, int max_num_entries, std::function<QPDFExc(std::string_view)> damaged)
  1188 +{
  1189 + auto size = dict.getKey("/Size").getIntValueAsInt();
  1190 + auto Index_obj = dict.getKey("/Index");
  1191 +
  1192 + if (Index_obj.isArray()) {
  1193 + std::vector<std::pair<int, int>> indx;
  1194 + int num_entries = 0; // running total of the counts accepted so far
  1195 + auto index_vec = Index_obj.getArrayAsVector();
  1196 + if ((index_vec.size() % 2) || index_vec.size() < 2) {
  1197 + throw damaged("Cross-reference stream's /Index has an invalid number of values");
  1198 + }
  1199 +
  1200 + int i = 0; // position within /Index: even = first object id, odd = entry count
  1201 + long long first = 0;
  1202 + for (auto& val: index_vec) {
  1203 + if (val.isInteger()) {
  1204 + if (i % 2) {
  1205 + auto count = val.getIntValue();
  1206 + if (count <= 0) {
  1207 + throw damaged(
  1208 + "Cross-reference stream section claims to contain " +
  1209 + std::to_string(count) + " entries");
  1210 + }
  1211 + // We are guarding against the possibility of num_entries * entry_size
  1212 + // overflowing. We are not checking that entries are in ascending order as
  1213 + // required by the spec, which probably should generate a warning. We are also
  1214 + // not checking that for each subsection first object number + number of entries
  1215 + // <= /Size. The spec requires us to ignore object number > /Size.
  1216 + if (first > (max_num_entries - count) ||
  1217 + count > (max_num_entries - num_entries)) {
  1218 + throw damaged(
  1219 + "Cross-reference stream claims to contain too many entries: " +
  1220 + std::to_string(first) + " " + std::to_string(max_num_entries) + " " +
  1221 + std::to_string(num_entries));
  1222 + }
  1223 + indx.emplace_back(static_cast<int>(first), static_cast<int>(count)); // both were range-checked against max_num_entries above
  1224 + num_entries += static_cast<int>(count);
  1225 + } else {
  1226 + first = val.getIntValue();
  1227 + if (first < 0) {
  1228 + throw damaged(
  1229 + "Cross-reference stream's /Index contains a negative object id");
  1230 + } else if (first > max_num_entries) {
  1231 + throw damaged(
  1232 + "Cross-reference stream's /Index contains an impossibly "
  1233 + "large object id");
  1234 + }
  1235 + }
  1236 + } else {
  1237 + throw damaged(
  1238 + "Cross-reference stream's /Index's item " + std::to_string(i) +
  1239 + " is not an integer");
  1240 + }
  1241 + i++;
  1242 + }
  1243 + QTC::TC("qpdf", "QPDF xref /Index is array", index_vec.size() == 2 ? 0 : 1);
  1244 + return {num_entries, indx};
  1245 + } else if (Index_obj.isNull()) {
  1246 + QTC::TC("qpdf", "QPDF xref /Index is null");
  1247 + return {size, {{0, size}}}; // no /Index: one subsection starting at object 0 with size entries
  1248 + } else {
  1249 + throw damaged("Cross-reference stream does not have a proper /Index key");
  1250 + }
  1251 +}
  1252 +
  1253 +qpdf_offset_t
  1254 +QPDF::processXRefStream(qpdf_offset_t xref_offset, QPDFObjectHandle& xref_obj)
  1255 +{
  1256 + auto damaged = [this, xref_offset](std::string_view msg) -> QPDFExc {
  1257 + return damagedPDF("xref stream", xref_offset, msg.data());
  1258 + };
  1259 +
  1260 + auto dict = xref_obj.getDict();
  1261 +
  1262 + auto [entry_size, W] = processXRefW(dict, damaged);
  1263 + int max_num_entries = processXRefSize(dict, entry_size, damaged);
  1264 + auto [num_entries, indx] = processXRefIndex(dict, max_num_entries, damaged);
  1265 +
  1266 + std::shared_ptr<Buffer> bp = xref_obj.getStreamData(qpdf_dl_specialized);
  1267 + size_t actual_size = bp->getSize();
  1268 + auto expected_size = toS(entry_size) * toS(num_entries);
  1269 +
  1270 + if (expected_size != actual_size) {
  1271 + QPDFExc x = damaged(
  1272 + "Cross-reference stream data has the wrong size; expected = " +
  1273 + std::to_string(expected_size) + "; actual = " + std::to_string(actual_size));
  1274 + if (expected_size > actual_size) {
  1275 + throw x;
  1276 + } else {
  1277 + warn(x);
  1278 + }
  1279 + }
  1280 +
  1281 + bool saw_first_compressed_object = false;
  1282 +
  1283 + // Actual size vs. expected size check above ensures that we will not overflow any buffers here.
  1284 + // We know that entry_size * num_entries is less or equal to the size of the buffer.
  1285 + auto p = bp->getBuffer();
  1286 + for (auto [obj, sec_entries]: indx) {
  1287 + // Process a subsection.
  1288 + for (int i = 0; i < sec_entries; ++i) {
  1289 + // Read this entry
  1290 + std::array<qpdf_offset_t, 3> fields{};
  1291 + if (W[0] == 0) {
  1292 + QTC::TC("qpdf", "QPDF default for xref stream field 0");
  1293 + fields[0] = 1;
  1294 + }
  1295 + for (size_t j = 0; j < 3; ++j) {
  1296 + for (int k = 0; k < W[j]; ++k) {
  1297 + fields[j] <<= 8;
  1298 + fields[j] |= *p++;
  1299 + }
  1300 + }
  1301 +
  1302 + // Get the generation number. The generation number is 0 unless this is an uncompressed
  1303 + // object record, in which case the generation number appears as the third field.
  1304 + if (saw_first_compressed_object) {
  1305 + if (fields[0] != 2) {
  1306 + m->uncompressed_after_compressed = true;
  1307 + }
  1308 + } else if (fields[0] == 2) {
  1309 + saw_first_compressed_object = true;
  1310 + }
  1311 + if (obj == 0) {
  1312 + // This is needed by checkLinearization()
  1313 + m->first_xref_item_offset = xref_offset;
  1314 + } else if (fields[0] == 0) {
  1315 + // Ignore fields[2], which we don't care about in this case. This works around the
  1316 + // issue of some PDF files that put invalid values, like -1, here for deleted
  1317 + // objects.
  1318 + insertFreeXrefEntry(QPDFObjGen(obj, 0));
  1319 + } else {
  1320 + insertXrefEntry(obj, toI(fields[0]), fields[1], toI(fields[2]));
  1321 + }
  1322 + ++obj;
  1323 + }
  1324 + }
  1325 +
  1326 + if (!m->trailer) {
  1327 + setTrailer(dict);
  1328 + }
  1329 +
  1330 + if (dict.hasKey("/Prev")) {
  1331 + if (!dict.getKey("/Prev").isInteger()) {
  1332 + throw damagedPDF(
  1333 + "xref stream", "/Prev key in xref stream dictionary is not an integer");
  1334 + }
  1335 + QTC::TC("qpdf", "QPDF prev key in xref stream dictionary");
  1336 + return dict.getKey("/Prev").getIntValue();
  1337 + } else {
  1338 + return 0;
  1339 + }
  1340 +}
  1341 +
  1342 +void
  1343 +QPDF::insertXrefEntry(int obj, int f0, qpdf_offset_t f1, int f2)
  1344 +{
  1345 + // Populate the xref table in such a way that the first reference to an object that we see,
  1346 + // which is the one in the latest xref table in which it appears, is the one that gets stored.
  1347 + // This works because we are reading more recent appends before older ones.
  1348 +
  1349 + // If there is already an entry for this object and generation in the table, it means that a
  1350 + // later xref table has registered this object. Disregard this one.
  1351 + int new_gen = f0 == 2 ? 0 : f2;
  1352 +
  1353 + if (!(obj > 0 && obj <= m->xref_table_max_id && 0 <= f2 && new_gen < 65535)) {
  1354 + // We are ignoring invalid objgens. Most will arrive here from xref reconstruction. There
  1355 + // is probably no point having another warning but we could count invalid items in order to
  1356 + // decide when to give up.
  1357 + QTC::TC("qpdf", "QPDF xref overwrite invalid objgen");
  1358 + // ignore impossibly large object ids or object ids > Size.
  1359 + return;
  1360 + }
  1361 +
  1362 + if (m->deleted_objects.count(obj)) {
  1363 + QTC::TC("qpdf", "QPDF xref deleted object");
  1364 + return;
  1365 + }
  1366 +
  1367 + if (f0 == 2 && static_cast<int>(f1) == obj) {
  1368 + warn(damagedPDF("xref stream", "self-referential object stream " + std::to_string(obj)));
  1369 + return;
  1370 + }
  1371 +
  1372 + auto [iter, created] = m->xref_table.try_emplace(QPDFObjGen(obj, (f0 == 2 ? 0 : f2)));
  1373 + if (!created) {
  1374 + QTC::TC("qpdf", "QPDF xref reused object");
  1375 + return;
  1376 + }
  1377 +
  1378 + switch (f0) {
  1379 + case 1:
  1380 + // f2 is generation
  1381 + QTC::TC("qpdf", "QPDF xref gen > 0", ((f2 > 0) ? 1 : 0));
  1382 + iter->second = QPDFXRefEntry(f1);
  1383 + break;
  1384 +
  1385 + case 2:
  1386 + iter->second = QPDFXRefEntry(toI(f1), f2);
  1387 + break;
  1388 +
  1389 + default:
  1390 + throw damagedPDF("xref stream", "unknown xref stream entry type " + std::to_string(f0));
  1391 + break;
  1392 + }
  1393 +}
  1394 +
  1395 +void
  1396 +QPDF::insertFreeXrefEntry(QPDFObjGen og)
  1397 +{
  1398 + if (!m->xref_table.count(og)) {
  1399 + m->deleted_objects.insert(og.getObj());
  1400 + }
  1401 +}
  1402 +
  1403 +void
  1404 +QPDF::showXRefTable()
  1405 +{
  1406 + auto& cout = *m->log->getInfo();
  1407 + for (auto const& iter: m->xref_table) {
  1408 + QPDFObjGen const& og = iter.first;
  1409 + QPDFXRefEntry const& entry = iter.second;
  1410 + cout << og.unparse('/') << ": ";
  1411 + switch (entry.getType()) {
  1412 + case 1:
  1413 + cout << "uncompressed; offset = " << entry.getOffset();
  1414 + break;
  1415 +
  1416 + case 2:
  1417 + *m->log->getInfo() << "compressed; stream = " << entry.getObjStreamNumber()
  1418 + << ", index = " << entry.getObjStreamIndex();
  1419 + break;
  1420 +
  1421 + default:
  1422 + throw std::logic_error("unknown cross-reference table type while showing xref_table");
  1423 + break;
  1424 + }
  1425 + m->log->info("\n");
  1426 + }
  1427 +}
  1428 +
  1429 +// Resolve all objects in the xref table. If this triggers a xref table reconstruction abort and
  1430 +// return false. Otherwise return true.
  1431 +bool
  1432 +QPDF::resolveXRefTable()
  1433 +{
  1434 + bool may_change = !m->reconstructed_xref;
  1435 + for (auto& iter: m->xref_table) {
  1436 + if (isUnresolved(iter.first)) {
  1437 + resolve(iter.first);
  1438 + if (may_change && m->reconstructed_xref) {
  1439 + return false;
  1440 + }
  1441 + }
  1442 + }
  1443 + return true;
  1444 +}
  1445 +
  1446 +// Ensure all objects in the pdf file, including those in indirect references, appear in the object
  1447 +// cache.
  1448 +void
  1449 +QPDF::fixDanglingReferences(bool force)
  1450 +{
  1451 + if (m->fixed_dangling_refs) {
  1452 + return;
  1453 + }
  1454 + if (!resolveXRefTable()) {
  1455 + QTC::TC("qpdf", "QPDF fix dangling triggered xref reconstruction");
  1456 + resolveXRefTable();
  1457 + }
  1458 + m->fixed_dangling_refs = true;
  1459 +}
  1460 +
  1461 +size_t
  1462 +QPDF::getObjectCount()
  1463 +{
  1464 + // This method returns the next available indirect object number. makeIndirectObject uses it for
  1465 + // this purpose. After fixDanglingReferences is called, all objects in the xref table will also
  1466 + // be in obj_cache.
  1467 + fixDanglingReferences();
  1468 + QPDFObjGen og;
  1469 + if (!m->obj_cache.empty()) {
  1470 + og = (*(m->obj_cache.rbegin())).first;
  1471 + }
  1472 + return toS(og.getObj());
  1473 +}
  1474 +
  1475 +std::vector<QPDFObjectHandle>
  1476 +QPDF::getAllObjects()
  1477 +{
  1478 + // After fixDanglingReferences is called, all objects are in the object cache.
  1479 + fixDanglingReferences();
  1480 + std::vector<QPDFObjectHandle> result;
  1481 + for (auto const& iter: m->obj_cache) {
  1482 + result.push_back(newIndirect(iter.first, iter.second.object));
  1483 + }
  1484 + return result;
  1485 +}
  1486 +
  1487 +void
  1488 +QPDF::setLastObjectDescription(std::string const& description, QPDFObjGen og)
  1489 +{
  1490 + m->last_object_description.clear();
  1491 + if (!description.empty()) {
  1492 + m->last_object_description += description;
  1493 + if (og.isIndirect()) {
  1494 + m->last_object_description += ": ";
  1495 + }
  1496 + }
  1497 + if (og.isIndirect()) {
  1498 + m->last_object_description += "object " + og.unparse(' ');
  1499 + }
  1500 +}
  1501 +
  1502 +QPDFObjectHandle
  1503 +QPDF::readTrailer()
  1504 +{
  1505 + qpdf_offset_t offset = m->file->tell();
  1506 + bool empty = false;
  1507 + auto object =
  1508 + QPDFParser(*m->file, "trailer", m->tokenizer, nullptr, this, true).parse(empty, false);
  1509 + if (empty) {
  1510 + // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in
  1511 + // actual PDF files and Adobe Reader appears to ignore them.
  1512 + warn(damagedPDF("trailer", "empty object treated as null"));
  1513 + } else if (object.isDictionary() && readToken(*m->file).isWord("stream")) {
  1514 + warn(damagedPDF("trailer", m->file->tell(), "stream keyword found in trailer"));
  1515 + }
  1516 + // Override last_offset so that it points to the beginning of the object we just read
  1517 + m->file->setLastOffset(offset);
  1518 + return object;
  1519 +}
  1520 +
  1521 +QPDFObjectHandle
  1522 +QPDF::readObject(std::string const& description, QPDFObjGen og)
  1523 +{
  1524 + setLastObjectDescription(description, og);
  1525 + qpdf_offset_t offset = m->file->tell();
  1526 + bool empty = false;
  1527 +
  1528 + StringDecrypter decrypter{this, og};
  1529 + StringDecrypter* decrypter_ptr = m->encp->encrypted ? &decrypter : nullptr;
  1530 + auto object =
  1531 + QPDFParser(*m->file, m->last_object_description, m->tokenizer, decrypter_ptr, this, true)
  1532 + .parse(empty, false);
  1533 + if (empty) {
  1534 + // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in
  1535 + // actual PDF files and Adobe Reader appears to ignore them.
  1536 + warn(damagedPDF(*m->file, m->file->getLastOffset(), "empty object treated as null"));
  1537 + return object;
  1538 + }
  1539 + auto token = readToken(*m->file);
  1540 + if (object.isDictionary() && token.isWord("stream")) {
  1541 + readStream(object, og, offset);
  1542 + token = readToken(*m->file);
  1543 + }
  1544 + if (!token.isWord("endobj")) {
  1545 + QTC::TC("qpdf", "QPDF err expected endobj");
  1546 + warn(damagedPDF("expected endobj"));
  1547 + }
  1548 + return object;
  1549 +}
  1550 +
  1551 +// After reading stream dictionary and stream keyword, read rest of stream.
416 1552 void
417   -QPDF::warn(QPDFExc const& e)
  1553 +QPDF::readStream(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset)
418 1554 {
419   - if (m->max_warnings > 0 && m->warnings.size() >= m->max_warnings) {
420   - stopOnError("Too many warnings - file is too badly damaged");
421   - }
422   - m->warnings.push_back(e);
423   - if (!m->suppress_warnings) {
424   - *m->log->getWarn() << "WARNING: " << m->warnings.back().what() << "\n";
  1555 + validateStreamLineEnd(object, og, offset);
  1556 +
  1557 + // Must get offset before accessing any additional objects since resolving a previously
  1558 + // unresolved indirect object will change file position.
  1559 + qpdf_offset_t stream_offset = m->file->tell();
  1560 + size_t length = 0;
  1561 +
  1562 + try {
  1563 + auto length_obj = object.getKey("/Length");
  1564 +
  1565 + if (!length_obj.isInteger()) {
  1566 + if (length_obj.isNull()) {
  1567 + QTC::TC("qpdf", "QPDF stream without length");
  1568 + throw damagedPDF(offset, "stream dictionary lacks /Length key");
  1569 + }
  1570 + QTC::TC("qpdf", "QPDF stream length not integer");
  1571 + throw damagedPDF(offset, "/Length key in stream dictionary is not an integer");
  1572 + }
  1573 +
  1574 + length = toS(length_obj.getUIntValue());
  1575 + // Seek in two steps to avoid potential integer overflow
  1576 + m->file->seek(stream_offset, SEEK_SET);
  1577 + m->file->seek(toO(length), SEEK_CUR);
  1578 + if (!readToken(*m->file).isWord("endstream")) {
  1579 + QTC::TC("qpdf", "QPDF missing endstream");
  1580 + throw damagedPDF("expected endstream");
  1581 + }
  1582 + } catch (QPDFExc& e) {
  1583 + if (m->attempt_recovery) {
  1584 + warn(e);
  1585 + length = recoverStreamLength(m->file, og, stream_offset);
  1586 + } else {
  1587 + throw;
  1588 + }
425 1589 }
  1590 + object = QPDFObjectHandle(qpdf::Stream(*this, og, object, stream_offset, length));
426 1591 }
427 1592  
428 1593 void
429   -QPDF::warn(
430   - qpdf_error_code_e error_code,
431   - std::string const& object,
  1594 +QPDF::validateStreamLineEnd(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset)
  1595 +{
  1596 + // The PDF specification states that the word "stream" should be followed by either a carriage
  1597 + // return and a newline or by a newline alone. It specifically disallowed following it by a
  1598 + // carriage return alone since, in that case, there would be no way to tell whether the NL in a
  1599 + // CR NL sequence was part of the stream data. However, some readers, including Adobe reader,
  1600 + // accept a carriage return by itself when followed by a non-newline character, so that's what
  1601 + // we do here. We have also seen files that have extraneous whitespace between the stream
  1602 + // keyword and the newline.
  1603 + while (true) {
  1604 + char ch;
  1605 + if (m->file->read(&ch, 1) == 0) {
  1606 + // A premature EOF here will result in some other problem that will get reported at
  1607 + // another time.
  1608 + return;
  1609 + }
  1610 + if (ch == '\n') {
  1611 + // ready to read stream data
  1612 + QTC::TC("qpdf", "QPDF stream with NL only");
  1613 + return;
  1614 + }
  1615 + if (ch == '\r') {
  1616 + // Read another character
  1617 + if (m->file->read(&ch, 1) != 0) {
  1618 + if (ch == '\n') {
  1619 + // Ready to read stream data
  1620 + QTC::TC("qpdf", "QPDF stream with CRNL");
  1621 + } else {
  1622 + // Treat the \r by itself as the whitespace after endstream and start reading
  1623 + // stream data in spite of not having seen a newline.
  1624 + QTC::TC("qpdf", "QPDF stream with CR only");
  1625 + m->file->unreadCh(ch);
  1626 + warn(damagedPDF(
  1627 + m->file->tell(), "stream keyword followed by carriage return only"));
  1628 + }
  1629 + }
  1630 + return;
  1631 + }
  1632 + if (!util::is_space(ch)) {
  1633 + QTC::TC("qpdf", "QPDF stream without newline");
  1634 + m->file->unreadCh(ch);
  1635 + warn(damagedPDF(
  1636 + m->file->tell(), "stream keyword not followed by proper line terminator"));
  1637 + return;
  1638 + }
  1639 + warn(damagedPDF(m->file->tell(), "stream keyword followed by extraneous whitespace"));
  1640 + }
  1641 +}
  1642 +
  1643 +QPDFObjectHandle
  1644 +QPDF::readObjectInStream(std::shared_ptr<InputSource>& input, int obj)
  1645 +{
  1646 + m->last_object_description.erase(7); // last_object_description starts with "object "
  1647 + m->last_object_description += std::to_string(obj);
  1648 + m->last_object_description += " 0";
  1649 +
  1650 + bool empty = false;
  1651 + auto object = QPDFParser(*input, m->last_object_description, m->tokenizer, nullptr, this, true)
  1652 + .parse(empty, false);
  1653 + if (empty) {
  1654 + // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in
  1655 + // actual PDF files and Adobe Reader appears to ignore them.
  1656 + warn(damagedPDF(*input, input->getLastOffset(), "empty object treated as null"));
  1657 + }
  1658 + return object;
  1659 +}
  1660 +
  1661 +bool
  1662 +QPDF::findEndstream()
  1663 +{
  1664 + // Find endstream or endobj. Position the input at that token.
  1665 + auto t = readToken(*m->file, 20);
  1666 + if (t.isWord("endobj") || t.isWord("endstream")) {
  1667 + m->file->seek(m->file->getLastOffset(), SEEK_SET);
  1668 + return true;
  1669 + }
  1670 + return false;
  1671 +}
  1672 +
  1673 +size_t
  1674 +QPDF::recoverStreamLength(
  1675 + std::shared_ptr<InputSource> input, QPDFObjGen og, qpdf_offset_t stream_offset)
  1676 +{
  1677 + // Try to reconstruct stream length by looking for endstream or endobj
  1678 + warn(damagedPDF(*input, stream_offset, "attempting to recover stream length"));
  1679 +
  1680 + PatternFinder ef(*this, &QPDF::findEndstream);
  1681 + size_t length = 0;
  1682 + if (m->file->findFirst("end", stream_offset, 0, ef)) {
  1683 + length = toS(m->file->tell() - stream_offset);
  1684 + // Reread endstream but, if it was endobj, don't skip that.
  1685 + QPDFTokenizer::Token t = readToken(*m->file);
  1686 + if (t.getValue() == "endobj") {
  1687 + m->file->seek(m->file->getLastOffset(), SEEK_SET);
  1688 + }
  1689 + }
  1690 +
  1691 + if (length) {
  1692 + auto end = stream_offset + toO(length);
  1693 + qpdf_offset_t found_offset = 0;
  1694 + QPDFObjGen found_og;
  1695 +
  1696 + // Make sure this is inside this object
  1697 + for (auto const& [current_og, entry]: m->xref_table) {
  1698 + if (entry.getType() == 1) {
  1699 + qpdf_offset_t obj_offset = entry.getOffset();
  1700 + if (found_offset < obj_offset && obj_offset < end) {
  1701 + found_offset = obj_offset;
  1702 + found_og = current_og;
  1703 + }
  1704 + }
  1705 + }
  1706 + if (!found_offset || found_og == og) {
  1707 + // If we are trying to recover an XRef stream the xref table will not contain and
  1708 + // won't contain any entries, therefore we cannot check the found length. Otherwise we
  1709 + // found endstream\nendobj within the space allowed for this object, so we're probably
  1710 + // in good shape.
  1711 + } else {
  1712 + QTC::TC("qpdf", "QPDF found wrong endstream in recovery");
  1713 + length = 0;
  1714 + }
  1715 + }
  1716 +
  1717 + if (length == 0) {
  1718 + warn(damagedPDF(
  1719 + *input, stream_offset, "unable to recover stream data; treating stream as empty"));
  1720 + } else {
  1721 + warn(damagedPDF(
  1722 + *input, stream_offset, "recovered stream length: " + std::to_string(length)));
  1723 + }
  1724 +
  1725 + QTC::TC("qpdf", "QPDF recovered stream length");
  1726 + return length;
  1727 +}
  1728 +
  1729 +QPDFTokenizer::Token
  1730 +QPDF::readToken(InputSource& input, size_t max_len)
  1731 +{
  1732 + return m->tokenizer.readToken(input, m->last_object_description, true, max_len);
  1733 +}
  1734 +
  1735 +QPDFObjectHandle
  1736 +QPDF::readObjectAtOffset(
  1737 + bool try_recovery,
432 1738 qpdf_offset_t offset,
433   - std::string const& message)
  1739 + std::string const& description,
  1740 + QPDFObjGen exp_og,
  1741 + QPDFObjGen& og,
  1742 + bool skip_cache_if_in_xref)
  1743 +{
  1744 + bool check_og = true;
  1745 + if (exp_og.getObj() == 0) {
  1746 + // This method uses an expect object ID of 0 to indicate that we don't know or don't care
  1747 + // what the actual object ID is at this offset. This is true when we read the xref stream
  1748 + // and linearization hint streams. In this case, we don't verify the expect object
  1749 + // ID/generation against what was read from the file. There is also no reason to attempt
  1750 + // xref recovery if we get a failure in this case since the read attempt was not triggered
  1751 + // by an xref lookup.
  1752 + check_og = false;
  1753 + try_recovery = false;
  1754 + }
  1755 + setLastObjectDescription(description, exp_og);
  1756 +
  1757 + if (!m->attempt_recovery) {
  1758 + try_recovery = false;
  1759 + }
  1760 +
  1761 + // Special case: if offset is 0, just return null. Some PDF writers, in particular
  1762 + // "Mac OS X 10.7.5 Quartz PDFContext", may store deleted objects in the xref table as
  1763 + // "0000000000 00000 n", which is not correct, but it won't hurt anything for us to ignore
  1764 + // these.
  1765 + if (offset == 0) {
  1766 + QTC::TC("qpdf", "QPDF bogus 0 offset", 0);
  1767 + warn(damagedPDF(0, "object has offset 0"));
  1768 + return QPDFObjectHandle::newNull();
  1769 + }
  1770 +
  1771 + m->file->seek(offset, SEEK_SET);
  1772 + try {
  1773 + QPDFTokenizer::Token tobjid = readToken(*m->file);
  1774 + bool objidok = tobjid.isInteger();
  1775 + QTC::TC("qpdf", "QPDF check objid", objidok ? 1 : 0);
  1776 + if (!objidok) {
  1777 + QTC::TC("qpdf", "QPDF expected n n obj");
  1778 + throw damagedPDF(offset, "expected n n obj");
  1779 + }
  1780 + QPDFTokenizer::Token tgen = readToken(*m->file);
  1781 + bool genok = tgen.isInteger();
  1782 + QTC::TC("qpdf", "QPDF check generation", genok ? 1 : 0);
  1783 + if (!genok) {
  1784 + throw damagedPDF(offset, "expected n n obj");
  1785 + }
  1786 + QPDFTokenizer::Token tobj = readToken(*m->file);
  1787 +
  1788 + bool objok = tobj.isWord("obj");
  1789 + QTC::TC("qpdf", "QPDF check obj", objok ? 1 : 0);
  1790 +
  1791 + if (!objok) {
  1792 + throw damagedPDF(offset, "expected n n obj");
  1793 + }
  1794 + int objid = QUtil::string_to_int(tobjid.getValue().c_str());
  1795 + int generation = QUtil::string_to_int(tgen.getValue().c_str());
  1796 + og = QPDFObjGen(objid, generation);
  1797 + if (objid == 0) {
  1798 + QTC::TC("qpdf", "QPDF object id 0");
  1799 + throw damagedPDF(offset, "object with ID 0");
  1800 + }
  1801 + if (check_og && (exp_og != og)) {
  1802 + QTC::TC("qpdf", "QPDF err wrong objid/generation");
  1803 + QPDFExc e = damagedPDF(offset, "expected " + exp_og.unparse(' ') + " obj");
  1804 + if (try_recovery) {
  1805 + // Will be retried below
  1806 + throw e;
  1807 + } else {
  1808 + // We can try reading the object anyway even if the ID doesn't match.
  1809 + warn(e);
  1810 + }
  1811 + }
  1812 + } catch (QPDFExc& e) {
  1813 + if (try_recovery) {
  1814 + // Try again after reconstructing xref table
  1815 + reconstruct_xref(e);
  1816 + if (m->xref_table.count(exp_og) && (m->xref_table[exp_og].getType() == 1)) {
  1817 + qpdf_offset_t new_offset = m->xref_table[exp_og].getOffset();
  1818 + QPDFObjectHandle result =
  1819 + readObjectAtOffset(false, new_offset, description, exp_og, og, false);
  1820 + QTC::TC("qpdf", "QPDF recovered in readObjectAtOffset");
  1821 + return result;
  1822 + } else {
  1823 + QTC::TC("qpdf", "QPDF object gone after xref reconstruction");
  1824 + warn(damagedPDF(
  1825 + "",
  1826 + 0,
  1827 + ("object " + exp_og.unparse(' ') +
  1828 + " not found in file after regenerating cross reference "
  1829 + "table")));
  1830 + return QPDFObjectHandle::newNull();
  1831 + }
  1832 + } else {
  1833 + throw;
  1834 + }
  1835 + }
  1836 +
  1837 + QPDFObjectHandle oh = readObject(description, og);
  1838 +
  1839 + if (isUnresolved(og)) {
  1840 + // Store the object in the cache here so it gets cached whether we first know the offset or
  1841 + // whether we first know the object ID and generation (in which we case we would get here
  1842 + // through resolve).
  1843 +
  1844 + // Determine the end offset of this object before and after white space. We use these
  1845 + // numbers to validate linearization hint tables. Offsets and lengths of objects may imply
  1846 + // the end of an object to be anywhere between these values.
  1847 + qpdf_offset_t end_before_space = m->file->tell();
  1848 +
  1849 + // skip over spaces
  1850 + while (true) {
  1851 + char ch;
  1852 + if (m->file->read(&ch, 1)) {
  1853 + if (!isspace(static_cast<unsigned char>(ch))) {
  1854 + m->file->seek(-1, SEEK_CUR);
  1855 + break;
  1856 + }
  1857 + } else {
  1858 + throw damagedPDF(m->file->tell(), "EOF after endobj");
  1859 + }
  1860 + }
  1861 + qpdf_offset_t end_after_space = m->file->tell();
  1862 + if (skip_cache_if_in_xref && m->xref_table.count(og)) {
  1863 + // Ordinarily, an object gets read here when resolved through xref table or stream. In
  1864 + // the special case of the xref stream and linearization hint tables, the offset comes
  1865 + // from another source. For the specific case of xref streams, the xref stream is read
  1866 + // and loaded into the object cache very early in parsing. Ordinarily, when a file is
  1867 + // updated by appending, items inserted into the xref table in later updates take
  1868 + // precedence over earlier items. In the special case of reusing the object number
  1869 + // previously used as the xref stream, we have the following order of events:
  1870 + //
  1871 + // * reused object gets loaded into the xref table
  1872 + // * old object is read here while reading xref streams
  1873 + // * original xref entry is ignored (since already in xref table)
  1874 + //
  1875 + // It is the second step that causes a problem. Even though the xref table is correct in
  1876 + // this case, the old object is already in the cache and so effectively prevails over
  1877 + // the reused object. To work around this issue, we have a special case for the xref
  1878 + // stream (via the skip_cache_if_in_xref): if the object is already in the xref stream,
  1879 + // don't cache what we read here.
  1880 + //
  1881 + // It is likely that the same bug may exist for linearization hint tables, but the
  1882 + // existing code uses end_before_space and end_after_space from the cache, so fixing
  1883 + // that would require more significant rework. The chances of a linearization hint
  1884 + // stream being reused seems smaller because the xref stream is probably the highest
  1885 + // object in the file and the linearization hint stream would be some random place in
  1886 + // the middle, so I'm leaving that bug unfixed for now. If the bug were to be fixed, we
  1887 + // could use !check_og in place of skip_cache_if_in_xref.
  1888 + QTC::TC("qpdf", "QPDF skipping cache for known unchecked object");
  1889 + } else {
  1890 + updateCache(og, oh.getObj(), end_before_space, end_after_space);
  1891 + }
  1892 + }
  1893 +
  1894 + return oh;
  1895 +}
  1896 +
  1897 +std::shared_ptr<QPDFObject> const&
  1898 +QPDF::resolve(QPDFObjGen og)
434 1899 {
435   - warn(QPDFExc(error_code, getFilename(), object, offset, message));
  1900 + if (!isUnresolved(og)) {
  1901 + return m->obj_cache[og].object;
  1902 + }
  1903 +
  1904 + if (m->resolving.count(og)) {
  1905 + // This can happen if an object references itself directly or indirectly in some key that
  1906 + // has to be resolved during object parsing, such as stream length.
  1907 + QTC::TC("qpdf", "QPDF recursion loop in resolve");
  1908 + warn(damagedPDF("", "loop detected resolving object " + og.unparse(' ')));
  1909 + updateCache(og, QPDFObject::create<QPDF_Null>(), -1, -1);
  1910 + return m->obj_cache[og].object;
  1911 + }
  1912 + ResolveRecorder rr(this, og);
  1913 +
  1914 + if (m->xref_table.count(og) != 0) {
  1915 + QPDFXRefEntry const& entry = m->xref_table[og];
  1916 + try {
  1917 + switch (entry.getType()) {
  1918 + case 1:
  1919 + {
  1920 + qpdf_offset_t offset = entry.getOffset();
  1921 + // Object stored in cache by readObjectAtOffset
  1922 + QPDFObjGen a_og;
  1923 + QPDFObjectHandle oh = readObjectAtOffset(true, offset, "", og, a_og, false);
  1924 + }
  1925 + break;
  1926 +
  1927 + case 2:
  1928 + resolveObjectsInStream(entry.getObjStreamNumber());
  1929 + break;
  1930 +
  1931 + default:
  1932 + throw damagedPDF(
  1933 + "", 0, ("object " + og.unparse('/') + " has unexpected xref entry type"));
  1934 + }
  1935 + } catch (QPDFExc& e) {
  1936 + warn(e);
  1937 + } catch (std::exception& e) {
  1938 + warn(damagedPDF(
  1939 + "", 0, ("object " + og.unparse('/') + ": error reading object: " + e.what())));
  1940 + }
  1941 + }
  1942 +
  1943 + if (isUnresolved(og)) {
  1944 + // PDF spec says unknown objects resolve to the null object.
  1945 + QTC::TC("qpdf", "QPDF resolve failure to null");
  1946 + updateCache(og, QPDFObject::create<QPDF_Null>(), -1, -1);
  1947 + }
  1948 +
  1949 + auto& result(m->obj_cache[og].object);
  1950 + result->setDefaultDescription(this, og);
  1951 + return result;
  1952 +}
  1953 +
  1954 +void
  1955 +QPDF::resolveObjectsInStream(int obj_stream_number)
  1956 +{
  1957 + if (m->resolved_object_streams.count(obj_stream_number)) {
  1958 + return;
  1959 + }
  1960 + m->resolved_object_streams.insert(obj_stream_number);
  1961 + // Force resolution of object stream
  1962 + QPDFObjectHandle obj_stream = getObjectByID(obj_stream_number, 0);
  1963 + if (!obj_stream.isStream()) {
  1964 + throw damagedPDF(
  1965 + "supposed object stream " + std::to_string(obj_stream_number) + " is not a stream");
  1966 + }
  1967 +
  1968 + // For linearization data in the object, use the data from the object stream for the objects in
  1969 + // the stream.
  1970 + QPDFObjGen stream_og(obj_stream_number, 0);
  1971 + qpdf_offset_t end_before_space = m->obj_cache[stream_og].end_before_space;
  1972 + qpdf_offset_t end_after_space = m->obj_cache[stream_og].end_after_space;
  1973 +
  1974 + QPDFObjectHandle dict = obj_stream.getDict();
  1975 + if (!dict.isDictionaryOfType("/ObjStm")) {
  1976 + QTC::TC("qpdf", "QPDF ERR object stream with wrong type");
  1977 + warn(damagedPDF(
  1978 + "supposed object stream " + std::to_string(obj_stream_number) + " has wrong type"));
  1979 + }
  1980 +
  1981 + if (!(dict.getKey("/N").isInteger() && dict.getKey("/First").isInteger())) {
  1982 + throw damagedPDF(
  1983 + ("object stream " + std::to_string(obj_stream_number) + " has incorrect keys"));
  1984 + }
  1985 +
  1986 + int n = dict.getKey("/N").getIntValueAsInt();
  1987 + int first = dict.getKey("/First").getIntValueAsInt();
  1988 +
  1989 + std::map<int, int> offsets;
  1990 +
  1991 + std::shared_ptr<Buffer> bp = obj_stream.getStreamData(qpdf_dl_specialized);
  1992 + auto input = std::shared_ptr<InputSource>(
  1993 + // line-break
  1994 + new BufferInputSource(
  1995 + (m->file->getName() + " object stream " + std::to_string(obj_stream_number)),
  1996 + bp.get()));
  1997 +
  1998 + long long last_offset = -1;
  1999 + for (int i = 0; i < n; ++i) {
  2000 + QPDFTokenizer::Token tnum = readToken(*input);
  2001 + QPDFTokenizer::Token toffset = readToken(*input);
  2002 + if (!(tnum.isInteger() && toffset.isInteger())) {
  2003 + throw damagedPDF(
  2004 + *input,
  2005 + m->last_object_description,
  2006 + input->getLastOffset(),
  2007 + "expected integer in object stream header");
  2008 + }
  2009 +
  2010 + int num = QUtil::string_to_int(tnum.getValue().c_str());
  2011 + long long offset = QUtil::string_to_int(toffset.getValue().c_str());
  2012 +
  2013 + if (num == obj_stream_number) {
  2014 + QTC::TC("qpdf", "QPDF ignore self-referential object stream");
  2015 + warn(damagedPDF(
  2016 + *input,
  2017 + m->last_object_description,
  2018 + input->getLastOffset(),
  2019 + "object stream claims to contain itself"));
  2020 + continue;
  2021 + }
  2022 +
  2023 + if (num < 1) {
  2024 + QTC::TC("qpdf", "QPDF object stream contains id < 1");
  2025 + warn(damagedPDF(
  2026 + *input,
  2027 + "object "s + std::to_string(num) + " 0, offset " + std::to_string(offset),
  2028 + 0,
  2029 + "object id is invalid"s));
  2030 + continue;
  2031 + }
  2032 +
  2033 + if (offset <= last_offset) {
  2034 + QTC::TC("qpdf", "QPDF object stream offsets not increasing");
  2035 + warn(damagedPDF(
  2036 + *input,
  2037 + "object "s + std::to_string(num) + " 0, offset " + std::to_string(offset),
  2038 + 0,
  2039 + "offset is invalid (must be larger than previous offset " +
  2040 + std::to_string(last_offset) + ")"));
  2041 + continue;
  2042 + }
  2043 + last_offset = offset;
  2044 +
  2045 + if (num > m->xref_table_max_id) {
  2046 + continue;
  2047 + }
  2048 +
  2049 + offsets[num] = toI(offset + first);
  2050 + }
  2051 +
  2052 + // To avoid having to read the object stream multiple times, store all objects that would be
  2053 + // found here in the cache. Remember that some objects stored here might have been overridden
  2054 + // by new objects appended to the file, so it is necessary to recheck the xref table and only
  2055 + // cache what would actually be resolved here.
  2056 + m->last_object_description.clear();
  2057 + m->last_object_description += "object ";
  2058 + for (auto const& iter: offsets) {
  2059 + QPDFObjGen og(iter.first, 0);
  2060 + auto entry = m->xref_table.find(og);
  2061 + if (entry != m->xref_table.end() && entry->second.getType() == 2 &&
  2062 + entry->second.getObjStreamNumber() == obj_stream_number) {
  2063 + int offset = iter.second;
  2064 + input->seek(offset, SEEK_SET);
  2065 + QPDFObjectHandle oh = readObjectInStream(input, iter.first);
  2066 + updateCache(og, oh.getObj(), end_before_space, end_after_space);
  2067 + } else {
  2068 + QTC::TC("qpdf", "QPDF not caching overridden objstm object");
  2069 + }
  2070 + }
  2071 +}
  2072 +
  2073 +QPDFObjectHandle
  2074 +QPDF::newIndirect(QPDFObjGen og, std::shared_ptr<QPDFObject> const& obj)
  2075 +{
  2076 + obj->setDefaultDescription(this, og);
  2077 + return {obj};
  2078 +}
  2079 +
  2080 +void
  2081 +QPDF::updateCache(
  2082 + QPDFObjGen og,
  2083 + std::shared_ptr<QPDFObject> const& object,
  2084 + qpdf_offset_t end_before_space,
  2085 + qpdf_offset_t end_after_space,
  2086 + bool destroy)
  2087 +{
  2088 + object->setObjGen(this, og);
  2089 + if (isCached(og)) {
  2090 + auto& cache = m->obj_cache[og];
  2091 + object->move_to(cache.object, destroy);
  2092 + cache.end_before_space = end_before_space;
  2093 + cache.end_after_space = end_after_space;
  2094 + } else {
  2095 + m->obj_cache[og] = ObjCache(object, end_before_space, end_after_space);
  2096 + }
  2097 +}
  2098 +
  2099 +bool
  2100 +QPDF::isCached(QPDFObjGen og)
  2101 +{
  2102 + return m->obj_cache.count(og) != 0;
  2103 +}
  2104 +
  2105 +bool
  2106 +QPDF::isUnresolved(QPDFObjGen og)
  2107 +{
  2108 + return !isCached(og) || m->obj_cache[og].object->isUnresolved();
  2109 +}
  2110 +
  2111 +QPDFObjGen
  2112 +QPDF::nextObjGen()
  2113 +{
  2114 + int max_objid = toI(getObjectCount());
  2115 + if (max_objid == std::numeric_limits<int>::max()) {
  2116 + throw std::range_error("max object id is too high to create new objects");
  2117 + }
  2118 + return QPDFObjGen(max_objid + 1, 0);
  2119 +}
  2120 +
  2121 +QPDFObjectHandle
  2122 +QPDF::makeIndirectFromQPDFObject(std::shared_ptr<QPDFObject> const& obj)
  2123 +{
  2124 + QPDFObjGen next{nextObjGen()};
  2125 + m->obj_cache[next] = ObjCache(obj, -1, -1);
  2126 + return newIndirect(next, m->obj_cache[next].object);
  2127 +}
  2128 +
  2129 +QPDFObjectHandle
  2130 +QPDF::makeIndirectObject(QPDFObjectHandle oh)
  2131 +{
  2132 + if (!oh) {
  2133 + throw std::logic_error("attempted to make an uninitialized QPDFObjectHandle indirect");
  2134 + }
  2135 + return makeIndirectFromQPDFObject(oh.getObj());
436 2136 }
437 2137  
438 2138 QPDFObjectHandle
... ... @@ -470,6 +2170,52 @@ QPDF::newStream(std::string const& data)
470 2170 return result;
471 2171 }
472 2172  
  2173 +std::shared_ptr<QPDFObject>
  2174 +QPDF::getObjectForParser(int id, int gen, bool parse_pdf)
  2175 +{
  2176 + // This method is called by the parser and therefore must not resolve any objects.
  2177 + auto og = QPDFObjGen(id, gen);
  2178 + if (auto iter = m->obj_cache.find(og); iter != m->obj_cache.end()) {
  2179 + return iter->second.object;
  2180 + }
  2181 + if (m->xref_table.count(og) || !m->parsed) {
  2182 + return m->obj_cache.insert({og, QPDFObject::create<QPDF_Unresolved>(this, og)})
  2183 + .first->second.object;
  2184 + }
  2185 + if (parse_pdf) {
  2186 + return QPDFObject::create<QPDF_Null>();
  2187 + }
  2188 + return m->obj_cache.insert({og, QPDFObject::create<QPDF_Null>(this, og)}).first->second.object;
  2189 +}
  2190 +
  2191 +std::shared_ptr<QPDFObject>
  2192 +QPDF::getObjectForJSON(int id, int gen)
  2193 +{
  2194 + auto og = QPDFObjGen(id, gen);
  2195 + auto [it, inserted] = m->obj_cache.try_emplace(og);
  2196 + auto& obj = it->second.object;
  2197 + if (inserted) {
  2198 + obj = (m->parsed && !m->xref_table.count(og))
  2199 + ? QPDFObject::create<QPDF_Null>(this, og)
  2200 + : QPDFObject::create<QPDF_Unresolved>(this, og);
  2201 + }
  2202 + return obj;
  2203 +}
  2204 +
  2205 +QPDFObjectHandle
  2206 +QPDF::getObject(QPDFObjGen og)
  2207 +{
  2208 + if (auto it = m->obj_cache.find(og); it != m->obj_cache.end()) {
  2209 + return {it->second.object};
  2210 + } else if (m->parsed && !m->xref_table.count(og)) {
  2211 + return QPDFObject::create<QPDF_Null>();
  2212 + } else {
  2213 + auto result =
  2214 + m->obj_cache.try_emplace(og, QPDFObject::create<QPDF_Unresolved>(this, og), -1, -1);
  2215 + return {result.first->second.object};
  2216 + }
  2217 +}
  2218 +
473 2219 QPDFObjectHandle
474 2220 QPDF::getObject(int objid, int generation)
475 2221 {
... ... @@ -488,6 +2234,45 @@ QPDF::getObjectByID(int objid, int generation)
488 2234 return getObject(QPDFObjGen(objid, generation));
489 2235 }
490 2236  
  2237 +void
  2238 +QPDF::replaceObject(int objid, int generation, QPDFObjectHandle oh)
  2239 +{
  2240 + replaceObject(QPDFObjGen(objid, generation), oh);
  2241 +}
  2242 +
  2243 +void
  2244 +QPDF::replaceObject(QPDFObjGen og, QPDFObjectHandle oh)
  2245 +{
  2246 + if (!oh || (oh.isIndirect() && !(oh.isStream() && oh.getObjGen() == og))) {
  2247 + QTC::TC("qpdf", "QPDF replaceObject called with indirect object");
  2248 + throw std::logic_error("QPDF::replaceObject called with indirect object handle");
  2249 + }
  2250 + updateCache(og, oh.getObj(), -1, -1, false);
  2251 +}
  2252 +
  2253 +void
  2254 +QPDF::removeObject(QPDFObjGen og)
  2255 +{
  2256 + m->xref_table.erase(og);
  2257 + if (auto cached = m->obj_cache.find(og); cached != m->obj_cache.end()) {
  2258 + // Take care of any object handles that may be floating around.
  2259 + cached->second.object->assign_null();
  2260 + cached->second.object->setObjGen(nullptr, QPDFObjGen());
  2261 + m->obj_cache.erase(cached);
  2262 + }
  2263 +}
  2264 +
  2265 +void
  2266 +QPDF::replaceReserved(QPDFObjectHandle reserved, QPDFObjectHandle replacement)
  2267 +{
  2268 + QTC::TC("qpdf", "QPDF replaceReserved");
  2269 + auto tc = reserved.getTypeCode();
  2270 + if (!(tc == ::ot_reserved || tc == ::ot_null)) {
  2271 + throw std::logic_error("replaceReserved called with non-reserved object");
  2272 + }
  2273 + replaceObject(reserved.getObjGen(), replacement);
  2274 +}
  2275 +
491 2276 QPDFObjectHandle
492 2277 QPDF::copyForeignObject(QPDFObjectHandle foreign)
493 2278 {
... ... @@ -747,6 +2532,21 @@ QPDF::copyStreamData(QPDFObjectHandle result, QPDFObjectHandle foreign)
747 2532 }
748 2533 }
749 2534  
  2535 +void
  2536 +QPDF::swapObjects(int objid1, int generation1, int objid2, int generation2)
  2537 +{
  2538 + swapObjects(QPDFObjGen(objid1, generation1), QPDFObjGen(objid2, generation2));
  2539 +}
  2540 +
  2541 +void
  2542 +QPDF::swapObjects(QPDFObjGen og1, QPDFObjGen og2)
  2543 +{
  2544 + // Force objects to be read from the input source if needed, then swap them in the cache.
  2545 + resolve(og1);
  2546 + resolve(og2);
  2547 + m->obj_cache[og1].object->swapWith(m->obj_cache[og2].object);
  2548 +}
  2549 +
750 2550 unsigned long long
751 2551 QPDF::getUniqueId() const
752 2552 {
... ... @@ -840,6 +2640,136 @@ QPDF::getXRefTableInternal()
840 2640 return m->xref_table;
841 2641 }
842 2642  
  2643 +size_t
  2644 +QPDF::tableSize()
  2645 +{
  2646 + // If obj_cache is dense, accommodate all object in tables,else accommodate only original
  2647 + // objects.
  2648 + auto max_xref = m->xref_table.size() ? m->xref_table.crbegin()->first.getObj() : 0;
  2649 + auto max_obj = m->obj_cache.size() ? m->obj_cache.crbegin()->first.getObj() : 0;
  2650 + auto max_id = std::numeric_limits<int>::max() - 1;
  2651 + if (max_obj >= max_id || max_xref >= max_id) {
  2652 + // Temporary fix. Long-term solution is
  2653 + // - QPDFObjGen to enforce objgens are valid and sensible
  2654 + // - xref table and obj cache to protect against insertion of impossibly large obj ids
  2655 + stopOnError("Impossibly large object id encountered.");
  2656 + }
  2657 + if (max_obj < 1.1 * std::max(toI(m->obj_cache.size()), max_xref)) {
  2658 + return toS(++max_obj);
  2659 + }
  2660 + return toS(++max_xref);
  2661 +}
  2662 +
  2663 +std::vector<QPDFObjGen>
  2664 +QPDF::getCompressibleObjVector()
  2665 +{
  2666 + return getCompressibleObjGens<QPDFObjGen>();
  2667 +}
  2668 +
  2669 +std::vector<bool>
  2670 +QPDF::getCompressibleObjSet()
  2671 +{
  2672 + return getCompressibleObjGens<bool>();
  2673 +}
  2674 +
// Collect the objects that may be placed in object streams.
//
// T selects the result shape:
// - QPDFObjGen: a list of object/generation pairs in traversal order.
// - bool: a vector of max_obj + 1 flags indexed by object id.
// Any other T throws std::logic_error.
//
// Throws std::logic_error if an object id beyond getObjectCount() is encountered.
// Side effect: calls removeObject() on visited objects for which the cache holds a following
// entry with the same object id (see the upper_bound check below).
template <typename T>
std::vector<T>
QPDF::getCompressibleObjGens()
{
    // Return a list of objects that are allowed to be in object streams. Walk through the objects
    // by traversing the document from the root, including a traversal of the pages tree. This
    // makes it more likely that objects on the same page end up in the same object stream, which
    // is slightly more efficient, particularly with linearized files. This is better than
    // iterating through the xref table since it avoids preserving orphaned items.

    // Exclude encryption dictionary, if any
    QPDFObjectHandle encryption_dict = m->trailer.getKey("/Encrypt");
    QPDFObjGen encryption_dict_og = encryption_dict.getObjGen();

    const size_t max_obj = getObjectCount();
    // visited is indexed by (object id - 1).
    std::vector<bool> visited(max_obj, false);
    // queue is used as a stack: entries are taken from the back.
    std::vector<QPDFObjectHandle> queue;
    queue.reserve(512);
    queue.push_back(m->trailer);
    std::vector<T> result;
    if constexpr (std::is_same_v<T, QPDFObjGen>) {
        result.reserve(m->obj_cache.size());
    } else if constexpr (std::is_same_v<T, bool>) {
        // Indexed by object id, hence one extra slot.
        result.resize(max_obj + 1U, false);
    } else {
        throw std::logic_error("Unsupported type in QPDF::getCompressibleObjGens");
    }
    while (!queue.empty()) {
        auto obj = queue.back();
        queue.pop_back();
        if (obj.getObjectID() > 0) {
            QPDFObjGen og = obj.getObjGen();
            const size_t id = toS(og.getObj() - 1);
            if (id >= max_obj) {
                throw std::logic_error(
                    "unexpected object id encountered in getCompressibleObjGens");
            }
            if (visited[id]) {
                QTC::TC("qpdf", "QPDF loop detected traversing objects");
                continue;
            }

            // Check whether this is the current object. If not, remove it (which changes it into a
            // direct null and therefore stops us from revisiting it) and move on to the next object
            // in the queue.
            // NOTE(review): presumably obj_cache is ordered by object id, then generation, so a
            // following entry with the same id is a higher generation of this object; confirm.
            auto upper = m->obj_cache.upper_bound(og);
            if (upper != m->obj_cache.end() && upper->first.getObj() == og.getObj()) {
                removeObject(og);
                continue;
            }

            visited[id] = true;

            if (og == encryption_dict_og) {
                QTC::TC("qpdf", "QPDF exclude encryption dictionary");
            } else if (!(obj.isStream() ||
                         (obj.isDictionaryOfType("/Sig") && obj.hasKey("/ByteRange") &&
                          obj.hasKey("/Contents")))) {
                // Streams and /Sig dictionaries carrying /ByteRange and /Contents are not
                // reported as compressible.
                if constexpr (std::is_same_v<T, QPDFObjGen>) {
                    result.push_back(og);
                } else if constexpr (std::is_same_v<T, bool>) {
                    result[id + 1U] = true;
                }
            }
        }
        // Queue the children. They are pushed in reverse order so that taking entries from the
        // back of the queue yields them in forward order.
        if (obj.isStream()) {
            auto dict = obj.getDict().as_dictionary();
            auto end = dict.crend();
            for (auto iter = dict.crbegin(); iter != end; ++iter) {
                std::string const& key = iter->first;
                QPDFObjectHandle const& value = iter->second;
                if (!value.null()) {
                    if (key == "/Length") {
                        // omit stream lengths
                        if (value.isIndirect()) {
                            QTC::TC("qpdf", "QPDF exclude indirect length");
                        }
                    } else {
                        queue.emplace_back(value);
                    }
                }
            }
        } else if (obj.isDictionary()) {
            auto dict = obj.as_dictionary();
            auto end = dict.crend();
            for (auto iter = dict.crbegin(); iter != end; ++iter) {
                if (!iter->second.null()) {
                    queue.emplace_back(iter->second);
                }
            }
        } else if (auto items = obj.as_array()) {
            queue.insert(queue.end(), items.crbegin(), items.crend());
        }
    }

    return result;
}
  2772 +
843 2773 bool
844 2774 QPDF::pipeStreamData(
845 2775 std::shared_ptr<EncryptionParameters> encp,
... ...
libqpdf/QPDF_objects.cc deleted
1   -#include <qpdf/qpdf-config.h> // include first for large file support
2   -
3   -#include <qpdf/QPDF_private.hh>
4   -
5   -#include <array>
6   -#include <atomic>
7   -#include <cstring>
8   -#include <limits>
9   -#include <map>
10   -#include <regex>
11   -#include <sstream>
12   -#include <vector>
13   -
14   -#include <qpdf/BufferInputSource.hh>
15   -#include <qpdf/FileInputSource.hh>
16   -#include <qpdf/InputSource_private.hh>
17   -#include <qpdf/OffsetInputSource.hh>
18   -#include <qpdf/Pipeline.hh>
19   -#include <qpdf/QPDFExc.hh>
20   -#include <qpdf/QPDFLogger.hh>
21   -#include <qpdf/QPDFObjectHandle_private.hh>
22   -#include <qpdf/QPDFObject_private.hh>
23   -#include <qpdf/QPDFParser.hh>
24   -#include <qpdf/QTC.hh>
25   -#include <qpdf/QUtil.hh>
26   -#include <qpdf/Util.hh>
27   -
28   -using namespace qpdf;
29   -using namespace std::literals;
30   -
31   -namespace
32   -{
33   - class InvalidInputSource: public InputSource
34   - {
35   - public:
36   - ~InvalidInputSource() override = default;
37   - qpdf_offset_t
38   - findAndSkipNextEOL() override
39   - {
40   - throwException();
41   - return 0;
42   - }
43   - std::string const&
44   - getName() const override
45   - {
46   - static std::string name("closed input source");
47   - return name;
48   - }
49   - qpdf_offset_t
50   - tell() override
51   - {
52   - throwException();
53   - return 0;
54   - }
55   - void
56   - seek(qpdf_offset_t offset, int whence) override
57   - {
58   - throwException();
59   - }
60   - void
61   - rewind() override
62   - {
63   - throwException();
64   - }
65   - size_t
66   - read(char* buffer, size_t length) override
67   - {
68   - throwException();
69   - return 0;
70   - }
71   - void
72   - unreadCh(char ch) override
73   - {
74   - throwException();
75   - }
76   -
77   - private:
78   - void
79   - throwException()
80   - {
81   - throw std::logic_error(
82   - "QPDF operation attempted on a QPDF object with no input "
83   - "source. QPDF operations are invalid before processFile (or "
84   - "another process method) or after closeInputSource");
85   - }
86   - };
87   -} // namespace
88   -
89   -bool
90   -QPDF::findStartxref()
91   -{
92   - if (readToken(*m->file).isWord("startxref") && readToken(*m->file).isInteger()) {
93   - // Position in front of offset token
94   - m->file->seek(m->file->getLastOffset(), SEEK_SET);
95   - return true;
96   - }
97   - return false;
98   -}
99   -
// Parse the input file: locate the header, find and read the cross-reference data (falling back
// to reconstruction when recovery is enabled), initialize encryption, and mark the file as
// parsed.
//
// password: optional password; when non-null it is stored as the provided password.
//
// Throws a damaged-PDF exception when the xref data cannot be read and recovery is disabled, or
// when the xref table is non-empty but the root's /Pages entry is not a dictionary.
void
QPDF::parse(char const* password)
{
    if (password) {
        m->encp->provided_password = password;
    }

    // Find the header anywhere in the first 1024 bytes of the file.
    PatternFinder hf(*this, &QPDF::findHeader);
    if (!m->file->findFirst("%PDF-", 0, 1024, hf)) {
        QTC::TC("qpdf", "QPDF not a pdf file");
        warn(damagedPDF("", 0, "can't find PDF header"));
        // QPDFWriter writes files that usually require at least version 1.2 for /FlateDecode
        m->pdf_version = "1.2";
    }

    // PDF spec says %%EOF must be found within the last 1024 bytes of the file. We add an extra
    // 30 characters to leave room for the startxref stuff.
    m->file->seek(0, SEEK_END);
    qpdf_offset_t end_offset = m->file->tell();
    m->xref_table_max_offset = end_offset;
    // Sanity check on object ids. All objects must appear in xref table / stream. In all realistic
    // scenarios at least 3 bytes are required.
    if (m->xref_table_max_id > m->xref_table_max_offset / 3) {
        m->xref_table_max_id = static_cast<int>(m->xref_table_max_offset / 3);
    }
    // Search the last 1054 bytes (1024 + 30) for the final "startxref".
    qpdf_offset_t start_offset = (end_offset > 1054 ? end_offset - 1054 : 0);
    PatternFinder sf(*this, &QPDF::findStartxref);
    qpdf_offset_t xref_offset = 0;
    if (m->file->findLast("startxref", start_offset, 0, sf)) {
        xref_offset = QUtil::string_to_ll(readToken(*m->file).getValue().c_str());
    }

    try {
        // An offset of 0 means no usable startxref was found.
        if (xref_offset == 0) {
            QTC::TC("qpdf", "QPDF can't find startxref");
            throw damagedPDF("", 0, "can't find startxref");
        }
        try {
            read_xref(xref_offset);
        } catch (QPDFExc&) {
            throw;
        } catch (std::exception& e) {
            // Convert any other failure into a damaged-PDF error so the handler below sees it.
            throw damagedPDF("", 0, std::string("error reading xref: ") + e.what());
        }
    } catch (QPDFExc& e) {
        if (m->attempt_recovery) {
            reconstruct_xref(e, xref_offset > 0);
            QTC::TC("qpdf", "QPDF reconstructed xref table");
        } else {
            throw;
        }
    }

    initializeEncryption();
    m->parsed = true;
    if (m->xref_table.size() > 0 && !getRoot().getKey("/Pages").isDictionary()) {
        // QPDFs created from JSON have an empty xref table and no root object yet.
        throw damagedPDF("", 0, "unable to find page tree");
    }
}
161   -
162   -void
163   -QPDF::inParse(bool v)
164   -{
165   - if (m->in_parse == v) {
166   - // This happens if QPDFParser::parse tries to resolve an indirect object while it is
167   - // parsing.
168   - throw std::logic_error(
169   - "QPDF: re-entrant parsing detected. This is a qpdf bug."
170   - " Please report at https://github.com/qpdf/qpdf/issues.");
171   - }
172   - m->in_parse = v;
173   -}
174   -
175   -void
176   -QPDF::setTrailer(QPDFObjectHandle obj)
177   -{
178   - if (m->trailer) {
179   - return;
180   - }
181   - m->trailer = obj;
182   -}
183   -
// Rebuild the cross-reference table by scanning the whole file for "N G obj" markers, and recover
// a trailer (from "trailer" keywords, xref streams, or a /Catalog object found in the cache).
//
// e: the exception that triggered recovery. It is rethrown if reconstruction has already been
//    attempted; otherwise it is reported as a warning.
// found_startxref: when false, "startxref" keywords seen during the scan are recorded and the
//    last one may be used to read the xref data normally instead of reconstructing it.
//
// Throws a damaged-PDF exception when more than 1000 additional warnings accumulate, when no
// trailer or no objects can be found, or when no pages can be found.
void
QPDF::reconstruct_xref(QPDFExc& e, bool found_startxref)
{
    if (m->reconstructed_xref) {
        // Avoid xref reconstruction infinite loops. This is getting very hard to reproduce because
        // qpdf is throwing many fewer exceptions while parsing. Most situations are warnings now.
        throw e;
    }

    // If recovery generates more than 1000 warnings, the file is so severely damaged that there
    // probably is no point trying to continue.
    const auto max_warnings = m->warnings.size() + 1000U;
    auto check_warnings = [this, max_warnings]() {
        if (m->warnings.size() > max_warnings) {
            throw damagedPDF("", 0, "too many errors while reconstructing cross-reference table");
        }
    };

    m->reconstructed_xref = true;
    // We may find more objects, which may contain dangling references.
    m->fixed_dangling_refs = false;

    warn(damagedPDF("", 0, "file is damaged"));
    warn(e);
    warn(damagedPDF("", 0, "Attempting to reconstruct cross-reference table"));

    // Delete all references to type 1 (uncompressed) objects
    std::vector<QPDFObjGen> to_delete;
    for (auto const& iter: m->xref_table) {
        if (iter.second.getType() == 1) {
            to_delete.emplace_back(iter.first);
        }
    }
    for (auto const& iter: to_delete) {
        m->xref_table.erase(iter);
    }

    // (object id, generation, offset of the id token) for every "N G obj" found in the file.
    std::vector<std::tuple<int, int, qpdf_offset_t>> found_objects;
    // File positions just after each "trailer" / "startxref" keyword.
    std::vector<qpdf_offset_t> trailers;
    std::vector<qpdf_offset_t> startxrefs;

    m->file->seek(0, SEEK_END);
    qpdf_offset_t eof = m->file->tell();
    m->file->seek(0, SEEK_SET);
    // Don't allow very long tokens here during recovery. All the interesting tokens are covered.
    static size_t const MAX_LEN = 10;
    // Examine the first token on each line of the file.
    while (m->file->tell() < eof) {
        QPDFTokenizer::Token t1 = readToken(*m->file, MAX_LEN);
        qpdf_offset_t token_start = m->file->tell() - toO(t1.getValue().length());
        if (t1.isInteger()) {
            auto pos = m->file->tell();
            auto t2 = readToken(*m->file, MAX_LEN);
            if (t2.isInteger() && readToken(*m->file, MAX_LEN).isWord("obj")) {
                int obj = QUtil::string_to_int(t1.getValue().c_str());
                int gen = QUtil::string_to_int(t2.getValue().c_str());
                if (obj <= m->xref_table_max_id) {
                    found_objects.emplace_back(obj, gen, token_start);
                } else {
                    warn(damagedPDF(
                        "", 0, "ignoring object with impossibly large id " + std::to_string(obj)));
                }
            }
            // Go back to just after the first integer before skipping to the next line.
            m->file->seek(pos, SEEK_SET);
        } else if (!m->trailer && t1.isWord("trailer")) {
            trailers.emplace_back(m->file->tell());
        } else if (!found_startxref && t1.isWord("startxref")) {
            startxrefs.emplace_back(m->file->tell());
        }
        check_warnings();
        m->file->findAndSkipNextEOL();
    }

    // If the normal search did not find startxref but the scan found one located after the last
    // object, try reading the xref data it points to before falling back to reconstruction.
    if (!found_startxref && !startxrefs.empty() && !found_objects.empty() &&
        startxrefs.back() > std::get<2>(found_objects.back())) {
        try {
            m->file->seek(startxrefs.back(), SEEK_SET);
            if (auto offset = QUtil::string_to_ll(readToken(*m->file).getValue().data())) {
                read_xref(offset);
                if (getRoot().getKey("/Pages").isDictionary()) {
                    QTC::TC("qpdf", "QPDF startxref more than 1024 before end");
                    warn(
                        damagedPDF("", 0, "startxref was more than 1024 bytes before end of file"));
                    initializeEncryption();
                    m->parsed = true;
                    m->reconstructed_xref = false;
                    return;
                }
            }
        } catch (...) {
            // ok, bad luck. Do recovery.
        }
    }

    // Insert the found objects starting with the one found last in the file.
    auto rend = found_objects.rend();
    for (auto it = found_objects.rbegin(); it != rend; it++) {
        auto [obj, gen, token_start] = *it;
        insertXrefEntry(obj, 1, token_start, gen);
        check_warnings();
    }
    m->deleted_objects.clear();

    // Try the trailers starting with the last one in the file; accept the first dictionary that
    // has a /Root key.
    for (auto it = trailers.rbegin(); it != trailers.rend(); it++) {
        m->file->seek(*it, SEEK_SET);
        auto t = readTrailer();
        if (!t.isDictionary()) {
            // Oh well. It was worth a try.
        } else {
            if (t.hasKey("/Root")) {
                m->trailer = t;
                break;
            }
            warn(damagedPDF("trailer", *it, "recovered trailer has no /Root entry"));
        }
        check_warnings();
    }

    if (!m->trailer) {
        qpdf_offset_t max_offset{0};
        size_t max_size{0};
        // If there are any xref streams, take the last one to appear.
        for (auto const& iter: m->xref_table) {
            auto entry = iter.second;
            if (entry.getType() != 1) {
                continue;
            }
            auto oh = getObject(iter.first);
            try {
                if (!oh.isStreamOfType("/XRef")) {
                    continue;
                }
            } catch (std::exception&) {
                continue;
            }
            auto offset = entry.getOffset();
            auto size = oh.getDict().getKey("/Size").getUIntValueAsUInt();
            // NOTE(review): max_size is never assigned in this loop, so it stays 0 and the
            // comparison does not track the largest /Size seen so far. Also, setTrailer() keeps
            // only the first trailer it is given. Confirm whether this is intended.
            if (size > max_size || (size == max_size && offset > max_offset)) {
                max_offset = offset;
                setTrailer(oh.getDict());
            }
            check_warnings();
        }
        if (max_offset > 0) {
            try {
                read_xref(max_offset);
            } catch (std::exception&) {
                warn(damagedPDF(
                    "", 0, "error decoding candidate xref stream while recovering damaged file"));
            }
            QTC::TC("qpdf", "QPDF recover xref stream");
        }
    }

    if (!m->trailer || (!m->parsed && !m->trailer.getKey("/Root").isDictionary())) {
        // Try to find a Root dictionary. As a quick fix try the one with the highest object id.
        QPDFObjectHandle root;
        for (auto const& iter: m->obj_cache) {
            try {
                if (QPDFObjectHandle(iter.second.object).isDictionaryOfType("/Catalog")) {
                    root = iter.second.object;
                }
            } catch (std::exception&) {
                continue;
            }
        }
        if (root) {
            if (!m->trailer) {
                warn(damagedPDF(
                    "", 0, "unable to find trailer dictionary while recovering damaged file"));
                m->trailer = QPDFObjectHandle::newDictionary();
            }
            m->trailer.replaceKey("/Root", root);
        }
    }

    if (!m->trailer) {
        // We could check the last encountered object to see if it was an xref stream. If so, we
        // could try to get the trailer from there. This may make it possible to recover files with
        // bad startxref pointers even when they have object streams.

        throw damagedPDF("", 0, "unable to find trailer dictionary while recovering damaged file");
    }
    if (m->xref_table.empty()) {
        // We cannot check for an empty xref table in parse because empty tables are valid when
        // creating QPDF objects from JSON.
        throw damagedPDF("", 0, "unable to find objects while recovering damaged file");
    }
    check_warnings();
    if (!m->parsed) {
        // parsed is set before collecting the pages and reset if no pages are found.
        m->parsed = true;
        getAllPages();
        check_warnings();
        if (m->all_pages.empty()) {
            m->parsed = false;
            throw damagedPDF("", 0, "unable to find any pages while recovering damaged file");
        }
    }
    // We could iterate through the objects looking for streams and try to find objects inside of
    // them, but it's probably not worth the trouble. Acrobat can't recover files with any errors
    // in an xref stream, and this would be a real long shot anyway. If we wanted to do anything
    // that involved looking at stream contents, we'd also have to call initializeEncryption() here.
    // It's safe to call it more than once.
}
386   -
387   -void
388   -QPDF::read_xref(qpdf_offset_t xref_offset)
389   -{
390   - std::map<int, int> free_table;
391   - std::set<qpdf_offset_t> visited;
392   - while (xref_offset) {
393   - visited.insert(xref_offset);
394   - char buf[7];
395   - memset(buf, 0, sizeof(buf));
396   - m->file->seek(xref_offset, SEEK_SET);
397   - // Some files miss the mark a little with startxref. We could do a better job of searching
398   - // in the neighborhood for something that looks like either an xref table or stream, but the
399   - // simple heuristic of skipping whitespace can help with the xref table case and is harmless
400   - // with the stream case.
401   - bool done = false;
402   - bool skipped_space = false;
403   - while (!done) {
404   - char ch;
405   - if (1 == m->file->read(&ch, 1)) {
406   - if (util::is_space(ch)) {
407   - skipped_space = true;
408   - } else {
409   - m->file->unreadCh(ch);
410   - done = true;
411   - }
412   - } else {
413   - QTC::TC("qpdf", "QPDF eof skipping spaces before xref", skipped_space ? 0 : 1);
414   - done = true;
415   - }
416   - }
417   -
418   - m->file->read(buf, sizeof(buf) - 1);
419   - // The PDF spec says xref must be followed by a line terminator, but files exist in the wild
420   - // where it is terminated by arbitrary whitespace.
421   - if ((strncmp(buf, "xref", 4) == 0) && util::is_space(buf[4])) {
422   - if (skipped_space) {
423   - QTC::TC("qpdf", "QPDF xref skipped space");
424   - warn(damagedPDF("", 0, "extraneous whitespace seen before xref"));
425   - }
426   - QTC::TC(
427   - "qpdf",
428   - "QPDF xref space",
429   - ((buf[4] == '\n') ? 0
430   - : (buf[4] == '\r') ? 1
431   - : (buf[4] == ' ') ? 2
432   - : 9999));
433   - int skip = 4;
434   - // buf is null-terminated, and util::is_space('\0') is false, so this won't overrun.
435   - while (util::is_space(buf[skip])) {
436   - ++skip;
437   - }
438   - xref_offset = read_xrefTable(xref_offset + skip);
439   - } else {
440   - xref_offset = read_xrefStream(xref_offset);
441   - }
442   - if (visited.count(xref_offset) != 0) {
443   - QTC::TC("qpdf", "QPDF xref loop");
444   - throw damagedPDF("", 0, "loop detected following xref tables");
445   - }
446   - }
447   -
448   - if (!m->trailer) {
449   - throw damagedPDF("", 0, "unable to find trailer while reading xref");
450   - }
451   - int size = m->trailer.getKey("/Size").getIntValueAsInt();
452   - int max_obj = 0;
453   - if (!m->xref_table.empty()) {
454   - max_obj = m->xref_table.rbegin()->first.getObj();
455   - }
456   - if (!m->deleted_objects.empty()) {
457   - max_obj = std::max(max_obj, *(m->deleted_objects.rbegin()));
458   - }
459   - if ((size < 1) || (size - 1 != max_obj)) {
460   - QTC::TC("qpdf", "QPDF xref size mismatch");
461   - warn(damagedPDF(
462   - "",
463   - 0,
464   - ("reported number of objects (" + std::to_string(size) +
465   - ") is not one plus the highest object number (" + std::to_string(max_obj) + ")")));
466   - }
467   -
468   - // We no longer need the deleted_objects table, so go ahead and clear it out to make sure we
469   - // never depend on its being set.
470   - m->deleted_objects.clear();
471   -
472   - // Make sure we keep only the highest generation for any object.
473   - QPDFObjGen last_og{-1, 0};
474   - for (auto const& item: m->xref_table) {
475   - auto id = item.first.getObj();
476   - if (id == last_og.getObj() && id > 0) {
477   - removeObject(last_og);
478   - }
479   - last_og = item.first;
480   - }
481   -}
482   -
483   -bool
484   -QPDF::parse_xrefFirst(std::string const& line, int& obj, int& num, int& bytes)
485   -{
486   - // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated
487   - // buffer.
488   - char const* p = line.c_str();
489   - char const* start = line.c_str();
490   -
491   - // Skip zero or more spaces
492   - while (util::is_space(*p)) {
493   - ++p;
494   - }
495   - // Require digit
496   - if (!util::is_digit(*p)) {
497   - return false;
498   - }
499   - // Gather digits
500   - std::string obj_str;
501   - while (util::is_digit(*p)) {
502   - obj_str.append(1, *p++);
503   - }
504   - // Require space
505   - if (!util::is_space(*p)) {
506   - return false;
507   - }
508   - // Skip spaces
509   - while (util::is_space(*p)) {
510   - ++p;
511   - }
512   - // Require digit
513   - if (!util::is_digit(*p)) {
514   - return false;
515   - }
516   - // Gather digits
517   - std::string num_str;
518   - while (util::is_digit(*p)) {
519   - num_str.append(1, *p++);
520   - }
521   - // Skip any space including line terminators
522   - while (util::is_space(*p)) {
523   - ++p;
524   - }
525   - bytes = toI(p - start);
526   - obj = QUtil::string_to_int(obj_str.c_str());
527   - num = QUtil::string_to_int(num_str.c_str());
528   - return true;
529   -}
530   -
// Leniently re-read an xref table entry after the initial read attempt was not accepted.
//
// The file is repositioned at the last offset and up to 30 characters of the line are read. The
// entry must consist of digits, space(s), digits, space(s) and then 'f' or 'n'. Deviations from
// the strict format (leading space, extra spaces between fields, first field not 10 digits or
// second field not 5 digits) are accepted with a warning.
//
// On success returns true and sets `f1` and `f2` to the two numeric fields and `type` to 'f' or
// 'n'. On failure returns false; `f1` and `f2` are left untouched.
bool
QPDF::read_bad_xrefEntry(qpdf_offset_t& f1, int& f2, char& type)
{
    // Reposition after initial read attempt and reread.
    m->file->seek(m->file->getLastOffset(), SEEK_SET);
    auto line = m->file->readLine(30);

    // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated
    // buffer.
    char const* p = line.data();

    // Skip zero or more spaces. There aren't supposed to be any.
    bool invalid = false;
    while (util::is_space(*p)) {
        ++p;
        QTC::TC("qpdf", "QPDF ignore first space in xref entry");
        invalid = true;
    }
    // Require digit
    if (!util::is_digit(*p)) {
        return false;
    }
    // Gather digits
    std::string f1_str;
    while (util::is_digit(*p)) {
        f1_str.append(1, *p++);
    }
    // Require space
    if (!util::is_space(*p)) {
        return false;
    }
    // More than one space between the first and second field is tolerated but flagged.
    if (util::is_space(*(p + 1))) {
        QTC::TC("qpdf", "QPDF ignore first extra space in xref entry");
        invalid = true;
    }
    // Skip spaces
    while (util::is_space(*p)) {
        ++p;
    }
    // Require digit
    if (!util::is_digit(*p)) {
        return false;
    }
    // Gather digits
    std::string f2_str;
    while (util::is_digit(*p)) {
        f2_str.append(1, *p++);
    }
    // Require space
    if (!util::is_space(*p)) {
        return false;
    }
    // More than one space between the second field and the type is tolerated but flagged.
    if (util::is_space(*(p + 1))) {
        QTC::TC("qpdf", "QPDF ignore second extra space in xref entry");
        invalid = true;
    }
    // Skip spaces
    while (util::is_space(*p)) {
        ++p;
    }
    if ((*p == 'f') || (*p == 'n')) {
        type = *p;
    } else {
        return false;
    }
    // The strict format has exactly 10 digits in the first field and 5 in the second.
    if ((f1_str.length() != 10) || (f2_str.length() != 5)) {
        QTC::TC("qpdf", "QPDF ignore length error xref entry");
        invalid = true;
    }

    if (invalid) {
        warn(damagedPDF("xref table", "accepting invalid xref table entry"));
    }

    f1 = QUtil::string_to_ll(f1_str.c_str());
    f2 = QUtil::string_to_int(f2_str.c_str());

    return true;
}
610   -
611   -// Optimistically read and parse xref entry. If entry is bad, call read_bad_xrefEntry and return
612   -// result.
613   -bool
614   -QPDF::read_xrefEntry(qpdf_offset_t& f1, int& f2, char& type)
615   -{
616   - std::array<char, 21> line;
617   - if (m->file->read(line.data(), 20) != 20) {
618   - // C++20: [[unlikely]]
619   - return false;
620   - }
621   - line[20] = '\0';
622   - char const* p = line.data();
623   -
624   - int f1_len = 0;
625   - int f2_len = 0;
626   -
627   - // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated
628   - // buffer.
629   -
630   - // Gather f1 digits. NB No risk of overflow as 9'999'999'999 < max long long.
631   - while (*p == '0') {
632   - ++f1_len;
633   - ++p;
634   - }
635   - while (util::is_digit(*p) && f1_len++ < 10) {
636   - f1 *= 10;
637   - f1 += *p++ - '0';
638   - }
639   - // Require space
640   - if (!util::is_space(*p++)) {
641   - // Entry doesn't start with space or digit.
642   - // C++20: [[unlikely]]
643   - return false;
644   - }
645   - // Gather digits. NB No risk of overflow as 99'999 < max int.
646   - while (*p == '0') {
647   - ++f2_len;
648   - ++p;
649   - }
650   - while (util::is_digit(*p) && f2_len++ < 5) {
651   - f2 *= 10;
652   - f2 += static_cast<int>(*p++ - '0');
653   - }
654   - if (util::is_space(*p++) && (*p == 'f' || *p == 'n')) {
655   - // C++20: [[likely]]
656   - type = *p;
657   - // No test for valid line[19].
658   - if (*(++p) && *(++p) && (*p == '\n' || *p == '\r') && f1_len == 10 && f2_len == 5) {
659   - // C++20: [[likely]]
660   - return true;
661   - }
662   - }
663   - return read_bad_xrefEntry(f1, f2, type);
664   -}
665   -
666   -// Read a single cross-reference table section and associated trailer.
667   -qpdf_offset_t
668   -QPDF::read_xrefTable(qpdf_offset_t xref_offset)
669   -{
670   - m->file->seek(xref_offset, SEEK_SET);
671   - std::string line;
672   - while (true) {
673   - line.assign(50, '\0');
674   - m->file->read(line.data(), line.size());
675   - int obj = 0;
676   - int num = 0;
677   - int bytes = 0;
678   - if (!parse_xrefFirst(line, obj, num, bytes)) {
679   - QTC::TC("qpdf", "QPDF invalid xref");
680   - throw damagedPDF("xref table", "xref syntax invalid");
681   - }
682   - m->file->seek(m->file->getLastOffset() + bytes, SEEK_SET);
683   - for (qpdf_offset_t i = obj; i - num < obj; ++i) {
684   - if (i == 0) {
685   - // This is needed by checkLinearization()
686   - m->first_xref_item_offset = m->file->tell();
687   - }
688   - // For xref_table, these will always be small enough to be ints
689   - qpdf_offset_t f1 = 0;
690   - int f2 = 0;
691   - char type = '\0';
692   - if (!read_xrefEntry(f1, f2, type)) {
693   - QTC::TC("qpdf", "QPDF invalid xref entry");
694   - throw damagedPDF(
695   - "xref table", "invalid xref entry (obj=" + std::to_string(i) + ")");
696   - }
697   - if (type == 'f') {
698   - insertFreeXrefEntry(QPDFObjGen(toI(i), f2));
699   - } else {
700   - insertXrefEntry(toI(i), 1, f1, f2);
701   - }
702   - }
703   - qpdf_offset_t pos = m->file->tell();
704   - if (readToken(*m->file).isWord("trailer")) {
705   - break;
706   - } else {
707   - m->file->seek(pos, SEEK_SET);
708   - }
709   - }
710   -
711   - // Set offset to previous xref table if any
712   - QPDFObjectHandle cur_trailer = readTrailer();
713   - if (!cur_trailer.isDictionary()) {
714   - QTC::TC("qpdf", "QPDF missing trailer");
715   - throw damagedPDF("", "expected trailer dictionary");
716   - }
717   -
718   - if (!m->trailer) {
719   - setTrailer(cur_trailer);
720   -
721   - if (!m->trailer.hasKey("/Size")) {
722   - QTC::TC("qpdf", "QPDF trailer lacks size");
723   - throw damagedPDF("trailer", "trailer dictionary lacks /Size key");
724   - }
725   - if (!m->trailer.getKey("/Size").isInteger()) {
726   - QTC::TC("qpdf", "QPDF trailer size not integer");
727   - throw damagedPDF("trailer", "/Size key in trailer dictionary is not an integer");
728   - }
729   - }
730   -
731   - if (cur_trailer.hasKey("/XRefStm")) {
732   - if (m->ignore_xref_streams) {
733   - QTC::TC("qpdf", "QPDF ignoring XRefStm in trailer");
734   - } else {
735   - if (cur_trailer.getKey("/XRefStm").isInteger()) {
736   - // Read the xref stream but disregard any return value -- we'll use our trailer's
737   - // /Prev key instead of the xref stream's.
738   - (void)read_xrefStream(cur_trailer.getKey("/XRefStm").getIntValue());
739   - } else {
740   - throw damagedPDF("xref stream", xref_offset, "invalid /XRefStm");
741   - }
742   - }
743   - }
744   -
745   - if (cur_trailer.hasKey("/Prev")) {
746   - if (!cur_trailer.getKey("/Prev").isInteger()) {
747   - QTC::TC("qpdf", "QPDF trailer prev not integer");
748   - throw damagedPDF("trailer", "/Prev key in trailer dictionary is not an integer");
749   - }
750   - QTC::TC("qpdf", "QPDF prev key in trailer dictionary");
751   - return cur_trailer.getKey("/Prev").getIntValue();
752   - }
753   -
754   - return 0;
755   -}
756   -
757   -// Read a single cross-reference stream.
758   -qpdf_offset_t
759   -QPDF::read_xrefStream(qpdf_offset_t xref_offset)
760   -{
761   - if (!m->ignore_xref_streams) {
762   - QPDFObjGen x_og;
763   - QPDFObjectHandle xref_obj;
764   - try {
765   - xref_obj =
766   - readObjectAtOffset(false, xref_offset, "xref stream", QPDFObjGen(0, 0), x_og, true);
767   - } catch (QPDFExc&) {
768   - // ignore -- report error below
769   - }
770   - if (xref_obj.isStreamOfType("/XRef")) {
771   - QTC::TC("qpdf", "QPDF found xref stream");
772   - return processXRefStream(xref_offset, xref_obj);
773   - }
774   - }
775   -
776   - QTC::TC("qpdf", "QPDF can't find xref");
777   - throw damagedPDF("", xref_offset, "xref not found");
778   - return 0; // unreachable
779   -}
780   -
781   -// Return the entry size of the xref stream and the processed W array.
782   -std::pair<int, std::array<int, 3>>
783   -QPDF::processXRefW(QPDFObjectHandle& dict, std::function<QPDFExc(std::string_view)> damaged)
784   -{
785   - auto W_obj = dict.getKey("/W");
786   - if (!(W_obj.isArray() && (W_obj.getArrayNItems() >= 3) && W_obj.getArrayItem(0).isInteger() &&
787   - W_obj.getArrayItem(1).isInteger() && W_obj.getArrayItem(2).isInteger())) {
788   - throw damaged("Cross-reference stream does not have a proper /W key");
789   - }
790   -
791   - std::array<int, 3> W;
792   - int entry_size = 0;
793   - auto w_vector = W_obj.getArrayAsVector();
794   - int max_bytes = sizeof(qpdf_offset_t);
795   - for (size_t i = 0; i < 3; ++i) {
796   - W[i] = w_vector[i].getIntValueAsInt();
797   - if (W[i] > max_bytes) {
798   - throw damaged("Cross-reference stream's /W contains impossibly large values");
799   - }
800   - if (W[i] < 0) {
801   - throw damaged("Cross-reference stream's /W contains negative values");
802   - }
803   - entry_size += W[i];
804   - }
805   - if (entry_size == 0) {
806   - throw damaged("Cross-reference stream's /W indicates entry size of 0");
807   - }
808   - return {entry_size, W};
809   -}
810   -
811   -// Validate Size key and return the maximum number of entries that the xref stream can contain.
812   -int
813   -QPDF::processXRefSize(
814   - QPDFObjectHandle& dict, int entry_size, std::function<QPDFExc(std::string_view)> damaged)
815   -{
816   - // Number of entries is limited by the highest possible object id and stream size.
817   - auto max_num_entries = std::numeric_limits<int>::max();
818   - if (max_num_entries > (std::numeric_limits<qpdf_offset_t>::max() / entry_size)) {
819   - max_num_entries = toI(std::numeric_limits<qpdf_offset_t>::max() / entry_size);
820   - }
821   -
822   - auto Size_obj = dict.getKey("/Size");
823   - long long size;
824   - if (!dict.getKey("/Size").getValueAsInt(size)) {
825   - throw damaged("Cross-reference stream does not have a proper /Size key");
826   - } else if (size < 0) {
827   - throw damaged("Cross-reference stream has a negative /Size key");
828   - } else if (size >= max_num_entries) {
829   - throw damaged("Cross-reference stream has an impossibly large /Size key");
830   - }
831   - // We are not validating that Size <= (Size key of parent xref / trailer).
832   - return max_num_entries;
833   -}
834   -
835   -// Return the number of entries of the xref stream and the processed Index array.
836   -std::pair<int, std::vector<std::pair<int, int>>>
837   -QPDF::processXRefIndex(
838   - QPDFObjectHandle& dict, int max_num_entries, std::function<QPDFExc(std::string_view)> damaged)
839   -{
840   - auto size = dict.getKey("/Size").getIntValueAsInt();
841   - auto Index_obj = dict.getKey("/Index");
842   -
843   - if (Index_obj.isArray()) {
844   - std::vector<std::pair<int, int>> indx;
845   - int num_entries = 0;
846   - auto index_vec = Index_obj.getArrayAsVector();
847   - if ((index_vec.size() % 2) || index_vec.size() < 2) {
848   - throw damaged("Cross-reference stream's /Index has an invalid number of values");
849   - }
850   -
851   - int i = 0;
852   - long long first = 0;
853   - for (auto& val: index_vec) {
854   - if (val.isInteger()) {
855   - if (i % 2) {
856   - auto count = val.getIntValue();
857   - if (count <= 0) {
858   - throw damaged(
859   - "Cross-reference stream section claims to contain " +
860   - std::to_string(count) + " entries");
861   - }
862   - // We are guarding against the possibility of num_entries * entry_size
863   - // overflowing. We are not checking that entries are in ascending order as
864   - // required by the spec, which probably should generate a warning. We are also
865   - // not checking that for each subsection first object number + number of entries
866   - // <= /Size. The spec requires us to ignore object number > /Size.
867   - if (first > (max_num_entries - count) ||
868   - count > (max_num_entries - num_entries)) {
869   - throw damaged(
870   - "Cross-reference stream claims to contain too many entries: " +
871   - std::to_string(first) + " " + std::to_string(max_num_entries) + " " +
872   - std::to_string(num_entries));
873   - }
874   - indx.emplace_back(static_cast<int>(first), static_cast<int>(count));
875   - num_entries += static_cast<int>(count);
876   - } else {
877   - first = val.getIntValue();
878   - if (first < 0) {
879   - throw damaged(
880   - "Cross-reference stream's /Index contains a negative object id");
881   - } else if (first > max_num_entries) {
882   - throw damaged(
883   - "Cross-reference stream's /Index contains an impossibly "
884   - "large object id");
885   - }
886   - }
887   - } else {
888   - throw damaged(
889   - "Cross-reference stream's /Index's item " + std::to_string(i) +
890   - " is not an integer");
891   - }
892   - i++;
893   - }
894   - QTC::TC("qpdf", "QPDF xref /Index is array", index_vec.size() == 2 ? 0 : 1);
895   - return {num_entries, indx};
896   - } else if (Index_obj.isNull()) {
897   - QTC::TC("qpdf", "QPDF xref /Index is null");
898   - return {size, {{0, size}}};
899   - } else {
900   - throw damaged("Cross-reference stream does not have a proper /Index key");
901   - }
902   -}
903   -
904   -qpdf_offset_t
905   -QPDF::processXRefStream(qpdf_offset_t xref_offset, QPDFObjectHandle& xref_obj)
906   -{
907   - auto damaged = [this, xref_offset](std::string_view msg) -> QPDFExc {
908   - return damagedPDF("xref stream", xref_offset, msg.data());
909   - };
910   -
911   - auto dict = xref_obj.getDict();
912   -
913   - auto [entry_size, W] = processXRefW(dict, damaged);
914   - int max_num_entries = processXRefSize(dict, entry_size, damaged);
915   - auto [num_entries, indx] = processXRefIndex(dict, max_num_entries, damaged);
916   -
917   - std::shared_ptr<Buffer> bp = xref_obj.getStreamData(qpdf_dl_specialized);
918   - size_t actual_size = bp->getSize();
919   - auto expected_size = toS(entry_size) * toS(num_entries);
920   -
921   - if (expected_size != actual_size) {
922   - QPDFExc x = damaged(
923   - "Cross-reference stream data has the wrong size; expected = " +
924   - std::to_string(expected_size) + "; actual = " + std::to_string(actual_size));
925   - if (expected_size > actual_size) {
926   - throw x;
927   - } else {
928   - warn(x);
929   - }
930   - }
931   -
932   - bool saw_first_compressed_object = false;
933   -
934   - // Actual size vs. expected size check above ensures that we will not overflow any buffers here.
935   - // We know that entry_size * num_entries is less or equal to the size of the buffer.
936   - auto p = bp->getBuffer();
937   - for (auto [obj, sec_entries]: indx) {
938   - // Process a subsection.
939   - for (int i = 0; i < sec_entries; ++i) {
940   - // Read this entry
941   - std::array<qpdf_offset_t, 3> fields{};
942   - if (W[0] == 0) {
943   - QTC::TC("qpdf", "QPDF default for xref stream field 0");
944   - fields[0] = 1;
945   - }
946   - for (size_t j = 0; j < 3; ++j) {
947   - for (int k = 0; k < W[j]; ++k) {
948   - fields[j] <<= 8;
949   - fields[j] |= *p++;
950   - }
951   - }
952   -
953   - // Get the generation number. The generation number is 0 unless this is an uncompressed
954   - // object record, in which case the generation number appears as the third field.
955   - if (saw_first_compressed_object) {
956   - if (fields[0] != 2) {
957   - m->uncompressed_after_compressed = true;
958   - }
959   - } else if (fields[0] == 2) {
960   - saw_first_compressed_object = true;
961   - }
962   - if (obj == 0) {
963   - // This is needed by checkLinearization()
964   - m->first_xref_item_offset = xref_offset;
965   - } else if (fields[0] == 0) {
966   - // Ignore fields[2], which we don't care about in this case. This works around the
967   - // issue of some PDF files that put invalid values, like -1, here for deleted
968   - // objects.
969   - insertFreeXrefEntry(QPDFObjGen(obj, 0));
970   - } else {
971   - insertXrefEntry(obj, toI(fields[0]), fields[1], toI(fields[2]));
972   - }
973   - ++obj;
974   - }
975   - }
976   -
977   - if (!m->trailer) {
978   - setTrailer(dict);
979   - }
980   -
981   - if (dict.hasKey("/Prev")) {
982   - if (!dict.getKey("/Prev").isInteger()) {
983   - throw damagedPDF(
984   - "xref stream", "/Prev key in xref stream dictionary is not an integer");
985   - }
986   - QTC::TC("qpdf", "QPDF prev key in xref stream dictionary");
987   - return dict.getKey("/Prev").getIntValue();
988   - } else {
989   - return 0;
990   - }
991   -}
992   -
993   -void
994   -QPDF::insertXrefEntry(int obj, int f0, qpdf_offset_t f1, int f2)
995   -{
996   - // Populate the xref table in such a way that the first reference to an object that we see,
997   - // which is the one in the latest xref table in which it appears, is the one that gets stored.
998   - // This works because we are reading more recent appends before older ones.
999   -
1000   - // If there is already an entry for this object and generation in the table, it means that a
1001   - // later xref table has registered this object. Disregard this one.
1002   - int new_gen = f0 == 2 ? 0 : f2;
1003   -
1004   - if (!(obj > 0 && obj <= m->xref_table_max_id && 0 <= f2 && new_gen < 65535)) {
1005   - // We are ignoring invalid objgens. Most will arrive here from xref reconstruction. There
1006   - // is probably no point having another warning but we could count invalid items in order to
1007   - // decide when to give up.
1008   - QTC::TC("qpdf", "QPDF xref overwrite invalid objgen");
1009   - // ignore impossibly large object ids or object ids > Size.
1010   - return;
1011   - }
1012   -
1013   - if (m->deleted_objects.count(obj)) {
1014   - QTC::TC("qpdf", "QPDF xref deleted object");
1015   - return;
1016   - }
1017   -
1018   - if (f0 == 2 && static_cast<int>(f1) == obj) {
1019   - warn(damagedPDF("xref stream", "self-referential object stream " + std::to_string(obj)));
1020   - return;
1021   - }
1022   -
1023   - auto [iter, created] = m->xref_table.try_emplace(QPDFObjGen(obj, (f0 == 2 ? 0 : f2)));
1024   - if (!created) {
1025   - QTC::TC("qpdf", "QPDF xref reused object");
1026   - return;
1027   - }
1028   -
1029   - switch (f0) {
1030   - case 1:
1031   - // f2 is generation
1032   - QTC::TC("qpdf", "QPDF xref gen > 0", ((f2 > 0) ? 1 : 0));
1033   - iter->second = QPDFXRefEntry(f1);
1034   - break;
1035   -
1036   - case 2:
1037   - iter->second = QPDFXRefEntry(toI(f1), f2);
1038   - break;
1039   -
1040   - default:
1041   - throw damagedPDF("xref stream", "unknown xref stream entry type " + std::to_string(f0));
1042   - break;
1043   - }
1044   -}
1045   -
1046   -void
1047   -QPDF::insertFreeXrefEntry(QPDFObjGen og)
1048   -{
1049   - if (!m->xref_table.count(og)) {
1050   - m->deleted_objects.insert(og.getObj());
1051   - }
1052   -}
1053   -
1054   -void
1055   -QPDF::showXRefTable()
1056   -{
1057   - auto& cout = *m->log->getInfo();
1058   - for (auto const& iter: m->xref_table) {
1059   - QPDFObjGen const& og = iter.first;
1060   - QPDFXRefEntry const& entry = iter.second;
1061   - cout << og.unparse('/') << ": ";
1062   - switch (entry.getType()) {
1063   - case 1:
1064   - cout << "uncompressed; offset = " << entry.getOffset();
1065   - break;
1066   -
1067   - case 2:
1068   - *m->log->getInfo() << "compressed; stream = " << entry.getObjStreamNumber()
1069   - << ", index = " << entry.getObjStreamIndex();
1070   - break;
1071   -
1072   - default:
1073   - throw std::logic_error("unknown cross-reference table type while showing xref_table");
1074   - break;
1075   - }
1076   - m->log->info("\n");
1077   - }
1078   -}
1079   -
1080   -// Resolve all objects in the xref table. If this triggers a xref table reconstruction abort and
1081   -// return false. Otherwise return true.
1082   -bool
1083   -QPDF::resolveXRefTable()
1084   -{
1085   - bool may_change = !m->reconstructed_xref;
1086   - for (auto& iter: m->xref_table) {
1087   - if (isUnresolved(iter.first)) {
1088   - resolve(iter.first);
1089   - if (may_change && m->reconstructed_xref) {
1090   - return false;
1091   - }
1092   - }
1093   - }
1094   - return true;
1095   -}
1096   -
1097   -// Ensure all objects in the pdf file, including those in indirect references, appear in the object
1098   -// cache.
1099   -void
1100   -QPDF::fixDanglingReferences(bool force)
1101   -{
1102   - if (m->fixed_dangling_refs) {
1103   - return;
1104   - }
1105   - if (!resolveXRefTable()) {
1106   - QTC::TC("qpdf", "QPDF fix dangling triggered xref reconstruction");
1107   - resolveXRefTable();
1108   - }
1109   - m->fixed_dangling_refs = true;
1110   -}
1111   -
1112   -size_t
1113   -QPDF::getObjectCount()
1114   -{
1115   - // This method returns the next available indirect object number. makeIndirectObject uses it for
1116   - // this purpose. After fixDanglingReferences is called, all objects in the xref table will also
1117   - // be in obj_cache.
1118   - fixDanglingReferences();
1119   - QPDFObjGen og;
1120   - if (!m->obj_cache.empty()) {
1121   - og = (*(m->obj_cache.rbegin())).first;
1122   - }
1123   - return toS(og.getObj());
1124   -}
1125   -
1126   -std::vector<QPDFObjectHandle>
1127   -QPDF::getAllObjects()
1128   -{
1129   - // After fixDanglingReferences is called, all objects are in the object cache.
1130   - fixDanglingReferences();
1131   - std::vector<QPDFObjectHandle> result;
1132   - for (auto const& iter: m->obj_cache) {
1133   - result.push_back(newIndirect(iter.first, iter.second.object));
1134   - }
1135   - return result;
1136   -}
1137   -
1138   -void
1139   -QPDF::setLastObjectDescription(std::string const& description, QPDFObjGen og)
1140   -{
1141   - m->last_object_description.clear();
1142   - if (!description.empty()) {
1143   - m->last_object_description += description;
1144   - if (og.isIndirect()) {
1145   - m->last_object_description += ": ";
1146   - }
1147   - }
1148   - if (og.isIndirect()) {
1149   - m->last_object_description += "object " + og.unparse(' ');
1150   - }
1151   -}
1152   -
1153   -QPDFObjectHandle
1154   -QPDF::readTrailer()
1155   -{
1156   - qpdf_offset_t offset = m->file->tell();
1157   - bool empty = false;
1158   - auto object =
1159   - QPDFParser(*m->file, "trailer", m->tokenizer, nullptr, this, true).parse(empty, false);
1160   - if (empty) {
1161   - // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in
1162   - // actual PDF files and Adobe Reader appears to ignore them.
1163   - warn(damagedPDF("trailer", "empty object treated as null"));
1164   - } else if (object.isDictionary() && readToken(*m->file).isWord("stream")) {
1165   - warn(damagedPDF("trailer", m->file->tell(), "stream keyword found in trailer"));
1166   - }
1167   - // Override last_offset so that it points to the beginning of the object we just read
1168   - m->file->setLastOffset(offset);
1169   - return object;
1170   -}
1171   -
1172   -QPDFObjectHandle
1173   -QPDF::readObject(std::string const& description, QPDFObjGen og)
1174   -{
1175   - setLastObjectDescription(description, og);
1176   - qpdf_offset_t offset = m->file->tell();
1177   - bool empty = false;
1178   -
1179   - StringDecrypter decrypter{this, og};
1180   - StringDecrypter* decrypter_ptr = m->encp->encrypted ? &decrypter : nullptr;
1181   - auto object =
1182   - QPDFParser(*m->file, m->last_object_description, m->tokenizer, decrypter_ptr, this, true)
1183   - .parse(empty, false);
1184   - if (empty) {
1185   - // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in
1186   - // actual PDF files and Adobe Reader appears to ignore them.
1187   - warn(damagedPDF(*m->file, m->file->getLastOffset(), "empty object treated as null"));
1188   - return object;
1189   - }
1190   - auto token = readToken(*m->file);
1191   - if (object.isDictionary() && token.isWord("stream")) {
1192   - readStream(object, og, offset);
1193   - token = readToken(*m->file);
1194   - }
1195   - if (!token.isWord("endobj")) {
1196   - QTC::TC("qpdf", "QPDF err expected endobj");
1197   - warn(damagedPDF("expected endobj"));
1198   - }
1199   - return object;
1200   -}
1201   -
1202   -// After reading stream dictionary and stream keyword, read rest of stream.
1203   -void
1204   -QPDF::readStream(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset)
1205   -{
1206   - validateStreamLineEnd(object, og, offset);
1207   -
1208   - // Must get offset before accessing any additional objects since resolving a previously
1209   - // unresolved indirect object will change file position.
1210   - qpdf_offset_t stream_offset = m->file->tell();
1211   - size_t length = 0;
1212   -
1213   - try {
1214   - auto length_obj = object.getKey("/Length");
1215   -
1216   - if (!length_obj.isInteger()) {
1217   - if (length_obj.isNull()) {
1218   - QTC::TC("qpdf", "QPDF stream without length");
1219   - throw damagedPDF(offset, "stream dictionary lacks /Length key");
1220   - }
1221   - QTC::TC("qpdf", "QPDF stream length not integer");
1222   - throw damagedPDF(offset, "/Length key in stream dictionary is not an integer");
1223   - }
1224   -
1225   - length = toS(length_obj.getUIntValue());
1226   - // Seek in two steps to avoid potential integer overflow
1227   - m->file->seek(stream_offset, SEEK_SET);
1228   - m->file->seek(toO(length), SEEK_CUR);
1229   - if (!readToken(*m->file).isWord("endstream")) {
1230   - QTC::TC("qpdf", "QPDF missing endstream");
1231   - throw damagedPDF("expected endstream");
1232   - }
1233   - } catch (QPDFExc& e) {
1234   - if (m->attempt_recovery) {
1235   - warn(e);
1236   - length = recoverStreamLength(m->file, og, stream_offset);
1237   - } else {
1238   - throw;
1239   - }
1240   - }
1241   - object = QPDFObjectHandle(qpdf::Stream(*this, og, object, stream_offset, length));
1242   -}
1243   -
1244   -void
1245   -QPDF::validateStreamLineEnd(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset)
1246   -{
1247   - // The PDF specification states that the word "stream" should be followed by either a carriage
1248   - // return and a newline or by a newline alone. It specifically disallowed following it by a
1249   - // carriage return alone since, in that case, there would be no way to tell whether the NL in a
1250   - // CR NL sequence was part of the stream data. However, some readers, including Adobe reader,
1251   - // accept a carriage return by itself when followed by a non-newline character, so that's what
1252   - // we do here. We have also seen files that have extraneous whitespace between the stream
1253   - // keyword and the newline.
1254   - while (true) {
1255   - char ch;
1256   - if (m->file->read(&ch, 1) == 0) {
1257   - // A premature EOF here will result in some other problem that will get reported at
1258   - // another time.
1259   - return;
1260   - }
1261   - if (ch == '\n') {
1262   - // ready to read stream data
1263   - QTC::TC("qpdf", "QPDF stream with NL only");
1264   - return;
1265   - }
1266   - if (ch == '\r') {
1267   - // Read another character
1268   - if (m->file->read(&ch, 1) != 0) {
1269   - if (ch == '\n') {
1270   - // Ready to read stream data
1271   - QTC::TC("qpdf", "QPDF stream with CRNL");
1272   - } else {
1273   - // Treat the \r by itself as the whitespace after endstream and start reading
1274   - // stream data in spite of not having seen a newline.
1275   - QTC::TC("qpdf", "QPDF stream with CR only");
1276   - m->file->unreadCh(ch);
1277   - warn(damagedPDF(
1278   - m->file->tell(), "stream keyword followed by carriage return only"));
1279   - }
1280   - }
1281   - return;
1282   - }
1283   - if (!util::is_space(ch)) {
1284   - QTC::TC("qpdf", "QPDF stream without newline");
1285   - m->file->unreadCh(ch);
1286   - warn(damagedPDF(
1287   - m->file->tell(), "stream keyword not followed by proper line terminator"));
1288   - return;
1289   - }
1290   - warn(damagedPDF(m->file->tell(), "stream keyword followed by extraneous whitespace"));
1291   - }
1292   -}
1293   -
1294   -QPDFObjectHandle
1295   -QPDF::readObjectInStream(std::shared_ptr<InputSource>& input, int obj)
1296   -{
1297   - m->last_object_description.erase(7); // last_object_description starts with "object "
1298   - m->last_object_description += std::to_string(obj);
1299   - m->last_object_description += " 0";
1300   -
1301   - bool empty = false;
1302   - auto object = QPDFParser(*input, m->last_object_description, m->tokenizer, nullptr, this, true)
1303   - .parse(empty, false);
1304   - if (empty) {
1305   - // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in
1306   - // actual PDF files and Adobe Reader appears to ignore them.
1307   - warn(damagedPDF(*input, input->getLastOffset(), "empty object treated as null"));
1308   - }
1309   - return object;
1310   -}
1311   -
1312   -bool
1313   -QPDF::findEndstream()
1314   -{
1315   - // Find endstream or endobj. Position the input at that token.
1316   - auto t = readToken(*m->file, 20);
1317   - if (t.isWord("endobj") || t.isWord("endstream")) {
1318   - m->file->seek(m->file->getLastOffset(), SEEK_SET);
1319   - return true;
1320   - }
1321   - return false;
1322   -}
1323   -
1324   -size_t
1325   -QPDF::recoverStreamLength(
1326   - std::shared_ptr<InputSource> input, QPDFObjGen og, qpdf_offset_t stream_offset)
1327   -{
1328   - // Try to reconstruct stream length by looking for endstream or endobj
1329   - warn(damagedPDF(*input, stream_offset, "attempting to recover stream length"));
1330   -
1331   - PatternFinder ef(*this, &QPDF::findEndstream);
1332   - size_t length = 0;
1333   - if (m->file->findFirst("end", stream_offset, 0, ef)) {
1334   - length = toS(m->file->tell() - stream_offset);
1335   - // Reread endstream but, if it was endobj, don't skip that.
1336   - QPDFTokenizer::Token t = readToken(*m->file);
1337   - if (t.getValue() == "endobj") {
1338   - m->file->seek(m->file->getLastOffset(), SEEK_SET);
1339   - }
1340   - }
1341   -
1342   - if (length) {
1343   - auto end = stream_offset + toO(length);
1344   - qpdf_offset_t found_offset = 0;
1345   - QPDFObjGen found_og;
1346   -
1347   - // Make sure this is inside this object
1348   - for (auto const& [current_og, entry]: m->xref_table) {
1349   - if (entry.getType() == 1) {
1350   - qpdf_offset_t obj_offset = entry.getOffset();
1351   - if (found_offset < obj_offset && obj_offset < end) {
1352   - found_offset = obj_offset;
1353   - found_og = current_og;
1354   - }
1355   - }
1356   - }
1357   - if (!found_offset || found_og == og) {
1358   - // If we are trying to recover an XRef stream the xref table will not contain and
1359   - // won't contain any entries, therefore we cannot check the found length. Otherwise we
1360   - // found endstream\nendobj within the space allowed for this object, so we're probably
1361   - // in good shape.
1362   - } else {
1363   - QTC::TC("qpdf", "QPDF found wrong endstream in recovery");
1364   - length = 0;
1365   - }
1366   - }
1367   -
1368   - if (length == 0) {
1369   - warn(damagedPDF(
1370   - *input, stream_offset, "unable to recover stream data; treating stream as empty"));
1371   - } else {
1372   - warn(damagedPDF(
1373   - *input, stream_offset, "recovered stream length: " + std::to_string(length)));
1374   - }
1375   -
1376   - QTC::TC("qpdf", "QPDF recovered stream length");
1377   - return length;
1378   -}
1379   -
1380   -QPDFTokenizer::Token
1381   -QPDF::readToken(InputSource& input, size_t max_len)
1382   -{
1383   - return m->tokenizer.readToken(input, m->last_object_description, true, max_len);
1384   -}
1385   -
1386   -QPDFObjectHandle
1387   -QPDF::readObjectAtOffset(
1388   - bool try_recovery,
1389   - qpdf_offset_t offset,
1390   - std::string const& description,
1391   - QPDFObjGen exp_og,
1392   - QPDFObjGen& og,
1393   - bool skip_cache_if_in_xref)
1394   -{
1395   - bool check_og = true;
1396   - if (exp_og.getObj() == 0) {
1397   - // This method uses an expect object ID of 0 to indicate that we don't know or don't care
1398   - // what the actual object ID is at this offset. This is true when we read the xref stream
1399   - // and linearization hint streams. In this case, we don't verify the expect object
1400   - // ID/generation against what was read from the file. There is also no reason to attempt
1401   - // xref recovery if we get a failure in this case since the read attempt was not triggered
1402   - // by an xref lookup.
1403   - check_og = false;
1404   - try_recovery = false;
1405   - }
1406   - setLastObjectDescription(description, exp_og);
1407   -
1408   - if (!m->attempt_recovery) {
1409   - try_recovery = false;
1410   - }
1411   -
1412   - // Special case: if offset is 0, just return null. Some PDF writers, in particular
1413   - // "Mac OS X 10.7.5 Quartz PDFContext", may store deleted objects in the xref table as
1414   - // "0000000000 00000 n", which is not correct, but it won't hurt anything for us to ignore
1415   - // these.
1416   - if (offset == 0) {
1417   - QTC::TC("qpdf", "QPDF bogus 0 offset", 0);
1418   - warn(damagedPDF(0, "object has offset 0"));
1419   - return QPDFObjectHandle::newNull();
1420   - }
1421   -
1422   - m->file->seek(offset, SEEK_SET);
1423   - try {
1424   - QPDFTokenizer::Token tobjid = readToken(*m->file);
1425   - bool objidok = tobjid.isInteger();
1426   - QTC::TC("qpdf", "QPDF check objid", objidok ? 1 : 0);
1427   - if (!objidok) {
1428   - QTC::TC("qpdf", "QPDF expected n n obj");
1429   - throw damagedPDF(offset, "expected n n obj");
1430   - }
1431   - QPDFTokenizer::Token tgen = readToken(*m->file);
1432   - bool genok = tgen.isInteger();
1433   - QTC::TC("qpdf", "QPDF check generation", genok ? 1 : 0);
1434   - if (!genok) {
1435   - throw damagedPDF(offset, "expected n n obj");
1436   - }
1437   - QPDFTokenizer::Token tobj = readToken(*m->file);
1438   -
1439   - bool objok = tobj.isWord("obj");
1440   - QTC::TC("qpdf", "QPDF check obj", objok ? 1 : 0);
1441   -
1442   - if (!objok) {
1443   - throw damagedPDF(offset, "expected n n obj");
1444   - }
1445   - int objid = QUtil::string_to_int(tobjid.getValue().c_str());
1446   - int generation = QUtil::string_to_int(tgen.getValue().c_str());
1447   - og = QPDFObjGen(objid, generation);
1448   - if (objid == 0) {
1449   - QTC::TC("qpdf", "QPDF object id 0");
1450   - throw damagedPDF(offset, "object with ID 0");
1451   - }
1452   - if (check_og && (exp_og != og)) {
1453   - QTC::TC("qpdf", "QPDF err wrong objid/generation");
1454   - QPDFExc e = damagedPDF(offset, "expected " + exp_og.unparse(' ') + " obj");
1455   - if (try_recovery) {
1456   - // Will be retried below
1457   - throw e;
1458   - } else {
1459   - // We can try reading the object anyway even if the ID doesn't match.
1460   - warn(e);
1461   - }
1462   - }
1463   - } catch (QPDFExc& e) {
1464   - if (try_recovery) {
1465   - // Try again after reconstructing xref table
1466   - reconstruct_xref(e);
1467   - if (m->xref_table.count(exp_og) && (m->xref_table[exp_og].getType() == 1)) {
1468   - qpdf_offset_t new_offset = m->xref_table[exp_og].getOffset();
1469   - QPDFObjectHandle result =
1470   - readObjectAtOffset(false, new_offset, description, exp_og, og, false);
1471   - QTC::TC("qpdf", "QPDF recovered in readObjectAtOffset");
1472   - return result;
1473   - } else {
1474   - QTC::TC("qpdf", "QPDF object gone after xref reconstruction");
1475   - warn(damagedPDF(
1476   - "",
1477   - 0,
1478   - ("object " + exp_og.unparse(' ') +
1479   - " not found in file after regenerating cross reference "
1480   - "table")));
1481   - return QPDFObjectHandle::newNull();
1482   - }
1483   - } else {
1484   - throw;
1485   - }
1486   - }
1487   -
1488   - QPDFObjectHandle oh = readObject(description, og);
1489   -
1490   - if (isUnresolved(og)) {
1491   - // Store the object in the cache here so it gets cached whether we first know the offset or
1492   - // whether we first know the object ID and generation (in which we case we would get here
1493   - // through resolve).
1494   -
1495   - // Determine the end offset of this object before and after white space. We use these
1496   - // numbers to validate linearization hint tables. Offsets and lengths of objects may imply
1497   - // the end of an object to be anywhere between these values.
1498   - qpdf_offset_t end_before_space = m->file->tell();
1499   -
1500   - // skip over spaces
1501   - while (true) {
1502   - char ch;
1503   - if (m->file->read(&ch, 1)) {
1504   - if (!isspace(static_cast<unsigned char>(ch))) {
1505   - m->file->seek(-1, SEEK_CUR);
1506   - break;
1507   - }
1508   - } else {
1509   - throw damagedPDF(m->file->tell(), "EOF after endobj");
1510   - }
1511   - }
1512   - qpdf_offset_t end_after_space = m->file->tell();
1513   - if (skip_cache_if_in_xref && m->xref_table.count(og)) {
1514   - // Ordinarily, an object gets read here when resolved through xref table or stream. In
1515   - // the special case of the xref stream and linearization hint tables, the offset comes
1516   - // from another source. For the specific case of xref streams, the xref stream is read
1517   - // and loaded into the object cache very early in parsing. Ordinarily, when a file is
1518   - // updated by appending, items inserted into the xref table in later updates take
1519   - // precedence over earlier items. In the special case of reusing the object number
1520   - // previously used as the xref stream, we have the following order of events:
1521   - //
1522   - // * reused object gets loaded into the xref table
1523   - // * old object is read here while reading xref streams
1524   - // * original xref entry is ignored (since already in xref table)
1525   - //
1526   - // It is the second step that causes a problem. Even though the xref table is correct in
1527   - // this case, the old object is already in the cache and so effectively prevails over
1528   - // the reused object. To work around this issue, we have a special case for the xref
1529   - // stream (via the skip_cache_if_in_xref): if the object is already in the xref stream,
1530   - // don't cache what we read here.
1531   - //
1532   - // It is likely that the same bug may exist for linearization hint tables, but the
1533   - // existing code uses end_before_space and end_after_space from the cache, so fixing
1534   - // that would require more significant rework. The chances of a linearization hint
1535   - // stream being reused seems smaller because the xref stream is probably the highest
1536   - // object in the file and the linearization hint stream would be some random place in
1537   - // the middle, so I'm leaving that bug unfixed for now. If the bug were to be fixed, we
1538   - // could use !check_og in place of skip_cache_if_in_xref.
1539   - QTC::TC("qpdf", "QPDF skipping cache for known unchecked object");
1540   - } else {
1541   - updateCache(og, oh.getObj(), end_before_space, end_after_space);
1542   - }
1543   - }
1544   -
1545   - return oh;
1546   -}
1547   -
1548   -std::shared_ptr<QPDFObject> const&
1549   -QPDF::resolve(QPDFObjGen og)
1550   -{
1551   - if (!isUnresolved(og)) {
1552   - return m->obj_cache[og].object;
1553   - }
1554   -
1555   - if (m->resolving.count(og)) {
1556   - // This can happen if an object references itself directly or indirectly in some key that
1557   - // has to be resolved during object parsing, such as stream length.
1558   - QTC::TC("qpdf", "QPDF recursion loop in resolve");
1559   - warn(damagedPDF("", "loop detected resolving object " + og.unparse(' ')));
1560   - updateCache(og, QPDFObject::create<QPDF_Null>(), -1, -1);
1561   - return m->obj_cache[og].object;
1562   - }
1563   - ResolveRecorder rr(this, og);
1564   -
1565   - if (m->xref_table.count(og) != 0) {
1566   - QPDFXRefEntry const& entry = m->xref_table[og];
1567   - try {
1568   - switch (entry.getType()) {
1569   - case 1:
1570   - {
1571   - qpdf_offset_t offset = entry.getOffset();
1572   - // Object stored in cache by readObjectAtOffset
1573   - QPDFObjGen a_og;
1574   - QPDFObjectHandle oh = readObjectAtOffset(true, offset, "", og, a_og, false);
1575   - }
1576   - break;
1577   -
1578   - case 2:
1579   - resolveObjectsInStream(entry.getObjStreamNumber());
1580   - break;
1581   -
1582   - default:
1583   - throw damagedPDF(
1584   - "", 0, ("object " + og.unparse('/') + " has unexpected xref entry type"));
1585   - }
1586   - } catch (QPDFExc& e) {
1587   - warn(e);
1588   - } catch (std::exception& e) {
1589   - warn(damagedPDF(
1590   - "", 0, ("object " + og.unparse('/') + ": error reading object: " + e.what())));
1591   - }
1592   - }
1593   -
1594   - if (isUnresolved(og)) {
1595   - // PDF spec says unknown objects resolve to the null object.
1596   - QTC::TC("qpdf", "QPDF resolve failure to null");
1597   - updateCache(og, QPDFObject::create<QPDF_Null>(), -1, -1);
1598   - }
1599   -
1600   - auto& result(m->obj_cache[og].object);
1601   - result->setDefaultDescription(this, og);
1602   - return result;
1603   -}
1604   -
1605   -void
1606   -QPDF::resolveObjectsInStream(int obj_stream_number)
1607   -{
1608   - if (m->resolved_object_streams.count(obj_stream_number)) {
1609   - return;
1610   - }
1611   - m->resolved_object_streams.insert(obj_stream_number);
1612   - // Force resolution of object stream
1613   - QPDFObjectHandle obj_stream = getObjectByID(obj_stream_number, 0);
1614   - if (!obj_stream.isStream()) {
1615   - throw damagedPDF(
1616   - "supposed object stream " + std::to_string(obj_stream_number) + " is not a stream");
1617   - }
1618   -
1619   - // For linearization data in the object, use the data from the object stream for the objects in
1620   - // the stream.
1621   - QPDFObjGen stream_og(obj_stream_number, 0);
1622   - qpdf_offset_t end_before_space = m->obj_cache[stream_og].end_before_space;
1623   - qpdf_offset_t end_after_space = m->obj_cache[stream_og].end_after_space;
1624   -
1625   - QPDFObjectHandle dict = obj_stream.getDict();
1626   - if (!dict.isDictionaryOfType("/ObjStm")) {
1627   - QTC::TC("qpdf", "QPDF ERR object stream with wrong type");
1628   - warn(damagedPDF(
1629   - "supposed object stream " + std::to_string(obj_stream_number) + " has wrong type"));
1630   - }
1631   -
1632   - if (!(dict.getKey("/N").isInteger() && dict.getKey("/First").isInteger())) {
1633   - throw damagedPDF(
1634   - ("object stream " + std::to_string(obj_stream_number) + " has incorrect keys"));
1635   - }
1636   -
1637   - int n = dict.getKey("/N").getIntValueAsInt();
1638   - int first = dict.getKey("/First").getIntValueAsInt();
1639   -
1640   - std::map<int, int> offsets;
1641   -
1642   - std::shared_ptr<Buffer> bp = obj_stream.getStreamData(qpdf_dl_specialized);
1643   - auto input = std::shared_ptr<InputSource>(
1644   - // line-break
1645   - new BufferInputSource(
1646   - (m->file->getName() + " object stream " + std::to_string(obj_stream_number)),
1647   - bp.get()));
1648   -
1649   - long long last_offset = -1;
1650   - for (int i = 0; i < n; ++i) {
1651   - QPDFTokenizer::Token tnum = readToken(*input);
1652   - QPDFTokenizer::Token toffset = readToken(*input);
1653   - if (!(tnum.isInteger() && toffset.isInteger())) {
1654   - throw damagedPDF(
1655   - *input,
1656   - m->last_object_description,
1657   - input->getLastOffset(),
1658   - "expected integer in object stream header");
1659   - }
1660   -
1661   - int num = QUtil::string_to_int(tnum.getValue().c_str());
1662   - long long offset = QUtil::string_to_int(toffset.getValue().c_str());
1663   -
1664   - if (num == obj_stream_number) {
1665   - QTC::TC("qpdf", "QPDF ignore self-referential object stream");
1666   - warn(damagedPDF(
1667   - *input,
1668   - m->last_object_description,
1669   - input->getLastOffset(),
1670   - "object stream claims to contain itself"));
1671   - continue;
1672   - }
1673   -
1674   - if (num < 1) {
1675   - QTC::TC("qpdf", "QPDF object stream contains id < 1");
1676   - warn(damagedPDF(
1677   - *input,
1678   - "object "s + std::to_string(num) + " 0, offset " + std::to_string(offset),
1679   - 0,
1680   - "object id is invalid"s));
1681   - continue;
1682   - }
1683   -
1684   - if (offset <= last_offset) {
1685   - QTC::TC("qpdf", "QPDF object stream offsets not increasing");
1686   - warn(damagedPDF(
1687   - *input,
1688   - "object "s + std::to_string(num) + " 0, offset " + std::to_string(offset),
1689   - 0,
1690   - "offset is invalid (must be larger than previous offset " +
1691   - std::to_string(last_offset) + ")"));
1692   - continue;
1693   - }
1694   - last_offset = offset;
1695   -
1696   - if (num > m->xref_table_max_id) {
1697   - continue;
1698   - }
1699   -
1700   - offsets[num] = toI(offset + first);
1701   - }
1702   -
1703   - // To avoid having to read the object stream multiple times, store all objects that would be
1704   - // found here in the cache. Remember that some objects stored here might have been overridden
1705   - // by new objects appended to the file, so it is necessary to recheck the xref table and only
1706   - // cache what would actually be resolved here.
1707   - m->last_object_description.clear();
1708   - m->last_object_description += "object ";
1709   - for (auto const& iter: offsets) {
1710   - QPDFObjGen og(iter.first, 0);
1711   - auto entry = m->xref_table.find(og);
1712   - if (entry != m->xref_table.end() && entry->second.getType() == 2 &&
1713   - entry->second.getObjStreamNumber() == obj_stream_number) {
1714   - int offset = iter.second;
1715   - input->seek(offset, SEEK_SET);
1716   - QPDFObjectHandle oh = readObjectInStream(input, iter.first);
1717   - updateCache(og, oh.getObj(), end_before_space, end_after_space);
1718   - } else {
1719   - QTC::TC("qpdf", "QPDF not caching overridden objstm object");
1720   - }
1721   - }
1722   -}
1723   -
1724   -QPDFObjectHandle
1725   -QPDF::newIndirect(QPDFObjGen og, std::shared_ptr<QPDFObject> const& obj)
1726   -{
1727   - obj->setDefaultDescription(this, og);
1728   - return {obj};
1729   -}
1730   -
1731   -void
1732   -QPDF::updateCache(
1733   - QPDFObjGen og,
1734   - std::shared_ptr<QPDFObject> const& object,
1735   - qpdf_offset_t end_before_space,
1736   - qpdf_offset_t end_after_space,
1737   - bool destroy)
1738   -{
1739   - object->setObjGen(this, og);
1740   - if (isCached(og)) {
1741   - auto& cache = m->obj_cache[og];
1742   - object->move_to(cache.object, destroy);
1743   - cache.end_before_space = end_before_space;
1744   - cache.end_after_space = end_after_space;
1745   - } else {
1746   - m->obj_cache[og] = ObjCache(object, end_before_space, end_after_space);
1747   - }
1748   -}
1749   -
1750   -bool
1751   -QPDF::isCached(QPDFObjGen og)
1752   -{
1753   - return m->obj_cache.count(og) != 0;
1754   -}
1755   -
1756   -bool
1757   -QPDF::isUnresolved(QPDFObjGen og)
1758   -{
1759   - return !isCached(og) || m->obj_cache[og].object->isUnresolved();
1760   -}
1761   -
1762   -QPDFObjGen
1763   -QPDF::nextObjGen()
1764   -{
1765   - int max_objid = toI(getObjectCount());
1766   - if (max_objid == std::numeric_limits<int>::max()) {
1767   - throw std::range_error("max object id is too high to create new objects");
1768   - }
1769   - return QPDFObjGen(max_objid + 1, 0);
1770   -}
1771   -
1772   -QPDFObjectHandle
1773   -QPDF::makeIndirectFromQPDFObject(std::shared_ptr<QPDFObject> const& obj)
1774   -{
1775   - QPDFObjGen next{nextObjGen()};
1776   - m->obj_cache[next] = ObjCache(obj, -1, -1);
1777   - return newIndirect(next, m->obj_cache[next].object);
1778   -}
1779   -
1780   -QPDFObjectHandle
1781   -QPDF::makeIndirectObject(QPDFObjectHandle oh)
1782   -{
1783   - if (!oh) {
1784   - throw std::logic_error("attempted to make an uninitialized QPDFObjectHandle indirect");
1785   - }
1786   - return makeIndirectFromQPDFObject(oh.getObj());
1787   -}
1788   -
1789   -std::shared_ptr<QPDFObject>
1790   -QPDF::getObjectForParser(int id, int gen, bool parse_pdf)
1791   -{
1792   - // This method is called by the parser and therefore must not resolve any objects.
1793   - auto og = QPDFObjGen(id, gen);
1794   - if (auto iter = m->obj_cache.find(og); iter != m->obj_cache.end()) {
1795   - return iter->second.object;
1796   - }
1797   - if (m->xref_table.count(og) || !m->parsed) {
1798   - return m->obj_cache.insert({og, QPDFObject::create<QPDF_Unresolved>(this, og)})
1799   - .first->second.object;
1800   - }
1801   - if (parse_pdf) {
1802   - return QPDFObject::create<QPDF_Null>();
1803   - }
1804   - return m->obj_cache.insert({og, QPDFObject::create<QPDF_Null>(this, og)}).first->second.object;
1805   -}
1806   -
1807   -std::shared_ptr<QPDFObject>
1808   -QPDF::getObjectForJSON(int id, int gen)
1809   -{
1810   - auto og = QPDFObjGen(id, gen);
1811   - auto [it, inserted] = m->obj_cache.try_emplace(og);
1812   - auto& obj = it->second.object;
1813   - if (inserted) {
1814   - obj = (m->parsed && !m->xref_table.count(og))
1815   - ? QPDFObject::create<QPDF_Null>(this, og)
1816   - : QPDFObject::create<QPDF_Unresolved>(this, og);
1817   - }
1818   - return obj;
1819   -}
1820   -
1821   -QPDFObjectHandle
1822   -QPDF::getObject(QPDFObjGen og)
1823   -{
1824   - if (auto it = m->obj_cache.find(og); it != m->obj_cache.end()) {
1825   - return {it->second.object};
1826   - } else if (m->parsed && !m->xref_table.count(og)) {
1827   - return QPDFObject::create<QPDF_Null>();
1828   - } else {
1829   - auto result =
1830   - m->obj_cache.try_emplace(og, QPDFObject::create<QPDF_Unresolved>(this, og), -1, -1);
1831   - return {result.first->second.object};
1832   - }
1833   -}
1834   -
1835   -void
1836   -QPDF::replaceObject(int objid, int generation, QPDFObjectHandle oh)
1837   -{
1838   - replaceObject(QPDFObjGen(objid, generation), oh);
1839   -}
1840   -
1841   -void
1842   -QPDF::replaceObject(QPDFObjGen og, QPDFObjectHandle oh)
1843   -{
1844   - if (!oh || (oh.isIndirect() && !(oh.isStream() && oh.getObjGen() == og))) {
1845   - QTC::TC("qpdf", "QPDF replaceObject called with indirect object");
1846   - throw std::logic_error("QPDF::replaceObject called with indirect object handle");
1847   - }
1848   - updateCache(og, oh.getObj(), -1, -1, false);
1849   -}
1850   -
1851   -void
1852   -QPDF::removeObject(QPDFObjGen og)
1853   -{
1854   - m->xref_table.erase(og);
1855   - if (auto cached = m->obj_cache.find(og); cached != m->obj_cache.end()) {
1856   - // Take care of any object handles that may be floating around.
1857   - cached->second.object->assign_null();
1858   - cached->second.object->setObjGen(nullptr, QPDFObjGen());
1859   - m->obj_cache.erase(cached);
1860   - }
1861   -}
1862   -
1863   -void
1864   -QPDF::replaceReserved(QPDFObjectHandle reserved, QPDFObjectHandle replacement)
1865   -{
1866   - QTC::TC("qpdf", "QPDF replaceReserved");
1867   - auto tc = reserved.getTypeCode();
1868   - if (!(tc == ::ot_reserved || tc == ::ot_null)) {
1869   - throw std::logic_error("replaceReserved called with non-reserved object");
1870   - }
1871   - replaceObject(reserved.getObjGen(), replacement);
1872   -}
1873   -
1874   -void
1875   -QPDF::swapObjects(int objid1, int generation1, int objid2, int generation2)
1876   -{
1877   - swapObjects(QPDFObjGen(objid1, generation1), QPDFObjGen(objid2, generation2));
1878   -}
1879   -
1880   -void
1881   -QPDF::swapObjects(QPDFObjGen og1, QPDFObjGen og2)
1882   -{
1883   - // Force objects to be read from the input source if needed, then swap them in the cache.
1884   - resolve(og1);
1885   - resolve(og2);
1886   - m->obj_cache[og1].object->swapWith(m->obj_cache[og2].object);
1887   -}
1888   -
1889   -size_t
1890   -QPDF::tableSize()
1891   -{
1892   - // If obj_cache is dense, accommodate all object in tables,else accommodate only original
1893   - // objects.
1894   - auto max_xref = m->xref_table.size() ? m->xref_table.crbegin()->first.getObj() : 0;
1895   - auto max_obj = m->obj_cache.size() ? m->obj_cache.crbegin()->first.getObj() : 0;
1896   - auto max_id = std::numeric_limits<int>::max() - 1;
1897   - if (max_obj >= max_id || max_xref >= max_id) {
1898   - // Temporary fix. Long-term solution is
1899   - // - QPDFObjGen to enforce objgens are valid and sensible
1900   - // - xref table and obj cache to protect against insertion of impossibly large obj ids
1901   - stopOnError("Impossibly large object id encountered.");
1902   - }
1903   - if (max_obj < 1.1 * std::max(toI(m->obj_cache.size()), max_xref)) {
1904   - return toS(++max_obj);
1905   - }
1906   - return toS(++max_xref);
1907   -}
1908   -
1909   -std::vector<QPDFObjGen>
1910   -QPDF::getCompressibleObjVector()
1911   -{
1912   - return getCompressibleObjGens<QPDFObjGen>();
1913   -}
1914   -
1915   -std::vector<bool>
1916   -QPDF::getCompressibleObjSet()
1917   -{
1918   - return getCompressibleObjGens<bool>();
1919   -}
1920   -
1921   -template <typename T>
1922   -std::vector<T>
1923   -QPDF::getCompressibleObjGens()
1924   -{
1925   - // Return a list of objects that are allowed to be in object streams. Walk through the objects
1926   - // by traversing the document from the root, including a traversal of the pages tree. This
1927   - // makes that objects that are on the same page are more likely to be in the same object stream,
1928   - // which is slightly more efficient, particularly with linearized files. This is better than
1929   - // iterating through the xref table since it avoids preserving orphaned items.
1930   -
1931   - // Exclude encryption dictionary, if any
1932   - QPDFObjectHandle encryption_dict = m->trailer.getKey("/Encrypt");
1933   - QPDFObjGen encryption_dict_og = encryption_dict.getObjGen();
1934   -
1935   - const size_t max_obj = getObjectCount();
1936   - std::vector<bool> visited(max_obj, false);
1937   - std::vector<QPDFObjectHandle> queue;
1938   - queue.reserve(512);
1939   - queue.push_back(m->trailer);
1940   - std::vector<T> result;
1941   - if constexpr (std::is_same_v<T, QPDFObjGen>) {
1942   - result.reserve(m->obj_cache.size());
1943   - } else if constexpr (std::is_same_v<T, bool>) {
1944   - result.resize(max_obj + 1U, false);
1945   - } else {
1946   - throw std::logic_error("Unsupported type in QPDF::getCompressibleObjGens");
1947   - }
1948   - while (!queue.empty()) {
1949   - auto obj = queue.back();
1950   - queue.pop_back();
1951   - if (obj.getObjectID() > 0) {
1952   - QPDFObjGen og = obj.getObjGen();
1953   - const size_t id = toS(og.getObj() - 1);
1954   - if (id >= max_obj) {
1955   - throw std::logic_error(
1956   - "unexpected object id encountered in getCompressibleObjGens");
1957   - }
1958   - if (visited[id]) {
1959   - QTC::TC("qpdf", "QPDF loop detected traversing objects");
1960   - continue;
1961   - }
1962   -
1963   - // Check whether this is the current object. If not, remove it (which changes it into a
1964   - // direct null and therefore stops us from revisiting it) and move on to the next object
1965   - // in the queue.
1966   - auto upper = m->obj_cache.upper_bound(og);
1967   - if (upper != m->obj_cache.end() && upper->first.getObj() == og.getObj()) {
1968   - removeObject(og);
1969   - continue;
1970   - }
1971   -
1972   - visited[id] = true;
1973   -
1974   - if (og == encryption_dict_og) {
1975   - QTC::TC("qpdf", "QPDF exclude encryption dictionary");
1976   - } else if (!(obj.isStream() ||
1977   - (obj.isDictionaryOfType("/Sig") && obj.hasKey("/ByteRange") &&
1978   - obj.hasKey("/Contents")))) {
1979   - if constexpr (std::is_same_v<T, QPDFObjGen>) {
1980   - result.push_back(og);
1981   - } else if constexpr (std::is_same_v<T, bool>) {
1982   - result[id + 1U] = true;
1983   - }
1984   - }
1985   - }
1986   - if (obj.isStream()) {
1987   - auto dict = obj.getDict().as_dictionary();
1988   - auto end = dict.crend();
1989   - for (auto iter = dict.crbegin(); iter != end; ++iter) {
1990   - std::string const& key = iter->first;
1991   - QPDFObjectHandle const& value = iter->second;
1992   - if (!value.null()) {
1993   - if (key == "/Length") {
1994   - // omit stream lengths
1995   - if (value.isIndirect()) {
1996   - QTC::TC("qpdf", "QPDF exclude indirect length");
1997   - }
1998   - } else {
1999   - queue.emplace_back(value);
2000   - }
2001   - }
2002   - }
2003   - } else if (obj.isDictionary()) {
2004   - auto dict = obj.as_dictionary();
2005   - auto end = dict.crend();
2006   - for (auto iter = dict.crbegin(); iter != end; ++iter) {
2007   - if (!iter->second.null()) {
2008   - queue.emplace_back(iter->second);
2009   - }
2010   - }
2011   - } else if (auto items = obj.as_array()) {
2012   - queue.insert(queue.end(), items.crbegin(), items.crend());
2013   - }
2014   - }
2015   -
2016   - return result;
2017   -}
manual/release-notes.rst
... ... @@ -21,15 +21,16 @@ more detail.
21 21 integer object. Previously the method returned false if the first
22 22 dictionary object was not a linearization parameter dictionary.
23 23  
24   - - Other enhancements
  24 +.. _r12-0-0:
  25 +
  26 +12.0.1: not yet released
  27 + - Other enhancements
25 28  
26   - - There have been further enhancements to how files with damaged xref
27   - tables are recovered.
  29 + - There have been further enhancements to how files with damaged xref
  30 + tables are recovered.
28 31  
29 32 .. cSpell:ignore substract
30 33  
31   -.. _r12-0-0:
32   -
33 34 12.0.0: March 9, 2025
34 35 - API breaking changes
35 36  
... ...