Commit bb045907a043b5c6de9fb804ff11087333747329

Authored by m-holger
1 parent 9740930b

Split QPDF.cc into QPDF.cc and QPDF_objects.cc

Move methods responsible for loading or keeping track of objects to
QPDF_objects.cc.

The split was part of the reverted #1297. Reintroducing it now makes it
easier/safer to keep the work to refactor the xref and object tables in
sync with main.
libqpdf/CMakeLists.txt
... ... @@ -95,6 +95,7 @@ set(libqpdf_SOURCES
95 95 QPDF_encryption.cc
96 96 QPDF_json.cc
97 97 QPDF_linearization.cc
  98 + QPDF_objects.cc
98 99 QPDF_optimization.cc
99 100 QPDF_pages.cc
100 101 QTC.cc
... ...
libqpdf/QPDF.cc
... ... @@ -413,1726 +413,26 @@ QPDF::findHeader()
413 413 return valid;
414 414 }
415 415  
416   -bool
417   -QPDF::findStartxref()
418   -{
419   - if (readToken(*m->file).isWord("startxref") && readToken(*m->file).isInteger()) {
420   - // Position in front of offset token
421   - m->file->seek(m->file->getLastOffset(), SEEK_SET);
422   - return true;
423   - }
424   - return false;
425   -}
426   -
427   -void
428   -QPDF::parse(char const* password)
429   -{
430   - if (password) {
431   - m->encp->provided_password = password;
432   - }
433   -
434   - // Find the header anywhere in the first 1024 bytes of the file.
435   - PatternFinder hf(*this, &QPDF::findHeader);
436   - if (!m->file->findFirst("%PDF-", 0, 1024, hf)) {
437   - QTC::TC("qpdf", "QPDF not a pdf file");
438   - warn(damagedPDF("", 0, "can't find PDF header"));
439   - // QPDFWriter writes files that usually require at least version 1.2 for /FlateDecode
440   - m->pdf_version = "1.2";
441   - }
442   -
443   - // PDF spec says %%EOF must be found within the last 1024 bytes of the file. We add an extra
444   - // 30 characters to leave room for the startxref stuff.
445   - m->file->seek(0, SEEK_END);
446   - qpdf_offset_t end_offset = m->file->tell();
447   - m->xref_table_max_offset = end_offset;
448   - // Sanity check on object ids. All objects must appear in xref table / stream. In all realistic
449   - // scenarios at least 3 bytes are required.
450   - if (m->xref_table_max_id > m->xref_table_max_offset / 3) {
451   - m->xref_table_max_id = static_cast<int>(m->xref_table_max_offset / 3);
452   - }
453   - qpdf_offset_t start_offset = (end_offset > 1054 ? end_offset - 1054 : 0);
454   - PatternFinder sf(*this, &QPDF::findStartxref);
455   - qpdf_offset_t xref_offset = 0;
456   - if (m->file->findLast("startxref", start_offset, 0, sf)) {
457   - xref_offset = QUtil::string_to_ll(readToken(*m->file).getValue().c_str());
458   - }
459   -
460   - try {
461   - if (xref_offset == 0) {
462   - QTC::TC("qpdf", "QPDF can't find startxref");
463   - throw damagedPDF("", 0, "can't find startxref");
464   - }
465   - try {
466   - read_xref(xref_offset);
467   - } catch (QPDFExc&) {
468   - throw;
469   - } catch (std::exception& e) {
470   - throw damagedPDF("", 0, std::string("error reading xref: ") + e.what());
471   - }
472   - } catch (QPDFExc& e) {
473   - if (m->attempt_recovery) {
474   - reconstruct_xref(e, xref_offset > 0);
475   - QTC::TC("qpdf", "QPDF reconstructed xref table");
476   - } else {
477   - throw;
478   - }
479   - }
480   -
481   - initializeEncryption();
482   - m->parsed = true;
483   - if (m->xref_table.size() > 0 && !getRoot().getKey("/Pages").isDictionary()) {
484   - // QPDFs created from JSON have an empty xref table and no root object yet.
485   - throw damagedPDF("", 0, "unable to find page tree");
486   - }
487   -}
488   -
489   -void
490   -QPDF::inParse(bool v)
491   -{
492   - if (m->in_parse == v) {
493   - // This happens if QPDFParser::parse tries to resolve an indirect object while it is
494   - // parsing.
495   - throw std::logic_error(
496   - "QPDF: re-entrant parsing detected. This is a qpdf bug."
497   - " Please report at https://github.com/qpdf/qpdf/issues.");
498   - }
499   - m->in_parse = v;
500   -}
501   -
502   -void
503   -QPDF::warn(QPDFExc const& e)
504   -{
505   - if (m->max_warnings > 0 && m->warnings.size() >= m->max_warnings) {
506   - stopOnError("Too many warnings - file is too badly damaged");
507   - }
508   - m->warnings.push_back(e);
509   - if (!m->suppress_warnings) {
510   - *m->log->getWarn() << "WARNING: " << m->warnings.back().what() << "\n";
511   - }
512   -}
513   -
514   -void
515   -QPDF::warn(
516   - qpdf_error_code_e error_code,
517   - std::string const& object,
518   - qpdf_offset_t offset,
519   - std::string const& message)
520   -{
521   - warn(QPDFExc(error_code, getFilename(), object, offset, message));
522   -}
523   -
524   -void
525   -QPDF::setTrailer(QPDFObjectHandle obj)
526   -{
527   - if (m->trailer) {
528   - return;
529   - }
530   - m->trailer = obj;
531   -}
532   -
533   -void
534   -QPDF::reconstruct_xref(QPDFExc& e, bool found_startxref)
535   -{
536   - if (m->reconstructed_xref) {
537   - // Avoid xref reconstruction infinite loops. This is getting very hard to reproduce because
538   - // qpdf is throwing many fewer exceptions while parsing. Most situations are warnings now.
539   - throw e;
540   - }
541   -
542   - // If recovery generates more than 1000 warnings, the file is so severely damaged that there
543   - // probably is no point trying to continue.
544   - const auto max_warnings = m->warnings.size() + 1000U;
545   - auto check_warnings = [this, max_warnings]() {
546   - if (m->warnings.size() > max_warnings) {
547   - throw damagedPDF("", 0, "too many errors while reconstructing cross-reference table");
548   - }
549   - };
550   -
551   - m->reconstructed_xref = true;
552   - // We may find more objects, which may contain dangling references.
553   - m->fixed_dangling_refs = false;
554   -
555   - warn(damagedPDF("", 0, "file is damaged"));
556   - warn(e);
557   - warn(damagedPDF("", 0, "Attempting to reconstruct cross-reference table"));
558   -
559   - // Delete all references to type 1 (uncompressed) objects
560   - std::vector<QPDFObjGen> to_delete;
561   - for (auto const& iter: m->xref_table) {
562   - if (iter.second.getType() == 1) {
563   - to_delete.emplace_back(iter.first);
564   - }
565   - }
566   - for (auto const& iter: to_delete) {
567   - m->xref_table.erase(iter);
568   - }
569   -
570   - std::vector<std::tuple<int, int, qpdf_offset_t>> found_objects;
571   - std::vector<qpdf_offset_t> trailers;
572   - std::vector<qpdf_offset_t> startxrefs;
573   -
574   - m->file->seek(0, SEEK_END);
575   - qpdf_offset_t eof = m->file->tell();
576   - m->file->seek(0, SEEK_SET);
577   - // Don't allow very long tokens here during recovery. All the interesting tokens are covered.
578   - static size_t const MAX_LEN = 10;
579   - while (m->file->tell() < eof) {
580   - QPDFTokenizer::Token t1 = readToken(*m->file, MAX_LEN);
581   - qpdf_offset_t token_start = m->file->tell() - toO(t1.getValue().length());
582   - if (t1.isInteger()) {
583   - auto pos = m->file->tell();
584   - auto t2 = readToken(*m->file, MAX_LEN);
585   - if (t2.isInteger() && readToken(*m->file, MAX_LEN).isWord("obj")) {
586   - int obj = QUtil::string_to_int(t1.getValue().c_str());
587   - int gen = QUtil::string_to_int(t2.getValue().c_str());
588   - if (obj <= m->xref_table_max_id) {
589   - found_objects.emplace_back(obj, gen, token_start);
590   - } else {
591   - warn(damagedPDF(
592   - "", 0, "ignoring object with impossibly large id " + std::to_string(obj)));
593   - }
594   - }
595   - m->file->seek(pos, SEEK_SET);
596   - } else if (!m->trailer && t1.isWord("trailer")) {
597   - trailers.emplace_back(m->file->tell());
598   - } else if (!found_startxref && t1.isWord("startxref")) {
599   - startxrefs.emplace_back(m->file->tell());
600   - }
601   - check_warnings();
602   - m->file->findAndSkipNextEOL();
603   - }
604   -
605   - if (!found_startxref && !startxrefs.empty() && !found_objects.empty() &&
606   - startxrefs.back() > std::get<2>(found_objects.back())) {
607   - try {
608   - m->file->seek(startxrefs.back(), SEEK_SET);
609   - if (auto offset = QUtil::string_to_ll(readToken(*m->file).getValue().data())) {
610   - read_xref(offset);
611   - if (getRoot().getKey("/Pages").isDictionary()) {
612   - QTC::TC("qpdf", "QPDF startxref more than 1024 before end");
613   - warn(
614   - damagedPDF("", 0, "startxref was more than 1024 bytes before end of file"));
615   - initializeEncryption();
616   - m->parsed = true;
617   - m->reconstructed_xref = false;
618   - return;
619   - }
620   - }
621   - } catch (...) {
622   - // ok, bad luck. Do recovery.
623   - }
624   - }
625   -
626   - auto rend = found_objects.rend();
627   - for (auto it = found_objects.rbegin(); it != rend; it++) {
628   - auto [obj, gen, token_start] = *it;
629   - insertXrefEntry(obj, 1, token_start, gen);
630   - check_warnings();
631   - }
632   - m->deleted_objects.clear();
633   -
634   - for (auto it = trailers.rbegin(); it != trailers.rend(); it++) {
635   - m->file->seek(*it, SEEK_SET);
636   - auto t = readTrailer();
637   - if (!t.isDictionary()) {
638   - // Oh well. It was worth a try.
639   - } else {
640   - if (t.hasKey("/Root")) {
641   - m->trailer = t;
642   - break;
643   - }
644   - warn(damagedPDF("trailer", *it, "recovered trailer has no /Root entry"));
645   - }
646   - check_warnings();
647   - }
648   -
649   - if (!m->trailer) {
650   - qpdf_offset_t max_offset{0};
651   - size_t max_size{0};
652   - // If there are any xref streams, take the last one to appear.
653   - for (auto const& iter: m->xref_table) {
654   - auto entry = iter.second;
655   - if (entry.getType() != 1) {
656   - continue;
657   - }
658   - auto oh = getObject(iter.first);
659   - try {
660   - if (!oh.isStreamOfType("/XRef")) {
661   - continue;
662   - }
663   - } catch (std::exception&) {
664   - continue;
665   - }
666   - auto offset = entry.getOffset();
667   - auto size = oh.getDict().getKey("/Size").getUIntValueAsUInt();
668   - if (size > max_size || (size == max_size && offset > max_offset)) {
669   - max_offset = offset;
670   - setTrailer(oh.getDict());
671   - }
672   - check_warnings();
673   - }
674   - if (max_offset > 0) {
675   - try {
676   - read_xref(max_offset);
677   - } catch (std::exception&) {
678   - warn(damagedPDF(
679   - "", 0, "error decoding candidate xref stream while recovering damaged file"));
680   - }
681   - QTC::TC("qpdf", "QPDF recover xref stream");
682   - }
683   - }
684   -
685   - if (!m->trailer || (!m->parsed && !m->trailer.getKey("/Root").isDictionary())) {
686   - // Try to find a Root dictionary. As a quick fix try the one with the highest object id.
687   - QPDFObjectHandle root;
688   - for (auto const& iter: m->obj_cache) {
689   - try {
690   - if (QPDFObjectHandle(iter.second.object).isDictionaryOfType("/Catalog")) {
691   - root = iter.second.object;
692   - }
693   - } catch (std::exception&) {
694   - continue;
695   - }
696   - }
697   - if (root) {
698   - if (!m->trailer) {
699   - warn(damagedPDF(
700   - "", 0, "unable to find trailer dictionary while recovering damaged file"));
701   - m->trailer = QPDFObjectHandle::newDictionary();
702   - }
703   - m->trailer.replaceKey("/Root", root);
704   - }
705   - }
706   -
707   - if (!m->trailer) {
708   - // We could check the last encountered object to see if it was an xref stream. If so, we
709   - // could try to get the trailer from there. This may make it possible to recover files with
710   - // bad startxref pointers even when they have object streams.
711   -
712   - throw damagedPDF("", 0, "unable to find trailer dictionary while recovering damaged file");
713   - }
714   - if (m->xref_table.empty()) {
715   - // We cannot check for an empty xref table in parse because empty tables are valid when
716   - // creating QPDF objects from JSON.
717   - throw damagedPDF("", 0, "unable to find objects while recovering damaged file");
718   - }
719   - check_warnings();
720   - if (!m->parsed) {
721   - m->parsed = true;
722   - getAllPages();
723   - check_warnings();
724   - if (m->all_pages.empty()) {
725   - m->parsed = false;
726   - throw damagedPDF("", 0, "unable to find any pages while recovering damaged file");
727   - }
728   - }
729   - // We could iterate through the objects looking for streams and try to find objects inside of
730   - // them, but it's probably not worth the trouble. Acrobat can't recover files with any errors
731   - // in an xref stream, and this would be a real long shot anyway. If we wanted to do anything
732   - // that involved looking at stream contents, we'd also have to call initializeEncryption() here.
733   - // It's safe to call it more than once.
734   -}
735   -
736   -void
737   -QPDF::read_xref(qpdf_offset_t xref_offset)
738   -{
739   - std::map<int, int> free_table;
740   - std::set<qpdf_offset_t> visited;
741   - while (xref_offset) {
742   - visited.insert(xref_offset);
743   - char buf[7];
744   - memset(buf, 0, sizeof(buf));
745   - m->file->seek(xref_offset, SEEK_SET);
746   - // Some files miss the mark a little with startxref. We could do a better job of searching
747   - // in the neighborhood for something that looks like either an xref table or stream, but the
748   - // simple heuristic of skipping whitespace can help with the xref table case and is harmless
749   - // with the stream case.
750   - bool done = false;
751   - bool skipped_space = false;
752   - while (!done) {
753   - char ch;
754   - if (1 == m->file->read(&ch, 1)) {
755   - if (util::is_space(ch)) {
756   - skipped_space = true;
757   - } else {
758   - m->file->unreadCh(ch);
759   - done = true;
760   - }
761   - } else {
762   - QTC::TC("qpdf", "QPDF eof skipping spaces before xref", skipped_space ? 0 : 1);
763   - done = true;
764   - }
765   - }
766   -
767   - m->file->read(buf, sizeof(buf) - 1);
768   - // The PDF spec says xref must be followed by a line terminator, but files exist in the wild
769   - // where it is terminated by arbitrary whitespace.
770   - if ((strncmp(buf, "xref", 4) == 0) && util::is_space(buf[4])) {
771   - if (skipped_space) {
772   - QTC::TC("qpdf", "QPDF xref skipped space");
773   - warn(damagedPDF("", 0, "extraneous whitespace seen before xref"));
774   - }
775   - QTC::TC(
776   - "qpdf",
777   - "QPDF xref space",
778   - ((buf[4] == '\n') ? 0
779   - : (buf[4] == '\r') ? 1
780   - : (buf[4] == ' ') ? 2
781   - : 9999));
782   - int skip = 4;
783   - // buf is null-terminated, and util::is_space('\0') is false, so this won't overrun.
784   - while (util::is_space(buf[skip])) {
785   - ++skip;
786   - }
787   - xref_offset = read_xrefTable(xref_offset + skip);
788   - } else {
789   - xref_offset = read_xrefStream(xref_offset);
790   - }
791   - if (visited.count(xref_offset) != 0) {
792   - QTC::TC("qpdf", "QPDF xref loop");
793   - throw damagedPDF("", 0, "loop detected following xref tables");
794   - }
795   - }
796   -
797   - if (!m->trailer) {
798   - throw damagedPDF("", 0, "unable to find trailer while reading xref");
799   - }
800   - int size = m->trailer.getKey("/Size").getIntValueAsInt();
801   - int max_obj = 0;
802   - if (!m->xref_table.empty()) {
803   - max_obj = m->xref_table.rbegin()->first.getObj();
804   - }
805   - if (!m->deleted_objects.empty()) {
806   - max_obj = std::max(max_obj, *(m->deleted_objects.rbegin()));
807   - }
808   - if ((size < 1) || (size - 1 != max_obj)) {
809   - QTC::TC("qpdf", "QPDF xref size mismatch");
810   - warn(damagedPDF(
811   - "",
812   - 0,
813   - ("reported number of objects (" + std::to_string(size) +
814   - ") is not one plus the highest object number (" + std::to_string(max_obj) + ")")));
815   - }
816   -
817   - // We no longer need the deleted_objects table, so go ahead and clear it out to make sure we
818   - // never depend on its being set.
819   - m->deleted_objects.clear();
820   -
821   - // Make sure we keep only the highest generation for any object.
822   - QPDFObjGen last_og{-1, 0};
823   - for (auto const& item: m->xref_table) {
824   - auto id = item.first.getObj();
825   - if (id == last_og.getObj() && id > 0) {
826   - removeObject(last_og);
827   - }
828   - last_og = item.first;
829   - }
830   -}
831   -
832   -bool
833   -QPDF::parse_xrefFirst(std::string const& line, int& obj, int& num, int& bytes)
834   -{
835   - // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated
836   - // buffer.
837   - char const* p = line.c_str();
838   - char const* start = line.c_str();
839   -
840   - // Skip zero or more spaces
841   - while (util::is_space(*p)) {
842   - ++p;
843   - }
844   - // Require digit
845   - if (!util::is_digit(*p)) {
846   - return false;
847   - }
848   - // Gather digits
849   - std::string obj_str;
850   - while (util::is_digit(*p)) {
851   - obj_str.append(1, *p++);
852   - }
853   - // Require space
854   - if (!util::is_space(*p)) {
855   - return false;
856   - }
857   - // Skip spaces
858   - while (util::is_space(*p)) {
859   - ++p;
860   - }
861   - // Require digit
862   - if (!util::is_digit(*p)) {
863   - return false;
864   - }
865   - // Gather digits
866   - std::string num_str;
867   - while (util::is_digit(*p)) {
868   - num_str.append(1, *p++);
869   - }
870   - // Skip any space including line terminators
871   - while (util::is_space(*p)) {
872   - ++p;
873   - }
874   - bytes = toI(p - start);
875   - obj = QUtil::string_to_int(obj_str.c_str());
876   - num = QUtil::string_to_int(num_str.c_str());
877   - return true;
878   -}
879   -
880   -bool
881   -QPDF::read_bad_xrefEntry(qpdf_offset_t& f1, int& f2, char& type)
882   -{
883   - // Reposition after initial read attempt and reread.
884   - m->file->seek(m->file->getLastOffset(), SEEK_SET);
885   - auto line = m->file->readLine(30);
886   -
887   - // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated
888   - // buffer.
889   - char const* p = line.data();
890   -
891   - // Skip zero or more spaces. There aren't supposed to be any.
892   - bool invalid = false;
893   - while (util::is_space(*p)) {
894   - ++p;
895   - QTC::TC("qpdf", "QPDF ignore first space in xref entry");
896   - invalid = true;
897   - }
898   - // Require digit
899   - if (!util::is_digit(*p)) {
900   - return false;
901   - }
902   - // Gather digits
903   - std::string f1_str;
904   - while (util::is_digit(*p)) {
905   - f1_str.append(1, *p++);
906   - }
907   - // Require space
908   - if (!util::is_space(*p)) {
909   - return false;
910   - }
911   - if (util::is_space(*(p + 1))) {
912   - QTC::TC("qpdf", "QPDF ignore first extra space in xref entry");
913   - invalid = true;
914   - }
915   - // Skip spaces
916   - while (util::is_space(*p)) {
917   - ++p;
918   - }
919   - // Require digit
920   - if (!util::is_digit(*p)) {
921   - return false;
922   - }
923   - // Gather digits
924   - std::string f2_str;
925   - while (util::is_digit(*p)) {
926   - f2_str.append(1, *p++);
927   - }
928   - // Require space
929   - if (!util::is_space(*p)) {
930   - return false;
931   - }
932   - if (util::is_space(*(p + 1))) {
933   - QTC::TC("qpdf", "QPDF ignore second extra space in xref entry");
934   - invalid = true;
935   - }
936   - // Skip spaces
937   - while (util::is_space(*p)) {
938   - ++p;
939   - }
940   - if ((*p == 'f') || (*p == 'n')) {
941   - type = *p;
942   - } else {
943   - return false;
944   - }
945   - if ((f1_str.length() != 10) || (f2_str.length() != 5)) {
946   - QTC::TC("qpdf", "QPDF ignore length error xref entry");
947   - invalid = true;
948   - }
949   -
950   - if (invalid) {
951   - warn(damagedPDF("xref table", "accepting invalid xref table entry"));
952   - }
953   -
954   - f1 = QUtil::string_to_ll(f1_str.c_str());
955   - f2 = QUtil::string_to_int(f2_str.c_str());
956   -
957   - return true;
958   -}
959   -
960   -// Optimistically read and parse xref entry. If entry is bad, call read_bad_xrefEntry and return
961   -// result.
962   -bool
963   -QPDF::read_xrefEntry(qpdf_offset_t& f1, int& f2, char& type)
964   -{
965   - std::array<char, 21> line;
966   - if (m->file->read(line.data(), 20) != 20) {
967   - // C++20: [[unlikely]]
968   - return false;
969   - }
970   - line[20] = '\0';
971   - char const* p = line.data();
972   -
973   - int f1_len = 0;
974   - int f2_len = 0;
975   -
976   - // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated
977   - // buffer.
978   -
979   - // Gather f1 digits. NB No risk of overflow as 9'999'999'999 < max long long.
980   - while (*p == '0') {
981   - ++f1_len;
982   - ++p;
983   - }
984   - while (util::is_digit(*p) && f1_len++ < 10) {
985   - f1 *= 10;
986   - f1 += *p++ - '0';
987   - }
988   - // Require space
989   - if (!util::is_space(*p++)) {
990   - // Entry doesn't start with space or digit.
991   - // C++20: [[unlikely]]
992   - return false;
993   - }
994   - // Gather digits. NB No risk of overflow as 99'999 < max int.
995   - while (*p == '0') {
996   - ++f2_len;
997   - ++p;
998   - }
999   - while (util::is_digit(*p) && f2_len++ < 5) {
1000   - f2 *= 10;
1001   - f2 += static_cast<int>(*p++ - '0');
1002   - }
1003   - if (util::is_space(*p++) && (*p == 'f' || *p == 'n')) {
1004   - // C++20: [[likely]]
1005   - type = *p;
1006   - // No test for valid line[19].
1007   - if (*(++p) && *(++p) && (*p == '\n' || *p == '\r') && f1_len == 10 && f2_len == 5) {
1008   - // C++20: [[likely]]
1009   - return true;
1010   - }
1011   - }
1012   - return read_bad_xrefEntry(f1, f2, type);
1013   -}
1014   -
1015   -// Read a single cross-reference table section and associated trailer.
1016   -qpdf_offset_t
1017   -QPDF::read_xrefTable(qpdf_offset_t xref_offset)
1018   -{
1019   - m->file->seek(xref_offset, SEEK_SET);
1020   - std::string line;
1021   - while (true) {
1022   - line.assign(50, '\0');
1023   - m->file->read(line.data(), line.size());
1024   - int obj = 0;
1025   - int num = 0;
1026   - int bytes = 0;
1027   - if (!parse_xrefFirst(line, obj, num, bytes)) {
1028   - QTC::TC("qpdf", "QPDF invalid xref");
1029   - throw damagedPDF("xref table", "xref syntax invalid");
1030   - }
1031   - m->file->seek(m->file->getLastOffset() + bytes, SEEK_SET);
1032   - for (qpdf_offset_t i = obj; i - num < obj; ++i) {
1033   - if (i == 0) {
1034   - // This is needed by checkLinearization()
1035   - m->first_xref_item_offset = m->file->tell();
1036   - }
1037   - // For xref_table, these will always be small enough to be ints
1038   - qpdf_offset_t f1 = 0;
1039   - int f2 = 0;
1040   - char type = '\0';
1041   - if (!read_xrefEntry(f1, f2, type)) {
1042   - QTC::TC("qpdf", "QPDF invalid xref entry");
1043   - throw damagedPDF(
1044   - "xref table", "invalid xref entry (obj=" + std::to_string(i) + ")");
1045   - }
1046   - if (type == 'f') {
1047   - insertFreeXrefEntry(QPDFObjGen(toI(i), f2));
1048   - } else {
1049   - insertXrefEntry(toI(i), 1, f1, f2);
1050   - }
1051   - }
1052   - qpdf_offset_t pos = m->file->tell();
1053   - if (readToken(*m->file).isWord("trailer")) {
1054   - break;
1055   - } else {
1056   - m->file->seek(pos, SEEK_SET);
1057   - }
1058   - }
1059   -
1060   - // Set offset to previous xref table if any
1061   - QPDFObjectHandle cur_trailer = readTrailer();
1062   - if (!cur_trailer.isDictionary()) {
1063   - QTC::TC("qpdf", "QPDF missing trailer");
1064   - throw damagedPDF("", "expected trailer dictionary");
1065   - }
1066   -
1067   - if (!m->trailer) {
1068   - setTrailer(cur_trailer);
1069   -
1070   - if (!m->trailer.hasKey("/Size")) {
1071   - QTC::TC("qpdf", "QPDF trailer lacks size");
1072   - throw damagedPDF("trailer", "trailer dictionary lacks /Size key");
1073   - }
1074   - if (!m->trailer.getKey("/Size").isInteger()) {
1075   - QTC::TC("qpdf", "QPDF trailer size not integer");
1076   - throw damagedPDF("trailer", "/Size key in trailer dictionary is not an integer");
1077   - }
1078   - }
1079   -
1080   - if (cur_trailer.hasKey("/XRefStm")) {
1081   - if (m->ignore_xref_streams) {
1082   - QTC::TC("qpdf", "QPDF ignoring XRefStm in trailer");
1083   - } else {
1084   - if (cur_trailer.getKey("/XRefStm").isInteger()) {
1085   - // Read the xref stream but disregard any return value -- we'll use our trailer's
1086   - // /Prev key instead of the xref stream's.
1087   - (void)read_xrefStream(cur_trailer.getKey("/XRefStm").getIntValue());
1088   - } else {
1089   - throw damagedPDF("xref stream", xref_offset, "invalid /XRefStm");
1090   - }
1091   - }
1092   - }
1093   -
1094   - if (cur_trailer.hasKey("/Prev")) {
1095   - if (!cur_trailer.getKey("/Prev").isInteger()) {
1096   - QTC::TC("qpdf", "QPDF trailer prev not integer");
1097   - throw damagedPDF("trailer", "/Prev key in trailer dictionary is not an integer");
1098   - }
1099   - QTC::TC("qpdf", "QPDF prev key in trailer dictionary");
1100   - return cur_trailer.getKey("/Prev").getIntValue();
1101   - }
1102   -
1103   - return 0;
1104   -}
1105   -
1106   -// Read a single cross-reference stream.
1107   -qpdf_offset_t
1108   -QPDF::read_xrefStream(qpdf_offset_t xref_offset)
1109   -{
1110   - if (!m->ignore_xref_streams) {
1111   - QPDFObjGen x_og;
1112   - QPDFObjectHandle xref_obj;
1113   - try {
1114   - xref_obj =
1115   - readObjectAtOffset(false, xref_offset, "xref stream", QPDFObjGen(0, 0), x_og, true);
1116   - } catch (QPDFExc&) {
1117   - // ignore -- report error below
1118   - }
1119   - if (xref_obj.isStreamOfType("/XRef")) {
1120   - QTC::TC("qpdf", "QPDF found xref stream");
1121   - return processXRefStream(xref_offset, xref_obj);
1122   - }
1123   - }
1124   -
1125   - QTC::TC("qpdf", "QPDF can't find xref");
1126   - throw damagedPDF("", xref_offset, "xref not found");
1127   - return 0; // unreachable
1128   -}
1129   -
1130   -// Return the entry size of the xref stream and the processed W array.
1131   -std::pair<int, std::array<int, 3>>
1132   -QPDF::processXRefW(QPDFObjectHandle& dict, std::function<QPDFExc(std::string_view)> damaged)
1133   -{
1134   - auto W_obj = dict.getKey("/W");
1135   - if (!(W_obj.isArray() && (W_obj.getArrayNItems() >= 3) && W_obj.getArrayItem(0).isInteger() &&
1136   - W_obj.getArrayItem(1).isInteger() && W_obj.getArrayItem(2).isInteger())) {
1137   - throw damaged("Cross-reference stream does not have a proper /W key");
1138   - }
1139   -
1140   - std::array<int, 3> W;
1141   - int entry_size = 0;
1142   - auto w_vector = W_obj.getArrayAsVector();
1143   - int max_bytes = sizeof(qpdf_offset_t);
1144   - for (size_t i = 0; i < 3; ++i) {
1145   - W[i] = w_vector[i].getIntValueAsInt();
1146   - if (W[i] > max_bytes) {
1147   - throw damaged("Cross-reference stream's /W contains impossibly large values");
1148   - }
1149   - if (W[i] < 0) {
1150   - throw damaged("Cross-reference stream's /W contains negative values");
1151   - }
1152   - entry_size += W[i];
1153   - }
1154   - if (entry_size == 0) {
1155   - throw damaged("Cross-reference stream's /W indicates entry size of 0");
1156   - }
1157   - return {entry_size, W};
1158   -}
1159   -
1160   -// Validate Size key and return the maximum number of entries that the xref stream can contain.
1161   -int
1162   -QPDF::processXRefSize(
1163   - QPDFObjectHandle& dict, int entry_size, std::function<QPDFExc(std::string_view)> damaged)
1164   -{
1165   - // Number of entries is limited by the highest possible object id and stream size.
1166   - auto max_num_entries = std::numeric_limits<int>::max();
1167   - if (max_num_entries > (std::numeric_limits<qpdf_offset_t>::max() / entry_size)) {
1168   - max_num_entries = toI(std::numeric_limits<qpdf_offset_t>::max() / entry_size);
1169   - }
1170   -
1171   - auto Size_obj = dict.getKey("/Size");
1172   - long long size;
1173   - if (!dict.getKey("/Size").getValueAsInt(size)) {
1174   - throw damaged("Cross-reference stream does not have a proper /Size key");
1175   - } else if (size < 0) {
1176   - throw damaged("Cross-reference stream has a negative /Size key");
1177   - } else if (size >= max_num_entries) {
1178   - throw damaged("Cross-reference stream has an impossibly large /Size key");
1179   - }
1180   - // We are not validating that Size <= (Size key of parent xref / trailer).
1181   - return max_num_entries;
1182   -}
1183   -
1184   -// Return the number of entries of the xref stream and the processed Index array.
1185   -std::pair<int, std::vector<std::pair<int, int>>>
1186   -QPDF::processXRefIndex(
1187   - QPDFObjectHandle& dict, int max_num_entries, std::function<QPDFExc(std::string_view)> damaged)
1188   -{
1189   - auto size = dict.getKey("/Size").getIntValueAsInt();
1190   - auto Index_obj = dict.getKey("/Index");
1191   -
1192   - if (Index_obj.isArray()) {
1193   - std::vector<std::pair<int, int>> indx;
1194   - int num_entries = 0;
1195   - auto index_vec = Index_obj.getArrayAsVector();
1196   - if ((index_vec.size() % 2) || index_vec.size() < 2) {
1197   - throw damaged("Cross-reference stream's /Index has an invalid number of values");
1198   - }
1199   -
1200   - int i = 0;
1201   - long long first = 0;
1202   - for (auto& val: index_vec) {
1203   - if (val.isInteger()) {
1204   - if (i % 2) {
1205   - auto count = val.getIntValue();
1206   - if (count <= 0) {
1207   - throw damaged(
1208   - "Cross-reference stream section claims to contain " +
1209   - std::to_string(count) + " entries");
1210   - }
1211   - // We are guarding against the possibility of num_entries * entry_size
1212   - // overflowing. We are not checking that entries are in ascending order as
1213   - // required by the spec, which probably should generate a warning. We are also
1214   - // not checking that for each subsection first object number + number of entries
1215   - // <= /Size. The spec requires us to ignore object number > /Size.
1216   - if (first > (max_num_entries - count) ||
1217   - count > (max_num_entries - num_entries)) {
1218   - throw damaged(
1219   - "Cross-reference stream claims to contain too many entries: " +
1220   - std::to_string(first) + " " + std::to_string(max_num_entries) + " " +
1221   - std::to_string(num_entries));
1222   - }
1223   - indx.emplace_back(static_cast<int>(first), static_cast<int>(count));
1224   - num_entries += static_cast<int>(count);
1225   - } else {
1226   - first = val.getIntValue();
1227   - if (first < 0) {
1228   - throw damaged(
1229   - "Cross-reference stream's /Index contains a negative object id");
1230   - } else if (first > max_num_entries) {
1231   - throw damaged(
1232   - "Cross-reference stream's /Index contains an impossibly "
1233   - "large object id");
1234   - }
1235   - }
1236   - } else {
1237   - throw damaged(
1238   - "Cross-reference stream's /Index's item " + std::to_string(i) +
1239   - " is not an integer");
1240   - }
1241   - i++;
1242   - }
1243   - QTC::TC("qpdf", "QPDF xref /Index is array", index_vec.size() == 2 ? 0 : 1);
1244   - return {num_entries, indx};
1245   - } else if (Index_obj.isNull()) {
1246   - QTC::TC("qpdf", "QPDF xref /Index is null");
1247   - return {size, {{0, size}}};
1248   - } else {
1249   - throw damaged("Cross-reference stream does not have a proper /Index key");
1250   - }
1251   -}
1252   -
1253   -qpdf_offset_t
1254   -QPDF::processXRefStream(qpdf_offset_t xref_offset, QPDFObjectHandle& xref_obj)
1255   -{
1256   - auto damaged = [this, xref_offset](std::string_view msg) -> QPDFExc {
1257   - return damagedPDF("xref stream", xref_offset, msg.data());
1258   - };
1259   -
1260   - auto dict = xref_obj.getDict();
1261   -
1262   - auto [entry_size, W] = processXRefW(dict, damaged);
1263   - int max_num_entries = processXRefSize(dict, entry_size, damaged);
1264   - auto [num_entries, indx] = processXRefIndex(dict, max_num_entries, damaged);
1265   -
1266   - std::shared_ptr<Buffer> bp = xref_obj.getStreamData(qpdf_dl_specialized);
1267   - size_t actual_size = bp->getSize();
1268   - auto expected_size = toS(entry_size) * toS(num_entries);
1269   -
1270   - if (expected_size != actual_size) {
1271   - QPDFExc x = damaged(
1272   - "Cross-reference stream data has the wrong size; expected = " +
1273   - std::to_string(expected_size) + "; actual = " + std::to_string(actual_size));
1274   - if (expected_size > actual_size) {
1275   - throw x;
1276   - } else {
1277   - warn(x);
1278   - }
1279   - }
1280   -
1281   - bool saw_first_compressed_object = false;
1282   -
1283   - // Actual size vs. expected size check above ensures that we will not overflow any buffers here.
1284   - // We know that entry_size * num_entries is less or equal to the size of the buffer.
1285   - auto p = bp->getBuffer();
1286   - for (auto [obj, sec_entries]: indx) {
1287   - // Process a subsection.
1288   - for (int i = 0; i < sec_entries; ++i) {
1289   - // Read this entry
1290   - std::array<qpdf_offset_t, 3> fields{};
1291   - if (W[0] == 0) {
1292   - QTC::TC("qpdf", "QPDF default for xref stream field 0");
1293   - fields[0] = 1;
1294   - }
1295   - for (size_t j = 0; j < 3; ++j) {
1296   - for (int k = 0; k < W[j]; ++k) {
1297   - fields[j] <<= 8;
1298   - fields[j] |= *p++;
1299   - }
1300   - }
1301   -
1302   - // Get the generation number. The generation number is 0 unless this is an uncompressed
1303   - // object record, in which case the generation number appears as the third field.
1304   - if (saw_first_compressed_object) {
1305   - if (fields[0] != 2) {
1306   - m->uncompressed_after_compressed = true;
1307   - }
1308   - } else if (fields[0] == 2) {
1309   - saw_first_compressed_object = true;
1310   - }
1311   - if (obj == 0) {
1312   - // This is needed by checkLinearization()
1313   - m->first_xref_item_offset = xref_offset;
1314   - } else if (fields[0] == 0) {
1315   - // Ignore fields[2], which we don't care about in this case. This works around the
1316   - // issue of some PDF files that put invalid values, like -1, here for deleted
1317   - // objects.
1318   - insertFreeXrefEntry(QPDFObjGen(obj, 0));
1319   - } else {
1320   - insertXrefEntry(obj, toI(fields[0]), fields[1], toI(fields[2]));
1321   - }
1322   - ++obj;
1323   - }
1324   - }
1325   -
1326   - if (!m->trailer) {
1327   - setTrailer(dict);
1328   - }
1329   -
1330   - if (dict.hasKey("/Prev")) {
1331   - if (!dict.getKey("/Prev").isInteger()) {
1332   - throw damagedPDF(
1333   - "xref stream", "/Prev key in xref stream dictionary is not an integer");
1334   - }
1335   - QTC::TC("qpdf", "QPDF prev key in xref stream dictionary");
1336   - return dict.getKey("/Prev").getIntValue();
1337   - } else {
1338   - return 0;
1339   - }
1340   -}
1341   -
1342   -void
1343   -QPDF::insertXrefEntry(int obj, int f0, qpdf_offset_t f1, int f2)
1344   -{
1345   - // Populate the xref table in such a way that the first reference to an object that we see,
1346   - // which is the one in the latest xref table in which it appears, is the one that gets stored.
1347   - // This works because we are reading more recent appends before older ones.
1348   -
1349   - // If there is already an entry for this object and generation in the table, it means that a
1350   - // later xref table has registered this object. Disregard this one.
1351   - int new_gen = f0 == 2 ? 0 : f2;
1352   -
1353   - if (!(obj > 0 && obj <= m->xref_table_max_id && 0 <= f2 && new_gen < 65535)) {
1354   - // We are ignoring invalid objgens. Most will arrive here from xref reconstruction. There
1355   - // is probably no point having another warning but we could count invalid items in order to
1356   - // decide when to give up.
1357   - QTC::TC("qpdf", "QPDF xref overwrite invalid objgen");
1358   - // ignore impossibly large object ids or object ids > Size.
1359   - return;
1360   - }
1361   -
1362   - if (m->deleted_objects.count(obj)) {
1363   - QTC::TC("qpdf", "QPDF xref deleted object");
1364   - return;
1365   - }
1366   -
1367   - if (f0 == 2 && static_cast<int>(f1) == obj) {
1368   - warn(damagedPDF("xref stream", "self-referential object stream " + std::to_string(obj)));
1369   - return;
1370   - }
1371   -
1372   - auto [iter, created] = m->xref_table.try_emplace(QPDFObjGen(obj, (f0 == 2 ? 0 : f2)));
1373   - if (!created) {
1374   - QTC::TC("qpdf", "QPDF xref reused object");
1375   - return;
1376   - }
1377   -
1378   - switch (f0) {
1379   - case 1:
1380   - // f2 is generation
1381   - QTC::TC("qpdf", "QPDF xref gen > 0", ((f2 > 0) ? 1 : 0));
1382   - iter->second = QPDFXRefEntry(f1);
1383   - break;
1384   -
1385   - case 2:
1386   - iter->second = QPDFXRefEntry(toI(f1), f2);
1387   - break;
1388   -
1389   - default:
1390   - throw damagedPDF("xref stream", "unknown xref stream entry type " + std::to_string(f0));
1391   - break;
1392   - }
1393   -}
1394   -
1395   -void
1396   -QPDF::insertFreeXrefEntry(QPDFObjGen og)
1397   -{
1398   - if (!m->xref_table.count(og)) {
1399   - m->deleted_objects.insert(og.getObj());
1400   - }
1401   -}
1402   -
1403   -void
1404   -QPDF::showXRefTable()
1405   -{
1406   - auto& cout = *m->log->getInfo();
1407   - for (auto const& iter: m->xref_table) {
1408   - QPDFObjGen const& og = iter.first;
1409   - QPDFXRefEntry const& entry = iter.second;
1410   - cout << og.unparse('/') << ": ";
1411   - switch (entry.getType()) {
1412   - case 1:
1413   - cout << "uncompressed; offset = " << entry.getOffset();
1414   - break;
1415   -
1416   - case 2:
1417   - *m->log->getInfo() << "compressed; stream = " << entry.getObjStreamNumber()
1418   - << ", index = " << entry.getObjStreamIndex();
1419   - break;
1420   -
1421   - default:
1422   - throw std::logic_error("unknown cross-reference table type while showing xref_table");
1423   - break;
1424   - }
1425   - m->log->info("\n");
1426   - }
1427   -}
1428   -
1429   -// Resolve all objects in the xref table. If this triggers a xref table reconstruction abort and
1430   -// return false. Otherwise return true.
1431   -bool
1432   -QPDF::resolveXRefTable()
1433   -{
1434   - bool may_change = !m->reconstructed_xref;
1435   - for (auto& iter: m->xref_table) {
1436   - if (isUnresolved(iter.first)) {
1437   - resolve(iter.first);
1438   - if (may_change && m->reconstructed_xref) {
1439   - return false;
1440   - }
1441   - }
1442   - }
1443   - return true;
1444   -}
1445   -
1446   -// Ensure all objects in the pdf file, including those in indirect references, appear in the object
1447   -// cache.
1448   -void
1449   -QPDF::fixDanglingReferences(bool force)
1450   -{
1451   - if (m->fixed_dangling_refs) {
1452   - return;
1453   - }
1454   - if (!resolveXRefTable()) {
1455   - QTC::TC("qpdf", "QPDF fix dangling triggered xref reconstruction");
1456   - resolveXRefTable();
1457   - }
1458   - m->fixed_dangling_refs = true;
1459   -}
1460   -
1461   -size_t
1462   -QPDF::getObjectCount()
1463   -{
1464   - // This method returns the next available indirect object number. makeIndirectObject uses it for
1465   - // this purpose. After fixDanglingReferences is called, all objects in the xref table will also
1466   - // be in obj_cache.
1467   - fixDanglingReferences();
1468   - QPDFObjGen og;
1469   - if (!m->obj_cache.empty()) {
1470   - og = (*(m->obj_cache.rbegin())).first;
1471   - }
1472   - return toS(og.getObj());
1473   -}
1474   -
1475   -std::vector<QPDFObjectHandle>
1476   -QPDF::getAllObjects()
1477   -{
1478   - // After fixDanglingReferences is called, all objects are in the object cache.
1479   - fixDanglingReferences();
1480   - std::vector<QPDFObjectHandle> result;
1481   - for (auto const& iter: m->obj_cache) {
1482   - result.push_back(newIndirect(iter.first, iter.second.object));
1483   - }
1484   - return result;
1485   -}
1486   -
1487   -void
1488   -QPDF::setLastObjectDescription(std::string const& description, QPDFObjGen og)
1489   -{
1490   - m->last_object_description.clear();
1491   - if (!description.empty()) {
1492   - m->last_object_description += description;
1493   - if (og.isIndirect()) {
1494   - m->last_object_description += ": ";
1495   - }
1496   - }
1497   - if (og.isIndirect()) {
1498   - m->last_object_description += "object " + og.unparse(' ');
1499   - }
1500   -}
1501   -
1502   -QPDFObjectHandle
1503   -QPDF::readTrailer()
1504   -{
1505   - qpdf_offset_t offset = m->file->tell();
1506   - bool empty = false;
1507   - auto object =
1508   - QPDFParser(*m->file, "trailer", m->tokenizer, nullptr, this, true).parse(empty, false);
1509   - if (empty) {
1510   - // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in
1511   - // actual PDF files and Adobe Reader appears to ignore them.
1512   - warn(damagedPDF("trailer", "empty object treated as null"));
1513   - } else if (object.isDictionary() && readToken(*m->file).isWord("stream")) {
1514   - warn(damagedPDF("trailer", m->file->tell(), "stream keyword found in trailer"));
1515   - }
1516   - // Override last_offset so that it points to the beginning of the object we just read
1517   - m->file->setLastOffset(offset);
1518   - return object;
1519   -}
1520   -
1521   -QPDFObjectHandle
1522   -QPDF::readObject(std::string const& description, QPDFObjGen og)
1523   -{
1524   - setLastObjectDescription(description, og);
1525   - qpdf_offset_t offset = m->file->tell();
1526   - bool empty = false;
1527   -
1528   - StringDecrypter decrypter{this, og};
1529   - StringDecrypter* decrypter_ptr = m->encp->encrypted ? &decrypter : nullptr;
1530   - auto object =
1531   - QPDFParser(*m->file, m->last_object_description, m->tokenizer, decrypter_ptr, this, true)
1532   - .parse(empty, false);
1533   - if (empty) {
1534   - // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in
1535   - // actual PDF files and Adobe Reader appears to ignore them.
1536   - warn(damagedPDF(*m->file, m->file->getLastOffset(), "empty object treated as null"));
1537   - return object;
1538   - }
1539   - auto token = readToken(*m->file);
1540   - if (object.isDictionary() && token.isWord("stream")) {
1541   - readStream(object, og, offset);
1542   - token = readToken(*m->file);
1543   - }
1544   - if (!token.isWord("endobj")) {
1545   - QTC::TC("qpdf", "QPDF err expected endobj");
1546   - warn(damagedPDF("expected endobj"));
1547   - }
1548   - return object;
1549   -}
1550   -
1551   -// After reading stream dictionary and stream keyword, read rest of stream.
1552 416 void
1553   -QPDF::readStream(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset)
1554   -{
1555   - validateStreamLineEnd(object, og, offset);
1556   -
1557   - // Must get offset before accessing any additional objects since resolving a previously
1558   - // unresolved indirect object will change file position.
1559   - qpdf_offset_t stream_offset = m->file->tell();
1560   - size_t length = 0;
1561   -
1562   - try {
1563   - auto length_obj = object.getKey("/Length");
1564   -
1565   - if (!length_obj.isInteger()) {
1566   - if (length_obj.isNull()) {
1567   - QTC::TC("qpdf", "QPDF stream without length");
1568   - throw damagedPDF(offset, "stream dictionary lacks /Length key");
1569   - }
1570   - QTC::TC("qpdf", "QPDF stream length not integer");
1571   - throw damagedPDF(offset, "/Length key in stream dictionary is not an integer");
1572   - }
1573   -
1574   - length = toS(length_obj.getUIntValue());
1575   - // Seek in two steps to avoid potential integer overflow
1576   - m->file->seek(stream_offset, SEEK_SET);
1577   - m->file->seek(toO(length), SEEK_CUR);
1578   - if (!readToken(*m->file).isWord("endstream")) {
1579   - QTC::TC("qpdf", "QPDF missing endstream");
1580   - throw damagedPDF("expected endstream");
1581   - }
1582   - } catch (QPDFExc& e) {
1583   - if (m->attempt_recovery) {
1584   - warn(e);
1585   - length = recoverStreamLength(m->file, og, stream_offset);
1586   - } else {
1587   - throw;
1588   - }
1589   - }
1590   - object = QPDFObjectHandle(qpdf::Stream(*this, og, object, stream_offset, length));
1591   -}
1592   -
1593   -void
1594   -QPDF::validateStreamLineEnd(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset)
1595   -{
1596   - // The PDF specification states that the word "stream" should be followed by either a carriage
1597   - // return and a newline or by a newline alone. It specifically disallowed following it by a
1598   - // carriage return alone since, in that case, there would be no way to tell whether the NL in a
1599   - // CR NL sequence was part of the stream data. However, some readers, including Adobe reader,
1600   - // accept a carriage return by itself when followed by a non-newline character, so that's what
1601   - // we do here. We have also seen files that have extraneous whitespace between the stream
1602   - // keyword and the newline.
1603   - while (true) {
1604   - char ch;
1605   - if (m->file->read(&ch, 1) == 0) {
1606   - // A premature EOF here will result in some other problem that will get reported at
1607   - // another time.
1608   - return;
1609   - }
1610   - if (ch == '\n') {
1611   - // ready to read stream data
1612   - QTC::TC("qpdf", "QPDF stream with NL only");
1613   - return;
1614   - }
1615   - if (ch == '\r') {
1616   - // Read another character
1617   - if (m->file->read(&ch, 1) != 0) {
1618   - if (ch == '\n') {
1619   - // Ready to read stream data
1620   - QTC::TC("qpdf", "QPDF stream with CRNL");
1621   - } else {
1622   - // Treat the \r by itself as the whitespace after endstream and start reading
1623   - // stream data in spite of not having seen a newline.
1624   - QTC::TC("qpdf", "QPDF stream with CR only");
1625   - m->file->unreadCh(ch);
1626   - warn(damagedPDF(
1627   - m->file->tell(), "stream keyword followed by carriage return only"));
1628   - }
1629   - }
1630   - return;
1631   - }
1632   - if (!util::is_space(ch)) {
1633   - QTC::TC("qpdf", "QPDF stream without newline");
1634   - m->file->unreadCh(ch);
1635   - warn(damagedPDF(
1636   - m->file->tell(), "stream keyword not followed by proper line terminator"));
1637   - return;
1638   - }
1639   - warn(damagedPDF(m->file->tell(), "stream keyword followed by extraneous whitespace"));
1640   - }
1641   -}
1642   -
1643   -QPDFObjectHandle
1644   -QPDF::readObjectInStream(std::shared_ptr<InputSource>& input, int obj)
1645   -{
1646   - m->last_object_description.erase(7); // last_object_description starts with "object "
1647   - m->last_object_description += std::to_string(obj);
1648   - m->last_object_description += " 0";
1649   -
1650   - bool empty = false;
1651   - auto object = QPDFParser(*input, m->last_object_description, m->tokenizer, nullptr, this, true)
1652   - .parse(empty, false);
1653   - if (empty) {
1654   - // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in
1655   - // actual PDF files and Adobe Reader appears to ignore them.
1656   - warn(damagedPDF(*input, input->getLastOffset(), "empty object treated as null"));
1657   - }
1658   - return object;
1659   -}
1660   -
1661   -bool
1662   -QPDF::findEndstream()
1663   -{
1664   - // Find endstream or endobj. Position the input at that token.
1665   - auto t = readToken(*m->file, 20);
1666   - if (t.isWord("endobj") || t.isWord("endstream")) {
1667   - m->file->seek(m->file->getLastOffset(), SEEK_SET);
1668   - return true;
1669   - }
1670   - return false;
1671   -}
1672   -
1673   -size_t
1674   -QPDF::recoverStreamLength(
1675   - std::shared_ptr<InputSource> input, QPDFObjGen og, qpdf_offset_t stream_offset)
1676   -{
1677   - // Try to reconstruct stream length by looking for endstream or endobj
1678   - warn(damagedPDF(*input, stream_offset, "attempting to recover stream length"));
1679   -
1680   - PatternFinder ef(*this, &QPDF::findEndstream);
1681   - size_t length = 0;
1682   - if (m->file->findFirst("end", stream_offset, 0, ef)) {
1683   - length = toS(m->file->tell() - stream_offset);
1684   - // Reread endstream but, if it was endobj, don't skip that.
1685   - QPDFTokenizer::Token t = readToken(*m->file);
1686   - if (t.getValue() == "endobj") {
1687   - m->file->seek(m->file->getLastOffset(), SEEK_SET);
1688   - }
1689   - }
1690   -
1691   - if (length) {
1692   - auto end = stream_offset + toO(length);
1693   - qpdf_offset_t found_offset = 0;
1694   - QPDFObjGen found_og;
1695   -
1696   - // Make sure this is inside this object
1697   - for (auto const& [current_og, entry]: m->xref_table) {
1698   - if (entry.getType() == 1) {
1699   - qpdf_offset_t obj_offset = entry.getOffset();
1700   - if (found_offset < obj_offset && obj_offset < end) {
1701   - found_offset = obj_offset;
1702   - found_og = current_og;
1703   - }
1704   - }
1705   - }
1706   - if (!found_offset || found_og == og) {
1707   - // If we are trying to recover an XRef stream the xref table will not contain and
1708   - // won't contain any entries, therefore we cannot check the found length. Otherwise we
1709   - // found endstream\nendobj within the space allowed for this object, so we're probably
1710   - // in good shape.
1711   - } else {
1712   - QTC::TC("qpdf", "QPDF found wrong endstream in recovery");
1713   - length = 0;
1714   - }
1715   - }
1716   -
1717   - if (length == 0) {
1718   - warn(damagedPDF(
1719   - *input, stream_offset, "unable to recover stream data; treating stream as empty"));
1720   - } else {
1721   - warn(damagedPDF(
1722   - *input, stream_offset, "recovered stream length: " + std::to_string(length)));
1723   - }
1724   -
1725   - QTC::TC("qpdf", "QPDF recovered stream length");
1726   - return length;
1727   -}
1728   -
1729   -QPDFTokenizer::Token
1730   -QPDF::readToken(InputSource& input, size_t max_len)
1731   -{
1732   - return m->tokenizer.readToken(input, m->last_object_description, true, max_len);
1733   -}
1734   -
1735   -QPDFObjectHandle
1736   -QPDF::readObjectAtOffset(
1737   - bool try_recovery,
1738   - qpdf_offset_t offset,
1739   - std::string const& description,
1740   - QPDFObjGen exp_og,
1741   - QPDFObjGen& og,
1742   - bool skip_cache_if_in_xref)
1743   -{
1744   - bool check_og = true;
1745   - if (exp_og.getObj() == 0) {
1746   - // This method uses an expect object ID of 0 to indicate that we don't know or don't care
1747   - // what the actual object ID is at this offset. This is true when we read the xref stream
1748   - // and linearization hint streams. In this case, we don't verify the expect object
1749   - // ID/generation against what was read from the file. There is also no reason to attempt
1750   - // xref recovery if we get a failure in this case since the read attempt was not triggered
1751   - // by an xref lookup.
1752   - check_og = false;
1753   - try_recovery = false;
1754   - }
1755   - setLastObjectDescription(description, exp_og);
1756   -
1757   - if (!m->attempt_recovery) {
1758   - try_recovery = false;
1759   - }
1760   -
1761   - // Special case: if offset is 0, just return null. Some PDF writers, in particular
1762   - // "Mac OS X 10.7.5 Quartz PDFContext", may store deleted objects in the xref table as
1763   - // "0000000000 00000 n", which is not correct, but it won't hurt anything for us to ignore
1764   - // these.
1765   - if (offset == 0) {
1766   - QTC::TC("qpdf", "QPDF bogus 0 offset", 0);
1767   - warn(damagedPDF(0, "object has offset 0"));
1768   - return QPDFObjectHandle::newNull();
1769   - }
1770   -
1771   - m->file->seek(offset, SEEK_SET);
1772   - try {
1773   - QPDFTokenizer::Token tobjid = readToken(*m->file);
1774   - bool objidok = tobjid.isInteger();
1775   - QTC::TC("qpdf", "QPDF check objid", objidok ? 1 : 0);
1776   - if (!objidok) {
1777   - QTC::TC("qpdf", "QPDF expected n n obj");
1778   - throw damagedPDF(offset, "expected n n obj");
1779   - }
1780   - QPDFTokenizer::Token tgen = readToken(*m->file);
1781   - bool genok = tgen.isInteger();
1782   - QTC::TC("qpdf", "QPDF check generation", genok ? 1 : 0);
1783   - if (!genok) {
1784   - throw damagedPDF(offset, "expected n n obj");
1785   - }
1786   - QPDFTokenizer::Token tobj = readToken(*m->file);
1787   -
1788   - bool objok = tobj.isWord("obj");
1789   - QTC::TC("qpdf", "QPDF check obj", objok ? 1 : 0);
1790   -
1791   - if (!objok) {
1792   - throw damagedPDF(offset, "expected n n obj");
1793   - }
1794   - int objid = QUtil::string_to_int(tobjid.getValue().c_str());
1795   - int generation = QUtil::string_to_int(tgen.getValue().c_str());
1796   - og = QPDFObjGen(objid, generation);
1797   - if (objid == 0) {
1798   - QTC::TC("qpdf", "QPDF object id 0");
1799   - throw damagedPDF(offset, "object with ID 0");
1800   - }
1801   - if (check_og && (exp_og != og)) {
1802   - QTC::TC("qpdf", "QPDF err wrong objid/generation");
1803   - QPDFExc e = damagedPDF(offset, "expected " + exp_og.unparse(' ') + " obj");
1804   - if (try_recovery) {
1805   - // Will be retried below
1806   - throw e;
1807   - } else {
1808   - // We can try reading the object anyway even if the ID doesn't match.
1809   - warn(e);
1810   - }
1811   - }
1812   - } catch (QPDFExc& e) {
1813   - if (try_recovery) {
1814   - // Try again after reconstructing xref table
1815   - reconstruct_xref(e);
1816   - if (m->xref_table.count(exp_og) && (m->xref_table[exp_og].getType() == 1)) {
1817   - qpdf_offset_t new_offset = m->xref_table[exp_og].getOffset();
1818   - QPDFObjectHandle result =
1819   - readObjectAtOffset(false, new_offset, description, exp_og, og, false);
1820   - QTC::TC("qpdf", "QPDF recovered in readObjectAtOffset");
1821   - return result;
1822   - } else {
1823   - QTC::TC("qpdf", "QPDF object gone after xref reconstruction");
1824   - warn(damagedPDF(
1825   - "",
1826   - 0,
1827   - ("object " + exp_og.unparse(' ') +
1828   - " not found in file after regenerating cross reference "
1829   - "table")));
1830   - return QPDFObjectHandle::newNull();
1831   - }
1832   - } else {
1833   - throw;
1834   - }
1835   - }
1836   -
1837   - QPDFObjectHandle oh = readObject(description, og);
1838   -
1839   - if (isUnresolved(og)) {
1840   - // Store the object in the cache here so it gets cached whether we first know the offset or
1841   - // whether we first know the object ID and generation (in which we case we would get here
1842   - // through resolve).
1843   -
1844   - // Determine the end offset of this object before and after white space. We use these
1845   - // numbers to validate linearization hint tables. Offsets and lengths of objects may imply
1846   - // the end of an object to be anywhere between these values.
1847   - qpdf_offset_t end_before_space = m->file->tell();
1848   -
1849   - // skip over spaces
1850   - while (true) {
1851   - char ch;
1852   - if (m->file->read(&ch, 1)) {
1853   - if (!isspace(static_cast<unsigned char>(ch))) {
1854   - m->file->seek(-1, SEEK_CUR);
1855   - break;
1856   - }
1857   - } else {
1858   - throw damagedPDF(m->file->tell(), "EOF after endobj");
1859   - }
1860   - }
1861   - qpdf_offset_t end_after_space = m->file->tell();
1862   - if (skip_cache_if_in_xref && m->xref_table.count(og)) {
1863   - // Ordinarily, an object gets read here when resolved through xref table or stream. In
1864   - // the special case of the xref stream and linearization hint tables, the offset comes
1865   - // from another source. For the specific case of xref streams, the xref stream is read
1866   - // and loaded into the object cache very early in parsing. Ordinarily, when a file is
1867   - // updated by appending, items inserted into the xref table in later updates take
1868   - // precedence over earlier items. In the special case of reusing the object number
1869   - // previously used as the xref stream, we have the following order of events:
1870   - //
1871   - // * reused object gets loaded into the xref table
1872   - // * old object is read here while reading xref streams
1873   - // * original xref entry is ignored (since already in xref table)
1874   - //
1875   - // It is the second step that causes a problem. Even though the xref table is correct in
1876   - // this case, the old object is already in the cache and so effectively prevails over
1877   - // the reused object. To work around this issue, we have a special case for the xref
1878   - // stream (via the skip_cache_if_in_xref): if the object is already in the xref stream,
1879   - // don't cache what we read here.
1880   - //
1881   - // It is likely that the same bug may exist for linearization hint tables, but the
1882   - // existing code uses end_before_space and end_after_space from the cache, so fixing
1883   - // that would require more significant rework. The chances of a linearization hint
1884   - // stream being reused seems smaller because the xref stream is probably the highest
1885   - // object in the file and the linearization hint stream would be some random place in
1886   - // the middle, so I'm leaving that bug unfixed for now. If the bug were to be fixed, we
1887   - // could use !check_og in place of skip_cache_if_in_xref.
1888   - QTC::TC("qpdf", "QPDF skipping cache for known unchecked object");
1889   - } else {
1890   - updateCache(og, oh.getObj(), end_before_space, end_after_space);
1891   - }
1892   - }
1893   -
1894   - return oh;
1895   -}
1896   -
1897   -std::shared_ptr<QPDFObject> const&
1898   -QPDF::resolve(QPDFObjGen og)
1899   -{
1900   - if (!isUnresolved(og)) {
1901   - return m->obj_cache[og].object;
1902   - }
1903   -
1904   - if (m->resolving.count(og)) {
1905   - // This can happen if an object references itself directly or indirectly in some key that
1906   - // has to be resolved during object parsing, such as stream length.
1907   - QTC::TC("qpdf", "QPDF recursion loop in resolve");
1908   - warn(damagedPDF("", "loop detected resolving object " + og.unparse(' ')));
1909   - updateCache(og, QPDFObject::create<QPDF_Null>(), -1, -1);
1910   - return m->obj_cache[og].object;
1911   - }
1912   - ResolveRecorder rr(this, og);
1913   -
1914   - if (m->xref_table.count(og) != 0) {
1915   - QPDFXRefEntry const& entry = m->xref_table[og];
1916   - try {
1917   - switch (entry.getType()) {
1918   - case 1:
1919   - {
1920   - qpdf_offset_t offset = entry.getOffset();
1921   - // Object stored in cache by readObjectAtOffset
1922   - QPDFObjGen a_og;
1923   - QPDFObjectHandle oh = readObjectAtOffset(true, offset, "", og, a_og, false);
1924   - }
1925   - break;
1926   -
1927   - case 2:
1928   - resolveObjectsInStream(entry.getObjStreamNumber());
1929   - break;
1930   -
1931   - default:
1932   - throw damagedPDF(
1933   - "", 0, ("object " + og.unparse('/') + " has unexpected xref entry type"));
1934   - }
1935   - } catch (QPDFExc& e) {
1936   - warn(e);
1937   - } catch (std::exception& e) {
1938   - warn(damagedPDF(
1939   - "", 0, ("object " + og.unparse('/') + ": error reading object: " + e.what())));
1940   - }
1941   - }
1942   -
1943   - if (isUnresolved(og)) {
1944   - // PDF spec says unknown objects resolve to the null object.
1945   - QTC::TC("qpdf", "QPDF resolve failure to null");
1946   - updateCache(og, QPDFObject::create<QPDF_Null>(), -1, -1);
1947   - }
1948   -
1949   - auto& result(m->obj_cache[og].object);
1950   - result->setDefaultDescription(this, og);
1951   - return result;
1952   -}
1953   -
1954   -void
1955   -QPDF::resolveObjectsInStream(int obj_stream_number)
  417 +QPDF::warn(QPDFExc const& e)
1956 418 {
1957   - if (m->resolved_object_streams.count(obj_stream_number)) {
1958   - return;
1959   - }
1960   - m->resolved_object_streams.insert(obj_stream_number);
1961   - // Force resolution of object stream
1962   - QPDFObjectHandle obj_stream = getObjectByID(obj_stream_number, 0);
1963   - if (!obj_stream.isStream()) {
1964   - throw damagedPDF(
1965   - "supposed object stream " + std::to_string(obj_stream_number) + " is not a stream");
1966   - }
1967   -
1968   - // For linearization data in the object, use the data from the object stream for the objects in
1969   - // the stream.
1970   - QPDFObjGen stream_og(obj_stream_number, 0);
1971   - qpdf_offset_t end_before_space = m->obj_cache[stream_og].end_before_space;
1972   - qpdf_offset_t end_after_space = m->obj_cache[stream_og].end_after_space;
1973   -
1974   - QPDFObjectHandle dict = obj_stream.getDict();
1975   - if (!dict.isDictionaryOfType("/ObjStm")) {
1976   - QTC::TC("qpdf", "QPDF ERR object stream with wrong type");
1977   - warn(damagedPDF(
1978   - "supposed object stream " + std::to_string(obj_stream_number) + " has wrong type"));
1979   - }
1980   -
1981   - if (!(dict.getKey("/N").isInteger() && dict.getKey("/First").isInteger())) {
1982   - throw damagedPDF(
1983   - ("object stream " + std::to_string(obj_stream_number) + " has incorrect keys"));
1984   - }
1985   -
1986   - int n = dict.getKey("/N").getIntValueAsInt();
1987   - int first = dict.getKey("/First").getIntValueAsInt();
1988   -
1989   - std::map<int, int> offsets;
1990   -
1991   - std::shared_ptr<Buffer> bp = obj_stream.getStreamData(qpdf_dl_specialized);
1992   - auto input = std::shared_ptr<InputSource>(
1993   - // line-break
1994   - new BufferInputSource(
1995   - (m->file->getName() + " object stream " + std::to_string(obj_stream_number)),
1996   - bp.get()));
1997   -
1998   - long long last_offset = -1;
1999   - for (int i = 0; i < n; ++i) {
2000   - QPDFTokenizer::Token tnum = readToken(*input);
2001   - QPDFTokenizer::Token toffset = readToken(*input);
2002   - if (!(tnum.isInteger() && toffset.isInteger())) {
2003   - throw damagedPDF(
2004   - *input,
2005   - m->last_object_description,
2006   - input->getLastOffset(),
2007   - "expected integer in object stream header");
2008   - }
2009   -
2010   - int num = QUtil::string_to_int(tnum.getValue().c_str());
2011   - long long offset = QUtil::string_to_int(toffset.getValue().c_str());
2012   -
2013   - if (num == obj_stream_number) {
2014   - QTC::TC("qpdf", "QPDF ignore self-referential object stream");
2015   - warn(damagedPDF(
2016   - *input,
2017   - m->last_object_description,
2018   - input->getLastOffset(),
2019   - "object stream claims to contain itself"));
2020   - continue;
2021   - }
2022   -
2023   - if (num < 1) {
2024   - QTC::TC("qpdf", "QPDF object stream contains id < 1");
2025   - warn(damagedPDF(
2026   - *input,
2027   - "object "s + std::to_string(num) + " 0, offset " + std::to_string(offset),
2028   - 0,
2029   - "object id is invalid"s));
2030   - continue;
2031   - }
2032   -
2033   - if (offset <= last_offset) {
2034   - QTC::TC("qpdf", "QPDF object stream offsets not increasing");
2035   - warn(damagedPDF(
2036   - *input,
2037   - "object "s + std::to_string(num) + " 0, offset " + std::to_string(offset),
2038   - 0,
2039   - "offset is invalid (must be larger than previous offset " +
2040   - std::to_string(last_offset) + ")"));
2041   - continue;
2042   - }
2043   - last_offset = offset;
2044   -
2045   - if (num > m->xref_table_max_id) {
2046   - continue;
2047   - }
2048   -
2049   - offsets[num] = toI(offset + first);
  419 + if (m->max_warnings > 0 && m->warnings.size() >= m->max_warnings) {
  420 + stopOnError("Too many warnings - file is too badly damaged");
2050 421 }
2051   -
2052   - // To avoid having to read the object stream multiple times, store all objects that would be
2053   - // found here in the cache. Remember that some objects stored here might have been overridden
2054   - // by new objects appended to the file, so it is necessary to recheck the xref table and only
2055   - // cache what would actually be resolved here.
2056   - m->last_object_description.clear();
2057   - m->last_object_description += "object ";
2058   - for (auto const& iter: offsets) {
2059   - QPDFObjGen og(iter.first, 0);
2060   - auto entry = m->xref_table.find(og);
2061   - if (entry != m->xref_table.end() && entry->second.getType() == 2 &&
2062   - entry->second.getObjStreamNumber() == obj_stream_number) {
2063   - int offset = iter.second;
2064   - input->seek(offset, SEEK_SET);
2065   - QPDFObjectHandle oh = readObjectInStream(input, iter.first);
2066   - updateCache(og, oh.getObj(), end_before_space, end_after_space);
2067   - } else {
2068   - QTC::TC("qpdf", "QPDF not caching overridden objstm object");
2069   - }
  422 + m->warnings.push_back(e);
  423 + if (!m->suppress_warnings) {
  424 + *m->log->getWarn() << "WARNING: " << m->warnings.back().what() << "\n";
2070 425 }
2071 426 }
2072 427  
2073   -QPDFObjectHandle
2074   -QPDF::newIndirect(QPDFObjGen og, std::shared_ptr<QPDFObject> const& obj)
2075   -{
2076   - obj->setDefaultDescription(this, og);
2077   - return {obj};
2078   -}
2079   -
2080 428 void
2081   -QPDF::updateCache(
2082   - QPDFObjGen og,
2083   - std::shared_ptr<QPDFObject> const& object,
2084   - qpdf_offset_t end_before_space,
2085   - qpdf_offset_t end_after_space,
2086   - bool destroy)
2087   -{
2088   - object->setObjGen(this, og);
2089   - if (isCached(og)) {
2090   - auto& cache = m->obj_cache[og];
2091   - object->move_to(cache.object, destroy);
2092   - cache.end_before_space = end_before_space;
2093   - cache.end_after_space = end_after_space;
2094   - } else {
2095   - m->obj_cache[og] = ObjCache(object, end_before_space, end_after_space);
2096   - }
2097   -}
2098   -
2099   -bool
2100   -QPDF::isCached(QPDFObjGen og)
2101   -{
2102   - return m->obj_cache.count(og) != 0;
2103   -}
2104   -
2105   -bool
2106   -QPDF::isUnresolved(QPDFObjGen og)
2107   -{
2108   - return !isCached(og) || m->obj_cache[og].object->isUnresolved();
2109   -}
2110   -
2111   -QPDFObjGen
2112   -QPDF::nextObjGen()
2113   -{
2114   - int max_objid = toI(getObjectCount());
2115   - if (max_objid == std::numeric_limits<int>::max()) {
2116   - throw std::range_error("max object id is too high to create new objects");
2117   - }
2118   - return QPDFObjGen(max_objid + 1, 0);
2119   -}
2120   -
2121   -QPDFObjectHandle
2122   -QPDF::makeIndirectFromQPDFObject(std::shared_ptr<QPDFObject> const& obj)
2123   -{
2124   - QPDFObjGen next{nextObjGen()};
2125   - m->obj_cache[next] = ObjCache(obj, -1, -1);
2126   - return newIndirect(next, m->obj_cache[next].object);
2127   -}
2128   -
2129   -QPDFObjectHandle
2130   -QPDF::makeIndirectObject(QPDFObjectHandle oh)
  429 +QPDF::warn(
  430 + qpdf_error_code_e error_code,
  431 + std::string const& object,
  432 + qpdf_offset_t offset,
  433 + std::string const& message)
2131 434 {
2132   - if (!oh) {
2133   - throw std::logic_error("attempted to make an uninitialized QPDFObjectHandle indirect");
2134   - }
2135   - return makeIndirectFromQPDFObject(oh.getObj());
  435 + warn(QPDFExc(error_code, getFilename(), object, offset, message));
2136 436 }
2137 437  
2138 438 QPDFObjectHandle
... ... @@ -2170,52 +470,6 @@ QPDF::newStream(std::string const& data)
2170 470 return result;
2171 471 }
2172 472  
2173   -std::shared_ptr<QPDFObject>
2174   -QPDF::getObjectForParser(int id, int gen, bool parse_pdf)
2175   -{
2176   - // This method is called by the parser and therefore must not resolve any objects.
2177   - auto og = QPDFObjGen(id, gen);
2178   - if (auto iter = m->obj_cache.find(og); iter != m->obj_cache.end()) {
2179   - return iter->second.object;
2180   - }
2181   - if (m->xref_table.count(og) || !m->parsed) {
2182   - return m->obj_cache.insert({og, QPDFObject::create<QPDF_Unresolved>(this, og)})
2183   - .first->second.object;
2184   - }
2185   - if (parse_pdf) {
2186   - return QPDFObject::create<QPDF_Null>();
2187   - }
2188   - return m->obj_cache.insert({og, QPDFObject::create<QPDF_Null>(this, og)}).first->second.object;
2189   -}
2190   -
2191   -std::shared_ptr<QPDFObject>
2192   -QPDF::getObjectForJSON(int id, int gen)
2193   -{
2194   - auto og = QPDFObjGen(id, gen);
2195   - auto [it, inserted] = m->obj_cache.try_emplace(og);
2196   - auto& obj = it->second.object;
2197   - if (inserted) {
2198   - obj = (m->parsed && !m->xref_table.count(og))
2199   - ? QPDFObject::create<QPDF_Null>(this, og)
2200   - : QPDFObject::create<QPDF_Unresolved>(this, og);
2201   - }
2202   - return obj;
2203   -}
2204   -
2205   -QPDFObjectHandle
2206   -QPDF::getObject(QPDFObjGen og)
2207   -{
2208   - if (auto it = m->obj_cache.find(og); it != m->obj_cache.end()) {
2209   - return {it->second.object};
2210   - } else if (m->parsed && !m->xref_table.count(og)) {
2211   - return QPDFObject::create<QPDF_Null>();
2212   - } else {
2213   - auto result =
2214   - m->obj_cache.try_emplace(og, QPDFObject::create<QPDF_Unresolved>(this, og), -1, -1);
2215   - return {result.first->second.object};
2216   - }
2217   -}
2218   -
2219 473 QPDFObjectHandle
2220 474 QPDF::getObject(int objid, int generation)
2221 475 {
... ... @@ -2234,45 +488,6 @@ QPDF::getObjectByID(int objid, int generation)
2234 488 return getObject(QPDFObjGen(objid, generation));
2235 489 }
2236 490  
2237   -void
2238   -QPDF::replaceObject(int objid, int generation, QPDFObjectHandle oh)
2239   -{
2240   - replaceObject(QPDFObjGen(objid, generation), oh);
2241   -}
2242   -
2243   -void
2244   -QPDF::replaceObject(QPDFObjGen og, QPDFObjectHandle oh)
2245   -{
2246   - if (!oh || (oh.isIndirect() && !(oh.isStream() && oh.getObjGen() == og))) {
2247   - QTC::TC("qpdf", "QPDF replaceObject called with indirect object");
2248   - throw std::logic_error("QPDF::replaceObject called with indirect object handle");
2249   - }
2250   - updateCache(og, oh.getObj(), -1, -1, false);
2251   -}
2252   -
2253   -void
2254   -QPDF::removeObject(QPDFObjGen og)
2255   -{
2256   - m->xref_table.erase(og);
2257   - if (auto cached = m->obj_cache.find(og); cached != m->obj_cache.end()) {
2258   - // Take care of any object handles that may be floating around.
2259   - cached->second.object->assign_null();
2260   - cached->second.object->setObjGen(nullptr, QPDFObjGen());
2261   - m->obj_cache.erase(cached);
2262   - }
2263   -}
2264   -
2265   -void
2266   -QPDF::replaceReserved(QPDFObjectHandle reserved, QPDFObjectHandle replacement)
2267   -{
2268   - QTC::TC("qpdf", "QPDF replaceReserved");
2269   - auto tc = reserved.getTypeCode();
2270   - if (!(tc == ::ot_reserved || tc == ::ot_null)) {
2271   - throw std::logic_error("replaceReserved called with non-reserved object");
2272   - }
2273   - replaceObject(reserved.getObjGen(), replacement);
2274   -}
2275   -
2276 491 QPDFObjectHandle
2277 492 QPDF::copyForeignObject(QPDFObjectHandle foreign)
2278 493 {
... ... @@ -2532,21 +747,6 @@ QPDF::copyStreamData(QPDFObjectHandle result, QPDFObjectHandle foreign)
2532 747 }
2533 748 }
2534 749  
2535   -void
2536   -QPDF::swapObjects(int objid1, int generation1, int objid2, int generation2)
2537   -{
2538   - swapObjects(QPDFObjGen(objid1, generation1), QPDFObjGen(objid2, generation2));
2539   -}
2540   -
2541   -void
2542   -QPDF::swapObjects(QPDFObjGen og1, QPDFObjGen og2)
2543   -{
2544   - // Force objects to be read from the input source if needed, then swap them in the cache.
2545   - resolve(og1);
2546   - resolve(og2);
2547   - m->obj_cache[og1].object->swapWith(m->obj_cache[og2].object);
2548   -}
2549   -
2550 750 unsigned long long
2551 751 QPDF::getUniqueId() const
2552 752 {
... ... @@ -2640,136 +840,6 @@ QPDF::getXRefTableInternal()
2640 840 return m->xref_table;
2641 841 }
2642 842  
2643   -size_t
2644   -QPDF::tableSize()
2645   -{
2646   - // If obj_cache is dense, accommodate all object in tables,else accommodate only original
2647   - // objects.
2648   - auto max_xref = m->xref_table.size() ? m->xref_table.crbegin()->first.getObj() : 0;
2649   - auto max_obj = m->obj_cache.size() ? m->obj_cache.crbegin()->first.getObj() : 0;
2650   - auto max_id = std::numeric_limits<int>::max() - 1;
2651   - if (max_obj >= max_id || max_xref >= max_id) {
2652   - // Temporary fix. Long-term solution is
2653   - // - QPDFObjGen to enforce objgens are valid and sensible
2654   - // - xref table and obj cache to protect against insertion of impossibly large obj ids
2655   - stopOnError("Impossibly large object id encountered.");
2656   - }
2657   - if (max_obj < 1.1 * std::max(toI(m->obj_cache.size()), max_xref)) {
2658   - return toS(++max_obj);
2659   - }
2660   - return toS(++max_xref);
2661   -}
2662   -
2663   -std::vector<QPDFObjGen>
2664   -QPDF::getCompressibleObjVector()
2665   -{
2666   - return getCompressibleObjGens<QPDFObjGen>();
2667   -}
2668   -
2669   -std::vector<bool>
2670   -QPDF::getCompressibleObjSet()
2671   -{
2672   - return getCompressibleObjGens<bool>();
2673   -}
2674   -
2675   -template <typename T>
2676   -std::vector<T>
2677   -QPDF::getCompressibleObjGens()
2678   -{
2679   - // Return a list of objects that are allowed to be in object streams. Walk through the objects
2680   - // by traversing the document from the root, including a traversal of the pages tree. This
2681   - // makes that objects that are on the same page are more likely to be in the same object stream,
2682   - // which is slightly more efficient, particularly with linearized files. This is better than
2683   - // iterating through the xref table since it avoids preserving orphaned items.
2684   -
2685   - // Exclude encryption dictionary, if any
2686   - QPDFObjectHandle encryption_dict = m->trailer.getKey("/Encrypt");
2687   - QPDFObjGen encryption_dict_og = encryption_dict.getObjGen();
2688   -
2689   - const size_t max_obj = getObjectCount();
2690   - std::vector<bool> visited(max_obj, false);
2691   - std::vector<QPDFObjectHandle> queue;
2692   - queue.reserve(512);
2693   - queue.push_back(m->trailer);
2694   - std::vector<T> result;
2695   - if constexpr (std::is_same_v<T, QPDFObjGen>) {
2696   - result.reserve(m->obj_cache.size());
2697   - } else if constexpr (std::is_same_v<T, bool>) {
2698   - result.resize(max_obj + 1U, false);
2699   - } else {
2700   - throw std::logic_error("Unsupported type in QPDF::getCompressibleObjGens");
2701   - }
2702   - while (!queue.empty()) {
2703   - auto obj = queue.back();
2704   - queue.pop_back();
2705   - if (obj.getObjectID() > 0) {
2706   - QPDFObjGen og = obj.getObjGen();
2707   - const size_t id = toS(og.getObj() - 1);
2708   - if (id >= max_obj) {
2709   - throw std::logic_error(
2710   - "unexpected object id encountered in getCompressibleObjGens");
2711   - }
2712   - if (visited[id]) {
2713   - QTC::TC("qpdf", "QPDF loop detected traversing objects");
2714   - continue;
2715   - }
2716   -
2717   - // Check whether this is the current object. If not, remove it (which changes it into a
2718   - // direct null and therefore stops us from revisiting it) and move on to the next object
2719   - // in the queue.
2720   - auto upper = m->obj_cache.upper_bound(og);
2721   - if (upper != m->obj_cache.end() && upper->first.getObj() == og.getObj()) {
2722   - removeObject(og);
2723   - continue;
2724   - }
2725   -
2726   - visited[id] = true;
2727   -
2728   - if (og == encryption_dict_og) {
2729   - QTC::TC("qpdf", "QPDF exclude encryption dictionary");
2730   - } else if (!(obj.isStream() ||
2731   - (obj.isDictionaryOfType("/Sig") && obj.hasKey("/ByteRange") &&
2732   - obj.hasKey("/Contents")))) {
2733   - if constexpr (std::is_same_v<T, QPDFObjGen>) {
2734   - result.push_back(og);
2735   - } else if constexpr (std::is_same_v<T, bool>) {
2736   - result[id + 1U] = true;
2737   - }
2738   - }
2739   - }
2740   - if (obj.isStream()) {
2741   - auto dict = obj.getDict().as_dictionary();
2742   - auto end = dict.crend();
2743   - for (auto iter = dict.crbegin(); iter != end; ++iter) {
2744   - std::string const& key = iter->first;
2745   - QPDFObjectHandle const& value = iter->second;
2746   - if (!value.null()) {
2747   - if (key == "/Length") {
2748   - // omit stream lengths
2749   - if (value.isIndirect()) {
2750   - QTC::TC("qpdf", "QPDF exclude indirect length");
2751   - }
2752   - } else {
2753   - queue.emplace_back(value);
2754   - }
2755   - }
2756   - }
2757   - } else if (obj.isDictionary()) {
2758   - auto dict = obj.as_dictionary();
2759   - auto end = dict.crend();
2760   - for (auto iter = dict.crbegin(); iter != end; ++iter) {
2761   - if (!iter->second.null()) {
2762   - queue.emplace_back(iter->second);
2763   - }
2764   - }
2765   - } else if (auto items = obj.as_array()) {
2766   - queue.insert(queue.end(), items.crbegin(), items.crend());
2767   - }
2768   - }
2769   -
2770   - return result;
2771   -}
2772   -
2773 843 bool
2774 844 QPDF::pipeStreamData(
2775 845 std::shared_ptr<EncryptionParameters> encp,
... ...
libqpdf/QPDF_objects.cc 0 → 100644
  1 +#include <qpdf/qpdf-config.h> // include first for large file support
  2 +
  3 +#include <qpdf/QPDF_private.hh>
  4 +
  5 +#include <array>
  6 +#include <atomic>
  7 +#include <cstring>
  8 +#include <limits>
  9 +#include <map>
  10 +#include <regex>
  11 +#include <sstream>
  12 +#include <vector>
  13 +
  14 +#include <qpdf/BufferInputSource.hh>
  15 +#include <qpdf/FileInputSource.hh>
  16 +#include <qpdf/InputSource_private.hh>
  17 +#include <qpdf/OffsetInputSource.hh>
  18 +#include <qpdf/Pipeline.hh>
  19 +#include <qpdf/QPDFExc.hh>
  20 +#include <qpdf/QPDFLogger.hh>
  21 +#include <qpdf/QPDFObjectHandle_private.hh>
  22 +#include <qpdf/QPDFObject_private.hh>
  23 +#include <qpdf/QPDFParser.hh>
  24 +#include <qpdf/QTC.hh>
  25 +#include <qpdf/QUtil.hh>
  26 +#include <qpdf/Util.hh>
  27 +
  28 +using namespace qpdf;
  29 +using namespace std::literals;
  30 +
  31 +namespace
  32 +{
  33 + class InvalidInputSource: public InputSource
  34 + {
  35 + public:
  36 + ~InvalidInputSource() override = default;
  37 + qpdf_offset_t
  38 + findAndSkipNextEOL() override
  39 + {
  40 + throwException();
  41 + return 0;
  42 + }
  43 + std::string const&
  44 + getName() const override
  45 + {
  46 + static std::string name("closed input source");
  47 + return name;
  48 + }
  49 + qpdf_offset_t
  50 + tell() override
  51 + {
  52 + throwException();
  53 + return 0;
  54 + }
  55 + void
  56 + seek(qpdf_offset_t offset, int whence) override
  57 + {
  58 + throwException();
  59 + }
  60 + void
  61 + rewind() override
  62 + {
  63 + throwException();
  64 + }
  65 + size_t
  66 + read(char* buffer, size_t length) override
  67 + {
  68 + throwException();
  69 + return 0;
  70 + }
  71 + void
  72 + unreadCh(char ch) override
  73 + {
  74 + throwException();
  75 + }
  76 +
  77 + private:
  78 + void
  79 + throwException()
  80 + {
  81 + throw std::logic_error(
  82 + "QPDF operation attempted on a QPDF object with no input "
  83 + "source. QPDF operations are invalid before processFile (or "
  84 + "another process method) or after closeInputSource");
  85 + }
  86 + };
  87 +} // namespace
  88 +
  89 +bool
  90 +QPDF::findStartxref()
  91 +{
  92 + if (readToken(*m->file).isWord("startxref") && readToken(*m->file).isInteger()) {
  93 + // Position in front of offset token
  94 + m->file->seek(m->file->getLastOffset(), SEEK_SET);
  95 + return true;
  96 + }
  97 + return false;
  98 +}
  99 +
  100 +void
  101 +QPDF::parse(char const* password)
  102 +{
  103 + if (password) {
  104 + m->encp->provided_password = password;
  105 + }
  106 +
  107 + // Find the header anywhere in the first 1024 bytes of the file.
  108 + PatternFinder hf(*this, &QPDF::findHeader);
  109 + if (!m->file->findFirst("%PDF-", 0, 1024, hf)) {
  110 + QTC::TC("qpdf", "QPDF not a pdf file");
  111 + warn(damagedPDF("", 0, "can't find PDF header"));
  112 + // QPDFWriter writes files that usually require at least version 1.2 for /FlateDecode
  113 + m->pdf_version = "1.2";
  114 + }
  115 +
  116 + // PDF spec says %%EOF must be found within the last 1024 bytes of/ the file. We add an extra
  117 + // 30 characters to leave room for the startxref stuff.
  118 + m->file->seek(0, SEEK_END);
  119 + qpdf_offset_t end_offset = m->file->tell();
  120 + m->xref_table_max_offset = end_offset;
  121 + // Sanity check on object ids. All objects must appear in xref table / stream. In all realistic
  122 + // scenarios at least 3 bytes are required.
  123 + if (m->xref_table_max_id > m->xref_table_max_offset / 3) {
  124 + m->xref_table_max_id = static_cast<int>(m->xref_table_max_offset / 3);
  125 + }
  126 + qpdf_offset_t start_offset = (end_offset > 1054 ? end_offset - 1054 : 0);
  127 + PatternFinder sf(*this, &QPDF::findStartxref);
  128 + qpdf_offset_t xref_offset = 0;
  129 + if (m->file->findLast("startxref", start_offset, 0, sf)) {
  130 + xref_offset = QUtil::string_to_ll(readToken(*m->file).getValue().c_str());
  131 + }
  132 +
  133 + try {
  134 + if (xref_offset == 0) {
  135 + QTC::TC("qpdf", "QPDF can't find startxref");
  136 + throw damagedPDF("", 0, "can't find startxref");
  137 + }
  138 + try {
  139 + read_xref(xref_offset);
  140 + } catch (QPDFExc&) {
  141 + throw;
  142 + } catch (std::exception& e) {
  143 + throw damagedPDF("", 0, std::string("error reading xref: ") + e.what());
  144 + }
  145 + } catch (QPDFExc& e) {
  146 + if (m->attempt_recovery) {
  147 + reconstruct_xref(e, xref_offset > 0);
  148 + QTC::TC("qpdf", "QPDF reconstructed xref table");
  149 + } else {
  150 + throw;
  151 + }
  152 + }
  153 +
  154 + initializeEncryption();
  155 + m->parsed = true;
  156 + if (m->xref_table.size() > 0 && !getRoot().getKey("/Pages").isDictionary()) {
  157 + // QPDFs created from JSON have an empty xref table and no root object yet.
  158 + throw damagedPDF("", 0, "unable to find page tree");
  159 + }
  160 +}
  161 +
  162 +void
  163 +QPDF::inParse(bool v)
  164 +{
  165 + if (m->in_parse == v) {
  166 + // This happens if QPDFParser::parse tries to resolve an indirect object while it is
  167 + // parsing.
  168 + throw std::logic_error(
  169 + "QPDF: re-entrant parsing detected. This is a qpdf bug."
  170 + " Please report at https://github.com/qpdf/qpdf/issues.");
  171 + }
  172 + m->in_parse = v;
  173 +}
  174 +
  175 +void
  176 +QPDF::setTrailer(QPDFObjectHandle obj)
  177 +{
  178 + if (m->trailer) {
  179 + return;
  180 + }
  181 + m->trailer = obj;
  182 +}
  183 +
  184 +void
  185 +QPDF::reconstruct_xref(QPDFExc& e, bool found_startxref)
  186 +{
  187 + if (m->reconstructed_xref) {
  188 + // Avoid xref reconstruction infinite loops. This is getting very hard to reproduce because
  189 + // qpdf is throwing many fewer exceptions while parsing. Most situations are warnings now.
  190 + throw e;
  191 + }
  192 +
  193 + // If recovery generates more than 1000 warnings, the file is so severely damaged that there
  194 + // probably is no point trying to continue.
  195 + const auto max_warnings = m->warnings.size() + 1000U;
  196 + auto check_warnings = [this, max_warnings]() {
  197 + if (m->warnings.size() > max_warnings) {
  198 + throw damagedPDF("", 0, "too many errors while reconstructing cross-reference table");
  199 + }
  200 + };
  201 +
  202 + m->reconstructed_xref = true;
  203 + // We may find more objects, which may contain dangling references.
  204 + m->fixed_dangling_refs = false;
  205 +
  206 + warn(damagedPDF("", 0, "file is damaged"));
  207 + warn(e);
  208 + warn(damagedPDF("", 0, "Attempting to reconstruct cross-reference table"));
  209 +
  210 + // Delete all references to type 1 (uncompressed) objects
  211 + std::vector<QPDFObjGen> to_delete;
  212 + for (auto const& iter: m->xref_table) {
  213 + if (iter.second.getType() == 1) {
  214 + to_delete.emplace_back(iter.first);
  215 + }
  216 + }
  217 + for (auto const& iter: to_delete) {
  218 + m->xref_table.erase(iter);
  219 + }
  220 +
  221 + std::vector<std::tuple<int, int, qpdf_offset_t>> found_objects;
  222 + std::vector<qpdf_offset_t> trailers;
  223 + std::vector<qpdf_offset_t> startxrefs;
  224 +
  225 + m->file->seek(0, SEEK_END);
  226 + qpdf_offset_t eof = m->file->tell();
  227 + m->file->seek(0, SEEK_SET);
  228 + // Don't allow very long tokens here during recovery. All the interesting tokens are covered.
  229 + static size_t const MAX_LEN = 10;
  230 + while (m->file->tell() < eof) {
  231 + QPDFTokenizer::Token t1 = readToken(*m->file, MAX_LEN);
  232 + qpdf_offset_t token_start = m->file->tell() - toO(t1.getValue().length());
  233 + if (t1.isInteger()) {
  234 + auto pos = m->file->tell();
  235 + auto t2 = readToken(*m->file, MAX_LEN);
  236 + if (t2.isInteger() && readToken(*m->file, MAX_LEN).isWord("obj")) {
  237 + int obj = QUtil::string_to_int(t1.getValue().c_str());
  238 + int gen = QUtil::string_to_int(t2.getValue().c_str());
  239 + if (obj <= m->xref_table_max_id) {
  240 + found_objects.emplace_back(obj, gen, token_start);
  241 + } else {
  242 + warn(damagedPDF(
  243 + "", 0, "ignoring object with impossibly large id " + std::to_string(obj)));
  244 + }
  245 + }
  246 + m->file->seek(pos, SEEK_SET);
  247 + } else if (!m->trailer && t1.isWord("trailer")) {
  248 + trailers.emplace_back(m->file->tell());
  249 + } else if (!found_startxref && t1.isWord("startxref")) {
  250 + startxrefs.emplace_back(m->file->tell());
  251 + }
  252 + check_warnings();
  253 + m->file->findAndSkipNextEOL();
  254 + }
  255 +
  256 + if (!found_startxref && !startxrefs.empty() && !found_objects.empty() &&
  257 + startxrefs.back() > std::get<2>(found_objects.back())) {
  258 + try {
  259 + m->file->seek(startxrefs.back(), SEEK_SET);
  260 + if (auto offset = QUtil::string_to_ll(readToken(*m->file).getValue().data())) {
  261 + read_xref(offset);
  262 + if (getRoot().getKey("/Pages").isDictionary()) {
  263 + QTC::TC("qpdf", "QPDF startxref more than 1024 before end");
  264 + warn(
  265 + damagedPDF("", 0, "startxref was more than 1024 bytes before end of file"));
  266 + initializeEncryption();
  267 + m->parsed = true;
  268 + m->reconstructed_xref = false;
  269 + return;
  270 + }
  271 + }
  272 + } catch (...) {
  273 + // ok, bad luck. Do recovery.
  274 + }
  275 + }
  276 +
  277 + auto rend = found_objects.rend();
  278 + for (auto it = found_objects.rbegin(); it != rend; it++) {
  279 + auto [obj, gen, token_start] = *it;
  280 + insertXrefEntry(obj, 1, token_start, gen);
  281 + check_warnings();
  282 + }
  283 + m->deleted_objects.clear();
  284 +
  285 + for (auto it = trailers.rbegin(); it != trailers.rend(); it++) {
  286 + m->file->seek(*it, SEEK_SET);
  287 + auto t = readTrailer();
  288 + if (!t.isDictionary()) {
  289 + // Oh well. It was worth a try.
  290 + } else {
  291 + if (t.hasKey("/Root")) {
  292 + m->trailer = t;
  293 + break;
  294 + }
  295 + warn(damagedPDF("trailer", *it, "recovered trailer has no /Root entry"));
  296 + }
  297 + check_warnings();
  298 + }
  299 +
  300 + if (!m->trailer) {
  301 + qpdf_offset_t max_offset{0};
  302 + size_t max_size{0};
  303 + // If there are any xref streams, take the last one to appear.
  304 + for (auto const& iter: m->xref_table) {
  305 + auto entry = iter.second;
  306 + if (entry.getType() != 1) {
  307 + continue;
  308 + }
  309 + auto oh = getObject(iter.first);
  310 + try {
  311 + if (!oh.isStreamOfType("/XRef")) {
  312 + continue;
  313 + }
  314 + } catch (std::exception&) {
  315 + continue;
  316 + }
  317 + auto offset = entry.getOffset();
  318 + auto size = oh.getDict().getKey("/Size").getUIntValueAsUInt();
  319 + if (size > max_size || (size == max_size && offset > max_offset)) {
  320 + max_offset = offset;
  321 + setTrailer(oh.getDict());
  322 + }
  323 + check_warnings();
  324 + }
  325 + if (max_offset > 0) {
  326 + try {
  327 + read_xref(max_offset);
  328 + } catch (std::exception&) {
  329 + warn(damagedPDF(
  330 + "", 0, "error decoding candidate xref stream while recovering damaged file"));
  331 + }
  332 + QTC::TC("qpdf", "QPDF recover xref stream");
  333 + }
  334 + }
  335 +
  336 + if (!m->trailer || (!m->parsed && !m->trailer.getKey("/Root").isDictionary())) {
  337 + // Try to find a Root dictionary. As a quick fix try the one with the highest object id.
  338 + QPDFObjectHandle root;
  339 + for (auto const& iter: m->obj_cache) {
  340 + try {
  341 + if (QPDFObjectHandle(iter.second.object).isDictionaryOfType("/Catalog")) {
  342 + root = iter.second.object;
  343 + }
  344 + } catch (std::exception&) {
  345 + continue;
  346 + }
  347 + }
  348 + if (root) {
  349 + if (!m->trailer) {
  350 + warn(damagedPDF(
  351 + "", 0, "unable to find trailer dictionary while recovering damaged file"));
  352 + m->trailer = QPDFObjectHandle::newDictionary();
  353 + }
  354 + m->trailer.replaceKey("/Root", root);
  355 + }
  356 + }
  357 +
  358 + if (!m->trailer) {
  359 + // We could check the last encountered object to see if it was an xref stream. If so, we
  360 + // could try to get the trailer from there. This may make it possible to recover files with
  361 + // bad startxref pointers even when they have object streams.
  362 +
  363 + throw damagedPDF("", 0, "unable to find trailer dictionary while recovering damaged file");
  364 + }
  365 + if (m->xref_table.empty()) {
  366 + // We cannot check for an empty xref table in parse because empty tables are valid when
  367 + // creating QPDF objects from JSON.
  368 + throw damagedPDF("", 0, "unable to find objects while recovering damaged file");
  369 + }
  370 + check_warnings();
  371 + if (!m->parsed) {
  372 + m->parsed = true;
  373 + getAllPages();
  374 + check_warnings();
  375 + if (m->all_pages.empty()) {
  376 + m->parsed = false;
  377 + throw damagedPDF("", 0, "unable to find any pages while recovering damaged file");
  378 + }
  379 + }
  380 + // We could iterate through the objects looking for streams and try to find objects inside of
  381 + // them, but it's probably not worth the trouble. Acrobat can't recover files with any errors
  382 + // in an xref stream, and this would be a real long shot anyway. If we wanted to do anything
  383 + // that involved looking at stream contents, we'd also have to call initializeEncryption() here.
  384 + // It's safe to call it more than once.
  385 +}
  386 +
  387 +void
  388 +QPDF::read_xref(qpdf_offset_t xref_offset)
  389 +{
  390 + std::map<int, int> free_table;
  391 + std::set<qpdf_offset_t> visited;
  392 + while (xref_offset) {
  393 + visited.insert(xref_offset);
  394 + char buf[7];
  395 + memset(buf, 0, sizeof(buf));
  396 + m->file->seek(xref_offset, SEEK_SET);
  397 + // Some files miss the mark a little with startxref. We could do a better job of searching
  398 + // in the neighborhood for something that looks like either an xref table or stream, but the
  399 + // simple heuristic of skipping whitespace can help with the xref table case and is harmless
  400 + // with the stream case.
  401 + bool done = false;
  402 + bool skipped_space = false;
  403 + while (!done) {
  404 + char ch;
  405 + if (1 == m->file->read(&ch, 1)) {
  406 + if (util::is_space(ch)) {
  407 + skipped_space = true;
  408 + } else {
  409 + m->file->unreadCh(ch);
  410 + done = true;
  411 + }
  412 + } else {
  413 + QTC::TC("qpdf", "QPDF eof skipping spaces before xref", skipped_space ? 0 : 1);
  414 + done = true;
  415 + }
  416 + }
  417 +
  418 + m->file->read(buf, sizeof(buf) - 1);
  419 + // The PDF spec says xref must be followed by a line terminator, but files exist in the wild
  420 + // where it is terminated by arbitrary whitespace.
  421 + if ((strncmp(buf, "xref", 4) == 0) && util::is_space(buf[4])) {
  422 + if (skipped_space) {
  423 + QTC::TC("qpdf", "QPDF xref skipped space");
  424 + warn(damagedPDF("", 0, "extraneous whitespace seen before xref"));
  425 + }
  426 + QTC::TC(
  427 + "qpdf",
  428 + "QPDF xref space",
  429 + ((buf[4] == '\n') ? 0
  430 + : (buf[4] == '\r') ? 1
  431 + : (buf[4] == ' ') ? 2
  432 + : 9999));
  433 + int skip = 4;
  434 + // buf is null-terminated, and util::is_space('\0') is false, so this won't overrun.
  435 + while (util::is_space(buf[skip])) {
  436 + ++skip;
  437 + }
  438 + xref_offset = read_xrefTable(xref_offset + skip);
  439 + } else {
  440 + xref_offset = read_xrefStream(xref_offset);
  441 + }
  442 + if (visited.count(xref_offset) != 0) {
  443 + QTC::TC("qpdf", "QPDF xref loop");
  444 + throw damagedPDF("", 0, "loop detected following xref tables");
  445 + }
  446 + }
  447 +
  448 + if (!m->trailer) {
  449 + throw damagedPDF("", 0, "unable to find trailer while reading xref");
  450 + }
  451 + int size = m->trailer.getKey("/Size").getIntValueAsInt();
  452 + int max_obj = 0;
  453 + if (!m->xref_table.empty()) {
  454 + max_obj = m->xref_table.rbegin()->first.getObj();
  455 + }
  456 + if (!m->deleted_objects.empty()) {
  457 + max_obj = std::max(max_obj, *(m->deleted_objects.rbegin()));
  458 + }
  459 + if ((size < 1) || (size - 1 != max_obj)) {
  460 + QTC::TC("qpdf", "QPDF xref size mismatch");
  461 + warn(damagedPDF(
  462 + "",
  463 + 0,
  464 + ("reported number of objects (" + std::to_string(size) +
  465 + ") is not one plus the highest object number (" + std::to_string(max_obj) + ")")));
  466 + }
  467 +
  468 + // We no longer need the deleted_objects table, so go ahead and clear it out to make sure we
  469 + // never depend on its being set.
  470 + m->deleted_objects.clear();
  471 +
  472 + // Make sure we keep only the highest generation for any object.
  473 + QPDFObjGen last_og{-1, 0};
  474 + for (auto const& item: m->xref_table) {
  475 + auto id = item.first.getObj();
  476 + if (id == last_og.getObj() && id > 0) {
  477 + removeObject(last_og);
  478 + }
  479 + last_og = item.first;
  480 + }
  481 +}
  482 +
  483 +bool
  484 +QPDF::parse_xrefFirst(std::string const& line, int& obj, int& num, int& bytes)
  485 +{
  486 + // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated
  487 + // buffer.
  488 + char const* p = line.c_str();
  489 + char const* start = line.c_str();
  490 +
  491 + // Skip zero or more spaces
  492 + while (util::is_space(*p)) {
  493 + ++p;
  494 + }
  495 + // Require digit
  496 + if (!util::is_digit(*p)) {
  497 + return false;
  498 + }
  499 + // Gather digits
  500 + std::string obj_str;
  501 + while (util::is_digit(*p)) {
  502 + obj_str.append(1, *p++);
  503 + }
  504 + // Require space
  505 + if (!util::is_space(*p)) {
  506 + return false;
  507 + }
  508 + // Skip spaces
  509 + while (util::is_space(*p)) {
  510 + ++p;
  511 + }
  512 + // Require digit
  513 + if (!util::is_digit(*p)) {
  514 + return false;
  515 + }
  516 + // Gather digits
  517 + std::string num_str;
  518 + while (util::is_digit(*p)) {
  519 + num_str.append(1, *p++);
  520 + }
  521 + // Skip any space including line terminators
  522 + while (util::is_space(*p)) {
  523 + ++p;
  524 + }
  525 + bytes = toI(p - start);
  526 + obj = QUtil::string_to_int(obj_str.c_str());
  527 + num = QUtil::string_to_int(num_str.c_str());
  528 + return true;
  529 +}
  530 +
  531 +bool
  532 +QPDF::read_bad_xrefEntry(qpdf_offset_t& f1, int& f2, char& type)
  533 +{
  534 + // Reposition after initial read attempt and reread.
  535 + m->file->seek(m->file->getLastOffset(), SEEK_SET);
  536 + auto line = m->file->readLine(30);
  537 +
  538 + // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated
  539 + // buffer.
  540 + char const* p = line.data();
  541 +
  542 + // Skip zero or more spaces. There aren't supposed to be any.
  543 + bool invalid = false;
  544 + while (util::is_space(*p)) {
  545 + ++p;
  546 + QTC::TC("qpdf", "QPDF ignore first space in xref entry");
  547 + invalid = true;
  548 + }
  549 + // Require digit
  550 + if (!util::is_digit(*p)) {
  551 + return false;
  552 + }
  553 + // Gather digits
  554 + std::string f1_str;
  555 + while (util::is_digit(*p)) {
  556 + f1_str.append(1, *p++);
  557 + }
  558 + // Require space
  559 + if (!util::is_space(*p)) {
  560 + return false;
  561 + }
  562 + if (util::is_space(*(p + 1))) {
  563 + QTC::TC("qpdf", "QPDF ignore first extra space in xref entry");
  564 + invalid = true;
  565 + }
  566 + // Skip spaces
  567 + while (util::is_space(*p)) {
  568 + ++p;
  569 + }
  570 + // Require digit
  571 + if (!util::is_digit(*p)) {
  572 + return false;
  573 + }
  574 + // Gather digits
  575 + std::string f2_str;
  576 + while (util::is_digit(*p)) {
  577 + f2_str.append(1, *p++);
  578 + }
  579 + // Require space
  580 + if (!util::is_space(*p)) {
  581 + return false;
  582 + }
  583 + if (util::is_space(*(p + 1))) {
  584 + QTC::TC("qpdf", "QPDF ignore second extra space in xref entry");
  585 + invalid = true;
  586 + }
  587 + // Skip spaces
  588 + while (util::is_space(*p)) {
  589 + ++p;
  590 + }
  591 + if ((*p == 'f') || (*p == 'n')) {
  592 + type = *p;
  593 + } else {
  594 + return false;
  595 + }
  596 + if ((f1_str.length() != 10) || (f2_str.length() != 5)) {
  597 + QTC::TC("qpdf", "QPDF ignore length error xref entry");
  598 + invalid = true;
  599 + }
  600 +
  601 + if (invalid) {
  602 + warn(damagedPDF("xref table", "accepting invalid xref table entry"));
  603 + }
  604 +
  605 + f1 = QUtil::string_to_ll(f1_str.c_str());
  606 + f2 = QUtil::string_to_int(f2_str.c_str());
  607 +
  608 + return true;
  609 +}
  610 +
  611 +// Optimistically read and parse xref entry. If entry is bad, call read_bad_xrefEntry and return
  612 +// result.
  613 +bool
  614 +QPDF::read_xrefEntry(qpdf_offset_t& f1, int& f2, char& type)
  615 +{
  616 + std::array<char, 21> line;
  617 + if (m->file->read(line.data(), 20) != 20) {
  618 + // C++20: [[unlikely]]
  619 + return false;
  620 + }
  621 + line[20] = '\0';
  622 + char const* p = line.data();
  623 +
  624 + int f1_len = 0;
  625 + int f2_len = 0;
  626 +
  627 + // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated
  628 + // buffer.
  629 +
  630 + // Gather f1 digits. NB No risk of overflow as 9'999'999'999 < max long long.
  631 + while (*p == '0') {
  632 + ++f1_len;
  633 + ++p;
  634 + }
  635 + while (util::is_digit(*p) && f1_len++ < 10) {
  636 + f1 *= 10;
  637 + f1 += *p++ - '0';
  638 + }
  639 + // Require space
  640 + if (!util::is_space(*p++)) {
  641 + // Entry doesn't start with space or digit.
  642 + // C++20: [[unlikely]]
  643 + return false;
  644 + }
  645 + // Gather digits. NB No risk of overflow as 99'999 < max int.
  646 + while (*p == '0') {
  647 + ++f2_len;
  648 + ++p;
  649 + }
  650 + while (util::is_digit(*p) && f2_len++ < 5) {
  651 + f2 *= 10;
  652 + f2 += static_cast<int>(*p++ - '0');
  653 + }
  654 + if (util::is_space(*p++) && (*p == 'f' || *p == 'n')) {
  655 + // C++20: [[likely]]
  656 + type = *p;
  657 + // No test for valid line[19].
  658 + if (*(++p) && *(++p) && (*p == '\n' || *p == '\r') && f1_len == 10 && f2_len == 5) {
  659 + // C++20: [[likely]]
  660 + return true;
  661 + }
  662 + }
  663 + return read_bad_xrefEntry(f1, f2, type);
  664 +}
  665 +
  666 +// Read a single cross-reference table section and associated trailer.
  667 +qpdf_offset_t
  668 +QPDF::read_xrefTable(qpdf_offset_t xref_offset)
  669 +{
  670 + m->file->seek(xref_offset, SEEK_SET);
  671 + std::string line;
  672 + while (true) {
  673 + line.assign(50, '\0');
  674 + m->file->read(line.data(), line.size());
  675 + int obj = 0;
  676 + int num = 0;
  677 + int bytes = 0;
  678 + if (!parse_xrefFirst(line, obj, num, bytes)) {
  679 + QTC::TC("qpdf", "QPDF invalid xref");
  680 + throw damagedPDF("xref table", "xref syntax invalid");
  681 + }
  682 + m->file->seek(m->file->getLastOffset() + bytes, SEEK_SET);
  683 + for (qpdf_offset_t i = obj; i - num < obj; ++i) {
  684 + if (i == 0) {
  685 + // This is needed by checkLinearization()
  686 + m->first_xref_item_offset = m->file->tell();
  687 + }
  688 + // For xref_table, these will always be small enough to be ints
  689 + qpdf_offset_t f1 = 0;
  690 + int f2 = 0;
  691 + char type = '\0';
  692 + if (!read_xrefEntry(f1, f2, type)) {
  693 + QTC::TC("qpdf", "QPDF invalid xref entry");
  694 + throw damagedPDF(
  695 + "xref table", "invalid xref entry (obj=" + std::to_string(i) + ")");
  696 + }
  697 + if (type == 'f') {
  698 + insertFreeXrefEntry(QPDFObjGen(toI(i), f2));
  699 + } else {
  700 + insertXrefEntry(toI(i), 1, f1, f2);
  701 + }
  702 + }
  703 + qpdf_offset_t pos = m->file->tell();
  704 + if (readToken(*m->file).isWord("trailer")) {
  705 + break;
  706 + } else {
  707 + m->file->seek(pos, SEEK_SET);
  708 + }
  709 + }
  710 +
  711 + // Set offset to previous xref table if any
  712 + QPDFObjectHandle cur_trailer = readTrailer();
  713 + if (!cur_trailer.isDictionary()) {
  714 + QTC::TC("qpdf", "QPDF missing trailer");
  715 + throw damagedPDF("", "expected trailer dictionary");
  716 + }
  717 +
  718 + if (!m->trailer) {
  719 + setTrailer(cur_trailer);
  720 +
  721 + if (!m->trailer.hasKey("/Size")) {
  722 + QTC::TC("qpdf", "QPDF trailer lacks size");
  723 + throw damagedPDF("trailer", "trailer dictionary lacks /Size key");
  724 + }
  725 + if (!m->trailer.getKey("/Size").isInteger()) {
  726 + QTC::TC("qpdf", "QPDF trailer size not integer");
  727 + throw damagedPDF("trailer", "/Size key in trailer dictionary is not an integer");
  728 + }
  729 + }
  730 +
  731 + if (cur_trailer.hasKey("/XRefStm")) {
  732 + if (m->ignore_xref_streams) {
  733 + QTC::TC("qpdf", "QPDF ignoring XRefStm in trailer");
  734 + } else {
  735 + if (cur_trailer.getKey("/XRefStm").isInteger()) {
  736 + // Read the xref stream but disregard any return value -- we'll use our trailer's
  737 + // /Prev key instead of the xref stream's.
  738 + (void)read_xrefStream(cur_trailer.getKey("/XRefStm").getIntValue());
  739 + } else {
  740 + throw damagedPDF("xref stream", xref_offset, "invalid /XRefStm");
  741 + }
  742 + }
  743 + }
  744 +
  745 + if (cur_trailer.hasKey("/Prev")) {
  746 + if (!cur_trailer.getKey("/Prev").isInteger()) {
  747 + QTC::TC("qpdf", "QPDF trailer prev not integer");
  748 + throw damagedPDF("trailer", "/Prev key in trailer dictionary is not an integer");
  749 + }
  750 + QTC::TC("qpdf", "QPDF prev key in trailer dictionary");
  751 + return cur_trailer.getKey("/Prev").getIntValue();
  752 + }
  753 +
  754 + return 0;
  755 +}
  756 +
  757 +// Read a single cross-reference stream.
  758 +qpdf_offset_t
  759 +QPDF::read_xrefStream(qpdf_offset_t xref_offset)
  760 +{
  761 + if (!m->ignore_xref_streams) {
  762 + QPDFObjGen x_og;
  763 + QPDFObjectHandle xref_obj;
  764 + try {
  765 + xref_obj =
  766 + readObjectAtOffset(false, xref_offset, "xref stream", QPDFObjGen(0, 0), x_og, true);
  767 + } catch (QPDFExc&) {
  768 + // ignore -- report error below
  769 + }
  770 + if (xref_obj.isStreamOfType("/XRef")) {
  771 + QTC::TC("qpdf", "QPDF found xref stream");
  772 + return processXRefStream(xref_offset, xref_obj);
  773 + }
  774 + }
  775 +
  776 + QTC::TC("qpdf", "QPDF can't find xref");
  777 + throw damagedPDF("", xref_offset, "xref not found");
  778 + return 0; // unreachable
  779 +}
  780 +
  781 +// Return the entry size of the xref stream and the processed W array.
  782 +std::pair<int, std::array<int, 3>>
  783 +QPDF::processXRefW(QPDFObjectHandle& dict, std::function<QPDFExc(std::string_view)> damaged)
  784 +{
  785 + auto W_obj = dict.getKey("/W");
  786 + if (!(W_obj.isArray() && (W_obj.getArrayNItems() >= 3) && W_obj.getArrayItem(0).isInteger() &&
  787 + W_obj.getArrayItem(1).isInteger() && W_obj.getArrayItem(2).isInteger())) {
  788 + throw damaged("Cross-reference stream does not have a proper /W key");
  789 + }
  790 +
  791 + std::array<int, 3> W;
  792 + int entry_size = 0;
  793 + auto w_vector = W_obj.getArrayAsVector();
  794 + int max_bytes = sizeof(qpdf_offset_t);
  795 + for (size_t i = 0; i < 3; ++i) {
  796 + W[i] = w_vector[i].getIntValueAsInt();
  797 + if (W[i] > max_bytes) {
  798 + throw damaged("Cross-reference stream's /W contains impossibly large values");
  799 + }
  800 + if (W[i] < 0) {
  801 + throw damaged("Cross-reference stream's /W contains negative values");
  802 + }
  803 + entry_size += W[i];
  804 + }
  805 + if (entry_size == 0) {
  806 + throw damaged("Cross-reference stream's /W indicates entry size of 0");
  807 + }
  808 + return {entry_size, W};
  809 +}
  810 +
  811 +// Validate Size key and return the maximum number of entries that the xref stream can contain.
  812 +int
  813 +QPDF::processXRefSize(
  814 + QPDFObjectHandle& dict, int entry_size, std::function<QPDFExc(std::string_view)> damaged)
  815 +{
  816 + // Number of entries is limited by the highest possible object id and stream size.
  817 + auto max_num_entries = std::numeric_limits<int>::max();
  818 + if (max_num_entries > (std::numeric_limits<qpdf_offset_t>::max() / entry_size)) {
  819 + max_num_entries = toI(std::numeric_limits<qpdf_offset_t>::max() / entry_size);
  820 + }
  821 +
  822 + auto Size_obj = dict.getKey("/Size");
  823 + long long size;
  824 + if (!dict.getKey("/Size").getValueAsInt(size)) {
  825 + throw damaged("Cross-reference stream does not have a proper /Size key");
  826 + } else if (size < 0) {
  827 + throw damaged("Cross-reference stream has a negative /Size key");
  828 + } else if (size >= max_num_entries) {
  829 + throw damaged("Cross-reference stream has an impossibly large /Size key");
  830 + }
  831 + // We are not validating that Size <= (Size key of parent xref / trailer).
  832 + return max_num_entries;
  833 +}
  834 +
  835 +// Return the number of entries of the xref stream and the processed Index array.
  836 +std::pair<int, std::vector<std::pair<int, int>>>
  837 +QPDF::processXRefIndex(
  838 + QPDFObjectHandle& dict, int max_num_entries, std::function<QPDFExc(std::string_view)> damaged)
  839 +{
  840 + auto size = dict.getKey("/Size").getIntValueAsInt();
  841 + auto Index_obj = dict.getKey("/Index");
  842 +
  843 + if (Index_obj.isArray()) {
  844 + std::vector<std::pair<int, int>> indx;
  845 + int num_entries = 0;
  846 + auto index_vec = Index_obj.getArrayAsVector();
  847 + if ((index_vec.size() % 2) || index_vec.size() < 2) {
  848 + throw damaged("Cross-reference stream's /Index has an invalid number of values");
  849 + }
  850 +
  851 + int i = 0;
  852 + long long first = 0;
  853 + for (auto& val: index_vec) {
  854 + if (val.isInteger()) {
  855 + if (i % 2) {
  856 + auto count = val.getIntValue();
  857 + if (count <= 0) {
  858 + throw damaged(
  859 + "Cross-reference stream section claims to contain " +
  860 + std::to_string(count) + " entries");
  861 + }
  862 + // We are guarding against the possibility of num_entries * entry_size
  863 + // overflowing. We are not checking that entries are in ascending order as
  864 + // required by the spec, which probably should generate a warning. We are also
  865 + // not checking that for each subsection first object number + number of entries
  866 + // <= /Size. The spec requires us to ignore object number > /Size.
  867 + if (first > (max_num_entries - count) ||
  868 + count > (max_num_entries - num_entries)) {
  869 + throw damaged(
  870 + "Cross-reference stream claims to contain too many entries: " +
  871 + std::to_string(first) + " " + std::to_string(max_num_entries) + " " +
  872 + std::to_string(num_entries));
  873 + }
  874 + indx.emplace_back(static_cast<int>(first), static_cast<int>(count));
  875 + num_entries += static_cast<int>(count);
  876 + } else {
  877 + first = val.getIntValue();
  878 + if (first < 0) {
  879 + throw damaged(
  880 + "Cross-reference stream's /Index contains a negative object id");
  881 + } else if (first > max_num_entries) {
  882 + throw damaged(
  883 + "Cross-reference stream's /Index contains an impossibly "
  884 + "large object id");
  885 + }
  886 + }
  887 + } else {
  888 + throw damaged(
  889 + "Cross-reference stream's /Index's item " + std::to_string(i) +
  890 + " is not an integer");
  891 + }
  892 + i++;
  893 + }
  894 + QTC::TC("qpdf", "QPDF xref /Index is array", index_vec.size() == 2 ? 0 : 1);
  895 + return {num_entries, indx};
  896 + } else if (Index_obj.isNull()) {
  897 + QTC::TC("qpdf", "QPDF xref /Index is null");
  898 + return {size, {{0, size}}};
  899 + } else {
  900 + throw damaged("Cross-reference stream does not have a proper /Index key");
  901 + }
  902 +}
  903 +
  904 +qpdf_offset_t
  905 +QPDF::processXRefStream(qpdf_offset_t xref_offset, QPDFObjectHandle& xref_obj)
  906 +{
  907 + auto damaged = [this, xref_offset](std::string_view msg) -> QPDFExc {
  908 + return damagedPDF("xref stream", xref_offset, msg.data());
  909 + };
  910 +
  911 + auto dict = xref_obj.getDict();
  912 +
  913 + auto [entry_size, W] = processXRefW(dict, damaged);
  914 + int max_num_entries = processXRefSize(dict, entry_size, damaged);
  915 + auto [num_entries, indx] = processXRefIndex(dict, max_num_entries, damaged);
  916 +
  917 + std::shared_ptr<Buffer> bp = xref_obj.getStreamData(qpdf_dl_specialized);
  918 + size_t actual_size = bp->getSize();
  919 + auto expected_size = toS(entry_size) * toS(num_entries);
  920 +
  921 + if (expected_size != actual_size) {
  922 + QPDFExc x = damaged(
  923 + "Cross-reference stream data has the wrong size; expected = " +
  924 + std::to_string(expected_size) + "; actual = " + std::to_string(actual_size));
  925 + if (expected_size > actual_size) {
  926 + throw x;
  927 + } else {
  928 + warn(x);
  929 + }
  930 + }
  931 +
  932 + bool saw_first_compressed_object = false;
  933 +
  934 + // Actual size vs. expected size check above ensures that we will not overflow any buffers here.
  935 + // We know that entry_size * num_entries is less or equal to the size of the buffer.
  936 + auto p = bp->getBuffer();
  937 + for (auto [obj, sec_entries]: indx) {
  938 + // Process a subsection.
  939 + for (int i = 0; i < sec_entries; ++i) {
  940 + // Read this entry
  941 + std::array<qpdf_offset_t, 3> fields{};
  942 + if (W[0] == 0) {
  943 + QTC::TC("qpdf", "QPDF default for xref stream field 0");
  944 + fields[0] = 1;
  945 + }
  946 + for (size_t j = 0; j < 3; ++j) {
  947 + for (int k = 0; k < W[j]; ++k) {
  948 + fields[j] <<= 8;
  949 + fields[j] |= *p++;
  950 + }
  951 + }
  952 +
  953 + // Get the generation number. The generation number is 0 unless this is an uncompressed
  954 + // object record, in which case the generation number appears as the third field.
  955 + if (saw_first_compressed_object) {
  956 + if (fields[0] != 2) {
  957 + m->uncompressed_after_compressed = true;
  958 + }
  959 + } else if (fields[0] == 2) {
  960 + saw_first_compressed_object = true;
  961 + }
  962 + if (obj == 0) {
  963 + // This is needed by checkLinearization()
  964 + m->first_xref_item_offset = xref_offset;
  965 + } else if (fields[0] == 0) {
  966 + // Ignore fields[2], which we don't care about in this case. This works around the
  967 + // issue of some PDF files that put invalid values, like -1, here for deleted
  968 + // objects.
  969 + insertFreeXrefEntry(QPDFObjGen(obj, 0));
  970 + } else {
  971 + insertXrefEntry(obj, toI(fields[0]), fields[1], toI(fields[2]));
  972 + }
  973 + ++obj;
  974 + }
  975 + }
  976 +
  977 + if (!m->trailer) {
  978 + setTrailer(dict);
  979 + }
  980 +
  981 + if (dict.hasKey("/Prev")) {
  982 + if (!dict.getKey("/Prev").isInteger()) {
  983 + throw damagedPDF(
  984 + "xref stream", "/Prev key in xref stream dictionary is not an integer");
  985 + }
  986 + QTC::TC("qpdf", "QPDF prev key in xref stream dictionary");
  987 + return dict.getKey("/Prev").getIntValue();
  988 + } else {
  989 + return 0;
  990 + }
  991 +}
  992 +
  993 +void
  994 +QPDF::insertXrefEntry(int obj, int f0, qpdf_offset_t f1, int f2)
  995 +{
  996 + // Populate the xref table in such a way that the first reference to an object that we see,
  997 + // which is the one in the latest xref table in which it appears, is the one that gets stored.
  998 + // This works because we are reading more recent appends before older ones.
  999 +
  1000 + // If there is already an entry for this object and generation in the table, it means that a
  1001 + // later xref table has registered this object. Disregard this one.
  1002 + int new_gen = f0 == 2 ? 0 : f2;
  1003 +
  1004 + if (!(obj > 0 && obj <= m->xref_table_max_id && 0 <= f2 && new_gen < 65535)) {
  1005 + // We are ignoring invalid objgens. Most will arrive here from xref reconstruction. There
  1006 + // is probably no point having another warning but we could count invalid items in order to
  1007 + // decide when to give up.
  1008 + QTC::TC("qpdf", "QPDF xref overwrite invalid objgen");
  1009 + // ignore impossibly large object ids or object ids > Size.
  1010 + return;
  1011 + }
  1012 +
  1013 + if (m->deleted_objects.count(obj)) {
  1014 + QTC::TC("qpdf", "QPDF xref deleted object");
  1015 + return;
  1016 + }
  1017 +
  1018 + if (f0 == 2 && static_cast<int>(f1) == obj) {
  1019 + warn(damagedPDF("xref stream", "self-referential object stream " + std::to_string(obj)));
  1020 + return;
  1021 + }
  1022 +
  1023 + auto [iter, created] = m->xref_table.try_emplace(QPDFObjGen(obj, (f0 == 2 ? 0 : f2)));
  1024 + if (!created) {
  1025 + QTC::TC("qpdf", "QPDF xref reused object");
  1026 + return;
  1027 + }
  1028 +
  1029 + switch (f0) {
  1030 + case 1:
  1031 + // f2 is generation
  1032 + QTC::TC("qpdf", "QPDF xref gen > 0", ((f2 > 0) ? 1 : 0));
  1033 + iter->second = QPDFXRefEntry(f1);
  1034 + break;
  1035 +
  1036 + case 2:
  1037 + iter->second = QPDFXRefEntry(toI(f1), f2);
  1038 + break;
  1039 +
  1040 + default:
  1041 + throw damagedPDF("xref stream", "unknown xref stream entry type " + std::to_string(f0));
  1042 + break;
  1043 + }
  1044 +}
  1045 +
  1046 +void
  1047 +QPDF::insertFreeXrefEntry(QPDFObjGen og)
  1048 +{
  1049 + if (!m->xref_table.count(og)) {
  1050 + m->deleted_objects.insert(og.getObj());
  1051 + }
  1052 +}
  1053 +
  1054 +void
  1055 +QPDF::showXRefTable()
  1056 +{
  1057 + auto& cout = *m->log->getInfo();
  1058 + for (auto const& iter: m->xref_table) {
  1059 + QPDFObjGen const& og = iter.first;
  1060 + QPDFXRefEntry const& entry = iter.second;
  1061 + cout << og.unparse('/') << ": ";
  1062 + switch (entry.getType()) {
  1063 + case 1:
  1064 + cout << "uncompressed; offset = " << entry.getOffset();
  1065 + break;
  1066 +
  1067 + case 2:
  1068 + *m->log->getInfo() << "compressed; stream = " << entry.getObjStreamNumber()
  1069 + << ", index = " << entry.getObjStreamIndex();
  1070 + break;
  1071 +
  1072 + default:
  1073 + throw std::logic_error("unknown cross-reference table type while showing xref_table");
  1074 + break;
  1075 + }
  1076 + m->log->info("\n");
  1077 + }
  1078 +}
  1079 +
  1080 +// Resolve all objects in the xref table. If this triggers a xref table reconstruction abort and
  1081 +// return false. Otherwise return true.
  1082 +bool
  1083 +QPDF::resolveXRefTable()
  1084 +{
  1085 + bool may_change = !m->reconstructed_xref;
  1086 + for (auto& iter: m->xref_table) {
  1087 + if (isUnresolved(iter.first)) {
  1088 + resolve(iter.first);
  1089 + if (may_change && m->reconstructed_xref) {
  1090 + return false;
  1091 + }
  1092 + }
  1093 + }
  1094 + return true;
  1095 +}
  1096 +
  1097 +// Ensure all objects in the pdf file, including those in indirect references, appear in the object
  1098 +// cache.
  1099 +void
  1100 +QPDF::fixDanglingReferences(bool force)
  1101 +{
  1102 + if (m->fixed_dangling_refs) {
  1103 + return;
  1104 + }
  1105 + if (!resolveXRefTable()) {
  1106 + QTC::TC("qpdf", "QPDF fix dangling triggered xref reconstruction");
  1107 + resolveXRefTable();
  1108 + }
  1109 + m->fixed_dangling_refs = true;
  1110 +}
  1111 +
  1112 +size_t
  1113 +QPDF::getObjectCount()
  1114 +{
  1115 + // This method returns the next available indirect object number. makeIndirectObject uses it for
  1116 + // this purpose. After fixDanglingReferences is called, all objects in the xref table will also
  1117 + // be in obj_cache.
  1118 + fixDanglingReferences();
  1119 + QPDFObjGen og;
  1120 + if (!m->obj_cache.empty()) {
  1121 + og = (*(m->obj_cache.rbegin())).first;
  1122 + }
  1123 + return toS(og.getObj());
  1124 +}
  1125 +
  1126 +std::vector<QPDFObjectHandle>
  1127 +QPDF::getAllObjects()
  1128 +{
  1129 + // After fixDanglingReferences is called, all objects are in the object cache.
  1130 + fixDanglingReferences();
  1131 + std::vector<QPDFObjectHandle> result;
  1132 + for (auto const& iter: m->obj_cache) {
  1133 + result.push_back(newIndirect(iter.first, iter.second.object));
  1134 + }
  1135 + return result;
  1136 +}
  1137 +
  1138 +void
  1139 +QPDF::setLastObjectDescription(std::string const& description, QPDFObjGen og)
  1140 +{
  1141 + m->last_object_description.clear();
  1142 + if (!description.empty()) {
  1143 + m->last_object_description += description;
  1144 + if (og.isIndirect()) {
  1145 + m->last_object_description += ": ";
  1146 + }
  1147 + }
  1148 + if (og.isIndirect()) {
  1149 + m->last_object_description += "object " + og.unparse(' ');
  1150 + }
  1151 +}
  1152 +
  1153 +QPDFObjectHandle
  1154 +QPDF::readTrailer()
  1155 +{
  1156 + qpdf_offset_t offset = m->file->tell();
  1157 + bool empty = false;
  1158 + auto object =
  1159 + QPDFParser(*m->file, "trailer", m->tokenizer, nullptr, this, true).parse(empty, false);
  1160 + if (empty) {
  1161 + // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in
  1162 + // actual PDF files and Adobe Reader appears to ignore them.
  1163 + warn(damagedPDF("trailer", "empty object treated as null"));
  1164 + } else if (object.isDictionary() && readToken(*m->file).isWord("stream")) {
  1165 + warn(damagedPDF("trailer", m->file->tell(), "stream keyword found in trailer"));
  1166 + }
  1167 + // Override last_offset so that it points to the beginning of the object we just read
  1168 + m->file->setLastOffset(offset);
  1169 + return object;
  1170 +}
  1171 +
  1172 +QPDFObjectHandle
  1173 +QPDF::readObject(std::string const& description, QPDFObjGen og)
  1174 +{
  1175 + setLastObjectDescription(description, og);
  1176 + qpdf_offset_t offset = m->file->tell();
  1177 + bool empty = false;
  1178 +
  1179 + StringDecrypter decrypter{this, og};
  1180 + StringDecrypter* decrypter_ptr = m->encp->encrypted ? &decrypter : nullptr;
  1181 + auto object =
  1182 + QPDFParser(*m->file, m->last_object_description, m->tokenizer, decrypter_ptr, this, true)
  1183 + .parse(empty, false);
  1184 + if (empty) {
  1185 + // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in
  1186 + // actual PDF files and Adobe Reader appears to ignore them.
  1187 + warn(damagedPDF(*m->file, m->file->getLastOffset(), "empty object treated as null"));
  1188 + return object;
  1189 + }
  1190 + auto token = readToken(*m->file);
  1191 + if (object.isDictionary() && token.isWord("stream")) {
  1192 + readStream(object, og, offset);
  1193 + token = readToken(*m->file);
  1194 + }
  1195 + if (!token.isWord("endobj")) {
  1196 + QTC::TC("qpdf", "QPDF err expected endobj");
  1197 + warn(damagedPDF("expected endobj"));
  1198 + }
  1199 + return object;
  1200 +}
  1201 +
  1202 +// After reading stream dictionary and stream keyword, read rest of stream.
  1203 +void
  1204 +QPDF::readStream(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset)
  1205 +{
  1206 + validateStreamLineEnd(object, og, offset);
  1207 +
  1208 + // Must get offset before accessing any additional objects since resolving a previously
  1209 + // unresolved indirect object will change file position.
  1210 + qpdf_offset_t stream_offset = m->file->tell();
  1211 + size_t length = 0;
  1212 +
  1213 + try {
  1214 + auto length_obj = object.getKey("/Length");
  1215 +
  1216 + if (!length_obj.isInteger()) {
  1217 + if (length_obj.isNull()) {
  1218 + QTC::TC("qpdf", "QPDF stream without length");
  1219 + throw damagedPDF(offset, "stream dictionary lacks /Length key");
  1220 + }
  1221 + QTC::TC("qpdf", "QPDF stream length not integer");
  1222 + throw damagedPDF(offset, "/Length key in stream dictionary is not an integer");
  1223 + }
  1224 +
  1225 + length = toS(length_obj.getUIntValue());
  1226 + // Seek in two steps to avoid potential integer overflow
  1227 + m->file->seek(stream_offset, SEEK_SET);
  1228 + m->file->seek(toO(length), SEEK_CUR);
  1229 + if (!readToken(*m->file).isWord("endstream")) {
  1230 + QTC::TC("qpdf", "QPDF missing endstream");
  1231 + throw damagedPDF("expected endstream");
  1232 + }
  1233 + } catch (QPDFExc& e) {
  1234 + if (m->attempt_recovery) {
  1235 + warn(e);
  1236 + length = recoverStreamLength(m->file, og, stream_offset);
  1237 + } else {
  1238 + throw;
  1239 + }
  1240 + }
  1241 + object = QPDFObjectHandle(qpdf::Stream(*this, og, object, stream_offset, length));
  1242 +}
  1243 +
  1244 +void
  1245 +QPDF::validateStreamLineEnd(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset)
  1246 +{
  1247 + // The PDF specification states that the word "stream" should be followed by either a carriage
  1248 + // return and a newline or by a newline alone. It specifically disallowed following it by a
  1249 + // carriage return alone since, in that case, there would be no way to tell whether the NL in a
  1250 + // CR NL sequence was part of the stream data. However, some readers, including Adobe reader,
  1251 + // accept a carriage return by itself when followed by a non-newline character, so that's what
  1252 + // we do here. We have also seen files that have extraneous whitespace between the stream
  1253 + // keyword and the newline.
  1254 + while (true) {
  1255 + char ch;
  1256 + if (m->file->read(&ch, 1) == 0) {
  1257 + // A premature EOF here will result in some other problem that will get reported at
  1258 + // another time.
  1259 + return;
  1260 + }
  1261 + if (ch == '\n') {
  1262 + // ready to read stream data
  1263 + QTC::TC("qpdf", "QPDF stream with NL only");
  1264 + return;
  1265 + }
  1266 + if (ch == '\r') {
  1267 + // Read another character
  1268 + if (m->file->read(&ch, 1) != 0) {
  1269 + if (ch == '\n') {
  1270 + // Ready to read stream data
  1271 + QTC::TC("qpdf", "QPDF stream with CRNL");
  1272 + } else {
  1273 + // Treat the \r by itself as the whitespace after endstream and start reading
  1274 + // stream data in spite of not having seen a newline.
  1275 + QTC::TC("qpdf", "QPDF stream with CR only");
  1276 + m->file->unreadCh(ch);
  1277 + warn(damagedPDF(
  1278 + m->file->tell(), "stream keyword followed by carriage return only"));
  1279 + }
  1280 + }
  1281 + return;
  1282 + }
  1283 + if (!util::is_space(ch)) {
  1284 + QTC::TC("qpdf", "QPDF stream without newline");
  1285 + m->file->unreadCh(ch);
  1286 + warn(damagedPDF(
  1287 + m->file->tell(), "stream keyword not followed by proper line terminator"));
  1288 + return;
  1289 + }
  1290 + warn(damagedPDF(m->file->tell(), "stream keyword followed by extraneous whitespace"));
  1291 + }
  1292 +}
  1293 +
  1294 +QPDFObjectHandle
  1295 +QPDF::readObjectInStream(std::shared_ptr<InputSource>& input, int obj)
  1296 +{
  1297 + m->last_object_description.erase(7); // last_object_description starts with "object "
  1298 + m->last_object_description += std::to_string(obj);
  1299 + m->last_object_description += " 0";
  1300 +
  1301 + bool empty = false;
  1302 + auto object = QPDFParser(*input, m->last_object_description, m->tokenizer, nullptr, this, true)
  1303 + .parse(empty, false);
  1304 + if (empty) {
  1305 + // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in
  1306 + // actual PDF files and Adobe Reader appears to ignore them.
  1307 + warn(damagedPDF(*input, input->getLastOffset(), "empty object treated as null"));
  1308 + }
  1309 + return object;
  1310 +}
  1311 +
  1312 +bool
  1313 +QPDF::findEndstream()
  1314 +{
  1315 + // Find endstream or endobj. Position the input at that token.
  1316 + auto t = readToken(*m->file, 20);
  1317 + if (t.isWord("endobj") || t.isWord("endstream")) {
  1318 + m->file->seek(m->file->getLastOffset(), SEEK_SET);
  1319 + return true;
  1320 + }
  1321 + return false;
  1322 +}
  1323 +
  1324 +size_t
  1325 +QPDF::recoverStreamLength(
  1326 + std::shared_ptr<InputSource> input, QPDFObjGen og, qpdf_offset_t stream_offset)
  1327 +{
  1328 + // Try to reconstruct stream length by looking for endstream or endobj
  1329 + warn(damagedPDF(*input, stream_offset, "attempting to recover stream length"));
  1330 +
  1331 + PatternFinder ef(*this, &QPDF::findEndstream);
  1332 + size_t length = 0;
  1333 + if (m->file->findFirst("end", stream_offset, 0, ef)) {
  1334 + length = toS(m->file->tell() - stream_offset);
  1335 + // Reread endstream but, if it was endobj, don't skip that.
  1336 + QPDFTokenizer::Token t = readToken(*m->file);
  1337 + if (t.getValue() == "endobj") {
  1338 + m->file->seek(m->file->getLastOffset(), SEEK_SET);
  1339 + }
  1340 + }
  1341 +
  1342 + if (length) {
  1343 + auto end = stream_offset + toO(length);
  1344 + qpdf_offset_t found_offset = 0;
  1345 + QPDFObjGen found_og;
  1346 +
  1347 + // Make sure this is inside this object
  1348 + for (auto const& [current_og, entry]: m->xref_table) {
  1349 + if (entry.getType() == 1) {
  1350 + qpdf_offset_t obj_offset = entry.getOffset();
  1351 + if (found_offset < obj_offset && obj_offset < end) {
  1352 + found_offset = obj_offset;
  1353 + found_og = current_og;
  1354 + }
  1355 + }
  1356 + }
  1357 + if (!found_offset || found_og == og) {
  1358 + // If we are trying to recover an XRef stream the xref table will not contain and
  1359 + // won't contain any entries, therefore we cannot check the found length. Otherwise we
  1360 + // found endstream\nendobj within the space allowed for this object, so we're probably
  1361 + // in good shape.
  1362 + } else {
  1363 + QTC::TC("qpdf", "QPDF found wrong endstream in recovery");
  1364 + length = 0;
  1365 + }
  1366 + }
  1367 +
  1368 + if (length == 0) {
  1369 + warn(damagedPDF(
  1370 + *input, stream_offset, "unable to recover stream data; treating stream as empty"));
  1371 + } else {
  1372 + warn(damagedPDF(
  1373 + *input, stream_offset, "recovered stream length: " + std::to_string(length)));
  1374 + }
  1375 +
  1376 + QTC::TC("qpdf", "QPDF recovered stream length");
  1377 + return length;
  1378 +}
  1379 +
  1380 +QPDFTokenizer::Token
  1381 +QPDF::readToken(InputSource& input, size_t max_len)
  1382 +{
  1383 + return m->tokenizer.readToken(input, m->last_object_description, true, max_len);
  1384 +}
  1385 +
// Read the indirect object that starts at `offset` in the input file and return a handle to it.
//
// Parameters:
//   try_recovery          - if true, a QPDFExc raised while reading the "n n obj" header triggers
//                           reconstruct_xref() followed by a single retry at the offset recorded
//                           for exp_og in the rebuilt xref table. Forced to false when exp_og has
//                           object ID 0 or when m->attempt_recovery is false.
//   offset                - file offset of the object; 0 produces a warning and a null object.
//   description           - passed on to setLastObjectDescription() and readObject().
//   exp_og                - object ID/generation expected at offset; object ID 0 disables the
//                           comparison with what is actually read.
//   og                    - output: set to the object ID/generation read from the file.
//   skip_cache_if_in_xref - if true, the object is not stored in the cache when og is already
//                           present in the xref table (see the long comment near the end).
//
// Returns the object read, or a null object if offset is 0 or if the object is no longer in the
// xref table after reconstruction. If the header is malformed and no recovery is attempted, the
// QPDFExc is rethrown. An ID/generation mismatch without recovery only produces a warning and the
// object is read anyway.
QPDFObjectHandle
QPDF::readObjectAtOffset(
    bool try_recovery,
    qpdf_offset_t offset,
    std::string const& description,
    QPDFObjGen exp_og,
    QPDFObjGen& og,
    bool skip_cache_if_in_xref)
{
    bool check_og = true;
    if (exp_og.getObj() == 0) {
        // This method uses an expected object ID of 0 to indicate that we don't know or don't care
        // what the actual object ID is at this offset. This is true when we read the xref stream
        // and linearization hint streams. In this case, we don't verify the expected object
        // ID/generation against what was read from the file. There is also no reason to attempt
        // xref recovery if we get a failure in this case since the read attempt was not triggered
        // by an xref lookup.
        check_og = false;
        try_recovery = false;
    }
    setLastObjectDescription(description, exp_og);

    if (!m->attempt_recovery) {
        try_recovery = false;
    }

    // Special case: if offset is 0, just return null. Some PDF writers, in particular
    // "Mac OS X 10.7.5 Quartz PDFContext", may store deleted objects in the xref table as
    // "0000000000 00000 n", which is not correct, but it won't hurt anything for us to ignore
    // these.
    if (offset == 0) {
        QTC::TC("qpdf", "QPDF bogus 0 offset", 0);
        warn(damagedPDF(0, "object has offset 0"));
        return QPDFObjectHandle::newNull();
    }

    m->file->seek(offset, SEEK_SET);
    try {
        // Parse the "<id> <generation> obj" header token by token; any deviation is reported as
        // damage at the requested offset.
        QPDFTokenizer::Token tobjid = readToken(*m->file);
        bool objidok = tobjid.isInteger();
        QTC::TC("qpdf", "QPDF check objid", objidok ? 1 : 0);
        if (!objidok) {
            QTC::TC("qpdf", "QPDF expected n n obj");
            throw damagedPDF(offset, "expected n n obj");
        }
        QPDFTokenizer::Token tgen = readToken(*m->file);
        bool genok = tgen.isInteger();
        QTC::TC("qpdf", "QPDF check generation", genok ? 1 : 0);
        if (!genok) {
            throw damagedPDF(offset, "expected n n obj");
        }
        QPDFTokenizer::Token tobj = readToken(*m->file);

        bool objok = tobj.isWord("obj");
        QTC::TC("qpdf", "QPDF check obj", objok ? 1 : 0);

        if (!objok) {
            throw damagedPDF(offset, "expected n n obj");
        }
        int objid = QUtil::string_to_int(tobjid.getValue().c_str());
        int generation = QUtil::string_to_int(tgen.getValue().c_str());
        // The caller's og is updated before the remaining validity checks, so it reflects what was
        // read even when one of those checks throws.
        og = QPDFObjGen(objid, generation);
        if (objid == 0) {
            QTC::TC("qpdf", "QPDF object id 0");
            throw damagedPDF(offset, "object with ID 0");
        }
        if (check_og && (exp_og != og)) {
            QTC::TC("qpdf", "QPDF err wrong objid/generation");
            QPDFExc e = damagedPDF(offset, "expected " + exp_og.unparse(' ') + " obj");
            if (try_recovery) {
                // Will be retried below
                throw e;
            } else {
                // We can try reading the object anyway even if the ID doesn't match.
                warn(e);
            }
        }
    } catch (QPDFExc& e) {
        if (try_recovery) {
            // Try again after reconstructing xref table
            reconstruct_xref(e);
            // Only a type 1 entry carries a file offset that can be retried here.
            if (m->xref_table.count(exp_og) && (m->xref_table[exp_og].getType() == 1)) {
                qpdf_offset_t new_offset = m->xref_table[exp_og].getOffset();
                // The retry passes try_recovery == false, so reconstruction happens at most once.
                QPDFObjectHandle result =
                    readObjectAtOffset(false, new_offset, description, exp_og, og, false);
                QTC::TC("qpdf", "QPDF recovered in readObjectAtOffset");
                return result;
            } else {
                QTC::TC("qpdf", "QPDF object gone after xref reconstruction");
                warn(damagedPDF(
                    "",
                    0,
                    ("object " + exp_og.unparse(' ') +
                     " not found in file after regenerating cross reference "
                     "table")));
                return QPDFObjectHandle::newNull();
            }
        } else {
            throw;
        }
    }

    QPDFObjectHandle oh = readObject(description, og);

    if (isUnresolved(og)) {
        // Store the object in the cache here so it gets cached whether we first know the offset or
        // whether we first know the object ID and generation (in which case we would get here
        // through resolve).

        // Determine the end offset of this object before and after white space. We use these
        // numbers to validate linearization hint tables. Offsets and lengths of objects may imply
        // the end of an object to be anywhere between these values.
        qpdf_offset_t end_before_space = m->file->tell();

        // skip over spaces
        while (true) {
            char ch;
            if (m->file->read(&ch, 1)) {
                if (!isspace(static_cast<unsigned char>(ch))) {
                    // Step back so the non-space character is not consumed.
                    m->file->seek(-1, SEEK_CUR);
                    break;
                }
            } else {
                throw damagedPDF(m->file->tell(), "EOF after endobj");
            }
        }
        qpdf_offset_t end_after_space = m->file->tell();
        if (skip_cache_if_in_xref && m->xref_table.count(og)) {
            // Ordinarily, an object gets read here when resolved through xref table or stream. In
            // the special case of the xref stream and linearization hint tables, the offset comes
            // from another source. For the specific case of xref streams, the xref stream is read
            // and loaded into the object cache very early in parsing. Ordinarily, when a file is
            // updated by appending, items inserted into the xref table in later updates take
            // precedence over earlier items. In the special case of reusing the object number
            // previously used as the xref stream, we have the following order of events:
            //
            // * reused object gets loaded into the xref table
            // * old object is read here while reading xref streams
            // * original xref entry is ignored (since already in xref table)
            //
            // It is the second step that causes a problem. Even though the xref table is correct in
            // this case, the old object is already in the cache and so effectively prevails over
            // the reused object. To work around this issue, we have a special case for the xref
            // stream (via the skip_cache_if_in_xref): if the object is already in the xref stream,
            // don't cache what we read here.
            //
            // It is likely that the same bug may exist for linearization hint tables, but the
            // existing code uses end_before_space and end_after_space from the cache, so fixing
            // that would require more significant rework. The chances of a linearization hint
            // stream being reused seems smaller because the xref stream is probably the highest
            // object in the file and the linearization hint stream would be some random place in
            // the middle, so I'm leaving that bug unfixed for now. If the bug were to be fixed, we
            // could use !check_og in place of skip_cache_if_in_xref.
            QTC::TC("qpdf", "QPDF skipping cache for known unchecked object");
        } else {
            updateCache(og, oh.getObj(), end_before_space, end_after_space);
        }
    }

    return oh;
}
  1547 +
  1548 +std::shared_ptr<QPDFObject> const&
  1549 +QPDF::resolve(QPDFObjGen og)
  1550 +{
  1551 + if (!isUnresolved(og)) {
  1552 + return m->obj_cache[og].object;
  1553 + }
  1554 +
  1555 + if (m->resolving.count(og)) {
  1556 + // This can happen if an object references itself directly or indirectly in some key that
  1557 + // has to be resolved during object parsing, such as stream length.
  1558 + QTC::TC("qpdf", "QPDF recursion loop in resolve");
  1559 + warn(damagedPDF("", "loop detected resolving object " + og.unparse(' ')));
  1560 + updateCache(og, QPDFObject::create<QPDF_Null>(), -1, -1);
  1561 + return m->obj_cache[og].object;
  1562 + }
  1563 + ResolveRecorder rr(this, og);
  1564 +
  1565 + if (m->xref_table.count(og) != 0) {
  1566 + QPDFXRefEntry const& entry = m->xref_table[og];
  1567 + try {
  1568 + switch (entry.getType()) {
  1569 + case 1:
  1570 + {
  1571 + qpdf_offset_t offset = entry.getOffset();
  1572 + // Object stored in cache by readObjectAtOffset
  1573 + QPDFObjGen a_og;
  1574 + QPDFObjectHandle oh = readObjectAtOffset(true, offset, "", og, a_og, false);
  1575 + }
  1576 + break;
  1577 +
  1578 + case 2:
  1579 + resolveObjectsInStream(entry.getObjStreamNumber());
  1580 + break;
  1581 +
  1582 + default:
  1583 + throw damagedPDF(
  1584 + "", 0, ("object " + og.unparse('/') + " has unexpected xref entry type"));
  1585 + }
  1586 + } catch (QPDFExc& e) {
  1587 + warn(e);
  1588 + } catch (std::exception& e) {
  1589 + warn(damagedPDF(
  1590 + "", 0, ("object " + og.unparse('/') + ": error reading object: " + e.what())));
  1591 + }
  1592 + }
  1593 +
  1594 + if (isUnresolved(og)) {
  1595 + // PDF spec says unknown objects resolve to the null object.
  1596 + QTC::TC("qpdf", "QPDF resolve failure to null");
  1597 + updateCache(og, QPDFObject::create<QPDF_Null>(), -1, -1);
  1598 + }
  1599 +
  1600 + auto& result(m->obj_cache[og].object);
  1601 + result->setDefaultDescription(this, og);
  1602 + return result;
  1603 +}
  1604 +
// Load the objects contained in object stream `obj_stream_number` into the object cache.
//
// The call is a no-op if the stream number is already recorded in m->resolved_object_streams.
// Otherwise the stream header (pairs of object number and offset) is parsed, invalid entries are
// skipped with a warning, and every remaining object whose xref entry still points at this
// object stream is read and cached.
//
// Throws the exception produced by damagedPDF if the object is not a stream, if /N or /First
// are not integers, or if a header token is not an integer.
void
QPDF::resolveObjectsInStream(int obj_stream_number)
{
    if (m->resolved_object_streams.count(obj_stream_number)) {
        return;
    }
    // Record the stream before doing any work so that a nested call for the same stream returns
    // immediately.
    m->resolved_object_streams.insert(obj_stream_number);
    // Force resolution of object stream
    QPDFObjectHandle obj_stream = getObjectByID(obj_stream_number, 0);
    if (!obj_stream.isStream()) {
        throw damagedPDF(
            "supposed object stream " + std::to_string(obj_stream_number) + " is not a stream");
    }

    // For linearization data in the object, use the data from the object stream for the objects in
    // the stream.
    QPDFObjGen stream_og(obj_stream_number, 0);
    qpdf_offset_t end_before_space = m->obj_cache[stream_og].end_before_space;
    qpdf_offset_t end_after_space = m->obj_cache[stream_og].end_after_space;

    QPDFObjectHandle dict = obj_stream.getDict();
    if (!dict.isDictionaryOfType("/ObjStm")) {
        // A wrong /Type is only warned about; processing continues.
        QTC::TC("qpdf", "QPDF ERR object stream with wrong type");
        warn(damagedPDF(
            "supposed object stream " + std::to_string(obj_stream_number) + " has wrong type"));
    }

    if (!(dict.getKey("/N").isInteger() && dict.getKey("/First").isInteger())) {
        throw damagedPDF(
            ("object stream " + std::to_string(obj_stream_number) + " has incorrect keys"));
    }

    // n is the number of header entries to read; first is added to each header offset below.
    int n = dict.getKey("/N").getIntValueAsInt();
    int first = dict.getKey("/First").getIntValueAsInt();

    // Maps object number to its offset within the decoded stream data (header offset + first).
    std::map<int, int> offsets;

    std::shared_ptr<Buffer> bp = obj_stream.getStreamData(qpdf_dl_specialized);
    auto input = std::shared_ptr<InputSource>(
        // line-break
        new BufferInputSource(
            (m->file->getName() + " object stream " + std::to_string(obj_stream_number)),
            bp.get()));

    long long last_offset = -1;
    for (int i = 0; i < n; ++i) {
        QPDFTokenizer::Token tnum = readToken(*input);
        QPDFTokenizer::Token toffset = readToken(*input);
        if (!(tnum.isInteger() && toffset.isInteger())) {
            throw damagedPDF(
                *input,
                m->last_object_description,
                input->getLastOffset(),
                "expected integer in object stream header");
        }

        int num = QUtil::string_to_int(tnum.getValue().c_str());
        long long offset = QUtil::string_to_int(toffset.getValue().c_str());

        if (num == obj_stream_number) {
            QTC::TC("qpdf", "QPDF ignore self-referential object stream");
            warn(damagedPDF(
                *input,
                m->last_object_description,
                input->getLastOffset(),
                "object stream claims to contain itself"));
            continue;
        }

        if (num < 1) {
            QTC::TC("qpdf", "QPDF object stream contains id < 1");
            warn(damagedPDF(
                *input,
                "object "s + std::to_string(num) + " 0, offset " + std::to_string(offset),
                0,
                "object id is invalid"s));
            continue;
        }

        // Offsets must be strictly increasing; an entry that violates this is skipped and does not
        // update last_offset.
        if (offset <= last_offset) {
            QTC::TC("qpdf", "QPDF object stream offsets not increasing");
            warn(damagedPDF(
                *input,
                "object "s + std::to_string(num) + " 0, offset " + std::to_string(offset),
                0,
                "offset is invalid (must be larger than previous offset " +
                    std::to_string(last_offset) + ")"));
            continue;
        }
        last_offset = offset;

        // Silently ignore object numbers above m->xref_table_max_id.
        if (num > m->xref_table_max_id) {
            continue;
        }

        offsets[num] = toI(offset + first);
    }

    // To avoid having to read the object stream multiple times, store all objects that would be
    // found here in the cache. Remember that some objects stored here might have been overridden
    // by new objects appended to the file, so it is necessary to recheck the xref table and only
    // cache what would actually be resolved here.
    m->last_object_description.clear();
    m->last_object_description += "object ";
    for (auto const& iter: offsets) {
        QPDFObjGen og(iter.first, 0);
        auto entry = m->xref_table.find(og);
        if (entry != m->xref_table.end() && entry->second.getType() == 2 &&
            entry->second.getObjStreamNumber() == obj_stream_number) {
            int offset = iter.second;
            input->seek(offset, SEEK_SET);
            QPDFObjectHandle oh = readObjectInStream(input, iter.first);
            // Objects in the stream share the end offsets of the object stream itself.
            updateCache(og, oh.getObj(), end_before_space, end_after_space);
        } else {
            QTC::TC("qpdf", "QPDF not caching overridden objstm object");
        }
    }
}
  1723 +
  1724 +QPDFObjectHandle
  1725 +QPDF::newIndirect(QPDFObjGen og, std::shared_ptr<QPDFObject> const& obj)
  1726 +{
  1727 + obj->setDefaultDescription(this, og);
  1728 + return {obj};
  1729 +}
  1730 +
  1731 +void
  1732 +QPDF::updateCache(
  1733 + QPDFObjGen og,
  1734 + std::shared_ptr<QPDFObject> const& object,
  1735 + qpdf_offset_t end_before_space,
  1736 + qpdf_offset_t end_after_space,
  1737 + bool destroy)
  1738 +{
  1739 + object->setObjGen(this, og);
  1740 + if (isCached(og)) {
  1741 + auto& cache = m->obj_cache[og];
  1742 + object->move_to(cache.object, destroy);
  1743 + cache.end_before_space = end_before_space;
  1744 + cache.end_after_space = end_after_space;
  1745 + } else {
  1746 + m->obj_cache[og] = ObjCache(object, end_before_space, end_after_space);
  1747 + }
  1748 +}
  1749 +
  1750 +bool
  1751 +QPDF::isCached(QPDFObjGen og)
  1752 +{
  1753 + return m->obj_cache.count(og) != 0;
  1754 +}
  1755 +
  1756 +bool
  1757 +QPDF::isUnresolved(QPDFObjGen og)
  1758 +{
  1759 + return !isCached(og) || m->obj_cache[og].object->isUnresolved();
  1760 +}
  1761 +
  1762 +QPDFObjGen
  1763 +QPDF::nextObjGen()
  1764 +{
  1765 + int max_objid = toI(getObjectCount());
  1766 + if (max_objid == std::numeric_limits<int>::max()) {
  1767 + throw std::range_error("max object id is too high to create new objects");
  1768 + }
  1769 + return QPDFObjGen(max_objid + 1, 0);
  1770 +}
  1771 +
  1772 +QPDFObjectHandle
  1773 +QPDF::makeIndirectFromQPDFObject(std::shared_ptr<QPDFObject> const& obj)
  1774 +{
  1775 + QPDFObjGen next{nextObjGen()};
  1776 + m->obj_cache[next] = ObjCache(obj, -1, -1);
  1777 + return newIndirect(next, m->obj_cache[next].object);
  1778 +}
  1779 +
  1780 +QPDFObjectHandle
  1781 +QPDF::makeIndirectObject(QPDFObjectHandle oh)
  1782 +{
  1783 + if (!oh) {
  1784 + throw std::logic_error("attempted to make an uninitialized QPDFObjectHandle indirect");
  1785 + }
  1786 + return makeIndirectFromQPDFObject(oh.getObj());
  1787 +}
  1788 +
  1789 +std::shared_ptr<QPDFObject>
  1790 +QPDF::getObjectForParser(int id, int gen, bool parse_pdf)
  1791 +{
  1792 + // This method is called by the parser and therefore must not resolve any objects.
  1793 + auto og = QPDFObjGen(id, gen);
  1794 + if (auto iter = m->obj_cache.find(og); iter != m->obj_cache.end()) {
  1795 + return iter->second.object;
  1796 + }
  1797 + if (m->xref_table.count(og) || !m->parsed) {
  1798 + return m->obj_cache.insert({og, QPDFObject::create<QPDF_Unresolved>(this, og)})
  1799 + .first->second.object;
  1800 + }
  1801 + if (parse_pdf) {
  1802 + return QPDFObject::create<QPDF_Null>();
  1803 + }
  1804 + return m->obj_cache.insert({og, QPDFObject::create<QPDF_Null>(this, og)}).first->second.object;
  1805 +}
  1806 +
  1807 +std::shared_ptr<QPDFObject>
  1808 +QPDF::getObjectForJSON(int id, int gen)
  1809 +{
  1810 + auto og = QPDFObjGen(id, gen);
  1811 + auto [it, inserted] = m->obj_cache.try_emplace(og);
  1812 + auto& obj = it->second.object;
  1813 + if (inserted) {
  1814 + obj = (m->parsed && !m->xref_table.count(og))
  1815 + ? QPDFObject::create<QPDF_Null>(this, og)
  1816 + : QPDFObject::create<QPDF_Unresolved>(this, og);
  1817 + }
  1818 + return obj;
  1819 +}
  1820 +
  1821 +QPDFObjectHandle
  1822 +QPDF::getObject(QPDFObjGen og)
  1823 +{
  1824 + if (auto it = m->obj_cache.find(og); it != m->obj_cache.end()) {
  1825 + return {it->second.object};
  1826 + } else if (m->parsed && !m->xref_table.count(og)) {
  1827 + return QPDFObject::create<QPDF_Null>();
  1828 + } else {
  1829 + auto result =
  1830 + m->obj_cache.try_emplace(og, QPDFObject::create<QPDF_Unresolved>(this, og), -1, -1);
  1831 + return {result.first->second.object};
  1832 + }
  1833 +}
  1834 +
  1835 +void
  1836 +QPDF::replaceObject(int objid, int generation, QPDFObjectHandle oh)
  1837 +{
  1838 + replaceObject(QPDFObjGen(objid, generation), oh);
  1839 +}
  1840 +
  1841 +void
  1842 +QPDF::replaceObject(QPDFObjGen og, QPDFObjectHandle oh)
  1843 +{
  1844 + if (!oh || (oh.isIndirect() && !(oh.isStream() && oh.getObjGen() == og))) {
  1845 + QTC::TC("qpdf", "QPDF replaceObject called with indirect object");
  1846 + throw std::logic_error("QPDF::replaceObject called with indirect object handle");
  1847 + }
  1848 + updateCache(og, oh.getObj(), -1, -1, false);
  1849 +}
  1850 +
  1851 +void
  1852 +QPDF::removeObject(QPDFObjGen og)
  1853 +{
  1854 + m->xref_table.erase(og);
  1855 + if (auto cached = m->obj_cache.find(og); cached != m->obj_cache.end()) {
  1856 + // Take care of any object handles that may be floating around.
  1857 + cached->second.object->assign_null();
  1858 + cached->second.object->setObjGen(nullptr, QPDFObjGen());
  1859 + m->obj_cache.erase(cached);
  1860 + }
  1861 +}
  1862 +
  1863 +void
  1864 +QPDF::replaceReserved(QPDFObjectHandle reserved, QPDFObjectHandle replacement)
  1865 +{
  1866 + QTC::TC("qpdf", "QPDF replaceReserved");
  1867 + auto tc = reserved.getTypeCode();
  1868 + if (!(tc == ::ot_reserved || tc == ::ot_null)) {
  1869 + throw std::logic_error("replaceReserved called with non-reserved object");
  1870 + }
  1871 + replaceObject(reserved.getObjGen(), replacement);
  1872 +}
  1873 +
  1874 +void
  1875 +QPDF::swapObjects(int objid1, int generation1, int objid2, int generation2)
  1876 +{
  1877 + swapObjects(QPDFObjGen(objid1, generation1), QPDFObjGen(objid2, generation2));
  1878 +}
  1879 +
  1880 +void
  1881 +QPDF::swapObjects(QPDFObjGen og1, QPDFObjGen og2)
  1882 +{
  1883 + // Force objects to be read from the input source if needed, then swap them in the cache.
  1884 + resolve(og1);
  1885 + resolve(og2);
  1886 + m->obj_cache[og1].object->swapWith(m->obj_cache[og2].object);
  1887 +}
  1888 +
  1889 +size_t
  1890 +QPDF::tableSize()
  1891 +{
  1892 + // If obj_cache is dense, accommodate all object in tables,else accommodate only original
  1893 + // objects.
  1894 + auto max_xref = m->xref_table.size() ? m->xref_table.crbegin()->first.getObj() : 0;
  1895 + auto max_obj = m->obj_cache.size() ? m->obj_cache.crbegin()->first.getObj() : 0;
  1896 + auto max_id = std::numeric_limits<int>::max() - 1;
  1897 + if (max_obj >= max_id || max_xref >= max_id) {
  1898 + // Temporary fix. Long-term solution is
  1899 + // - QPDFObjGen to enforce objgens are valid and sensible
  1900 + // - xref table and obj cache to protect against insertion of impossibly large obj ids
  1901 + stopOnError("Impossibly large object id encountered.");
  1902 + }
  1903 + if (max_obj < 1.1 * std::max(toI(m->obj_cache.size()), max_xref)) {
  1904 + return toS(++max_obj);
  1905 + }
  1906 + return toS(++max_xref);
  1907 +}
  1908 +
  1909 +std::vector<QPDFObjGen>
  1910 +QPDF::getCompressibleObjVector()
  1911 +{
  1912 + return getCompressibleObjGens<QPDFObjGen>();
  1913 +}
  1914 +
  1915 +std::vector<bool>
  1916 +QPDF::getCompressibleObjSet()
  1917 +{
  1918 + return getCompressibleObjGens<bool>();
  1919 +}
  1920 +
// Collect the objects that may be placed in object streams.
//
// T selects the shape of the result:
//   - QPDFObjGen: the ID/generation of each compressible object, in the order visited.
//   - bool: a vector of getObjectCount() + 1 flags where the entry at an object's ID is true if
//     that object is compressible.
//
// Streams, the encryption dictionary, and /Sig dictionaries that have both /ByteRange and
// /Contents are never reported as compressible. As a side effect, an object reached during the
// traversal is removed from this QPDF if the cache holds a later entry with the same object ID.
//
// Throws std::logic_error if an object ID larger than getObjectCount() is encountered.
template <typename T>
std::vector<T>
QPDF::getCompressibleObjGens()
{
    // Return a list of objects that are allowed to be in object streams. Walk through the objects
    // by traversing the document from the root, including a traversal of the pages tree. This
    // makes it more likely that objects on the same page end up in the same object stream, which
    // is slightly more efficient, particularly with linearized files. This is better than
    // iterating through the xref table since it avoids preserving orphaned items.

    // Exclude encryption dictionary, if any
    QPDFObjectHandle encryption_dict = m->trailer.getKey("/Encrypt");
    QPDFObjGen encryption_dict_og = encryption_dict.getObjGen();

    const size_t max_obj = getObjectCount();
    // visited is indexed by object ID - 1.
    std::vector<bool> visited(max_obj, false);
    // queue is used as a stack: objects are taken from the back.
    std::vector<QPDFObjectHandle> queue;
    queue.reserve(512);
    queue.push_back(m->trailer);
    std::vector<T> result;
    if constexpr (std::is_same_v<T, QPDFObjGen>) {
        result.reserve(m->obj_cache.size());
    } else if constexpr (std::is_same_v<T, bool>) {
        result.resize(max_obj + 1U, false);
    } else {
        throw std::logic_error("Unsupported type in QPDF::getCompressibleObjGens");
    }
    while (!queue.empty()) {
        auto obj = queue.back();
        queue.pop_back();
        if (obj.getObjectID() > 0) {
            QPDFObjGen og = obj.getObjGen();
            const size_t id = toS(og.getObj() - 1);
            if (id >= max_obj) {
                throw std::logic_error(
                    "unexpected object id encountered in getCompressibleObjGens");
            }
            if (visited[id]) {
                QTC::TC("qpdf", "QPDF loop detected traversing objects");
                continue;
            }

            // Check whether this is the current object. If not, remove it (which changes it into a
            // direct null and therefore stops us from revisiting it) and move on to the next object
            // in the queue.
            auto upper = m->obj_cache.upper_bound(og);
            if (upper != m->obj_cache.end() && upper->first.getObj() == og.getObj()) {
                removeObject(og);
                continue;
            }

            visited[id] = true;

            if (og == encryption_dict_og) {
                QTC::TC("qpdf", "QPDF exclude encryption dictionary");
            } else if (!(obj.isStream() ||
                         (obj.isDictionaryOfType("/Sig") && obj.hasKey("/ByteRange") &&
                          obj.hasKey("/Contents")))) {
                if constexpr (std::is_same_v<T, QPDFObjGen>) {
                    result.push_back(og);
                } else if constexpr (std::is_same_v<T, bool>) {
                    // id is object ID - 1, so this sets the flag at index == object ID.
                    result[id + 1U] = true;
                }
            }
        }
        // Queue the children. Entries are pushed in reverse order so that, with the queue being
        // popped from the back, they are visited in forward order.
        if (obj.isStream()) {
            auto dict = obj.getDict().as_dictionary();
            auto end = dict.crend();
            for (auto iter = dict.crbegin(); iter != end; ++iter) {
                std::string const& key = iter->first;
                QPDFObjectHandle const& value = iter->second;
                if (!value.null()) {
                    if (key == "/Length") {
                        // omit stream lengths
                        if (value.isIndirect()) {
                            QTC::TC("qpdf", "QPDF exclude indirect length");
                        }
                    } else {
                        queue.emplace_back(value);
                    }
                }
            }
        } else if (obj.isDictionary()) {
            auto dict = obj.as_dictionary();
            auto end = dict.crend();
            for (auto iter = dict.crbegin(); iter != end; ++iter) {
                if (!iter->second.null()) {
                    queue.emplace_back(iter->second);
                }
            }
        } else if (auto items = obj.as_array()) {
            queue.insert(queue.end(), items.crbegin(), items.crend());
        }
    }

    return result;
}
... ...
manual/release-notes.rst
... ... @@ -21,16 +21,15 @@ more detail.
21 21 integer object. Previously the method returned false if the first
22 22 dictionary object was not a linearization parameter dictionary.
23 23  
24   -.. _r12-0-0:
25   -
26   -12.0.1: not yet released
27   - - Other enhancements
  24 + - Other enhancements
28 25  
29   - - There have been further enhancements to how files with damaged xref
30   - tables are recovered.
  26 + - There have been further enhancements to how files with damaged xref
  27 + tables are recovered.
31 28  
32 29 .. cSpell:ignore substract
33 30  
  31 +.. _r12-0-0:
  32 +
34 33 12.0.0: March 9, 2025
35 34 - API breaking changes
36 35  
... ...