Commit fba542f9981e19582870ebc0e298df6a42223793

Authored by m-holger
Committed by GitHub
2 parents 9740930b bb045907

Merge pull request #1394 from m-holger/qpdf_objects

Split QPDF.cc into QPDF.cc and QPDF_objects.cc
libqpdf/CMakeLists.txt
@@ -95,6 +95,7 @@ set(libqpdf_SOURCES @@ -95,6 +95,7 @@ set(libqpdf_SOURCES
95 QPDF_encryption.cc 95 QPDF_encryption.cc
96 QPDF_json.cc 96 QPDF_json.cc
97 QPDF_linearization.cc 97 QPDF_linearization.cc
  98 + QPDF_objects.cc
98 QPDF_optimization.cc 99 QPDF_optimization.cc
99 QPDF_pages.cc 100 QPDF_pages.cc
100 QTC.cc 101 QTC.cc
libqpdf/QPDF.cc
@@ -413,1726 +413,26 @@ QPDF::findHeader() @@ -413,1726 +413,26 @@ QPDF::findHeader()
413 return valid; 413 return valid;
414 } 414 }
415 415
416 -bool  
417 -QPDF::findStartxref()  
418 -{  
419 - if (readToken(*m->file).isWord("startxref") && readToken(*m->file).isInteger()) {  
420 - // Position in front of offset token  
421 - m->file->seek(m->file->getLastOffset(), SEEK_SET);  
422 - return true;  
423 - }  
424 - return false;  
425 -}  
426 -  
427 -void  
428 -QPDF::parse(char const* password)  
429 -{  
430 - if (password) {  
431 - m->encp->provided_password = password;  
432 - }  
433 -  
434 - // Find the header anywhere in the first 1024 bytes of the file.  
435 - PatternFinder hf(*this, &QPDF::findHeader);  
436 - if (!m->file->findFirst("%PDF-", 0, 1024, hf)) {  
437 - QTC::TC("qpdf", "QPDF not a pdf file");  
438 - warn(damagedPDF("", 0, "can't find PDF header"));  
439 - // QPDFWriter writes files that usually require at least version 1.2 for /FlateDecode  
440 - m->pdf_version = "1.2";  
441 - }  
442 -  
443 - // PDF spec says %%EOF must be found within the last 1024 bytes of/ the file. We add an extra  
444 - // 30 characters to leave room for the startxref stuff.  
445 - m->file->seek(0, SEEK_END);  
446 - qpdf_offset_t end_offset = m->file->tell();  
447 - m->xref_table_max_offset = end_offset;  
448 - // Sanity check on object ids. All objects must appear in xref table / stream. In all realistic  
449 - // scenarios at least 3 bytes are required.  
450 - if (m->xref_table_max_id > m->xref_table_max_offset / 3) {  
451 - m->xref_table_max_id = static_cast<int>(m->xref_table_max_offset / 3);  
452 - }  
453 - qpdf_offset_t start_offset = (end_offset > 1054 ? end_offset - 1054 : 0);  
454 - PatternFinder sf(*this, &QPDF::findStartxref);  
455 - qpdf_offset_t xref_offset = 0;  
456 - if (m->file->findLast("startxref", start_offset, 0, sf)) {  
457 - xref_offset = QUtil::string_to_ll(readToken(*m->file).getValue().c_str());  
458 - }  
459 -  
460 - try {  
461 - if (xref_offset == 0) {  
462 - QTC::TC("qpdf", "QPDF can't find startxref");  
463 - throw damagedPDF("", 0, "can't find startxref");  
464 - }  
465 - try {  
466 - read_xref(xref_offset);  
467 - } catch (QPDFExc&) {  
468 - throw;  
469 - } catch (std::exception& e) {  
470 - throw damagedPDF("", 0, std::string("error reading xref: ") + e.what());  
471 - }  
472 - } catch (QPDFExc& e) {  
473 - if (m->attempt_recovery) {  
474 - reconstruct_xref(e, xref_offset > 0);  
475 - QTC::TC("qpdf", "QPDF reconstructed xref table");  
476 - } else {  
477 - throw;  
478 - }  
479 - }  
480 -  
481 - initializeEncryption();  
482 - m->parsed = true;  
483 - if (m->xref_table.size() > 0 && !getRoot().getKey("/Pages").isDictionary()) {  
484 - // QPDFs created from JSON have an empty xref table and no root object yet.  
485 - throw damagedPDF("", 0, "unable to find page tree");  
486 - }  
487 -}  
488 -  
489 -void  
490 -QPDF::inParse(bool v)  
491 -{  
492 - if (m->in_parse == v) {  
493 - // This happens if QPDFParser::parse tries to resolve an indirect object while it is  
494 - // parsing.  
495 - throw std::logic_error(  
496 - "QPDF: re-entrant parsing detected. This is a qpdf bug."  
497 - " Please report at https://github.com/qpdf/qpdf/issues.");  
498 - }  
499 - m->in_parse = v;  
500 -}  
501 -  
502 -void  
503 -QPDF::warn(QPDFExc const& e)  
504 -{  
505 - if (m->max_warnings > 0 && m->warnings.size() >= m->max_warnings) {  
506 - stopOnError("Too many warnings - file is too badly damaged");  
507 - }  
508 - m->warnings.push_back(e);  
509 - if (!m->suppress_warnings) {  
510 - *m->log->getWarn() << "WARNING: " << m->warnings.back().what() << "\n";  
511 - }  
512 -}  
513 -  
514 -void  
515 -QPDF::warn(  
516 - qpdf_error_code_e error_code,  
517 - std::string const& object,  
518 - qpdf_offset_t offset,  
519 - std::string const& message)  
520 -{  
521 - warn(QPDFExc(error_code, getFilename(), object, offset, message));  
522 -}  
523 -  
524 -void  
525 -QPDF::setTrailer(QPDFObjectHandle obj)  
526 -{  
527 - if (m->trailer) {  
528 - return;  
529 - }  
530 - m->trailer = obj;  
531 -}  
532 -  
533 -void  
534 -QPDF::reconstruct_xref(QPDFExc& e, bool found_startxref)  
535 -{  
536 - if (m->reconstructed_xref) {  
537 - // Avoid xref reconstruction infinite loops. This is getting very hard to reproduce because  
538 - // qpdf is throwing many fewer exceptions while parsing. Most situations are warnings now.  
539 - throw e;  
540 - }  
541 -  
542 - // If recovery generates more than 1000 warnings, the file is so severely damaged that there  
543 - // probably is no point trying to continue.  
544 - const auto max_warnings = m->warnings.size() + 1000U;  
545 - auto check_warnings = [this, max_warnings]() {  
546 - if (m->warnings.size() > max_warnings) {  
547 - throw damagedPDF("", 0, "too many errors while reconstructing cross-reference table");  
548 - }  
549 - };  
550 -  
551 - m->reconstructed_xref = true;  
552 - // We may find more objects, which may contain dangling references.  
553 - m->fixed_dangling_refs = false;  
554 -  
555 - warn(damagedPDF("", 0, "file is damaged"));  
556 - warn(e);  
557 - warn(damagedPDF("", 0, "Attempting to reconstruct cross-reference table"));  
558 -  
559 - // Delete all references to type 1 (uncompressed) objects  
560 - std::vector<QPDFObjGen> to_delete;  
561 - for (auto const& iter: m->xref_table) {  
562 - if (iter.second.getType() == 1) {  
563 - to_delete.emplace_back(iter.first);  
564 - }  
565 - }  
566 - for (auto const& iter: to_delete) {  
567 - m->xref_table.erase(iter);  
568 - }  
569 -  
570 - std::vector<std::tuple<int, int, qpdf_offset_t>> found_objects;  
571 - std::vector<qpdf_offset_t> trailers;  
572 - std::vector<qpdf_offset_t> startxrefs;  
573 -  
574 - m->file->seek(0, SEEK_END);  
575 - qpdf_offset_t eof = m->file->tell();  
576 - m->file->seek(0, SEEK_SET);  
577 - // Don't allow very long tokens here during recovery. All the interesting tokens are covered.  
578 - static size_t const MAX_LEN = 10;  
579 - while (m->file->tell() < eof) {  
580 - QPDFTokenizer::Token t1 = readToken(*m->file, MAX_LEN);  
581 - qpdf_offset_t token_start = m->file->tell() - toO(t1.getValue().length());  
582 - if (t1.isInteger()) {  
583 - auto pos = m->file->tell();  
584 - auto t2 = readToken(*m->file, MAX_LEN);  
585 - if (t2.isInteger() && readToken(*m->file, MAX_LEN).isWord("obj")) {  
586 - int obj = QUtil::string_to_int(t1.getValue().c_str());  
587 - int gen = QUtil::string_to_int(t2.getValue().c_str());  
588 - if (obj <= m->xref_table_max_id) {  
589 - found_objects.emplace_back(obj, gen, token_start);  
590 - } else {  
591 - warn(damagedPDF(  
592 - "", 0, "ignoring object with impossibly large id " + std::to_string(obj)));  
593 - }  
594 - }  
595 - m->file->seek(pos, SEEK_SET);  
596 - } else if (!m->trailer && t1.isWord("trailer")) {  
597 - trailers.emplace_back(m->file->tell());  
598 - } else if (!found_startxref && t1.isWord("startxref")) {  
599 - startxrefs.emplace_back(m->file->tell());  
600 - }  
601 - check_warnings();  
602 - m->file->findAndSkipNextEOL();  
603 - }  
604 -  
605 - if (!found_startxref && !startxrefs.empty() && !found_objects.empty() &&  
606 - startxrefs.back() > std::get<2>(found_objects.back())) {  
607 - try {  
608 - m->file->seek(startxrefs.back(), SEEK_SET);  
609 - if (auto offset = QUtil::string_to_ll(readToken(*m->file).getValue().data())) {  
610 - read_xref(offset);  
611 - if (getRoot().getKey("/Pages").isDictionary()) {  
612 - QTC::TC("qpdf", "QPDF startxref more than 1024 before end");  
613 - warn(  
614 - damagedPDF("", 0, "startxref was more than 1024 bytes before end of file"));  
615 - initializeEncryption();  
616 - m->parsed = true;  
617 - m->reconstructed_xref = false;  
618 - return;  
619 - }  
620 - }  
621 - } catch (...) {  
622 - // ok, bad luck. Do recovery.  
623 - }  
624 - }  
625 -  
626 - auto rend = found_objects.rend();  
627 - for (auto it = found_objects.rbegin(); it != rend; it++) {  
628 - auto [obj, gen, token_start] = *it;  
629 - insertXrefEntry(obj, 1, token_start, gen);  
630 - check_warnings();  
631 - }  
632 - m->deleted_objects.clear();  
633 -  
634 - for (auto it = trailers.rbegin(); it != trailers.rend(); it++) {  
635 - m->file->seek(*it, SEEK_SET);  
636 - auto t = readTrailer();  
637 - if (!t.isDictionary()) {  
638 - // Oh well. It was worth a try.  
639 - } else {  
640 - if (t.hasKey("/Root")) {  
641 - m->trailer = t;  
642 - break;  
643 - }  
644 - warn(damagedPDF("trailer", *it, "recovered trailer has no /Root entry"));  
645 - }  
646 - check_warnings();  
647 - }  
648 -  
649 - if (!m->trailer) {  
650 - qpdf_offset_t max_offset{0};  
651 - size_t max_size{0};  
652 - // If there are any xref streams, take the last one to appear.  
653 - for (auto const& iter: m->xref_table) {  
654 - auto entry = iter.second;  
655 - if (entry.getType() != 1) {  
656 - continue;  
657 - }  
658 - auto oh = getObject(iter.first);  
659 - try {  
660 - if (!oh.isStreamOfType("/XRef")) {  
661 - continue;  
662 - }  
663 - } catch (std::exception&) {  
664 - continue;  
665 - }  
666 - auto offset = entry.getOffset();  
667 - auto size = oh.getDict().getKey("/Size").getUIntValueAsUInt();  
668 - if (size > max_size || (size == max_size && offset > max_offset)) {  
669 - max_offset = offset;  
670 - setTrailer(oh.getDict());  
671 - }  
672 - check_warnings();  
673 - }  
674 - if (max_offset > 0) {  
675 - try {  
676 - read_xref(max_offset);  
677 - } catch (std::exception&) {  
678 - warn(damagedPDF(  
679 - "", 0, "error decoding candidate xref stream while recovering damaged file"));  
680 - }  
681 - QTC::TC("qpdf", "QPDF recover xref stream");  
682 - }  
683 - }  
684 -  
685 - if (!m->trailer || (!m->parsed && !m->trailer.getKey("/Root").isDictionary())) {  
686 - // Try to find a Root dictionary. As a quick fix try the one with the highest object id.  
687 - QPDFObjectHandle root;  
688 - for (auto const& iter: m->obj_cache) {  
689 - try {  
690 - if (QPDFObjectHandle(iter.second.object).isDictionaryOfType("/Catalog")) {  
691 - root = iter.second.object;  
692 - }  
693 - } catch (std::exception&) {  
694 - continue;  
695 - }  
696 - }  
697 - if (root) {  
698 - if (!m->trailer) {  
699 - warn(damagedPDF(  
700 - "", 0, "unable to find trailer dictionary while recovering damaged file"));  
701 - m->trailer = QPDFObjectHandle::newDictionary();  
702 - }  
703 - m->trailer.replaceKey("/Root", root);  
704 - }  
705 - }  
706 -  
707 - if (!m->trailer) {  
708 - // We could check the last encountered object to see if it was an xref stream. If so, we  
709 - // could try to get the trailer from there. This may make it possible to recover files with  
710 - // bad startxref pointers even when they have object streams.  
711 -  
712 - throw damagedPDF("", 0, "unable to find trailer dictionary while recovering damaged file");  
713 - }  
714 - if (m->xref_table.empty()) {  
715 - // We cannot check for an empty xref table in parse because empty tables are valid when  
716 - // creating QPDF objects from JSON.  
717 - throw damagedPDF("", 0, "unable to find objects while recovering damaged file");  
718 - }  
719 - check_warnings();  
720 - if (!m->parsed) {  
721 - m->parsed = true;  
722 - getAllPages();  
723 - check_warnings();  
724 - if (m->all_pages.empty()) {  
725 - m->parsed = false;  
726 - throw damagedPDF("", 0, "unable to find any pages while recovering damaged file");  
727 - }  
728 - }  
729 - // We could iterate through the objects looking for streams and try to find objects inside of  
730 - // them, but it's probably not worth the trouble. Acrobat can't recover files with any errors  
731 - // in an xref stream, and this would be a real long shot anyway. If we wanted to do anything  
732 - // that involved looking at stream contents, we'd also have to call initializeEncryption() here.  
733 - // It's safe to call it more than once.  
734 -}  
735 -  
736 -void  
737 -QPDF::read_xref(qpdf_offset_t xref_offset)  
738 -{  
739 - std::map<int, int> free_table;  
740 - std::set<qpdf_offset_t> visited;  
741 - while (xref_offset) {  
742 - visited.insert(xref_offset);  
743 - char buf[7];  
744 - memset(buf, 0, sizeof(buf));  
745 - m->file->seek(xref_offset, SEEK_SET);  
746 - // Some files miss the mark a little with startxref. We could do a better job of searching  
747 - // in the neighborhood for something that looks like either an xref table or stream, but the  
748 - // simple heuristic of skipping whitespace can help with the xref table case and is harmless  
749 - // with the stream case.  
750 - bool done = false;  
751 - bool skipped_space = false;  
752 - while (!done) {  
753 - char ch;  
754 - if (1 == m->file->read(&ch, 1)) {  
755 - if (util::is_space(ch)) {  
756 - skipped_space = true;  
757 - } else {  
758 - m->file->unreadCh(ch);  
759 - done = true;  
760 - }  
761 - } else {  
762 - QTC::TC("qpdf", "QPDF eof skipping spaces before xref", skipped_space ? 0 : 1);  
763 - done = true;  
764 - }  
765 - }  
766 -  
767 - m->file->read(buf, sizeof(buf) - 1);  
768 - // The PDF spec says xref must be followed by a line terminator, but files exist in the wild  
769 - // where it is terminated by arbitrary whitespace.  
770 - if ((strncmp(buf, "xref", 4) == 0) && util::is_space(buf[4])) {  
771 - if (skipped_space) {  
772 - QTC::TC("qpdf", "QPDF xref skipped space");  
773 - warn(damagedPDF("", 0, "extraneous whitespace seen before xref"));  
774 - }  
775 - QTC::TC(  
776 - "qpdf",  
777 - "QPDF xref space",  
778 - ((buf[4] == '\n') ? 0  
779 - : (buf[4] == '\r') ? 1  
780 - : (buf[4] == ' ') ? 2  
781 - : 9999));  
782 - int skip = 4;  
783 - // buf is null-terminated, and util::is_space('\0') is false, so this won't overrun.  
784 - while (util::is_space(buf[skip])) {  
785 - ++skip;  
786 - }  
787 - xref_offset = read_xrefTable(xref_offset + skip);  
788 - } else {  
789 - xref_offset = read_xrefStream(xref_offset);  
790 - }  
791 - if (visited.count(xref_offset) != 0) {  
792 - QTC::TC("qpdf", "QPDF xref loop");  
793 - throw damagedPDF("", 0, "loop detected following xref tables");  
794 - }  
795 - }  
796 -  
797 - if (!m->trailer) {  
798 - throw damagedPDF("", 0, "unable to find trailer while reading xref");  
799 - }  
800 - int size = m->trailer.getKey("/Size").getIntValueAsInt();  
801 - int max_obj = 0;  
802 - if (!m->xref_table.empty()) {  
803 - max_obj = m->xref_table.rbegin()->first.getObj();  
804 - }  
805 - if (!m->deleted_objects.empty()) {  
806 - max_obj = std::max(max_obj, *(m->deleted_objects.rbegin()));  
807 - }  
808 - if ((size < 1) || (size - 1 != max_obj)) {  
809 - QTC::TC("qpdf", "QPDF xref size mismatch");  
810 - warn(damagedPDF(  
811 - "",  
812 - 0,  
813 - ("reported number of objects (" + std::to_string(size) +  
814 - ") is not one plus the highest object number (" + std::to_string(max_obj) + ")")));  
815 - }  
816 -  
817 - // We no longer need the deleted_objects table, so go ahead and clear it out to make sure we  
818 - // never depend on its being set.  
819 - m->deleted_objects.clear();  
820 -  
821 - // Make sure we keep only the highest generation for any object.  
822 - QPDFObjGen last_og{-1, 0};  
823 - for (auto const& item: m->xref_table) {  
824 - auto id = item.first.getObj();  
825 - if (id == last_og.getObj() && id > 0) {  
826 - removeObject(last_og);  
827 - }  
828 - last_og = item.first;  
829 - }  
830 -}  
831 -  
832 -bool  
833 -QPDF::parse_xrefFirst(std::string const& line, int& obj, int& num, int& bytes)  
834 -{  
835 - // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated  
836 - // buffer.  
837 - char const* p = line.c_str();  
838 - char const* start = line.c_str();  
839 -  
840 - // Skip zero or more spaces  
841 - while (util::is_space(*p)) {  
842 - ++p;  
843 - }  
844 - // Require digit  
845 - if (!util::is_digit(*p)) {  
846 - return false;  
847 - }  
848 - // Gather digits  
849 - std::string obj_str;  
850 - while (util::is_digit(*p)) {  
851 - obj_str.append(1, *p++);  
852 - }  
853 - // Require space  
854 - if (!util::is_space(*p)) {  
855 - return false;  
856 - }  
857 - // Skip spaces  
858 - while (util::is_space(*p)) {  
859 - ++p;  
860 - }  
861 - // Require digit  
862 - if (!util::is_digit(*p)) {  
863 - return false;  
864 - }  
865 - // Gather digits  
866 - std::string num_str;  
867 - while (util::is_digit(*p)) {  
868 - num_str.append(1, *p++);  
869 - }  
870 - // Skip any space including line terminators  
871 - while (util::is_space(*p)) {  
872 - ++p;  
873 - }  
874 - bytes = toI(p - start);  
875 - obj = QUtil::string_to_int(obj_str.c_str());  
876 - num = QUtil::string_to_int(num_str.c_str());  
877 - return true;  
878 -}  
879 -  
880 -bool  
881 -QPDF::read_bad_xrefEntry(qpdf_offset_t& f1, int& f2, char& type)  
882 -{  
883 - // Reposition after initial read attempt and reread.  
884 - m->file->seek(m->file->getLastOffset(), SEEK_SET);  
885 - auto line = m->file->readLine(30);  
886 -  
887 - // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated  
888 - // buffer.  
889 - char const* p = line.data();  
890 -  
891 - // Skip zero or more spaces. There aren't supposed to be any.  
892 - bool invalid = false;  
893 - while (util::is_space(*p)) {  
894 - ++p;  
895 - QTC::TC("qpdf", "QPDF ignore first space in xref entry");  
896 - invalid = true;  
897 - }  
898 - // Require digit  
899 - if (!util::is_digit(*p)) {  
900 - return false;  
901 - }  
902 - // Gather digits  
903 - std::string f1_str;  
904 - while (util::is_digit(*p)) {  
905 - f1_str.append(1, *p++);  
906 - }  
907 - // Require space  
908 - if (!util::is_space(*p)) {  
909 - return false;  
910 - }  
911 - if (util::is_space(*(p + 1))) {  
912 - QTC::TC("qpdf", "QPDF ignore first extra space in xref entry");  
913 - invalid = true;  
914 - }  
915 - // Skip spaces  
916 - while (util::is_space(*p)) {  
917 - ++p;  
918 - }  
919 - // Require digit  
920 - if (!util::is_digit(*p)) {  
921 - return false;  
922 - }  
923 - // Gather digits  
924 - std::string f2_str;  
925 - while (util::is_digit(*p)) {  
926 - f2_str.append(1, *p++);  
927 - }  
928 - // Require space  
929 - if (!util::is_space(*p)) {  
930 - return false;  
931 - }  
932 - if (util::is_space(*(p + 1))) {  
933 - QTC::TC("qpdf", "QPDF ignore second extra space in xref entry");  
934 - invalid = true;  
935 - }  
936 - // Skip spaces  
937 - while (util::is_space(*p)) {  
938 - ++p;  
939 - }  
940 - if ((*p == 'f') || (*p == 'n')) {  
941 - type = *p;  
942 - } else {  
943 - return false;  
944 - }  
945 - if ((f1_str.length() != 10) || (f2_str.length() != 5)) {  
946 - QTC::TC("qpdf", "QPDF ignore length error xref entry");  
947 - invalid = true;  
948 - }  
949 -  
950 - if (invalid) {  
951 - warn(damagedPDF("xref table", "accepting invalid xref table entry"));  
952 - }  
953 -  
954 - f1 = QUtil::string_to_ll(f1_str.c_str());  
955 - f2 = QUtil::string_to_int(f2_str.c_str());  
956 -  
957 - return true;  
958 -}  
959 -  
960 -// Optimistically read and parse xref entry. If entry is bad, call read_bad_xrefEntry and return  
961 -// result.  
962 -bool  
963 -QPDF::read_xrefEntry(qpdf_offset_t& f1, int& f2, char& type)  
964 -{  
965 - std::array<char, 21> line;  
966 - if (m->file->read(line.data(), 20) != 20) {  
967 - // C++20: [[unlikely]]  
968 - return false;  
969 - }  
970 - line[20] = '\0';  
971 - char const* p = line.data();  
972 -  
973 - int f1_len = 0;  
974 - int f2_len = 0;  
975 -  
976 - // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated  
977 - // buffer.  
978 -  
979 - // Gather f1 digits. NB No risk of overflow as 9'999'999'999 < max long long.  
980 - while (*p == '0') {  
981 - ++f1_len;  
982 - ++p;  
983 - }  
984 - while (util::is_digit(*p) && f1_len++ < 10) {  
985 - f1 *= 10;  
986 - f1 += *p++ - '0';  
987 - }  
988 - // Require space  
989 - if (!util::is_space(*p++)) {  
990 - // Entry doesn't start with space or digit.  
991 - // C++20: [[unlikely]]  
992 - return false;  
993 - }  
994 - // Gather digits. NB No risk of overflow as 99'999 < max int.  
995 - while (*p == '0') {  
996 - ++f2_len;  
997 - ++p;  
998 - }  
999 - while (util::is_digit(*p) && f2_len++ < 5) {  
1000 - f2 *= 10;  
1001 - f2 += static_cast<int>(*p++ - '0');  
1002 - }  
1003 - if (util::is_space(*p++) && (*p == 'f' || *p == 'n')) {  
1004 - // C++20: [[likely]]  
1005 - type = *p;  
1006 - // No test for valid line[19].  
1007 - if (*(++p) && *(++p) && (*p == '\n' || *p == '\r') && f1_len == 10 && f2_len == 5) {  
1008 - // C++20: [[likely]]  
1009 - return true;  
1010 - }  
1011 - }  
1012 - return read_bad_xrefEntry(f1, f2, type);  
1013 -}  
1014 -  
1015 -// Read a single cross-reference table section and associated trailer.  
1016 -qpdf_offset_t  
1017 -QPDF::read_xrefTable(qpdf_offset_t xref_offset)  
1018 -{  
1019 - m->file->seek(xref_offset, SEEK_SET);  
1020 - std::string line;  
1021 - while (true) {  
1022 - line.assign(50, '\0');  
1023 - m->file->read(line.data(), line.size());  
1024 - int obj = 0;  
1025 - int num = 0;  
1026 - int bytes = 0;  
1027 - if (!parse_xrefFirst(line, obj, num, bytes)) {  
1028 - QTC::TC("qpdf", "QPDF invalid xref");  
1029 - throw damagedPDF("xref table", "xref syntax invalid");  
1030 - }  
1031 - m->file->seek(m->file->getLastOffset() + bytes, SEEK_SET);  
1032 - for (qpdf_offset_t i = obj; i - num < obj; ++i) {  
1033 - if (i == 0) {  
1034 - // This is needed by checkLinearization()  
1035 - m->first_xref_item_offset = m->file->tell();  
1036 - }  
1037 - // For xref_table, these will always be small enough to be ints  
1038 - qpdf_offset_t f1 = 0;  
1039 - int f2 = 0;  
1040 - char type = '\0';  
1041 - if (!read_xrefEntry(f1, f2, type)) {  
1042 - QTC::TC("qpdf", "QPDF invalid xref entry");  
1043 - throw damagedPDF(  
1044 - "xref table", "invalid xref entry (obj=" + std::to_string(i) + ")");  
1045 - }  
1046 - if (type == 'f') {  
1047 - insertFreeXrefEntry(QPDFObjGen(toI(i), f2));  
1048 - } else {  
1049 - insertXrefEntry(toI(i), 1, f1, f2);  
1050 - }  
1051 - }  
1052 - qpdf_offset_t pos = m->file->tell();  
1053 - if (readToken(*m->file).isWord("trailer")) {  
1054 - break;  
1055 - } else {  
1056 - m->file->seek(pos, SEEK_SET);  
1057 - }  
1058 - }  
1059 -  
1060 - // Set offset to previous xref table if any  
1061 - QPDFObjectHandle cur_trailer = readTrailer();  
1062 - if (!cur_trailer.isDictionary()) {  
1063 - QTC::TC("qpdf", "QPDF missing trailer");  
1064 - throw damagedPDF("", "expected trailer dictionary");  
1065 - }  
1066 -  
1067 - if (!m->trailer) {  
1068 - setTrailer(cur_trailer);  
1069 -  
1070 - if (!m->trailer.hasKey("/Size")) {  
1071 - QTC::TC("qpdf", "QPDF trailer lacks size");  
1072 - throw damagedPDF("trailer", "trailer dictionary lacks /Size key");  
1073 - }  
1074 - if (!m->trailer.getKey("/Size").isInteger()) {  
1075 - QTC::TC("qpdf", "QPDF trailer size not integer");  
1076 - throw damagedPDF("trailer", "/Size key in trailer dictionary is not an integer");  
1077 - }  
1078 - }  
1079 -  
1080 - if (cur_trailer.hasKey("/XRefStm")) {  
1081 - if (m->ignore_xref_streams) {  
1082 - QTC::TC("qpdf", "QPDF ignoring XRefStm in trailer");  
1083 - } else {  
1084 - if (cur_trailer.getKey("/XRefStm").isInteger()) {  
1085 - // Read the xref stream but disregard any return value -- we'll use our trailer's  
1086 - // /Prev key instead of the xref stream's.  
1087 - (void)read_xrefStream(cur_trailer.getKey("/XRefStm").getIntValue());  
1088 - } else {  
1089 - throw damagedPDF("xref stream", xref_offset, "invalid /XRefStm");  
1090 - }  
1091 - }  
1092 - }  
1093 -  
1094 - if (cur_trailer.hasKey("/Prev")) {  
1095 - if (!cur_trailer.getKey("/Prev").isInteger()) {  
1096 - QTC::TC("qpdf", "QPDF trailer prev not integer");  
1097 - throw damagedPDF("trailer", "/Prev key in trailer dictionary is not an integer");  
1098 - }  
1099 - QTC::TC("qpdf", "QPDF prev key in trailer dictionary");  
1100 - return cur_trailer.getKey("/Prev").getIntValue();  
1101 - }  
1102 -  
1103 - return 0;  
1104 -}  
1105 -  
1106 -// Read a single cross-reference stream.  
1107 -qpdf_offset_t  
1108 -QPDF::read_xrefStream(qpdf_offset_t xref_offset)  
1109 -{  
1110 - if (!m->ignore_xref_streams) {  
1111 - QPDFObjGen x_og;  
1112 - QPDFObjectHandle xref_obj;  
1113 - try {  
1114 - xref_obj =  
1115 - readObjectAtOffset(false, xref_offset, "xref stream", QPDFObjGen(0, 0), x_og, true);  
1116 - } catch (QPDFExc&) {  
1117 - // ignore -- report error below  
1118 - }  
1119 - if (xref_obj.isStreamOfType("/XRef")) {  
1120 - QTC::TC("qpdf", "QPDF found xref stream");  
1121 - return processXRefStream(xref_offset, xref_obj);  
1122 - }  
1123 - }  
1124 -  
1125 - QTC::TC("qpdf", "QPDF can't find xref");  
1126 - throw damagedPDF("", xref_offset, "xref not found");  
1127 - return 0; // unreachable  
1128 -}  
1129 -  
1130 -// Return the entry size of the xref stream and the processed W array.  
1131 -std::pair<int, std::array<int, 3>>  
1132 -QPDF::processXRefW(QPDFObjectHandle& dict, std::function<QPDFExc(std::string_view)> damaged)  
1133 -{  
1134 - auto W_obj = dict.getKey("/W");  
1135 - if (!(W_obj.isArray() && (W_obj.getArrayNItems() >= 3) && W_obj.getArrayItem(0).isInteger() &&  
1136 - W_obj.getArrayItem(1).isInteger() && W_obj.getArrayItem(2).isInteger())) {  
1137 - throw damaged("Cross-reference stream does not have a proper /W key");  
1138 - }  
1139 -  
1140 - std::array<int, 3> W;  
1141 - int entry_size = 0;  
1142 - auto w_vector = W_obj.getArrayAsVector();  
1143 - int max_bytes = sizeof(qpdf_offset_t);  
1144 - for (size_t i = 0; i < 3; ++i) {  
1145 - W[i] = w_vector[i].getIntValueAsInt();  
1146 - if (W[i] > max_bytes) {  
1147 - throw damaged("Cross-reference stream's /W contains impossibly large values");  
1148 - }  
1149 - if (W[i] < 0) {  
1150 - throw damaged("Cross-reference stream's /W contains negative values");  
1151 - }  
1152 - entry_size += W[i];  
1153 - }  
1154 - if (entry_size == 0) {  
1155 - throw damaged("Cross-reference stream's /W indicates entry size of 0");  
1156 - }  
1157 - return {entry_size, W};  
1158 -}  
1159 -  
1160 -// Validate Size key and return the maximum number of entries that the xref stream can contain.  
1161 -int  
1162 -QPDF::processXRefSize(  
1163 - QPDFObjectHandle& dict, int entry_size, std::function<QPDFExc(std::string_view)> damaged)  
1164 -{  
1165 - // Number of entries is limited by the highest possible object id and stream size.  
1166 - auto max_num_entries = std::numeric_limits<int>::max();  
1167 - if (max_num_entries > (std::numeric_limits<qpdf_offset_t>::max() / entry_size)) {  
1168 - max_num_entries = toI(std::numeric_limits<qpdf_offset_t>::max() / entry_size);  
1169 - }  
1170 -  
1171 - auto Size_obj = dict.getKey("/Size");  
1172 - long long size;  
1173 - if (!dict.getKey("/Size").getValueAsInt(size)) {  
1174 - throw damaged("Cross-reference stream does not have a proper /Size key");  
1175 - } else if (size < 0) {  
1176 - throw damaged("Cross-reference stream has a negative /Size key");  
1177 - } else if (size >= max_num_entries) {  
1178 - throw damaged("Cross-reference stream has an impossibly large /Size key");  
1179 - }  
1180 - // We are not validating that Size <= (Size key of parent xref / trailer).  
1181 - return max_num_entries;  
1182 -}  
1183 -  
1184 -// Return the number of entries of the xref stream and the processed Index array.  
1185 -std::pair<int, std::vector<std::pair<int, int>>>  
1186 -QPDF::processXRefIndex(  
1187 - QPDFObjectHandle& dict, int max_num_entries, std::function<QPDFExc(std::string_view)> damaged)  
1188 -{  
1189 - auto size = dict.getKey("/Size").getIntValueAsInt();  
1190 - auto Index_obj = dict.getKey("/Index");  
1191 -  
1192 - if (Index_obj.isArray()) {  
1193 - std::vector<std::pair<int, int>> indx;  
1194 - int num_entries = 0;  
1195 - auto index_vec = Index_obj.getArrayAsVector();  
1196 - if ((index_vec.size() % 2) || index_vec.size() < 2) {  
1197 - throw damaged("Cross-reference stream's /Index has an invalid number of values");  
1198 - }  
1199 -  
1200 - int i = 0;  
1201 - long long first = 0;  
1202 - for (auto& val: index_vec) {  
1203 - if (val.isInteger()) {  
1204 - if (i % 2) {  
1205 - auto count = val.getIntValue();  
1206 - if (count <= 0) {  
1207 - throw damaged(  
1208 - "Cross-reference stream section claims to contain " +  
1209 - std::to_string(count) + " entries");  
1210 - }  
1211 - // We are guarding against the possibility of num_entries * entry_size  
1212 - // overflowing. We are not checking that entries are in ascending order as  
1213 - // required by the spec, which probably should generate a warning. We are also  
1214 - // not checking that for each subsection first object number + number of entries  
1215 - // <= /Size. The spec requires us to ignore object number > /Size.  
1216 - if (first > (max_num_entries - count) ||  
1217 - count > (max_num_entries - num_entries)) {  
1218 - throw damaged(  
1219 - "Cross-reference stream claims to contain too many entries: " +  
1220 - std::to_string(first) + " " + std::to_string(max_num_entries) + " " +  
1221 - std::to_string(num_entries));  
1222 - }  
1223 - indx.emplace_back(static_cast<int>(first), static_cast<int>(count));  
1224 - num_entries += static_cast<int>(count);  
1225 - } else {  
1226 - first = val.getIntValue();  
1227 - if (first < 0) {  
1228 - throw damaged(  
1229 - "Cross-reference stream's /Index contains a negative object id");  
1230 - } else if (first > max_num_entries) {  
1231 - throw damaged(  
1232 - "Cross-reference stream's /Index contains an impossibly "  
1233 - "large object id");  
1234 - }  
1235 - }  
1236 - } else {  
1237 - throw damaged(  
1238 - "Cross-reference stream's /Index's item " + std::to_string(i) +  
1239 - " is not an integer");  
1240 - }  
1241 - i++;  
1242 - }  
1243 - QTC::TC("qpdf", "QPDF xref /Index is array", index_vec.size() == 2 ? 0 : 1);  
1244 - return {num_entries, indx};  
1245 - } else if (Index_obj.isNull()) {  
1246 - QTC::TC("qpdf", "QPDF xref /Index is null");  
1247 - return {size, {{0, size}}};  
1248 - } else {  
1249 - throw damaged("Cross-reference stream does not have a proper /Index key");  
1250 - }  
1251 -}  
1252 -  
1253 -qpdf_offset_t  
1254 -QPDF::processXRefStream(qpdf_offset_t xref_offset, QPDFObjectHandle& xref_obj)  
1255 -{  
1256 - auto damaged = [this, xref_offset](std::string_view msg) -> QPDFExc {  
1257 - return damagedPDF("xref stream", xref_offset, msg.data());  
1258 - };  
1259 -  
1260 - auto dict = xref_obj.getDict();  
1261 -  
1262 - auto [entry_size, W] = processXRefW(dict, damaged);  
1263 - int max_num_entries = processXRefSize(dict, entry_size, damaged);  
1264 - auto [num_entries, indx] = processXRefIndex(dict, max_num_entries, damaged);  
1265 -  
1266 - std::shared_ptr<Buffer> bp = xref_obj.getStreamData(qpdf_dl_specialized);  
1267 - size_t actual_size = bp->getSize();  
1268 - auto expected_size = toS(entry_size) * toS(num_entries);  
1269 -  
1270 - if (expected_size != actual_size) {  
1271 - QPDFExc x = damaged(  
1272 - "Cross-reference stream data has the wrong size; expected = " +  
1273 - std::to_string(expected_size) + "; actual = " + std::to_string(actual_size));  
1274 - if (expected_size > actual_size) {  
1275 - throw x;  
1276 - } else {  
1277 - warn(x);  
1278 - }  
1279 - }  
1280 -  
1281 - bool saw_first_compressed_object = false;  
1282 -  
1283 - // Actual size vs. expected size check above ensures that we will not overflow any buffers here.  
1284 - // We know that entry_size * num_entries is less or equal to the size of the buffer.  
1285 - auto p = bp->getBuffer();  
1286 - for (auto [obj, sec_entries]: indx) {  
1287 - // Process a subsection.  
1288 - for (int i = 0; i < sec_entries; ++i) {  
1289 - // Read this entry  
1290 - std::array<qpdf_offset_t, 3> fields{};  
1291 - if (W[0] == 0) {  
1292 - QTC::TC("qpdf", "QPDF default for xref stream field 0");  
1293 - fields[0] = 1;  
1294 - }  
1295 - for (size_t j = 0; j < 3; ++j) {  
1296 - for (int k = 0; k < W[j]; ++k) {  
1297 - fields[j] <<= 8;  
1298 - fields[j] |= *p++;  
1299 - }  
1300 - }  
1301 -  
1302 - // Get the generation number. The generation number is 0 unless this is an uncompressed  
1303 - // object record, in which case the generation number appears as the third field.  
1304 - if (saw_first_compressed_object) {  
1305 - if (fields[0] != 2) {  
1306 - m->uncompressed_after_compressed = true;  
1307 - }  
1308 - } else if (fields[0] == 2) {  
1309 - saw_first_compressed_object = true;  
1310 - }  
1311 - if (obj == 0) {  
1312 - // This is needed by checkLinearization()  
1313 - m->first_xref_item_offset = xref_offset;  
1314 - } else if (fields[0] == 0) {  
1315 - // Ignore fields[2], which we don't care about in this case. This works around the  
1316 - // issue of some PDF files that put invalid values, like -1, here for deleted  
1317 - // objects.  
1318 - insertFreeXrefEntry(QPDFObjGen(obj, 0));  
1319 - } else {  
1320 - insertXrefEntry(obj, toI(fields[0]), fields[1], toI(fields[2]));  
1321 - }  
1322 - ++obj;  
1323 - }  
1324 - }  
1325 -  
1326 - if (!m->trailer) {  
1327 - setTrailer(dict);  
1328 - }  
1329 -  
1330 - if (dict.hasKey("/Prev")) {  
1331 - if (!dict.getKey("/Prev").isInteger()) {  
1332 - throw damagedPDF(  
1333 - "xref stream", "/Prev key in xref stream dictionary is not an integer");  
1334 - }  
1335 - QTC::TC("qpdf", "QPDF prev key in xref stream dictionary");  
1336 - return dict.getKey("/Prev").getIntValue();  
1337 - } else {  
1338 - return 0;  
1339 - }  
1340 -}  
1341 -  
1342 -void  
1343 -QPDF::insertXrefEntry(int obj, int f0, qpdf_offset_t f1, int f2)  
1344 -{  
1345 - // Populate the xref table in such a way that the first reference to an object that we see,  
1346 - // which is the one in the latest xref table in which it appears, is the one that gets stored.  
1347 - // This works because we are reading more recent appends before older ones.  
1348 -  
1349 - // If there is already an entry for this object and generation in the table, it means that a  
1350 - // later xref table has registered this object. Disregard this one.  
1351 - int new_gen = f0 == 2 ? 0 : f2;  
1352 -  
1353 - if (!(obj > 0 && obj <= m->xref_table_max_id && 0 <= f2 && new_gen < 65535)) {  
1354 - // We are ignoring invalid objgens. Most will arrive here from xref reconstruction. There  
1355 - // is probably no point having another warning but we could count invalid items in order to  
1356 - // decide when to give up.  
1357 - QTC::TC("qpdf", "QPDF xref overwrite invalid objgen");  
1358 - // ignore impossibly large object ids or object ids > Size.  
1359 - return;  
1360 - }  
1361 -  
1362 - if (m->deleted_objects.count(obj)) {  
1363 - QTC::TC("qpdf", "QPDF xref deleted object");  
1364 - return;  
1365 - }  
1366 -  
1367 - if (f0 == 2 && static_cast<int>(f1) == obj) {  
1368 - warn(damagedPDF("xref stream", "self-referential object stream " + std::to_string(obj)));  
1369 - return;  
1370 - }  
1371 -  
1372 - auto [iter, created] = m->xref_table.try_emplace(QPDFObjGen(obj, (f0 == 2 ? 0 : f2)));  
1373 - if (!created) {  
1374 - QTC::TC("qpdf", "QPDF xref reused object");  
1375 - return;  
1376 - }  
1377 -  
1378 - switch (f0) {  
1379 - case 1:  
1380 - // f2 is generation  
1381 - QTC::TC("qpdf", "QPDF xref gen > 0", ((f2 > 0) ? 1 : 0));  
1382 - iter->second = QPDFXRefEntry(f1);  
1383 - break;  
1384 -  
1385 - case 2:  
1386 - iter->second = QPDFXRefEntry(toI(f1), f2);  
1387 - break;  
1388 -  
1389 - default:  
1390 - throw damagedPDF("xref stream", "unknown xref stream entry type " + std::to_string(f0));  
1391 - break;  
1392 - }  
1393 -}  
1394 -  
1395 -void  
1396 -QPDF::insertFreeXrefEntry(QPDFObjGen og)  
1397 -{  
1398 - if (!m->xref_table.count(og)) {  
1399 - m->deleted_objects.insert(og.getObj());  
1400 - }  
1401 -}  
1402 -  
1403 -void  
1404 -QPDF::showXRefTable()  
1405 -{  
1406 - auto& cout = *m->log->getInfo();  
1407 - for (auto const& iter: m->xref_table) {  
1408 - QPDFObjGen const& og = iter.first;  
1409 - QPDFXRefEntry const& entry = iter.second;  
1410 - cout << og.unparse('/') << ": ";  
1411 - switch (entry.getType()) {  
1412 - case 1:  
1413 - cout << "uncompressed; offset = " << entry.getOffset();  
1414 - break;  
1415 -  
1416 - case 2:  
1417 - *m->log->getInfo() << "compressed; stream = " << entry.getObjStreamNumber()  
1418 - << ", index = " << entry.getObjStreamIndex();  
1419 - break;  
1420 -  
1421 - default:  
1422 - throw std::logic_error("unknown cross-reference table type while showing xref_table");  
1423 - break;  
1424 - }  
1425 - m->log->info("\n");  
1426 - }  
1427 -}  
1428 -  
1429 -// Resolve all objects in the xref table. If this triggers a xref table reconstruction abort and  
1430 -// return false. Otherwise return true.  
1431 -bool  
1432 -QPDF::resolveXRefTable()  
1433 -{  
1434 - bool may_change = !m->reconstructed_xref;  
1435 - for (auto& iter: m->xref_table) {  
1436 - if (isUnresolved(iter.first)) {  
1437 - resolve(iter.first);  
1438 - if (may_change && m->reconstructed_xref) {  
1439 - return false;  
1440 - }  
1441 - }  
1442 - }  
1443 - return true;  
1444 -}  
1445 -  
1446 -// Ensure all objects in the pdf file, including those in indirect references, appear in the object  
1447 -// cache.  
1448 -void  
1449 -QPDF::fixDanglingReferences(bool force)  
1450 -{  
1451 - if (m->fixed_dangling_refs) {  
1452 - return;  
1453 - }  
1454 - if (!resolveXRefTable()) {  
1455 - QTC::TC("qpdf", "QPDF fix dangling triggered xref reconstruction");  
1456 - resolveXRefTable();  
1457 - }  
1458 - m->fixed_dangling_refs = true;  
1459 -}  
1460 -  
1461 -size_t  
1462 -QPDF::getObjectCount()  
1463 -{  
1464 - // This method returns the next available indirect object number. makeIndirectObject uses it for  
1465 - // this purpose. After fixDanglingReferences is called, all objects in the xref table will also  
1466 - // be in obj_cache.  
1467 - fixDanglingReferences();  
1468 - QPDFObjGen og;  
1469 - if (!m->obj_cache.empty()) {  
1470 - og = (*(m->obj_cache.rbegin())).first;  
1471 - }  
1472 - return toS(og.getObj());  
1473 -}  
1474 -  
1475 -std::vector<QPDFObjectHandle>  
1476 -QPDF::getAllObjects()  
1477 -{  
1478 - // After fixDanglingReferences is called, all objects are in the object cache.  
1479 - fixDanglingReferences();  
1480 - std::vector<QPDFObjectHandle> result;  
1481 - for (auto const& iter: m->obj_cache) {  
1482 - result.push_back(newIndirect(iter.first, iter.second.object));  
1483 - }  
1484 - return result;  
1485 -}  
1486 -  
1487 -void  
1488 -QPDF::setLastObjectDescription(std::string const& description, QPDFObjGen og)  
1489 -{  
1490 - m->last_object_description.clear();  
1491 - if (!description.empty()) {  
1492 - m->last_object_description += description;  
1493 - if (og.isIndirect()) {  
1494 - m->last_object_description += ": ";  
1495 - }  
1496 - }  
1497 - if (og.isIndirect()) {  
1498 - m->last_object_description += "object " + og.unparse(' ');  
1499 - }  
1500 -}  
1501 -  
1502 -QPDFObjectHandle  
1503 -QPDF::readTrailer()  
1504 -{  
1505 - qpdf_offset_t offset = m->file->tell();  
1506 - bool empty = false;  
1507 - auto object =  
1508 - QPDFParser(*m->file, "trailer", m->tokenizer, nullptr, this, true).parse(empty, false);  
1509 - if (empty) {  
1510 - // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in  
1511 - // actual PDF files and Adobe Reader appears to ignore them.  
1512 - warn(damagedPDF("trailer", "empty object treated as null"));  
1513 - } else if (object.isDictionary() && readToken(*m->file).isWord("stream")) {  
1514 - warn(damagedPDF("trailer", m->file->tell(), "stream keyword found in trailer"));  
1515 - }  
1516 - // Override last_offset so that it points to the beginning of the object we just read  
1517 - m->file->setLastOffset(offset);  
1518 - return object;  
1519 -}  
1520 -  
1521 -QPDFObjectHandle  
1522 -QPDF::readObject(std::string const& description, QPDFObjGen og)  
1523 -{  
1524 - setLastObjectDescription(description, og);  
1525 - qpdf_offset_t offset = m->file->tell();  
1526 - bool empty = false;  
1527 -  
1528 - StringDecrypter decrypter{this, og};  
1529 - StringDecrypter* decrypter_ptr = m->encp->encrypted ? &decrypter : nullptr;  
1530 - auto object =  
1531 - QPDFParser(*m->file, m->last_object_description, m->tokenizer, decrypter_ptr, this, true)  
1532 - .parse(empty, false);  
1533 - if (empty) {  
1534 - // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in  
1535 - // actual PDF files and Adobe Reader appears to ignore them.  
1536 - warn(damagedPDF(*m->file, m->file->getLastOffset(), "empty object treated as null"));  
1537 - return object;  
1538 - }  
1539 - auto token = readToken(*m->file);  
1540 - if (object.isDictionary() && token.isWord("stream")) {  
1541 - readStream(object, og, offset);  
1542 - token = readToken(*m->file);  
1543 - }  
1544 - if (!token.isWord("endobj")) {  
1545 - QTC::TC("qpdf", "QPDF err expected endobj");  
1546 - warn(damagedPDF("expected endobj"));  
1547 - }  
1548 - return object;  
1549 -}  
1550 -  
1551 -// After reading stream dictionary and stream keyword, read rest of stream.  
1552 void 416 void
1553 -QPDF::readStream(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset)  
1554 -{  
1555 - validateStreamLineEnd(object, og, offset);  
1556 -  
1557 - // Must get offset before accessing any additional objects since resolving a previously  
1558 - // unresolved indirect object will change file position.  
1559 - qpdf_offset_t stream_offset = m->file->tell();  
1560 - size_t length = 0;  
1561 -  
1562 - try {  
1563 - auto length_obj = object.getKey("/Length");  
1564 -  
1565 - if (!length_obj.isInteger()) {  
1566 - if (length_obj.isNull()) {  
1567 - QTC::TC("qpdf", "QPDF stream without length");  
1568 - throw damagedPDF(offset, "stream dictionary lacks /Length key");  
1569 - }  
1570 - QTC::TC("qpdf", "QPDF stream length not integer");  
1571 - throw damagedPDF(offset, "/Length key in stream dictionary is not an integer");  
1572 - }  
1573 -  
1574 - length = toS(length_obj.getUIntValue());  
1575 - // Seek in two steps to avoid potential integer overflow  
1576 - m->file->seek(stream_offset, SEEK_SET);  
1577 - m->file->seek(toO(length), SEEK_CUR);  
1578 - if (!readToken(*m->file).isWord("endstream")) {  
1579 - QTC::TC("qpdf", "QPDF missing endstream");  
1580 - throw damagedPDF("expected endstream");  
1581 - }  
1582 - } catch (QPDFExc& e) {  
1583 - if (m->attempt_recovery) {  
1584 - warn(e);  
1585 - length = recoverStreamLength(m->file, og, stream_offset);  
1586 - } else {  
1587 - throw;  
1588 - }  
1589 - }  
1590 - object = QPDFObjectHandle(qpdf::Stream(*this, og, object, stream_offset, length));  
1591 -}  
1592 -  
1593 -void  
1594 -QPDF::validateStreamLineEnd(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset)  
1595 -{  
1596 - // The PDF specification states that the word "stream" should be followed by either a carriage  
1597 - // return and a newline or by a newline alone. It specifically disallowed following it by a  
1598 - // carriage return alone since, in that case, there would be no way to tell whether the NL in a  
1599 - // CR NL sequence was part of the stream data. However, some readers, including Adobe reader,  
1600 - // accept a carriage return by itself when followed by a non-newline character, so that's what  
1601 - // we do here. We have also seen files that have extraneous whitespace between the stream  
1602 - // keyword and the newline.  
1603 - while (true) {  
1604 - char ch;  
1605 - if (m->file->read(&ch, 1) == 0) {  
1606 - // A premature EOF here will result in some other problem that will get reported at  
1607 - // another time.  
1608 - return;  
1609 - }  
1610 - if (ch == '\n') {  
1611 - // ready to read stream data  
1612 - QTC::TC("qpdf", "QPDF stream with NL only");  
1613 - return;  
1614 - }  
1615 - if (ch == '\r') {  
1616 - // Read another character  
1617 - if (m->file->read(&ch, 1) != 0) {  
1618 - if (ch == '\n') {  
1619 - // Ready to read stream data  
1620 - QTC::TC("qpdf", "QPDF stream with CRNL");  
1621 - } else {  
1622 - // Treat the \r by itself as the whitespace after endstream and start reading  
1623 - // stream data in spite of not having seen a newline.  
1624 - QTC::TC("qpdf", "QPDF stream with CR only");  
1625 - m->file->unreadCh(ch);  
1626 - warn(damagedPDF(  
1627 - m->file->tell(), "stream keyword followed by carriage return only"));  
1628 - }  
1629 - }  
1630 - return;  
1631 - }  
1632 - if (!util::is_space(ch)) {  
1633 - QTC::TC("qpdf", "QPDF stream without newline");  
1634 - m->file->unreadCh(ch);  
1635 - warn(damagedPDF(  
1636 - m->file->tell(), "stream keyword not followed by proper line terminator"));  
1637 - return;  
1638 - }  
1639 - warn(damagedPDF(m->file->tell(), "stream keyword followed by extraneous whitespace"));  
1640 - }  
1641 -}  
1642 -  
1643 -QPDFObjectHandle  
1644 -QPDF::readObjectInStream(std::shared_ptr<InputSource>& input, int obj)  
1645 -{  
1646 - m->last_object_description.erase(7); // last_object_description starts with "object "  
1647 - m->last_object_description += std::to_string(obj);  
1648 - m->last_object_description += " 0";  
1649 -  
1650 - bool empty = false;  
1651 - auto object = QPDFParser(*input, m->last_object_description, m->tokenizer, nullptr, this, true)  
1652 - .parse(empty, false);  
1653 - if (empty) {  
1654 - // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in  
1655 - // actual PDF files and Adobe Reader appears to ignore them.  
1656 - warn(damagedPDF(*input, input->getLastOffset(), "empty object treated as null"));  
1657 - }  
1658 - return object;  
1659 -}  
1660 -  
1661 -bool  
1662 -QPDF::findEndstream()  
1663 -{  
1664 - // Find endstream or endobj. Position the input at that token.  
1665 - auto t = readToken(*m->file, 20);  
1666 - if (t.isWord("endobj") || t.isWord("endstream")) {  
1667 - m->file->seek(m->file->getLastOffset(), SEEK_SET);  
1668 - return true;  
1669 - }  
1670 - return false;  
1671 -}  
1672 -  
1673 -size_t  
1674 -QPDF::recoverStreamLength(  
1675 - std::shared_ptr<InputSource> input, QPDFObjGen og, qpdf_offset_t stream_offset)  
1676 -{  
1677 - // Try to reconstruct stream length by looking for endstream or endobj  
1678 - warn(damagedPDF(*input, stream_offset, "attempting to recover stream length"));  
1679 -  
1680 - PatternFinder ef(*this, &QPDF::findEndstream);  
1681 - size_t length = 0;  
1682 - if (m->file->findFirst("end", stream_offset, 0, ef)) {  
1683 - length = toS(m->file->tell() - stream_offset);  
1684 - // Reread endstream but, if it was endobj, don't skip that.  
1685 - QPDFTokenizer::Token t = readToken(*m->file);  
1686 - if (t.getValue() == "endobj") {  
1687 - m->file->seek(m->file->getLastOffset(), SEEK_SET);  
1688 - }  
1689 - }  
1690 -  
1691 - if (length) {  
1692 - auto end = stream_offset + toO(length);  
1693 - qpdf_offset_t found_offset = 0;  
1694 - QPDFObjGen found_og;  
1695 -  
1696 - // Make sure this is inside this object  
1697 - for (auto const& [current_og, entry]: m->xref_table) {  
1698 - if (entry.getType() == 1) {  
1699 - qpdf_offset_t obj_offset = entry.getOffset();  
1700 - if (found_offset < obj_offset && obj_offset < end) {  
1701 - found_offset = obj_offset;  
1702 - found_og = current_og;  
1703 - }  
1704 - }  
1705 - }  
1706 - if (!found_offset || found_og == og) {  
1707 - // If we are trying to recover an XRef stream the xref table will not contain and  
1708 - // won't contain any entries, therefore we cannot check the found length. Otherwise we  
1709 - // found endstream\nendobj within the space allowed for this object, so we're probably  
1710 - // in good shape.  
1711 - } else {  
1712 - QTC::TC("qpdf", "QPDF found wrong endstream in recovery");  
1713 - length = 0;  
1714 - }  
1715 - }  
1716 -  
1717 - if (length == 0) {  
1718 - warn(damagedPDF(  
1719 - *input, stream_offset, "unable to recover stream data; treating stream as empty"));  
1720 - } else {  
1721 - warn(damagedPDF(  
1722 - *input, stream_offset, "recovered stream length: " + std::to_string(length)));  
1723 - }  
1724 -  
1725 - QTC::TC("qpdf", "QPDF recovered stream length");  
1726 - return length;  
1727 -}  
1728 -  
1729 -QPDFTokenizer::Token  
1730 -QPDF::readToken(InputSource& input, size_t max_len)  
1731 -{  
1732 - return m->tokenizer.readToken(input, m->last_object_description, true, max_len);  
1733 -}  
1734 -  
1735 -QPDFObjectHandle  
1736 -QPDF::readObjectAtOffset(  
1737 - bool try_recovery,  
1738 - qpdf_offset_t offset,  
1739 - std::string const& description,  
1740 - QPDFObjGen exp_og,  
1741 - QPDFObjGen& og,  
1742 - bool skip_cache_if_in_xref)  
1743 -{  
1744 - bool check_og = true;  
1745 - if (exp_og.getObj() == 0) {  
1746 - // This method uses an expect object ID of 0 to indicate that we don't know or don't care  
1747 - // what the actual object ID is at this offset. This is true when we read the xref stream  
1748 - // and linearization hint streams. In this case, we don't verify the expect object  
1749 - // ID/generation against what was read from the file. There is also no reason to attempt  
1750 - // xref recovery if we get a failure in this case since the read attempt was not triggered  
1751 - // by an xref lookup.  
1752 - check_og = false;  
1753 - try_recovery = false;  
1754 - }  
1755 - setLastObjectDescription(description, exp_og);  
1756 -  
1757 - if (!m->attempt_recovery) {  
1758 - try_recovery = false;  
1759 - }  
1760 -  
1761 - // Special case: if offset is 0, just return null. Some PDF writers, in particular  
1762 - // "Mac OS X 10.7.5 Quartz PDFContext", may store deleted objects in the xref table as  
1763 - // "0000000000 00000 n", which is not correct, but it won't hurt anything for us to ignore  
1764 - // these.  
1765 - if (offset == 0) {  
1766 - QTC::TC("qpdf", "QPDF bogus 0 offset", 0);  
1767 - warn(damagedPDF(0, "object has offset 0"));  
1768 - return QPDFObjectHandle::newNull();  
1769 - }  
1770 -  
1771 - m->file->seek(offset, SEEK_SET);  
1772 - try {  
1773 - QPDFTokenizer::Token tobjid = readToken(*m->file);  
1774 - bool objidok = tobjid.isInteger();  
1775 - QTC::TC("qpdf", "QPDF check objid", objidok ? 1 : 0);  
1776 - if (!objidok) {  
1777 - QTC::TC("qpdf", "QPDF expected n n obj");  
1778 - throw damagedPDF(offset, "expected n n obj");  
1779 - }  
1780 - QPDFTokenizer::Token tgen = readToken(*m->file);  
1781 - bool genok = tgen.isInteger();  
1782 - QTC::TC("qpdf", "QPDF check generation", genok ? 1 : 0);  
1783 - if (!genok) {  
1784 - throw damagedPDF(offset, "expected n n obj");  
1785 - }  
1786 - QPDFTokenizer::Token tobj = readToken(*m->file);  
1787 -  
1788 - bool objok = tobj.isWord("obj");  
1789 - QTC::TC("qpdf", "QPDF check obj", objok ? 1 : 0);  
1790 -  
1791 - if (!objok) {  
1792 - throw damagedPDF(offset, "expected n n obj");  
1793 - }  
1794 - int objid = QUtil::string_to_int(tobjid.getValue().c_str());  
1795 - int generation = QUtil::string_to_int(tgen.getValue().c_str());  
1796 - og = QPDFObjGen(objid, generation);  
1797 - if (objid == 0) {  
1798 - QTC::TC("qpdf", "QPDF object id 0");  
1799 - throw damagedPDF(offset, "object with ID 0");  
1800 - }  
1801 - if (check_og && (exp_og != og)) {  
1802 - QTC::TC("qpdf", "QPDF err wrong objid/generation");  
1803 - QPDFExc e = damagedPDF(offset, "expected " + exp_og.unparse(' ') + " obj");  
1804 - if (try_recovery) {  
1805 - // Will be retried below  
1806 - throw e;  
1807 - } else {  
1808 - // We can try reading the object anyway even if the ID doesn't match.  
1809 - warn(e);  
1810 - }  
1811 - }  
1812 - } catch (QPDFExc& e) {  
1813 - if (try_recovery) {  
1814 - // Try again after reconstructing xref table  
1815 - reconstruct_xref(e);  
1816 - if (m->xref_table.count(exp_og) && (m->xref_table[exp_og].getType() == 1)) {  
1817 - qpdf_offset_t new_offset = m->xref_table[exp_og].getOffset();  
1818 - QPDFObjectHandle result =  
1819 - readObjectAtOffset(false, new_offset, description, exp_og, og, false);  
1820 - QTC::TC("qpdf", "QPDF recovered in readObjectAtOffset");  
1821 - return result;  
1822 - } else {  
1823 - QTC::TC("qpdf", "QPDF object gone after xref reconstruction");  
1824 - warn(damagedPDF(  
1825 - "",  
1826 - 0,  
1827 - ("object " + exp_og.unparse(' ') +  
1828 - " not found in file after regenerating cross reference "  
1829 - "table")));  
1830 - return QPDFObjectHandle::newNull();  
1831 - }  
1832 - } else {  
1833 - throw;  
1834 - }  
1835 - }  
1836 -  
1837 - QPDFObjectHandle oh = readObject(description, og);  
1838 -  
1839 - if (isUnresolved(og)) {  
1840 - // Store the object in the cache here so it gets cached whether we first know the offset or  
1841 - // whether we first know the object ID and generation (in which we case we would get here  
1842 - // through resolve).  
1843 -  
1844 - // Determine the end offset of this object before and after white space. We use these  
1845 - // numbers to validate linearization hint tables. Offsets and lengths of objects may imply  
1846 - // the end of an object to be anywhere between these values.  
1847 - qpdf_offset_t end_before_space = m->file->tell();  
1848 -  
1849 - // skip over spaces  
1850 - while (true) {  
1851 - char ch;  
1852 - if (m->file->read(&ch, 1)) {  
1853 - if (!isspace(static_cast<unsigned char>(ch))) {  
1854 - m->file->seek(-1, SEEK_CUR);  
1855 - break;  
1856 - }  
1857 - } else {  
1858 - throw damagedPDF(m->file->tell(), "EOF after endobj");  
1859 - }  
1860 - }  
1861 - qpdf_offset_t end_after_space = m->file->tell();  
1862 - if (skip_cache_if_in_xref && m->xref_table.count(og)) {  
1863 - // Ordinarily, an object gets read here when resolved through xref table or stream. In  
1864 - // the special case of the xref stream and linearization hint tables, the offset comes  
1865 - // from another source. For the specific case of xref streams, the xref stream is read  
1866 - // and loaded into the object cache very early in parsing. Ordinarily, when a file is  
1867 - // updated by appending, items inserted into the xref table in later updates take  
1868 - // precedence over earlier items. In the special case of reusing the object number  
1869 - // previously used as the xref stream, we have the following order of events:  
1870 - //  
1871 - // * reused object gets loaded into the xref table  
1872 - // * old object is read here while reading xref streams  
1873 - // * original xref entry is ignored (since already in xref table)  
1874 - //  
1875 - // It is the second step that causes a problem. Even though the xref table is correct in  
1876 - // this case, the old object is already in the cache and so effectively prevails over  
1877 - // the reused object. To work around this issue, we have a special case for the xref  
1878 - // stream (via the skip_cache_if_in_xref): if the object is already in the xref stream,  
1879 - // don't cache what we read here.  
1880 - //  
1881 - // It is likely that the same bug may exist for linearization hint tables, but the  
1882 - // existing code uses end_before_space and end_after_space from the cache, so fixing  
1883 - // that would require more significant rework. The chances of a linearization hint  
1884 - // stream being reused seems smaller because the xref stream is probably the highest  
1885 - // object in the file and the linearization hint stream would be some random place in  
1886 - // the middle, so I'm leaving that bug unfixed for now. If the bug were to be fixed, we  
1887 - // could use !check_og in place of skip_cache_if_in_xref.  
1888 - QTC::TC("qpdf", "QPDF skipping cache for known unchecked object");  
1889 - } else {  
1890 - updateCache(og, oh.getObj(), end_before_space, end_after_space);  
1891 - }  
1892 - }  
1893 -  
1894 - return oh;  
1895 -}  
1896 -  
1897 -std::shared_ptr<QPDFObject> const&  
1898 -QPDF::resolve(QPDFObjGen og)  
1899 -{  
1900 - if (!isUnresolved(og)) {  
1901 - return m->obj_cache[og].object;  
1902 - }  
1903 -  
1904 - if (m->resolving.count(og)) {  
1905 - // This can happen if an object references itself directly or indirectly in some key that  
1906 - // has to be resolved during object parsing, such as stream length.  
1907 - QTC::TC("qpdf", "QPDF recursion loop in resolve");  
1908 - warn(damagedPDF("", "loop detected resolving object " + og.unparse(' ')));  
1909 - updateCache(og, QPDFObject::create<QPDF_Null>(), -1, -1);  
1910 - return m->obj_cache[og].object;  
1911 - }  
1912 - ResolveRecorder rr(this, og);  
1913 -  
1914 - if (m->xref_table.count(og) != 0) {  
1915 - QPDFXRefEntry const& entry = m->xref_table[og];  
1916 - try {  
1917 - switch (entry.getType()) {  
1918 - case 1:  
1919 - {  
1920 - qpdf_offset_t offset = entry.getOffset();  
1921 - // Object stored in cache by readObjectAtOffset  
1922 - QPDFObjGen a_og;  
1923 - QPDFObjectHandle oh = readObjectAtOffset(true, offset, "", og, a_og, false);  
1924 - }  
1925 - break;  
1926 -  
1927 - case 2:  
1928 - resolveObjectsInStream(entry.getObjStreamNumber());  
1929 - break;  
1930 -  
1931 - default:  
1932 - throw damagedPDF(  
1933 - "", 0, ("object " + og.unparse('/') + " has unexpected xref entry type"));  
1934 - }  
1935 - } catch (QPDFExc& e) {  
1936 - warn(e);  
1937 - } catch (std::exception& e) {  
1938 - warn(damagedPDF(  
1939 - "", 0, ("object " + og.unparse('/') + ": error reading object: " + e.what())));  
1940 - }  
1941 - }  
1942 -  
1943 - if (isUnresolved(og)) {  
1944 - // PDF spec says unknown objects resolve to the null object.  
1945 - QTC::TC("qpdf", "QPDF resolve failure to null");  
1946 - updateCache(og, QPDFObject::create<QPDF_Null>(), -1, -1);  
1947 - }  
1948 -  
1949 - auto& result(m->obj_cache[og].object);  
1950 - result->setDefaultDescription(this, og);  
1951 - return result;  
1952 -}  
1953 -  
1954 -void  
1955 -QPDF::resolveObjectsInStream(int obj_stream_number) 417 +QPDF::warn(QPDFExc const& e)
1956 { 418 {
1957 - if (m->resolved_object_streams.count(obj_stream_number)) {  
1958 - return;  
1959 - }  
1960 - m->resolved_object_streams.insert(obj_stream_number);  
1961 - // Force resolution of object stream  
1962 - QPDFObjectHandle obj_stream = getObjectByID(obj_stream_number, 0);  
1963 - if (!obj_stream.isStream()) {  
1964 - throw damagedPDF(  
1965 - "supposed object stream " + std::to_string(obj_stream_number) + " is not a stream");  
1966 - }  
1967 -  
1968 - // For linearization data in the object, use the data from the object stream for the objects in  
1969 - // the stream.  
1970 - QPDFObjGen stream_og(obj_stream_number, 0);  
1971 - qpdf_offset_t end_before_space = m->obj_cache[stream_og].end_before_space;  
1972 - qpdf_offset_t end_after_space = m->obj_cache[stream_og].end_after_space;  
1973 -  
1974 - QPDFObjectHandle dict = obj_stream.getDict();  
1975 - if (!dict.isDictionaryOfType("/ObjStm")) {  
1976 - QTC::TC("qpdf", "QPDF ERR object stream with wrong type");  
1977 - warn(damagedPDF(  
1978 - "supposed object stream " + std::to_string(obj_stream_number) + " has wrong type"));  
1979 - }  
1980 -  
1981 - if (!(dict.getKey("/N").isInteger() && dict.getKey("/First").isInteger())) {  
1982 - throw damagedPDF(  
1983 - ("object stream " + std::to_string(obj_stream_number) + " has incorrect keys"));  
1984 - }  
1985 -  
1986 - int n = dict.getKey("/N").getIntValueAsInt();  
1987 - int first = dict.getKey("/First").getIntValueAsInt();  
1988 -  
1989 - std::map<int, int> offsets;  
1990 -  
1991 - std::shared_ptr<Buffer> bp = obj_stream.getStreamData(qpdf_dl_specialized);  
1992 - auto input = std::shared_ptr<InputSource>(  
1993 - // line-break  
1994 - new BufferInputSource(  
1995 - (m->file->getName() + " object stream " + std::to_string(obj_stream_number)),  
1996 - bp.get()));  
1997 -  
1998 - long long last_offset = -1;  
1999 - for (int i = 0; i < n; ++i) {  
2000 - QPDFTokenizer::Token tnum = readToken(*input);  
2001 - QPDFTokenizer::Token toffset = readToken(*input);  
2002 - if (!(tnum.isInteger() && toffset.isInteger())) {  
2003 - throw damagedPDF(  
2004 - *input,  
2005 - m->last_object_description,  
2006 - input->getLastOffset(),  
2007 - "expected integer in object stream header");  
2008 - }  
2009 -  
2010 - int num = QUtil::string_to_int(tnum.getValue().c_str());  
2011 - long long offset = QUtil::string_to_int(toffset.getValue().c_str());  
2012 -  
2013 - if (num == obj_stream_number) {  
2014 - QTC::TC("qpdf", "QPDF ignore self-referential object stream");  
2015 - warn(damagedPDF(  
2016 - *input,  
2017 - m->last_object_description,  
2018 - input->getLastOffset(),  
2019 - "object stream claims to contain itself"));  
2020 - continue;  
2021 - }  
2022 -  
2023 - if (num < 1) {  
2024 - QTC::TC("qpdf", "QPDF object stream contains id < 1");  
2025 - warn(damagedPDF(  
2026 - *input,  
2027 - "object "s + std::to_string(num) + " 0, offset " + std::to_string(offset),  
2028 - 0,  
2029 - "object id is invalid"s));  
2030 - continue;  
2031 - }  
2032 -  
2033 - if (offset <= last_offset) {  
2034 - QTC::TC("qpdf", "QPDF object stream offsets not increasing");  
2035 - warn(damagedPDF(  
2036 - *input,  
2037 - "object "s + std::to_string(num) + " 0, offset " + std::to_string(offset),  
2038 - 0,  
2039 - "offset is invalid (must be larger than previous offset " +  
2040 - std::to_string(last_offset) + ")"));  
2041 - continue;  
2042 - }  
2043 - last_offset = offset;  
2044 -  
2045 - if (num > m->xref_table_max_id) {  
2046 - continue;  
2047 - }  
2048 -  
2049 - offsets[num] = toI(offset + first); 419 + if (m->max_warnings > 0 && m->warnings.size() >= m->max_warnings) {
  420 + stopOnError("Too many warnings - file is too badly damaged");
2050 } 421 }
2051 -  
2052 - // To avoid having to read the object stream multiple times, store all objects that would be  
2053 - // found here in the cache. Remember that some objects stored here might have been overridden  
2054 - // by new objects appended to the file, so it is necessary to recheck the xref table and only  
2055 - // cache what would actually be resolved here.  
2056 - m->last_object_description.clear();  
2057 - m->last_object_description += "object ";  
2058 - for (auto const& iter: offsets) {  
2059 - QPDFObjGen og(iter.first, 0);  
2060 - auto entry = m->xref_table.find(og);  
2061 - if (entry != m->xref_table.end() && entry->second.getType() == 2 &&  
2062 - entry->second.getObjStreamNumber() == obj_stream_number) {  
2063 - int offset = iter.second;  
2064 - input->seek(offset, SEEK_SET);  
2065 - QPDFObjectHandle oh = readObjectInStream(input, iter.first);  
2066 - updateCache(og, oh.getObj(), end_before_space, end_after_space);  
2067 - } else {  
2068 - QTC::TC("qpdf", "QPDF not caching overridden objstm object");  
2069 - } 422 + m->warnings.push_back(e);
  423 + if (!m->suppress_warnings) {
  424 + *m->log->getWarn() << "WARNING: " << m->warnings.back().what() << "\n";
2070 } 425 }
2071 } 426 }
2072 427
2073 -QPDFObjectHandle  
2074 -QPDF::newIndirect(QPDFObjGen og, std::shared_ptr<QPDFObject> const& obj)  
2075 -{  
2076 - obj->setDefaultDescription(this, og);  
2077 - return {obj};  
2078 -}  
2079 -  
2080 void 428 void
2081 -QPDF::updateCache(  
2082 - QPDFObjGen og,  
2083 - std::shared_ptr<QPDFObject> const& object,  
2084 - qpdf_offset_t end_before_space,  
2085 - qpdf_offset_t end_after_space,  
2086 - bool destroy)  
2087 -{  
2088 - object->setObjGen(this, og);  
2089 - if (isCached(og)) {  
2090 - auto& cache = m->obj_cache[og];  
2091 - object->move_to(cache.object, destroy);  
2092 - cache.end_before_space = end_before_space;  
2093 - cache.end_after_space = end_after_space;  
2094 - } else {  
2095 - m->obj_cache[og] = ObjCache(object, end_before_space, end_after_space);  
2096 - }  
2097 -}  
2098 -  
2099 -bool  
2100 -QPDF::isCached(QPDFObjGen og)  
2101 -{  
2102 - return m->obj_cache.count(og) != 0;  
2103 -}  
2104 -  
2105 -bool  
2106 -QPDF::isUnresolved(QPDFObjGen og)  
2107 -{  
2108 - return !isCached(og) || m->obj_cache[og].object->isUnresolved();  
2109 -}  
2110 -  
2111 -QPDFObjGen  
2112 -QPDF::nextObjGen()  
2113 -{  
2114 - int max_objid = toI(getObjectCount());  
2115 - if (max_objid == std::numeric_limits<int>::max()) {  
2116 - throw std::range_error("max object id is too high to create new objects");  
2117 - }  
2118 - return QPDFObjGen(max_objid + 1, 0);  
2119 -}  
2120 -  
2121 -QPDFObjectHandle  
2122 -QPDF::makeIndirectFromQPDFObject(std::shared_ptr<QPDFObject> const& obj)  
2123 -{  
2124 - QPDFObjGen next{nextObjGen()};  
2125 - m->obj_cache[next] = ObjCache(obj, -1, -1);  
2126 - return newIndirect(next, m->obj_cache[next].object);  
2127 -}  
2128 -  
2129 -QPDFObjectHandle  
2130 -QPDF::makeIndirectObject(QPDFObjectHandle oh) 429 +QPDF::warn(
  430 + qpdf_error_code_e error_code,
  431 + std::string const& object,
  432 + qpdf_offset_t offset,
  433 + std::string const& message)
2131 { 434 {
2132 - if (!oh) {  
2133 - throw std::logic_error("attempted to make an uninitialized QPDFObjectHandle indirect");  
2134 - }  
2135 - return makeIndirectFromQPDFObject(oh.getObj()); 435 + warn(QPDFExc(error_code, getFilename(), object, offset, message));
2136 } 436 }
2137 437
2138 QPDFObjectHandle 438 QPDFObjectHandle
@@ -2170,52 +470,6 @@ QPDF::newStream(std::string const& data) @@ -2170,52 +470,6 @@ QPDF::newStream(std::string const& data)
2170 return result; 470 return result;
2171 } 471 }
2172 472
2173 -std::shared_ptr<QPDFObject>  
2174 -QPDF::getObjectForParser(int id, int gen, bool parse_pdf)  
2175 -{  
2176 - // This method is called by the parser and therefore must not resolve any objects.  
2177 - auto og = QPDFObjGen(id, gen);  
2178 - if (auto iter = m->obj_cache.find(og); iter != m->obj_cache.end()) {  
2179 - return iter->second.object;  
2180 - }  
2181 - if (m->xref_table.count(og) || !m->parsed) {  
2182 - return m->obj_cache.insert({og, QPDFObject::create<QPDF_Unresolved>(this, og)})  
2183 - .first->second.object;  
2184 - }  
2185 - if (parse_pdf) {  
2186 - return QPDFObject::create<QPDF_Null>();  
2187 - }  
2188 - return m->obj_cache.insert({og, QPDFObject::create<QPDF_Null>(this, og)}).first->second.object;  
2189 -}  
2190 -  
2191 -std::shared_ptr<QPDFObject>  
2192 -QPDF::getObjectForJSON(int id, int gen)  
2193 -{  
2194 - auto og = QPDFObjGen(id, gen);  
2195 - auto [it, inserted] = m->obj_cache.try_emplace(og);  
2196 - auto& obj = it->second.object;  
2197 - if (inserted) {  
2198 - obj = (m->parsed && !m->xref_table.count(og))  
2199 - ? QPDFObject::create<QPDF_Null>(this, og)  
2200 - : QPDFObject::create<QPDF_Unresolved>(this, og);  
2201 - }  
2202 - return obj;  
2203 -}  
2204 -  
2205 -QPDFObjectHandle  
2206 -QPDF::getObject(QPDFObjGen og)  
2207 -{  
2208 - if (auto it = m->obj_cache.find(og); it != m->obj_cache.end()) {  
2209 - return {it->second.object};  
2210 - } else if (m->parsed && !m->xref_table.count(og)) {  
2211 - return QPDFObject::create<QPDF_Null>();  
2212 - } else {  
2213 - auto result =  
2214 - m->obj_cache.try_emplace(og, QPDFObject::create<QPDF_Unresolved>(this, og), -1, -1);  
2215 - return {result.first->second.object};  
2216 - }  
2217 -}  
2218 -  
2219 QPDFObjectHandle 473 QPDFObjectHandle
2220 QPDF::getObject(int objid, int generation) 474 QPDF::getObject(int objid, int generation)
2221 { 475 {
@@ -2234,45 +488,6 @@ QPDF::getObjectByID(int objid, int generation) @@ -2234,45 +488,6 @@ QPDF::getObjectByID(int objid, int generation)
2234 return getObject(QPDFObjGen(objid, generation)); 488 return getObject(QPDFObjGen(objid, generation));
2235 } 489 }
2236 490
2237 -void  
2238 -QPDF::replaceObject(int objid, int generation, QPDFObjectHandle oh)  
2239 -{  
2240 - replaceObject(QPDFObjGen(objid, generation), oh);  
2241 -}  
2242 -  
2243 -void  
2244 -QPDF::replaceObject(QPDFObjGen og, QPDFObjectHandle oh)  
2245 -{  
2246 - if (!oh || (oh.isIndirect() && !(oh.isStream() && oh.getObjGen() == og))) {  
2247 - QTC::TC("qpdf", "QPDF replaceObject called with indirect object");  
2248 - throw std::logic_error("QPDF::replaceObject called with indirect object handle");  
2249 - }  
2250 - updateCache(og, oh.getObj(), -1, -1, false);  
2251 -}  
2252 -  
2253 -void  
2254 -QPDF::removeObject(QPDFObjGen og)  
2255 -{  
2256 - m->xref_table.erase(og);  
2257 - if (auto cached = m->obj_cache.find(og); cached != m->obj_cache.end()) {  
2258 - // Take care of any object handles that may be floating around.  
2259 - cached->second.object->assign_null();  
2260 - cached->second.object->setObjGen(nullptr, QPDFObjGen());  
2261 - m->obj_cache.erase(cached);  
2262 - }  
2263 -}  
2264 -  
2265 -void  
2266 -QPDF::replaceReserved(QPDFObjectHandle reserved, QPDFObjectHandle replacement)  
2267 -{  
2268 - QTC::TC("qpdf", "QPDF replaceReserved");  
2269 - auto tc = reserved.getTypeCode();  
2270 - if (!(tc == ::ot_reserved || tc == ::ot_null)) {  
2271 - throw std::logic_error("replaceReserved called with non-reserved object");  
2272 - }  
2273 - replaceObject(reserved.getObjGen(), replacement);  
2274 -}  
2275 -  
2276 QPDFObjectHandle 491 QPDFObjectHandle
2277 QPDF::copyForeignObject(QPDFObjectHandle foreign) 492 QPDF::copyForeignObject(QPDFObjectHandle foreign)
2278 { 493 {
@@ -2532,21 +747,6 @@ QPDF::copyStreamData(QPDFObjectHandle result, QPDFObjectHandle foreign) @@ -2532,21 +747,6 @@ QPDF::copyStreamData(QPDFObjectHandle result, QPDFObjectHandle foreign)
2532 } 747 }
2533 } 748 }
2534 749
2535 -void  
2536 -QPDF::swapObjects(int objid1, int generation1, int objid2, int generation2)  
2537 -{  
2538 - swapObjects(QPDFObjGen(objid1, generation1), QPDFObjGen(objid2, generation2));  
2539 -}  
2540 -  
2541 -void  
2542 -QPDF::swapObjects(QPDFObjGen og1, QPDFObjGen og2)  
2543 -{  
2544 - // Force objects to be read from the input source if needed, then swap them in the cache.  
2545 - resolve(og1);  
2546 - resolve(og2);  
2547 - m->obj_cache[og1].object->swapWith(m->obj_cache[og2].object);  
2548 -}  
2549 -  
2550 unsigned long long 750 unsigned long long
2551 QPDF::getUniqueId() const 751 QPDF::getUniqueId() const
2552 { 752 {
@@ -2640,136 +840,6 @@ QPDF::getXRefTableInternal() @@ -2640,136 +840,6 @@ QPDF::getXRefTableInternal()
2640 return m->xref_table; 840 return m->xref_table;
2641 } 841 }
2642 842
2643 -size_t  
2644 -QPDF::tableSize()  
2645 -{  
2646 - // If obj_cache is dense, accommodate all object in tables,else accommodate only original  
2647 - // objects.  
2648 - auto max_xref = m->xref_table.size() ? m->xref_table.crbegin()->first.getObj() : 0;  
2649 - auto max_obj = m->obj_cache.size() ? m->obj_cache.crbegin()->first.getObj() : 0;  
2650 - auto max_id = std::numeric_limits<int>::max() - 1;  
2651 - if (max_obj >= max_id || max_xref >= max_id) {  
2652 - // Temporary fix. Long-term solution is  
2653 - // - QPDFObjGen to enforce objgens are valid and sensible  
2654 - // - xref table and obj cache to protect against insertion of impossibly large obj ids  
2655 - stopOnError("Impossibly large object id encountered.");  
2656 - }  
2657 - if (max_obj < 1.1 * std::max(toI(m->obj_cache.size()), max_xref)) {  
2658 - return toS(++max_obj);  
2659 - }  
2660 - return toS(++max_xref);  
2661 -}  
2662 -  
2663 -std::vector<QPDFObjGen>  
2664 -QPDF::getCompressibleObjVector()  
2665 -{  
2666 - return getCompressibleObjGens<QPDFObjGen>();  
2667 -}  
2668 -  
2669 -std::vector<bool>  
2670 -QPDF::getCompressibleObjSet()  
2671 -{  
2672 - return getCompressibleObjGens<bool>();  
2673 -}  
2674 -  
2675 -template <typename T>  
2676 -std::vector<T>  
2677 -QPDF::getCompressibleObjGens()  
2678 -{  
2679 - // Return a list of objects that are allowed to be in object streams. Walk through the objects  
2680 - // by traversing the document from the root, including a traversal of the pages tree. This  
2681 - // makes that objects that are on the same page are more likely to be in the same object stream,  
2682 - // which is slightly more efficient, particularly with linearized files. This is better than  
2683 - // iterating through the xref table since it avoids preserving orphaned items.  
2684 -  
2685 - // Exclude encryption dictionary, if any  
2686 - QPDFObjectHandle encryption_dict = m->trailer.getKey("/Encrypt");  
2687 - QPDFObjGen encryption_dict_og = encryption_dict.getObjGen();  
2688 -  
2689 - const size_t max_obj = getObjectCount();  
2690 - std::vector<bool> visited(max_obj, false);  
2691 - std::vector<QPDFObjectHandle> queue;  
2692 - queue.reserve(512);  
2693 - queue.push_back(m->trailer);  
2694 - std::vector<T> result;  
2695 - if constexpr (std::is_same_v<T, QPDFObjGen>) {  
2696 - result.reserve(m->obj_cache.size());  
2697 - } else if constexpr (std::is_same_v<T, bool>) {  
2698 - result.resize(max_obj + 1U, false);  
2699 - } else {  
2700 - throw std::logic_error("Unsupported type in QPDF::getCompressibleObjGens");  
2701 - }  
2702 - while (!queue.empty()) {  
2703 - auto obj = queue.back();  
2704 - queue.pop_back();  
2705 - if (obj.getObjectID() > 0) {  
2706 - QPDFObjGen og = obj.getObjGen();  
2707 - const size_t id = toS(og.getObj() - 1);  
2708 - if (id >= max_obj) {  
2709 - throw std::logic_error(  
2710 - "unexpected object id encountered in getCompressibleObjGens");  
2711 - }  
2712 - if (visited[id]) {  
2713 - QTC::TC("qpdf", "QPDF loop detected traversing objects");  
2714 - continue;  
2715 - }  
2716 -  
2717 - // Check whether this is the current object. If not, remove it (which changes it into a  
2718 - // direct null and therefore stops us from revisiting it) and move on to the next object  
2719 - // in the queue.  
2720 - auto upper = m->obj_cache.upper_bound(og);  
2721 - if (upper != m->obj_cache.end() && upper->first.getObj() == og.getObj()) {  
2722 - removeObject(og);  
2723 - continue;  
2724 - }  
2725 -  
2726 - visited[id] = true;  
2727 -  
2728 - if (og == encryption_dict_og) {  
2729 - QTC::TC("qpdf", "QPDF exclude encryption dictionary");  
2730 - } else if (!(obj.isStream() ||  
2731 - (obj.isDictionaryOfType("/Sig") && obj.hasKey("/ByteRange") &&  
2732 - obj.hasKey("/Contents")))) {  
2733 - if constexpr (std::is_same_v<T, QPDFObjGen>) {  
2734 - result.push_back(og);  
2735 - } else if constexpr (std::is_same_v<T, bool>) {  
2736 - result[id + 1U] = true;  
2737 - }  
2738 - }  
2739 - }  
2740 - if (obj.isStream()) {  
2741 - auto dict = obj.getDict().as_dictionary();  
2742 - auto end = dict.crend();  
2743 - for (auto iter = dict.crbegin(); iter != end; ++iter) {  
2744 - std::string const& key = iter->first;  
2745 - QPDFObjectHandle const& value = iter->second;  
2746 - if (!value.null()) {  
2747 - if (key == "/Length") {  
2748 - // omit stream lengths  
2749 - if (value.isIndirect()) {  
2750 - QTC::TC("qpdf", "QPDF exclude indirect length");  
2751 - }  
2752 - } else {  
2753 - queue.emplace_back(value);  
2754 - }  
2755 - }  
2756 - }  
2757 - } else if (obj.isDictionary()) {  
2758 - auto dict = obj.as_dictionary();  
2759 - auto end = dict.crend();  
2760 - for (auto iter = dict.crbegin(); iter != end; ++iter) {  
2761 - if (!iter->second.null()) {  
2762 - queue.emplace_back(iter->second);  
2763 - }  
2764 - }  
2765 - } else if (auto items = obj.as_array()) {  
2766 - queue.insert(queue.end(), items.crbegin(), items.crend());  
2767 - }  
2768 - }  
2769 -  
2770 - return result;  
2771 -}  
2772 -  
2773 bool 843 bool
2774 QPDF::pipeStreamData( 844 QPDF::pipeStreamData(
2775 std::shared_ptr<EncryptionParameters> encp, 845 std::shared_ptr<EncryptionParameters> encp,
libqpdf/QPDF_objects.cc 0 → 100644
  1 +#include <qpdf/qpdf-config.h> // include first for large file support
  2 +
  3 +#include <qpdf/QPDF_private.hh>
  4 +
  5 +#include <array>
  6 +#include <atomic>
  7 +#include <cstring>
  8 +#include <limits>
  9 +#include <map>
  10 +#include <regex>
  11 +#include <sstream>
  12 +#include <vector>
  13 +
  14 +#include <qpdf/BufferInputSource.hh>
  15 +#include <qpdf/FileInputSource.hh>
  16 +#include <qpdf/InputSource_private.hh>
  17 +#include <qpdf/OffsetInputSource.hh>
  18 +#include <qpdf/Pipeline.hh>
  19 +#include <qpdf/QPDFExc.hh>
  20 +#include <qpdf/QPDFLogger.hh>
  21 +#include <qpdf/QPDFObjectHandle_private.hh>
  22 +#include <qpdf/QPDFObject_private.hh>
  23 +#include <qpdf/QPDFParser.hh>
  24 +#include <qpdf/QTC.hh>
  25 +#include <qpdf/QUtil.hh>
  26 +#include <qpdf/Util.hh>
  27 +
  28 +using namespace qpdf;
  29 +using namespace std::literals;
  30 +
  31 +namespace
  32 +{
  33 + class InvalidInputSource: public InputSource
  34 + {
  35 + public:
  36 + ~InvalidInputSource() override = default;
  37 + qpdf_offset_t
  38 + findAndSkipNextEOL() override
  39 + {
  40 + throwException();
  41 + return 0;
  42 + }
  43 + std::string const&
  44 + getName() const override
  45 + {
  46 + static std::string name("closed input source");
  47 + return name;
  48 + }
  49 + qpdf_offset_t
  50 + tell() override
  51 + {
  52 + throwException();
  53 + return 0;
  54 + }
  55 + void
  56 + seek(qpdf_offset_t offset, int whence) override
  57 + {
  58 + throwException();
  59 + }
  60 + void
  61 + rewind() override
  62 + {
  63 + throwException();
  64 + }
  65 + size_t
  66 + read(char* buffer, size_t length) override
  67 + {
  68 + throwException();
  69 + return 0;
  70 + }
  71 + void
  72 + unreadCh(char ch) override
  73 + {
  74 + throwException();
  75 + }
  76 +
  77 + private:
  78 + void
  79 + throwException()
  80 + {
  81 + throw std::logic_error(
  82 + "QPDF operation attempted on a QPDF object with no input "
  83 + "source. QPDF operations are invalid before processFile (or "
  84 + "another process method) or after closeInputSource");
  85 + }
  86 + };
  87 +} // namespace
  88 +
  89 +bool
  90 +QPDF::findStartxref()
  91 +{
  92 + if (readToken(*m->file).isWord("startxref") && readToken(*m->file).isInteger()) {
  93 + // Position in front of offset token
  94 + m->file->seek(m->file->getLastOffset(), SEEK_SET);
  95 + return true;
  96 + }
  97 + return false;
  98 +}
  99 +
  100 +void
  101 +QPDF::parse(char const* password)
  102 +{
  103 + if (password) {
  104 + m->encp->provided_password = password;
  105 + }
  106 +
  107 + // Find the header anywhere in the first 1024 bytes of the file.
  108 + PatternFinder hf(*this, &QPDF::findHeader);
  109 + if (!m->file->findFirst("%PDF-", 0, 1024, hf)) {
  110 + QTC::TC("qpdf", "QPDF not a pdf file");
  111 + warn(damagedPDF("", 0, "can't find PDF header"));
  112 + // QPDFWriter writes files that usually require at least version 1.2 for /FlateDecode
  113 + m->pdf_version = "1.2";
  114 + }
  115 +
  116 + // PDF spec says %%EOF must be found within the last 1024 bytes of/ the file. We add an extra
  117 + // 30 characters to leave room for the startxref stuff.
  118 + m->file->seek(0, SEEK_END);
  119 + qpdf_offset_t end_offset = m->file->tell();
  120 + m->xref_table_max_offset = end_offset;
  121 + // Sanity check on object ids. All objects must appear in xref table / stream. In all realistic
  122 + // scenarios at least 3 bytes are required.
  123 + if (m->xref_table_max_id > m->xref_table_max_offset / 3) {
  124 + m->xref_table_max_id = static_cast<int>(m->xref_table_max_offset / 3);
  125 + }
  126 + qpdf_offset_t start_offset = (end_offset > 1054 ? end_offset - 1054 : 0);
  127 + PatternFinder sf(*this, &QPDF::findStartxref);
  128 + qpdf_offset_t xref_offset = 0;
  129 + if (m->file->findLast("startxref", start_offset, 0, sf)) {
  130 + xref_offset = QUtil::string_to_ll(readToken(*m->file).getValue().c_str());
  131 + }
  132 +
  133 + try {
  134 + if (xref_offset == 0) {
  135 + QTC::TC("qpdf", "QPDF can't find startxref");
  136 + throw damagedPDF("", 0, "can't find startxref");
  137 + }
  138 + try {
  139 + read_xref(xref_offset);
  140 + } catch (QPDFExc&) {
  141 + throw;
  142 + } catch (std::exception& e) {
  143 + throw damagedPDF("", 0, std::string("error reading xref: ") + e.what());
  144 + }
  145 + } catch (QPDFExc& e) {
  146 + if (m->attempt_recovery) {
  147 + reconstruct_xref(e, xref_offset > 0);
  148 + QTC::TC("qpdf", "QPDF reconstructed xref table");
  149 + } else {
  150 + throw;
  151 + }
  152 + }
  153 +
  154 + initializeEncryption();
  155 + m->parsed = true;
  156 + if (m->xref_table.size() > 0 && !getRoot().getKey("/Pages").isDictionary()) {
  157 + // QPDFs created from JSON have an empty xref table and no root object yet.
  158 + throw damagedPDF("", 0, "unable to find page tree");
  159 + }
  160 +}
  161 +
  162 +void
  163 +QPDF::inParse(bool v)
  164 +{
  165 + if (m->in_parse == v) {
  166 + // This happens if QPDFParser::parse tries to resolve an indirect object while it is
  167 + // parsing.
  168 + throw std::logic_error(
  169 + "QPDF: re-entrant parsing detected. This is a qpdf bug."
  170 + " Please report at https://github.com/qpdf/qpdf/issues.");
  171 + }
  172 + m->in_parse = v;
  173 +}
  174 +
  175 +void
  176 +QPDF::setTrailer(QPDFObjectHandle obj)
  177 +{
  178 + if (m->trailer) {
  179 + return;
  180 + }
  181 + m->trailer = obj;
  182 +}
  183 +
  184 +void
  185 +QPDF::reconstruct_xref(QPDFExc& e, bool found_startxref)
  186 +{
  187 + if (m->reconstructed_xref) {
  188 + // Avoid xref reconstruction infinite loops. This is getting very hard to reproduce because
  189 + // qpdf is throwing many fewer exceptions while parsing. Most situations are warnings now.
  190 + throw e;
  191 + }
  192 +
  193 + // If recovery generates more than 1000 warnings, the file is so severely damaged that there
  194 + // probably is no point trying to continue.
  195 + const auto max_warnings = m->warnings.size() + 1000U;
  196 + auto check_warnings = [this, max_warnings]() {
  197 + if (m->warnings.size() > max_warnings) {
  198 + throw damagedPDF("", 0, "too many errors while reconstructing cross-reference table");
  199 + }
  200 + };
  201 +
  202 + m->reconstructed_xref = true;
  203 + // We may find more objects, which may contain dangling references.
  204 + m->fixed_dangling_refs = false;
  205 +
  206 + warn(damagedPDF("", 0, "file is damaged"));
  207 + warn(e);
  208 + warn(damagedPDF("", 0, "Attempting to reconstruct cross-reference table"));
  209 +
  210 + // Delete all references to type 1 (uncompressed) objects
  211 + std::vector<QPDFObjGen> to_delete;
  212 + for (auto const& iter: m->xref_table) {
  213 + if (iter.second.getType() == 1) {
  214 + to_delete.emplace_back(iter.first);
  215 + }
  216 + }
  217 + for (auto const& iter: to_delete) {
  218 + m->xref_table.erase(iter);
  219 + }
  220 +
  221 + std::vector<std::tuple<int, int, qpdf_offset_t>> found_objects;
  222 + std::vector<qpdf_offset_t> trailers;
  223 + std::vector<qpdf_offset_t> startxrefs;
  224 +
  225 + m->file->seek(0, SEEK_END);
  226 + qpdf_offset_t eof = m->file->tell();
  227 + m->file->seek(0, SEEK_SET);
  228 + // Don't allow very long tokens here during recovery. All the interesting tokens are covered.
  229 + static size_t const MAX_LEN = 10;
  230 + while (m->file->tell() < eof) {
  231 + QPDFTokenizer::Token t1 = readToken(*m->file, MAX_LEN);
  232 + qpdf_offset_t token_start = m->file->tell() - toO(t1.getValue().length());
  233 + if (t1.isInteger()) {
  234 + auto pos = m->file->tell();
  235 + auto t2 = readToken(*m->file, MAX_LEN);
  236 + if (t2.isInteger() && readToken(*m->file, MAX_LEN).isWord("obj")) {
  237 + int obj = QUtil::string_to_int(t1.getValue().c_str());
  238 + int gen = QUtil::string_to_int(t2.getValue().c_str());
  239 + if (obj <= m->xref_table_max_id) {
  240 + found_objects.emplace_back(obj, gen, token_start);
  241 + } else {
  242 + warn(damagedPDF(
  243 + "", 0, "ignoring object with impossibly large id " + std::to_string(obj)));
  244 + }
  245 + }
  246 + m->file->seek(pos, SEEK_SET);
  247 + } else if (!m->trailer && t1.isWord("trailer")) {
  248 + trailers.emplace_back(m->file->tell());
  249 + } else if (!found_startxref && t1.isWord("startxref")) {
  250 + startxrefs.emplace_back(m->file->tell());
  251 + }
  252 + check_warnings();
  253 + m->file->findAndSkipNextEOL();
  254 + }
  255 +
  256 + if (!found_startxref && !startxrefs.empty() && !found_objects.empty() &&
  257 + startxrefs.back() > std::get<2>(found_objects.back())) {
  258 + try {
  259 + m->file->seek(startxrefs.back(), SEEK_SET);
  260 + if (auto offset = QUtil::string_to_ll(readToken(*m->file).getValue().data())) {
  261 + read_xref(offset);
  262 + if (getRoot().getKey("/Pages").isDictionary()) {
  263 + QTC::TC("qpdf", "QPDF startxref more than 1024 before end");
  264 + warn(
  265 + damagedPDF("", 0, "startxref was more than 1024 bytes before end of file"));
  266 + initializeEncryption();
  267 + m->parsed = true;
  268 + m->reconstructed_xref = false;
  269 + return;
  270 + }
  271 + }
  272 + } catch (...) {
  273 + // ok, bad luck. Do recovery.
  274 + }
  275 + }
  276 +
  277 + auto rend = found_objects.rend();
  278 + for (auto it = found_objects.rbegin(); it != rend; it++) {
  279 + auto [obj, gen, token_start] = *it;
  280 + insertXrefEntry(obj, 1, token_start, gen);
  281 + check_warnings();
  282 + }
  283 + m->deleted_objects.clear();
  284 +
  285 + for (auto it = trailers.rbegin(); it != trailers.rend(); it++) {
  286 + m->file->seek(*it, SEEK_SET);
  287 + auto t = readTrailer();
  288 + if (!t.isDictionary()) {
  289 + // Oh well. It was worth a try.
  290 + } else {
  291 + if (t.hasKey("/Root")) {
  292 + m->trailer = t;
  293 + break;
  294 + }
  295 + warn(damagedPDF("trailer", *it, "recovered trailer has no /Root entry"));
  296 + }
  297 + check_warnings();
  298 + }
  299 +
  300 + if (!m->trailer) {
  301 + qpdf_offset_t max_offset{0};
  302 + size_t max_size{0};
  303 + // If there are any xref streams, take the last one to appear.
  304 + for (auto const& iter: m->xref_table) {
  305 + auto entry = iter.second;
  306 + if (entry.getType() != 1) {
  307 + continue;
  308 + }
  309 + auto oh = getObject(iter.first);
  310 + try {
  311 + if (!oh.isStreamOfType("/XRef")) {
  312 + continue;
  313 + }
  314 + } catch (std::exception&) {
  315 + continue;
  316 + }
  317 + auto offset = entry.getOffset();
  318 + auto size = oh.getDict().getKey("/Size").getUIntValueAsUInt();
  319 + if (size > max_size || (size == max_size && offset > max_offset)) {
  320 + max_offset = offset;
  321 + setTrailer(oh.getDict());
  322 + }
  323 + check_warnings();
  324 + }
  325 + if (max_offset > 0) {
  326 + try {
  327 + read_xref(max_offset);
  328 + } catch (std::exception&) {
  329 + warn(damagedPDF(
  330 + "", 0, "error decoding candidate xref stream while recovering damaged file"));
  331 + }
  332 + QTC::TC("qpdf", "QPDF recover xref stream");
  333 + }
  334 + }
  335 +
  336 + if (!m->trailer || (!m->parsed && !m->trailer.getKey("/Root").isDictionary())) {
  337 + // Try to find a Root dictionary. As a quick fix try the one with the highest object id.
  338 + QPDFObjectHandle root;
  339 + for (auto const& iter: m->obj_cache) {
  340 + try {
  341 + if (QPDFObjectHandle(iter.second.object).isDictionaryOfType("/Catalog")) {
  342 + root = iter.second.object;
  343 + }
  344 + } catch (std::exception&) {
  345 + continue;
  346 + }
  347 + }
  348 + if (root) {
  349 + if (!m->trailer) {
  350 + warn(damagedPDF(
  351 + "", 0, "unable to find trailer dictionary while recovering damaged file"));
  352 + m->trailer = QPDFObjectHandle::newDictionary();
  353 + }
  354 + m->trailer.replaceKey("/Root", root);
  355 + }
  356 + }
  357 +
  358 + if (!m->trailer) {
  359 + // We could check the last encountered object to see if it was an xref stream. If so, we
  360 + // could try to get the trailer from there. This may make it possible to recover files with
  361 + // bad startxref pointers even when they have object streams.
  362 +
  363 + throw damagedPDF("", 0, "unable to find trailer dictionary while recovering damaged file");
  364 + }
  365 + if (m->xref_table.empty()) {
  366 + // We cannot check for an empty xref table in parse because empty tables are valid when
  367 + // creating QPDF objects from JSON.
  368 + throw damagedPDF("", 0, "unable to find objects while recovering damaged file");
  369 + }
  370 + check_warnings();
  371 + if (!m->parsed) {
  372 + m->parsed = true;
  373 + getAllPages();
  374 + check_warnings();
  375 + if (m->all_pages.empty()) {
  376 + m->parsed = false;
  377 + throw damagedPDF("", 0, "unable to find any pages while recovering damaged file");
  378 + }
  379 + }
  380 + // We could iterate through the objects looking for streams and try to find objects inside of
  381 + // them, but it's probably not worth the trouble. Acrobat can't recover files with any errors
  382 + // in an xref stream, and this would be a real long shot anyway. If we wanted to do anything
  383 + // that involved looking at stream contents, we'd also have to call initializeEncryption() here.
  384 + // It's safe to call it more than once.
  385 +}
  386 +
  387 +void
  388 +QPDF::read_xref(qpdf_offset_t xref_offset)
  389 +{
  390 + std::map<int, int> free_table;
  391 + std::set<qpdf_offset_t> visited;
  392 + while (xref_offset) {
  393 + visited.insert(xref_offset);
  394 + char buf[7];
  395 + memset(buf, 0, sizeof(buf));
  396 + m->file->seek(xref_offset, SEEK_SET);
  397 + // Some files miss the mark a little with startxref. We could do a better job of searching
  398 + // in the neighborhood for something that looks like either an xref table or stream, but the
  399 + // simple heuristic of skipping whitespace can help with the xref table case and is harmless
  400 + // with the stream case.
  401 + bool done = false;
  402 + bool skipped_space = false;
  403 + while (!done) {
  404 + char ch;
  405 + if (1 == m->file->read(&ch, 1)) {
  406 + if (util::is_space(ch)) {
  407 + skipped_space = true;
  408 + } else {
  409 + m->file->unreadCh(ch);
  410 + done = true;
  411 + }
  412 + } else {
  413 + QTC::TC("qpdf", "QPDF eof skipping spaces before xref", skipped_space ? 0 : 1);
  414 + done = true;
  415 + }
  416 + }
  417 +
  418 + m->file->read(buf, sizeof(buf) - 1);
  419 + // The PDF spec says xref must be followed by a line terminator, but files exist in the wild
  420 + // where it is terminated by arbitrary whitespace.
  421 + if ((strncmp(buf, "xref", 4) == 0) && util::is_space(buf[4])) {
  422 + if (skipped_space) {
  423 + QTC::TC("qpdf", "QPDF xref skipped space");
  424 + warn(damagedPDF("", 0, "extraneous whitespace seen before xref"));
  425 + }
  426 + QTC::TC(
  427 + "qpdf",
  428 + "QPDF xref space",
  429 + ((buf[4] == '\n') ? 0
  430 + : (buf[4] == '\r') ? 1
  431 + : (buf[4] == ' ') ? 2
  432 + : 9999));
  433 + int skip = 4;
  434 + // buf is null-terminated, and util::is_space('\0') is false, so this won't overrun.
  435 + while (util::is_space(buf[skip])) {
  436 + ++skip;
  437 + }
  438 + xref_offset = read_xrefTable(xref_offset + skip);
  439 + } else {
  440 + xref_offset = read_xrefStream(xref_offset);
  441 + }
  442 + if (visited.count(xref_offset) != 0) {
  443 + QTC::TC("qpdf", "QPDF xref loop");
  444 + throw damagedPDF("", 0, "loop detected following xref tables");
  445 + }
  446 + }
  447 +
  448 + if (!m->trailer) {
  449 + throw damagedPDF("", 0, "unable to find trailer while reading xref");
  450 + }
  451 + int size = m->trailer.getKey("/Size").getIntValueAsInt();
  452 + int max_obj = 0;
  453 + if (!m->xref_table.empty()) {
  454 + max_obj = m->xref_table.rbegin()->first.getObj();
  455 + }
  456 + if (!m->deleted_objects.empty()) {
  457 + max_obj = std::max(max_obj, *(m->deleted_objects.rbegin()));
  458 + }
  459 + if ((size < 1) || (size - 1 != max_obj)) {
  460 + QTC::TC("qpdf", "QPDF xref size mismatch");
  461 + warn(damagedPDF(
  462 + "",
  463 + 0,
  464 + ("reported number of objects (" + std::to_string(size) +
  465 + ") is not one plus the highest object number (" + std::to_string(max_obj) + ")")));
  466 + }
  467 +
  468 + // We no longer need the deleted_objects table, so go ahead and clear it out to make sure we
  469 + // never depend on its being set.
  470 + m->deleted_objects.clear();
  471 +
  472 + // Make sure we keep only the highest generation for any object.
  473 + QPDFObjGen last_og{-1, 0};
  474 + for (auto const& item: m->xref_table) {
  475 + auto id = item.first.getObj();
  476 + if (id == last_og.getObj() && id > 0) {
  477 + removeObject(last_og);
  478 + }
  479 + last_og = item.first;
  480 + }
  481 +}
  482 +
  483 +bool
  484 +QPDF::parse_xrefFirst(std::string const& line, int& obj, int& num, int& bytes)
  485 +{
  486 + // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated
  487 + // buffer.
  488 + char const* p = line.c_str();
  489 + char const* start = line.c_str();
  490 +
  491 + // Skip zero or more spaces
  492 + while (util::is_space(*p)) {
  493 + ++p;
  494 + }
  495 + // Require digit
  496 + if (!util::is_digit(*p)) {
  497 + return false;
  498 + }
  499 + // Gather digits
  500 + std::string obj_str;
  501 + while (util::is_digit(*p)) {
  502 + obj_str.append(1, *p++);
  503 + }
  504 + // Require space
  505 + if (!util::is_space(*p)) {
  506 + return false;
  507 + }
  508 + // Skip spaces
  509 + while (util::is_space(*p)) {
  510 + ++p;
  511 + }
  512 + // Require digit
  513 + if (!util::is_digit(*p)) {
  514 + return false;
  515 + }
  516 + // Gather digits
  517 + std::string num_str;
  518 + while (util::is_digit(*p)) {
  519 + num_str.append(1, *p++);
  520 + }
  521 + // Skip any space including line terminators
  522 + while (util::is_space(*p)) {
  523 + ++p;
  524 + }
  525 + bytes = toI(p - start);
  526 + obj = QUtil::string_to_int(obj_str.c_str());
  527 + num = QUtil::string_to_int(num_str.c_str());
  528 + return true;
  529 +}
  530 +
  531 +bool
  532 +QPDF::read_bad_xrefEntry(qpdf_offset_t& f1, int& f2, char& type)
  533 +{
  534 + // Reposition after initial read attempt and reread.
  535 + m->file->seek(m->file->getLastOffset(), SEEK_SET);
  536 + auto line = m->file->readLine(30);
  537 +
  538 + // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated
  539 + // buffer.
  540 + char const* p = line.data();
  541 +
  542 + // Skip zero or more spaces. There aren't supposed to be any.
  543 + bool invalid = false;
  544 + while (util::is_space(*p)) {
  545 + ++p;
  546 + QTC::TC("qpdf", "QPDF ignore first space in xref entry");
  547 + invalid = true;
  548 + }
  549 + // Require digit
  550 + if (!util::is_digit(*p)) {
  551 + return false;
  552 + }
  553 + // Gather digits
  554 + std::string f1_str;
  555 + while (util::is_digit(*p)) {
  556 + f1_str.append(1, *p++);
  557 + }
  558 + // Require space
  559 + if (!util::is_space(*p)) {
  560 + return false;
  561 + }
  562 + if (util::is_space(*(p + 1))) {
  563 + QTC::TC("qpdf", "QPDF ignore first extra space in xref entry");
  564 + invalid = true;
  565 + }
  566 + // Skip spaces
  567 + while (util::is_space(*p)) {
  568 + ++p;
  569 + }
  570 + // Require digit
  571 + if (!util::is_digit(*p)) {
  572 + return false;
  573 + }
  574 + // Gather digits
  575 + std::string f2_str;
  576 + while (util::is_digit(*p)) {
  577 + f2_str.append(1, *p++);
  578 + }
  579 + // Require space
  580 + if (!util::is_space(*p)) {
  581 + return false;
  582 + }
  583 + if (util::is_space(*(p + 1))) {
  584 + QTC::TC("qpdf", "QPDF ignore second extra space in xref entry");
  585 + invalid = true;
  586 + }
  587 + // Skip spaces
  588 + while (util::is_space(*p)) {
  589 + ++p;
  590 + }
  591 + if ((*p == 'f') || (*p == 'n')) {
  592 + type = *p;
  593 + } else {
  594 + return false;
  595 + }
  596 + if ((f1_str.length() != 10) || (f2_str.length() != 5)) {
  597 + QTC::TC("qpdf", "QPDF ignore length error xref entry");
  598 + invalid = true;
  599 + }
  600 +
  601 + if (invalid) {
  602 + warn(damagedPDF("xref table", "accepting invalid xref table entry"));
  603 + }
  604 +
  605 + f1 = QUtil::string_to_ll(f1_str.c_str());
  606 + f2 = QUtil::string_to_int(f2_str.c_str());
  607 +
  608 + return true;
  609 +}
  610 +
  611 +// Optimistically read and parse xref entry. If entry is bad, call read_bad_xrefEntry and return
  612 +// result.
  613 +bool
  614 +QPDF::read_xrefEntry(qpdf_offset_t& f1, int& f2, char& type)
  615 +{
  616 + std::array<char, 21> line;
  617 + if (m->file->read(line.data(), 20) != 20) {
  618 + // C++20: [[unlikely]]
  619 + return false;
  620 + }
  621 + line[20] = '\0';
  622 + char const* p = line.data();
  623 +
  624 + int f1_len = 0;
  625 + int f2_len = 0;
  626 +
  627 + // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated
  628 + // buffer.
  629 +
  630 + // Gather f1 digits. NB No risk of overflow as 9'999'999'999 < max long long.
  631 + while (*p == '0') {
  632 + ++f1_len;
  633 + ++p;
  634 + }
  635 + while (util::is_digit(*p) && f1_len++ < 10) {
  636 + f1 *= 10;
  637 + f1 += *p++ - '0';
  638 + }
  639 + // Require space
  640 + if (!util::is_space(*p++)) {
  641 + // Entry doesn't start with space or digit.
  642 + // C++20: [[unlikely]]
  643 + return false;
  644 + }
  645 + // Gather digits. NB No risk of overflow as 99'999 < max int.
  646 + while (*p == '0') {
  647 + ++f2_len;
  648 + ++p;
  649 + }
  650 + while (util::is_digit(*p) && f2_len++ < 5) {
  651 + f2 *= 10;
  652 + f2 += static_cast<int>(*p++ - '0');
  653 + }
  654 + if (util::is_space(*p++) && (*p == 'f' || *p == 'n')) {
  655 + // C++20: [[likely]]
  656 + type = *p;
  657 + // No test for valid line[19].
  658 + if (*(++p) && *(++p) && (*p == '\n' || *p == '\r') && f1_len == 10 && f2_len == 5) {
  659 + // C++20: [[likely]]
  660 + return true;
  661 + }
  662 + }
  663 + return read_bad_xrefEntry(f1, f2, type);
  664 +}
  665 +
  666 +// Read a single cross-reference table section and associated trailer.
  667 +qpdf_offset_t
  668 +QPDF::read_xrefTable(qpdf_offset_t xref_offset)
  669 +{
  670 + m->file->seek(xref_offset, SEEK_SET);
  671 + std::string line;
  672 + while (true) {
  673 + line.assign(50, '\0');
  674 + m->file->read(line.data(), line.size());
  675 + int obj = 0;
  676 + int num = 0;
  677 + int bytes = 0;
  678 + if (!parse_xrefFirst(line, obj, num, bytes)) {
  679 + QTC::TC("qpdf", "QPDF invalid xref");
  680 + throw damagedPDF("xref table", "xref syntax invalid");
  681 + }
  682 + m->file->seek(m->file->getLastOffset() + bytes, SEEK_SET);
  683 + for (qpdf_offset_t i = obj; i - num < obj; ++i) {
  684 + if (i == 0) {
  685 + // This is needed by checkLinearization()
  686 + m->first_xref_item_offset = m->file->tell();
  687 + }
  688 + // For xref_table, these will always be small enough to be ints
  689 + qpdf_offset_t f1 = 0;
  690 + int f2 = 0;
  691 + char type = '\0';
  692 + if (!read_xrefEntry(f1, f2, type)) {
  693 + QTC::TC("qpdf", "QPDF invalid xref entry");
  694 + throw damagedPDF(
  695 + "xref table", "invalid xref entry (obj=" + std::to_string(i) + ")");
  696 + }
  697 + if (type == 'f') {
  698 + insertFreeXrefEntry(QPDFObjGen(toI(i), f2));
  699 + } else {
  700 + insertXrefEntry(toI(i), 1, f1, f2);
  701 + }
  702 + }
  703 + qpdf_offset_t pos = m->file->tell();
  704 + if (readToken(*m->file).isWord("trailer")) {
  705 + break;
  706 + } else {
  707 + m->file->seek(pos, SEEK_SET);
  708 + }
  709 + }
  710 +
  711 + // Set offset to previous xref table if any
  712 + QPDFObjectHandle cur_trailer = readTrailer();
  713 + if (!cur_trailer.isDictionary()) {
  714 + QTC::TC("qpdf", "QPDF missing trailer");
  715 + throw damagedPDF("", "expected trailer dictionary");
  716 + }
  717 +
  718 + if (!m->trailer) {
  719 + setTrailer(cur_trailer);
  720 +
  721 + if (!m->trailer.hasKey("/Size")) {
  722 + QTC::TC("qpdf", "QPDF trailer lacks size");
  723 + throw damagedPDF("trailer", "trailer dictionary lacks /Size key");
  724 + }
  725 + if (!m->trailer.getKey("/Size").isInteger()) {
  726 + QTC::TC("qpdf", "QPDF trailer size not integer");
  727 + throw damagedPDF("trailer", "/Size key in trailer dictionary is not an integer");
  728 + }
  729 + }
  730 +
  731 + if (cur_trailer.hasKey("/XRefStm")) {
  732 + if (m->ignore_xref_streams) {
  733 + QTC::TC("qpdf", "QPDF ignoring XRefStm in trailer");
  734 + } else {
  735 + if (cur_trailer.getKey("/XRefStm").isInteger()) {
  736 + // Read the xref stream but disregard any return value -- we'll use our trailer's
  737 + // /Prev key instead of the xref stream's.
  738 + (void)read_xrefStream(cur_trailer.getKey("/XRefStm").getIntValue());
  739 + } else {
  740 + throw damagedPDF("xref stream", xref_offset, "invalid /XRefStm");
  741 + }
  742 + }
  743 + }
  744 +
  745 + if (cur_trailer.hasKey("/Prev")) {
  746 + if (!cur_trailer.getKey("/Prev").isInteger()) {
  747 + QTC::TC("qpdf", "QPDF trailer prev not integer");
  748 + throw damagedPDF("trailer", "/Prev key in trailer dictionary is not an integer");
  749 + }
  750 + QTC::TC("qpdf", "QPDF prev key in trailer dictionary");
  751 + return cur_trailer.getKey("/Prev").getIntValue();
  752 + }
  753 +
  754 + return 0;
  755 +}
  756 +
  757 +// Read a single cross-reference stream.
  758 +qpdf_offset_t
  759 +QPDF::read_xrefStream(qpdf_offset_t xref_offset)
  760 +{
  761 + if (!m->ignore_xref_streams) {
  762 + QPDFObjGen x_og;
  763 + QPDFObjectHandle xref_obj;
  764 + try {
  765 + xref_obj =
  766 + readObjectAtOffset(false, xref_offset, "xref stream", QPDFObjGen(0, 0), x_og, true);
  767 + } catch (QPDFExc&) {
  768 + // ignore -- report error below
  769 + }
  770 + if (xref_obj.isStreamOfType("/XRef")) {
  771 + QTC::TC("qpdf", "QPDF found xref stream");
  772 + return processXRefStream(xref_offset, xref_obj);
  773 + }
  774 + }
  775 +
  776 + QTC::TC("qpdf", "QPDF can't find xref");
  777 + throw damagedPDF("", xref_offset, "xref not found");
  778 + return 0; // unreachable
  779 +}
  780 +
  781 +// Return the entry size of the xref stream and the processed W array.
  782 +std::pair<int, std::array<int, 3>>
  783 +QPDF::processXRefW(QPDFObjectHandle& dict, std::function<QPDFExc(std::string_view)> damaged)
  784 +{
  785 + auto W_obj = dict.getKey("/W");
  786 + if (!(W_obj.isArray() && (W_obj.getArrayNItems() >= 3) && W_obj.getArrayItem(0).isInteger() &&
  787 + W_obj.getArrayItem(1).isInteger() && W_obj.getArrayItem(2).isInteger())) {
  788 + throw damaged("Cross-reference stream does not have a proper /W key");
  789 + }
  790 +
  791 + std::array<int, 3> W;
  792 + int entry_size = 0;
  793 + auto w_vector = W_obj.getArrayAsVector();
  794 + int max_bytes = sizeof(qpdf_offset_t);
  795 + for (size_t i = 0; i < 3; ++i) {
  796 + W[i] = w_vector[i].getIntValueAsInt();
  797 + if (W[i] > max_bytes) {
  798 + throw damaged("Cross-reference stream's /W contains impossibly large values");
  799 + }
  800 + if (W[i] < 0) {
  801 + throw damaged("Cross-reference stream's /W contains negative values");
  802 + }
  803 + entry_size += W[i];
  804 + }
  805 + if (entry_size == 0) {
  806 + throw damaged("Cross-reference stream's /W indicates entry size of 0");
  807 + }
  808 + return {entry_size, W};
  809 +}
  810 +
  811 +// Validate Size key and return the maximum number of entries that the xref stream can contain.
  812 +int
  813 +QPDF::processXRefSize(
  814 + QPDFObjectHandle& dict, int entry_size, std::function<QPDFExc(std::string_view)> damaged)
  815 +{
  816 + // Number of entries is limited by the highest possible object id and stream size.
  817 + auto max_num_entries = std::numeric_limits<int>::max();
  818 + if (max_num_entries > (std::numeric_limits<qpdf_offset_t>::max() / entry_size)) {
  819 + max_num_entries = toI(std::numeric_limits<qpdf_offset_t>::max() / entry_size);
  820 + }
  821 +
  822 + auto Size_obj = dict.getKey("/Size");
  823 + long long size;
  824 + if (!dict.getKey("/Size").getValueAsInt(size)) {
  825 + throw damaged("Cross-reference stream does not have a proper /Size key");
  826 + } else if (size < 0) {
  827 + throw damaged("Cross-reference stream has a negative /Size key");
  828 + } else if (size >= max_num_entries) {
  829 + throw damaged("Cross-reference stream has an impossibly large /Size key");
  830 + }
  831 + // We are not validating that Size <= (Size key of parent xref / trailer).
  832 + return max_num_entries;
  833 +}
  834 +
  835 +// Return the number of entries of the xref stream and the processed Index array.
  836 +std::pair<int, std::vector<std::pair<int, int>>>
  837 +QPDF::processXRefIndex(
  838 + QPDFObjectHandle& dict, int max_num_entries, std::function<QPDFExc(std::string_view)> damaged)
  839 +{
  840 + auto size = dict.getKey("/Size").getIntValueAsInt();
  841 + auto Index_obj = dict.getKey("/Index");
  842 +
  843 + if (Index_obj.isArray()) {
  844 + std::vector<std::pair<int, int>> indx;
  845 + int num_entries = 0;
  846 + auto index_vec = Index_obj.getArrayAsVector();
  847 + if ((index_vec.size() % 2) || index_vec.size() < 2) {
  848 + throw damaged("Cross-reference stream's /Index has an invalid number of values");
  849 + }
  850 +
  851 + int i = 0;
  852 + long long first = 0;
  853 + for (auto& val: index_vec) {
  854 + if (val.isInteger()) {
  855 + if (i % 2) {
  856 + auto count = val.getIntValue();
  857 + if (count <= 0) {
  858 + throw damaged(
  859 + "Cross-reference stream section claims to contain " +
  860 + std::to_string(count) + " entries");
  861 + }
  862 + // We are guarding against the possibility of num_entries * entry_size
  863 + // overflowing. We are not checking that entries are in ascending order as
  864 + // required by the spec, which probably should generate a warning. We are also
  865 + // not checking that for each subsection first object number + number of entries
  866 + // <= /Size. The spec requires us to ignore object number > /Size.
  867 + if (first > (max_num_entries - count) ||
  868 + count > (max_num_entries - num_entries)) {
  869 + throw damaged(
  870 + "Cross-reference stream claims to contain too many entries: " +
  871 + std::to_string(first) + " " + std::to_string(max_num_entries) + " " +
  872 + std::to_string(num_entries));
  873 + }
  874 + indx.emplace_back(static_cast<int>(first), static_cast<int>(count));
  875 + num_entries += static_cast<int>(count);
  876 + } else {
  877 + first = val.getIntValue();
  878 + if (first < 0) {
  879 + throw damaged(
  880 + "Cross-reference stream's /Index contains a negative object id");
  881 + } else if (first > max_num_entries) {
  882 + throw damaged(
  883 + "Cross-reference stream's /Index contains an impossibly "
  884 + "large object id");
  885 + }
  886 + }
  887 + } else {
  888 + throw damaged(
  889 + "Cross-reference stream's /Index's item " + std::to_string(i) +
  890 + " is not an integer");
  891 + }
  892 + i++;
  893 + }
  894 + QTC::TC("qpdf", "QPDF xref /Index is array", index_vec.size() == 2 ? 0 : 1);
  895 + return {num_entries, indx};
  896 + } else if (Index_obj.isNull()) {
  897 + QTC::TC("qpdf", "QPDF xref /Index is null");
  898 + return {size, {{0, size}}};
  899 + } else {
  900 + throw damaged("Cross-reference stream does not have a proper /Index key");
  901 + }
  902 +}
  903 +
  904 +qpdf_offset_t
  905 +QPDF::processXRefStream(qpdf_offset_t xref_offset, QPDFObjectHandle& xref_obj)
  906 +{
  907 + auto damaged = [this, xref_offset](std::string_view msg) -> QPDFExc {
  908 + return damagedPDF("xref stream", xref_offset, msg.data());
  909 + };
  910 +
  911 + auto dict = xref_obj.getDict();
  912 +
  913 + auto [entry_size, W] = processXRefW(dict, damaged);
  914 + int max_num_entries = processXRefSize(dict, entry_size, damaged);
  915 + auto [num_entries, indx] = processXRefIndex(dict, max_num_entries, damaged);
  916 +
  917 + std::shared_ptr<Buffer> bp = xref_obj.getStreamData(qpdf_dl_specialized);
  918 + size_t actual_size = bp->getSize();
  919 + auto expected_size = toS(entry_size) * toS(num_entries);
  920 +
  921 + if (expected_size != actual_size) {
  922 + QPDFExc x = damaged(
  923 + "Cross-reference stream data has the wrong size; expected = " +
  924 + std::to_string(expected_size) + "; actual = " + std::to_string(actual_size));
  925 + if (expected_size > actual_size) {
  926 + throw x;
  927 + } else {
  928 + warn(x);
  929 + }
  930 + }
  931 +
  932 + bool saw_first_compressed_object = false;
  933 +
  934 + // Actual size vs. expected size check above ensures that we will not overflow any buffers here.
  935 + // We know that entry_size * num_entries is less or equal to the size of the buffer.
  936 + auto p = bp->getBuffer();
  937 + for (auto [obj, sec_entries]: indx) {
  938 + // Process a subsection.
  939 + for (int i = 0; i < sec_entries; ++i) {
  940 + // Read this entry
  941 + std::array<qpdf_offset_t, 3> fields{};
  942 + if (W[0] == 0) {
  943 + QTC::TC("qpdf", "QPDF default for xref stream field 0");
  944 + fields[0] = 1;
  945 + }
  946 + for (size_t j = 0; j < 3; ++j) {
  947 + for (int k = 0; k < W[j]; ++k) {
  948 + fields[j] <<= 8;
  949 + fields[j] |= *p++;
  950 + }
  951 + }
  952 +
  953 + // Get the generation number. The generation number is 0 unless this is an uncompressed
  954 + // object record, in which case the generation number appears as the third field.
  955 + if (saw_first_compressed_object) {
  956 + if (fields[0] != 2) {
  957 + m->uncompressed_after_compressed = true;
  958 + }
  959 + } else if (fields[0] == 2) {
  960 + saw_first_compressed_object = true;
  961 + }
  962 + if (obj == 0) {
  963 + // This is needed by checkLinearization()
  964 + m->first_xref_item_offset = xref_offset;
  965 + } else if (fields[0] == 0) {
  966 + // Ignore fields[2], which we don't care about in this case. This works around the
  967 + // issue of some PDF files that put invalid values, like -1, here for deleted
  968 + // objects.
  969 + insertFreeXrefEntry(QPDFObjGen(obj, 0));
  970 + } else {
  971 + insertXrefEntry(obj, toI(fields[0]), fields[1], toI(fields[2]));
  972 + }
  973 + ++obj;
  974 + }
  975 + }
  976 +
  977 + if (!m->trailer) {
  978 + setTrailer(dict);
  979 + }
  980 +
  981 + if (dict.hasKey("/Prev")) {
  982 + if (!dict.getKey("/Prev").isInteger()) {
  983 + throw damagedPDF(
  984 + "xref stream", "/Prev key in xref stream dictionary is not an integer");
  985 + }
  986 + QTC::TC("qpdf", "QPDF prev key in xref stream dictionary");
  987 + return dict.getKey("/Prev").getIntValue();
  988 + } else {
  989 + return 0;
  990 + }
  991 +}
  992 +
  993 +void
  994 +QPDF::insertXrefEntry(int obj, int f0, qpdf_offset_t f1, int f2)
  995 +{
  996 + // Populate the xref table in such a way that the first reference to an object that we see,
  997 + // which is the one in the latest xref table in which it appears, is the one that gets stored.
  998 + // This works because we are reading more recent appends before older ones.
  999 +
  1000 + // If there is already an entry for this object and generation in the table, it means that a
  1001 + // later xref table has registered this object. Disregard this one.
  1002 + int new_gen = f0 == 2 ? 0 : f2;
  1003 +
  1004 + if (!(obj > 0 && obj <= m->xref_table_max_id && 0 <= f2 && new_gen < 65535)) {
  1005 + // We are ignoring invalid objgens. Most will arrive here from xref reconstruction. There
  1006 + // is probably no point having another warning but we could count invalid items in order to
  1007 + // decide when to give up.
  1008 + QTC::TC("qpdf", "QPDF xref overwrite invalid objgen");
  1009 + // ignore impossibly large object ids or object ids > Size.
  1010 + return;
  1011 + }
  1012 +
  1013 + if (m->deleted_objects.count(obj)) {
  1014 + QTC::TC("qpdf", "QPDF xref deleted object");
  1015 + return;
  1016 + }
  1017 +
  1018 + if (f0 == 2 && static_cast<int>(f1) == obj) {
  1019 + warn(damagedPDF("xref stream", "self-referential object stream " + std::to_string(obj)));
  1020 + return;
  1021 + }
  1022 +
  1023 + auto [iter, created] = m->xref_table.try_emplace(QPDFObjGen(obj, (f0 == 2 ? 0 : f2)));
  1024 + if (!created) {
  1025 + QTC::TC("qpdf", "QPDF xref reused object");
  1026 + return;
  1027 + }
  1028 +
  1029 + switch (f0) {
  1030 + case 1:
  1031 + // f2 is generation
  1032 + QTC::TC("qpdf", "QPDF xref gen > 0", ((f2 > 0) ? 1 : 0));
  1033 + iter->second = QPDFXRefEntry(f1);
  1034 + break;
  1035 +
  1036 + case 2:
  1037 + iter->second = QPDFXRefEntry(toI(f1), f2);
  1038 + break;
  1039 +
  1040 + default:
  1041 + throw damagedPDF("xref stream", "unknown xref stream entry type " + std::to_string(f0));
  1042 + break;
  1043 + }
  1044 +}
  1045 +
  1046 +void
  1047 +QPDF::insertFreeXrefEntry(QPDFObjGen og)
  1048 +{
  1049 + if (!m->xref_table.count(og)) {
  1050 + m->deleted_objects.insert(og.getObj());
  1051 + }
  1052 +}
  1053 +
  1054 +void
  1055 +QPDF::showXRefTable()
  1056 +{
  1057 + auto& cout = *m->log->getInfo();
  1058 + for (auto const& iter: m->xref_table) {
  1059 + QPDFObjGen const& og = iter.first;
  1060 + QPDFXRefEntry const& entry = iter.second;
  1061 + cout << og.unparse('/') << ": ";
  1062 + switch (entry.getType()) {
  1063 + case 1:
  1064 + cout << "uncompressed; offset = " << entry.getOffset();
  1065 + break;
  1066 +
  1067 + case 2:
  1068 + *m->log->getInfo() << "compressed; stream = " << entry.getObjStreamNumber()
  1069 + << ", index = " << entry.getObjStreamIndex();
  1070 + break;
  1071 +
  1072 + default:
  1073 + throw std::logic_error("unknown cross-reference table type while showing xref_table");
  1074 + break;
  1075 + }
  1076 + m->log->info("\n");
  1077 + }
  1078 +}
  1079 +
  1080 +// Resolve all objects in the xref table. If this triggers a xref table reconstruction abort and
  1081 +// return false. Otherwise return true.
  1082 +bool
  1083 +QPDF::resolveXRefTable()
  1084 +{
  1085 + bool may_change = !m->reconstructed_xref;
  1086 + for (auto& iter: m->xref_table) {
  1087 + if (isUnresolved(iter.first)) {
  1088 + resolve(iter.first);
  1089 + if (may_change && m->reconstructed_xref) {
  1090 + return false;
  1091 + }
  1092 + }
  1093 + }
  1094 + return true;
  1095 +}
  1096 +
  1097 +// Ensure all objects in the pdf file, including those in indirect references, appear in the object
  1098 +// cache.
  1099 +void
  1100 +QPDF::fixDanglingReferences(bool force)
  1101 +{
  1102 + if (m->fixed_dangling_refs) {
  1103 + return;
  1104 + }
  1105 + if (!resolveXRefTable()) {
  1106 + QTC::TC("qpdf", "QPDF fix dangling triggered xref reconstruction");
  1107 + resolveXRefTable();
  1108 + }
  1109 + m->fixed_dangling_refs = true;
  1110 +}
  1111 +
  1112 +size_t
  1113 +QPDF::getObjectCount()
  1114 +{
  1115 + // This method returns the next available indirect object number. makeIndirectObject uses it for
  1116 + // this purpose. After fixDanglingReferences is called, all objects in the xref table will also
  1117 + // be in obj_cache.
  1118 + fixDanglingReferences();
  1119 + QPDFObjGen og;
  1120 + if (!m->obj_cache.empty()) {
  1121 + og = (*(m->obj_cache.rbegin())).first;
  1122 + }
  1123 + return toS(og.getObj());
  1124 +}
  1125 +
  1126 +std::vector<QPDFObjectHandle>
  1127 +QPDF::getAllObjects()
  1128 +{
  1129 + // After fixDanglingReferences is called, all objects are in the object cache.
  1130 + fixDanglingReferences();
  1131 + std::vector<QPDFObjectHandle> result;
  1132 + for (auto const& iter: m->obj_cache) {
  1133 + result.push_back(newIndirect(iter.first, iter.second.object));
  1134 + }
  1135 + return result;
  1136 +}
  1137 +
  1138 +void
  1139 +QPDF::setLastObjectDescription(std::string const& description, QPDFObjGen og)
  1140 +{
  1141 + m->last_object_description.clear();
  1142 + if (!description.empty()) {
  1143 + m->last_object_description += description;
  1144 + if (og.isIndirect()) {
  1145 + m->last_object_description += ": ";
  1146 + }
  1147 + }
  1148 + if (og.isIndirect()) {
  1149 + m->last_object_description += "object " + og.unparse(' ');
  1150 + }
  1151 +}
  1152 +
  1153 +QPDFObjectHandle
  1154 +QPDF::readTrailer()
  1155 +{
  1156 + qpdf_offset_t offset = m->file->tell();
  1157 + bool empty = false;
  1158 + auto object =
  1159 + QPDFParser(*m->file, "trailer", m->tokenizer, nullptr, this, true).parse(empty, false);
  1160 + if (empty) {
  1161 + // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in
  1162 + // actual PDF files and Adobe Reader appears to ignore them.
  1163 + warn(damagedPDF("trailer", "empty object treated as null"));
  1164 + } else if (object.isDictionary() && readToken(*m->file).isWord("stream")) {
  1165 + warn(damagedPDF("trailer", m->file->tell(), "stream keyword found in trailer"));
  1166 + }
  1167 + // Override last_offset so that it points to the beginning of the object we just read
  1168 + m->file->setLastOffset(offset);
  1169 + return object;
  1170 +}
  1171 +
  1172 +QPDFObjectHandle
  1173 +QPDF::readObject(std::string const& description, QPDFObjGen og)
  1174 +{
  1175 + setLastObjectDescription(description, og);
  1176 + qpdf_offset_t offset = m->file->tell();
  1177 + bool empty = false;
  1178 +
  1179 + StringDecrypter decrypter{this, og};
  1180 + StringDecrypter* decrypter_ptr = m->encp->encrypted ? &decrypter : nullptr;
  1181 + auto object =
  1182 + QPDFParser(*m->file, m->last_object_description, m->tokenizer, decrypter_ptr, this, true)
  1183 + .parse(empty, false);
  1184 + if (empty) {
  1185 + // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in
  1186 + // actual PDF files and Adobe Reader appears to ignore them.
  1187 + warn(damagedPDF(*m->file, m->file->getLastOffset(), "empty object treated as null"));
  1188 + return object;
  1189 + }
  1190 + auto token = readToken(*m->file);
  1191 + if (object.isDictionary() && token.isWord("stream")) {
  1192 + readStream(object, og, offset);
  1193 + token = readToken(*m->file);
  1194 + }
  1195 + if (!token.isWord("endobj")) {
  1196 + QTC::TC("qpdf", "QPDF err expected endobj");
  1197 + warn(damagedPDF("expected endobj"));
  1198 + }
  1199 + return object;
  1200 +}
  1201 +
  1202 +// After reading stream dictionary and stream keyword, read rest of stream.
  1203 +void
  1204 +QPDF::readStream(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset)
  1205 +{
  1206 + validateStreamLineEnd(object, og, offset);
  1207 +
  1208 + // Must get offset before accessing any additional objects since resolving a previously
  1209 + // unresolved indirect object will change file position.
  1210 + qpdf_offset_t stream_offset = m->file->tell();
  1211 + size_t length = 0;
  1212 +
  1213 + try {
  1214 + auto length_obj = object.getKey("/Length");
  1215 +
  1216 + if (!length_obj.isInteger()) {
  1217 + if (length_obj.isNull()) {
  1218 + QTC::TC("qpdf", "QPDF stream without length");
  1219 + throw damagedPDF(offset, "stream dictionary lacks /Length key");
  1220 + }
  1221 + QTC::TC("qpdf", "QPDF stream length not integer");
  1222 + throw damagedPDF(offset, "/Length key in stream dictionary is not an integer");
  1223 + }
  1224 +
  1225 + length = toS(length_obj.getUIntValue());
  1226 + // Seek in two steps to avoid potential integer overflow
  1227 + m->file->seek(stream_offset, SEEK_SET);
  1228 + m->file->seek(toO(length), SEEK_CUR);
  1229 + if (!readToken(*m->file).isWord("endstream")) {
  1230 + QTC::TC("qpdf", "QPDF missing endstream");
  1231 + throw damagedPDF("expected endstream");
  1232 + }
  1233 + } catch (QPDFExc& e) {
  1234 + if (m->attempt_recovery) {
  1235 + warn(e);
  1236 + length = recoverStreamLength(m->file, og, stream_offset);
  1237 + } else {
  1238 + throw;
  1239 + }
  1240 + }
  1241 + object = QPDFObjectHandle(qpdf::Stream(*this, og, object, stream_offset, length));
  1242 +}
  1243 +
  1244 +void
  1245 +QPDF::validateStreamLineEnd(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset)
  1246 +{
  1247 + // The PDF specification states that the word "stream" should be followed by either a carriage
  1248 + // return and a newline or by a newline alone. It specifically disallowed following it by a
  1249 + // carriage return alone since, in that case, there would be no way to tell whether the NL in a
  1250 + // CR NL sequence was part of the stream data. However, some readers, including Adobe reader,
  1251 + // accept a carriage return by itself when followed by a non-newline character, so that's what
  1252 + // we do here. We have also seen files that have extraneous whitespace between the stream
  1253 + // keyword and the newline.
  1254 + while (true) {
  1255 + char ch;
  1256 + if (m->file->read(&ch, 1) == 0) {
  1257 + // A premature EOF here will result in some other problem that will get reported at
  1258 + // another time.
  1259 + return;
  1260 + }
  1261 + if (ch == '\n') {
  1262 + // ready to read stream data
  1263 + QTC::TC("qpdf", "QPDF stream with NL only");
  1264 + return;
  1265 + }
  1266 + if (ch == '\r') {
  1267 + // Read another character
  1268 + if (m->file->read(&ch, 1) != 0) {
  1269 + if (ch == '\n') {
  1270 + // Ready to read stream data
  1271 + QTC::TC("qpdf", "QPDF stream with CRNL");
  1272 + } else {
  1273 + // Treat the \r by itself as the whitespace after endstream and start reading
  1274 + // stream data in spite of not having seen a newline.
  1275 + QTC::TC("qpdf", "QPDF stream with CR only");
  1276 + m->file->unreadCh(ch);
  1277 + warn(damagedPDF(
  1278 + m->file->tell(), "stream keyword followed by carriage return only"));
  1279 + }
  1280 + }
  1281 + return;
  1282 + }
  1283 + if (!util::is_space(ch)) {
  1284 + QTC::TC("qpdf", "QPDF stream without newline");
  1285 + m->file->unreadCh(ch);
  1286 + warn(damagedPDF(
  1287 + m->file->tell(), "stream keyword not followed by proper line terminator"));
  1288 + return;
  1289 + }
  1290 + warn(damagedPDF(m->file->tell(), "stream keyword followed by extraneous whitespace"));
  1291 + }
  1292 +}
  1293 +
  1294 +QPDFObjectHandle
  1295 +QPDF::readObjectInStream(std::shared_ptr<InputSource>& input, int obj)
  1296 +{
  1297 + m->last_object_description.erase(7); // last_object_description starts with "object "
  1298 + m->last_object_description += std::to_string(obj);
  1299 + m->last_object_description += " 0";
  1300 +
  1301 + bool empty = false;
  1302 + auto object = QPDFParser(*input, m->last_object_description, m->tokenizer, nullptr, this, true)
  1303 + .parse(empty, false);
  1304 + if (empty) {
  1305 + // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in
  1306 + // actual PDF files and Adobe Reader appears to ignore them.
  1307 + warn(damagedPDF(*input, input->getLastOffset(), "empty object treated as null"));
  1308 + }
  1309 + return object;
  1310 +}
  1311 +
  1312 +bool
  1313 +QPDF::findEndstream()
  1314 +{
  1315 + // Find endstream or endobj. Position the input at that token.
  1316 + auto t = readToken(*m->file, 20);
  1317 + if (t.isWord("endobj") || t.isWord("endstream")) {
  1318 + m->file->seek(m->file->getLastOffset(), SEEK_SET);
  1319 + return true;
  1320 + }
  1321 + return false;
  1322 +}
  1323 +
  1324 +size_t
  1325 +QPDF::recoverStreamLength(
  1326 + std::shared_ptr<InputSource> input, QPDFObjGen og, qpdf_offset_t stream_offset)
  1327 +{
  1328 + // Try to reconstruct stream length by looking for endstream or endobj
  1329 + warn(damagedPDF(*input, stream_offset, "attempting to recover stream length"));
  1330 +
  1331 + PatternFinder ef(*this, &QPDF::findEndstream);
  1332 + size_t length = 0;
  1333 + if (m->file->findFirst("end", stream_offset, 0, ef)) {
  1334 + length = toS(m->file->tell() - stream_offset);
  1335 + // Reread endstream but, if it was endobj, don't skip that.
  1336 + QPDFTokenizer::Token t = readToken(*m->file);
  1337 + if (t.getValue() == "endobj") {
  1338 + m->file->seek(m->file->getLastOffset(), SEEK_SET);
  1339 + }
  1340 + }
  1341 +
  1342 + if (length) {
  1343 + auto end = stream_offset + toO(length);
  1344 + qpdf_offset_t found_offset = 0;
  1345 + QPDFObjGen found_og;
  1346 +
  1347 + // Make sure this is inside this object
  1348 + for (auto const& [current_og, entry]: m->xref_table) {
  1349 + if (entry.getType() == 1) {
  1350 + qpdf_offset_t obj_offset = entry.getOffset();
  1351 + if (found_offset < obj_offset && obj_offset < end) {
  1352 + found_offset = obj_offset;
  1353 + found_og = current_og;
  1354 + }
  1355 + }
  1356 + }
  1357 + if (!found_offset || found_og == og) {
  1358 + // If we are trying to recover an XRef stream the xref table will not contain and
  1359 + // won't contain any entries, therefore we cannot check the found length. Otherwise we
  1360 + // found endstream\nendobj within the space allowed for this object, so we're probably
  1361 + // in good shape.
  1362 + } else {
  1363 + QTC::TC("qpdf", "QPDF found wrong endstream in recovery");
  1364 + length = 0;
  1365 + }
  1366 + }
  1367 +
  1368 + if (length == 0) {
  1369 + warn(damagedPDF(
  1370 + *input, stream_offset, "unable to recover stream data; treating stream as empty"));
  1371 + } else {
  1372 + warn(damagedPDF(
  1373 + *input, stream_offset, "recovered stream length: " + std::to_string(length)));
  1374 + }
  1375 +
  1376 + QTC::TC("qpdf", "QPDF recovered stream length");
  1377 + return length;
  1378 +}
  1379 +
  1380 +QPDFTokenizer::Token
  1381 +QPDF::readToken(InputSource& input, size_t max_len)
  1382 +{
  1383 + return m->tokenizer.readToken(input, m->last_object_description, true, max_len);
  1384 +}
  1385 +
  1386 +QPDFObjectHandle
  1387 +QPDF::readObjectAtOffset(
  1388 + bool try_recovery,
  1389 + qpdf_offset_t offset,
  1390 + std::string const& description,
  1391 + QPDFObjGen exp_og,
  1392 + QPDFObjGen& og,
  1393 + bool skip_cache_if_in_xref)
  1394 +{
  1395 + bool check_og = true;
  1396 + if (exp_og.getObj() == 0) {
  1397 + // This method uses an expect object ID of 0 to indicate that we don't know or don't care
  1398 + // what the actual object ID is at this offset. This is true when we read the xref stream
  1399 + // and linearization hint streams. In this case, we don't verify the expect object
  1400 + // ID/generation against what was read from the file. There is also no reason to attempt
  1401 + // xref recovery if we get a failure in this case since the read attempt was not triggered
  1402 + // by an xref lookup.
  1403 + check_og = false;
  1404 + try_recovery = false;
  1405 + }
  1406 + setLastObjectDescription(description, exp_og);
  1407 +
  1408 + if (!m->attempt_recovery) {
  1409 + try_recovery = false;
  1410 + }
  1411 +
  1412 + // Special case: if offset is 0, just return null. Some PDF writers, in particular
  1413 + // "Mac OS X 10.7.5 Quartz PDFContext", may store deleted objects in the xref table as
  1414 + // "0000000000 00000 n", which is not correct, but it won't hurt anything for us to ignore
  1415 + // these.
  1416 + if (offset == 0) {
  1417 + QTC::TC("qpdf", "QPDF bogus 0 offset", 0);
  1418 + warn(damagedPDF(0, "object has offset 0"));
  1419 + return QPDFObjectHandle::newNull();
  1420 + }
  1421 +
  1422 + m->file->seek(offset, SEEK_SET);
  1423 + try {
  1424 + QPDFTokenizer::Token tobjid = readToken(*m->file);
  1425 + bool objidok = tobjid.isInteger();
  1426 + QTC::TC("qpdf", "QPDF check objid", objidok ? 1 : 0);
  1427 + if (!objidok) {
  1428 + QTC::TC("qpdf", "QPDF expected n n obj");
  1429 + throw damagedPDF(offset, "expected n n obj");
  1430 + }
  1431 + QPDFTokenizer::Token tgen = readToken(*m->file);
  1432 + bool genok = tgen.isInteger();
  1433 + QTC::TC("qpdf", "QPDF check generation", genok ? 1 : 0);
  1434 + if (!genok) {
  1435 + throw damagedPDF(offset, "expected n n obj");
  1436 + }
  1437 + QPDFTokenizer::Token tobj = readToken(*m->file);
  1438 +
  1439 + bool objok = tobj.isWord("obj");
  1440 + QTC::TC("qpdf", "QPDF check obj", objok ? 1 : 0);
  1441 +
  1442 + if (!objok) {
  1443 + throw damagedPDF(offset, "expected n n obj");
  1444 + }
  1445 + int objid = QUtil::string_to_int(tobjid.getValue().c_str());
  1446 + int generation = QUtil::string_to_int(tgen.getValue().c_str());
  1447 + og = QPDFObjGen(objid, generation);
  1448 + if (objid == 0) {
  1449 + QTC::TC("qpdf", "QPDF object id 0");
  1450 + throw damagedPDF(offset, "object with ID 0");
  1451 + }
  1452 + if (check_og && (exp_og != og)) {
  1453 + QTC::TC("qpdf", "QPDF err wrong objid/generation");
  1454 + QPDFExc e = damagedPDF(offset, "expected " + exp_og.unparse(' ') + " obj");
  1455 + if (try_recovery) {
  1456 + // Will be retried below
  1457 + throw e;
  1458 + } else {
  1459 + // We can try reading the object anyway even if the ID doesn't match.
  1460 + warn(e);
  1461 + }
  1462 + }
  1463 + } catch (QPDFExc& e) {
  1464 + if (try_recovery) {
  1465 + // Try again after reconstructing xref table
  1466 + reconstruct_xref(e);
  1467 + if (m->xref_table.count(exp_og) && (m->xref_table[exp_og].getType() == 1)) {
  1468 + qpdf_offset_t new_offset = m->xref_table[exp_og].getOffset();
  1469 + QPDFObjectHandle result =
  1470 + readObjectAtOffset(false, new_offset, description, exp_og, og, false);
  1471 + QTC::TC("qpdf", "QPDF recovered in readObjectAtOffset");
  1472 + return result;
  1473 + } else {
  1474 + QTC::TC("qpdf", "QPDF object gone after xref reconstruction");
  1475 + warn(damagedPDF(
  1476 + "",
  1477 + 0,
  1478 + ("object " + exp_og.unparse(' ') +
  1479 + " not found in file after regenerating cross reference "
  1480 + "table")));
  1481 + return QPDFObjectHandle::newNull();
  1482 + }
  1483 + } else {
  1484 + throw;
  1485 + }
  1486 + }
  1487 +
  1488 + QPDFObjectHandle oh = readObject(description, og);
  1489 +
  1490 + if (isUnresolved(og)) {
  1491 + // Store the object in the cache here so it gets cached whether we first know the offset or
  1492 + // whether we first know the object ID and generation (in which we case we would get here
  1493 + // through resolve).
  1494 +
  1495 + // Determine the end offset of this object before and after white space. We use these
  1496 + // numbers to validate linearization hint tables. Offsets and lengths of objects may imply
  1497 + // the end of an object to be anywhere between these values.
  1498 + qpdf_offset_t end_before_space = m->file->tell();
  1499 +
  1500 + // skip over spaces
  1501 + while (true) {
  1502 + char ch;
  1503 + if (m->file->read(&ch, 1)) {
  1504 + if (!isspace(static_cast<unsigned char>(ch))) {
  1505 + m->file->seek(-1, SEEK_CUR);
  1506 + break;
  1507 + }
  1508 + } else {
  1509 + throw damagedPDF(m->file->tell(), "EOF after endobj");
  1510 + }
  1511 + }
  1512 + qpdf_offset_t end_after_space = m->file->tell();
  1513 + if (skip_cache_if_in_xref && m->xref_table.count(og)) {
  1514 + // Ordinarily, an object gets read here when resolved through xref table or stream. In
  1515 + // the special case of the xref stream and linearization hint tables, the offset comes
  1516 + // from another source. For the specific case of xref streams, the xref stream is read
  1517 + // and loaded into the object cache very early in parsing. Ordinarily, when a file is
  1518 + // updated by appending, items inserted into the xref table in later updates take
  1519 + // precedence over earlier items. In the special case of reusing the object number
  1520 + // previously used as the xref stream, we have the following order of events:
  1521 + //
  1522 + // * reused object gets loaded into the xref table
  1523 + // * old object is read here while reading xref streams
  1524 + // * original xref entry is ignored (since already in xref table)
  1525 + //
  1526 + // It is the second step that causes a problem. Even though the xref table is correct in
  1527 + // this case, the old object is already in the cache and so effectively prevails over
  1528 + // the reused object. To work around this issue, we have a special case for the xref
  1529 + // stream (via the skip_cache_if_in_xref): if the object is already in the xref stream,
  1530 + // don't cache what we read here.
  1531 + //
  1532 + // It is likely that the same bug may exist for linearization hint tables, but the
  1533 + // existing code uses end_before_space and end_after_space from the cache, so fixing
  1534 + // that would require more significant rework. The chances of a linearization hint
  1535 + // stream being reused seems smaller because the xref stream is probably the highest
  1536 + // object in the file and the linearization hint stream would be some random place in
  1537 + // the middle, so I'm leaving that bug unfixed for now. If the bug were to be fixed, we
  1538 + // could use !check_og in place of skip_cache_if_in_xref.
  1539 + QTC::TC("qpdf", "QPDF skipping cache for known unchecked object");
  1540 + } else {
  1541 + updateCache(og, oh.getObj(), end_before_space, end_after_space);
  1542 + }
  1543 + }
  1544 +
  1545 + return oh;
  1546 +}
  1547 +
  1548 +std::shared_ptr<QPDFObject> const&
  1549 +QPDF::resolve(QPDFObjGen og)
  1550 +{
  1551 + if (!isUnresolved(og)) {
  1552 + return m->obj_cache[og].object;
  1553 + }
  1554 +
  1555 + if (m->resolving.count(og)) {
  1556 + // This can happen if an object references itself directly or indirectly in some key that
  1557 + // has to be resolved during object parsing, such as stream length.
  1558 + QTC::TC("qpdf", "QPDF recursion loop in resolve");
  1559 + warn(damagedPDF("", "loop detected resolving object " + og.unparse(' ')));
  1560 + updateCache(og, QPDFObject::create<QPDF_Null>(), -1, -1);
  1561 + return m->obj_cache[og].object;
  1562 + }
  1563 + ResolveRecorder rr(this, og);
  1564 +
  1565 + if (m->xref_table.count(og) != 0) {
  1566 + QPDFXRefEntry const& entry = m->xref_table[og];
  1567 + try {
  1568 + switch (entry.getType()) {
  1569 + case 1:
  1570 + {
  1571 + qpdf_offset_t offset = entry.getOffset();
  1572 + // Object stored in cache by readObjectAtOffset
  1573 + QPDFObjGen a_og;
  1574 + QPDFObjectHandle oh = readObjectAtOffset(true, offset, "", og, a_og, false);
  1575 + }
  1576 + break;
  1577 +
  1578 + case 2:
  1579 + resolveObjectsInStream(entry.getObjStreamNumber());
  1580 + break;
  1581 +
  1582 + default:
  1583 + throw damagedPDF(
  1584 + "", 0, ("object " + og.unparse('/') + " has unexpected xref entry type"));
  1585 + }
  1586 + } catch (QPDFExc& e) {
  1587 + warn(e);
  1588 + } catch (std::exception& e) {
  1589 + warn(damagedPDF(
  1590 + "", 0, ("object " + og.unparse('/') + ": error reading object: " + e.what())));
  1591 + }
  1592 + }
  1593 +
  1594 + if (isUnresolved(og)) {
  1595 + // PDF spec says unknown objects resolve to the null object.
  1596 + QTC::TC("qpdf", "QPDF resolve failure to null");
  1597 + updateCache(og, QPDFObject::create<QPDF_Null>(), -1, -1);
  1598 + }
  1599 +
  1600 + auto& result(m->obj_cache[og].object);
  1601 + result->setDefaultDescription(this, og);
  1602 + return result;
  1603 +}
  1604 +
  // Read object stream obj_stream_number and cache the objects it contains. Each stream number is
  // processed at most once. Throws a damagedPDF error if the object is not a stream, if /N or
  // /First are not integers, or if the stream header contains a non-integer token. Other problems
  // in the header (self-reference, ids below 1, non-increasing offsets) are reported with warn()
  // and the affected entry is skipped.
  1605 +void
  1606 +QPDF::resolveObjectsInStream(int obj_stream_number)
  1607 +{
  1608 + if (m->resolved_object_streams.count(obj_stream_number)) {
  1609 + return;
  1610 + }
  // Recorded before the stream is read, so a nested call for the same stream number returns at
  // the check above.
  1611 + m->resolved_object_streams.insert(obj_stream_number);
  1612 + // Force resolution of object stream
  1613 + QPDFObjectHandle obj_stream = getObjectByID(obj_stream_number, 0);
  1614 + if (!obj_stream.isStream()) {
  1615 + throw damagedPDF(
  1616 + "supposed object stream " + std::to_string(obj_stream_number) + " is not a stream");
  1617 + }
  1618 +
  1619 + // For linearization data in the object, use the data from the object stream for the objects in
  1620 + // the stream.
  1621 + QPDFObjGen stream_og(obj_stream_number, 0);
  1622 + qpdf_offset_t end_before_space = m->obj_cache[stream_og].end_before_space;
  1623 + qpdf_offset_t end_after_space = m->obj_cache[stream_og].end_after_space;
  1624 +
  1625 + QPDFObjectHandle dict = obj_stream.getDict();
  // A wrong /Type only produces a warning; processing continues.
  1626 + if (!dict.isDictionaryOfType("/ObjStm")) {
  1627 + QTC::TC("qpdf", "QPDF ERR object stream with wrong type");
  1628 + warn(damagedPDF(
  1629 + "supposed object stream " + std::to_string(obj_stream_number) + " has wrong type"));
  1630 + }
  1631 +
  1632 + if (!(dict.getKey("/N").isInteger() && dict.getKey("/First").isInteger())) {
  1633 + throw damagedPDF(
  1634 + ("object stream " + std::to_string(obj_stream_number) + " has incorrect keys"));
  1635 + }
  1636 +
  1637 + int n = dict.getKey("/N").getIntValueAsInt();
  1638 + int first = dict.getKey("/First").getIntValueAsInt();
  1639 +
  // Maps object number -> position in the stream data (header offset plus /First).
  1640 + std::map<int, int> offsets;
  1641 +
  1642 + std::shared_ptr<Buffer> bp = obj_stream.getStreamData(qpdf_dl_specialized);
  1643 + auto input = std::shared_ptr<InputSource>(
  1644 + // line-break
  1645 + new BufferInputSource(
  1646 + (m->file->getName() + " object stream " + std::to_string(obj_stream_number)),
  1647 + bp.get()));
  1648 +
  // Read the stream header: n pairs of tokens (object number, offset). Both tokens of a pair are
  // consumed before any check, so skipping an entry with continue keeps the reader aligned on
  // the next pair.
  1649 + long long last_offset = -1;
  1650 + for (int i = 0; i < n; ++i) {
  1651 + QPDFTokenizer::Token tnum = readToken(*input);
  1652 + QPDFTokenizer::Token toffset = readToken(*input);
  1653 + if (!(tnum.isInteger() && toffset.isInteger())) {
  1654 + throw damagedPDF(
  1655 + *input,
  1656 + m->last_object_description,
  1657 + input->getLastOffset(),
  1658 + "expected integer in object stream header");
  1659 + }
  1660 +
  1661 + int num = QUtil::string_to_int(tnum.getValue().c_str());
  1662 + long long offset = QUtil::string_to_int(toffset.getValue().c_str());
  1663 +
  1664 + if (num == obj_stream_number) {
  1665 + QTC::TC("qpdf", "QPDF ignore self-referential object stream");
  1666 + warn(damagedPDF(
  1667 + *input,
  1668 + m->last_object_description,
  1669 + input->getLastOffset(),
  1670 + "object stream claims to contain itself"));
  1671 + continue;
  1672 + }
  1673 +
  1674 + if (num < 1) {
  1675 + QTC::TC("qpdf", "QPDF object stream contains id < 1");
  1676 + warn(damagedPDF(
  1677 + *input,
  1678 + "object "s + std::to_string(num) + " 0, offset " + std::to_string(offset),
  1679 + 0,
  1680 + "object id is invalid"s));
  1681 + continue;
  1682 + }
  1683 +
  // Offsets must be strictly increasing; since last_offset starts at -1 this also rejects
  // negative offsets. last_offset is only advanced for entries that pass this check.
  1684 + if (offset <= last_offset) {
  1685 + QTC::TC("qpdf", "QPDF object stream offsets not increasing");
  1686 + warn(damagedPDF(
  1687 + *input,
  1688 + "object "s + std::to_string(num) + " 0, offset " + std::to_string(offset),
  1689 + 0,
  1690 + "offset is invalid (must be larger than previous offset " +
  1691 + std::to_string(last_offset) + ")"));
  1692 + continue;
  1693 + }
  1694 + last_offset = offset;
  1695 +
  // Entries whose id exceeds xref_table_max_id are dropped without a warning.
  1696 + if (num > m->xref_table_max_id) {
  1697 + continue;
  1698 + }
  1699 +
  1700 + offsets[num] = toI(offset + first);
  1701 + }
  1702 +
  1703 + // To avoid having to read the object stream multiple times, store all objects that would be
  1704 + // found here in the cache. Remember that some objects stored here might have been overridden
  1705 + // by new objects appended to the file, so it is necessary to recheck the xref table and only
  1706 + // cache what would actually be resolved here.
  1707 + m->last_object_description.clear();
  1708 + m->last_object_description += "object ";
  1709 + for (auto const& iter: offsets) {
  1710 + QPDFObjGen og(iter.first, 0);
  1711 + auto entry = m->xref_table.find(og);
  // Only cache the object if the xref table says it is a type 2 entry living in this very
  // stream.
  1712 + if (entry != m->xref_table.end() && entry->second.getType() == 2 &&
  1713 + entry->second.getObjStreamNumber() == obj_stream_number) {
  1714 + int offset = iter.second;
  1715 + input->seek(offset, SEEK_SET);
  1716 + QPDFObjectHandle oh = readObjectInStream(input, iter.first);
  // Every contained object is cached with the end offsets of the enclosing stream object.
  1717 + updateCache(og, oh.getObj(), end_before_space, end_after_space);
  1718 + } else {
  1719 + QTC::TC("qpdf", "QPDF not caching overridden objstm object");
  1720 + }
  1721 + }
  1722 +}
  1723 +
  1724 +QPDFObjectHandle
  1725 +QPDF::newIndirect(QPDFObjGen og, std::shared_ptr<QPDFObject> const& obj)
  1726 +{
  1727 + obj->setDefaultDescription(this, og);
  1728 + return {obj};
  1729 +}
  1730 +
  1731 +void
  1732 +QPDF::updateCache(
  1733 + QPDFObjGen og,
  1734 + std::shared_ptr<QPDFObject> const& object,
  1735 + qpdf_offset_t end_before_space,
  1736 + qpdf_offset_t end_after_space,
  1737 + bool destroy)
  1738 +{
  1739 + object->setObjGen(this, og);
  1740 + if (isCached(og)) {
  1741 + auto& cache = m->obj_cache[og];
  1742 + object->move_to(cache.object, destroy);
  1743 + cache.end_before_space = end_before_space;
  1744 + cache.end_after_space = end_after_space;
  1745 + } else {
  1746 + m->obj_cache[og] = ObjCache(object, end_before_space, end_after_space);
  1747 + }
  1748 +}
  1749 +
  1750 +bool
  1751 +QPDF::isCached(QPDFObjGen og)
  1752 +{
  1753 + return m->obj_cache.count(og) != 0;
  1754 +}
  1755 +
  1756 +bool
  1757 +QPDF::isUnresolved(QPDFObjGen og)
  1758 +{
  1759 + return !isCached(og) || m->obj_cache[og].object->isUnresolved();
  1760 +}
  1761 +
  1762 +QPDFObjGen
  1763 +QPDF::nextObjGen()
  1764 +{
  1765 + int max_objid = toI(getObjectCount());
  1766 + if (max_objid == std::numeric_limits<int>::max()) {
  1767 + throw std::range_error("max object id is too high to create new objects");
  1768 + }
  1769 + return QPDFObjGen(max_objid + 1, 0);
  1770 +}
  1771 +
  1772 +QPDFObjectHandle
  1773 +QPDF::makeIndirectFromQPDFObject(std::shared_ptr<QPDFObject> const& obj)
  1774 +{
  1775 + QPDFObjGen next{nextObjGen()};
  1776 + m->obj_cache[next] = ObjCache(obj, -1, -1);
  1777 + return newIndirect(next, m->obj_cache[next].object);
  1778 +}
  1779 +
  1780 +QPDFObjectHandle
  1781 +QPDF::makeIndirectObject(QPDFObjectHandle oh)
  1782 +{
  1783 + if (!oh) {
  1784 + throw std::logic_error("attempted to make an uninitialized QPDFObjectHandle indirect");
  1785 + }
  1786 + return makeIndirectFromQPDFObject(oh.getObj());
  1787 +}
  1788 +
  1789 +std::shared_ptr<QPDFObject>
  1790 +QPDF::getObjectForParser(int id, int gen, bool parse_pdf)
  1791 +{
  1792 + // This method is called by the parser and therefore must not resolve any objects.
  1793 + auto og = QPDFObjGen(id, gen);
  1794 + if (auto iter = m->obj_cache.find(og); iter != m->obj_cache.end()) {
  1795 + return iter->second.object;
  1796 + }
  1797 + if (m->xref_table.count(og) || !m->parsed) {
  1798 + return m->obj_cache.insert({og, QPDFObject::create<QPDF_Unresolved>(this, og)})
  1799 + .first->second.object;
  1800 + }
  1801 + if (parse_pdf) {
  1802 + return QPDFObject::create<QPDF_Null>();
  1803 + }
  1804 + return m->obj_cache.insert({og, QPDFObject::create<QPDF_Null>(this, og)}).first->second.object;
  1805 +}
  1806 +
  1807 +std::shared_ptr<QPDFObject>
  1808 +QPDF::getObjectForJSON(int id, int gen)
  1809 +{
  1810 + auto og = QPDFObjGen(id, gen);
  1811 + auto [it, inserted] = m->obj_cache.try_emplace(og);
  1812 + auto& obj = it->second.object;
  1813 + if (inserted) {
  1814 + obj = (m->parsed && !m->xref_table.count(og))
  1815 + ? QPDFObject::create<QPDF_Null>(this, og)
  1816 + : QPDFObject::create<QPDF_Unresolved>(this, og);
  1817 + }
  1818 + return obj;
  1819 +}
  1820 +
  1821 +QPDFObjectHandle
  1822 +QPDF::getObject(QPDFObjGen og)
  1823 +{
  1824 + if (auto it = m->obj_cache.find(og); it != m->obj_cache.end()) {
  1825 + return {it->second.object};
  1826 + } else if (m->parsed && !m->xref_table.count(og)) {
  1827 + return QPDFObject::create<QPDF_Null>();
  1828 + } else {
  1829 + auto result =
  1830 + m->obj_cache.try_emplace(og, QPDFObject::create<QPDF_Unresolved>(this, og), -1, -1);
  1831 + return {result.first->second.object};
  1832 + }
  1833 +}
  1834 +
  1835 +void
  1836 +QPDF::replaceObject(int objid, int generation, QPDFObjectHandle oh)
  1837 +{
  1838 + replaceObject(QPDFObjGen(objid, generation), oh);
  1839 +}
  1840 +
  1841 +void
  1842 +QPDF::replaceObject(QPDFObjGen og, QPDFObjectHandle oh)
  1843 +{
  1844 + if (!oh || (oh.isIndirect() && !(oh.isStream() && oh.getObjGen() == og))) {
  1845 + QTC::TC("qpdf", "QPDF replaceObject called with indirect object");
  1846 + throw std::logic_error("QPDF::replaceObject called with indirect object handle");
  1847 + }
  1848 + updateCache(og, oh.getObj(), -1, -1, false);
  1849 +}
  1850 +
  1851 +void
  1852 +QPDF::removeObject(QPDFObjGen og)
  1853 +{
  1854 + m->xref_table.erase(og);
  1855 + if (auto cached = m->obj_cache.find(og); cached != m->obj_cache.end()) {
  1856 + // Take care of any object handles that may be floating around.
  1857 + cached->second.object->assign_null();
  1858 + cached->second.object->setObjGen(nullptr, QPDFObjGen());
  1859 + m->obj_cache.erase(cached);
  1860 + }
  1861 +}
  1862 +
  1863 +void
  1864 +QPDF::replaceReserved(QPDFObjectHandle reserved, QPDFObjectHandle replacement)
  1865 +{
  1866 + QTC::TC("qpdf", "QPDF replaceReserved");
  1867 + auto tc = reserved.getTypeCode();
  1868 + if (!(tc == ::ot_reserved || tc == ::ot_null)) {
  1869 + throw std::logic_error("replaceReserved called with non-reserved object");
  1870 + }
  1871 + replaceObject(reserved.getObjGen(), replacement);
  1872 +}
  1873 +
  1874 +void
  1875 +QPDF::swapObjects(int objid1, int generation1, int objid2, int generation2)
  1876 +{
  1877 + swapObjects(QPDFObjGen(objid1, generation1), QPDFObjGen(objid2, generation2));
  1878 +}
  1879 +
  1880 +void
  1881 +QPDF::swapObjects(QPDFObjGen og1, QPDFObjGen og2)
  1882 +{
  1883 + // Force objects to be read from the input source if needed, then swap them in the cache.
  1884 + resolve(og1);
  1885 + resolve(og2);
  1886 + m->obj_cache[og1].object->swapWith(m->obj_cache[og2].object);
  1887 +}
  1888 +
  1889 +size_t
  1890 +QPDF::tableSize()
  1891 +{
  1892 + // If obj_cache is dense, accommodate all object in tables,else accommodate only original
  1893 + // objects.
  1894 + auto max_xref = m->xref_table.size() ? m->xref_table.crbegin()->first.getObj() : 0;
  1895 + auto max_obj = m->obj_cache.size() ? m->obj_cache.crbegin()->first.getObj() : 0;
  1896 + auto max_id = std::numeric_limits<int>::max() - 1;
  1897 + if (max_obj >= max_id || max_xref >= max_id) {
  1898 + // Temporary fix. Long-term solution is
  1899 + // - QPDFObjGen to enforce objgens are valid and sensible
  1900 + // - xref table and obj cache to protect against insertion of impossibly large obj ids
  1901 + stopOnError("Impossibly large object id encountered.");
  1902 + }
  1903 + if (max_obj < 1.1 * std::max(toI(m->obj_cache.size()), max_xref)) {
  1904 + return toS(++max_obj);
  1905 + }
  1906 + return toS(++max_xref);
  1907 +}
  1908 +
  1909 +std::vector<QPDFObjGen>
  1910 +QPDF::getCompressibleObjVector()
  1911 +{
  1912 + return getCompressibleObjGens<QPDFObjGen>();
  1913 +}
  1914 +
  1915 +std::vector<bool>
  1916 +QPDF::getCompressibleObjSet()
  1917 +{
  1918 + return getCompressibleObjGens<bool>();
  1919 +}
  1920 +
  // Collect the objects that may be placed in object streams by walking the document from the
  // trailer.
  //
  // T selects the result shape:
  // - QPDFObjGen: the compressible objects are appended in the order they are visited.
  // - bool: a vector of getObjectCount() + 1 flags in which element [object id] is true for each
  //   compressible object.
  // Any other T throws std::logic_error, as does meeting an object id above getObjectCount().
  //
  // Streams, the encryption dictionary, and /Sig dictionaries that have both /ByteRange and
  // /Contents are visited but never reported as compressible.
  1921 +template <typename T>
  1922 +std::vector<T>
  1923 +QPDF::getCompressibleObjGens()
  1924 +{
  1925 + // Return a list of objects that are allowed to be in object streams. Walk through the objects
  1926 + // by traversing the document from the root, including a traversal of the pages tree. This
  1927 + // makes that objects that are on the same page are more likely to be in the same object stream,
  1928 + // which is slightly more efficient, particularly with linearized files. This is better than
  1929 + // iterating through the xref table since it avoids preserving orphaned items.
  1930 +
  1931 + // Exclude encryption dictionary, if any
  1932 + QPDFObjectHandle encryption_dict = m->trailer.getKey("/Encrypt");
  1933 + QPDFObjGen encryption_dict_og = encryption_dict.getObjGen();
  1934 +
  1935 + const size_t max_obj = getObjectCount();
  // visited is indexed by object id - 1.
  1936 + std::vector<bool> visited(max_obj, false);
  // queue is used as a stack: items are taken from the back, and children are pushed in reverse
  // iteration order.
  1937 + std::vector<QPDFObjectHandle> queue;
  1938 + queue.reserve(512);
  1939 + queue.push_back(m->trailer);
  1940 + std::vector<T> result;
  1941 + if constexpr (std::is_same_v<T, QPDFObjGen>) {
  1942 + result.reserve(m->obj_cache.size());
  1943 + } else if constexpr (std::is_same_v<T, bool>) {
  1944 + result.resize(max_obj + 1U, false);
  1945 + } else {
  1946 + throw std::logic_error("Unsupported type in QPDF::getCompressibleObjGens");
  1947 + }
  1948 + while (!queue.empty()) {
  1949 + auto obj = queue.back();
  1950 + queue.pop_back();
  // Only handles with an object id above 0 go through the bookkeeping below; every other handle
  // is just traversed for its children.
  1951 + if (obj.getObjectID() > 0) {
  1952 + QPDFObjGen og = obj.getObjGen();
  1953 + const size_t id = toS(og.getObj() - 1);
  1954 + if (id >= max_obj) {
  1955 + throw std::logic_error(
  1956 + "unexpected object id encountered in getCompressibleObjGens");
  1957 + }
  1958 + if (visited[id]) {
  1959 + QTC::TC("qpdf", "QPDF loop detected traversing objects");
  1960 + continue;
  1961 + }
  1962 +
  1963 + // Check whether this is the current object. If not, remove it (which changes it into a
  1964 + // direct null and therefore stops us from revisiting it) and move on to the next object
  1965 + // in the queue.
  // NOTE(review): this relies on the cache ordering its keys by object id first, so that the
  // entry after og having the same id means a later generation exists. Confirm against
  // QPDFObjGen's comparison.
  1966 + auto upper = m->obj_cache.upper_bound(og);
  1967 + if (upper != m->obj_cache.end() && upper->first.getObj() == og.getObj()) {
  1968 + removeObject(og);
  1969 + continue;
  1970 + }
  1971 +
  1972 + visited[id] = true;
  1973 +
  1974 + if (og == encryption_dict_og) {
  1975 + QTC::TC("qpdf", "QPDF exclude encryption dictionary");
  1976 + } else if (!(obj.isStream() ||
  1977 + (obj.isDictionaryOfType("/Sig") && obj.hasKey("/ByteRange") &&
  1978 + obj.hasKey("/Contents")))) {
  1979 + if constexpr (std::is_same_v<T, QPDFObjGen>) {
  1980 + result.push_back(og);
  1981 + } else if constexpr (std::is_same_v<T, bool>) {
  // id is object id - 1, so this sets the flag at index [object id].
  1982 + result[id + 1U] = true;
  1983 + }
  1984 + }
  1985 + }
  // Queue the children: the values of a stream dictionary (except /Length), the values of a
  // dictionary, or the items of an array. Null dictionary values are not queued.
  1986 + if (obj.isStream()) {
  1987 + auto dict = obj.getDict().as_dictionary();
  1988 + auto end = dict.crend();
  1989 + for (auto iter = dict.crbegin(); iter != end; ++iter) {
  1990 + std::string const& key = iter->first;
  1991 + QPDFObjectHandle const& value = iter->second;
  1992 + if (!value.null()) {
  1993 + if (key == "/Length") {
  1994 + // omit stream lengths
  1995 + if (value.isIndirect()) {
  1996 + QTC::TC("qpdf", "QPDF exclude indirect length");
  1997 + }
  1998 + } else {
  1999 + queue.emplace_back(value);
  2000 + }
  2001 + }
  2002 + }
  2003 + } else if (obj.isDictionary()) {
  2004 + auto dict = obj.as_dictionary();
  2005 + auto end = dict.crend();
  2006 + for (auto iter = dict.crbegin(); iter != end; ++iter) {
  2007 + if (!iter->second.null()) {
  2008 + queue.emplace_back(iter->second);
  2009 + }
  2010 + }
  2011 + } else if (auto items = obj.as_array()) {
  2012 + queue.insert(queue.end(), items.crbegin(), items.crend());
  2013 + }
  2014 + }
  2015 +
  2016 + return result;
  2017 +}
manual/release-notes.rst
@@ -21,16 +21,15 @@ more detail. @@ -21,16 +21,15 @@ more detail.
21 integer object. Previously the method returned false if the first 21 integer object. Previously the method returned false if the first
22 dictionary object was not a linearization parameter dictionary. 22 dictionary object was not a linearization parameter dictionary.
23 23
24 -.. _r12-0-0:  
25 -  
26 -12.0.1: not yet released  
27 - - Other enhancements 24 + - Other enhancements
28 25
29 - - There have been further enhancements to how files with damaged xref  
30 - tables are recovered. 26 + - There have been further enhancements to how files with damaged xref
  27 + tables are recovered.
31 28
32 .. cSpell:ignore substract 29 .. cSpell:ignore substract
33 30
  31 +.. _r12-0-0:
  32 +
34 12.0.0: March 9, 2025 33 12.0.0: March 9, 2025
35 - API breaking changes 34 - API breaking changes
36 35