Commit ef891178f83e46096a3a66bd6ba71b3a987dbc57

Authored by m-holger
Committed by GitHub
2 parents 531f6877 740dd509

Merge pull request #1338 from m-holger/qpdf-11-revert

Exclude #1272, #1289, #1297 and #1301 from qpdf 11.10
.idea/codeStyles/Project.xml
@@ -3,7 +3,7 @@ @@ -3,7 +3,7 @@
3 <RiderCodeStyleSettings> 3 <RiderCodeStyleSettings>
4 <option name="/Default/CodeStyle/CodeFormatting/CppClangFormat/EnableClangFormatSupport/@EntryValue" value="true" type="bool" /> 4 <option name="/Default/CodeStyle/CodeFormatting/CppClangFormat/EnableClangFormatSupport/@EntryValue" value="true" type="bool" />
5 </RiderCodeStyleSettings> 5 </RiderCodeStyleSettings>
6 - <SqlCodeStyleSettings version="6"> 6 + <SqlCodeStyleSettings version="7">
7 <option name="KEYWORD_CASE" value="2" /> 7 <option name="KEYWORD_CASE" value="2" />
8 </SqlCodeStyleSettings> 8 </SqlCodeStyleSettings>
9 <clangFormatSettings> 9 <clangFormatSettings>
.readthedocs.yaml
@@ -2,7 +2,6 @@ @@ -2,7 +2,6 @@
2 # Read the Docs configuration file 2 # Read the Docs configuration file
3 # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
4 4
5 -# Required  
6 version: 2 5 version: 2
7 6
8 build: 7 build:
@@ -11,13 +10,11 @@ build: @@ -11,13 +10,11 @@ build:
11 python: "3.11" 10 python: "3.11"
12 11
13 sphinx: 12 sphinx:
14 - configuration: manual/conf.py 13 + configuration: manual/conf.py
  14 + fail_on_warning: true
15 15
16 formats: all 16 formats: all
17 17
18 -sphinx:  
19 - fail_on_warning: true  
20 -  
21 python: 18 python:
22 install: 19 install:
23 - requirements: manual/requirements.txt 20 - requirements: manual/requirements.txt
fuzz/CMakeLists.txt
@@ -142,9 +142,6 @@ set(CORPUS_OTHER @@ -142,9 +142,6 @@ set(CORPUS_OTHER
142 70306b.fuzz 142 70306b.fuzz
143 71624.fuzz 143 71624.fuzz
144 71689.fuzz 144 71689.fuzz
145 - 99999a.fuzz  
146 - 99999b.fuzz  
147 - 99999c.fuzz  
148 99999d.fuzz 145 99999d.fuzz
149 99999e.fuzz 146 99999e.fuzz
150 369662293.fuzz 147 369662293.fuzz
fuzz/qpdf_extra/99999a.fuzz deleted
1 -%PDF-1.5  
2 -%€€€€  
3 -1 0 obj  
4 -<<  
5 - /Type /Catalog  
6 - /Pages 2 0 R  
7 ->>  
8 -endobj  
9 -2 0 obj  
10 -<<  
11 - /Count 6 Ri  
12 - 0K/ds [3 0 R]  
13 - /Type /Pages  
14 ->>  
15 -endobj  
16 -3 0 obj  
17 -<<  
18 - /Resources <<  
19 - /Font <<  
20 - /F1 5 0 R  
21 - >>  
22 - >>  
23 - /MediaBox [0 0 795 842]  
24 - /Parent 2 0 R  
25 - /Contents 4 0 R  
26 - /Type /Page  
27 -=>  
28 -endobj  
29 -4 0 obj  
30 -<<444444444444444444444444 1 Tr /F1 30 Tf 350 750 Td (foobar) Tj ET  
31 -endstream  
32 -endobj  
33 -5 0 obj  
34 -<<  
35 - /Name /F1  
36 - /BaseFont /Helvetica  
37 - /Type /Font  
38 - /Subtype /Type1  
39 ->>  
40 -e„dobj  
41 -6 0 obj  
42 -<< /Length 6 0 R >>  
43 -stre444444444444444444444444444444<<>>  
44 -endobj  
45 -xref  
46 -0 8  
47 -0000000000 65535 f  
48 -0000000015 00000 n  
49 -0000000066 00000 n  
50 -0000000130 00000 n  
51 -0000000269 00000 n  
52 -0000000362 00000 n  
53 -000000ÎËËÉßÏÏÏ00 n  
54 -0000000500 00000 n  
55 -trailer  
56 -<<  
57 - /Size 713115528178535  
58 - /Root 1 0 R  
59 - /Info 7 0 R  
60 ->>  
61 -startxref  
62 -520  
63 -%%EOF  
64 \ No newline at end of file 0 \ No newline at end of file
fuzz/qpdf_extra/99999b.fuzz deleted
No preview for this file type
fuzz/qpdf_extra/99999c.fuzz deleted
No preview for this file type
fuzz/qtest/fuzz.test
@@ -11,7 +11,7 @@ my $td = new TestDriver('fuzz'); @@ -11,7 +11,7 @@ my $td = new TestDriver('fuzz');
11 11
12 my $qpdf_corpus = $ENV{'QPDF_FUZZ_CORPUS'} || die "must set QPDF_FUZZ_CORPUS"; 12 my $qpdf_corpus = $ENV{'QPDF_FUZZ_CORPUS'} || die "must set QPDF_FUZZ_CORPUS";
13 13
14 -my $n_qpdf_files = 87; # increment when adding new files 14 +my $n_qpdf_files = 84; # increment when adding new files
15 15
16 my @fuzzers = ( 16 my @fuzzers = (
17 ['ascii85' => 1], 17 ['ascii85' => 1],
include/qpdf/QPDF.hh
@@ -391,7 +391,7 @@ class QPDF @@ -391,7 +391,7 @@ class QPDF
391 void replaceObject(int objid, int generation, QPDFObjectHandle); 391 void replaceObject(int objid, int generation, QPDFObjectHandle);
392 392
393 // Swap two objects given by ID. Prior to qpdf 10.2.1, existing QPDFObjectHandle instances that 393 // Swap two objects given by ID. Prior to qpdf 10.2.1, existing QPDFObjectHandle instances that
394 - // reference the objects did not notice the swap, but this was fixed in 10.2.1. 394 + // reference the objects did not notice the swap, but this was fixed in 10.2.1.
395 QPDF_DLL 395 QPDF_DLL
396 void swapObjects(QPDFObjGen const& og1, QPDFObjGen const& og2); 396 void swapObjects(QPDFObjGen const& og1, QPDFObjGen const& og2);
397 QPDF_DLL 397 QPDF_DLL
@@ -645,7 +645,7 @@ class QPDF @@ -645,7 +645,7 @@ class QPDF
645 QPDF_DLL 645 QPDF_DLL
646 void fixDanglingReferences(bool force = false); 646 void fixDanglingReferences(bool force = false);
647 647
648 - // Return the approximate number of indirect objects. It is approximate because not all objects 648 + // Return the approximate number of indirect objects. It is approximate because not all objects
649 // in the file are preserved in all cases, and gaps in object numbering are not preserved. 649 // in the file are preserved in all cases, and gaps in object numbering are not preserved.
650 QPDF_DLL 650 QPDF_DLL
651 size_t getObjectCount(); 651 size_t getObjectCount();
@@ -725,15 +725,165 @@ class QPDF @@ -725,15 +725,165 @@ class QPDF
725 void removePage(QPDFObjectHandle page); 725 void removePage(QPDFObjectHandle page);
726 // End legacy page helpers 726 // End legacy page helpers
727 727
728 - // End of the public API. The following classes and methods are for qpdf internal use only. 728 + // Writer class is restricted to QPDFWriter so that only it can call certain methods.
  729 + class Writer
  730 + {
  731 + friend class QPDFWriter;
  732 +
  733 + private:
  734 + static void
  735 + optimize(
  736 + QPDF& qpdf,
  737 + QPDFWriter::ObjTable const& obj,
  738 + std::function<int(QPDFObjectHandle&)> skip_stream_parameters)
  739 + {
  740 + return qpdf.optimize(obj, skip_stream_parameters);
  741 + }
  742 +
  743 + static void
  744 + getLinearizedParts(
  745 + QPDF& qpdf,
  746 + QPDFWriter::ObjTable const& obj,
  747 + std::vector<QPDFObjectHandle>& part4,
  748 + std::vector<QPDFObjectHandle>& part6,
  749 + std::vector<QPDFObjectHandle>& part7,
  750 + std::vector<QPDFObjectHandle>& part8,
  751 + std::vector<QPDFObjectHandle>& part9)
  752 + {
  753 + qpdf.getLinearizedParts(obj, part4, part6, part7, part8, part9);
  754 + }
  755 +
  756 + static void
  757 + generateHintStream(
  758 + QPDF& qpdf,
  759 + QPDFWriter::NewObjTable const& new_obj,
  760 + QPDFWriter::ObjTable const& obj,
  761 + std::shared_ptr<Buffer>& hint_stream,
  762 + int& S,
  763 + int& O,
  764 + bool compressed)
  765 + {
  766 + return qpdf.generateHintStream(new_obj, obj, hint_stream, S, O, compressed);
  767 + }
  768 +
  769 + static std::vector<QPDFObjGen>
  770 + getCompressibleObjGens(QPDF& qpdf)
  771 + {
  772 + return qpdf.getCompressibleObjVector();
  773 + }
  774 +
  775 + static std::vector<bool>
  776 + getCompressibleObjSet(QPDF& qpdf)
  777 + {
  778 + return qpdf.getCompressibleObjSet();
  779 + }
  780 +
  781 + static std::map<QPDFObjGen, QPDFXRefEntry> const&
  782 + getXRefTable(QPDF& qpdf)
  783 + {
  784 + return qpdf.getXRefTableInternal();
  785 + }
  786 +
  787 + static size_t
  788 + tableSize(QPDF& qpdf)
  789 + {
  790 + return qpdf.tableSize();
  791 + }
  792 + };
729 793
730 - class Writer;  
731 - class Resolver;  
732 - class StreamCopier;  
733 - class Objects;  
734 - class ParseGuard;  
735 - class Pipe;  
736 - class JobSetter; 794 + // The Resolver class is restricted to QPDFObject so that only it can resolve indirect
  795 + // references.
  796 + class Resolver
  797 + {
  798 + friend class QPDFObject;
  799 + friend class QPDF_Unresolved;
  800 +
  801 + private:
  802 + static QPDFObject*
  803 + resolved(QPDF* qpdf, QPDFObjGen og)
  804 + {
  805 + return qpdf->resolve(og);
  806 + }
  807 + };
  808 +
  809 + // StreamCopier class is restricted to QPDFObjectHandle so it can copy stream data.
  810 + class StreamCopier
  811 + {
  812 + friend class QPDFObjectHandle;
  813 +
  814 + private:
  815 + static void
  816 + copyStreamData(QPDF* qpdf, QPDFObjectHandle const& dest, QPDFObjectHandle const& src)
  817 + {
  818 + qpdf->copyStreamData(dest, src);
  819 + }
  820 + };
  821 +
  822 + // The ParseGuard class allows QPDFParser to detect re-entrant parsing. It also provides
  823 + // special access to allow the parser to create unresolved objects and dangling references.
  824 + class ParseGuard
  825 + {
  826 + friend class QPDFParser;
  827 +
  828 + private:
  829 + ParseGuard(QPDF* qpdf) :
  830 + qpdf(qpdf)
  831 + {
  832 + if (qpdf) {
  833 + qpdf->inParse(true);
  834 + }
  835 + }
  836 +
  837 + static std::shared_ptr<QPDFObject>
  838 + getObject(QPDF* qpdf, int id, int gen, bool parse_pdf)
  839 + {
  840 + return qpdf->getObjectForParser(id, gen, parse_pdf);
  841 + }
  842 +
  843 + ~ParseGuard()
  844 + {
  845 + if (qpdf) {
  846 + qpdf->inParse(false);
  847 + }
  848 + }
  849 + QPDF* qpdf;
  850 + };
  851 +
  852 + // Pipe class is restricted to QPDF_Stream.
  853 + class Pipe
  854 + {
  855 + friend class QPDF_Stream;
  856 +
  857 + private:
  858 + static bool
  859 + pipeStreamData(
  860 + QPDF* qpdf,
  861 + QPDFObjGen const& og,
  862 + qpdf_offset_t offset,
  863 + size_t length,
  864 + QPDFObjectHandle dict,
  865 + Pipeline* pipeline,
  866 + bool suppress_warnings,
  867 + bool will_retry)
  868 + {
  869 + return qpdf->pipeStreamData(
  870 + og, offset, length, dict, pipeline, suppress_warnings, will_retry);
  871 + }
  872 + };
  873 +
  874 + // JobSetter class is restricted to QPDFJob.
  875 + class JobSetter
  876 + {
  877 + friend class QPDFJob;
  878 +
  879 + private:
  880 + // Enable enhanced warnings for pdf file checking.
  881 + static void
  882 + setCheckMode(QPDF& qpdf, bool val)
  883 + {
  884 + qpdf.m->check_mode = val;
  885 + }
  886 + };
737 887
738 // For testing only -- do not add to DLL 888 // For testing only -- do not add to DLL
739 static bool test_json_validators(); 889 static bool test_json_validators();
@@ -748,23 +898,194 @@ class QPDF @@ -748,23 +898,194 @@ class QPDF
748 898
749 static std::string const qpdf_version; 899 static std::string const qpdf_version;
750 900
751 - class ObjCopier;  
752 - class EncryptionParameters;  
753 - class ForeignStreamData;  
754 - class CopiedStreamDataProvider;  
755 - class StringDecrypter;  
756 - class ResolveRecorder; 901 + class ObjCache
  902 + {
  903 + public:
  904 + ObjCache() :
  905 + end_before_space(0),
  906 + end_after_space(0)
  907 + {
  908 + }
  909 + ObjCache(
  910 + std::shared_ptr<QPDFObject> object,
  911 + qpdf_offset_t end_before_space = 0,
  912 + qpdf_offset_t end_after_space = 0) :
  913 + object(object),
  914 + end_before_space(end_before_space),
  915 + end_after_space(end_after_space)
  916 + {
  917 + }
  918 +
  919 + std::shared_ptr<QPDFObject> object;
  920 + qpdf_offset_t end_before_space;
  921 + qpdf_offset_t end_after_space;
  922 + };
  923 +
  924 + class ObjCopier
  925 + {
  926 + public:
  927 + std::map<QPDFObjGen, QPDFObjectHandle> object_map;
  928 + std::vector<QPDFObjectHandle> to_copy;
  929 + QPDFObjGen::set visiting;
  930 + };
  931 +
  932 + class EncryptionParameters
  933 + {
  934 + friend class QPDF;
  935 +
  936 + public:
  937 + EncryptionParameters();
  938 +
  939 + private:
  940 + bool encrypted;
  941 + bool encryption_initialized;
  942 + int encryption_V;
  943 + int encryption_R;
  944 + bool encrypt_metadata;
  945 + std::map<std::string, encryption_method_e> crypt_filters;
  946 + encryption_method_e cf_stream;
  947 + encryption_method_e cf_string;
  948 + encryption_method_e cf_file;
  949 + std::string provided_password;
  950 + std::string user_password;
  951 + std::string encryption_key;
  952 + std::string cached_object_encryption_key;
  953 + QPDFObjGen cached_key_og;
  954 + bool user_password_matched;
  955 + bool owner_password_matched;
  956 + };
  957 +
  958 + class ForeignStreamData
  959 + {
  960 + friend class QPDF;
  961 +
  962 + public:
  963 + ForeignStreamData(
  964 + std::shared_ptr<EncryptionParameters> encp,
  965 + std::shared_ptr<InputSource> file,
  966 + QPDFObjGen const& foreign_og,
  967 + qpdf_offset_t offset,
  968 + size_t length,
  969 + QPDFObjectHandle local_dict);
  970 +
  971 + private:
  972 + std::shared_ptr<EncryptionParameters> encp;
  973 + std::shared_ptr<InputSource> file;
  974 + QPDFObjGen foreign_og;
  975 + qpdf_offset_t offset;
  976 + size_t length;
  977 + QPDFObjectHandle local_dict;
  978 + };
  979 +
  980 + class CopiedStreamDataProvider: public QPDFObjectHandle::StreamDataProvider
  981 + {
  982 + public:
  983 + CopiedStreamDataProvider(QPDF& destination_qpdf);
  984 + ~CopiedStreamDataProvider() override = default;
  985 + bool provideStreamData(
  986 + QPDFObjGen const& og,
  987 + Pipeline* pipeline,
  988 + bool suppress_warnings,
  989 + bool will_retry) override;
  990 + void registerForeignStream(QPDFObjGen const& local_og, QPDFObjectHandle foreign_stream);
  991 + void registerForeignStream(QPDFObjGen const& local_og, std::shared_ptr<ForeignStreamData>);
  992 +
  993 + private:
  994 + QPDF& destination_qpdf;
  995 + std::map<QPDFObjGen, QPDFObjectHandle> foreign_streams;
  996 + std::map<QPDFObjGen, std::shared_ptr<ForeignStreamData>> foreign_stream_data;
  997 + };
  998 +
  999 + class StringDecrypter: public QPDFObjectHandle::StringDecrypter
  1000 + {
  1001 + friend class QPDF;
  1002 +
  1003 + public:
  1004 + StringDecrypter(QPDF* qpdf, QPDFObjGen const& og);
  1005 + ~StringDecrypter() override = default;
  1006 + void decryptString(std::string& val) override;
  1007 +
  1008 + private:
  1009 + QPDF* qpdf;
  1010 + QPDFObjGen og;
  1011 + };
  1012 +
  1013 + class ResolveRecorder
  1014 + {
  1015 + public:
  1016 + ResolveRecorder(QPDF* qpdf, QPDFObjGen const& og) :
  1017 + qpdf(qpdf),
  1018 + iter(qpdf->m->resolving.insert(og).first)
  1019 + {
  1020 + }
  1021 + virtual ~ResolveRecorder()
  1022 + {
  1023 + this->qpdf->m->resolving.erase(iter);
  1024 + }
  1025 +
  1026 + private:
  1027 + QPDF* qpdf;
  1028 + std::set<QPDFObjGen>::const_iterator iter;
  1029 + };
  1030 +
757 class JSONReactor; 1031 class JSONReactor;
758 1032
759 - inline Objects& objects() noexcept;  
760 - inline Objects const& objects() const noexcept;  
761 void parse(char const* password); 1033 void parse(char const* password);
762 void inParse(bool); 1034 void inParse(bool);
  1035 + void setTrailer(QPDFObjectHandle obj);
  1036 + void read_xref(qpdf_offset_t offset);
  1037 + bool resolveXRefTable();
  1038 + void reconstruct_xref(QPDFExc& e);
  1039 + bool parse_xrefFirst(std::string const& line, int& obj, int& num, int& bytes);
  1040 + bool read_xrefEntry(qpdf_offset_t& f1, int& f2, char& type);
  1041 + bool read_bad_xrefEntry(qpdf_offset_t& f1, int& f2, char& type);
  1042 + qpdf_offset_t read_xrefTable(qpdf_offset_t offset);
  1043 + qpdf_offset_t read_xrefStream(qpdf_offset_t offset);
  1044 + qpdf_offset_t processXRefStream(qpdf_offset_t offset, QPDFObjectHandle& xref_stream);
  1045 + std::pair<int, std::array<int, 3>>
  1046 + processXRefW(QPDFObjectHandle& dict, std::function<QPDFExc(std::string_view)> damaged);
  1047 + int processXRefSize(
  1048 + QPDFObjectHandle& dict, int entry_size, std::function<QPDFExc(std::string_view)> damaged);
  1049 + std::pair<int, std::vector<std::pair<int, int>>> processXRefIndex(
  1050 + QPDFObjectHandle& dict,
  1051 + int max_num_entries,
  1052 + std::function<QPDFExc(std::string_view)> damaged);
  1053 + void insertXrefEntry(int obj, int f0, qpdf_offset_t f1, int f2);
  1054 + void insertFreeXrefEntry(QPDFObjGen);
  1055 + void insertReconstructedXrefEntry(int obj, qpdf_offset_t f1, int f2);
763 void setLastObjectDescription(std::string const& description, QPDFObjGen const& og); 1056 void setLastObjectDescription(std::string const& description, QPDFObjGen const& og);
  1057 + QPDFObjectHandle readTrailer();
  1058 + QPDFObjectHandle readObject(std::string const& description, QPDFObjGen og);
  1059 + void readStream(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset);
  1060 + void validateStreamLineEnd(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset);
  1061 + QPDFObjectHandle readObjectInStream(std::shared_ptr<InputSource>& input, int obj);
  1062 + size_t recoverStreamLength(
  1063 + std::shared_ptr<InputSource> input, QPDFObjGen const& og, qpdf_offset_t stream_offset);
764 QPDFTokenizer::Token readToken(InputSource&, size_t max_len = 0); 1064 QPDFTokenizer::Token readToken(InputSource&, size_t max_len = 0);
765 1065
  1066 + QPDFObjectHandle readObjectAtOffset(
  1067 + bool attempt_recovery,
  1068 + qpdf_offset_t offset,
  1069 + std::string const& description,
  1070 + QPDFObjGen exp_og,
  1071 + QPDFObjGen& og,
  1072 + bool skip_cache_if_in_xref);
  1073 + QPDFObject* resolve(QPDFObjGen og);
  1074 + void resolveObjectsInStream(int obj_stream_number);
766 void stopOnError(std::string const& message); 1075 void stopOnError(std::string const& message);
  1076 + QPDFObjGen nextObjGen();
767 QPDFObjectHandle newIndirect(QPDFObjGen const&, std::shared_ptr<QPDFObject> const&); 1077 QPDFObjectHandle newIndirect(QPDFObjGen const&, std::shared_ptr<QPDFObject> const&);
  1078 + QPDFObjectHandle makeIndirectFromQPDFObject(std::shared_ptr<QPDFObject> const& obj);
  1079 + bool isCached(QPDFObjGen const& og);
  1080 + bool isUnresolved(QPDFObjGen const& og);
  1081 + std::shared_ptr<QPDFObject> getObjectForParser(int id, int gen, bool parse_pdf);
  1082 + std::shared_ptr<QPDFObject> getObjectForJSON(int id, int gen);
  1083 + void removeObject(QPDFObjGen og);
  1084 + void updateCache(
  1085 + QPDFObjGen const& og,
  1086 + std::shared_ptr<QPDFObject> const& object,
  1087 + qpdf_offset_t end_before_space,
  1088 + qpdf_offset_t end_after_space);
768 static QPDFExc damagedPDF( 1089 static QPDFExc damagedPDF(
769 InputSource& input, 1090 InputSource& input,
770 std::string const& object, 1091 std::string const& object,
@@ -801,6 +1122,7 @@ class QPDF @@ -801,6 +1122,7 @@ class QPDF
801 1122
802 // For QPDFWriter: 1123 // For QPDFWriter:
803 1124
  1125 + std::map<QPDFObjGen, QPDFXRefEntry> const& getXRefTableInternal();
804 template <typename T> 1126 template <typename T>
805 void optimize_internal( 1127 void optimize_internal(
806 T const& object_stream_data, 1128 T const& object_stream_data,
@@ -809,7 +1131,7 @@ class QPDF @@ -809,7 +1131,7 @@ class QPDF
809 void optimize( 1131 void optimize(
810 QPDFWriter::ObjTable const& obj, 1132 QPDFWriter::ObjTable const& obj,
811 std::function<int(QPDFObjectHandle&)> skip_stream_parameters); 1133 std::function<int(QPDFObjectHandle&)> skip_stream_parameters);
812 - void optimize(Objects const& obj); 1134 + size_t tableSize();
813 1135
814 // Get lists of all objects in order according to the part of a linearized file that they belong 1136 // Get lists of all objects in order according to the part of a linearized file that they belong
815 // to. 1137 // to.
@@ -829,6 +1151,12 @@ class QPDF @@ -829,6 +1151,12 @@ class QPDF
829 int& O, 1151 int& O,
830 bool compressed); 1152 bool compressed);
831 1153
  1154 + // Get a list of objects that would be permitted in an object stream.
  1155 + template <typename T>
  1156 + std::vector<T> getCompressibleObjGens();
  1157 + std::vector<QPDFObjGen> getCompressibleObjVector();
  1158 + std::vector<bool> getCompressibleObjSet();
  1159 +
832 // methods to support page handling 1160 // methods to support page handling
833 1161
834 void getAllPagesInternal( 1162 void getAllPagesInternal(
@@ -868,19 +1196,200 @@ class QPDF @@ -868,19 +1196,200 @@ class QPDF
868 replaceForeignIndirectObjects(QPDFObjectHandle foreign, ObjCopier& obj_copier, bool top); 1196 replaceForeignIndirectObjects(QPDFObjectHandle foreign, ObjCopier& obj_copier, bool top);
869 void copyStreamData(QPDFObjectHandle dest_stream, QPDFObjectHandle src_stream); 1197 void copyStreamData(QPDFObjectHandle dest_stream, QPDFObjectHandle src_stream);
870 1198
871 - struct HPageOffsetEntry;  
872 - struct HPageOffset;  
873 - struct HSharedObjectEntry;  
874 - struct HSharedObject;  
875 - struct HGeneric;  
876 - struct LinParameters;  
877 - struct CHPageOffsetEntry;  
878 - struct CHPageOffset;  
879 - struct CHSharedObjectEntry;  
880 - struct CHSharedObject;  
881 - class ObjUser;  
882 - struct UpdateObjectMapsFrame;  
883 - class PatternFinder; 1199 + // Linearization Hint table structures.
  1200 + // Naming conventions:
  1201 +
  1202 + // HSomething is the Something Hint Table or table header
  1203 + // HSomethingEntry is an entry in the Something table
  1204 +
  1205 + // delta_something + min_something = something
  1206 + // nbits_something = number of bits required for something
  1207 +
  1208 + // something_offset is the pre-adjusted offset in the file. If >=
  1209 + // H0_offset, H0_length must be added to get an actual file
  1210 + // offset.
  1211 +
  1212 + // PDF 1.4: Table F.4
  1213 + struct HPageOffsetEntry
  1214 + {
  1215 + int delta_nobjects{0}; // 1
  1216 + qpdf_offset_t delta_page_length{0}; // 2
  1217 + // vectors' sizes = nshared_objects
  1218 + int nshared_objects{0}; // 3
  1219 + std::vector<int> shared_identifiers; // 4
  1220 + std::vector<int> shared_numerators; // 5
  1221 + qpdf_offset_t delta_content_offset{0}; // 6
  1222 + qpdf_offset_t delta_content_length{0}; // 7
  1223 + };
  1224 +
  1225 + // PDF 1.4: Table F.3
  1226 + struct HPageOffset
  1227 + {
  1228 + int min_nobjects{0}; // 1
  1229 + qpdf_offset_t first_page_offset{0}; // 2
  1230 + int nbits_delta_nobjects{0}; // 3
  1231 + int min_page_length{0}; // 4
  1232 + int nbits_delta_page_length{0}; // 5
  1233 + int min_content_offset{0}; // 6
  1234 + int nbits_delta_content_offset{0}; // 7
  1235 + int min_content_length{0}; // 8
  1236 + int nbits_delta_content_length{0}; // 9
  1237 + int nbits_nshared_objects{0}; // 10
  1238 + int nbits_shared_identifier{0}; // 11
  1239 + int nbits_shared_numerator{0}; // 12
  1240 + int shared_denominator{0}; // 13
  1241 + // vector size is npages
  1242 + std::vector<HPageOffsetEntry> entries;
  1243 + };
  1244 +
  1245 + // PDF 1.4: Table F.6
  1246 + struct HSharedObjectEntry
  1247 + {
  1248 + // Item 3 is a 128-bit signature (unsupported by Acrobat)
  1249 + int delta_group_length{0}; // 1
  1250 + int signature_present{0}; // 2 -- always 0
  1251 + int nobjects_minus_one{0}; // 4 -- always 0
  1252 + };
  1253 +
  1254 + // PDF 1.4: Table F.5
  1255 + struct HSharedObject
  1256 + {
  1257 + int first_shared_obj{0}; // 1
  1258 + qpdf_offset_t first_shared_offset{0}; // 2
  1259 + int nshared_first_page{0}; // 3
  1260 + int nshared_total{0}; // 4
  1261 + int nbits_nobjects{0}; // 5
  1262 + int min_group_length{0}; // 6
  1263 + int nbits_delta_group_length{0}; // 7
  1264 + // vector size is nshared_total
  1265 + std::vector<HSharedObjectEntry> entries;
  1266 + };
  1267 +
  1268 + // PDF 1.4: Table F.9
  1269 + struct HGeneric
  1270 + {
  1271 + int first_object{0}; // 1
  1272 + qpdf_offset_t first_object_offset{0}; // 2
  1273 + int nobjects{0}; // 3
  1274 + int group_length{0}; // 4
  1275 + };
  1276 +
  1277 + // Other linearization data structures
  1278 +
  1279 + // Initialized from Linearization Parameter dictionary
  1280 + struct LinParameters
  1281 + {
  1282 + qpdf_offset_t file_size{0}; // /L
  1283 + int first_page_object{0}; // /O
  1284 + qpdf_offset_t first_page_end{0}; // /E
  1285 + int npages{0}; // /N
  1286 + qpdf_offset_t xref_zero_offset{0}; // /T
  1287 + int first_page{0}; // /P
  1288 + qpdf_offset_t H_offset{0}; // offset of primary hint stream
  1289 + qpdf_offset_t H_length{0}; // length of primary hint stream
  1290 + };
  1291 +
  1292 + // Computed hint table value data structures. These tables contain the computed values on which
  1293 + // the hint table values are based. They exclude things like number of bits and store actual
  1294 + // values instead of mins and deltas. File offsets are also absolute rather than being offset
  1295 + // by the size of the primary hint table. We populate the hint table structures from these
  1296 + // during writing and compare the hint table values with these during validation. We ignore
  1297 + // some values for various reasons described in the code. Those values are omitted from these
  1298 + // structures. Note also that object numbers are object numbers from the input file, not the
  1299 + // output file.
  1300 +
  1301 + // Naming convention: CHSomething is analogous to HSomething above. "CH" is computed hint.
  1302 +
  1303 + struct CHPageOffsetEntry
  1304 + {
  1305 + int nobjects{0};
  1306 + int nshared_objects{0};
  1307 + // vectors' sizes = nshared_objects
  1308 + std::vector<int> shared_identifiers;
  1309 + };
  1310 +
  1311 + struct CHPageOffset
  1312 + {
  1313 + // vector size is npages
  1314 + std::vector<CHPageOffsetEntry> entries;
  1315 + };
  1316 +
  1317 + struct CHSharedObjectEntry
  1318 + {
  1319 + CHSharedObjectEntry(int object) :
  1320 + object(object)
  1321 + {
  1322 + }
  1323 +
  1324 + int object;
  1325 + };
  1326 +
  1327 + // PDF 1.4: Table F.5
  1328 + struct CHSharedObject
  1329 + {
  1330 + int first_shared_obj{0};
  1331 + int nshared_first_page{0};
  1332 + int nshared_total{0};
  1333 + // vector size is nshared_total
  1334 + std::vector<CHSharedObjectEntry> entries;
  1335 + };
  1336 +
  1337 + // No need for CHGeneric -- HGeneric is fine as is.
  1338 +
  1339 + // Data structures to support optimization -- implemented in QPDF_optimization.cc
  1340 +
  1341 + class ObjUser
  1342 + {
  1343 + public:
  1344 + enum user_e { ou_bad, ou_page, ou_thumb, ou_trailer_key, ou_root_key, ou_root };
  1345 +
  1346 + // type is set to ou_bad
  1347 + ObjUser();
  1348 +
  1349 + // type must be ou_root
  1350 + ObjUser(user_e type);
  1351 +
  1352 + // type must be one of ou_page or ou_thumb
  1353 + ObjUser(user_e type, int pageno);
  1354 +
  1355 + // type must be one of ou_trailer_key or ou_root_key
  1356 + ObjUser(user_e type, std::string const& key);
  1357 +
  1358 + bool operator<(ObjUser const&) const;
  1359 +
  1360 + user_e ou_type;
  1361 + int pageno; // if ou_page;
  1362 + std::string key; // if ou_trailer_key or ou_root_key
  1363 + };
  1364 +
  1365 + struct UpdateObjectMapsFrame
  1366 + {
  1367 + UpdateObjectMapsFrame(ObjUser const& ou, QPDFObjectHandle oh, bool top);
  1368 +
  1369 + ObjUser const& ou;
  1370 + QPDFObjectHandle oh;
  1371 + bool top;
  1372 + };
  1373 +
  1374 + class PatternFinder: public InputSource::Finder
  1375 + {
  1376 + public:
  1377 + PatternFinder(QPDF& qpdf, bool (QPDF::*checker)()) :
  1378 + qpdf(qpdf),
  1379 + checker(checker)
  1380 + {
  1381 + }
  1382 + ~PatternFinder() override = default;
  1383 + bool
  1384 + check() override
  1385 + {
  1386 + return (this->qpdf.*checker)();
  1387 + }
  1388 +
  1389 + private:
  1390 + QPDF& qpdf;
  1391 + bool (QPDF::*checker)();
  1392 + };
884 1393
885 // Methods to support pattern finding 1394 // Methods to support pattern finding
886 static bool validatePDFVersion(char const*&, std::string& version); 1395 static bool validatePDFVersion(char const*&, std::string& version);
@@ -902,7 +1411,6 @@ class QPDF @@ -902,7 +1411,6 @@ class QPDF
902 QPDFObjectHandle 1411 QPDFObjectHandle
903 getUncompressedObject(QPDFObjectHandle&, std::map<int, int> const& object_stream_data); 1412 getUncompressedObject(QPDFObjectHandle&, std::map<int, int> const& object_stream_data);
904 QPDFObjectHandle getUncompressedObject(QPDFObjectHandle&, QPDFWriter::ObjTable const& obj); 1413 QPDFObjectHandle getUncompressedObject(QPDFObjectHandle&, QPDFWriter::ObjTable const& obj);
905 - QPDFObjectHandle getUncompressedObject(QPDFObjectHandle&, Objects const& obj);  
906 int lengthNextN(int first_object, int n); 1414 int lengthNextN(int first_object, int n);
907 void 1415 void
908 checkHPageOffset(std::vector<QPDFObjectHandle> const& pages, std::map<int, int>& idx_to_obj); 1416 checkHPageOffset(std::vector<QPDFObjectHandle> const& pages, std::map<int, int>& idx_to_obj);
@@ -948,7 +1456,6 @@ class QPDF @@ -948,7 +1456,6 @@ class QPDF
948 std::function<int(QPDFObjectHandle&)> skip_stream_parameters); 1456 std::function<int(QPDFObjectHandle&)> skip_stream_parameters);
949 void filterCompressedObjects(std::map<int, int> const& object_stream_data); 1457 void filterCompressedObjects(std::map<int, int> const& object_stream_data);
950 void filterCompressedObjects(QPDFWriter::ObjTable const& object_stream_data); 1458 void filterCompressedObjects(QPDFWriter::ObjTable const& object_stream_data);
951 - void filterCompressedObjects(Objects const& object_stream_data);  
952 1459
953 // JSON import 1460 // JSON import
954 void importJSON(std::shared_ptr<InputSource>, bool must_be_complete); 1461 void importJSON(std::shared_ptr<InputSource>, bool must_be_complete);
@@ -979,7 +1486,90 @@ class QPDF @@ -979,7 +1486,90 @@ class QPDF
979 return QIntC::to_ulonglong(i); 1486 return QIntC::to_ulonglong(i);
980 } 1487 }
981 1488
982 - class Members; 1489 + class Members
  1490 + {
  1491 + friend class QPDF;
  1492 + friend class ResolveRecorder;
  1493 +
  1494 + public:
  1495 + QPDF_DLL
  1496 + ~Members() = default;
  1497 +
  1498 + private:
  1499 + Members();
  1500 + Members(Members const&) = delete;
  1501 +
  1502 + std::shared_ptr<QPDFLogger> log;
  1503 + unsigned long long unique_id{0};
  1504 + QPDFTokenizer tokenizer;
  1505 + std::shared_ptr<InputSource> file;
  1506 + std::string last_object_description;
  1507 + bool provided_password_is_hex_key{false};
  1508 + bool ignore_xref_streams{false};
  1509 + bool suppress_warnings{false};
  1510 + size_t max_warnings{0};
  1511 + bool attempt_recovery{true};
  1512 + bool check_mode{false};
  1513 + std::shared_ptr<EncryptionParameters> encp;
  1514 + std::string pdf_version;
  1515 + std::map<QPDFObjGen, QPDFXRefEntry> xref_table;
  1516 + // Various tables are indexed by object id, with potential size id + 1
  1517 + int xref_table_max_id{std::numeric_limits<int>::max() - 1};
  1518 + qpdf_offset_t xref_table_max_offset{0};
  1519 + std::set<int> deleted_objects;
  1520 + std::map<QPDFObjGen, ObjCache> obj_cache;
  1521 + std::set<QPDFObjGen> resolving;
  1522 + QPDFObjectHandle trailer;
  1523 + std::vector<QPDFObjectHandle> all_pages;
  1524 + bool invalid_page_found{false};
  1525 + std::map<QPDFObjGen, int> pageobj_to_pages_pos;
  1526 + bool pushed_inherited_attributes_to_pages{false};
  1527 + bool ever_pushed_inherited_attributes_to_pages{false};
  1528 + bool ever_called_get_all_pages{false};
  1529 + std::vector<QPDFExc> warnings;
  1530 + std::map<unsigned long long, ObjCopier> object_copiers;
  1531 + std::shared_ptr<QPDFObjectHandle::StreamDataProvider> copied_streams;
  1532 + // copied_stream_data_provider is owned by copied_streams
  1533 + CopiedStreamDataProvider* copied_stream_data_provider{nullptr};
  1534 + bool reconstructed_xref{false};
  1535 + bool fixed_dangling_refs{false};
  1536 + bool immediate_copy_from{false};
  1537 + bool in_parse{false};
  1538 + bool parsed{false};
  1539 + std::set<int> resolved_object_streams;
  1540 +
  1541 + // Linearization data
  1542 + qpdf_offset_t first_xref_item_offset{0}; // actual value from file
  1543 + bool uncompressed_after_compressed{false};
  1544 + bool linearization_warnings{false};
  1545 +
  1546 + // Linearization parameter dictionary and hint table data: may be read from file or computed
  1547 + // prior to writing a linearized file
  1548 + QPDFObjectHandle lindict;
  1549 + LinParameters linp;
  1550 + HPageOffset page_offset_hints;
  1551 + HSharedObject shared_object_hints;
  1552 + HGeneric outline_hints;
  1553 +
  1554 + // Computed linearization data: used to populate above tables during writing and to compare
  1555 + // with them during validation. c_ means computed.
  1556 + LinParameters c_linp;
  1557 + CHPageOffset c_page_offset_data;
  1558 + CHSharedObject c_shared_object_data;
  1559 + HGeneric c_outline_data;
  1560 +
  1561 + // Object ordering data for linearized files: initialized by calculateLinearizationData().
  1562 + // Part numbers refer to the PDF 1.4 specification.
  1563 + std::vector<QPDFObjectHandle> part4;
  1564 + std::vector<QPDFObjectHandle> part6;
  1565 + std::vector<QPDFObjectHandle> part7;
  1566 + std::vector<QPDFObjectHandle> part8;
  1567 + std::vector<QPDFObjectHandle> part9;
  1568 +
  1569 + // Optimization data
  1570 + std::map<ObjUser, std::set<QPDFObjGen>> obj_user_to_objects;
  1571 + std::map<QPDFObjGen, std::set<ObjUser>> object_to_obj_users;
  1572 + };
983 1573
984 // Keep all member variables inside the Members object, which we dynamically allocate. This 1574 // Keep all member variables inside the Members object, which we dynamically allocate. This
985 // makes it possible to add new private members without breaking binary compatibility. 1575 // makes it possible to add new private members without breaking binary compatibility.
libqpdf/CMakeLists.txt
@@ -107,7 +107,6 @@ set(libqpdf_SOURCES @@ -107,7 +107,6 @@ set(libqpdf_SOURCES
107 QPDF_encryption.cc 107 QPDF_encryption.cc
108 QPDF_json.cc 108 QPDF_json.cc
109 QPDF_linearization.cc 109 QPDF_linearization.cc
110 - QPDF_objects.cc  
111 QPDF_optimization.cc 110 QPDF_optimization.cc
112 QPDF_pages.cc 111 QPDF_pages.cc
113 QTC.cc 112 QTC.cc
libqpdf/QPDF.cc
1 #include <qpdf/qpdf-config.h> // include first for large file support 1 #include <qpdf/qpdf-config.h> // include first for large file support
2 2
3 -#include <qpdf/QPDF_private.hh> 3 +#include <qpdf/QPDF.hh>
4 4
  5 +#include <array>
5 #include <atomic> 6 #include <atomic>
6 #include <cstring> 7 #include <cstring>
  8 +#include <limits>
7 #include <map> 9 #include <map>
8 #include <regex> 10 #include <regex>
9 #include <sstream> 11 #include <sstream>
@@ -30,51 +32,67 @@ @@ -30,51 +32,67 @@
30 // being static as well. 32 // being static as well.
31 std::string const QPDF::qpdf_version(QPDF_VERSION); 33 std::string const QPDF::qpdf_version(QPDF_VERSION);
32 34
  35 +static char const* EMPTY_PDF = (
  36 + // force line break
  37 + "%PDF-1.3\n"
  38 + "1 0 obj\n"
  39 + "<< /Type /Catalog /Pages 2 0 R >>\n"
  40 + "endobj\n"
  41 + "2 0 obj\n"
  42 + "<< /Type /Pages /Kids [] /Count 0 >>\n"
  43 + "endobj\n"
  44 + "xref\n"
  45 + "0 3\n"
  46 + "0000000000 65535 f \n"
  47 + "0000000009 00000 n \n"
  48 + "0000000058 00000 n \n"
  49 + "trailer << /Size 3 /Root 1 0 R >>\n"
  50 + "startxref\n"
  51 + "110\n"
  52 + "%%EOF\n");
  53 +
33 namespace 54 namespace
34 { 55 {
35 - class InvalidInputSource final: public InputSource 56 + class InvalidInputSource: public InputSource
36 { 57 {
37 public: 58 public:
38 - InvalidInputSource(std::string const& name) :  
39 - name(name)  
40 - {  
41 - }  
42 - ~InvalidInputSource() final = default; 59 + ~InvalidInputSource() override = default;
43 qpdf_offset_t 60 qpdf_offset_t
44 - findAndSkipNextEOL() final 61 + findAndSkipNextEOL() override
45 { 62 {
46 throwException(); 63 throwException();
47 return 0; 64 return 0;
48 } 65 }
49 std::string const& 66 std::string const&
50 - getName() const final 67 + getName() const override
51 { 68 {
  69 + static std::string name("closed input source");
52 return name; 70 return name;
53 } 71 }
54 qpdf_offset_t 72 qpdf_offset_t
55 - tell() final 73 + tell() override
56 { 74 {
57 throwException(); 75 throwException();
58 return 0; 76 return 0;
59 } 77 }
60 void 78 void
61 - seek(qpdf_offset_t offset, int whence) final 79 + seek(qpdf_offset_t offset, int whence) override
62 { 80 {
63 throwException(); 81 throwException();
64 } 82 }
65 void 83 void
66 - rewind() final 84 + rewind() override
67 { 85 {
68 throwException(); 86 throwException();
69 } 87 }
70 size_t 88 size_t
71 - read(char* buffer, size_t length) final 89 + read(char* buffer, size_t length) override
72 { 90 {
73 throwException(); 91 throwException();
74 return 0; 92 return 0;
75 } 93 }
76 void 94 void
77 - unreadCh(char ch) final 95 + unreadCh(char ch) override
78 { 96 {
79 throwException(); 97 throwException();
80 } 98 }
@@ -87,8 +105,6 @@ namespace @@ -87,8 +105,6 @@ namespace
87 "source. QPDF operations are invalid before processFile (or " 105 "source. QPDF operations are invalid before processFile (or "
88 "another process method) or after closeInputSource"); 106 "another process method) or after closeInputSource");
89 } 107 }
90 -  
91 - std::string const& name;  
92 }; 108 };
93 } // namespace 109 } // namespace
94 110
@@ -180,17 +196,15 @@ QPDF::EncryptionParameters::EncryptionParameters() : @@ -180,17 +196,15 @@ QPDF::EncryptionParameters::EncryptionParameters() :
180 { 196 {
181 } 197 }
182 198
183 -QPDF::Members::Members(QPDF& qpdf) : 199 +QPDF::Members::Members() :
184 log(QPDFLogger::defaultLogger()), 200 log(QPDFLogger::defaultLogger()),
185 - file_sp(new InvalidInputSource(no_input_name)),  
186 - file(file_sp.get()),  
187 - encp(new EncryptionParameters),  
188 - objects(qpdf, this, file) 201 + file(new InvalidInputSource()),
  202 + encp(new EncryptionParameters)
189 { 203 {
190 } 204 }
191 205
192 QPDF::QPDF() : 206 QPDF::QPDF() :
193 - m(new Members(*this)) 207 + m(new Members())
194 { 208 {
195 m->tokenizer.allowEOF(); 209 m->tokenizer.allowEOF();
196 // Generate a unique ID. It just has to be unique among all QPDF objects allocated throughout 210 // Generate a unique ID. It just has to be unique among all QPDF objects allocated throughout
@@ -199,7 +213,28 @@ QPDF::QPDF() : @@ -199,7 +213,28 @@ QPDF::QPDF() :
199 m->unique_id = unique_id.fetch_add(1ULL); 213 m->unique_id = unique_id.fetch_add(1ULL);
200 } 214 }
201 215
202 -QPDF::~QPDF() = default; 216 +QPDF::~QPDF()
  217 +{
  218 + // If two objects are mutually referential (through each object having an array or dictionary
  219 + // that contains an indirect reference to the other), the circular references in the
  220 + // std::shared_ptr objects will prevent the objects from being deleted. Walk through all objects
  221 + // in the object cache, which is those objects that we read from the file, and break all
  222 + // resolved indirect references by replacing them with an internal object type representing that
  223 + // they have been destroyed. Note that we can't break references like this at any time when the
  224 + // QPDF object is active. The call to reset also causes all direct QPDFObjectHandle objects that
  225 + // are reachable from this object to release their association with this QPDF. Direct objects
  226 + // are not destroyed since they can be moved to other QPDF objects safely.
  227 +
  228 + // At this point, obviously no one is still using the QPDF object, but we'll explicitly clear
  229 + // the xref table anyway just to prevent any possibility of resolve() succeeding.
  230 + m->xref_table.clear();
  231 + for (auto const& iter: m->obj_cache) {
  232 + iter.second.object->disconnect();
  233 + if (iter.second.object->getTypeCode() != ::ot_null) {
  234 + iter.second.object->destroy();
  235 + }
  236 + }
  237 +}
203 238
204 std::shared_ptr<QPDF> 239 std::shared_ptr<QPDF>
205 QPDF::create() 240 QPDF::create()
@@ -236,17 +271,14 @@ QPDF::processMemoryFile( @@ -236,17 +271,14 @@ QPDF::processMemoryFile(
236 void 271 void
237 QPDF::processInputSource(std::shared_ptr<InputSource> source, char const* password) 272 QPDF::processInputSource(std::shared_ptr<InputSource> source, char const* password)
238 { 273 {
239 - m->file_sp = source;  
240 - m->file = source.get(); 274 + m->file = source;
241 parse(password); 275 parse(password);
242 } 276 }
243 277
244 void 278 void
245 QPDF::closeInputSource() 279 QPDF::closeInputSource()
246 { 280 {
247 - m->no_input_name = "closed input source";  
248 - m->file_sp = std::shared_ptr<InputSource>(new InvalidInputSource(m->no_input_name));  
249 - m->file = m->file_sp.get(); 281 + m->file = std::shared_ptr<InputSource>(new InvalidInputSource());
250 } 282 }
251 283
252 void 284 void
@@ -258,9 +290,7 @@ QPDF::setPasswordIsHexKey(bool val) @@ -258,9 +290,7 @@ QPDF::setPasswordIsHexKey(bool val)
258 void 290 void
259 QPDF::emptyPDF() 291 QPDF::emptyPDF()
260 { 292 {
261 - m->pdf_version = "1.3";  
262 - m->no_input_name = "empty PDF";  
263 - m->objects.xref_table().initialize_empty(); 293 + processMemoryFile("empty PDF", EMPTY_PDF, strlen(EMPTY_PDF));
264 } 294 }
265 295
266 void 296 void
@@ -273,7 +303,7 @@ QPDF::registerStreamFilter( @@ -273,7 +303,7 @@ QPDF::registerStreamFilter(
273 void 303 void
274 QPDF::setIgnoreXRefStreams(bool val) 304 QPDF::setIgnoreXRefStreams(bool val)
275 { 305 {
276 - m->objects.xref_table().ignore_streams(val); 306 + m->ignore_xref_streams = val;
277 } 307 }
278 308
279 std::shared_ptr<QPDFLogger> 309 std::shared_ptr<QPDFLogger>
@@ -311,7 +341,6 @@ void @@ -311,7 +341,6 @@ void
311 QPDF::setAttemptRecovery(bool val) 341 QPDF::setAttemptRecovery(bool val)
312 { 342 {
313 m->attempt_recovery = val; 343 m->attempt_recovery = val;
314 - m->objects.xref_table().attempt_recovery(val);  
315 } 344 }
316 345
317 void 346 void
@@ -381,14 +410,23 @@ QPDF::findHeader() @@ -381,14 +410,23 @@ QPDF::findHeader()
381 // PDF header, all explicit offsets in the file are such that 0 points to the beginning 410 // PDF header, all explicit offsets in the file are such that 0 points to the beginning
382 // of the header. 411 // of the header.
383 QTC::TC("qpdf", "QPDF global offset"); 412 QTC::TC("qpdf", "QPDF global offset");
384 - m->file_sp =  
385 - std::shared_ptr<InputSource>(new OffsetInputSource(m->file_sp, global_offset));  
386 - m->file = m->file_sp.get(); 413 + m->file = std::shared_ptr<InputSource>(new OffsetInputSource(m->file, global_offset));
387 } 414 }
388 } 415 }
389 return valid; 416 return valid;
390 } 417 }
391 418
  419 +bool
  420 +QPDF::findStartxref()
  421 +{
  422 + if (readToken(*m->file).isWord("startxref") && readToken(*m->file).isInteger()) {
  423 + // Position in front of offset token
  424 + m->file->seek(m->file->getLastOffset(), SEEK_SET);
  425 + return true;
  426 + }
  427 + return false;
  428 +}
  429 +
392 void 430 void
393 QPDF::parse(char const* password) 431 QPDF::parse(char const* password)
394 { 432 {
@@ -405,9 +443,47 @@ QPDF::parse(char const* password) @@ -405,9 +443,47 @@ QPDF::parse(char const* password)
405 m->pdf_version = "1.2"; 443 m->pdf_version = "1.2";
406 } 444 }
407 445
408 - m->objects.xref_table().initialize(); 446 + // PDF spec says %%EOF must be found within the last 1024 bytes of the file. We add an extra
  447 + // 30 characters to leave room for the startxref stuff.
  448 + m->file->seek(0, SEEK_END);
  449 + qpdf_offset_t end_offset = m->file->tell();
  450 + m->xref_table_max_offset = end_offset;
  451 + // Sanity check on object ids. All objects must appear in xref table / stream. In all realistic
  452 + // scenarios at least 3 bytes are required.
  453 + if (m->xref_table_max_id > m->xref_table_max_offset / 3) {
  454 + m->xref_table_max_id = static_cast<int>(m->xref_table_max_offset / 3);
  455 + }
  456 + qpdf_offset_t start_offset = (end_offset > 1054 ? end_offset - 1054 : 0);
  457 + PatternFinder sf(*this, &QPDF::findStartxref);
  458 + qpdf_offset_t xref_offset = 0;
  459 + if (m->file->findLast("startxref", start_offset, 0, sf)) {
  460 + xref_offset = QUtil::string_to_ll(readToken(*m->file).getValue().c_str());
  461 + }
  462 +
  463 + try {
  464 + if (xref_offset == 0) {
  465 + QTC::TC("qpdf", "QPDF can't find startxref");
  466 + throw damagedPDF("", 0, "can't find startxref");
  467 + }
  468 + try {
  469 + read_xref(xref_offset);
  470 + } catch (QPDFExc&) {
  471 + throw;
  472 + } catch (std::exception& e) {
  473 + throw damagedPDF("", 0, std::string("error reading xref: ") + e.what());
  474 + }
  475 + } catch (QPDFExc& e) {
  476 + if (m->attempt_recovery) {
  477 + reconstruct_xref(e);
  478 + QTC::TC("qpdf", "QPDF reconstructed xref table");
  479 + } else {
  480 + throw;
  481 + }
  482 + }
  483 +
409 initializeEncryption(); 484 initializeEncryption();
410 - if (m->objects.xref_table().size() > 0 && !getRoot().getKey("/Pages").isDictionary()) { 485 + m->parsed = true;
  486 + if (m->xref_table.size() > 0 && !getRoot().getKey("/Pages").isDictionary()) {
411 // QPDFs created from JSON have an empty xref table and no root object yet. 487 // QPDFs created from JSON have an empty xref table and no root object yet.
412 throw damagedPDF("", 0, "unable to find page tree"); 488 throw damagedPDF("", 0, "unable to find page tree");
413 } 489 }
@@ -448,9 +524,873 @@ QPDF::warn( @@ -448,9 +524,873 @@ QPDF::warn(
448 } 524 }
449 525
450 void 526 void
  527 +QPDF::setTrailer(QPDFObjectHandle obj)
  528 +{
  529 + if (m->trailer) {
  530 + return;
  531 + }
  532 + m->trailer = obj;
  533 +}
  534 +
  535 +void
  536 +QPDF::reconstruct_xref(QPDFExc& e)
  537 +{
  538 + if (m->reconstructed_xref) {
  539 + // Avoid xref reconstruction infinite loops. This is getting very hard to reproduce because
  540 + // qpdf is throwing many fewer exceptions while parsing. Most situations are warnings now.
  541 + throw e;
  542 + }
  543 +
  544 + // If recovery generates more than 1000 warnings, the file is so severely damaged that there
  545 + // probably is no point trying to continue.
  546 + const auto max_warnings = m->warnings.size() + 1000U;
  547 + auto check_warnings = [this, max_warnings]() {
  548 + if (m->warnings.size() > max_warnings) {
  549 + throw damagedPDF("", 0, "too many errors while reconstructing cross-reference table");
  550 + }
  551 + };
  552 +
  553 + m->reconstructed_xref = true;
  554 + // We may find more objects, which may contain dangling references.
  555 + m->fixed_dangling_refs = false;
  556 +
  557 + warn(damagedPDF("", 0, "file is damaged"));
  558 + warn(e);
  559 + warn(damagedPDF("", 0, "Attempting to reconstruct cross-reference table"));
  560 +
  561 + // Delete all references to type 1 (uncompressed) objects
  562 + std::set<QPDFObjGen> to_delete;
  563 + for (auto const& iter: m->xref_table) {
  564 + if (iter.second.getType() == 1) {
  565 + to_delete.insert(iter.first);
  566 + }
  567 + }
  568 + for (auto const& iter: to_delete) {
  569 + m->xref_table.erase(iter);
  570 + }
  571 +
  572 + m->file->seek(0, SEEK_END);
  573 + qpdf_offset_t eof = m->file->tell();
  574 + m->file->seek(0, SEEK_SET);
  575 + // Don't allow very long tokens here during recovery. All the interesting tokens are covered.
  576 + static size_t const MAX_LEN = 10;
  577 + while (m->file->tell() < eof) {
  578 + QPDFTokenizer::Token t1 = readToken(*m->file, MAX_LEN);
  579 + qpdf_offset_t token_start = m->file->tell() - toO(t1.getValue().length());
  580 + if (t1.isInteger()) {
  581 + auto pos = m->file->tell();
  582 + QPDFTokenizer::Token t2 = readToken(*m->file, MAX_LEN);
  583 + if ((t2.isInteger()) && (readToken(*m->file, MAX_LEN).isWord("obj"))) {
  584 + int obj = QUtil::string_to_int(t1.getValue().c_str());
  585 + int gen = QUtil::string_to_int(t2.getValue().c_str());
  586 + if (obj <= m->xref_table_max_id) {
  587 + insertReconstructedXrefEntry(obj, token_start, gen);
  588 + } else {
  589 + warn(damagedPDF(
  590 + "", 0, "ignoring object with impossibly large id " + std::to_string(obj)));
  591 + }
  592 + }
  593 + m->file->seek(pos, SEEK_SET);
  594 + } else if (!m->trailer && t1.isWord("trailer")) {
  595 + auto pos = m->file->tell();
  596 + QPDFObjectHandle t = readTrailer();
  597 + if (!t.isDictionary()) {
  598 + // Oh well. It was worth a try.
  599 + } else {
  600 + setTrailer(t);
  601 + }
  602 + m->file->seek(pos, SEEK_SET);
  603 + }
  604 + check_warnings();
  605 + m->file->findAndSkipNextEOL();
  606 + }
  607 + m->deleted_objects.clear();
  608 +
  609 + if (!m->trailer) {
  610 + qpdf_offset_t max_offset{0};
  611 + // If there are any xref streams, take the last one to appear.
  612 + for (auto const& iter: m->xref_table) {
  613 + auto entry = iter.second;
  614 + if (entry.getType() != 1) {
  615 + continue;
  616 + }
  617 + auto oh = getObjectByObjGen(iter.first);
  618 + try {
  619 + if (!oh.isStreamOfType("/XRef")) {
  620 + continue;
  621 + }
  622 + } catch (std::exception&) {
  623 + continue;
  624 + }
  625 + auto offset = entry.getOffset();
  626 + if (offset > max_offset) {
  627 + max_offset = offset;
  628 + setTrailer(oh.getDict());
  629 + }
  630 + check_warnings();
  631 + }
  632 + if (max_offset > 0) {
  633 + try {
  634 + read_xref(max_offset);
  635 + } catch (std::exception&) {
  636 + throw damagedPDF(
  637 + "", 0, "error decoding candidate xref stream while recovering damaged file");
  638 + }
  639 + QTC::TC("qpdf", "QPDF recover xref stream");
  640 + }
  641 + }
  642 +
  643 + if (!m->trailer) {
  644 + // We could check the last encountered object to see if it was an xref stream. If so, we
  645 + // could try to get the trailer from there. This may make it possible to recover files with
  646 + // bad startxref pointers even when they have object streams.
  647 +
  648 + throw damagedPDF("", 0, "unable to find trailer dictionary while recovering damaged file");
  649 + }
  650 + if (m->xref_table.empty()) {
  651 + // We cannot check for an empty xref table in parse because empty tables are valid when
  652 + // creating QPDF objects from JSON.
  653 + throw damagedPDF("", 0, "unable to find objects while recovering damaged file");
  654 + }
  655 + check_warnings();
  656 + if (!m->parsed) {
  657 + m->parsed = true;
  658 + getAllPages();
  659 + check_warnings();
  660 + if (m->all_pages.empty()) {
  661 + m->parsed = false;
  662 + throw damagedPDF("", 0, "unable to find any pages while recovering damaged file");
  663 + }
  664 + }
  665 + // We could iterate through the objects looking for streams and try to find objects inside of
  666 + // them, but it's probably not worth the trouble. Acrobat can't recover files with any errors
  667 + // in an xref stream, and this would be a real long shot anyway. If we wanted to do anything
  668 + // that involved looking at stream contents, we'd also have to call initializeEncryption() here.
  669 + // It's safe to call it more than once.
  670 +}
  671 +
  672 +void
  673 +QPDF::read_xref(qpdf_offset_t xref_offset)
  674 +{
  675 + std::map<int, int> free_table;
  676 + std::set<qpdf_offset_t> visited;
  677 + while (xref_offset) {
  678 + visited.insert(xref_offset);
  679 + char buf[7];
  680 + memset(buf, 0, sizeof(buf));
  681 + m->file->seek(xref_offset, SEEK_SET);
  682 + // Some files miss the mark a little with startxref. We could do a better job of searching
  683 + // in the neighborhood for something that looks like either an xref table or stream, but the
  684 + // simple heuristic of skipping whitespace can help with the xref table case and is harmless
  685 + // with the stream case.
  686 + bool done = false;
  687 + bool skipped_space = false;
  688 + while (!done) {
  689 + char ch;
  690 + if (1 == m->file->read(&ch, 1)) {
  691 + if (QUtil::is_space(ch)) {
  692 + skipped_space = true;
  693 + } else {
  694 + m->file->unreadCh(ch);
  695 + done = true;
  696 + }
  697 + } else {
  698 + QTC::TC("qpdf", "QPDF eof skipping spaces before xref", skipped_space ? 0 : 1);
  699 + done = true;
  700 + }
  701 + }
  702 +
  703 + m->file->read(buf, sizeof(buf) - 1);
  704 + // The PDF spec says xref must be followed by a line terminator, but files exist in the wild
  705 + // where it is terminated by arbitrary whitespace.
  706 + if ((strncmp(buf, "xref", 4) == 0) && QUtil::is_space(buf[4])) {
  707 + if (skipped_space) {
  708 + QTC::TC("qpdf", "QPDF xref skipped space");
  709 + warn(damagedPDF("", 0, "extraneous whitespace seen before xref"));
  710 + }
  711 + QTC::TC(
  712 + "qpdf",
  713 + "QPDF xref space",
  714 + ((buf[4] == '\n') ? 0
  715 + : (buf[4] == '\r') ? 1
  716 + : (buf[4] == ' ') ? 2
  717 + : 9999));
  718 + int skip = 4;
  719 + // buf is null-terminated, and QUtil::is_space('\0') is false, so this won't overrun.
  720 + while (QUtil::is_space(buf[skip])) {
  721 + ++skip;
  722 + }
  723 + xref_offset = read_xrefTable(xref_offset + skip);
  724 + } else {
  725 + xref_offset = read_xrefStream(xref_offset);
  726 + }
  727 + if (visited.count(xref_offset) != 0) {
  728 + QTC::TC("qpdf", "QPDF xref loop");
  729 + throw damagedPDF("", 0, "loop detected following xref tables");
  730 + }
  731 + }
  732 +
  733 + if (!m->trailer) {
  734 + throw damagedPDF("", 0, "unable to find trailer while reading xref");
  735 + }
  736 + int size = m->trailer.getKey("/Size").getIntValueAsInt();
  737 + int max_obj = 0;
  738 + if (!m->xref_table.empty()) {
  739 + max_obj = m->xref_table.rbegin()->first.getObj();
  740 + }
  741 + if (!m->deleted_objects.empty()) {
  742 + max_obj = std::max(max_obj, *(m->deleted_objects.rbegin()));
  743 + }
  744 + if ((size < 1) || (size - 1 != max_obj)) {
  745 + QTC::TC("qpdf", "QPDF xref size mismatch");
  746 + warn(damagedPDF(
  747 + "",
  748 + 0,
  749 + ("reported number of objects (" + std::to_string(size) +
  750 + ") is not one plus the highest object number (" + std::to_string(max_obj) + ")")));
  751 + }
  752 +
  753 + // We no longer need the deleted_objects table, so go ahead and clear it out to make sure we
  754 + // never depend on its being set.
  755 + m->deleted_objects.clear();
  756 +
  757 + // Make sure we keep only the highest generation for any object.
  758 + QPDFObjGen last_og{-1, 0};
  759 + for (auto const& item: m->xref_table) {
  760 + auto id = item.first.getObj();
  761 + if (id == last_og.getObj() && id > 0) {
  762 + removeObject(last_og);
  763 + }
  764 + last_og = item.first;
  765 + }
  766 +}
  767 +
  768 +bool
  769 +QPDF::parse_xrefFirst(std::string const& line, int& obj, int& num, int& bytes)
  770 +{
  771 + // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated
  772 + // buffer.
  773 + char const* p = line.c_str();
  774 + char const* start = line.c_str();
  775 +
  776 + // Skip zero or more spaces
  777 + while (QUtil::is_space(*p)) {
  778 + ++p;
  779 + }
  780 + // Require digit
  781 + if (!QUtil::is_digit(*p)) {
  782 + return false;
  783 + }
  784 + // Gather digits
  785 + std::string obj_str;
  786 + while (QUtil::is_digit(*p)) {
  787 + obj_str.append(1, *p++);
  788 + }
  789 + // Require space
  790 + if (!QUtil::is_space(*p)) {
  791 + return false;
  792 + }
  793 + // Skip spaces
  794 + while (QUtil::is_space(*p)) {
  795 + ++p;
  796 + }
  797 + // Require digit
  798 + if (!QUtil::is_digit(*p)) {
  799 + return false;
  800 + }
  801 + // Gather digits
  802 + std::string num_str;
  803 + while (QUtil::is_digit(*p)) {
  804 + num_str.append(1, *p++);
  805 + }
  806 + // Skip any space including line terminators
  807 + while (QUtil::is_space(*p)) {
  808 + ++p;
  809 + }
  810 + bytes = toI(p - start);
  811 + obj = QUtil::string_to_int(obj_str.c_str());
  812 + num = QUtil::string_to_int(num_str.c_str());
  813 + return true;
  814 +}
  815 +
  816 +bool
  817 +QPDF::read_bad_xrefEntry(qpdf_offset_t& f1, int& f2, char& type)
  818 +{
  819 + // Reposition after initial read attempt and reread.
  820 + m->file->seek(m->file->getLastOffset(), SEEK_SET);
  821 + auto line = m->file->readLine(30);
  822 +
  823 + // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated
  824 + // buffer.
  825 + char const* p = line.data();
  826 +
  827 + // Skip zero or more spaces. There aren't supposed to be any.
  828 + bool invalid = false;
  829 + while (QUtil::is_space(*p)) {
  830 + ++p;
  831 + QTC::TC("qpdf", "QPDF ignore first space in xref entry");
  832 + invalid = true;
  833 + }
  834 + // Require digit
  835 + if (!QUtil::is_digit(*p)) {
  836 + return false;
  837 + }
  838 + // Gather digits
  839 + std::string f1_str;
  840 + while (QUtil::is_digit(*p)) {
  841 + f1_str.append(1, *p++);
  842 + }
  843 + // Require space
  844 + if (!QUtil::is_space(*p)) {
  845 + return false;
  846 + }
  847 + if (QUtil::is_space(*(p + 1))) {
  848 + QTC::TC("qpdf", "QPDF ignore first extra space in xref entry");
  849 + invalid = true;
  850 + }
  851 + // Skip spaces
  852 + while (QUtil::is_space(*p)) {
  853 + ++p;
  854 + }
  855 + // Require digit
  856 + if (!QUtil::is_digit(*p)) {
  857 + return false;
  858 + }
  859 + // Gather digits
  860 + std::string f2_str;
  861 + while (QUtil::is_digit(*p)) {
  862 + f2_str.append(1, *p++);
  863 + }
  864 + // Require space
  865 + if (!QUtil::is_space(*p)) {
  866 + return false;
  867 + }
  868 + if (QUtil::is_space(*(p + 1))) {
  869 + QTC::TC("qpdf", "QPDF ignore second extra space in xref entry");
  870 + invalid = true;
  871 + }
  872 + // Skip spaces
  873 + while (QUtil::is_space(*p)) {
  874 + ++p;
  875 + }
  876 + if ((*p == 'f') || (*p == 'n')) {
  877 + type = *p;
  878 + } else {
  879 + return false;
  880 + }
  881 + if ((f1_str.length() != 10) || (f2_str.length() != 5)) {
  882 + QTC::TC("qpdf", "QPDF ignore length error xref entry");
  883 + invalid = true;
  884 + }
  885 +
  886 + if (invalid) {
  887 + warn(damagedPDF("xref table", "accepting invalid xref table entry"));
  888 + }
  889 +
  890 + f1 = QUtil::string_to_ll(f1_str.c_str());
  891 + f2 = QUtil::string_to_int(f2_str.c_str());
  892 +
  893 + return true;
  894 +}
  895 +
  896 +// Optimistically read and parse xref entry. If entry is bad, call read_bad_xrefEntry and return
  897 +// result.
  898 +bool
  899 +QPDF::read_xrefEntry(qpdf_offset_t& f1, int& f2, char& type)
  900 +{
  901 + std::array<char, 21> line;
  902 + if (m->file->read(line.data(), 20) != 20) {
  903 + // C++20: [[unlikely]]
  904 + return false;
  905 + }
  906 + line[20] = '\0';
  907 + char const* p = line.data();
  908 +
  909 + int f1_len = 0;
  910 + int f2_len = 0;
  911 +
  912 + // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated
  913 + // buffer.
  914 +
  915 + // Gather f1 digits. NB No risk of overflow as 9'999'999'999 < max long long.
  916 + while (*p == '0') {
  917 + ++f1_len;
  918 + ++p;
  919 + }
  920 + while (QUtil::is_digit(*p) && f1_len++ < 10) {
  921 + f1 *= 10;
  922 + f1 += *p++ - '0';
  923 + }
  924 + // Require space
  925 + if (!QUtil::is_space(*p++)) {
  926 + // Entry doesn't start with space or digit.
  927 + // C++20: [[unlikely]]
  928 + return false;
  929 + }
  930 + // Gather digits. NB No risk of overflow as 99'999 < max int.
  931 + while (*p == '0') {
  932 + ++f2_len;
  933 + ++p;
  934 + }
  935 + while (QUtil::is_digit(*p) && f2_len++ < 5) {
  936 + f2 *= 10;
  937 + f2 += static_cast<int>(*p++ - '0');
  938 + }
  939 + if (QUtil::is_space(*p++) && (*p == 'f' || *p == 'n')) {
  940 + // C++20: [[likely]]
  941 + type = *p;
  942 + // No test for valid line[19].
  943 + if (*(++p) && *(++p) && (*p == '\n' || *p == '\r') && f1_len == 10 && f2_len == 5) {
  944 + // C++20: [[likely]]
  945 + return true;
  946 + }
  947 + }
  948 + return read_bad_xrefEntry(f1, f2, type);
  949 +}
  950 +
  951 +// Read a single cross-reference table section and associated trailer.
  952 +qpdf_offset_t
  953 +QPDF::read_xrefTable(qpdf_offset_t xref_offset)
  954 +{
  955 + m->file->seek(xref_offset, SEEK_SET);
  956 + std::string line;
  957 + while (true) {
  958 + line.assign(50, '\0');
  959 + m->file->read(line.data(), line.size());
  960 + int obj = 0;
  961 + int num = 0;
  962 + int bytes = 0;
  963 + if (!parse_xrefFirst(line, obj, num, bytes)) {
  964 + QTC::TC("qpdf", "QPDF invalid xref");
  965 + throw damagedPDF("xref table", "xref syntax invalid");
  966 + }
  967 + m->file->seek(m->file->getLastOffset() + bytes, SEEK_SET);
  968 + for (qpdf_offset_t i = obj; i - num < obj; ++i) {
  969 + if (i == 0) {
  970 + // This is needed by checkLinearization()
  971 + m->first_xref_item_offset = m->file->tell();
  972 + }
  973 + // For xref_table, these will always be small enough to be ints
  974 + qpdf_offset_t f1 = 0;
  975 + int f2 = 0;
  976 + char type = '\0';
  977 + if (!read_xrefEntry(f1, f2, type)) {
  978 + QTC::TC("qpdf", "QPDF invalid xref entry");
  979 + throw damagedPDF(
  980 + "xref table", "invalid xref entry (obj=" + std::to_string(i) + ")");
  981 + }
  982 + if (type == 'f') {
  983 + insertFreeXrefEntry(QPDFObjGen(toI(i), f2));
  984 + } else {
  985 + insertXrefEntry(toI(i), 1, f1, f2);
  986 + }
  987 + }
  988 + qpdf_offset_t pos = m->file->tell();
  989 + if (readToken(*m->file).isWord("trailer")) {
  990 + break;
  991 + } else {
  992 + m->file->seek(pos, SEEK_SET);
  993 + }
  994 + }
  995 +
  996 + // Set offset to previous xref table if any
  997 + QPDFObjectHandle cur_trailer = readTrailer();
  998 + if (!cur_trailer.isDictionary()) {
  999 + QTC::TC("qpdf", "QPDF missing trailer");
  1000 + throw damagedPDF("", "expected trailer dictionary");
  1001 + }
  1002 +
  1003 + if (!m->trailer) {
  1004 + setTrailer(cur_trailer);
  1005 +
  1006 + if (!m->trailer.hasKey("/Size")) {
  1007 + QTC::TC("qpdf", "QPDF trailer lacks size");
  1008 + throw damagedPDF("trailer", "trailer dictionary lacks /Size key");
  1009 + }
  1010 + if (!m->trailer.getKey("/Size").isInteger()) {
  1011 + QTC::TC("qpdf", "QPDF trailer size not integer");
  1012 + throw damagedPDF("trailer", "/Size key in trailer dictionary is not an integer");
  1013 + }
  1014 + }
  1015 +
  1016 + if (cur_trailer.hasKey("/XRefStm")) {
  1017 + if (m->ignore_xref_streams) {
  1018 + QTC::TC("qpdf", "QPDF ignoring XRefStm in trailer");
  1019 + } else {
  1020 + if (cur_trailer.getKey("/XRefStm").isInteger()) {
  1021 + // Read the xref stream but disregard any return value -- we'll use our trailer's
  1022 + // /Prev key instead of the xref stream's.
  1023 + (void)read_xrefStream(cur_trailer.getKey("/XRefStm").getIntValue());
  1024 + } else {
  1025 + throw damagedPDF("xref stream", xref_offset, "invalid /XRefStm");
  1026 + }
  1027 + }
  1028 + }
  1029 +
  1030 + if (cur_trailer.hasKey("/Prev")) {
  1031 + if (!cur_trailer.getKey("/Prev").isInteger()) {
  1032 + QTC::TC("qpdf", "QPDF trailer prev not integer");
  1033 + throw damagedPDF("trailer", "/Prev key in trailer dictionary is not an integer");
  1034 + }
  1035 + QTC::TC("qpdf", "QPDF prev key in trailer dictionary");
  1036 + return cur_trailer.getKey("/Prev").getIntValue();
  1037 + }
  1038 +
  1039 + return 0;
  1040 +}
  1041 +
  1042 +// Read a single cross-reference stream.
  1043 +qpdf_offset_t
  1044 +QPDF::read_xrefStream(qpdf_offset_t xref_offset)
  1045 +{
  1046 + if (!m->ignore_xref_streams) {
  1047 + QPDFObjGen x_og;
  1048 + QPDFObjectHandle xref_obj;
  1049 + try {
  1050 + xref_obj =
  1051 + readObjectAtOffset(false, xref_offset, "xref stream", QPDFObjGen(0, 0), x_og, true);
  1052 + } catch (QPDFExc&) {
  1053 + // ignore -- report error below
  1054 + }
  1055 + if (xref_obj.isStreamOfType("/XRef")) {
  1056 + QTC::TC("qpdf", "QPDF found xref stream");
  1057 + return processXRefStream(xref_offset, xref_obj);
  1058 + }
  1059 + }
  1060 +
  1061 + QTC::TC("qpdf", "QPDF can't find xref");
  1062 + throw damagedPDF("", xref_offset, "xref not found");
  1063 + return 0; // unreachable
  1064 +}
  1065 +
  1066 +// Return the entry size of the xref stream and the processed W array.
  1067 +std::pair<int, std::array<int, 3>>
  1068 +QPDF::processXRefW(QPDFObjectHandle& dict, std::function<QPDFExc(std::string_view)> damaged)
  1069 +{
  1070 + auto W_obj = dict.getKey("/W");
  1071 + if (!(W_obj.isArray() && (W_obj.getArrayNItems() >= 3) && W_obj.getArrayItem(0).isInteger() &&
  1072 + W_obj.getArrayItem(1).isInteger() && W_obj.getArrayItem(2).isInteger())) {
  1073 + throw damaged("Cross-reference stream does not have a proper /W key");
  1074 + }
  1075 +
  1076 + std::array<int, 3> W;
  1077 + int entry_size = 0;
  1078 + auto w_vector = W_obj.getArrayAsVector();
  1079 + int max_bytes = sizeof(qpdf_offset_t);
  1080 + for (size_t i = 0; i < 3; ++i) {
  1081 + W[i] = w_vector[i].getIntValueAsInt();
  1082 + if (W[i] > max_bytes) {
  1083 + throw damaged("Cross-reference stream's /W contains impossibly large values");
  1084 + }
  1085 + if (W[i] < 0) {
  1086 + throw damaged("Cross-reference stream's /W contains negative values");
  1087 + }
  1088 + entry_size += W[i];
  1089 + }
  1090 + if (entry_size == 0) {
  1091 + throw damaged("Cross-reference stream's /W indicates entry size of 0");
  1092 + }
  1093 + return {entry_size, W};
  1094 +}
  1095 +
  1096 +// Validate Size key and return the maximum number of entries that the xref stream can contain.
  1097 +int
  1098 +QPDF::processXRefSize(
  1099 + QPDFObjectHandle& dict, int entry_size, std::function<QPDFExc(std::string_view)> damaged)
  1100 +{
  1101 + // Number of entries is limited by the highest possible object id and stream size.
  1102 + auto max_num_entries = std::numeric_limits<int>::max();
  1103 + if (max_num_entries > (std::numeric_limits<qpdf_offset_t>::max() / entry_size)) {
  1104 + max_num_entries = toI(std::numeric_limits<qpdf_offset_t>::max() / entry_size);
  1105 + }
  1106 +
  1107 + auto Size_obj = dict.getKey("/Size");
  1108 + long long size;
  1109 + if (!dict.getKey("/Size").getValueAsInt(size)) {
  1110 + throw damaged("Cross-reference stream does not have a proper /Size key");
  1111 + } else if (size < 0) {
  1112 + throw damaged("Cross-reference stream has a negative /Size key");
  1113 + } else if (size >= max_num_entries) {
  1114 + throw damaged("Cross-reference stream has an impossibly large /Size key");
  1115 + }
  1116 + // We are not validating that Size <= (Size key of parent xref / trailer).
  1117 + return max_num_entries;
  1118 +}
  1119 +
  1120 +// Return the number of entries of the xref stream and the processed Index array.
  1121 +std::pair<int, std::vector<std::pair<int, int>>>
  1122 +QPDF::processXRefIndex(
  1123 + QPDFObjectHandle& dict, int max_num_entries, std::function<QPDFExc(std::string_view)> damaged)
  1124 +{
  1125 + auto size = dict.getKey("/Size").getIntValueAsInt();
  1126 + auto Index_obj = dict.getKey("/Index");
  1127 +
  1128 + if (Index_obj.isArray()) {
  1129 + std::vector<std::pair<int, int>> indx;
  1130 + int num_entries = 0;
  1131 + auto index_vec = Index_obj.getArrayAsVector();
  1132 + if ((index_vec.size() % 2) || index_vec.size() < 2) {
  1133 + throw damaged("Cross-reference stream's /Index has an invalid number of values");
  1134 + }
  1135 +
  1136 + int i = 0;
  1137 + long long first = 0;
  1138 + for (auto& val: index_vec) {
  1139 + if (val.isInteger()) {
  1140 + if (i % 2) {
  1141 + auto count = val.getIntValue();
  1142 + if (count <= 0) {
  1143 + throw damaged(
  1144 + "Cross-reference stream section claims to contain " +
  1145 + std::to_string(count) + " entries");
  1146 + }
  1147 + // We are guarding against the possibility of num_entries * entry_size
  1148 + // overflowing. We are not checking that entries are in ascending order as
  1149 + // required by the spec, which probably should generate a warning. We are also
  1150 + // not checking that for each subsection first object number + number of entries
  1151 + // <= /Size. The spec requires us to ignore object number > /Size.
  1152 + if (first > (max_num_entries - count) ||
  1153 + count > (max_num_entries - num_entries)) {
  1154 + throw damaged(
  1155 + "Cross-reference stream claims to contain too many entries: " +
  1156 + std::to_string(first) + " " + std::to_string(max_num_entries) + " " +
  1157 + std::to_string(num_entries));
  1158 + }
  1159 + indx.emplace_back(static_cast<int>(first), static_cast<int>(count));
  1160 + num_entries += static_cast<int>(count);
  1161 + } else {
  1162 + first = val.getIntValue();
  1163 + if (first < 0) {
  1164 + throw damaged(
  1165 + "Cross-reference stream's /Index contains a negative object id");
  1166 + } else if (first > max_num_entries) {
  1167 + throw damaged("Cross-reference stream's /Index contains an impossibly "
  1168 + "large object id");
  1169 + }
  1170 + }
  1171 + } else {
  1172 + throw damaged(
  1173 + "Cross-reference stream's /Index's item " + std::to_string(i) +
  1174 + " is not an integer");
  1175 + }
  1176 + i++;
  1177 + }
  1178 + QTC::TC("qpdf", "QPDF xref /Index is array", index_vec.size() == 2 ? 0 : 1);
  1179 + return {num_entries, indx};
  1180 + } else if (Index_obj.isNull()) {
  1181 + QTC::TC("qpdf", "QPDF xref /Index is null");
  1182 + return {size, {{0, size}}};
  1183 + } else {
  1184 + throw damaged("Cross-reference stream does not have a proper /Index key");
  1185 + }
  1186 +}
  1187 +
  1188 +qpdf_offset_t
  1189 +QPDF::processXRefStream(qpdf_offset_t xref_offset, QPDFObjectHandle& xref_obj)
  1190 +{
  1191 + auto damaged = [this, xref_offset](std::string_view msg) -> QPDFExc {
  1192 + return damagedPDF("xref stream", xref_offset, msg.data());
  1193 + };
  1194 +
  1195 + auto dict = xref_obj.getDict();
  1196 +
  1197 + auto [entry_size, W] = processXRefW(dict, damaged);
  1198 + int max_num_entries = processXRefSize(dict, entry_size, damaged);
  1199 + auto [num_entries, indx] = processXRefIndex(dict, max_num_entries, damaged);
  1200 +
  1201 + std::shared_ptr<Buffer> bp = xref_obj.getStreamData(qpdf_dl_specialized);
  1202 + size_t actual_size = bp->getSize();
  1203 + auto expected_size = toS(entry_size) * toS(num_entries);
  1204 +
  1205 + if (expected_size != actual_size) {
  1206 + QPDFExc x = damaged(
  1207 + "Cross-reference stream data has the wrong size; expected = " +
  1208 + std::to_string(expected_size) + "; actual = " + std::to_string(actual_size));
  1209 + if (expected_size > actual_size) {
  1210 + throw x;
  1211 + } else {
  1212 + warn(x);
  1213 + }
  1214 + }
  1215 +
  1216 + bool saw_first_compressed_object = false;
  1217 +
  1218 + // Actual size vs. expected size check above ensures that we will not overflow any buffers here.
  1219 + // We know that entry_size * num_entries is less or equal to the size of the buffer.
  1220 + auto p = bp->getBuffer();
  1221 + for (auto [obj, sec_entries]: indx) {
  1222 + // Process a subsection.
  1223 + for (int i = 0; i < sec_entries; ++i) {
  1224 + // Read this entry
  1225 + std::array<qpdf_offset_t, 3> fields{};
  1226 + if (W[0] == 0) {
  1227 + QTC::TC("qpdf", "QPDF default for xref stream field 0");
  1228 + fields[0] = 1;
  1229 + }
  1230 + for (size_t j = 0; j < 3; ++j) {
  1231 + for (int k = 0; k < W[j]; ++k) {
  1232 + fields[j] <<= 8;
  1233 + fields[j] |= *p++;
  1234 + }
  1235 + }
  1236 +
  1237 + // Get the generation number. The generation number is 0 unless this is an uncompressed
  1238 + // object record, in which case the generation number appears as the third field.
  1239 + if (saw_first_compressed_object) {
  1240 + if (fields[0] != 2) {
  1241 + m->uncompressed_after_compressed = true;
  1242 + }
  1243 + } else if (fields[0] == 2) {
  1244 + saw_first_compressed_object = true;
  1245 + }
  1246 + if (obj == 0) {
  1247 + // This is needed by checkLinearization()
  1248 + m->first_xref_item_offset = xref_offset;
  1249 + } else if (fields[0] == 0) {
  1250 + // Ignore fields[2], which we don't care about in this case. This works around the
  1251 + // issue of some PDF files that put invalid values, like -1, here for deleted
  1252 + // objects.
  1253 + insertFreeXrefEntry(QPDFObjGen(obj, 0));
  1254 + } else {
  1255 + insertXrefEntry(obj, toI(fields[0]), fields[1], toI(fields[2]));
  1256 + }
  1257 + ++obj;
  1258 + }
  1259 + }
  1260 +
  1261 + if (!m->trailer) {
  1262 + setTrailer(dict);
  1263 + }
  1264 +
  1265 + if (dict.hasKey("/Prev")) {
  1266 + if (!dict.getKey("/Prev").isInteger()) {
  1267 + throw damagedPDF(
  1268 + "xref stream", "/Prev key in xref stream dictionary is not an integer");
  1269 + }
  1270 + QTC::TC("qpdf", "QPDF prev key in xref stream dictionary");
  1271 + return dict.getKey("/Prev").getIntValue();
  1272 + } else {
  1273 + return 0;
  1274 + }
  1275 +}
  1276 +
  1277 +void
  1278 +QPDF::insertXrefEntry(int obj, int f0, qpdf_offset_t f1, int f2)
  1279 +{
  1280 + // Populate the xref table in such a way that the first reference to an object that we see,
  1281 + // which is the one in the latest xref table in which it appears, is the one that gets stored.
  1282 + // This works because we are reading more recent appends before older ones.
  1283 +
  1284 + // If there is already an entry for this object and generation in the table, it means that a
  1285 + // later xref table has registered this object. Disregard this one.
  1286 +
  1287 + if (obj > m->xref_table_max_id) {
  1288 + // ignore impossibly large object ids or object ids > Size.
  1289 + return;
  1290 + }
  1291 +
  1292 + if (m->deleted_objects.count(obj)) {
  1293 + QTC::TC("qpdf", "QPDF xref deleted object");
  1294 + return;
  1295 + }
  1296 +
  1297 + if (f0 == 2 && static_cast<int>(f1) == obj) {
  1298 + warn(damagedPDF("xref stream", "self-referential object stream " + std::to_string(obj)));
  1299 + return;
  1300 + }
  1301 +
  1302 + auto [iter, created] = m->xref_table.try_emplace(QPDFObjGen(obj, (f0 == 2 ? 0 : f2)));
  1303 + if (!created) {
  1304 + QTC::TC("qpdf", "QPDF xref reused object");
  1305 + return;
  1306 + }
  1307 +
  1308 + switch (f0) {
  1309 + case 1:
  1310 + // f2 is generation
  1311 + QTC::TC("qpdf", "QPDF xref gen > 0", ((f2 > 0) ? 1 : 0));
  1312 + iter->second = QPDFXRefEntry(f1);
  1313 + break;
  1314 +
  1315 + case 2:
  1316 + iter->second = QPDFXRefEntry(toI(f1), f2);
  1317 + break;
  1318 +
  1319 + default:
  1320 + throw damagedPDF("xref stream", "unknown xref stream entry type " + std::to_string(f0));
  1321 + break;
  1322 + }
  1323 +}
  1324 +
  1325 +void
  1326 +QPDF::insertFreeXrefEntry(QPDFObjGen og)
  1327 +{
  1328 + if (!m->xref_table.count(og)) {
  1329 + m->deleted_objects.insert(og.getObj());
  1330 + }
  1331 +}
  1332 +
  1333 +// Replace uncompressed object. This is used in xref recovery mode, which reads the file from
  1334 +// beginning to end.
  1335 +void
  1336 +QPDF::insertReconstructedXrefEntry(int obj, qpdf_offset_t f1, int f2)
  1337 +{
  1338 + if (!(obj > 0 && obj <= m->xref_table_max_id && 0 <= f2 && f2 < 65535)) {
  1339 + QTC::TC("qpdf", "QPDF xref overwrite invalid objgen");
  1340 + return;
  1341 + }
  1342 +
  1343 + QPDFObjGen og(obj, f2);
  1344 + if (!m->deleted_objects.count(obj)) {
  1345 + // deleted_objects stores the uncompressed objects removed from the xref table at the start
  1346 + // of recovery.
  1347 + QTC::TC("qpdf", "QPDF xref overwrite object");
  1348 + m->xref_table[QPDFObjGen(obj, f2)] = QPDFXRefEntry(f1);
  1349 + }
  1350 +}
  1351 +
  1352 +void
451 QPDF::showXRefTable() 1353 QPDF::showXRefTable()
452 { 1354 {
453 - m->objects.xref_table().show(); 1355 + auto& cout = *m->log->getInfo();
  1356 + for (auto const& iter: m->xref_table) {
  1357 + QPDFObjGen const& og = iter.first;
  1358 + QPDFXRefEntry const& entry = iter.second;
  1359 + cout << og.unparse('/') << ": ";
  1360 + switch (entry.getType()) {
  1361 + case 1:
  1362 + cout << "uncompressed; offset = " << entry.getOffset();
  1363 + break;
  1364 +
  1365 + case 2:
  1366 + *m->log->getInfo() << "compressed; stream = " << entry.getObjStreamNumber()
  1367 + << ", index = " << entry.getObjStreamIndex();
  1368 + break;
  1369 +
  1370 + default:
  1371 + throw std::logic_error("unknown cross-reference table type while"
  1372 + " showing xref_table");
  1373 + break;
  1374 + }
  1375 + m->log->info("\n");
  1376 + }
  1377 +}
  1378 +
  1379 +// Resolve all objects in the xref table. If this triggers a xref table reconstruction abort and
  1380 +// return false. Otherwise return true.
  1381 +bool
  1382 +QPDF::resolveXRefTable()
  1383 +{
  1384 + bool may_change = !m->reconstructed_xref;
  1385 + for (auto& iter: m->xref_table) {
  1386 + if (isUnresolved(iter.first)) {
  1387 + resolve(iter.first);
  1388 + if (may_change && m->reconstructed_xref) {
  1389 + return false;
  1390 + }
  1391 + }
  1392 + }
  1393 + return true;
454 } 1394 }
455 1395
456 // Ensure all objects in the pdf file, including those in indirect references, appear in the object 1396 // Ensure all objects in the pdf file, including those in indirect references, appear in the object
@@ -461,9 +1401,9 @@ QPDF::fixDanglingReferences(bool force) @@ -461,9 +1401,9 @@ QPDF::fixDanglingReferences(bool force)
461 if (m->fixed_dangling_refs) { 1401 if (m->fixed_dangling_refs) {
462 return; 1402 return;
463 } 1403 }
464 - if (!m->objects.xref_table().resolve()) { 1404 + if (!resolveXRefTable()) {
465 QTC::TC("qpdf", "QPDF fix dangling triggered xref reconstruction"); 1405 QTC::TC("qpdf", "QPDF fix dangling triggered xref reconstruction");
466 - m->objects.xref_table().resolve(); 1406 + resolveXRefTable();
467 } 1407 }
468 m->fixed_dangling_refs = true; 1408 m->fixed_dangling_refs = true;
469 } 1409 }
@@ -474,13 +1414,24 @@ QPDF::getObjectCount() @@ -474,13 +1414,24 @@ QPDF::getObjectCount()
474 // This method returns the next available indirect object number. makeIndirectObject uses it for 1414 // This method returns the next available indirect object number. makeIndirectObject uses it for
475 // this purpose. After fixDanglingReferences is called, all objects in the xref table will also 1415 // this purpose. After fixDanglingReferences is called, all objects in the xref table will also
476 // be in obj_cache. 1416 // be in obj_cache.
477 - return toS(m->objects.next_id().getObj() - 1); 1417 + fixDanglingReferences();
  1418 + QPDFObjGen og;
  1419 + if (!m->obj_cache.empty()) {
  1420 + og = (*(m->obj_cache.rbegin())).first;
  1421 + }
  1422 + return toS(og.getObj());
478 } 1423 }
479 1424
480 std::vector<QPDFObjectHandle> 1425 std::vector<QPDFObjectHandle>
481 QPDF::getAllObjects() 1426 QPDF::getAllObjects()
482 { 1427 {
483 - return m->objects.all(); 1428 + // After fixDanglingReferences is called, all objects are in the object cache.
  1429 + fixDanglingReferences();
  1430 + std::vector<QPDFObjectHandle> result;
  1431 + for (auto const& iter: m->obj_cache) {
  1432 + result.push_back(newIndirect(iter.first, iter.second.object));
  1433 + }
  1434 + return result;
484 } 1435 }
485 1436
486 void 1437 void
@@ -498,6 +1449,233 @@ QPDF::setLastObjectDescription(std::string const&amp; description, QPDFObjGen const&amp; @@ -498,6 +1449,233 @@ QPDF::setLastObjectDescription(std::string const&amp; description, QPDFObjGen const&amp;
498 } 1449 }
499 } 1450 }
500 1451
  1452 +QPDFObjectHandle
  1453 +QPDF::readTrailer()
  1454 +{
  1455 + qpdf_offset_t offset = m->file->tell();
  1456 + bool empty = false;
  1457 + auto object =
  1458 + QPDFParser(*m->file, "trailer", m->tokenizer, nullptr, this, true).parse(empty, false);
  1459 + if (empty) {
  1460 + // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in
  1461 + // actual PDF files and Adobe Reader appears to ignore them.
  1462 + warn(damagedPDF("trailer", "empty object treated as null"));
  1463 + } else if (object.isDictionary() && readToken(*m->file).isWord("stream")) {
  1464 + warn(damagedPDF("trailer", m->file->tell(), "stream keyword found in trailer"));
  1465 + }
  1466 + // Override last_offset so that it points to the beginning of the object we just read
  1467 + m->file->setLastOffset(offset);
  1468 + return object;
  1469 +}
  1470 +
  1471 +QPDFObjectHandle
  1472 +QPDF::readObject(std::string const& description, QPDFObjGen og)
  1473 +{
  1474 + setLastObjectDescription(description, og);
  1475 + qpdf_offset_t offset = m->file->tell();
  1476 + bool empty = false;
  1477 +
  1478 + StringDecrypter decrypter{this, og};
  1479 + StringDecrypter* decrypter_ptr = m->encp->encrypted ? &decrypter : nullptr;
  1480 + auto object =
  1481 + QPDFParser(*m->file, m->last_object_description, m->tokenizer, decrypter_ptr, this, true)
  1482 + .parse(empty, false);
  1483 + if (empty) {
  1484 + // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in
  1485 + // actual PDF files and Adobe Reader appears to ignore them.
  1486 + warn(damagedPDF(*m->file, m->file->getLastOffset(), "empty object treated as null"));
  1487 + return object;
  1488 + }
  1489 + auto token = readToken(*m->file);
  1490 + if (object.isDictionary() && token.isWord("stream")) {
  1491 + readStream(object, og, offset);
  1492 + token = readToken(*m->file);
  1493 + }
  1494 + if (!token.isWord("endobj")) {
  1495 + QTC::TC("qpdf", "QPDF err expected endobj");
  1496 + warn(damagedPDF("expected endobj"));
  1497 + }
  1498 + return object;
  1499 +}
  1500 +
  1501 +// After reading stream dictionary and stream keyword, read rest of stream.
  1502 +void
  1503 +QPDF::readStream(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset)
  1504 +{
  1505 + validateStreamLineEnd(object, og, offset);
  1506 +
  1507 + // Must get offset before accessing any additional objects since resolving a previously
  1508 + // unresolved indirect object will change file position.
  1509 + qpdf_offset_t stream_offset = m->file->tell();
  1510 + size_t length = 0;
  1511 +
  1512 + try {
  1513 + auto length_obj = object.getKey("/Length");
  1514 +
  1515 + if (!length_obj.isInteger()) {
  1516 + if (length_obj.isNull()) {
  1517 + QTC::TC("qpdf", "QPDF stream without length");
  1518 + throw damagedPDF(offset, "stream dictionary lacks /Length key");
  1519 + }
  1520 + QTC::TC("qpdf", "QPDF stream length not integer");
  1521 + throw damagedPDF(offset, "/Length key in stream dictionary is not an integer");
  1522 + }
  1523 +
  1524 + length = toS(length_obj.getUIntValue());
  1525 + // Seek in two steps to avoid potential integer overflow
  1526 + m->file->seek(stream_offset, SEEK_SET);
  1527 + m->file->seek(toO(length), SEEK_CUR);
  1528 + if (!readToken(*m->file).isWord("endstream")) {
  1529 + QTC::TC("qpdf", "QPDF missing endstream");
  1530 + throw damagedPDF("expected endstream");
  1531 + }
  1532 + } catch (QPDFExc& e) {
  1533 + if (m->attempt_recovery) {
  1534 + warn(e);
  1535 + length = recoverStreamLength(m->file, og, stream_offset);
  1536 + } else {
  1537 + throw;
  1538 + }
  1539 + }
  1540 + object = {QPDF_Stream::create(this, og, object, stream_offset, length)};
  1541 +}
  1542 +
  1543 +void
  1544 +QPDF::validateStreamLineEnd(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset)
  1545 +{
  1546 + // The PDF specification states that the word "stream" should be followed by either a carriage
  1547 + // return and a newline or by a newline alone. It specifically disallowed following it by a
  1548 + // carriage return alone since, in that case, there would be no way to tell whether the NL in a
  1549 + // CR NL sequence was part of the stream data. However, some readers, including Adobe reader,
  1550 + // accept a carriage return by itself when followed by a non-newline character, so that's what
  1551 + // we do here. We have also seen files that have extraneous whitespace between the stream
  1552 + // keyword and the newline.
  1553 + while (true) {
  1554 + char ch;
  1555 + if (m->file->read(&ch, 1) == 0) {
  1556 + // A premature EOF here will result in some other problem that will get reported at
  1557 + // another time.
  1558 + return;
  1559 + }
  1560 + if (ch == '\n') {
  1561 + // ready to read stream data
  1562 + QTC::TC("qpdf", "QPDF stream with NL only");
  1563 + return;
  1564 + }
  1565 + if (ch == '\r') {
  1566 + // Read another character
  1567 + if (m->file->read(&ch, 1) != 0) {
  1568 + if (ch == '\n') {
  1569 + // Ready to read stream data
  1570 + QTC::TC("qpdf", "QPDF stream with CRNL");
  1571 + } else {
  1572 + // Treat the \r by itself as the whitespace after endstream and start reading
  1573 + // stream data in spite of not having seen a newline.
  1574 + QTC::TC("qpdf", "QPDF stream with CR only");
  1575 + m->file->unreadCh(ch);
  1576 + warn(damagedPDF(
  1577 + m->file->tell(), "stream keyword followed by carriage return only"));
  1578 + }
  1579 + }
  1580 + return;
  1581 + }
  1582 + if (!QUtil::is_space(ch)) {
  1583 + QTC::TC("qpdf", "QPDF stream without newline");
  1584 + m->file->unreadCh(ch);
  1585 + warn(damagedPDF(
  1586 + m->file->tell(), "stream keyword not followed by proper line terminator"));
  1587 + return;
  1588 + }
  1589 + warn(damagedPDF(m->file->tell(), "stream keyword followed by extraneous whitespace"));
  1590 + }
  1591 +}
  1592 +
  1593 +QPDFObjectHandle
  1594 +QPDF::readObjectInStream(std::shared_ptr<InputSource>& input, int obj)
  1595 +{
  1596 + m->last_object_description.erase(7); // last_object_description starts with "object "
  1597 + m->last_object_description += std::to_string(obj);
  1598 + m->last_object_description += " 0";
  1599 +
  1600 + bool empty = false;
  1601 + auto object = QPDFParser(*input, m->last_object_description, m->tokenizer, nullptr, this, true)
  1602 + .parse(empty, false);
  1603 + if (empty) {
  1604 + // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in
  1605 + // actual PDF files and Adobe Reader appears to ignore them.
  1606 + warn(damagedPDF(*input, input->getLastOffset(), "empty object treated as null"));
  1607 + }
  1608 + return object;
  1609 +}
  1610 +
  1611 +bool
  1612 +QPDF::findEndstream()
  1613 +{
  1614 + // Find endstream or endobj. Position the input at that token.
  1615 + auto t = readToken(*m->file, 20);
  1616 + if (t.isWord("endobj") || t.isWord("endstream")) {
  1617 + m->file->seek(m->file->getLastOffset(), SEEK_SET);
  1618 + return true;
  1619 + }
  1620 + return false;
  1621 +}
  1622 +
  1623 +size_t
  1624 +QPDF::recoverStreamLength(
  1625 + std::shared_ptr<InputSource> input, QPDFObjGen const& og, qpdf_offset_t stream_offset)
  1626 +{
  1627 + // Try to reconstruct stream length by looking for endstream or endobj
  1628 + warn(damagedPDF(*input, stream_offset, "attempting to recover stream length"));
  1629 +
  1630 + PatternFinder ef(*this, &QPDF::findEndstream);
  1631 + size_t length = 0;
  1632 + if (m->file->findFirst("end", stream_offset, 0, ef)) {
  1633 + length = toS(m->file->tell() - stream_offset);
  1634 + // Reread endstream but, if it was endobj, don't skip that.
  1635 + QPDFTokenizer::Token t = readToken(*m->file);
  1636 + if (t.getValue() == "endobj") {
  1637 + m->file->seek(m->file->getLastOffset(), SEEK_SET);
  1638 + }
  1639 + }
  1640 +
  1641 + if (length) {
  1642 + auto end = stream_offset + toO(length);
  1643 + qpdf_offset_t found_offset = 0;
  1644 + QPDFObjGen found_og;
  1645 +
  1646 + // Make sure this is inside this object
  1647 + for (auto const& [current_og, entry]: m->xref_table) {
  1648 + if (entry.getType() == 1) {
  1649 + qpdf_offset_t obj_offset = entry.getOffset();
  1650 + if (found_offset < obj_offset && obj_offset < end) {
  1651 + found_offset = obj_offset;
  1652 + found_og = current_og;
  1653 + }
  1654 + }
  1655 + }
  1656 + if (!found_offset || found_og == og) {
  1657 + // If we are trying to recover an XRef stream the xref table will not contain and
  1658 + // won't contain any entries, therefore we cannot check the found length. Otherwise we
  1659 + // found endstream\nendobj within the space allowed for this object, so we're probably
  1660 + // in good shape.
  1661 + } else {
  1662 + QTC::TC("qpdf", "QPDF found wrong endstream in recovery");
  1663 + length = 0;
  1664 + }
  1665 + }
  1666 +
  1667 + if (length == 0) {
  1668 + warn(damagedPDF(
  1669 + *input, stream_offset, "unable to recover stream data; treating stream as empty"));
  1670 + } else {
  1671 + warn(damagedPDF(
  1672 + *input, stream_offset, "recovered stream length: " + std::to_string(length)));
  1673 + }
  1674 +
  1675 + QTC::TC("qpdf", "QPDF recovered stream length");
  1676 + return length;
  1677 +}
  1678 +
501 QPDFTokenizer::Token 1679 QPDFTokenizer::Token
502 QPDF::readToken(InputSource& input, size_t max_len) 1680 QPDF::readToken(InputSource& input, size_t max_len)
503 { 1681 {
@@ -505,38 +1683,398 @@ QPDF::readToken(InputSource&amp; input, size_t max_len) @@ -505,38 +1683,398 @@ QPDF::readToken(InputSource&amp; input, size_t max_len)
505 } 1683 }
506 1684
507 QPDFObjectHandle 1685 QPDFObjectHandle
  1686 +QPDF::readObjectAtOffset(
  1687 + bool try_recovery,
  1688 + qpdf_offset_t offset,
  1689 + std::string const& description,
  1690 + QPDFObjGen exp_og,
  1691 + QPDFObjGen& og,
  1692 + bool skip_cache_if_in_xref)
  1693 +{
  1694 + bool check_og = true;
  1695 + if (exp_og.getObj() == 0) {
  1696 + // This method uses an expect object ID of 0 to indicate that we don't know or don't care
  1697 + // what the actual object ID is at this offset. This is true when we read the xref stream
  1698 + // and linearization hint streams. In this case, we don't verify the expect object
  1699 + // ID/generation against what was read from the file. There is also no reason to attempt
  1700 + // xref recovery if we get a failure in this case since the read attempt was not triggered
  1701 + // by an xref lookup.
  1702 + check_og = false;
  1703 + try_recovery = false;
  1704 + }
  1705 + setLastObjectDescription(description, exp_og);
  1706 +
  1707 + if (!m->attempt_recovery) {
  1708 + try_recovery = false;
  1709 + }
  1710 +
  1711 + // Special case: if offset is 0, just return null. Some PDF writers, in particular
  1712 + // "Mac OS X 10.7.5 Quartz PDFContext", may store deleted objects in the xref table as
  1713 + // "0000000000 00000 n", which is not correct, but it won't hurt anything for us to ignore
  1714 + // these.
  1715 + if (offset == 0) {
  1716 + QTC::TC("qpdf", "QPDF bogus 0 offset", 0);
  1717 + warn(damagedPDF(0, "object has offset 0"));
  1718 + return QPDFObjectHandle::newNull();
  1719 + }
  1720 +
  1721 + m->file->seek(offset, SEEK_SET);
  1722 + try {
  1723 + QPDFTokenizer::Token tobjid = readToken(*m->file);
  1724 + bool objidok = tobjid.isInteger();
  1725 + QTC::TC("qpdf", "QPDF check objid", objidok ? 1 : 0);
  1726 + if (!objidok) {
  1727 + QTC::TC("qpdf", "QPDF expected n n obj");
  1728 + throw damagedPDF(offset, "expected n n obj");
  1729 + }
  1730 + QPDFTokenizer::Token tgen = readToken(*m->file);
  1731 + bool genok = tgen.isInteger();
  1732 + QTC::TC("qpdf", "QPDF check generation", genok ? 1 : 0);
  1733 + if (!genok) {
  1734 + throw damagedPDF(offset, "expected n n obj");
  1735 + }
  1736 + QPDFTokenizer::Token tobj = readToken(*m->file);
  1737 +
  1738 + bool objok = tobj.isWord("obj");
  1739 + QTC::TC("qpdf", "QPDF check obj", objok ? 1 : 0);
  1740 +
  1741 + if (!objok) {
  1742 + throw damagedPDF(offset, "expected n n obj");
  1743 + }
  1744 + int objid = QUtil::string_to_int(tobjid.getValue().c_str());
  1745 + int generation = QUtil::string_to_int(tgen.getValue().c_str());
  1746 + og = QPDFObjGen(objid, generation);
  1747 + if (objid == 0) {
  1748 + QTC::TC("qpdf", "QPDF object id 0");
  1749 + throw damagedPDF(offset, "object with ID 0");
  1750 + }
  1751 + if (check_og && (exp_og != og)) {
  1752 + QTC::TC("qpdf", "QPDF err wrong objid/generation");
  1753 + QPDFExc e = damagedPDF(offset, "expected " + exp_og.unparse(' ') + " obj");
  1754 + if (try_recovery) {
  1755 + // Will be retried below
  1756 + throw e;
  1757 + } else {
  1758 + // We can try reading the object anyway even if the ID doesn't match.
  1759 + warn(e);
  1760 + }
  1761 + }
  1762 + } catch (QPDFExc& e) {
  1763 + if (try_recovery) {
  1764 + // Try again after reconstructing xref table
  1765 + reconstruct_xref(e);
  1766 + if (m->xref_table.count(exp_og) && (m->xref_table[exp_og].getType() == 1)) {
  1767 + qpdf_offset_t new_offset = m->xref_table[exp_og].getOffset();
  1768 + QPDFObjectHandle result =
  1769 + readObjectAtOffset(false, new_offset, description, exp_og, og, false);
  1770 + QTC::TC("qpdf", "QPDF recovered in readObjectAtOffset");
  1771 + return result;
  1772 + } else {
  1773 + QTC::TC("qpdf", "QPDF object gone after xref reconstruction");
  1774 + warn(damagedPDF(
  1775 + "",
  1776 + 0,
  1777 + ("object " + exp_og.unparse(' ') +
  1778 + " not found in file after regenerating cross reference "
  1779 + "table")));
  1780 + return QPDFObjectHandle::newNull();
  1781 + }
  1782 + } else {
  1783 + throw;
  1784 + }
  1785 + }
  1786 +
  1787 + QPDFObjectHandle oh = readObject(description, og);
  1788 +
  1789 + if (isUnresolved(og)) {
  1790 + // Store the object in the cache here so it gets cached whether we first know the offset or
  1791 + // whether we first know the object ID and generation (in which we case we would get here
  1792 + // through resolve).
  1793 +
  1794 + // Determine the end offset of this object before and after white space. We use these
  1795 + // numbers to validate linearization hint tables. Offsets and lengths of objects may imply
  1796 + // the end of an object to be anywhere between these values.
  1797 + qpdf_offset_t end_before_space = m->file->tell();
  1798 +
  1799 + // skip over spaces
  1800 + while (true) {
  1801 + char ch;
  1802 + if (m->file->read(&ch, 1)) {
  1803 + if (!isspace(static_cast<unsigned char>(ch))) {
  1804 + m->file->seek(-1, SEEK_CUR);
  1805 + break;
  1806 + }
  1807 + } else {
  1808 + throw damagedPDF(m->file->tell(), "EOF after endobj");
  1809 + }
  1810 + }
  1811 + qpdf_offset_t end_after_space = m->file->tell();
  1812 + if (skip_cache_if_in_xref && m->xref_table.count(og)) {
  1813 + // Ordinarily, an object gets read here when resolved through xref table or stream. In
  1814 + // the special case of the xref stream and linearization hint tables, the offset comes
  1815 + // from another source. For the specific case of xref streams, the xref stream is read
  1816 + // and loaded into the object cache very early in parsing. Ordinarily, when a file is
  1817 + // updated by appending, items inserted into the xref table in later updates take
  1818 + // precedence over earlier items. In the special case of reusing the object number
  1819 + // previously used as the xref stream, we have the following order of events:
  1820 + //
  1821 + // * reused object gets loaded into the xref table
  1822 + // * old object is read here while reading xref streams
  1823 + // * original xref entry is ignored (since already in xref table)
  1824 + //
  1825 + // It is the second step that causes a problem. Even though the xref table is correct in
  1826 + // this case, the old object is already in the cache and so effectively prevails over
  1827 + // the reused object. To work around this issue, we have a special case for the xref
  1828 + // stream (via the skip_cache_if_in_xref): if the object is already in the xref stream,
  1829 + // don't cache what we read here.
  1830 + //
  1831 + // It is likely that the same bug may exist for linearization hint tables, but the
  1832 + // existing code uses end_before_space and end_after_space from the cache, so fixing
  1833 + // that would require more significant rework. The chances of a linearization hint
  1834 + // stream being reused seems smaller because the xref stream is probably the highest
  1835 + // object in the file and the linearization hint stream would be some random place in
  1836 + // the middle, so I'm leaving that bug unfixed for now. If the bug were to be fixed, we
  1837 + // could use !check_og in place of skip_cache_if_in_xref.
  1838 + QTC::TC("qpdf", "QPDF skipping cache for known unchecked object");
  1839 + } else {
  1840 + updateCache(og, oh.getObj(), end_before_space, end_after_space);
  1841 + }
  1842 + }
  1843 +
  1844 + return oh;
  1845 +}
  1846 +
  1847 +QPDFObject*
  1848 +QPDF::resolve(QPDFObjGen og)
  1849 +{
  1850 + if (!isUnresolved(og)) {
  1851 + return m->obj_cache[og].object.get();
  1852 + }
  1853 +
  1854 + if (m->resolving.count(og)) {
  1855 + // This can happen if an object references itself directly or indirectly in some key that
  1856 + // has to be resolved during object parsing, such as stream length.
  1857 + QTC::TC("qpdf", "QPDF recursion loop in resolve");
  1858 + warn(damagedPDF("", "loop detected resolving object " + og.unparse(' ')));
  1859 + updateCache(og, QPDF_Null::create(), -1, -1);
  1860 + return m->obj_cache[og].object.get();
  1861 + }
  1862 + ResolveRecorder rr(this, og);
  1863 +
  1864 + if (m->xref_table.count(og) != 0) {
  1865 + QPDFXRefEntry const& entry = m->xref_table[og];
  1866 + try {
  1867 + switch (entry.getType()) {
  1868 + case 1:
  1869 + {
  1870 + qpdf_offset_t offset = entry.getOffset();
  1871 + // Object stored in cache by readObjectAtOffset
  1872 + QPDFObjGen a_og;
  1873 + QPDFObjectHandle oh = readObjectAtOffset(true, offset, "", og, a_og, false);
  1874 + }
  1875 + break;
  1876 +
  1877 + case 2:
  1878 + resolveObjectsInStream(entry.getObjStreamNumber());
  1879 + break;
  1880 +
  1881 + default:
  1882 + throw damagedPDF(
  1883 + "", 0, ("object " + og.unparse('/') + " has unexpected xref entry type"));
  1884 + }
  1885 + } catch (QPDFExc& e) {
  1886 + warn(e);
  1887 + } catch (std::exception& e) {
  1888 + warn(damagedPDF(
  1889 + "", 0, ("object " + og.unparse('/') + ": error reading object: " + e.what())));
  1890 + }
  1891 + }
  1892 +
  1893 + if (isUnresolved(og)) {
  1894 + // PDF spec says unknown objects resolve to the null object.
  1895 + QTC::TC("qpdf", "QPDF resolve failure to null");
  1896 + updateCache(og, QPDF_Null::create(), -1, -1);
  1897 + }
  1898 +
  1899 + auto result(m->obj_cache[og].object);
  1900 + result->setDefaultDescription(this, og);
  1901 + return result.get();
  1902 +}
  1903 +
  1904 +void
  1905 +QPDF::resolveObjectsInStream(int obj_stream_number)
  1906 +{
  1907 + if (m->resolved_object_streams.count(obj_stream_number)) {
  1908 + return;
  1909 + }
  1910 + m->resolved_object_streams.insert(obj_stream_number);
  1911 + // Force resolution of object stream
  1912 + QPDFObjectHandle obj_stream = getObjectByID(obj_stream_number, 0);
  1913 + if (!obj_stream.isStream()) {
  1914 + throw damagedPDF(
  1915 + "supposed object stream " + std::to_string(obj_stream_number) + " is not a stream");
  1916 + }
  1917 +
  1918 + // For linearization data in the object, use the data from the object stream for the objects in
  1919 + // the stream.
  1920 + QPDFObjGen stream_og(obj_stream_number, 0);
  1921 + qpdf_offset_t end_before_space = m->obj_cache[stream_og].end_before_space;
  1922 + qpdf_offset_t end_after_space = m->obj_cache[stream_og].end_after_space;
  1923 +
  1924 + QPDFObjectHandle dict = obj_stream.getDict();
  1925 + if (!dict.isDictionaryOfType("/ObjStm")) {
  1926 + QTC::TC("qpdf", "QPDF ERR object stream with wrong type");
  1927 + warn(damagedPDF(
  1928 + "supposed object stream " + std::to_string(obj_stream_number) + " has wrong type"));
  1929 + }
  1930 +
  1931 + if (!(dict.getKey("/N").isInteger() && dict.getKey("/First").isInteger())) {
  1932 + throw damagedPDF(
  1933 + ("object stream " + std::to_string(obj_stream_number) + " has incorrect keys"));
  1934 + }
  1935 +
  1936 + int n = dict.getKey("/N").getIntValueAsInt();
  1937 + int first = dict.getKey("/First").getIntValueAsInt();
  1938 +
  1939 + std::map<int, int> offsets;
  1940 +
  1941 + std::shared_ptr<Buffer> bp = obj_stream.getStreamData(qpdf_dl_specialized);
  1942 + auto input = std::shared_ptr<InputSource>(
  1943 + // line-break
  1944 + new BufferInputSource(
  1945 + (m->file->getName() + " object stream " + std::to_string(obj_stream_number)),
  1946 + bp.get()));
  1947 +
  1948 + for (int i = 0; i < n; ++i) {
  1949 + QPDFTokenizer::Token tnum = readToken(*input);
  1950 + QPDFTokenizer::Token toffset = readToken(*input);
  1951 + if (!(tnum.isInteger() && toffset.isInteger())) {
  1952 + throw damagedPDF(
  1953 + *input,
  1954 + m->last_object_description,
  1955 + input->getLastOffset(),
  1956 + "expected integer in object stream header");
  1957 + }
  1958 +
  1959 + int num = QUtil::string_to_int(tnum.getValue().c_str());
  1960 + long long offset = QUtil::string_to_int(toffset.getValue().c_str());
  1961 + if (num > m->xref_table_max_id) {
  1962 + continue;
  1963 + }
  1964 + if (num == obj_stream_number) {
  1965 + QTC::TC("qpdf", "QPDF ignore self-referential object stream");
  1966 + warn(damagedPDF(
  1967 + *input,
  1968 + m->last_object_description,
  1969 + input->getLastOffset(),
  1970 + "object stream claims to contain itself"));
  1971 + continue;
  1972 + }
  1973 + offsets[num] = toI(offset + first);
  1974 + }
  1975 +
  1976 + // To avoid having to read the object stream multiple times, store all objects that would be
  1977 + // found here in the cache. Remember that some objects stored here might have been overridden
  1978 + // by new objects appended to the file, so it is necessary to recheck the xref table and only
  1979 + // cache what would actually be resolved here.
  1980 + m->last_object_description.clear();
  1981 + m->last_object_description += "object ";
  1982 + for (auto const& iter: offsets) {
  1983 + QPDFObjGen og(iter.first, 0);
  1984 + auto entry = m->xref_table.find(og);
  1985 + if (entry != m->xref_table.end() && entry->second.getType() == 2 &&
  1986 + entry->second.getObjStreamNumber() == obj_stream_number) {
  1987 + int offset = iter.second;
  1988 + input->seek(offset, SEEK_SET);
  1989 + QPDFObjectHandle oh = readObjectInStream(input, iter.first);
  1990 + updateCache(og, oh.getObj(), end_before_space, end_after_space);
  1991 + } else {
  1992 + QTC::TC("qpdf", "QPDF not caching overridden objstm object");
  1993 + }
  1994 + }
  1995 +}
  1996 +
  1997 +QPDFObjectHandle
508 QPDF::newIndirect(QPDFObjGen const& og, std::shared_ptr<QPDFObject> const& obj) 1998 QPDF::newIndirect(QPDFObjGen const& og, std::shared_ptr<QPDFObject> const& obj)
509 { 1999 {
510 obj->setDefaultDescription(this, og); 2000 obj->setDefaultDescription(this, og);
511 return {obj}; 2001 return {obj};
512 } 2002 }
513 2003
  2004 +void
  2005 +QPDF::updateCache(
  2006 + QPDFObjGen const& og,
  2007 + std::shared_ptr<QPDFObject> const& object,
  2008 + qpdf_offset_t end_before_space,
  2009 + qpdf_offset_t end_after_space)
  2010 +{
  2011 + object->setObjGen(this, og);
  2012 + if (isCached(og)) {
  2013 + auto& cache = m->obj_cache[og];
  2014 + cache.object->assign(object);
  2015 + cache.end_before_space = end_before_space;
  2016 + cache.end_after_space = end_after_space;
  2017 + } else {
  2018 + m->obj_cache[og] = ObjCache(object, end_before_space, end_after_space);
  2019 + }
  2020 +}
  2021 +
  2022 +bool
  2023 +QPDF::isCached(QPDFObjGen const& og)
  2024 +{
  2025 + return m->obj_cache.count(og) != 0;
  2026 +}
  2027 +
  2028 +bool
  2029 +QPDF::isUnresolved(QPDFObjGen const& og)
  2030 +{
  2031 + return !isCached(og) || m->obj_cache[og].object->isUnresolved();
  2032 +}
  2033 +
  2034 +QPDFObjGen
  2035 +QPDF::nextObjGen()
  2036 +{
  2037 + int max_objid = toI(getObjectCount());
  2038 + if (max_objid == std::numeric_limits<int>::max()) {
  2039 + throw std::range_error("max object id is too high to create new objects");
  2040 + }
  2041 + return QPDFObjGen(max_objid + 1, 0);
  2042 +}
  2043 +
  2044 +QPDFObjectHandle
  2045 +QPDF::makeIndirectFromQPDFObject(std::shared_ptr<QPDFObject> const& obj)
  2046 +{
  2047 + QPDFObjGen next{nextObjGen()};
  2048 + m->obj_cache[next] = ObjCache(obj, -1, -1);
  2049 + return newIndirect(next, m->obj_cache[next].object);
  2050 +}
  2051 +
514 QPDFObjectHandle 2052 QPDFObjectHandle
515 QPDF::makeIndirectObject(QPDFObjectHandle oh) 2053 QPDF::makeIndirectObject(QPDFObjectHandle oh)
516 { 2054 {
517 if (!oh) { 2055 if (!oh) {
518 throw std::logic_error("attempted to make an uninitialized QPDFObjectHandle indirect"); 2056 throw std::logic_error("attempted to make an uninitialized QPDFObjectHandle indirect");
519 } 2057 }
520 - return m->objects.make_indirect(oh.getObj()); 2058 + return makeIndirectFromQPDFObject(oh.getObj());
521 } 2059 }
522 2060
523 QPDFObjectHandle 2061 QPDFObjectHandle
524 QPDF::newReserved() 2062 QPDF::newReserved()
525 { 2063 {
526 - return m->objects.make_indirect(QPDF_Reserved::create()); 2064 + return makeIndirectFromQPDFObject(QPDF_Reserved::create());
527 } 2065 }
528 2066
529 QPDFObjectHandle 2067 QPDFObjectHandle
530 QPDF::newIndirectNull() 2068 QPDF::newIndirectNull()
531 { 2069 {
532 - return m->objects.make_indirect(QPDF_Null::create()); 2070 + return makeIndirectFromQPDFObject(QPDF_Null::create());
533 } 2071 }
534 2072
535 QPDFObjectHandle 2073 QPDFObjectHandle
536 QPDF::newStream() 2074 QPDF::newStream()
537 { 2075 {
538 - return m->objects.make_indirect(  
539 - QPDF_Stream::create(this, m->objects.next_id(), QPDFObjectHandle::newDictionary(), 0, 0)); 2076 + return makeIndirectFromQPDFObject(
  2077 + QPDF_Stream::create(this, nextObjGen(), QPDFObjectHandle::newDictionary(), 0, 0));
540 } 2078 }
541 2079
542 QPDFObjectHandle 2080 QPDFObjectHandle
@@ -555,40 +2093,93 @@ QPDF::newStream(std::string const& data) @@ -555,40 +2093,93 @@ QPDF::newStream(std::string const& data)
555 return result; 2093 return result;
556 } 2094 }
557 2095
  2096 +std::shared_ptr<QPDFObject>
  2097 +QPDF::getObjectForParser(int id, int gen, bool parse_pdf)
  2098 +{
  2099 + // This method is called by the parser and therefore must not resolve any objects.
  2100 + auto og = QPDFObjGen(id, gen);
  2101 + if (auto iter = m->obj_cache.find(og); iter != m->obj_cache.end()) {
  2102 + return iter->second.object;
  2103 + }
  2104 + if (m->xref_table.count(og) || !m->parsed) {
  2105 + return m->obj_cache.insert({og, QPDF_Unresolved::create(this, og)}).first->second.object;
  2106 + }
  2107 + if (parse_pdf) {
  2108 + return QPDF_Null::create();
  2109 + }
  2110 + return m->obj_cache.insert({og, QPDF_Null::create(this, og)}).first->second.object;
  2111 +}
  2112 +
  2113 +std::shared_ptr<QPDFObject>
  2114 +QPDF::getObjectForJSON(int id, int gen)
  2115 +{
  2116 + auto og = QPDFObjGen(id, gen);
  2117 + auto [it, inserted] = m->obj_cache.try_emplace(og);
  2118 + auto& obj = it->second.object;
  2119 + if (inserted) {
  2120 + obj = (m->parsed && !m->xref_table.count(og)) ? QPDF_Null::create(this, og)
  2121 + : QPDF_Unresolved::create(this, og);
  2122 + }
  2123 + return obj;
  2124 +}
  2125 +
558 QPDFObjectHandle 2126 QPDFObjectHandle
559 QPDF::getObject(QPDFObjGen const& og) 2127 QPDF::getObject(QPDFObjGen const& og)
560 { 2128 {
561 - return m->objects.get(og); 2129 + if (auto it = m->obj_cache.find(og); it != m->obj_cache.end()) {
  2130 + return {it->second.object};
  2131 + } else if (m->parsed && !m->xref_table.count(og)) {
  2132 + return QPDF_Null::create();
  2133 + } else {
  2134 + auto result = m->obj_cache.try_emplace(og, QPDF_Unresolved::create(this, og), -1, -1);
  2135 + return {result.first->second.object};
  2136 + }
562 } 2137 }
563 2138
564 QPDFObjectHandle 2139 QPDFObjectHandle
565 -QPDF::getObject(int id, int gen) 2140 +QPDF::getObject(int objid, int generation)
566 { 2141 {
567 - return m->objects.get(id, gen); 2142 + return getObject(QPDFObjGen(objid, generation));
568 } 2143 }
569 2144
570 QPDFObjectHandle 2145 QPDFObjectHandle
571 QPDF::getObjectByObjGen(QPDFObjGen const& og) 2146 QPDF::getObjectByObjGen(QPDFObjGen const& og)
572 { 2147 {
573 - return m->objects.get(og); 2148 + return getObject(og);
574 } 2149 }
575 2150
576 QPDFObjectHandle 2151 QPDFObjectHandle
577 -QPDF::getObjectByID(int id, int gen) 2152 +QPDF::getObjectByID(int objid, int generation)
578 { 2153 {
579 - return m->objects.get(id, gen); 2154 + return getObject(QPDFObjGen(objid, generation));
580 } 2155 }
581 2156
582 void 2157 void
583 -QPDF::replaceObject(int id, int gen, QPDFObjectHandle replacement) 2158 +QPDF::replaceObject(int objid, int generation, QPDFObjectHandle oh)
584 { 2159 {
585 - m->objects.replace(QPDFObjGen(id, gen), replacement); 2160 + replaceObject(QPDFObjGen(objid, generation), oh);
586 } 2161 }
587 2162
588 void 2163 void
589 -QPDF::replaceObject(QPDFObjGen const& og, QPDFObjectHandle replacement) 2164 +QPDF::replaceObject(QPDFObjGen const& og, QPDFObjectHandle oh)
590 { 2165 {
591 - m->objects.replace(og, replacement); 2166 + if (!oh || (oh.isIndirect() && !(oh.isStream() && oh.getObjGen() == og))) {
  2167 + QTC::TC("qpdf", "QPDF replaceObject called with indirect object");
  2168 + throw std::logic_error("QPDF::replaceObject called with indirect object handle");
  2169 + }
  2170 + updateCache(og, oh.getObj(), -1, -1);
  2171 +}
  2172 +
  2173 +void
  2174 +QPDF::removeObject(QPDFObjGen og)
  2175 +{
  2176 + m->xref_table.erase(og);
  2177 + if (auto cached = m->obj_cache.find(og); cached != m->obj_cache.end()) {
  2178 + // Take care of any object handles that may be floating around.
  2179 + cached->second.object->assign(QPDF_Null::create());
  2180 + cached->second.object->setObjGen(nullptr, QPDFObjGen());
  2181 + m->obj_cache.erase(cached);
  2182 + }
592 } 2183 }
593 2184
594 void 2185 void
@@ -599,7 +2190,7 @@ QPDF::replaceReserved(QPDFObjectHandle reserved, QPDFObjectHandle replacement) @@ -599,7 +2190,7 @@ QPDF::replaceReserved(QPDFObjectHandle reserved, QPDFObjectHandle replacement)
599 if (!(tc == ::ot_reserved || tc == ::ot_null)) { 2190 if (!(tc == ::ot_reserved || tc == ::ot_null)) {
600 throw std::logic_error("replaceReserved called with non-reserved object"); 2191 throw std::logic_error("replaceReserved called with non-reserved object");
601 } 2192 }
602 - m->objects.replace(reserved.getObjGen(), replacement); 2193 + replaceObject(reserved.getObjGen(), replacement);
603 } 2194 }
604 2195
605 QPDFObjectHandle 2196 QPDFObjectHandle
@@ -851,7 +2442,7 @@ QPDF::copyStreamData(QPDFObjectHandle result, QPDFObjectHandle foreign) @@ -851,7 +2442,7 @@ QPDF::copyStreamData(QPDFObjectHandle result, QPDFObjectHandle foreign)
851 } else { 2442 } else {
852 auto foreign_stream_data = std::make_shared<ForeignStreamData>( 2443 auto foreign_stream_data = std::make_shared<ForeignStreamData>(
853 foreign_stream_qpdf.m->encp, 2444 foreign_stream_qpdf.m->encp,
854 - foreign_stream_qpdf.m->file_sp, 2445 + foreign_stream_qpdf.m->file,
855 foreign.getObjGen(), 2446 foreign.getObjGen(),
856 stream->getParsedOffset(), 2447 stream->getParsedOffset(),
857 stream->getLength(), 2448 stream->getLength(),
@@ -865,13 +2456,16 @@ QPDF::copyStreamData(QPDFObjectHandle result, QPDFObjectHandle foreign) @@ -865,13 +2456,16 @@ QPDF::copyStreamData(QPDFObjectHandle result, QPDFObjectHandle foreign)
865 void 2456 void
866 QPDF::swapObjects(int objid1, int generation1, int objid2, int generation2) 2457 QPDF::swapObjects(int objid1, int generation1, int objid2, int generation2)
867 { 2458 {
868 - m->objects.swap(QPDFObjGen(objid1, generation1), QPDFObjGen(objid2, generation2)); 2459 + swapObjects(QPDFObjGen(objid1, generation1), QPDFObjGen(objid2, generation2));
869 } 2460 }
870 2461
871 void 2462 void
872 QPDF::swapObjects(QPDFObjGen const& og1, QPDFObjGen const& og2) 2463 QPDF::swapObjects(QPDFObjGen const& og1, QPDFObjGen const& og2)
873 { 2464 {
874 - m->objects.swap(og1, og2); 2465 + // Force objects to be read from the input source if needed, then swap them in the cache.
  2466 + resolve(og1);
  2467 + resolve(og2);
  2468 + m->obj_cache[og1].object->swapWith(m->obj_cache[og2].object);
875 } 2469 }
876 2470
877 unsigned long long 2471 unsigned long long
@@ -932,13 +2526,13 @@ QPDF::getExtensionLevel() @@ -932,13 +2526,13 @@ QPDF::getExtensionLevel()
932 QPDFObjectHandle 2526 QPDFObjectHandle
933 QPDF::getTrailer() 2527 QPDF::getTrailer()
934 { 2528 {
935 - return m->objects.trailer(); 2529 + return m->trailer;
936 } 2530 }
937 2531
938 QPDFObjectHandle 2532 QPDFObjectHandle
939 QPDF::getRoot() 2533 QPDF::getRoot()
940 { 2534 {
941 - auto root = m->objects.trailer().getKey("/Root"); 2535 + QPDFObjectHandle root = m->trailer.getKey("/Root");
942 if (!root.isDictionary()) { 2536 if (!root.isDictionary()) {
943 throw damagedPDF("", 0, "unable to find /Root dictionary"); 2537 throw damagedPDF("", 0, "unable to find /Root dictionary");
944 } else if ( 2538 } else if (
@@ -954,10 +2548,145 @@ QPDF::getRoot() @@ -954,10 +2548,145 @@ QPDF::getRoot()
954 std::map<QPDFObjGen, QPDFXRefEntry> 2548 std::map<QPDFObjGen, QPDFXRefEntry>
955 QPDF::getXRefTable() 2549 QPDF::getXRefTable()
956 { 2550 {
957 - if (!m->objects.xref_table().initialized()) { 2551 + return getXRefTableInternal();
  2552 +}
  2553 +
  2554 +std::map<QPDFObjGen, QPDFXRefEntry> const&
  2555 +QPDF::getXRefTableInternal()
  2556 +{
  2557 + if (!m->parsed) {
958 throw std::logic_error("QPDF::getXRefTable called before parsing."); 2558 throw std::logic_error("QPDF::getXRefTable called before parsing.");
959 } 2559 }
960 - return m->objects.xref_table().as_map(); 2560 +
  2561 + return m->xref_table;
  2562 +}
  2563 +
  2564 +size_t
  2565 +QPDF::tableSize()
  2566 +{
  2567 + // If obj_cache is dense, accommodate all objects in tables, else accommodate only original
  2568 + // objects.
  2569 + auto max_xref = m->xref_table.size() ? m->xref_table.crbegin()->first.getObj() : 0;
  2570 + auto max_obj = m->obj_cache.size() ? m->obj_cache.crbegin()->first.getObj() : 0;
  2571 + auto max_id = std::numeric_limits<int>::max() - 1;
  2572 + if (max_obj >= max_id || max_xref >= max_id) {
  2573 + // Temporary fix. Long-term solution is
  2574 + // - QPDFObjGen to enforce objgens are valid and sensible
  2575 + // - xref table and obj cache to protect against insertion of impossibly large obj ids
  2576 + stopOnError("Impossibly large object id encountered.");
  2577 + }
  2578 + if (max_obj < 1.1 * std::max(toI(m->obj_cache.size()), max_xref)) {
  2579 + return toS(++max_obj);
  2580 + }
  2581 + return toS(++max_xref);
  2582 +}
  2583 +
  2584 +std::vector<QPDFObjGen>
  2585 +QPDF::getCompressibleObjVector()
  2586 +{
  2587 + return getCompressibleObjGens<QPDFObjGen>();
  2588 +}
  2589 +
  2590 +std::vector<bool>
  2591 +QPDF::getCompressibleObjSet()
  2592 +{
  2593 + return getCompressibleObjGens<bool>();
  2594 +}
  2595 +
  2596 +template <typename T>
  2597 +std::vector<T>
  2598 +QPDF::getCompressibleObjGens()
  2599 +{
  2600 + // Return a list of objects that are allowed to be in object streams. Walk through the objects
  2601 + // by traversing the document from the root, including a traversal of the pages tree. This
  2602 + // makes objects that are on the same page more likely to be in the same object stream,
  2603 + // which is slightly more efficient, particularly with linearized files. This is better than
  2604 + // iterating through the xref table since it avoids preserving orphaned items.
  2605 +
  2606 + // Exclude encryption dictionary, if any
  2607 + QPDFObjectHandle encryption_dict = m->trailer.getKey("/Encrypt");
  2608 + QPDFObjGen encryption_dict_og = encryption_dict.getObjGen();
  2609 +
  2610 + const size_t max_obj = getObjectCount();
  2611 + std::vector<bool> visited(max_obj, false);
  2612 + std::vector<QPDFObjectHandle> queue;
  2613 + queue.reserve(512);
  2614 + queue.push_back(m->trailer);
  2615 + std::vector<T> result;
  2616 + if constexpr (std::is_same_v<T, QPDFObjGen>) {
  2617 + result.reserve(m->obj_cache.size());
  2618 + } else if constexpr (std::is_same_v<T, bool>) {
  2619 + result.resize(max_obj + 1U, false);
  2620 + } else {
  2621 + throw std::logic_error("Unsupported type in QPDF::getCompressibleObjGens");
  2622 + }
  2623 + while (!queue.empty()) {
  2624 + auto obj = queue.back();
  2625 + queue.pop_back();
  2626 + if (obj.getObjectID() > 0) {
  2627 + QPDFObjGen og = obj.getObjGen();
  2628 + const size_t id = toS(og.getObj() - 1);
  2629 + if (id >= max_obj) {
  2630 + throw std::logic_error(
  2631 + "unexpected object id encountered in getCompressibleObjGens");
  2632 + }
  2633 + if (visited[id]) {
  2634 + QTC::TC("qpdf", "QPDF loop detected traversing objects");
  2635 + continue;
  2636 + }
  2637 +
  2638 + // Check whether this is the current object. If not, remove it (which changes it into a
  2639 + // direct null and therefore stops us from revisiting it) and move on to the next object
  2640 + // in the queue.
  2641 + auto upper = m->obj_cache.upper_bound(og);
  2642 + if (upper != m->obj_cache.end() && upper->first.getObj() == og.getObj()) {
  2643 + removeObject(og);
  2644 + continue;
  2645 + }
  2646 +
  2647 + visited[id] = true;
  2648 +
  2649 + if (og == encryption_dict_og) {
  2650 + QTC::TC("qpdf", "QPDF exclude encryption dictionary");
  2651 + } else if (!(obj.isStream() ||
  2652 + (obj.isDictionaryOfType("/Sig") && obj.hasKey("/ByteRange") &&
  2653 + obj.hasKey("/Contents")))) {
  2654 + if constexpr (std::is_same_v<T, QPDFObjGen>) {
  2655 + result.push_back(og);
  2656 + } else if constexpr (std::is_same_v<T, bool>) {
  2657 + result[id + 1U] = true;
  2658 + }
  2659 + }
  2660 + }
  2661 + if (obj.isStream()) {
  2662 + QPDFObjectHandle dict = obj.getDict();
  2663 + std::set<std::string> keys = dict.getKeys();
  2664 + for (auto iter = keys.rbegin(); iter != keys.rend(); ++iter) {
  2665 + std::string const& key = *iter;
  2666 + QPDFObjectHandle value = dict.getKey(key);
  2667 + if (key == "/Length") {
  2668 + // omit stream lengths
  2669 + if (value.isIndirect()) {
  2670 + QTC::TC("qpdf", "QPDF exclude indirect length");
  2671 + }
  2672 + } else {
  2673 + queue.push_back(value);
  2674 + }
  2675 + }
  2676 + } else if (obj.isDictionary()) {
  2677 + std::set<std::string> keys = obj.getKeys();
  2678 + for (auto iter = keys.rbegin(); iter != keys.rend(); ++iter) {
  2679 + queue.push_back(obj.getKey(*iter));
  2680 + }
  2681 + } else if (obj.isArray()) {
  2682 + int n = obj.getArrayNItems();
  2683 + for (int i = 1; i <= n; ++i) {
  2684 + queue.push_back(obj.getArrayItem(n - i));
  2685 + }
  2686 + }
  2687 + }
  2688 +
  2689 + return result;
961 } 2690 }
962 2691
963 bool 2692 bool
@@ -1037,7 +2766,7 @@ QPDF::pipeStreamData( @@ -1037,7 +2766,7 @@ QPDF::pipeStreamData(
1037 { 2766 {
1038 return pipeStreamData( 2767 return pipeStreamData(
1039 m->encp, 2768 m->encp,
1040 - m->file_sp, 2769 + m->file,
1041 *this, 2770 *this,
1042 og, 2771 og,
1043 offset, 2772 offset,
libqpdf/QPDFJob.cc
@@ -13,6 +13,7 @@ @@ -13,6 +13,7 @@
13 #include <qpdf/Pl_StdioFile.hh> 13 #include <qpdf/Pl_StdioFile.hh>
14 #include <qpdf/Pl_String.hh> 14 #include <qpdf/Pl_String.hh>
15 #include <qpdf/QIntC.hh> 15 #include <qpdf/QIntC.hh>
  16 +#include <qpdf/QPDF.hh>
16 #include <qpdf/QPDFAcroFormDocumentHelper.hh> 17 #include <qpdf/QPDFAcroFormDocumentHelper.hh>
17 #include <qpdf/QPDFCryptoProvider.hh> 18 #include <qpdf/QPDFCryptoProvider.hh>
18 #include <qpdf/QPDFEmbeddedFileDocumentHelper.hh> 19 #include <qpdf/QPDFEmbeddedFileDocumentHelper.hh>
@@ -25,7 +26,6 @@ @@ -25,7 +26,6 @@
25 #include <qpdf/QPDFSystemError.hh> 26 #include <qpdf/QPDFSystemError.hh>
26 #include <qpdf/QPDFUsage.hh> 27 #include <qpdf/QPDFUsage.hh>
27 #include <qpdf/QPDFWriter.hh> 28 #include <qpdf/QPDFWriter.hh>
28 -#include <qpdf/QPDF_private.hh>  
29 #include <qpdf/QTC.hh> 29 #include <qpdf/QTC.hh>
30 #include <qpdf/QUtil.hh> 30 #include <qpdf/QUtil.hh>
31 31
libqpdf/QPDFWriter.cc
@@ -14,10 +14,10 @@ @@ -14,10 +14,10 @@
14 #include <qpdf/Pl_RC4.hh> 14 #include <qpdf/Pl_RC4.hh>
15 #include <qpdf/Pl_StdioFile.hh> 15 #include <qpdf/Pl_StdioFile.hh>
16 #include <qpdf/QIntC.hh> 16 #include <qpdf/QIntC.hh>
  17 +#include <qpdf/QPDF.hh>
17 #include <qpdf/QPDFObjectHandle.hh> 18 #include <qpdf/QPDFObjectHandle.hh>
18 #include <qpdf/QPDF_Name.hh> 19 #include <qpdf/QPDF_Name.hh>
19 #include <qpdf/QPDF_String.hh> 20 #include <qpdf/QPDF_String.hh>
20 -#include <qpdf/QPDF_private.hh>  
21 #include <qpdf/QTC.hh> 21 #include <qpdf/QTC.hh>
22 #include <qpdf/QUtil.hh> 22 #include <qpdf/QUtil.hh>
23 #include <qpdf/RC4.hh> 23 #include <qpdf/RC4.hh>
@@ -1698,6 +1698,7 @@ QPDFWriter::writeObjectStream(QPDFObjectHandle object) @@ -1698,6 +1698,7 @@ QPDFWriter::writeObjectStream(QPDFObjectHandle object)
1698 if (obj_to_write.isStream()) { 1698 if (obj_to_write.isStream()) {
1699 // This condition occurred in a fuzz input. Ideally we should block it at parse 1699 // This condition occurred in a fuzz input. Ideally we should block it at parse
1700 // time, but it's not clear to me how to construct a case for this. 1700 // time, but it's not clear to me how to construct a case for this.
  1701 + QTC::TC("qpdf", "QPDFWriter stream in ostream");
1701 obj_to_write.warnIfPossible("stream found inside object stream; treating as null"); 1702 obj_to_write.warnIfPossible("stream found inside object stream; treating as null");
1702 obj_to_write = QPDFObjectHandle::newNull(); 1703 obj_to_write = QPDFObjectHandle::newNull();
1703 } 1704 }
@@ -1936,26 +1937,47 @@ void @@ -1936,26 +1937,47 @@ void
1936 QPDFWriter::preserveObjectStreams() 1937 QPDFWriter::preserveObjectStreams()
1937 { 1938 {
1938 auto const& xref = QPDF::Writer::getXRefTable(m->pdf); 1939 auto const& xref = QPDF::Writer::getXRefTable(m->pdf);
1939 - m->obj.streams_empty = !xref.object_streams();  
1940 - if (m->obj.streams_empty) {  
1941 - return;  
1942 - }  
1943 - // This code filters out objects that are not allowed to be in object streams. In addition to  
1944 - // removing objects that were erroneously included in object streams in the source PDF, it also  
1945 - // prevents unreferenced objects from being included. 1940 + // Our object_to_object_stream map has to map ObjGen -> ObjGen since we may be generating object
  1941 + // streams out of old objects that have generation numbers greater than zero. However in an
  1942 + // existing PDF, all object stream objects and all objects in them must have generation 0
  1943 + // because the PDF spec does not provide any way to do otherwise. This code filters out objects
  1944 + // that are not allowed to be in object streams. In addition to removing objects that were
  1945 + // erroneously included in object streams in the source PDF, it also prevents unreferenced
  1946 + // objects from being included.
  1947 + auto end = xref.cend();
  1948 + m->obj.streams_empty = true;
1946 if (m->preserve_unreferenced_objects) { 1949 if (m->preserve_unreferenced_objects) {
1947 - QTC::TC("qpdf", "QPDFWriter preserve object streams preserve unreferenced");  
1948 - for (auto [id, stream]: xref.compressed_objects()) {  
1949 - m->obj[id].object_stream = stream; 1950 + for (auto iter = xref.cbegin(); iter != end; ++iter) {
  1951 + if (iter->second.getType() == 2) {
  1952 + // Pdf contains object streams.
  1953 + QTC::TC("qpdf", "QPDFWriter preserve object streams preserve unreferenced");
  1954 + m->obj.streams_empty = false;
  1955 + m->obj[iter->first].object_stream = iter->second.getObjStreamNumber();
  1956 + }
1950 } 1957 }
1951 } else { 1958 } else {
1952 - QTC::TC("qpdf", "QPDFWriter preserve object streams");  
1953 - auto eligible = QPDF::Writer::getCompressibleObjSet(m->pdf);  
1954 - for (auto [id, stream]: xref.compressed_objects()) {  
1955 - if (eligible[id]) {  
1956 - m->obj[id].object_stream = stream;  
1957 - } else {  
1958 - QTC::TC("qpdf", "QPDFWriter exclude from object stream"); 1959 + // Start by scanning for first compressed object in case we don't have any object streams to
  1960 + // process.
  1961 + for (auto iter = xref.cbegin(); iter != end; ++iter) {
  1962 + if (iter->second.getType() == 2) {
  1963 + // Pdf contains object streams.
  1964 + QTC::TC("qpdf", "QPDFWriter preserve object streams");
  1965 + m->obj.streams_empty = false;
  1966 + auto eligible = QPDF::Writer::getCompressibleObjSet(m->pdf);
  1967 + // The object pointed to by iter may be a previous generation, in which case it is
  1968 + // removed by getCompressibleObjSet. We need to restart the loop (while the object
  1969 + // table may contain multiple generations of an object).
  1970 + for (iter = xref.cbegin(); iter != end; ++iter) {
  1971 + if (iter->second.getType() == 2) {
  1972 + auto id = static_cast<size_t>(iter->first.getObj());
  1973 + if (id < eligible.size() && eligible[id]) {
  1974 + m->obj[iter->first].object_stream = iter->second.getObjStreamNumber();
  1975 + } else {
  1976 + QTC::TC("qpdf", "QPDFWriter exclude from object stream");
  1977 + }
  1978 + }
  1979 + }
  1980 + return;
1959 } 1981 }
1960 } 1982 }
1961 } 1983 }
libqpdf/QPDF_Stream.cc
@@ -10,8 +10,8 @@ @@ -10,8 +10,8 @@
10 #include <qpdf/Pl_Flate.hh> 10 #include <qpdf/Pl_Flate.hh>
11 #include <qpdf/Pl_QPDFTokenizer.hh> 11 #include <qpdf/Pl_QPDFTokenizer.hh>
12 #include <qpdf/QIntC.hh> 12 #include <qpdf/QIntC.hh>
  13 +#include <qpdf/QPDF.hh>
13 #include <qpdf/QPDFExc.hh> 14 #include <qpdf/QPDFExc.hh>
14 -#include <qpdf/QPDF_private.hh>  
15 #include <qpdf/QTC.hh> 15 #include <qpdf/QTC.hh>
16 #include <qpdf/QUtil.hh> 16 #include <qpdf/QUtil.hh>
17 #include <qpdf/SF_ASCII85Decode.hh> 17 #include <qpdf/SF_ASCII85Decode.hh>
libqpdf/QPDF_encryption.cc
@@ -3,7 +3,7 @@ @@ -3,7 +3,7 @@
3 3
4 #include <qpdf/assert_debug.h> 4 #include <qpdf/assert_debug.h>
5 5
6 -#include <qpdf/QPDF_private.hh> 6 +#include <qpdf/QPDF.hh>
7 7
8 #include <qpdf/QPDFExc.hh> 8 #include <qpdf/QPDFExc.hh>
9 9
@@ -727,7 +727,7 @@ QPDF::initializeEncryption() @@ -727,7 +727,7 @@ QPDF::initializeEncryption()
727 // at /Encrypt again. Otherwise, things could go wrong if someone mutates the encryption 727 // at /Encrypt again. Otherwise, things could go wrong if someone mutates the encryption
728 // dictionary. 728 // dictionary.
729 729
730 - if (!m->objects.trailer().hasKey("/Encrypt")) { 730 + if (!m->trailer.hasKey("/Encrypt")) {
731 return; 731 return;
732 } 732 }
733 733
@@ -736,7 +736,7 @@ QPDF::initializeEncryption() @@ -736,7 +736,7 @@ QPDF::initializeEncryption()
736 m->encp->encrypted = true; 736 m->encp->encrypted = true;
737 737
738 std::string id1; 738 std::string id1;
739 - QPDFObjectHandle id_obj = m->objects.trailer().getKey("/ID"); 739 + QPDFObjectHandle id_obj = m->trailer.getKey("/ID");
740 if ((id_obj.isArray() && (id_obj.getArrayNItems() == 2) && id_obj.getArrayItem(0).isString())) { 740 if ((id_obj.isArray() && (id_obj.getArrayNItems() == 2) && id_obj.getArrayItem(0).isString())) {
741 id1 = id_obj.getArrayItem(0).getStringValue(); 741 id1 = id_obj.getArrayItem(0).getStringValue();
742 } else { 742 } else {
@@ -745,7 +745,7 @@ QPDF::initializeEncryption() @@ -745,7 +745,7 @@ QPDF::initializeEncryption()
745 warn(damagedPDF("trailer", "invalid /ID in trailer dictionary")); 745 warn(damagedPDF("trailer", "invalid /ID in trailer dictionary"));
746 } 746 }
747 747
748 - QPDFObjectHandle encryption_dict = m->objects.trailer().getKey("/Encrypt"); 748 + QPDFObjectHandle encryption_dict = m->trailer.getKey("/Encrypt");
749 if (!encryption_dict.isDictionary()) { 749 if (!encryption_dict.isDictionary()) {
750 throw damagedPDF("/Encrypt in trailer dictionary is not a dictionary"); 750 throw damagedPDF("/Encrypt in trailer dictionary is not a dictionary");
751 } 751 }
libqpdf/QPDF_json.cc
@@ -51,6 +51,17 @@ @@ -51,6 +51,17 @@
51 // ] | <- st_top 51 // ] | <- st_top
52 // } | 52 // } |
53 53
  54 +static char const* JSON_PDF = (
  55 + // force line break
  56 + "%PDF-1.3\n"
  57 + "xref\n"
  58 + "0 1\n"
  59 + "0000000000 65535 f \n"
  60 + "trailer << /Size 1 >>\n"
  61 + "startxref\n"
  62 + "9\n"
  63 + "%%EOF\n");
  64 +
54 // Validator methods -- these are much more performant than std::regex. 65 // Validator methods -- these are much more performant than std::regex.
55 static bool 66 static bool
56 is_indirect_object(std::string const& v, int& obj, int& gen) 67 is_indirect_object(std::string const& v, int& obj, int& gen)
@@ -256,10 +267,10 @@ class QPDF::JSONReactor: public JSON::Reactor @@ -256,10 +267,10 @@ class QPDF::JSONReactor: public JSON::Reactor
256 struct StackFrame 267 struct StackFrame
257 { 268 {
258 StackFrame(state_e state) : 269 StackFrame(state_e state) :
259 - state(state){}; 270 + state(state) {};
260 StackFrame(state_e state, QPDFObjectHandle&& object) : 271 StackFrame(state_e state, QPDFObjectHandle&& object) :
261 state(state), 272 state(state),
262 - object(object){}; 273 + object(object) {};
263 state_e state; 274 state_e state;
264 QPDFObjectHandle object; 275 QPDFObjectHandle object;
265 }; 276 };
@@ -536,7 +547,7 @@ QPDF::JSONReactor::dictionaryItem(std::string const& key, JSON const& value) @@ -536,7 +547,7 @@ QPDF::JSONReactor::dictionaryItem(std::string const& key, JSON const& value)
536 } else if (is_obj_key(key, obj, gen)) { 547 } else if (is_obj_key(key, obj, gen)) {
537 this->cur_object = key; 548 this->cur_object = key;
538 if (setNextStateIfDictionary(key, value, st_object_top)) { 549 if (setNextStateIfDictionary(key, value, st_object_top)) {
539 - next_obj = pdf.objects().get_for_json(obj, gen); 550 + next_obj = pdf.getObjectForJSON(obj, gen);
540 } 551 }
541 } else { 552 } else {
542 QTC::TC("qpdf", "QPDF_json bad object key"); 553 QTC::TC("qpdf", "QPDF_json bad object key");
@@ -582,7 +593,8 @@ QPDF::JSONReactor::dictionaryItem(std::string const&amp; key, JSON const&amp; value) @@ -582,7 +593,8 @@ QPDF::JSONReactor::dictionaryItem(std::string const&amp; key, JSON const&amp; value)
582 this->saw_value = true; 593 this->saw_value = true;
583 // The trailer must be a dictionary, so we can use setNextStateIfDictionary. 594 // The trailer must be a dictionary, so we can use setNextStateIfDictionary.
584 if (setNextStateIfDictionary("trailer.value", value, st_object)) { 595 if (setNextStateIfDictionary("trailer.value", value, st_object)) {
585 - pdf.m->objects.xref_table().trailer(makeObject(value)); 596 + this->pdf.m->trailer = makeObject(value);
  597 + setObjectDescription(this->pdf.m->trailer, value);
586 } 598 }
587 } else if (key == "stream") { 599 } else if (key == "stream") {
588 // Don't need to set saw_stream here since there's already an error. 600 // Don't need to set saw_stream here since there's already an error.
@@ -740,7 +752,7 @@ QPDF::JSONReactor::makeObject(JSON const&amp; value) @@ -740,7 +752,7 @@ QPDF::JSONReactor::makeObject(JSON const&amp; value)
740 int gen = 0; 752 int gen = 0;
741 std::string str; 753 std::string str;
742 if (is_indirect_object(str_v, obj, gen)) { 754 if (is_indirect_object(str_v, obj, gen)) {
743 - result = pdf.objects().get_for_json(obj, gen); 755 + result = pdf.getObjectForJSON(obj, gen);
744 } else if (is_unicode_string(str_v, str)) { 756 } else if (is_unicode_string(str_v, str)) {
745 result = QPDFObjectHandle::newUnicodeString(str); 757 result = QPDFObjectHandle::newUnicodeString(str);
746 } else if (is_binary_string(str_v, str)) { 758 } else if (is_binary_string(str_v, str)) {
@@ -774,9 +786,7 @@ QPDF::createFromJSON(std::string const&amp; json_file) @@ -774,9 +786,7 @@ QPDF::createFromJSON(std::string const&amp; json_file)
774 void 786 void
775 QPDF::createFromJSON(std::shared_ptr<InputSource> is) 787 QPDF::createFromJSON(std::shared_ptr<InputSource> is)
776 { 788 {
777 - m->pdf_version = "1.3";  
778 - m->no_input_name = is->getName();  
779 - m->objects.xref_table().initialize_json(); 789 + processMemoryFile(is->getName().c_str(), JSON_PDF, strlen(JSON_PDF));
780 importJSON(is, true); 790 importJSON(is, true);
781 } 791 }
782 792
libqpdf/QPDF_linearization.cc
1 // See doc/linearization. 1 // See doc/linearization.
2 2
3 -#include <qpdf/QPDF_private.hh> 3 +#include <qpdf/QPDF.hh>
4 4
5 #include <qpdf/BitStream.hh> 5 #include <qpdf/BitStream.hh>
6 #include <qpdf/BitWriter.hh> 6 #include <qpdf/BitWriter.hh>
@@ -130,7 +130,7 @@ QPDF::isLinearized() @@ -130,7 +130,7 @@ QPDF::isLinearized()
130 return false; 130 return false;
131 } 131 }
132 132
133 - auto candidate = m->objects.get(lindict_obj, 0); 133 + auto candidate = getObjectByID(lindict_obj, 0);
134 if (!candidate.isDictionary()) { 134 if (!candidate.isDictionary()) {
135 return false; 135 return false;
136 } 136 }
@@ -287,9 +287,10 @@ QPDF::readHintStream(Pipeline&amp; pl, qpdf_offset_t offset, size_t length) @@ -287,9 +287,10 @@ QPDF::readHintStream(Pipeline&amp; pl, qpdf_offset_t offset, size_t length)
287 { 287 {
288 QPDFObjGen og; 288 QPDFObjGen og;
289 QPDFObjectHandle H = 289 QPDFObjectHandle H =
290 - objects().read(false, offset, "linearization hint stream", QPDFObjGen(0, 0), og, false);  
291 - qpdf_offset_t min_end_offset = m->objects.xref_table().end_before_space(og);  
292 - qpdf_offset_t max_end_offset = m->objects.xref_table().end_after_space(og); 290 + readObjectAtOffset(false, offset, "linearization hint stream", QPDFObjGen(0, 0), og, false);
  291 + ObjCache& oc = m->obj_cache[og];
  292 + qpdf_offset_t min_end_offset = oc.end_before_space;
  293 + qpdf_offset_t max_end_offset = oc.end_after_space;
293 if (!H.isStream()) { 294 if (!H.isStream()) {
294 throw damagedPDF("linearization dictionary", "hint table is not a stream"); 295 throw damagedPDF("linearization dictionary", "hint table is not a stream");
295 } 296 }
@@ -300,11 +301,14 @@ QPDF::readHintStream(Pipeline&amp; pl, qpdf_offset_t offset, size_t length) @@ -300,11 +301,14 @@ QPDF::readHintStream(Pipeline&amp; pl, qpdf_offset_t offset, size_t length)
300 // increasing length to cover it, even though the specification says all objects in the 301 // increasing length to cover it, even though the specification says all objects in the
301 // linearization parameter dictionary must be direct. We have to get the file position of the 302 // linearization parameter dictionary must be direct. We have to get the file position of the
302 // end of length in this case. 303 // end of length in this case.
303 - auto length_og = Hdict.getKey("/Length").getObjGen();  
304 - if (length_og.isIndirect()) { 304 + QPDFObjectHandle length_obj = Hdict.getKey("/Length");
  305 + if (length_obj.isIndirect()) {
305 QTC::TC("qpdf", "QPDF hint table length indirect"); 306 QTC::TC("qpdf", "QPDF hint table length indirect");
306 - min_end_offset = m->objects.xref_table().end_before_space(length_og);  
307 - max_end_offset = m->objects.xref_table().end_after_space(length_og); 307 + // Force resolution
  308 + (void)length_obj.getIntValue();
  309 + ObjCache& oc2 = m->obj_cache[length_obj.getObjGen()];
  310 + min_end_offset = oc2.end_before_space;
  311 + max_end_offset = oc2.end_after_space;
308 } else { 312 } else {
309 QTC::TC("qpdf", "QPDF hint table length direct"); 313 QTC::TC("qpdf", "QPDF hint table length direct");
310 } 314 }
@@ -441,7 +445,7 @@ QPDF::checkLinearizationInternal() @@ -441,7 +445,7 @@ QPDF::checkLinearizationInternal()
441 for (size_t i = 0; i < toS(npages); ++i) { 445 for (size_t i = 0; i < toS(npages); ++i) {
442 QPDFObjectHandle const& page = pages.at(i); 446 QPDFObjectHandle const& page = pages.at(i);
443 QPDFObjGen og(page.getObjGen()); 447 QPDFObjGen og(page.getObjGen());
444 - if (m->objects.xref_table().type(og) == 2) { 448 + if (m->xref_table[og].getType() == 2) {
445 linearizationWarning( 449 linearizationWarning(
446 "page dictionary for page " + std::to_string(i) + " is compressed"); 450 "page dictionary for page " + std::to_string(i) + " is compressed");
447 } 451 }
@@ -457,11 +461,12 @@ QPDF::checkLinearizationInternal() @@ -457,11 +461,12 @@ QPDF::checkLinearizationInternal()
457 break; 461 break;
458 } 462 }
459 } 463 }
460 - if (m->file->tell() != m->objects.xref_table().first_item_offset()) { 464 + if (m->file->tell() != m->first_xref_item_offset) {
461 QTC::TC("qpdf", "QPDF err /T mismatch"); 465 QTC::TC("qpdf", "QPDF err /T mismatch");
462 linearizationWarning( 466 linearizationWarning(
463 - "space before first xref item (/T) mismatch (computed = " +  
464 - std::to_string(m->objects.xref_table().first_item_offset()) + 467 + "space before first xref item (/T) mismatch "
  468 + "(computed = " +
  469 + std::to_string(m->first_xref_item_offset) +
465 "; file = " + std::to_string(m->file->tell())); 470 "; file = " + std::to_string(m->file->tell()));
466 } 471 }
467 472
@@ -472,7 +477,7 @@ QPDF::checkLinearizationInternal() @@ -472,7 +477,7 @@ QPDF::checkLinearizationInternal()
472 // compressed objects are supposed to be at the end of the containing xref section if any object 477 // compressed objects are supposed to be at the end of the containing xref section if any object
473 // streams are in use. 478 // streams are in use.
474 479
475 - if (m->objects.xref_table().uncompressed_after_compressed()) { 480 + if (m->uncompressed_after_compressed) {
476 linearizationWarning("linearized file contains an uncompressed object after a compressed " 481 linearizationWarning("linearized file contains an uncompressed object after a compressed "
477 "one in a cross-reference stream"); 482 "one in a cross-reference stream");
478 } 483 }
@@ -480,9 +485,18 @@ QPDF::checkLinearizationInternal() @@ -480,9 +485,18 @@ QPDF::checkLinearizationInternal()
480 // Further checking requires optimization and order calculation. Don't allow optimization to 485 // Further checking requires optimization and order calculation. Don't allow optimization to
481 // make changes. If it has to, then the file is not properly linearized. We use the xref table 486 // make changes. If it has to, then the file is not properly linearized. We use the xref table
482 // to figure out which objects are compressed and which are uncompressed. 487 // to figure out which objects are compressed and which are uncompressed.
483 -  
484 - optimize(m->objects);  
485 - calculateLinearizationData(m->objects); 488 + { // local scope
  489 + std::map<int, int> object_stream_data;
  490 + for (auto const& iter: m->xref_table) {
  491 + QPDFObjGen const& og = iter.first;
  492 + QPDFXRefEntry const& entry = iter.second;
  493 + if (entry.getType() == 2) {
  494 + object_stream_data[og.getObj()] = entry.getObjStreamNumber();
  495 + }
  496 + }
  497 + optimize(object_stream_data, false);
  498 + calculateLinearizationData(object_stream_data);
  499 + }
486 500
487 // E: offset of end of first page -- Implementation note 123 says Acrobat includes on extra 501 // E: offset of end of first page -- Implementation note 123 says Acrobat includes on extra
488 // object here by mistake. pdlin fails to place thumbnail images in section 9, so when 502 // object here by mistake. pdlin fails to place thumbnail images in section 9, so when
@@ -499,14 +513,13 @@ QPDF::checkLinearizationInternal() @@ -499,14 +513,13 @@ QPDF::checkLinearizationInternal()
499 qpdf_offset_t max_E = -1; 513 qpdf_offset_t max_E = -1;
500 for (auto const& oh: m->part6) { 514 for (auto const& oh: m->part6) {
501 QPDFObjGen og(oh.getObjGen()); 515 QPDFObjGen og(oh.getObjGen());
502 - auto before = m->objects.xref_table().end_before_space(og);  
503 - auto after = m->objects.xref_table().end_after_space(og);  
504 - if (before <= 0) { 516 + if (m->obj_cache.count(og) == 0) {
505 // All objects have to have been dereferenced to be classified. 517 // All objects have to have been dereferenced to be classified.
506 throw std::logic_error("linearization part6 object not in cache"); 518 throw std::logic_error("linearization part6 object not in cache");
507 } 519 }
508 - min_E = std::max(min_E, before);  
509 - max_E = std::max(max_E, after); 520 + ObjCache const& oc = m->obj_cache[og];
  521 + min_E = std::max(min_E, oc.end_before_space);
  522 + max_E = std::max(max_E, oc.end_after_space);
510 } 523 }
511 if ((p.first_page_end < min_E) || (p.first_page_end > max_E)) { 524 if ((p.first_page_end < min_E) || (p.first_page_end > max_E)) {
512 QTC::TC("qpdf", "QPDF warn /E mismatch"); 525 QTC::TC("qpdf", "QPDF warn /E mismatch");
@@ -533,11 +546,10 @@ QPDF::maxEnd(ObjUser const&amp; ou) @@ -533,11 +546,10 @@ QPDF::maxEnd(ObjUser const&amp; ou)
533 } 546 }
534 qpdf_offset_t end = 0; 547 qpdf_offset_t end = 0;
535 for (auto const& og: m->obj_user_to_objects[ou]) { 548 for (auto const& og: m->obj_user_to_objects[ou]) {
536 - auto e = m->objects.xref_table().end_after_space(og);  
537 - if (e <= 0) { 549 + if (m->obj_cache.count(og) == 0) {
538 stopOnError("unknown object referenced in object user table"); 550 stopOnError("unknown object referenced in object user table");
539 } 551 }
540 - end = std::max(end, e); 552 + end = std::max(end, m->obj_cache[og].end_after_space);
541 } 553 }
542 return end; 554 return end;
543 } 555 }
@@ -545,40 +557,34 @@ QPDF::maxEnd(ObjUser const&amp; ou) @@ -545,40 +557,34 @@ QPDF::maxEnd(ObjUser const&amp; ou)
545 qpdf_offset_t 557 qpdf_offset_t
546 QPDF::getLinearizationOffset(QPDFObjGen const& og) 558 QPDF::getLinearizationOffset(QPDFObjGen const& og)
547 { 559 {
548 - switch (m->objects.xref_table().type(og)) { 560 + QPDFXRefEntry entry = m->xref_table[og];
  561 + qpdf_offset_t result = 0;
  562 + switch (entry.getType()) {
549 case 1: 563 case 1:
550 - return m->objects.xref_table().offset(og); 564 + result = entry.getOffset();
  565 + break;
551 566
552 case 2: 567 case 2:
553 // For compressed objects, return the offset of the object stream that contains them. 568 // For compressed objects, return the offset of the object stream that contains them.
554 - return getLinearizationOffset(  
555 - QPDFObjGen(m->objects.xref_table().stream_number(og.getObj()), 0)); 569 + result = getLinearizationOffset(QPDFObjGen(entry.getObjStreamNumber(), 0));
  570 + break;
556 571
557 default: 572 default:
558 stopOnError("getLinearizationOffset called for xref entry not of type 1 or 2"); 573 stopOnError("getLinearizationOffset called for xref entry not of type 1 or 2");
559 - return 0; // unreachable 574 + break;
560 } 575 }
  576 + return result;
561 } 577 }
562 578
563 QPDFObjectHandle 579 QPDFObjectHandle
564 QPDF::getUncompressedObject(QPDFObjectHandle& obj, std::map<int, int> const& object_stream_data) 580 QPDF::getUncompressedObject(QPDFObjectHandle& obj, std::map<int, int> const& object_stream_data)
565 { 581 {
566 - if (obj.isNull() || !object_stream_data.count(obj.getObjectID())) { 582 + if (obj.isNull() || (object_stream_data.count(obj.getObjectID()) == 0)) {
567 return obj; 583 return obj;
568 } else { 584 } else {
569 int repl = (*(object_stream_data.find(obj.getObjectID()))).second; 585 int repl = (*(object_stream_data.find(obj.getObjectID()))).second;
570 - return m->objects.get(repl, 0);  
571 - }  
572 -}  
573 -  
574 -QPDFObjectHandle  
575 -QPDF::getUncompressedObject(QPDFObjectHandle& obj, Objects const& objects)  
576 -{  
577 - auto og = obj.getObjGen();  
578 - if (obj.isNull() || objects.xref_table().type(og) != 2) {  
579 - return obj; 586 + return getObject(repl, 0);
580 } 587 }
581 - return m->objects.get(objects.xref_table().stream_number(og.getObj()), 0);  
582 } 588 }
583 589
584 QPDFObjectHandle 590 QPDFObjectHandle
@@ -586,7 +592,7 @@ QPDF::getUncompressedObject(QPDFObjectHandle&amp; oh, QPDFWriter::ObjTable const&amp; ob @@ -586,7 +592,7 @@ QPDF::getUncompressedObject(QPDFObjectHandle&amp; oh, QPDFWriter::ObjTable const&amp; ob
586 { 592 {
587 if (obj.contains(oh)) { 593 if (obj.contains(oh)) {
588 if (auto id = obj[oh].object_stream; id > 0) { 594 if (auto id = obj[oh].object_stream; id > 0) {
589 - return oh.isNull() ? oh : m->objects.get(id, 0); 595 + return oh.isNull() ? oh : getObject(id, 0);
590 } 596 }
591 } 597 }
592 return oh; 598 return oh;
@@ -598,13 +604,15 @@ QPDF::lengthNextN(int first_object, int n) @@ -598,13 +604,15 @@ QPDF::lengthNextN(int first_object, int n)
598 int length = 0; 604 int length = 0;
599 for (int i = 0; i < n; ++i) { 605 for (int i = 0; i < n; ++i) {
600 QPDFObjGen og(first_object + i, 0); 606 QPDFObjGen og(first_object + i, 0);
601 - auto end = m->objects.xref_table().end_after_space(og);  
602 - if (end <= 0) { 607 + if (m->xref_table.count(og) == 0) {
603 linearizationWarning( 608 linearizationWarning(
604 "no xref table entry for " + std::to_string(first_object + i) + " 0"); 609 "no xref table entry for " + std::to_string(first_object + i) + " 0");
605 - continue; 610 + } else {
  611 + if (m->obj_cache.count(og) == 0) {
  612 + stopOnError("found unknown object while calculating length for linearization data");
  613 + }
  614 + length += toI(m->obj_cache[og].end_after_space - getLinearizationOffset(og));
606 } 615 }
607 - length += toI(end - getLinearizationOffset(og));  
608 } 616 }
609 return length; 617 return length;
610 } 618 }
@@ -628,7 +636,7 @@ QPDF::checkHPageOffset( @@ -628,7 +636,7 @@ QPDF::checkHPageOffset(
628 int npages = toI(pages.size()); 636 int npages = toI(pages.size());
629 qpdf_offset_t table_offset = adjusted_offset(m->page_offset_hints.first_page_offset); 637 qpdf_offset_t table_offset = adjusted_offset(m->page_offset_hints.first_page_offset);
630 QPDFObjGen first_page_og(pages.at(0).getObjGen()); 638 QPDFObjGen first_page_og(pages.at(0).getObjGen());
631 - if (m->objects.xref_table().type(first_page_og) == 0) { 639 + if (m->xref_table.count(first_page_og) == 0) {
632 stopOnError("supposed first page object is not known"); 640 stopOnError("supposed first page object is not known");
633 } 641 }
634 qpdf_offset_t offset = getLinearizationOffset(first_page_og); 642 qpdf_offset_t offset = getLinearizationOffset(first_page_og);
@@ -639,7 +647,7 @@ QPDF::checkHPageOffset( @@ -639,7 +647,7 @@ QPDF::checkHPageOffset(
639 for (int pageno = 0; pageno < npages; ++pageno) { 647 for (int pageno = 0; pageno < npages; ++pageno) {
640 QPDFObjGen page_og(pages.at(toS(pageno)).getObjGen()); 648 QPDFObjGen page_og(pages.at(toS(pageno)).getObjGen());
641 int first_object = page_og.getObj(); 649 int first_object = page_og.getObj();
642 - if (m->objects.xref_table().type(page_og) == 0) { 650 + if (m->xref_table.count(page_og) == 0) {
643 stopOnError("unknown object in page offset hint table"); 651 stopOnError("unknown object in page offset hint table");
644 } 652 }
645 offset = getLinearizationOffset(page_og); 653 offset = getLinearizationOffset(page_og);
@@ -761,7 +769,7 @@ QPDF::checkHSharedObject(std::vector&lt;QPDFObjectHandle&gt; const&amp; pages, std::map&lt;in @@ -761,7 +769,7 @@ QPDF::checkHSharedObject(std::vector&lt;QPDFObjectHandle&gt; const&amp; pages, std::map&lt;in
761 cur_object = so.first_shared_obj; 769 cur_object = so.first_shared_obj;
762 770
763 QPDFObjGen og(cur_object, 0); 771 QPDFObjGen og(cur_object, 0);
764 - if (m->objects.xref_table().type(og) == 0) { 772 + if (m->xref_table.count(og) == 0) {
765 stopOnError("unknown object in shared object hint table"); 773 stopOnError("unknown object in shared object hint table");
766 } 774 }
767 qpdf_offset_t offset = getLinearizationOffset(og); 775 qpdf_offset_t offset = getLinearizationOffset(og);
@@ -812,7 +820,7 @@ QPDF::checkHOutlines() @@ -812,7 +820,7 @@ QPDF::checkHOutlines()
812 return; 820 return;
813 } 821 }
814 QPDFObjGen og(outlines.getObjGen()); 822 QPDFObjGen og(outlines.getObjGen());
815 - if (m->objects.xref_table().type(og) == 0) { 823 + if (m->xref_table.count(og) == 0) {
816 stopOnError("unknown object in outlines hint table"); 824 stopOnError("unknown object in outlines hint table");
817 } 825 }
818 qpdf_offset_t offset = getLinearizationOffset(og); 826 qpdf_offset_t offset = getLinearizationOffset(og);
@@ -831,7 +839,8 @@ QPDF::checkHOutlines() @@ -831,7 +839,8 @@ QPDF::checkHOutlines()
831 std::to_string(table_length) + "; computed = " + std::to_string(length)); 839 std::to_string(table_length) + "; computed = " + std::to_string(length));
832 } 840 }
833 } else { 841 } else {
834 - linearizationWarning("incorrect first object number in outline hints table."); 842 + linearizationWarning("incorrect first object number in outline "
  843 + "hints table.");
835 } 844 }
836 } else { 845 } else {
837 linearizationWarning("incorrect object count in outline hint table"); 846 linearizationWarning("incorrect object count in outline hint table");
@@ -1159,7 +1168,7 @@ QPDF::calculateLinearizationData(T const&amp; object_stream_data) @@ -1159,7 +1168,7 @@ QPDF::calculateLinearizationData(T const&amp; object_stream_data)
1159 // Map all page objects to the containing object stream. This should be a no-op in a 1168 // Map all page objects to the containing object stream. This should be a no-op in a
1160 // properly linearized file. 1169 // properly linearized file.
1161 for (auto oh: getAllPages()) { 1170 for (auto oh: getAllPages()) {
1162 - pages.emplace_back(getUncompressedObject(oh, object_stream_data)); 1171 + pages.push_back(getUncompressedObject(oh, object_stream_data));
1163 } 1172 }
1164 } 1173 }
1165 int npages = toI(pages.size()); 1174 int npages = toI(pages.size());
@@ -1430,9 +1439,9 @@ QPDF::pushOutlinesToPart( @@ -1430,9 +1439,9 @@ QPDF::pushOutlinesToPart(
1430 m->c_outline_data.first_object = outlines_og.getObj(); 1439 m->c_outline_data.first_object = outlines_og.getObj();
1431 m->c_outline_data.nobjects = 1; 1440 m->c_outline_data.nobjects = 1;
1432 lc_outlines.erase(outlines_og); 1441 lc_outlines.erase(outlines_og);
1433 - part.emplace_back(outlines); 1442 + part.push_back(outlines);
1434 for (auto const& og: lc_outlines) { 1443 for (auto const& og: lc_outlines) {
1435 - part.emplace_back(m->objects.get(og)); 1444 + part.push_back(getObject(og));
1436 ++m->c_outline_data.nobjects; 1445 ++m->c_outline_data.nobjects;
1437 } 1446 }
1438 } 1447 }
libqpdf/QPDF_objects.cc deleted
1 -#include <qpdf/qpdf-config.h> // include first for large file support  
2 -  
3 -#include <qpdf/QPDF_private.hh>  
4 -  
5 -#include <array>  
6 -#include <cstring>  
7 -#include <limits>  
8 -#include <map>  
9 -#include <vector>  
10 -  
11 -#include <qpdf/BufferInputSource.hh>  
12 -#include <qpdf/OffsetInputSource.hh>  
13 -#include <qpdf/Pipeline.hh>  
14 -#include <qpdf/QPDFExc.hh>  
15 -#include <qpdf/QPDFLogger.hh>  
16 -#include <qpdf/QPDFObject_private.hh>  
17 -#include <qpdf/QPDFParser.hh>  
18 -#include <qpdf/QPDF_Array.hh>  
19 -#include <qpdf/QPDF_Dictionary.hh>  
20 -#include <qpdf/QPDF_Null.hh>  
21 -#include <qpdf/QPDF_Reserved.hh>  
22 -#include <qpdf/QPDF_Stream.hh>  
23 -#include <qpdf/QPDF_Unresolved.hh>  
24 -#include <qpdf/QTC.hh>  
25 -#include <qpdf/QUtil.hh>  
26 -  
27 -using Objects = QPDF::Objects;  
28 -using Xref_table = Objects::Xref_table;  
29 -  
30 -namespace  
31 -{  
32 - class InvalidInputSource final: public InputSource  
33 - {  
34 - public:  
35 - InvalidInputSource(std::string const& name) :  
36 - name(name)  
37 - {  
38 - }  
39 - ~InvalidInputSource() final = default;  
40 - qpdf_offset_t  
41 - findAndSkipNextEOL() final  
42 - {  
43 - throwException();  
44 - return 0;  
45 - }  
46 - std::string const&  
47 - getName() const final  
48 - {  
49 - return name;  
50 - }  
51 - qpdf_offset_t  
52 - tell() final  
53 - {  
54 - throwException();  
55 - return 0;  
56 - }  
57 - void  
58 - seek(qpdf_offset_t offset, int whence) final  
59 - {  
60 - throwException();  
61 - }  
62 - void  
63 - rewind() final  
64 - {  
65 - throwException();  
66 - }  
67 - size_t  
68 - read(char* buffer, size_t length) final  
69 - {  
70 - throwException();  
71 - return 0;  
72 - }  
73 - void  
74 - unreadCh(char ch) final  
75 - {  
76 - throwException();  
77 - }  
78 -  
79 - private:  
80 - void  
81 - throwException()  
82 - {  
83 - throw std::logic_error("QPDF operation attempted on a QPDF object with no input "  
84 - "source. QPDF operations are invalid before processFile (or "  
85 - "another process method) or after closeInputSource");  
86 - }  
87 -  
88 - std::string const& name;  
89 - };  
90 -} // namespace  
91 -  
92 -bool  
93 -QPDF::findStartxref()  
94 -{  
95 - if (readToken(*m->file).isWord("startxref") && readToken(*m->file).isInteger()) {  
96 - // Position in front of offset token  
97 - m->file->seek(m->file->getLastOffset(), SEEK_SET);  
98 - return true;  
99 - }  
100 - return false;  
101 -}  
102 -  
103 -void  
104 -Xref_table::initialize_empty()  
105 -{  
106 - initialized_ = true;  
107 - trailer_ = QPDFObjectHandle::newDictionary();  
108 - auto rt = qpdf.makeIndirectObject(QPDFObjectHandle::newDictionary());  
109 - auto pgs = qpdf.makeIndirectObject(QPDFObjectHandle::newDictionary());  
110 - pgs.replaceKey("/Type", QPDFObjectHandle::newName("/Pages"));  
111 - pgs.replaceKey("/Kids", QPDFObjectHandle::newArray());  
112 - pgs.replaceKey("/Count", QPDFObjectHandle::newInteger(0));  
113 - rt.replaceKey("/Type", QPDFObjectHandle::newName("/Catalog"));  
114 - rt.replaceKey("/Pages", pgs);  
115 - trailer_.replaceKey("/Root", rt);  
116 - trailer_.replaceKey("/Size", QPDFObjectHandle::newInteger(3));  
117 -}  
118 -  
119 -void  
120 -Xref_table::initialize_json()  
121 -{  
122 - initialized_ = true;  
123 - table.resize(1);  
124 - trailer_ = QPDFObjectHandle::newDictionary();  
125 - trailer_.replaceKey("/Size", QPDFObjectHandle::newInteger(1));  
126 -}  
127 -  
128 -void  
129 -Xref_table::initialize()  
130 -{  
131 - // PDF spec says %%EOF must be found within the last 1024 bytes of/ the file. We add an extra  
132 - // 30 characters to leave room for the startxref stuff.  
133 - file->seek(0, SEEK_END);  
134 - qpdf_offset_t end_offset = file->tell();  
135 - // Sanity check on object ids. All objects must appear in xref table / stream. In all realistic  
136 - // scenarios at least 3 bytes are required.  
137 - if (max_id_ > end_offset / 3) {  
138 - max_id_ = static_cast<int>(end_offset / 3);  
139 - }  
140 - qpdf_offset_t start_offset = (end_offset > 1054 ? end_offset - 1054 : 0);  
141 - PatternFinder sf(qpdf, &QPDF::findStartxref);  
142 - qpdf_offset_t xref_offset = 0;  
143 - if (file->findLast("startxref", start_offset, 0, sf)) {  
144 - xref_offset = QUtil::string_to_ll(read_token().getValue().c_str());  
145 - }  
146 -  
147 - try {  
148 - if (xref_offset == 0) {  
149 - QTC::TC("qpdf", "QPDF can't find startxref");  
150 - throw damaged_pdf("can't find startxref");  
151 - }  
152 - try {  
153 - read(xref_offset);  
154 - } catch (QPDFExc&) {  
155 - throw;  
156 - } catch (std::exception& e) {  
157 - throw damaged_pdf(std::string("error reading xref: ") + e.what());  
158 - }  
159 - } catch (QPDFExc& e) {  
160 - if (attempt_recovery_) {  
161 - reconstruct(e);  
162 - QTC::TC("qpdf", "QPDF reconstructed xref table");  
163 - } else {  
164 - throw;  
165 - }  
166 - }  
167 -  
168 - initialized_ = true;  
169 -}  
170 -  
171 -void  
172 -Xref_table::reconstruct(QPDFExc& e)  
173 -{  
174 - if (reconstructed_) {  
175 - // Avoid xref reconstruction infinite loops. This is getting very hard to reproduce because  
176 - // qpdf is throwing many fewer exceptions while parsing. Most situations are warnings now.  
177 - throw e;  
178 - }  
179 -  
180 - // If recovery generates more than 1000 warnings, the file is so severely damaged that there  
181 - // probably is no point trying to continue.  
182 - const auto max_warnings = qpdf.m->warnings.size() + 1000U;  
183 - auto check_warnings = [this, max_warnings]() {  
184 - if (qpdf.m->warnings.size() > max_warnings) {  
185 - throw damaged_pdf("too many errors while reconstructing cross-reference table");  
186 - }  
187 - };  
188 -  
189 - reconstructed_ = true;  
190 - // We may find more objects, which may contain dangling references.  
191 - qpdf.m->fixed_dangling_refs = false;  
192 -  
193 - warn_damaged("file is damaged");  
194 - qpdf.warn(e);  
195 - warn_damaged("Attempting to reconstruct cross-reference table");  
196 -  
197 - // Delete all references to type 1 (uncompressed) objects  
198 - for (auto& iter: table) {  
199 - if (iter.type() == 1) {  
200 - iter = {};  
201 - }  
202 - }  
203 -  
204 - std::vector<std::tuple<int, int, qpdf_offset_t>> found_objects;  
205 - std::vector<qpdf_offset_t> trailers;  
206 - int max_found = 0;  
207 -  
208 - file->seek(0, SEEK_END);  
209 - qpdf_offset_t eof = file->tell();  
210 - file->seek(0, SEEK_SET);  
211 - // Don't allow very long tokens here during recovery. All the interesting tokens are covered.  
212 - static size_t const MAX_LEN = 10;  
213 - while (file->tell() < eof) {  
214 - QPDFTokenizer::Token t1 = read_token(MAX_LEN);  
215 - qpdf_offset_t token_start = file->tell() - toO(t1.getValue().length());  
216 - if (t1.isInteger()) {  
217 - auto pos = file->tell();  
218 - QPDFTokenizer::Token t2 = read_token(MAX_LEN);  
219 - if (t2.isInteger() && read_token(MAX_LEN).isWord("obj")) {  
220 - int obj = QUtil::string_to_int(t1.getValue().c_str());  
221 - int gen = QUtil::string_to_int(t2.getValue().c_str());  
222 - if (obj <= max_id_) {  
223 - found_objects.emplace_back(obj, gen, token_start);  
224 - if (obj > max_found) {  
225 - max_found = obj;  
226 - }  
227 - } else {  
228 - warn_damaged("ignoring object with impossibly large id " + std::to_string(obj));  
229 - }  
230 - }  
231 - file->seek(pos, SEEK_SET);  
232 - } else if (!trailer_ && t1.isWord("trailer")) {  
233 - trailers.emplace_back(file->tell());  
234 - }  
235 - file->findAndSkipNextEOL();  
236 - }  
237 -  
238 - table.resize(toS(max_found) + 1);  
239 -  
240 - for (auto tr: trailers) {  
241 - file->seek(tr, SEEK_SET);  
242 - auto t = read_trailer();  
243 - if (!t.isDictionary()) {  
244 - // Oh well. It was worth a try.  
245 - } else {  
246 - trailer_ = t;  
247 - break;  
248 - }  
249 - check_warnings();  
250 - }  
251 -  
252 - auto rend = found_objects.rend();  
253 - for (auto it = found_objects.rbegin(); it != rend; it++) {  
254 - auto [obj, gen, token_start] = *it;  
255 - insert(obj, 1, token_start, gen);  
256 - check_warnings();  
257 - }  
258 -  
259 - if (!trailer_) {  
260 - qpdf_offset_t max_offset{0};  
261 - // If there are any xref streams, take the last one to appear.  
262 - int i = -1;  
263 - for (auto const& item: table) {  
264 - ++i;  
265 - if (item.type() != 1) {  
266 - continue;  
267 - }  
268 - auto oh = objects.get(i, item.gen());  
269 - try {  
270 - if (!oh.isStreamOfType("/XRef")) {  
271 - continue;  
272 - }  
273 - } catch (std::exception&) {  
274 - continue;  
275 - }  
276 - auto offset = item.offset();  
277 - if (offset > max_offset) {  
278 - max_offset = offset;  
279 - trailer_ = oh.getDict();  
280 - }  
281 - check_warnings();  
282 - }  
283 - if (max_offset > 0) {  
284 - try {  
285 - read(max_offset);  
286 - } catch (std::exception&) {  
287 - throw damaged_pdf(  
288 - "error decoding candidate xref stream while recovering damaged file");  
289 - }  
290 - QTC::TC("qpdf", "QPDF recover xref stream");  
291 - }  
292 - }  
293 -  
294 - if (!trailer_) {  
295 - // We could check the last encountered object to see if it was an xref stream. If so, we  
296 - // could try to get the trailer from there. This may make it possible to recover files with  
297 - // bad startxref pointers even when they have object streams.  
298 -  
299 - throw damaged_pdf("unable to find trailer dictionary while recovering damaged file");  
300 - }  
301 - if (table.empty()) {  
302 - // We cannot check for an empty xref table in parse because empty tables are valid when  
303 - // creating QPDF objects from JSON.  
304 - throw damaged_pdf("unable to find objects while recovering damaged file");  
305 - }  
306 - check_warnings();  
307 - if (!initialized_) {  
308 - initialized_ = true;  
309 - qpdf.getAllPages();  
310 - check_warnings();  
311 - if (qpdf.m->all_pages.empty()) {  
312 - initialized_ = false;  
313 - throw damaged_pdf("unable to find any pages while recovering damaged file");  
314 - }  
315 - }  
316 - // We could iterate through the objects looking for streams and try to find objects inside of  
317 - // them, but it's probably not worth the trouble. Acrobat can't recover files with any errors  
318 - // in an xref stream, and this would be a real long shot anyway. If we wanted to do anything  
319 - // that involved looking at stream contents, we'd also have to call initializeEncryption() here.  
320 - // It's safe to call it more than once.  
321 -}  
322 -  
323 -void  
324 -Xref_table::read(qpdf_offset_t xref_offset)  
325 -{  
326 - std::map<int, int> free_table;  
327 - std::set<qpdf_offset_t> visited;  
328 - while (xref_offset) {  
329 - visited.insert(xref_offset);  
330 - char buf[7];  
331 - memset(buf, 0, sizeof(buf));  
332 - file->seek(xref_offset, SEEK_SET);  
333 - // Some files miss the mark a little with startxref. We could do a better job of searching  
334 - // in the neighborhood for something that looks like either an xref table or stream, but the  
335 - // simple heuristic of skipping whitespace can help with the xref table case and is harmless  
336 - // with the stream case.  
337 - bool done = false;  
338 - bool skipped_space = false;  
339 - while (!done) {  
340 - char ch;  
341 - if (1 == file->read(&ch, 1)) {  
342 - if (QUtil::is_space(ch)) {  
343 - skipped_space = true;  
344 - } else {  
345 - file->unreadCh(ch);  
346 - done = true;  
347 - }  
348 - } else {  
349 - QTC::TC("qpdf", "QPDF eof skipping spaces before xref", skipped_space ? 0 : 1);  
350 - done = true;  
351 - }  
352 - }  
353 -  
354 - file->read(buf, sizeof(buf) - 1);  
355 - // The PDF spec says xref must be followed by a line terminator, but files exist in the wild  
356 - // where it is terminated by arbitrary whitespace.  
357 - if ((strncmp(buf, "xref", 4) == 0) && QUtil::is_space(buf[4])) {  
358 - if (skipped_space) {  
359 - QTC::TC("qpdf", "QPDF xref skipped space");  
360 - warn_damaged("extraneous whitespace seen before xref");  
361 - }  
362 - QTC::TC(  
363 - "qpdf",  
364 - "QPDF xref space",  
365 - ((buf[4] == '\n') ? 0  
366 - : (buf[4] == '\r') ? 1  
367 - : (buf[4] == ' ') ? 2  
368 - : 9999));  
369 - int skip = 4;  
370 - // buf is null-terminated, and QUtil::is_space('\0') is false, so this won't overrun.  
371 - while (QUtil::is_space(buf[skip])) {  
372 - ++skip;  
373 - }  
374 - xref_offset = process_section(xref_offset + skip);  
375 - } else {  
376 - xref_offset = read_stream(xref_offset);  
377 - }  
378 - if (visited.count(xref_offset) != 0) {  
379 - QTC::TC("qpdf", "QPDF xref loop");  
380 - throw damaged_pdf("loop detected following xref tables");  
381 - }  
382 - }  
383 -  
384 - if (!trailer_) {  
385 - throw damaged_pdf("unable to find trailer while reading xref");  
386 - }  
387 - int size = trailer_.getKey("/Size").getIntValueAsInt();  
388 -  
389 - if (size < 3) {  
390 - throw damaged_pdf("too few objects - file can't have a page tree");  
391 - }  
392 -  
393 - // We are no longer reporting what the highest id in the xref table is. I don't think it adds  
394 - // anything. If we want to report more detail, we should report the total number of missing  
395 - // entries, including missing entries before the last actual entry.  
396 -}  
397 -  
398 -Xref_table::Subsection  
399 -Xref_table::subsection(std::string const& line)  
400 -{  
401 - auto terminate = [this]() -> void {  
402 - QTC::TC("qpdf", "QPDF invalid xref");  
403 - throw damaged_table("xref syntax invalid");  
404 - };  
405 -  
406 - // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated  
407 - // buffer.  
408 - char const* p = line.c_str();  
409 - char const* start = line.c_str();  
410 -  
411 - // Skip zero or more spaces  
412 - while (QUtil::is_space(*p)) {  
413 - ++p;  
414 - }  
415 - // Require digit  
416 - if (!QUtil::is_digit(*p)) {  
417 - terminate();  
418 - }  
419 - // Gather digits  
420 - std::string obj_str;  
421 - while (QUtil::is_digit(*p)) {  
422 - obj_str.append(1, *p++);  
423 - }  
424 - // Require space  
425 - if (!QUtil::is_space(*p)) {  
426 - terminate();  
427 - }  
428 - // Skip spaces  
429 - while (QUtil::is_space(*p)) {  
430 - ++p;  
431 - }  
432 - // Require digit  
433 - if (!QUtil::is_digit(*p)) {  
434 - terminate();  
435 - }  
436 - // Gather digits  
437 - std::string num_str;  
438 - while (QUtil::is_digit(*p)) {  
439 - num_str.append(1, *p++);  
440 - }  
441 - // Skip any space including line terminators  
442 - while (QUtil::is_space(*p)) {  
443 - ++p;  
444 - }  
445 - auto obj = QUtil::string_to_int(obj_str.c_str());  
446 - auto count = QUtil::string_to_int(num_str.c_str());  
447 - if (obj > max_id() || count > max_id() || (obj + count) > max_id()) {  
448 - throw damaged_table("xref table subsection header contains impossibly large entry");  
449 - }  
450 - return {obj, count, file->getLastOffset() + toI(p - start)};  
451 -}  
452 -  
453 -std::vector<Xref_table::Subsection>  
454 -Xref_table::bad_subsections(std::string& line, qpdf_offset_t start)  
455 -{  
456 - std::vector<Xref_table::Subsection> result;  
457 - file->seek(start, SEEK_SET);  
458 -  
459 - while (true) {  
460 - line.assign(50, '\0');  
461 - file->read(line.data(), line.size());  
462 - auto [obj, num, offset] = result.emplace_back(subsection(line));  
463 - file->seek(offset, SEEK_SET);  
464 - for (qpdf_offset_t i = obj; i - num < obj; ++i) {  
465 - if (!std::get<0>(read_entry())) {  
466 - QTC::TC("qpdf", "QPDF invalid xref entry");  
467 - throw damaged_table("invalid xref entry (obj=" + std::to_string(i) + ")");  
468 - }  
469 - }  
470 - qpdf_offset_t pos = file->tell();  
471 - if (read_token().isWord("trailer")) {  
472 - return result;  
473 - } else {  
474 - file->seek(pos, SEEK_SET);  
475 - }  
476 - }  
477 -}  
478 -  
479 -// Optimistically read and parse all subsection headers. If an error is encountered return the  
480 -// result of bad_subsections.  
481 -std::vector<Xref_table::Subsection>  
482 -Xref_table::subsections(std::string& line)  
483 -{  
484 - auto recovery_offset = file->tell();  
485 - try {  
486 - std::vector<Xref_table::Subsection> result;  
487 -  
488 - while (true) {  
489 - line.assign(50, '\0');  
490 - file->read(line.data(), line.size());  
491 - auto& sub = result.emplace_back(subsection(line));  
492 - auto count = std::get<1>(sub);  
493 - auto offset = std::get<2>(sub);  
494 - file->seek(offset + 20 * toO(count) - 1, SEEK_SET);  
495 - file->read(line.data(), 1);  
496 - if (!(line[0] == '\n' || line[0] == '\r')) {  
497 - return bad_subsections(line, recovery_offset);  
498 - }  
499 - qpdf_offset_t pos = file->tell();  
500 - if (read_token().isWord("trailer")) {  
501 - return result;  
502 - } else {  
503 - file->seek(pos, SEEK_SET);  
504 - }  
505 - }  
506 - } catch (...) {  
507 - return bad_subsections(line, recovery_offset);  
508 - }  
509 -}  
510 -  
511 -// Returns (success, f1, f2, type).  
512 -std::tuple<bool, qpdf_offset_t, int, char>  
513 -Xref_table::read_bad_entry()  
514 -{  
515 - qpdf_offset_t f1{0};  
516 - int f2{0};  
517 - char type{'\0'};  
518 - // Reposition after initial read attempt and reread.  
519 - file->seek(file->getLastOffset(), SEEK_SET);  
520 - auto line = file->readLine(30);  
521 -  
522 - // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated  
523 - // buffer.  
524 - char const* p = line.data();  
525 -  
526 - // Skip zero or more spaces. There aren't supposed to be any.  
527 - bool invalid = false;  
528 - while (QUtil::is_space(*p)) {  
529 - ++p;  
530 - QTC::TC("qpdf", "QPDF ignore first space in xref entry");  
531 - invalid = true;  
532 - }  
533 - // Require digit  
534 - if (!QUtil::is_digit(*p)) {  
535 - return {false, 0, 0, '\0'};  
536 - }  
537 - // Gather digits  
538 - std::string f1_str;  
539 - while (QUtil::is_digit(*p)) {  
540 - f1_str.append(1, *p++);  
541 - }  
542 - // Require space  
543 - if (!QUtil::is_space(*p)) {  
544 - return {false, 0, 0, '\0'};  
545 - }  
546 - if (QUtil::is_space(*(p + 1))) {  
547 - QTC::TC("qpdf", "QPDF ignore first extra space in xref entry");  
548 - invalid = true;  
549 - }  
550 - // Skip spaces  
551 - while (QUtil::is_space(*p)) {  
552 - ++p;  
553 - }  
554 - // Require digit  
555 - if (!QUtil::is_digit(*p)) {  
556 - return {false, 0, 0, '\0'};  
557 - }  
558 - // Gather digits  
559 - std::string f2_str;  
560 - while (QUtil::is_digit(*p)) {  
561 - f2_str.append(1, *p++);  
562 - }  
563 - // Require space  
564 - if (!QUtil::is_space(*p)) {  
565 - return {false, 0, 0, '\0'};  
566 - }  
567 - if (QUtil::is_space(*(p + 1))) {  
568 - QTC::TC("qpdf", "QPDF ignore second extra space in xref entry");  
569 - invalid = true;  
570 - }  
571 - // Skip spaces  
572 - while (QUtil::is_space(*p)) {  
573 - ++p;  
574 - }  
575 - if ((*p == 'f') || (*p == 'n')) {  
576 - type = *p;  
577 - } else {  
578 - return {false, 0, 0, '\0'};  
579 - }  
580 - if ((f1_str.length() != 10) || (f2_str.length() != 5)) {  
581 - QTC::TC("qpdf", "QPDF ignore length error xref entry");  
582 - invalid = true;  
583 - }  
584 -  
585 - if (invalid) {  
586 - qpdf.warn(damaged_table("accepting invalid xref table entry"));  
587 - }  
588 -  
589 - f1 = QUtil::string_to_ll(f1_str.c_str());  
590 - f2 = QUtil::string_to_int(f2_str.c_str());  
591 -  
592 - return {true, f1, f2, type};  
593 -}  
594 -  
595 -// Optimistically read and parse xref entry. If entry is bad, call read_bad_xrefEntry and return  
596 -// result. Returns (success, f1, f2, type).  
597 -std::tuple<bool, qpdf_offset_t, int, char>  
598 -Xref_table::read_entry()  
599 -{  
600 - qpdf_offset_t f1{0};  
601 - int f2{0};  
602 - char type{'\0'};  
603 - std::array<char, 21> line;  
604 - f1 = 0;  
605 - f2 = 0;  
606 - if (file->read(line.data(), 20) != 20) {  
607 - // C++20: [[unlikely]]  
608 - return {false, 0, 0, '\0'};  
609 - }  
610 - line[20] = '\0';  
611 - char const* p = line.data();  
612 -  
613 - int f1_len = 0;  
614 - int f2_len = 0;  
615 -  
616 - // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated  
617 - // buffer.  
618 -  
619 - // Gather f1 digits. NB No risk of overflow as 9'999'999'999 < max long long.  
620 - while (*p == '0') {  
621 - ++f1_len;  
622 - ++p;  
623 - }  
624 - while (QUtil::is_digit(*p) && f1_len++ < 10) {  
625 - f1 *= 10;  
626 - f1 += *p++ - '0';  
627 - }  
628 - // Require space  
629 - if (!QUtil::is_space(*p++)) {  
630 - // Entry doesn't start with space or digit.  
631 - // C++20: [[unlikely]]  
632 - return {false, 0, 0, '\0'};  
633 - }  
634 - // Gather digits. NB No risk of overflow as 99'999 < max int.  
635 - while (*p == '0') {  
636 - ++f2_len;  
637 - ++p;  
638 - }  
639 - while (QUtil::is_digit(*p) && f2_len++ < 5) {  
640 - f2 *= 10;  
641 - f2 += static_cast<int>(*p++ - '0');  
642 - }  
643 - if (QUtil::is_space(*p++) && (*p == 'f' || *p == 'n')) {  
644 - // C++20: [[likely]]  
645 - type = *p;  
646 - // No test for valid line[19].  
647 - if (*(++p) && *(++p) && (*p == '\n' || *p == '\r') && f1_len == 10 && f2_len == 5) {  
648 - // C++20: [[likely]]  
649 - return {true, f1, f2, type};  
650 - }  
651 - }  
652 - return read_bad_entry();  
653 -}  
654 -  
655 -// Read a single cross-reference table section and associated trailer.  
656 -qpdf_offset_t  
657 -Xref_table::process_section(qpdf_offset_t xref_offset)  
658 -{  
659 - file->seek(xref_offset, SEEK_SET);  
660 - std::string line;  
661 - auto subs = subsections(line);  
662 -  
663 - auto cur_trailer_offset = file->tell();  
664 - auto cur_trailer = read_trailer();  
665 - if (!cur_trailer.isDictionary()) {  
666 - QTC::TC("qpdf", "QPDF missing trailer");  
667 - throw qpdf.damagedPDF("", "expected trailer dictionary");  
668 - }  
669 -  
670 - if (!trailer_) {  
671 - unsigned int sz;  
672 - trailer_ = cur_trailer;  
673 -  
674 - if (!trailer_.hasKey("/Size")) {  
675 - QTC::TC("qpdf", "QPDF trailer lacks size");  
676 - throw qpdf.damagedPDF("trailer", "trailer dictionary lacks /Size key");  
677 - }  
678 - if (!trailer_.getKey("/Size").getValueAsUInt(sz)) {  
679 - QTC::TC("qpdf", "QPDF trailer size not integer");  
680 - throw qpdf.damagedPDF("trailer", "/Size key in trailer dictionary is not an integer");  
681 - }  
682 - if (sz >= static_cast<unsigned int>(max_id_)) {  
683 - QTC::TC("qpdf", "QPDF trailer size impossibly large");  
684 - throw qpdf.damagedPDF("trailer", "/Size key in trailer dictionary is impossibly large");  
685 - }  
686 - table.resize(sz);  
687 - }  
688 -  
689 - for (auto [obj, num, offset]: subs) {  
690 - file->seek(offset, SEEK_SET);  
691 - for (qpdf_offset_t i = obj; i - num < obj; ++i) {  
692 - if (i == 0) {  
693 - // This is needed by checkLinearization()  
694 - first_item_offset_ = file->tell();  
695 - }  
696 - // For xref_table, these will always be small enough to be ints  
697 - auto [success, f1, f2, type] = read_entry();  
698 - if (!success) {  
699 - throw damaged_table("invalid xref entry (obj=" + std::to_string(i) + ")");  
700 - }  
701 - if (type == 'f') {  
702 - insert_free(QPDFObjGen(toI(i), f2));  
703 - } else {  
704 - insert(toI(i), 1, f1, f2);  
705 - }  
706 - }  
707 - qpdf_offset_t pos = file->tell();  
708 - if (read_token().isWord("trailer")) {  
709 - break;  
710 - } else {  
711 - file->seek(pos, SEEK_SET);  
712 - }  
713 - }  
714 -  
715 - if (cur_trailer.hasKey("/XRefStm")) {  
716 - if (ignore_streams_) {  
717 - QTC::TC("qpdf", "QPDF ignoring XRefStm in trailer");  
718 - } else {  
719 - if (cur_trailer.getKey("/XRefStm").isInteger()) {  
720 - // Read the xref stream but disregard any return value -- we'll use our trailer's  
721 - // /Prev key instead of the xref stream's.  
722 - (void)read_stream(cur_trailer.getKey("/XRefStm").getIntValue());  
723 - } else {  
724 - throw qpdf.damagedPDF("xref stream", cur_trailer_offset, "invalid /XRefStm");  
725 - }  
726 - }  
727 - }  
728 -  
729 - if (cur_trailer.hasKey("/Prev")) {  
730 - if (!cur_trailer.getKey("/Prev").isInteger()) {  
731 - QTC::TC("qpdf", "QPDF trailer prev not integer");  
732 - throw qpdf.damagedPDF(  
733 - "trailer", cur_trailer_offset, "/Prev key in trailer dictionary is not an integer");  
734 - }  
735 - QTC::TC("qpdf", "QPDF prev key in trailer dictionary");  
736 - return cur_trailer.getKey("/Prev").getIntValue();  
737 - }  
738 -  
739 - return 0;  
740 -}  
741 -  
742 -// Read a single cross-reference stream.  
743 -qpdf_offset_t  
744 -Xref_table::read_stream(qpdf_offset_t xref_offset)  
745 -{  
746 - if (!ignore_streams_) {  
747 - QPDFObjGen x_og;  
748 - QPDFObjectHandle xref_obj;  
749 - try {  
750 - xref_obj =  
751 - objects.read(false, xref_offset, "xref stream", QPDFObjGen(0, 0), x_og, true);  
752 - } catch (QPDFExc&) {  
753 - // ignore -- report error below  
754 - }  
755 - if (xref_obj.isStreamOfType("/XRef")) {  
756 - QTC::TC("qpdf", "QPDF found xref stream");  
757 - return process_stream(xref_offset, xref_obj);  
758 - }  
759 - }  
760 -  
761 - QTC::TC("qpdf", "QPDF can't find xref");  
762 - throw qpdf.damagedPDF("", xref_offset, "xref not found");  
763 - return 0; // unreachable  
764 -}  
765 -  
766 -// Return the entry size of the xref stream and the processed W array.  
767 -std::pair<int, std::array<int, 3>>  
768 -Xref_table::process_W(QPDFObjectHandle& dict, std::function<QPDFExc(std::string_view)> damaged)  
769 -{  
770 - auto W_obj = dict.getKey("/W");  
771 - if (!(W_obj.isArray() && W_obj.getArrayNItems() >= 3 && W_obj.getArrayItem(0).isInteger() &&  
772 - W_obj.getArrayItem(1).isInteger() && W_obj.getArrayItem(2).isInteger())) {  
773 - throw damaged("Cross-reference stream does not have a proper /W key");  
774 - }  
775 -  
776 - std::array<int, 3> W;  
777 - int entry_size = 0;  
778 - auto w_vector = W_obj.getArrayAsVector();  
779 - int max_bytes = sizeof(qpdf_offset_t);  
780 - for (size_t i = 0; i < 3; ++i) {  
781 - W[i] = w_vector[i].getIntValueAsInt();  
782 - if (W[i] > max_bytes) {  
783 - throw damaged("Cross-reference stream's /W contains impossibly large values");  
784 - }  
785 - if (W[i] < 0) {  
786 - throw damaged("Cross-reference stream's /W contains negative values");  
787 - }  
788 - entry_size += W[i];  
789 - }  
790 - if (entry_size == 0) {  
791 - throw damaged("Cross-reference stream's /W indicates entry size of 0");  
792 - }  
793 - return {entry_size, W};  
794 -}  
795 -  
796 -// Validate Size entry and return the maximum number of entries that the xref stream can contain and  
797 -// the value of the Size entry.  
798 -std::pair<int, size_t>  
799 -Xref_table::process_Size(  
800 - QPDFObjectHandle& dict, int entry_size, std::function<QPDFExc(std::string_view)> damaged)  
801 -{  
802 - // Number of entries is limited by the highest possible object id and stream size.  
803 - auto max_num_entries = std::numeric_limits<int>::max();  
804 - if (max_num_entries > (std::numeric_limits<qpdf_offset_t>::max() / entry_size)) {  
805 - max_num_entries = toI(std::numeric_limits<qpdf_offset_t>::max() / entry_size);  
806 - }  
807 -  
808 - auto Size_obj = dict.getKey("/Size");  
809 - long long size;  
810 - if (!dict.getKey("/Size").getValueAsInt(size)) {  
811 - throw damaged("Cross-reference stream does not have a proper /Size key");  
812 - } else if (size < 0) {  
813 - throw damaged("Cross-reference stream has a negative /Size key");  
814 - } else if (size >= max_num_entries) {  
815 - throw damaged("Cross-reference stream has an impossibly large /Size key");  
816 - }  
817 - // We are not validating that Size <= (Size key of parent xref / trailer).  
818 - return {max_num_entries, toS(size)};  
819 -}  
820 -  
821 -// Return the number of entries of the xref stream and the processed Index array.  
822 -std::pair<int, std::vector<std::pair<int, int>>>  
823 -Xref_table::process_Index(  
824 - QPDFObjectHandle& dict, int max_num_entries, std::function<QPDFExc(std::string_view)> damaged)  
825 -{  
826 - auto size = dict.getKey("/Size").getIntValueAsInt();  
827 - auto Index_obj = dict.getKey("/Index");  
828 -  
829 - if (Index_obj.isArray()) {  
830 - std::vector<std::pair<int, int>> indx;  
831 - int num_entries = 0;  
832 - auto index_vec = Index_obj.getArrayAsVector();  
833 - if ((index_vec.size() % 2) || index_vec.size() < 2) {  
834 - throw damaged("Cross-reference stream's /Index has an invalid number of values");  
835 - }  
836 -  
837 - int i = 0;  
838 - long long first = 0;  
839 - for (auto& val: index_vec) {  
840 - if (val.isInteger()) {  
841 - if (i % 2) {  
842 - auto count = val.getIntValue();  
843 - if (count <= 0) {  
844 - throw damaged(  
845 - "Cross-reference stream section claims to contain " +  
846 - std::to_string(count) + " entries");  
847 - }  
848 - // We are guarding against the possibility of num_entries * entry_size  
849 - // overflowing. We are not checking that entries are in ascending order as  
850 - // required by the spec, which probably should generate a warning. We are also  
851 - // not checking that for each subsection first object number + number of entries  
852 - // <= /Size. The spec requires us to ignore object number > /Size.  
853 - if (first > (max_num_entries - count) ||  
854 - count > (max_num_entries - num_entries)) {  
855 - throw damaged(  
856 - "Cross-reference stream claims to contain too many entries: " +  
857 - std::to_string(first) + " " + std::to_string(max_num_entries) + " " +  
858 - std::to_string(num_entries));  
859 - }  
860 - indx.emplace_back(static_cast<int>(first), static_cast<int>(count));  
861 - num_entries += static_cast<int>(count);  
862 - } else {  
863 - first = val.getIntValue();  
864 - if (first < 0) {  
865 - throw damaged(  
866 - "Cross-reference stream's /Index contains a negative object id");  
867 - } else if (first > max_num_entries) {  
868 - throw damaged("Cross-reference stream's /Index contains an impossibly "  
869 - "large object id");  
870 - }  
871 - }  
872 - } else {  
873 - throw damaged(  
874 - "Cross-reference stream's /Index's item " + std::to_string(i) +  
875 - " is not an integer");  
876 - }  
877 - i++;  
878 - }  
879 - QTC::TC("qpdf", "QPDF xref /Index is array", index_vec.size() == 2 ? 0 : 1);  
880 - return {num_entries, indx};  
881 - } else if (Index_obj.isNull()) {  
882 - QTC::TC("qpdf", "QPDF xref /Index is null");  
883 - return {size, {{0, size}}};  
884 - } else {  
885 - throw damaged("Cross-reference stream does not have a proper /Index key");  
886 - }  
887 -}  
888 -  
889 -qpdf_offset_t  
890 -Xref_table::process_stream(qpdf_offset_t xref_offset, QPDFObjectHandle& xref_obj)  
891 -{  
892 - auto damaged = [this, xref_offset](std::string_view msg) -> QPDFExc {  
893 - return qpdf.damagedPDF("xref stream", xref_offset, msg.data());  
894 - };  
895 -  
896 - auto dict = xref_obj.getDict();  
897 -  
898 - auto [entry_size, W] = process_W(dict, damaged);  
899 - auto [max_num_entries, size] = process_Size(dict, entry_size, damaged);  
900 - auto [num_entries, indx] = process_Index(dict, max_num_entries, damaged);  
901 -  
902 - std::shared_ptr<Buffer> bp = xref_obj.getStreamData(qpdf_dl_specialized);  
903 - size_t actual_size = bp->getSize();  
904 - auto expected_size = toS(entry_size) * toS(num_entries);  
905 -  
906 - if (expected_size != actual_size) {  
907 - QPDFExc x = damaged(  
908 - "Cross-reference stream data has the wrong size; expected = " +  
909 - std::to_string(expected_size) + "; actual = " + std::to_string(actual_size));  
910 - if (expected_size > actual_size) {  
911 - throw x;  
912 - } else {  
913 - qpdf.warn(x);  
914 - }  
915 - }  
916 -  
917 - if (!trailer_) {  
918 - trailer_ = dict;  
919 - if (size > toS(max_id_)) {  
920 - throw damaged("Cross-reference stream /Size entry is impossibly large");  
921 - }  
922 - table.resize(size);  
923 - }  
924 -  
925 - bool saw_first_compressed_object = false;  
926 -  
927 - // Actual size vs. expected size check above ensures that we will not overflow any buffers here.  
928 - // We know that entry_size * num_entries is less or equal to the size of the buffer.  
929 - auto p = bp->getBuffer();  
930 - for (auto [obj, sec_entries]: indx) {  
931 - // Process a subsection.  
932 - for (int i = 0; i < sec_entries; ++i) {  
933 - // Read this entry  
934 - std::array<qpdf_offset_t, 3> fields{};  
935 - if (W[0] == 0) {  
936 - QTC::TC("qpdf", "QPDF default for xref stream field 0");  
937 - fields[0] = 1;  
938 - }  
939 - for (size_t j = 0; j < 3; ++j) {  
940 - for (int k = 0; k < W[j]; ++k) {  
941 - fields[j] <<= 8;  
942 - fields[j] |= *p++;  
943 - }  
944 - }  
945 -  
946 - // Get the generation number. The generation number is 0 unless this is an uncompressed  
947 - // object record, in which case the generation number appears as the third field.  
948 - if (saw_first_compressed_object) {  
949 - if (fields[0] != 2) {  
950 - uncompressed_after_compressed_ = true;  
951 - }  
952 - } else if (fields[0] == 2) {  
953 - saw_first_compressed_object = true;  
954 - }  
955 - if (obj == 0) {  
956 - // This is needed by checkLinearization()  
957 - first_item_offset_ = xref_offset;  
958 - } else if (fields[0] == 0) {  
959 - // Ignore fields[2], which we don't care about in this case. This works around the  
960 - // issue of some PDF files that put invalid values, like -1, here for deleted  
961 - // objects.  
962 - insert_free(QPDFObjGen(obj, 0));  
963 - } else {  
964 - insert(obj, toI(fields[0]), fields[1], toI(fields[2]));  
965 - }  
966 - ++obj;  
967 - }  
968 - }  
969 -  
970 - if (dict.hasKey("/Prev")) {  
971 - if (!dict.getKey("/Prev").isInteger()) {  
972 - throw qpdf.damagedPDF(  
973 - "xref stream", "/Prev key in xref stream dictionary is not an integer");  
974 - }  
975 - QTC::TC("qpdf", "QPDF prev key in xref stream dictionary");  
976 - return dict.getKey("/Prev").getIntValue();  
977 - } else {  
978 - return 0;  
979 - }  
980 -}  
981 -  
982 -void  
983 -Xref_table::insert(int obj, int f0, qpdf_offset_t f1, int f2)  
984 -{  
985 - // Populate the xref table in such a way that the first reference to an object that we see,  
986 - // which is the one in the latest xref table in which it appears, is the one that gets stored.  
987 - // This works because we are reading more recent appends before older ones.  
988 -  
989 - // If there is already an entry for this object and generation in the table, it means that a  
990 - // later xref table has registered this object. Disregard this one.  
991 -  
992 - int new_gen = f0 == 2 ? 0 : f2;  
993 -  
994 - if (!(obj > 0 && static_cast<size_t>(obj) < table.size() && 0 <= f2 && new_gen < 65535)) {  
995 - // We are ignoring invalid objgens. Most will arrive here from xref reconstruction. There  
996 - // is probably no point having another warning but we could count invalid items in order to  
997 - // decide when to give up.  
998 - QTC::TC("qpdf", "QPDF xref overwrite invalid objgen");  
999 - return;  
1000 - }  
1001 -  
1002 - auto& entry = table[static_cast<size_t>(obj)];  
1003 - auto old_type = entry.type();  
1004 -  
1005 - if (!old_type && entry.gen() > 0) {  
1006 - // At the moment we are processing the updates last to first and therefore the gen doesn't  
1007 - // matter as long as it > 0 to distinguish it from an uninitialized entry. This will need  
1008 - // to be revisited when we want to support incremental updates or more comprehensive  
1009 - // checking.  
1010 - QTC::TC("qpdf", "QPDF xref deleted object");  
1011 - return;  
1012 - }  
1013 -  
1014 - if (f0 == 2 && static_cast<int>(f1) == obj) {  
1015 - qpdf.warn(qpdf.damagedPDF(  
1016 - "xref stream", "self-referential object stream " + std::to_string(obj)));  
1017 - return;  
1018 - }  
1019 -  
1020 - if (old_type && entry.gen() >= new_gen) {  
1021 - QTC::TC("qpdf", "QPDF xref reused object");  
1022 - return;  
1023 - }  
1024 -  
1025 - switch (f0) {  
1026 - case 1:  
1027 - // f2 is generation  
1028 - QTC::TC("qpdf", "QPDF xref gen > 0", (f2 > 0) ? 1 : 0);  
1029 - entry = {f2, Uncompressed(f1)};  
1030 - break;  
1031 -  
1032 - case 2:  
1033 - entry = {0, Compressed(toI(f1), f2)};  
1034 - object_streams_ = true;  
1035 - break;  
1036 -  
1037 - default:  
1038 - throw qpdf.damagedPDF(  
1039 - "xref stream", "unknown xref stream entry type " + std::to_string(f0));  
1040 - break;  
1041 - }  
1042 -}  
1043 -  
1044 -void  
1045 -Xref_table::insert_free(QPDFObjGen og)  
1046 -{  
1047 - // At the moment we are processing the updates last to first and therefore the gen doesn't  
1048 - // matter as long as it > 0 to distinguish it from an uninitialized entry. This will need to be  
1049 - // revisited when we want to support incremental updates or more comprehensive checking.  
1050 - if (og.getObj() < 1) {  
1051 - return;  
1052 - }  
1053 - size_t id = static_cast<size_t>(og.getObj());  
1054 - if (id < table.size() && !type(id)) {  
1055 - table[id] = {1, {}};  
1056 - }  
1057 -}  
1058 -  
1059 -QPDFObjGen  
1060 -Xref_table::at_offset(qpdf_offset_t offset) const noexcept  
1061 -{  
1062 - int id = 0;  
1063 - int gen = 0;  
1064 - qpdf_offset_t start = 0;  
1065 -  
1066 - int i = 0;  
1067 - for (auto const& item: table) {  
1068 - auto o = item.offset();  
1069 - if (start < o && o <= offset) {  
1070 - start = o;  
1071 - id = i;  
1072 - gen = item.gen();  
1073 - }  
1074 - ++i;  
1075 - }  
1076 - return QPDFObjGen(id, gen);  
1077 -}  
1078 -  
1079 -std::map<QPDFObjGen, QPDFXRefEntry>  
1080 -Xref_table::as_map() const  
1081 -{  
1082 - std::map<QPDFObjGen, QPDFXRefEntry> result;  
1083 - int i{0};  
1084 - for (auto const& item: table) {  
1085 - switch (item.type()) {  
1086 - case 0:  
1087 - break;  
1088 - case 1:  
1089 - result.emplace(QPDFObjGen(i, item.gen()), item.offset());  
1090 - break;  
1091 - case 2:  
1092 - result.emplace(  
1093 - QPDFObjGen(i, 0), QPDFXRefEntry(item.stream_number(), item.stream_index()));  
1094 - break;  
1095 - default:  
1096 - throw std::logic_error("Xref_table: invalid entry type");  
1097 - }  
1098 - ++i;  
1099 - }  
1100 - return result;  
1101 -}  
1102 -  
1103 -void  
1104 -Xref_table::show()  
1105 -{  
1106 - auto& cout = *qpdf.m->log->getInfo();  
1107 - int i = -1;  
1108 - for (auto const& item: table) {  
1109 - ++i;  
1110 - if (item.type()) {  
1111 - cout << std::to_string(i) << "/" << std::to_string(item.gen()) << ": ";  
1112 - switch (item.type()) {  
1113 - case 1:  
1114 - cout << "uncompressed; offset = " << item.offset() << "\n";  
1115 - break;  
1116 -  
1117 - case 2:  
1118 - cout << "compressed; stream = " << item.stream_number()  
1119 - << ", index = " << item.stream_index() << "\n";  
1120 - break;  
1121 -  
1122 - default:  
1123 - throw std::logic_error(  
1124 - "unknown cross-reference table type while showing xref_table");  
1125 - }  
1126 - }  
1127 - }  
1128 -}  
1129 -  
1130 -// Resolve all objects in the xref table. If this triggers a xref table reconstruction abort and  
1131 -// return false. Otherwise return true.  
1132 -bool  
1133 -Xref_table::resolve()  
1134 -{  
1135 - bool may_change = !reconstructed_;  
1136 - int i = -1;  
1137 - for (auto& item: table) {  
1138 - ++i;  
1139 - if (item.type()) {  
1140 - if (objects.unresolved(QPDFObjGen(i, item.gen()))) {  
1141 - objects.resolve(QPDFObjGen(i, item.gen()));  
1142 - if (may_change && reconstructed_) {  
1143 - return false;  
1144 - }  
1145 - }  
1146 - }  
1147 - }  
1148 - return true;  
1149 -}  
1150 -  
1151 -std::vector<QPDFObjectHandle>  
1152 -Objects ::all()  
1153 -{  
1154 - // After fixDanglingReferences is called, all objects are in the object cache.  
1155 - qpdf.fixDanglingReferences();  
1156 - std::vector<QPDFObjectHandle> result;  
1157 - for (auto const& iter: table) {  
1158 - result.emplace_back(iter.second.object);  
1159 - }  
1160 - return result;  
1161 -}  
1162 -  
1163 -QPDFObjectHandle  
1164 -Xref_table::read_trailer()  
1165 -{  
1166 - qpdf_offset_t offset = file->tell();  
1167 - bool empty = false;  
1168 - auto object = QPDFParser(*file, "trailer", tokenizer, nullptr, &qpdf, true).parse(empty, false);  
1169 - if (empty) {  
1170 - // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in  
1171 - // actual PDF files and Adobe Reader appears to ignore them.  
1172 - qpdf.warn(qpdf.damagedPDF("trailer", "empty object treated as null"));  
1173 - } else if (object.isDictionary() && read_token().isWord("stream")) {  
1174 - qpdf.warn(qpdf.damagedPDF("trailer", file->tell(), "stream keyword found in trailer"));  
1175 - }  
1176 - // Override last_offset so that it points to the beginning of the object we just read  
1177 - file->setLastOffset(offset);  
1178 - return object;  
1179 -}  
1180 -  
1181 -QPDFObjectHandle  
1182 -Objects::read_object(std::string const& description, QPDFObjGen og)  
1183 -{  
1184 - qpdf.setLastObjectDescription(description, og);  
1185 - qpdf_offset_t offset = m->file->tell();  
1186 - bool empty = false;  
1187 -  
1188 - StringDecrypter decrypter{&qpdf, og};  
1189 - StringDecrypter* decrypter_ptr = m->encp->encrypted ? &decrypter : nullptr;  
1190 - auto object =  
1191 - QPDFParser(*m->file, m->last_object_description, m->tokenizer, decrypter_ptr, &qpdf, true)  
1192 - .parse(empty, false);  
1193 - if (empty) {  
1194 - // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in  
1195 - // actual PDF files and Adobe Reader appears to ignore them.  
1196 - qpdf.warn(  
1197 - qpdf.damagedPDF(*m->file, m->file->getLastOffset(), "empty object treated as null"));  
1198 - return object;  
1199 - }  
1200 - auto token = qpdf.readToken(*m->file);  
1201 - if (object.isDictionary() && token.isWord("stream")) {  
1202 - read_stream(object, og, offset);  
1203 - token = qpdf.readToken(*m->file);  
1204 - }  
1205 - if (!token.isWord("endobj")) {  
1206 - QTC::TC("qpdf", "QPDF err expected endobj");  
1207 - qpdf.warn(qpdf.damagedPDF("expected endobj"));  
1208 - }  
1209 - return object;  
1210 -}  
1211 -  
1212 -// After reading stream dictionary and stream keyword, read rest of stream.  
1213 -void  
1214 -Objects::read_stream(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset)  
1215 -{  
1216 - validate_stream_line_end(object, og, offset);  
1217 -  
1218 - // Must get offset before accessing any additional objects since resolving a previously  
1219 - // unresolved indirect object will change file position.  
1220 - qpdf_offset_t stream_offset = m->file->tell();  
1221 - size_t length = 0;  
1222 -  
1223 - try {  
1224 - auto length_obj = object.getKey("/Length");  
1225 -  
1226 - if (!length_obj.isInteger()) {  
1227 - if (length_obj.isNull()) {  
1228 - QTC::TC("qpdf", "QPDF stream without length");  
1229 - throw qpdf.damagedPDF(offset, "stream dictionary lacks /Length key");  
1230 - }  
1231 - QTC::TC("qpdf", "QPDF stream length not integer");  
1232 - throw qpdf.damagedPDF(offset, "/Length key in stream dictionary is not an integer");  
1233 - }  
1234 -  
1235 - length = toS(length_obj.getUIntValue());  
1236 - // Seek in two steps to avoid potential integer overflow  
1237 - m->file->seek(stream_offset, SEEK_SET);  
1238 - m->file->seek(toO(length), SEEK_CUR);  
1239 - if (!qpdf.readToken(*m->file).isWord("endstream")) {  
1240 - QTC::TC("qpdf", "QPDF missing endstream");  
1241 - throw qpdf.damagedPDF("expected endstream");  
1242 - }  
1243 - } catch (QPDFExc& e) {  
1244 - if (m->attempt_recovery) {  
1245 - qpdf.warn(e);  
1246 - length = recover_stream_length(m->file_sp, og, stream_offset);  
1247 - } else {  
1248 - throw;  
1249 - }  
1250 - }  
1251 - object = {QPDF_Stream::create(&qpdf, og, object, stream_offset, length)};  
1252 -}  
1253 -  
1254 -void  
1255 -Objects::validate_stream_line_end(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset)  
1256 -{  
1257 - // The PDF specification states that the word "stream" should be followed by either a carriage  
1258 - // return and a newline or by a newline alone. It specifically disallowed following it by a  
1259 - // carriage return alone since, in that case, there would be no way to tell whether the NL in a  
1260 - // CR NL sequence was part of the stream data. However, some readers, including Adobe reader,  
1261 - // accept a carriage return by itself when followed by a non-newline character, so that's what  
1262 - // we do here. We have also seen files that have extraneous whitespace between the stream  
1263 - // keyword and the newline.  
1264 - while (true) {  
1265 - char ch;  
1266 - if (m->file->read(&ch, 1) == 0) {  
1267 - // A premature EOF here will result in some other problem that will get reported at  
1268 - // another time.  
1269 - return;  
1270 - }  
1271 - if (ch == '\n') {  
1272 - // ready to read stream data  
1273 - QTC::TC("qpdf", "QPDF stream with NL only");  
1274 - return;  
1275 - }  
1276 - if (ch == '\r') {  
1277 - // Read another character  
1278 - if (m->file->read(&ch, 1) != 0) {  
1279 - if (ch == '\n') {  
1280 - // Ready to read stream data  
1281 - QTC::TC("qpdf", "QPDF stream with CRNL");  
1282 - } else {  
1283 - // Treat the \r by itself as the whitespace after endstream and start reading  
1284 - // stream data in spite of not having seen a newline.  
1285 - QTC::TC("qpdf", "QPDF stream with CR only");  
1286 - m->file->unreadCh(ch);  
1287 - qpdf.warn(qpdf.damagedPDF(  
1288 - m->file->tell(), "stream keyword followed by carriage return only"));  
1289 - }  
1290 - }  
1291 - return;  
1292 - }  
1293 - if (!QUtil::is_space(ch)) {  
1294 - QTC::TC("qpdf", "QPDF stream without newline");  
1295 - m->file->unreadCh(ch);  
1296 - qpdf.warn(qpdf.damagedPDF(  
1297 - m->file->tell(), "stream keyword not followed by proper line terminator"));  
1298 - return;  
1299 - }  
1300 - qpdf.warn(  
1301 - qpdf.damagedPDF(m->file->tell(), "stream keyword followed by extraneous whitespace"));  
1302 - }  
1303 -}  
1304 -  
1305 -QPDFObjectHandle  
1306 -Objects::readObjectInStream(std::shared_ptr<InputSource>& input, int obj)  
1307 -{  
1308 - m->last_object_description.erase(7); // last_object_description starts with "object "  
1309 - m->last_object_description += std::to_string(obj);  
1310 - m->last_object_description += " 0";  
1311 -  
1312 - bool empty = false;  
1313 - auto object = QPDFParser(*input, m->last_object_description, m->tokenizer, nullptr, &qpdf, true)  
1314 - .parse(empty, false);  
1315 - if (empty) {  
1316 - // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in  
1317 - // actual PDF files and Adobe Reader appears to ignore them.  
1318 - qpdf.warn(qpdf.damagedPDF(*input, input->getLastOffset(), "empty object treated as null"));  
1319 - }  
1320 - return object;  
1321 -}  
1322 -  
1323 -bool  
1324 -QPDF::findEndstream()  
1325 -{  
1326 - // Find endstream or endobj. Position the input at that token.  
1327 - auto t = readToken(*m->file, 20);  
1328 - if (t.isWord("endobj") || t.isWord("endstream")) {  
1329 - m->file->seek(m->file->getLastOffset(), SEEK_SET);  
1330 - return true;  
1331 - }  
1332 - return false;  
1333 -}  
1334 -  
1335 -size_t  
1336 -Objects::recover_stream_length(  
1337 - std::shared_ptr<InputSource> input, QPDFObjGen og, qpdf_offset_t stream_offset)  
1338 -{  
1339 - // Try to reconstruct stream length by looking for endstream or endobj  
1340 - qpdf.warn(qpdf.damagedPDF(*input, stream_offset, "attempting to recover stream length"));  
1341 -  
1342 - PatternFinder ef(qpdf, &QPDF::findEndstream);  
1343 - size_t length = 0;  
1344 - if (m->file->findFirst("end", stream_offset, 0, ef)) {  
1345 - length = toS(m->file->tell() - stream_offset);  
1346 - // Reread endstream but, if it was endobj, don't skip that.  
1347 - QPDFTokenizer::Token t = qpdf.readToken(*m->file);  
1348 - if (t.getValue() == "endobj") {  
1349 - m->file->seek(m->file->getLastOffset(), SEEK_SET);  
1350 - }  
1351 - }  
1352 -  
1353 - if (length) {  
1354 - // Make sure this is inside this object  
1355 - auto found = xref.at_offset(stream_offset + toO(length));  
1356 - if (found == QPDFObjGen() || found == og) {  
1357 - // If we are trying to recover an XRef stream the xref table will not contain and  
1358 - // won't contain any entries, therefore we cannot check the found length. Otherwise we  
1359 - // found endstream\endobj within the space allowed for this object, so we're probably  
1360 - // in good shape.  
1361 - } else {  
1362 - QTC::TC("qpdf", "QPDF found wrong endstream in recovery");  
1363 - length = 0;  
1364 - }  
1365 - }  
1366 -  
1367 - if (length == 0) {  
1368 - qpdf.warn(qpdf.damagedPDF(  
1369 - *input, stream_offset, "unable to recover stream data; treating stream as empty"));  
1370 - } else {  
1371 - qpdf.warn(qpdf.damagedPDF(  
1372 - *input, stream_offset, "recovered stream length: " + std::to_string(length)));  
1373 - }  
1374 -  
1375 - QTC::TC("qpdf", "QPDF recovered stream length");  
1376 - return length;  
1377 -}  
1378 -  
1379 -QPDFObjectHandle  
1380 -Objects::read(  
1381 - bool try_recovery,  
1382 - qpdf_offset_t offset,  
1383 - std::string const& description,  
1384 - QPDFObjGen exp_og,  
1385 - QPDFObjGen& og,  
1386 - bool skip_cache_if_in_xref)  
1387 -{  
1388 - bool check_og = true;  
1389 - if (exp_og.getObj() == 0) {  
1390 - // This method uses an expect object ID of 0 to indicate that we don't know or don't care  
1391 - // what the actual object ID is at this offset. This is true when we read the xref stream  
1392 - // and linearization hint streams. In this case, we don't verify the expect object  
1393 - // ID/generation against what was read from the file. There is also no reason to attempt  
1394 - // xref recovery if we get a failure in this case since the read attempt was not triggered  
1395 - // by an xref lookup.  
1396 - check_og = false;  
1397 - try_recovery = false;  
1398 - }  
1399 - qpdf.setLastObjectDescription(description, exp_og);  
1400 -  
1401 - if (!m->attempt_recovery) {  
1402 - try_recovery = false;  
1403 - }  
1404 -  
1405 - // Special case: if offset is 0, just return null. Some PDF writers, in particular  
1406 - // "Mac OS X 10.7.5 Quartz PDFContext", may store deleted objects in the xref table as  
1407 - // "0000000000 00000 n", which is not correct, but it won't hurt anything for us to ignore  
1408 - // these.  
1409 - if (offset == 0) {  
1410 - QTC::TC("qpdf", "QPDF bogus 0 offset", 0);  
1411 - qpdf.warn(qpdf.damagedPDF(0, "object has offset 0"));  
1412 - return QPDFObjectHandle::newNull();  
1413 - }  
1414 -  
1415 - m->file->seek(offset, SEEK_SET);  
1416 - try {  
1417 - QPDFTokenizer::Token tobjid = qpdf.readToken(*m->file);  
1418 - bool objidok = tobjid.isInteger();  
1419 - QTC::TC("qpdf", "QPDF check objid", objidok ? 1 : 0);  
1420 - if (!objidok) {  
1421 - QTC::TC("qpdf", "QPDF expected n n obj");  
1422 - throw qpdf.damagedPDF(offset, "expected n n obj");  
1423 - }  
1424 - QPDFTokenizer::Token tgen = qpdf.readToken(*m->file);  
1425 - bool genok = tgen.isInteger();  
1426 - QTC::TC("qpdf", "QPDF check generation", genok ? 1 : 0);  
1427 - if (!genok) {  
1428 - throw qpdf.damagedPDF(offset, "expected n n obj");  
1429 - }  
1430 - QPDFTokenizer::Token tobj = qpdf.readToken(*m->file);  
1431 -  
1432 - bool objok = tobj.isWord("obj");  
1433 - QTC::TC("qpdf", "QPDF check obj", objok ? 1 : 0);  
1434 -  
1435 - if (!objok) {  
1436 - throw qpdf.damagedPDF(offset, "expected n n obj");  
1437 - }  
1438 - int objid = QUtil::string_to_int(tobjid.getValue().c_str());  
1439 - int generation = QUtil::string_to_int(tgen.getValue().c_str());  
1440 - og = QPDFObjGen(objid, generation);  
1441 - if (objid == 0) {  
1442 - QTC::TC("qpdf", "QPDF object id 0");  
1443 - throw qpdf.damagedPDF(offset, "object with ID 0");  
1444 - }  
1445 - if (check_og && (exp_og != og)) {  
1446 - QTC::TC("qpdf", "QPDF err wrong objid/generation");  
1447 - QPDFExc e = qpdf.damagedPDF(offset, "expected " + exp_og.unparse(' ') + " obj");  
1448 - if (try_recovery) {  
1449 - // Will be retried below  
1450 - throw e;  
1451 - } else {  
1452 - // We can try reading the object anyway even if the ID doesn't match.  
1453 - qpdf.warn(e);  
1454 - }  
1455 - }  
1456 - } catch (QPDFExc& e) {  
1457 - if (try_recovery) {  
1458 - // Try again after reconstructing xref table  
1459 - xref.reconstruct(e);  
1460 - if (xref.type(exp_og) == 1) {  
1461 - QTC::TC("qpdf", "QPDF recovered in readObjectAtOffset");  
1462 - return read(false, xref.offset(exp_og), description, exp_og, og, false);  
1463 - } else {  
1464 - QTC::TC("qpdf", "QPDF object gone after xref reconstruction");  
1465 - qpdf.warn(qpdf.damagedPDF(  
1466 - "",  
1467 - 0,  
1468 - ("object " + exp_og.unparse(' ') +  
1469 - " not found in file after regenerating cross reference table")));  
1470 - return QPDFObjectHandle::newNull();  
1471 - }  
1472 - } else {  
1473 - throw;  
1474 - }  
1475 - }  
1476 -  
1477 - QPDFObjectHandle oh = read_object(description, og);  
1478 -  
1479 - if (unresolved(og)) {  
1480 - // Store the object in the cache here so it gets cached whether we first know the offset or  
1481 - // whether we first know the object ID and generation (in which we case we would get here  
1482 - // through resolve).  
1483 -  
1484 - // Determine the end offset of this object before and after white space. We use these  
1485 - // numbers to validate linearization hint tables. Offsets and lengths of objects may imply  
1486 - // the end of an object to be anywhere between these values.  
1487 - qpdf_offset_t end_before_space = m->file->tell();  
1488 -  
1489 - // skip over spaces  
1490 - while (true) {  
1491 - char ch;  
1492 - if (m->file->read(&ch, 1)) {  
1493 - if (!isspace(static_cast<unsigned char>(ch))) {  
1494 - m->file->seek(-1, SEEK_CUR);  
1495 - break;  
1496 - }  
1497 - } else {  
1498 - throw qpdf.damagedPDF(m->file->tell(), "EOF after endobj");  
1499 - }  
1500 - }  
1501 - qpdf_offset_t end_after_space = m->file->tell();  
1502 - if (skip_cache_if_in_xref && xref.type(og)) {  
1503 - // Ordinarily, an object gets read here when resolved through xref table or stream. In  
1504 - // the special case of the xref stream and linearization hint tables, the offset comes  
1505 - // from another source. For the specific case of xref streams, the xref stream is read  
1506 - // and loaded into the object cache very early in parsing. Ordinarily, when a file is  
1507 - // updated by appending, items inserted into the xref table in later updates take  
1508 - // precedence over earlier items. In the special case of reusing the object number  
1509 - // previously used as the xref stream, we have the following order of events:  
1510 - //  
1511 - // * reused object gets loaded into the xref table  
1512 - // * old object is read here while reading xref streams  
1513 - // * original xref entry is ignored (since already in xref table)  
1514 - //  
1515 - // It is the second step that causes a problem. Even though the xref table is correct in  
1516 - // this case, the old object is already in the cache and so effectively prevails over  
1517 - // the reused object. To work around this issue, we have a special case for the xref  
1518 - // stream (via the skip_cache_if_in_xref): if the object is already in the xref stream,  
1519 - // don't cache what we read here.  
1520 - //  
1521 - // It is likely that the same bug may exist for linearization hint tables, but the  
1522 - // existing code uses end_before_space and end_after_space from the cache, so fixing  
1523 - // that would require more significant rework. The chances of a linearization hint  
1524 - // stream being reused seems smaller because the xref stream is probably the highest  
1525 - // object in the file and the linearization hint stream would be some random place in  
1526 - // the middle, so I'm leaving that bug unfixed for now. If the bug were to be fixed, we  
1527 - // could use !check_og in place of skip_cache_if_in_xref.  
1528 - QTC::TC("qpdf", "QPDF skipping cache for known unchecked object");  
1529 - } else {  
1530 - xref.linearization_offsets(toS(og.getObj()), end_before_space, end_after_space);  
1531 - update_table(og, oh.getObj());  
1532 - }  
1533 - }  
1534 -  
1535 - return oh;  
1536 -}  
1537 -  
1538 -QPDFObject*  
1539 -Objects::resolve(QPDFObjGen og)  
1540 -{  
1541 - if (!unresolved(og)) {  
1542 - return table[og].object.get();  
1543 - }  
1544 -  
1545 - if (m->resolving.count(og)) {  
1546 - // This can happen if an object references itself directly or indirectly in some key that  
1547 - // has to be resolved during object parsing, such as stream length.  
1548 - QTC::TC("qpdf", "QPDF recursion loop in resolve");  
1549 - qpdf.warn(qpdf.damagedPDF("", "loop detected resolving object " + og.unparse(' ')));  
1550 - update_table(og, QPDF_Null::create());  
1551 - return table[og].object.get();  
1552 - }  
1553 - ResolveRecorder rr(&qpdf, og);  
1554 -  
1555 - try {  
1556 - switch (xref.type(og)) {  
1557 - case 0:  
1558 - break;  
1559 - case 1:  
1560 - {  
1561 - // Object stored in cache by readObjectAtOffset  
1562 - QPDFObjGen a_og;  
1563 - QPDFObjectHandle oh = read(true, xref.offset(og), "", og, a_og, false);  
1564 - }  
1565 - break;  
1566 -  
1567 - case 2:  
1568 - resolveObjectsInStream(xref.stream_number(og.getObj()));  
1569 - break;  
1570 -  
1571 - default:  
1572 - throw qpdf.damagedPDF(  
1573 - "", 0, ("object " + og.unparse('/') + " has unexpected xref entry type"));  
1574 - }  
1575 - } catch (QPDFExc& e) {  
1576 - qpdf.warn(e);  
1577 - } catch (std::exception& e) {  
1578 - qpdf.warn(qpdf.damagedPDF(  
1579 - "", 0, ("object " + og.unparse('/') + ": error reading object: " + e.what())));  
1580 - }  
1581 -  
1582 - if (unresolved(og)) {  
1583 - // PDF spec says unknown objects resolve to the null object.  
1584 - QTC::TC("qpdf", "QPDF resolve failure to null");  
1585 - update_table(og, QPDF_Null::create());  
1586 - }  
1587 -  
1588 - auto result(table[og].object);  
1589 - result->setDefaultDescription(&qpdf, og);  
1590 - return result.get();  
1591 -}  
1592 -  
1593 -void  
1594 -Objects::resolveObjectsInStream(int obj_stream_number)  
1595 -{  
1596 - if (m->resolved_object_streams.count(obj_stream_number)) {  
1597 - return;  
1598 - }  
1599 - m->resolved_object_streams.insert(obj_stream_number);  
1600 - // Force resolution of object stream  
1601 - QPDFObjectHandle obj_stream = get(obj_stream_number, 0);  
1602 - if (!obj_stream.isStream()) {  
1603 - throw qpdf.damagedPDF(  
1604 - "supposed object stream " + std::to_string(obj_stream_number) + " is not a stream");  
1605 - }  
1606 -  
1607 - QPDFObjectHandle dict = obj_stream.getDict();  
1608 - if (!dict.isDictionaryOfType("/ObjStm")) {  
1609 - QTC::TC("qpdf", "QPDF ERR object stream with wrong type");  
1610 - qpdf.warn(qpdf.damagedPDF(  
1611 - "supposed object stream " + std::to_string(obj_stream_number) + " has wrong type"));  
1612 - }  
1613 -  
1614 - if (!(dict.getKey("/N").isInteger() && dict.getKey("/First").isInteger())) {  
1615 - throw qpdf.damagedPDF(  
1616 - ("object stream " + std::to_string(obj_stream_number) + " has incorrect keys"));  
1617 - }  
1618 -  
1619 - int n = dict.getKey("/N").getIntValueAsInt();  
1620 - int first = dict.getKey("/First").getIntValueAsInt();  
1621 -  
1622 - std::map<int, int> offsets;  
1623 -  
1624 - std::shared_ptr<Buffer> bp = obj_stream.getStreamData(qpdf_dl_specialized);  
1625 - auto input = std::shared_ptr<InputSource>(  
1626 - // line-break  
1627 - new BufferInputSource(  
1628 - (m->file->getName() + " object stream " + std::to_string(obj_stream_number)),  
1629 - bp.get()));  
1630 -  
1631 - qpdf_offset_t last_offset = -1;  
1632 - for (int i = 0; i < n; ++i) {  
1633 - QPDFTokenizer::Token tnum = qpdf.readToken(*input);  
1634 - QPDFTokenizer::Token toffset = qpdf.readToken(*input);  
1635 - if (!(tnum.isInteger() && toffset.isInteger())) {  
1636 - throw damagedPDF(  
1637 - *input,  
1638 - m->last_object_description,  
1639 - input->getLastOffset(),  
1640 - "expected integer in object stream header");  
1641 - }  
1642 -  
1643 - int num = QUtil::string_to_int(tnum.getValue().c_str());  
1644 - long long offset = QUtil::string_to_int(toffset.getValue().c_str());  
1645 - if (num > xref.max_id()) {  
1646 - continue;  
1647 - }  
1648 - if (num == obj_stream_number) {  
1649 - QTC::TC("qpdf", "QPDF ignore self-referential object stream");  
1650 - qpdf.warn(damagedPDF(  
1651 - *input,  
1652 - m->last_object_description,  
1653 - input->getLastOffset(),  
1654 - "object stream claims to contain itself"));  
1655 - continue;  
1656 - }  
1657 - if (offset <= last_offset) {  
1658 - throw damagedPDF(  
1659 - *input,  
1660 - m->last_object_description,  
1661 - input->getLastOffset(),  
1662 - "expected offsets in object stream to be increasing");  
1663 - }  
1664 - last_offset = offset;  
1665 -  
1666 - offsets[num] = toI(offset + first);  
1667 - }  
1668 -  
1669 - // To avoid having to read the object stream multiple times, store all objects that would be  
1670 - // found here in the cache. Remember that some objects stored here might have been overridden  
1671 - // by new objects appended to the file, so it is necessary to recheck the xref table and only  
1672 - // cache what would actually be resolved here.  
1673 - m->last_object_description.clear();  
1674 - m->last_object_description += "object ";  
1675 - for (auto const& iter: offsets) {  
1676 - QPDFObjGen og(iter.first, 0);  
1677 - if (xref.type(og) == 2 && xref.stream_number(og.getObj()) == obj_stream_number) {  
1678 - int offset = iter.second;  
1679 - input->seek(offset, SEEK_SET);  
1680 - QPDFObjectHandle oh = readObjectInStream(input, iter.first);  
1681 - update_table(og, oh.getObj());  
1682 - } else {  
1683 - QTC::TC("qpdf", "QPDF not caching overridden objstm object");  
1684 - }  
1685 - }  
1686 -}  
1687 -  
1688 -Objects::~Objects()  
1689 -{  
1690 - // If two objects are mutually referential (through each object having an array or dictionary  
1691 - // that contains an indirect reference to the other), the circular references in the  
1692 - // std::shared_ptr objects will prevent the objects from being deleted. Walk through all objects  
1693 - // in the object cache, which is those objects that we read from the file, and break all  
1694 - // resolved indirect references by replacing them with an internal object type representing that  
1695 - // they have been destroyed. Note that we can't break references like this at any time when the  
1696 - // QPDF object is active. The call to reset also causes all direct QPDFObjectHandle objects that  
1697 - // are reachable from this object to release their association with this QPDF. Direct objects  
1698 - // are not destroyed since they can be moved to other QPDF objects safely.  
1699 -  
1700 - for (auto const& iter: table) {  
1701 - iter.second.object->disconnect();  
1702 - if (iter.second.object->getTypeCode() != ::ot_null) {  
1703 - iter.second.object->destroy();  
1704 - }  
1705 - }  
1706 -}  
1707 -  
1708 -void  
1709 -Objects::update_table(QPDFObjGen og, const std::shared_ptr<QPDFObject>& object)  
1710 -{  
1711 - object->setObjGen(&qpdf, og);  
1712 - if (cached(og)) {  
1713 - auto& cache = table[og];  
1714 - cache.object->assign(object);  
1715 - } else {  
1716 - table[og] = Entry(object);  
1717 - }  
1718 -}  
1719 -  
1720 -bool  
1721 -Objects::cached(QPDFObjGen og)  
1722 -{  
1723 - return table.count(og) != 0;  
1724 -}  
1725 -  
1726 -bool  
1727 -Objects::unresolved(QPDFObjGen og)  
1728 -{  
1729 - return !cached(og) || table[og].object->isUnresolved();  
1730 -}  
1731 -  
1732 -QPDFObjGen  
1733 -Objects::next_id()  
1734 -{  
1735 - qpdf.fixDanglingReferences();  
1736 - QPDFObjGen og;  
1737 - if (!table.empty()) {  
1738 - og = (*(m->objects.table.rbegin())).first;  
1739 - }  
1740 - int max_objid = og.getObj();  
1741 - if (max_objid == std::numeric_limits<int>::max()) {  
1742 - throw std::range_error("max object id is too high to create new objects");  
1743 - }  
1744 - return QPDFObjGen(max_objid + 1, 0);  
1745 -}  
1746 -  
1747 -QPDFObjectHandle  
1748 -Objects::make_indirect(std::shared_ptr<QPDFObject> const& obj)  
1749 -{  
1750 - QPDFObjGen next{next_id()};  
1751 - table[next] = Entry(obj);  
1752 - return qpdf.newIndirect(next, table[next].object);  
1753 -}  
1754 -  
1755 -std::shared_ptr<QPDFObject>  
1756 -Objects::get_for_parser(int id, int gen, bool parse_pdf)  
1757 -{  
1758 - // This method is called by the parser and therefore must not resolve any objects.  
1759 - auto og = QPDFObjGen(id, gen);  
1760 - if (auto iter = table.find(og); iter != table.end()) {  
1761 - return iter->second.object;  
1762 - }  
1763 - if (xref.type(og) || !xref.initialized()) {  
1764 - return table.insert({og, QPDF_Unresolved::create(&qpdf, og)}).first->second.object;  
1765 - }  
1766 - if (parse_pdf) {  
1767 - return QPDF_Null::create();  
1768 - }  
1769 - return table.insert({og, QPDF_Null::create(&qpdf, og)}).first->second.object;  
1770 -}  
1771 -  
1772 -std::shared_ptr<QPDFObject>  
1773 -Objects::get_for_json(int id, int gen)  
1774 -{  
1775 - auto og = QPDFObjGen(id, gen);  
1776 - auto [it, inserted] = table.try_emplace(og);  
1777 - auto& obj = it->second.object;  
1778 - if (inserted) {  
1779 - obj = (xref.initialized() && !xref.type(og)) ? QPDF_Null::create(&qpdf, og)  
1780 - : QPDF_Unresolved::create(&qpdf, og);  
1781 - }  
1782 - return obj;  
1783 -}  
1784 -  
1785 -void  
1786 -Objects::replace(QPDFObjGen og, QPDFObjectHandle oh)  
1787 -{  
1788 - if (!oh || (oh.isIndirect() && !(oh.isStream() && oh.getObjGen() == og))) {  
1789 - QTC::TC("qpdf", "QPDF replaceObject called with indirect object");  
1790 - throw std::logic_error("QPDF::replaceObject called with indirect object handle");  
1791 - }  
1792 - update_table(og, oh.getObj());  
1793 -}  
1794 -  
1795 -void  
1796 -Objects::erase(QPDFObjGen og)  
1797 -{  
1798 - if (auto cached = table.find(og); cached != table.end()) {  
1799 - // Take care of any object handles that may be floating around.  
1800 - cached->second.object->assign(QPDF_Null::create());  
1801 - cached->second.object->setObjGen(nullptr, QPDFObjGen());  
1802 - table.erase(cached);  
1803 - }  
1804 -}  
1805 -  
1806 -void  
1807 -Objects::swap(QPDFObjGen og1, QPDFObjGen og2)  
1808 -{  
1809 - // Force objects to be read from the input source if needed, then swap them in the cache.  
1810 - resolve(og1);  
1811 - resolve(og2);  
1812 - table[og1].object->swapWith(table[og2].object);  
1813 -}  
1814 -  
1815 -size_t  
1816 -Objects::table_size()  
1817 -{  
1818 - // If table is dense, accommodate all object in tables,else accommodate only original  
1819 - // objects.  
1820 - auto max_xref = toI(xref.size());  
1821 - if (max_xref > 0) {  
1822 - --max_xref;  
1823 - }  
1824 - auto max_obj = table.size() ? table.crbegin()->first.getObj() : 0;  
1825 - auto max_id = std::numeric_limits<int>::max() - 1;  
1826 - if (max_obj >= max_id || max_xref >= max_id) {  
1827 - // Temporary fix. Long-term solution is  
1828 - // - QPDFObjGen to enforce objgens are valid and sensible  
1829 - // - xref table and obj cache to protect against insertion of impossibly large obj ids  
1830 - qpdf.stopOnError("Impossibly large object id encountered.");  
1831 - }  
1832 - if (max_obj < 1.1 * std::max(toI(table.size()), max_xref)) {  
1833 - return toS(++max_obj);  
1834 - }  
1835 - return toS(++max_xref);  
1836 -}  
1837 -  
1838 -std::vector<QPDFObjGen>  
1839 -Objects::compressible_vector()  
1840 -{  
1841 - return compressible<QPDFObjGen>();  
1842 -}  
1843 -  
1844 -std::vector<bool>  
1845 -Objects::compressible_set()  
1846 -{  
1847 - return compressible<bool>();  
1848 -}  
1849 -  
1850 -template <typename T>  
1851 -std::vector<T>  
1852 -Objects::compressible()  
1853 -{  
1854 - // Return a list of objects that are allowed to be in object streams. Walk through the objects  
1855 - // by traversing the document from the root, including a traversal of the pages tree. This  
1856 - // makes that objects that are on the same page are more likely to be in the same object stream,  
1857 - // which is slightly more efficient, particularly with linearized files. This is better than  
1858 - // iterating through the xref table since it avoids preserving orphaned items.  
1859 -  
1860 - // Exclude encryption dictionary, if any  
1861 - QPDFObjectHandle encryption_dict = trailer().getKey("/Encrypt");  
1862 - QPDFObjGen encryption_dict_og = encryption_dict.getObjGen();  
1863 -  
1864 - const size_t max_obj = qpdf.getObjectCount();  
1865 - std::vector<bool> visited(max_obj, false);  
1866 - std::vector<QPDFObjectHandle> queue;  
1867 - queue.reserve(512);  
1868 - queue.emplace_back(trailer());  
1869 - std::vector<T> result;  
1870 - if constexpr (std::is_same_v<T, QPDFObjGen>) {  
1871 - result.reserve(table.size());  
1872 - } else if constexpr (std::is_same_v<T, bool>) {  
1873 - result.resize(max_obj + 1U, false);  
1874 - } else {  
1875 - throw std::logic_error("Unsupported type in QPDF::getCompressibleObjGens");  
1876 - }  
1877 - while (!queue.empty()) {  
1878 - auto obj = queue.back();  
1879 - queue.pop_back();  
1880 - if (obj.getObjectID() > 0) {  
1881 - QPDFObjGen og = obj.getObjGen();  
1882 - const size_t id = toS(og.getObj() - 1);  
1883 - if (id >= max_obj) {  
1884 - throw std::logic_error(  
1885 - "unexpected object id encountered in getCompressibleObjGens");  
1886 - }  
1887 - if (visited[id]) {  
1888 - QTC::TC("qpdf", "QPDF loop detected traversing objects");  
1889 - continue;  
1890 - }  
1891 -  
1892 - // Check whether this is the current object. If not, remove it (which changes it into a  
1893 - // direct null and therefore stops us from revisiting it) and move on to the next object  
1894 - // in the queue.  
1895 - auto upper = table.upper_bound(og);  
1896 - if (upper != table.end() && upper->first.getObj() == og.getObj()) {  
1897 - erase(og);  
1898 - continue;  
1899 - }  
1900 -  
1901 - visited[id] = true;  
1902 -  
1903 - if (og == encryption_dict_og) {  
1904 - QTC::TC("qpdf", "QPDF exclude encryption dictionary");  
1905 - } else if (!(obj.isStream() ||  
1906 - (obj.isDictionaryOfType("/Sig") && obj.hasKey("/ByteRange") &&  
1907 - obj.hasKey("/Contents")))) {  
1908 - if constexpr (std::is_same_v<T, QPDFObjGen>) {  
1909 - result.push_back(og);  
1910 - } else if constexpr (std::is_same_v<T, bool>) {  
1911 - result[id + 1U] = true;  
1912 - }  
1913 - }  
1914 - }  
1915 - if (obj.isStream()) {  
1916 - QPDFObjectHandle dict = obj.getDict();  
1917 - std::set<std::string> keys = dict.getKeys();  
1918 - for (auto iter = keys.rbegin(); iter != keys.rend(); ++iter) {  
1919 - std::string const& key = *iter;  
1920 - QPDFObjectHandle value = dict.getKey(key);  
1921 - if (key == "/Length") {  
1922 - // omit stream lengths  
1923 - if (value.isIndirect()) {  
1924 - QTC::TC("qpdf", "QPDF exclude indirect length");  
1925 - }  
1926 - } else {  
1927 - queue.push_back(value);  
1928 - }  
1929 - }  
1930 - } else if (obj.isDictionary()) {  
1931 - std::set<std::string> keys = obj.getKeys();  
1932 - for (auto iter = keys.rbegin(); iter != keys.rend(); ++iter) {  
1933 - queue.push_back(obj.getKey(*iter));  
1934 - }  
1935 - } else if (obj.isArray()) {  
1936 - int n = obj.getArrayNItems();  
1937 - for (int i = 1; i <= n; ++i) {  
1938 - queue.push_back(obj.getArrayItem(n - i));  
1939 - }  
1940 - }  
1941 - }  
1942 -  
1943 - return result;  
1944 -}  
libqpdf/QPDF_optimization.cc
@@ -2,7 +2,7 @@ @@ -2,7 +2,7 @@
2 2
3 #include <qpdf/assert_debug.h> 3 #include <qpdf/assert_debug.h>
4 4
5 -#include <qpdf/QPDF_private.hh> 5 +#include <qpdf/QPDF.hh>
6 6
7 #include <qpdf/QPDFExc.hh> 7 #include <qpdf/QPDFExc.hh>
8 #include <qpdf/QPDFWriter_private.hh> 8 #include <qpdf/QPDFWriter_private.hh>
@@ -78,12 +78,6 @@ QPDF::optimize( @@ -78,12 +78,6 @@ QPDF::optimize(
78 optimize_internal(obj, true, skip_stream_parameters); 78 optimize_internal(obj, true, skip_stream_parameters);
79 } 79 }
80 80
81 -void  
82 -QPDF::optimize(QPDF::Objects const& objects)  
83 -{  
84 - optimize_internal(objects, false, nullptr);  
85 -}  
86 -  
87 template <typename T> 81 template <typename T>
88 void 82 void
89 QPDF::optimize_internal( 83 QPDF::optimize_internal(
@@ -121,13 +115,13 @@ QPDF::optimize_internal( @@ -121,13 +115,13 @@ QPDF::optimize_internal(
121 } 115 }
122 116
123 // Traverse document-level items 117 // Traverse document-level items
124 - for (auto const& key: m->objects.trailer().getKeys()) { 118 + for (auto const& key: m->trailer.getKeys()) {
125 if (key == "/Root") { 119 if (key == "/Root") {
126 // handled separately 120 // handled separately
127 } else { 121 } else {
128 updateObjectMaps( 122 updateObjectMaps(
129 ObjUser(ObjUser::ou_trailer_key, key), 123 ObjUser(ObjUser::ou_trailer_key, key),
130 - m->objects.trailer().getKey(key), 124 + m->trailer.getKey(key),
131 skip_stream_parameters); 125 skip_stream_parameters);
132 } 126 }
133 } 127 }
@@ -175,13 +169,13 @@ QPDF::pushInheritedAttributesToPage(bool allow_changes, bool warn_skipped_keys) @@ -175,13 +169,13 @@ QPDF::pushInheritedAttributesToPage(bool allow_changes, bool warn_skipped_keys)
175 // values for them. 169 // values for them.
176 std::map<std::string, std::vector<QPDFObjectHandle>> key_ancestors; 170 std::map<std::string, std::vector<QPDFObjectHandle>> key_ancestors;
177 pushInheritedAttributesToPageInternal( 171 pushInheritedAttributesToPageInternal(
178 - m->objects.trailer().getKey("/Root").getKey("/Pages"), 172 + m->trailer.getKey("/Root").getKey("/Pages"),
179 key_ancestors, 173 key_ancestors,
180 allow_changes, 174 allow_changes,
181 warn_skipped_keys); 175 warn_skipped_keys);
182 if (!key_ancestors.empty()) { 176 if (!key_ancestors.empty()) {
183 - throw std::logic_error(  
184 - "key_ancestors not empty after pushing inherited attributes to pages"); 177 + throw std::logic_error("key_ancestors not empty after"
  178 + " pushing inherited attributes to pages");
185 } 179 }
186 m->pushed_inherited_attributes_to_pages = true; 180 m->pushed_inherited_attributes_to_pages = true;
187 m->ever_pushed_inherited_attributes_to_pages = true; 181 m->ever_pushed_inherited_attributes_to_pages = true;
@@ -448,46 +442,3 @@ QPDF::filterCompressedObjects(QPDFWriter::ObjTable const& obj) @@ -448,46 +442,3 @@ QPDF::filterCompressedObjects(QPDFWriter::ObjTable const& obj)
448 m->obj_user_to_objects = t_obj_user_to_objects; 442 m->obj_user_to_objects = t_obj_user_to_objects;
449 m->object_to_obj_users = t_object_to_obj_users; 443 m->object_to_obj_users = t_object_to_obj_users;
450 } 444 }
451 -  
452 -void  
453 -QPDF::filterCompressedObjects(QPDF::Objects const& objects)  
454 -{  
455 - auto const& xref = objects.xref_table();  
456 - if (!xref.object_streams()) {  
457 - return;  
458 - }  
459 -  
460 - // Transform object_to_obj_users and obj_user_to_objects so that they refer only to uncompressed  
461 - // objects. If something is a user of a compressed object, then it is really a user of the  
462 - // object stream that contains it.  
463 -  
464 - std::map<ObjUser, std::set<QPDFObjGen>> t_obj_user_to_objects;  
465 - std::map<QPDFObjGen, std::set<ObjUser>> t_object_to_obj_users;  
466 -  
467 - for (auto const& i1: m->obj_user_to_objects) {  
468 - ObjUser const& ou = i1.first;  
469 - // Loop over objects.  
470 - for (auto const& og: i1.second) {  
471 - if (auto stream = xref.stream_number(og.getObj())) {  
472 - t_obj_user_to_objects[ou].insert(QPDFObjGen(stream, 0));  
473 - } else {  
474 - t_obj_user_to_objects[ou].insert(og);  
475 - }  
476 - }  
477 - }  
478 -  
479 - for (auto const& i1: m->object_to_obj_users) {  
480 - QPDFObjGen const& og = i1.first;  
481 - // Loop over obj_users.  
482 - for (auto const& ou: i1.second) {  
483 - if (auto stream = xref.stream_number(og.getObj())) {  
484 - t_object_to_obj_users[QPDFObjGen(stream, 0)].insert(ou);  
485 - } else {  
486 - t_object_to_obj_users[og].insert(ou);  
487 - }  
488 - }  
489 - }  
490 -  
491 - m->obj_user_to_objects = t_obj_user_to_objects;  
492 - m->object_to_obj_users = t_object_to_obj_users;  
493 -}  
libqpdf/QPDF_pages.cc
1 -#include <qpdf/QPDF_private.hh> 1 +#include <qpdf/QPDF.hh>
2 2
3 #include <qpdf/QPDFExc.hh> 3 #include <qpdf/QPDFExc.hh>
4 #include <qpdf/QTC.hh> 4 #include <qpdf/QTC.hh>
libqpdf/qpdf-c.cc
@@ -905,7 +905,7 @@ qpdf_oh @@ -905,7 +905,7 @@ qpdf_oh
905 qpdf_get_object_by_id(qpdf_data qpdf, int objid, int generation) 905 qpdf_get_object_by_id(qpdf_data qpdf, int objid, int generation)
906 { 906 {
907 QTC::TC("qpdf", "qpdf-c called qpdf_get_object_by_id"); 907 QTC::TC("qpdf", "qpdf-c called qpdf_get_object_by_id");
908 - return new_object(qpdf, qpdf->qpdf->getObject(objid, generation)); 908 + return new_object(qpdf, qpdf->qpdf->getObjectByID(objid, generation));
909 } 909 }
910 910
911 template <class RET> 911 template <class RET>
libqpdf/qpdf/ObjTable.hh
@@ -46,12 +46,6 @@ class ObjTable: public std::vector<T> @@ -46,12 +46,6 @@ class ObjTable: public std::vector<T>
46 } 46 }
47 47
48 inline T const& 48 inline T const&
49 - operator[](unsigned int idx) const  
50 - {  
51 - return element(idx);  
52 - }  
53 -  
54 - inline T const&  
55 operator[](QPDFObjGen og) const 49 operator[](QPDFObjGen og) const
56 { 50 {
57 return element(static_cast<size_t>(og.getObj())); 51 return element(static_cast<size_t>(og.getObj()));
libqpdf/qpdf/QPDFObject_private.hh
@@ -6,13 +6,14 @@ @@ -6,13 +6,14 @@
6 6
7 #include <qpdf/Constants.h> 7 #include <qpdf/Constants.h>
8 #include <qpdf/JSON.hh> 8 #include <qpdf/JSON.hh>
  9 +#include <qpdf/QPDF.hh>
9 #include <qpdf/QPDFValue.hh> 10 #include <qpdf/QPDFValue.hh>
10 -#include <qpdf/QPDF_private.hh>  
11 #include <qpdf/Types.h> 11 #include <qpdf/Types.h>
12 12
13 #include <string> 13 #include <string>
14 #include <string_view> 14 #include <string_view>
15 15
  16 +class QPDF;
16 class QPDFObjectHandle; 17 class QPDFObjectHandle;
17 18
18 class QPDFObject 19 class QPDFObject
libqpdf/qpdf/QPDF_objects.hh deleted
1 -#ifndef QPDF_OBJECTS_HH  
2 -#define QPDF_OBJECTS_HH  
3 -  
4 -#include <qpdf/QPDF.hh>  
5 -  
6 -#include <qpdf/QPDF_Null.hh>  
7 -#include <qpdf/QPDF_Unresolved.hh>  
8 -  
9 -#include <variant>  
10 -  
11 -// The Objects class is responsible for keeping track of all objects belonging to a QPDF instance,  
12 -// including loading it from an input source when required.  
13 -class QPDF::Objects  
14 -{  
15 - public:  
16 - // Xref_table encapsulates the pdf's xref table and trailer.  
17 - class Xref_table  
18 - {  
19 - public:  
20 - Xref_table(Objects& objects) :  
21 - qpdf(objects.qpdf),  
22 - objects(objects),  
23 - file(objects.file)  
24 - {  
25 - tokenizer.allowEOF();  
26 - }  
27 -  
28 - void initialize();  
29 - void initialize_empty();  
30 - void initialize_json();  
31 - void reconstruct(QPDFExc& e);  
32 - void show();  
33 - bool resolve();  
34 -  
35 - QPDFObjectHandle  
36 - trailer() noexcept  
37 - {  
38 - return trailer_;  
39 - }  
40 -  
41 - QPDFObjectHandle const&  
42 - trailer() const noexcept  
43 - {  
44 - return trailer_;  
45 - }  
46 -  
47 - void  
48 - trailer(QPDFObjectHandle&& oh)  
49 - {  
50 - trailer_ = std::move(oh);  
51 - }  
52 -  
53 - // Returns 0 if og is not in table.  
54 - size_t  
55 - type(QPDFObjGen og) const  
56 - {  
57 - int id = og.getObj();  
58 - if (id < 1 || static_cast<size_t>(id) >= table.size()) {  
59 - return 0;  
60 - }  
61 - auto& e = table[static_cast<size_t>(id)];  
62 - return e.gen() == og.getGen() ? e.type() : 0;  
63 - }  
64 -  
65 - // Returns 0 if og is not in table.  
66 - size_t  
67 - type(size_t id) const noexcept  
68 - {  
69 - if (id >= table.size()) {  
70 - return 0;  
71 - }  
72 - return table[id].type();  
73 - }  
74 -  
75 - // Returns 0 if og is not in table.  
76 - qpdf_offset_t  
77 - offset(QPDFObjGen og) const noexcept  
78 - {  
79 - int id = og.getObj();  
80 - if (id < 1 || static_cast<size_t>(id) >= table.size()) {  
81 - return 0;  
82 - }  
83 - return table[static_cast<size_t>(id)].offset();  
84 - }  
85 -  
86 - // Returns 0 if id is not in table.  
87 - int  
88 - stream_number(int id) const noexcept  
89 - {  
90 - if (id < 1 || static_cast<size_t>(id) >= table.size()) {  
91 - return 0;  
92 - }  
93 - return table[static_cast<size_t>(id)].stream_number();  
94 - }  
95 -  
96 - int  
97 - stream_index(int id) const noexcept  
98 - {  
99 - if (id < 1 || static_cast<size_t>(id) >= table.size()) {  
100 - return 0;  
101 - }  
102 - return table[static_cast<size_t>(id)].stream_index();  
103 - }  
104 -  
105 - QPDFObjGen at_offset(qpdf_offset_t offset) const noexcept;  
106 -  
107 - std::map<QPDFObjGen, QPDFXRefEntry> as_map() const;  
108 -  
109 - bool  
110 - object_streams() const noexcept  
111 - {  
112 - return object_streams_;  
113 - }  
114 -  
115 - // Return a vector of object id and stream number for each compressed object.  
116 - std::vector<std::pair<unsigned int, int>>  
117 - compressed_objects() const  
118 - {  
119 - if (!initialized()) {  
120 - throw std::logic_error("Xref_table::compressed_objects called before parsing.");  
121 - }  
122 -  
123 - std::vector<std::pair<unsigned int, int>> result;  
124 - result.reserve(table.size());  
125 -  
126 - unsigned int i{0};  
127 - for (auto const& item: table) {  
128 - if (item.type() == 2) {  
129 - result.emplace_back(i, item.stream_number());  
130 - }  
131 - ++i;  
132 - }  
133 - return result;  
134 - }  
135 -  
136 - // Temporary access to underlying table size  
137 - size_t  
138 - size() const noexcept  
139 - {  
140 - return table.size();  
141 - }  
142 -  
143 - void  
144 - ignore_streams(bool val) noexcept  
145 - {  
146 - ignore_streams_ = val;  
147 - }  
148 -  
149 - bool  
150 - initialized() const noexcept  
151 - {  
152 - return initialized_;  
153 - }  
154 -  
155 - void  
156 - attempt_recovery(bool val) noexcept  
157 - {  
158 - attempt_recovery_ = val;  
159 - }  
160 -  
161 - int  
162 - max_id() const noexcept  
163 - {  
164 - return max_id_;  
165 - }  
166 -  
167 - // For Linearization  
168 -  
169 - qpdf_offset_t  
170 - end_after_space(QPDFObjGen og)  
171 - {  
172 - auto& e = entry(toS(og.getObj()));  
173 - switch (e.type()) {  
174 - case 1:  
175 - return e.end_after_space_;  
176 - case 2:  
177 - {  
178 - auto es = entry(toS(e.stream_number()));  
179 - return es.type() == 1 ? es.end_after_space_ : 0;  
180 - }  
181 - default:  
182 - return 0;  
183 - }  
184 - }  
185 -  
186 - qpdf_offset_t  
187 - end_before_space(QPDFObjGen og)  
188 - {  
189 - auto& e = entry(toS(og.getObj()));  
190 - switch (e.type()) {  
191 - case 1:  
192 - return e.end_before_space_;  
193 - case 2:  
194 - {  
195 - auto es = entry(toS(e.stream_number()));  
196 - return es.type() == 1 ? es.end_before_space_ : 0;  
197 - }  
198 - default:  
199 - return 0;  
200 - }  
201 - }  
202 -  
203 - void  
204 - linearization_offsets(size_t id, qpdf_offset_t before, qpdf_offset_t after)  
205 - {  
206 - if (type(id)) {  
207 - table[id].end_before_space_ = before;  
208 - table[id].end_after_space_ = after;  
209 - }  
210 - }  
211 -  
212 - bool  
213 - uncompressed_after_compressed() const noexcept  
214 - {  
215 - return uncompressed_after_compressed_;  
216 - }  
217 -  
218 - // Actual value from file  
219 - qpdf_offset_t  
220 - first_item_offset() const noexcept  
221 - {  
222 - return first_item_offset_;  
223 - }  
224 -  
225 - private:  
226 - // Object, count, offset of first entry  
227 - typedef std::tuple<int, int, qpdf_offset_t> Subsection;  
228 -  
229 - struct Uncompressed  
230 - {  
231 - Uncompressed(qpdf_offset_t offset) :  
232 - offset(offset)  
233 - {  
234 - }  
235 - qpdf_offset_t offset;  
236 - };  
237 -  
238 - struct Compressed  
239 - {  
240 - Compressed(int stream_number, int stream_index) :  
241 - stream_number(stream_number),  
242 - stream_index(stream_index)  
243 - {  
244 - }  
245 - int stream_number{0};  
246 - int stream_index{0};  
247 - };  
248 -  
249 - typedef std::variant<std::monostate, Uncompressed, Compressed> Xref;  
250 -  
251 - struct Entry  
252 - {  
253 - Entry() = default;  
254 -  
255 - Entry(int gen, Xref entry) :  
256 - gen_(gen),  
257 - entry(entry)  
258 - {  
259 - }  
260 -  
261 - int  
262 - gen() const noexcept  
263 - {  
264 - return gen_;  
265 - }  
266 -  
267 - size_t  
268 - type() const noexcept  
269 - {  
270 - return entry.index();  
271 - }  
272 -  
273 - qpdf_offset_t  
274 - offset() const noexcept  
275 - {  
276 - return type() == 1 ? std::get<1>(entry).offset : 0;  
277 - }  
278 -  
279 - int  
280 - stream_number() const noexcept  
281 - {  
282 - return type() == 2 ? std::get<2>(entry).stream_number : 0;  
283 - }  
284 -  
285 - int  
286 - stream_index() const noexcept  
287 - {  
288 - return type() == 2 ? std::get<2>(entry).stream_index : 0;  
289 - }  
290 -  
291 - int gen_{0};  
292 - Xref entry;  
293 - qpdf_offset_t end_before_space_{0};  
294 - qpdf_offset_t end_after_space_{0};  
295 - };  
296 -  
297 - Entry&  
298 - entry(size_t id)  
299 - {  
300 - return id < table.size() ? table[id] : table[0];  
301 - }  
302 -  
303 - void read(qpdf_offset_t offset);  
304 -  
305 - // Methods to parse tables  
306 - qpdf_offset_t process_section(qpdf_offset_t offset);  
307 - std::vector<Subsection> subsections(std::string& line);  
308 - std::vector<Subsection> bad_subsections(std::string& line, qpdf_offset_t offset);  
309 - Subsection subsection(std::string const& line);  
310 - std::tuple<bool, qpdf_offset_t, int, char> read_entry();  
311 - std::tuple<bool, qpdf_offset_t, int, char> read_bad_entry();  
312 -  
313 - // Methods to parse streams  
314 - qpdf_offset_t read_stream(qpdf_offset_t offset);  
315 - qpdf_offset_t process_stream(qpdf_offset_t offset, QPDFObjectHandle& xref_stream);  
316 - std::pair<int, std::array<int, 3>>  
317 - process_W(QPDFObjectHandle& dict, std::function<QPDFExc(std::string_view)> damaged);  
318 - std::pair<int, size_t> process_Size(  
319 - QPDFObjectHandle& dict,  
320 - int entry_size,  
321 - std::function<QPDFExc(std::string_view)> damaged);  
322 - std::pair<int, std::vector<std::pair<int, int>>> process_Index(  
323 - QPDFObjectHandle& dict,  
324 - int max_num_entries,  
325 - std::function<QPDFExc(std::string_view)> damaged);  
326 -  
327 - QPDFObjectHandle read_trailer();  
328 -  
329 - QPDFTokenizer::Token  
330 - read_token(size_t max_len = 0)  
331 - {  
332 - return tokenizer.readToken(*file, "", true, max_len);  
333 - }  
334 -  
335 - // Methods to insert table entries  
336 - void insert(int obj, int f0, qpdf_offset_t f1, int f2);  
337 - void insert_free(QPDFObjGen);  
338 -  
339 - QPDFExc  
340 - damaged_pdf(std::string const& msg)  
341 - {  
342 - return qpdf.damagedPDF("", 0, msg);  
343 - }  
344 -  
345 - QPDFExc  
346 - damaged_table(std::string const& msg)  
347 - {  
348 - return qpdf.damagedPDF("xref table", msg);  
349 - }  
350 -  
351 - void  
352 - warn_damaged(std::string const& msg)  
353 - {  
354 - qpdf.warn(damaged_pdf(msg));  
355 - }  
356 -  
357 - QPDF& qpdf;  
358 - QPDF::Objects& objects;  
359 - InputSource* const& file;  
360 - QPDFTokenizer tokenizer;  
361 -  
362 - std::vector<Entry> table;  
363 - QPDFObjectHandle trailer_;  
364 -  
365 - bool attempt_recovery_{true};  
366 - bool initialized_{false};  
367 - bool ignore_streams_{false};  
368 - bool reconstructed_{false};  
369 - bool object_streams_{false};  
370 - // Before the xref table is initialized, max_id_ is an upper bound on the possible object  
371 - // ids that could be present in the PDF file. Once the trailer has been read, max_id_ is set  
372 - // to the value of /Size. If the file is damaged, max_id_ becomes the maximum object id in  
373 - // the xref table after reconstruction.  
374 - int max_id_{std::numeric_limits<int>::max() - 1};  
375 -  
376 - // Linearization data  
377 - bool uncompressed_after_compressed_{false};  
378 - qpdf_offset_t first_item_offset_{0}; // actual value from file  
379 - }; // Xref_table;  
380 -  
381 - ~Objects();  
382 -  
383 - Objects(QPDF& qpdf, QPDF::Members* m, InputSource* const& file) :  
384 - qpdf(qpdf),  
385 - file(file),  
386 - m(m),  
387 - xref(*this)  
388 - {  
389 - }  
390 -  
391 - Xref_table&  
392 - xref_table() noexcept  
393 - {  
394 - return xref;  
395 - }  
396 -  
397 - Xref_table const&  
398 - xref_table() const noexcept  
399 - {  
400 - return xref;  
401 - }  
402 -  
403 - QPDFObjectHandle  
404 - trailer() noexcept  
405 - {  
406 - return xref.trailer();  
407 - }  
408 -  
409 - QPDFObjectHandle const&  
410 - trailer() const noexcept  
411 - {  
412 - return xref.trailer();  
413 - }  
414 -  
415 - QPDFObjectHandle  
416 - get(QPDFObjGen og)  
417 - {  
418 - if (auto it = table.find(og); it != table.end()) {  
419 - return {it->second.object};  
420 - } else if (xref.initialized() && !xref.type(og)) {  
421 - return QPDF_Null::create();  
422 - } else {  
423 - auto result = table.try_emplace(og, QPDF_Unresolved::create(&qpdf, og));  
424 - return {result.first->second.object};  
425 - }  
426 - }  
427 -  
428 - QPDFObjectHandle  
429 - get(int id, int gen)  
430 - {  
431 - return get(QPDFObjGen(id, gen));  
432 - }  
433 -  
434 - std::vector<QPDFObjectHandle> all();  
435 -  
436 - void erase(QPDFObjGen og);  
437 -  
438 - void replace(QPDFObjGen og, QPDFObjectHandle oh);  
439 -  
440 - void swap(QPDFObjGen og1, QPDFObjGen og2);  
441 -  
442 - QPDFObjectHandle read(  
443 - bool attempt_recovery,  
444 - qpdf_offset_t offset,  
445 - std::string const& description,  
446 - QPDFObjGen exp_og,  
447 - QPDFObjGen& og,  
448 - bool skip_cache_if_in_xref);  
449 - QPDFObject* resolve(QPDFObjGen og);  
450 - void update_table(QPDFObjGen og, std::shared_ptr<QPDFObject> const& object);  
451 - QPDFObjGen next_id();  
452 - QPDFObjectHandle make_indirect(std::shared_ptr<QPDFObject> const& obj);  
453 - std::shared_ptr<QPDFObject> get_for_parser(int id, int gen, bool parse_pdf);  
454 - std::shared_ptr<QPDFObject> get_for_json(int id, int gen);  
455 -  
456 - // Get a list of objects that would be permitted in an object stream.  
457 - template <typename T>  
458 - std::vector<T> compressible();  
459 - std::vector<QPDFObjGen> compressible_vector();  
460 - std::vector<bool> compressible_set();  
461 -  
462 - // Used by QPDFWriter to determine the vector part of its object tables.  
463 - size_t table_size();  
464 -  
465 - private:  
466 - struct Entry  
467 - {  
468 - Entry() = default;  
469 -  
470 - Entry(std::shared_ptr<QPDFObject> object) :  
471 - object(object)  
472 - {  
473 - }  
474 -  
475 - std::shared_ptr<QPDFObject> object;  
476 - };  
477 -  
478 - bool cached(QPDFObjGen og);  
479 - bool unresolved(QPDFObjGen og);  
480 -  
481 - QPDFObjectHandle readObjectInStream(std::shared_ptr<InputSource>& input, int obj);  
482 - void resolveObjectsInStream(int obj_stream_number);  
483 - QPDFObjectHandle read_object(std::string const& description, QPDFObjGen og);  
484 - void read_stream(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset);  
485 - void validate_stream_line_end(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset);  
486 - size_t recover_stream_length(  
487 - std::shared_ptr<InputSource> input, QPDFObjGen og, qpdf_offset_t stream_offset);  
488 -  
489 - QPDF& qpdf;  
490 - InputSource* const& file;  
491 - QPDF::Members* m;  
492 - Xref_table xref;  
493 -  
494 - std::map<QPDFObjGen, Entry> table;  
495 -}; // Objects  
496 -  
497 -#endif // QPDF_OBJECTS_HH  
libqpdf/qpdf/QPDF_private.hh deleted
1 -#ifndef QPDF_PRIVATE_HH  
2 -#define QPDF_PRIVATE_HH  
3 -  
4 -#include <qpdf/QPDF.hh>  
5 -  
6 -#include <qpdf/QPDF_objects.hh>  
7 -  
8 -#include <variant>  
9 -  
10 -// StreamCopier class is restricted to QPDFObjectHandle so it can copy stream data.  
11 -class QPDF::StreamCopier  
12 -{  
13 - friend class QPDFObjectHandle;  
14 -  
15 - private:  
16 - static void  
17 - copyStreamData(QPDF* qpdf, QPDFObjectHandle const& dest, QPDFObjectHandle const& src)  
18 - {  
19 - qpdf->copyStreamData(dest, src);  
20 - }  
21 -};  
22 -  
23 -// The ParseGuard class allows QPDFParser to detect re-entrant parsing. It also provides  
24 -// special access to allow the parser to create unresolved objects and dangling references.  
25 -class QPDF::ParseGuard  
26 -{  
27 - friend class QPDFParser;  
28 -  
29 - private:  
30 - ParseGuard(QPDF* qpdf) :  
31 - qpdf(qpdf)  
32 - {  
33 - if (qpdf) {  
34 - qpdf->inParse(true);  
35 - }  
36 - }  
37 -  
38 - static std::shared_ptr<QPDFObject>  
39 - getObject(QPDF* qpdf, int id, int gen, bool parse_pdf)  
40 - {  
41 - return qpdf->objects().get_for_parser(id, gen, parse_pdf);  
42 - }  
43 -  
44 - ~ParseGuard()  
45 - {  
46 - if (qpdf) {  
47 - qpdf->inParse(false);  
48 - }  
49 - }  
50 - QPDF* qpdf;  
51 -};  
52 -  
53 -// Pipe class is restricted to QPDF_Stream.  
54 -class QPDF::Pipe  
55 -{  
56 - friend class QPDF_Stream;  
57 -  
58 - private:  
59 - static bool  
60 - pipeStreamData(  
61 - QPDF* qpdf,  
62 - QPDFObjGen const& og,  
63 - qpdf_offset_t offset,  
64 - size_t length,  
65 - QPDFObjectHandle dict,  
66 - Pipeline* pipeline,  
67 - bool suppress_warnings,  
68 - bool will_retry)  
69 - {  
70 - return qpdf->pipeStreamData(  
71 - og, offset, length, dict, pipeline, suppress_warnings, will_retry);  
72 - }  
73 -};  
74 -  
75 -class QPDF::ObjCopier  
76 -{  
77 - public:  
78 - std::map<QPDFObjGen, QPDFObjectHandle> object_map;  
79 - std::vector<QPDFObjectHandle> to_copy;  
80 - QPDFObjGen::set visiting;  
81 -};  
82 -  
83 -class QPDF::EncryptionParameters  
84 -{  
85 - friend class QPDF;  
86 -  
87 - public:  
88 - EncryptionParameters();  
89 -  
90 - private:  
91 - bool encrypted;  
92 - bool encryption_initialized;  
93 - int encryption_V;  
94 - int encryption_R;  
95 - bool encrypt_metadata;  
96 - std::map<std::string, encryption_method_e> crypt_filters;  
97 - encryption_method_e cf_stream;  
98 - encryption_method_e cf_string;  
99 - encryption_method_e cf_file;  
100 - std::string provided_password;  
101 - std::string user_password;  
102 - std::string encryption_key;  
103 - std::string cached_object_encryption_key;  
104 - QPDFObjGen cached_key_og;  
105 - bool user_password_matched;  
106 - bool owner_password_matched;  
107 -};  
108 -  
109 -class QPDF::ForeignStreamData  
110 -{  
111 - friend class QPDF;  
112 -  
113 - public:  
114 - ForeignStreamData(  
115 - std::shared_ptr<EncryptionParameters> encp,  
116 - std::shared_ptr<InputSource> file,  
117 - QPDFObjGen const& foreign_og,  
118 - qpdf_offset_t offset,  
119 - size_t length,  
120 - QPDFObjectHandle local_dict);  
121 -  
122 - private:  
123 - std::shared_ptr<EncryptionParameters> encp;  
124 - std::shared_ptr<InputSource> file;  
125 - QPDFObjGen foreign_og;  
126 - qpdf_offset_t offset;  
127 - size_t length;  
128 - QPDFObjectHandle local_dict;  
129 -};  
130 -  
131 -class QPDF::CopiedStreamDataProvider: public QPDFObjectHandle::StreamDataProvider  
132 -{  
133 - public:  
134 - CopiedStreamDataProvider(QPDF& destination_qpdf);  
135 - ~CopiedStreamDataProvider() override = default;  
136 - bool provideStreamData(  
137 - QPDFObjGen const& og, Pipeline* pipeline, bool suppress_warnings, bool will_retry) override;  
138 - void registerForeignStream(QPDFObjGen const& local_og, QPDFObjectHandle foreign_stream);  
139 - void registerForeignStream(QPDFObjGen const& local_og, std::shared_ptr<ForeignStreamData>);  
140 -  
141 - private:  
142 - QPDF& destination_qpdf;  
143 - std::map<QPDFObjGen, QPDFObjectHandle> foreign_streams;  
144 - std::map<QPDFObjGen, std::shared_ptr<ForeignStreamData>> foreign_stream_data;  
145 -};  
146 -  
147 -class QPDF::StringDecrypter: public QPDFObjectHandle::StringDecrypter  
148 -{  
149 - friend class QPDF;  
150 -  
151 - public:  
152 - StringDecrypter(QPDF* qpdf, QPDFObjGen const& og);  
153 - ~StringDecrypter() override = default;  
154 - void decryptString(std::string& val) override;  
155 -  
156 - private:  
157 - QPDF* qpdf;  
158 - QPDFObjGen og;  
159 -};  
160 -  
161 -// PDF 1.4: Table F.4  
162 -struct QPDF::HPageOffsetEntry  
163 -{  
164 - int delta_nobjects{0}; // 1  
165 - qpdf_offset_t delta_page_length{0}; // 2  
166 - // vectors' sizes = nshared_objects  
167 - int nshared_objects{0}; // 3  
168 - std::vector<int> shared_identifiers; // 4  
169 - std::vector<int> shared_numerators; // 5  
170 - qpdf_offset_t delta_content_offset{0}; // 6  
171 - qpdf_offset_t delta_content_length{0}; // 7  
172 -};  
173 -  
174 -// PDF 1.4: Table F.3  
175 -struct QPDF::HPageOffset  
176 -{  
177 - int min_nobjects{0}; // 1  
178 - qpdf_offset_t first_page_offset{0}; // 2  
179 - int nbits_delta_nobjects{0}; // 3  
180 - int min_page_length{0}; // 4  
181 - int nbits_delta_page_length{0}; // 5  
182 - int min_content_offset{0}; // 6  
183 - int nbits_delta_content_offset{0}; // 7  
184 - int min_content_length{0}; // 8  
185 - int nbits_delta_content_length{0}; // 9  
186 - int nbits_nshared_objects{0}; // 10  
187 - int nbits_shared_identifier{0}; // 11  
188 - int nbits_shared_numerator{0}; // 12  
189 - int shared_denominator{0}; // 13  
190 - // vector size is npages  
191 - std::vector<HPageOffsetEntry> entries;  
192 -};  
193 -  
194 -// PDF 1.4: Table F.6  
195 -struct QPDF::HSharedObjectEntry  
196 -{  
197 - // Item 3 is a 128-bit signature (unsupported by Acrobat)  
198 - int delta_group_length{0}; // 1  
199 - int signature_present{0}; // 2 -- always 0  
200 - int nobjects_minus_one{0}; // 4 -- always 0  
201 -};  
202 -  
203 -// PDF 1.4: Table F.5  
204 -struct QPDF::HSharedObject  
205 -{  
206 - int first_shared_obj{0}; // 1  
207 - qpdf_offset_t first_shared_offset{0}; // 2  
208 - int nshared_first_page{0}; // 3  
209 - int nshared_total{0}; // 4  
210 - int nbits_nobjects{0}; // 5  
211 - int min_group_length{0}; // 6  
212 - int nbits_delta_group_length{0}; // 7  
213 - // vector size is nshared_total  
214 - std::vector<HSharedObjectEntry> entries;  
215 -};  
216 -  
217 -// PDF 1.4: Table F.9  
218 -struct QPDF::HGeneric  
219 -{  
220 - int first_object{0}; // 1  
221 - qpdf_offset_t first_object_offset{0}; // 2  
222 - int nobjects{0}; // 3  
223 - int group_length{0}; // 4  
224 -};  
225 -  
226 -// Other linearization data structures  
227 -  
228 -// Initialized from Linearization Parameter dictionary  
229 -struct QPDF::LinParameters  
230 -{  
231 - qpdf_offset_t file_size{0}; // /L  
232 - int first_page_object{0}; // /O  
233 - qpdf_offset_t first_page_end{0}; // /E  
234 - int npages{0}; // /N  
235 - qpdf_offset_t xref_zero_offset{0}; // /T  
236 - int first_page{0}; // /P  
237 - qpdf_offset_t H_offset{0}; // offset of primary hint stream  
238 - qpdf_offset_t H_length{0}; // length of primary hint stream  
239 -};  
240 -  
241 -// Computed hint table value data structures. These tables contain the computed values on which  
242 -// the hint table values are based. They exclude things like number of bits and store actual  
243 -// values instead of mins and deltas. File offsets are also absolute rather than being offset  
244 -// by the size of the primary hint table. We populate the hint table structures from these  
245 -// during writing and compare the hint table values with these during validation. We ignore  
246 -// some values for various reasons described in the code. Those values are omitted from these  
247 -// structures. Note also that object numbers are object numbers from the input file, not the  
248 -// output file.  
249 -  
250 -// Naming convention: CHSomething is analogous to HSomething above. "CH" is computed hint.  
251 -  
252 -struct QPDF::CHPageOffsetEntry  
253 -{  
254 - int nobjects{0};  
255 - int nshared_objects{0};  
256 - // vectors' sizes = nshared_objects  
257 - std::vector<int> shared_identifiers;  
258 -};  
259 -  
260 -struct QPDF::CHPageOffset  
261 -{  
262 - // vector size is npages  
263 - std::vector<CHPageOffsetEntry> entries;  
264 -};  
265 -  
266 -struct QPDF::CHSharedObjectEntry  
267 -{  
268 - CHSharedObjectEntry(int object) :  
269 - object(object)  
270 - {  
271 - }  
272 -  
273 - int object;  
274 -};  
275 -  
276 -// PDF 1.4: Table F.5  
277 -struct QPDF::CHSharedObject  
278 -{  
279 - int first_shared_obj{0};  
280 - int nshared_first_page{0};  
281 - int nshared_total{0};  
282 - // vector size is nshared_total  
283 - std::vector<CHSharedObjectEntry> entries;  
284 -};  
285 -  
286 -// No need for CHGeneric -- HGeneric is fine as is.  
287 -  
288 -// Data structures to support optimization -- implemented in QPDF_optimization.cc  
289 -  
290 -class QPDF::ObjUser  
291 -{  
292 - public:  
293 - enum user_e { ou_bad, ou_page, ou_thumb, ou_trailer_key, ou_root_key, ou_root };  
294 -  
295 - // type is set to ou_bad  
296 - ObjUser();  
297 -  
298 - // type must be ou_root  
299 - ObjUser(user_e type);  
300 -  
301 - // type must be one of ou_page or ou_thumb  
302 - ObjUser(user_e type, int pageno);  
303 -  
304 - // type must be one of ou_trailer_key or ou_root_key  
305 - ObjUser(user_e type, std::string const& key);  
306 -  
307 - bool operator<(ObjUser const&) const;  
308 -  
309 - user_e ou_type;  
310 - int pageno; // if ou_page;  
311 - std::string key; // if ou_trailer_key or ou_root_key  
312 -};  
313 -  
314 -struct QPDF::UpdateObjectMapsFrame  
315 -{  
316 - UpdateObjectMapsFrame(ObjUser const& ou, QPDFObjectHandle oh, bool top);  
317 -  
318 - ObjUser const& ou;  
319 - QPDFObjectHandle oh;  
320 - bool top;  
321 -};  
322 -  
323 -class QPDF::PatternFinder: public InputSource::Finder  
324 -{  
325 - public:  
326 - PatternFinder(QPDF& qpdf, bool (QPDF::*checker)()) :  
327 - qpdf(qpdf),  
328 - checker(checker)  
329 - {  
330 - }  
331 - ~PatternFinder() override = default;  
332 - bool  
333 - check() override  
334 - {  
335 - return (this->qpdf.*checker)();  
336 - }  
337 -  
338 - private:  
339 - QPDF& qpdf;  
340 - bool (QPDF::*checker)();  
341 -};  
342 -  
343 -class QPDF::Members  
344 -{  
345 - friend class QPDF;  
346 - friend class ResolveRecorder;  
347 -  
348 - public:  
349 - QPDF_DLL  
350 - ~Members() = default;  
351 -  
352 - private:  
353 - Members(QPDF& qpdf);  
354 - Members(Members const&) = delete;  
355 -  
356 - std::shared_ptr<QPDFLogger> log;  
357 - unsigned long long unique_id{0};  
358 - QPDFTokenizer tokenizer;  
359 - // Filename to use if there is no input PDF  
360 - std::string no_input_name{"closed input source"};  
361 - // If file_sp is updated, file must also be updated.  
362 - std::shared_ptr<InputSource> file_sp;  
363 - InputSource* file;  
364 - std::string last_object_description;  
365 - bool provided_password_is_hex_key{false};  
366 - bool suppress_warnings{false};  
367 - size_t max_warnings{0};  
368 - bool attempt_recovery{true};  
369 - bool check_mode{false};  
370 - std::shared_ptr<EncryptionParameters> encp;  
371 - std::string pdf_version;  
372 - Objects objects;  
373 - std::set<QPDFObjGen> resolving;  
374 - std::vector<QPDFObjectHandle> all_pages;  
375 - bool invalid_page_found{false};  
376 - std::map<QPDFObjGen, int> pageobj_to_pages_pos;  
377 - bool pushed_inherited_attributes_to_pages{false};  
378 - bool ever_pushed_inherited_attributes_to_pages{false};  
379 - bool ever_called_get_all_pages{false};  
380 - std::vector<QPDFExc> warnings;  
381 - std::map<unsigned long long, ObjCopier> object_copiers;  
382 - std::shared_ptr<QPDFObjectHandle::StreamDataProvider> copied_streams;  
383 - // copied_stream_data_provider is owned by copied_streams  
384 - CopiedStreamDataProvider* copied_stream_data_provider{nullptr};  
385 - bool fixed_dangling_refs{false};  
386 - bool immediate_copy_from{false};  
387 - bool in_parse{false};  
388 - std::set<int> resolved_object_streams;  
389 -  
390 - // Linearization data  
391 - bool linearization_warnings{false};  
392 -  
393 - // Linearization parameter dictionary and hint table data: may be read from file or computed  
394 - // prior to writing a linearized file  
395 - QPDFObjectHandle lindict;  
396 - LinParameters linp;  
397 - HPageOffset page_offset_hints;  
398 - HSharedObject shared_object_hints;  
399 - HGeneric outline_hints;  
400 -  
401 - // Computed linearization data: used to populate above tables during writing and to compare  
402 - // with them during validation. c_ means computed.  
403 - LinParameters c_linp;  
404 - CHPageOffset c_page_offset_data;  
405 - CHSharedObject c_shared_object_data;  
406 - HGeneric c_outline_data;  
407 -  
408 - // Object ordering data for linearized files: initialized by calculateLinearizationData().  
409 - // Part numbers refer to the PDF 1.4 specification.  
410 - std::vector<QPDFObjectHandle> part4;  
411 - std::vector<QPDFObjectHandle> part6;  
412 - std::vector<QPDFObjectHandle> part7;  
413 - std::vector<QPDFObjectHandle> part8;  
414 - std::vector<QPDFObjectHandle> part9;  
415 -  
416 - // Optimization data  
417 - std::map<ObjUser, std::set<QPDFObjGen>> obj_user_to_objects;  
418 - std::map<QPDFObjGen, std::set<ObjUser>> object_to_obj_users;  
419 -};  
420 -  
421 -inline QPDF::Objects&  
422 -QPDF::objects() noexcept  
423 -{  
424 - return m->objects;  
425 -}  
426 -  
427 -inline QPDF::Objects const&  
428 -QPDF::objects() const noexcept  
429 -{  
430 - return m->objects;  
431 -}  
432 -  
433 -// The Resolver class is restricted to QPDFObject so that only it can resolve indirect  
434 -// references.  
435 -class QPDF::Resolver  
436 -{  
437 - friend class QPDFObject;  
438 - friend class QPDF_Unresolved;  
439 -  
440 - private:  
441 - static QPDFObject*  
442 - resolved(QPDF* qpdf, QPDFObjGen og)  
443 - {  
444 - return qpdf->m->objects.resolve(og);  
445 - }  
446 -};  
447 -  
448 -// JobSetter class is restricted to QPDFJob.  
449 -class QPDF::JobSetter  
450 -{  
451 - friend class QPDFJob;  
452 -  
453 - private:  
454 - // Enable enhanced warnings for pdf file checking.  
455 - static void  
456 - setCheckMode(QPDF& qpdf, bool val)  
457 - {  
458 - qpdf.m->check_mode = val;  
459 - }  
460 -};  
461 -  
462 -class QPDF::ResolveRecorder  
463 -{  
464 - public:  
465 - ResolveRecorder(QPDF* qpdf, QPDFObjGen const& og) :  
466 - qpdf(qpdf),  
467 - iter(qpdf->m->resolving.insert(og).first)  
468 - {  
469 - }  
470 - virtual ~ResolveRecorder()  
471 - {  
472 - this->qpdf->m->resolving.erase(iter);  
473 - }  
474 -  
475 - private:  
476 - QPDF* qpdf;  
477 - std::set<QPDFObjGen>::const_iterator iter;  
478 -};  
479 -  
480 -// Writer class is restricted to QPDFWriter so that only it can call certain methods.  
481 -class QPDF::Writer  
482 -{  
483 - friend class QPDFWriter;  
484 -  
485 - private:  
486 - static void  
487 - optimize(  
488 - QPDF& qpdf,  
489 - QPDFWriter::ObjTable const& obj,  
490 - std::function<int(QPDFObjectHandle&)> skip_stream_parameters)  
491 - {  
492 - return qpdf.optimize(obj, skip_stream_parameters);  
493 - }  
494 -  
495 - static void  
496 - getLinearizedParts(  
497 - QPDF& qpdf,  
498 - QPDFWriter::ObjTable const& obj,  
499 - std::vector<QPDFObjectHandle>& part4,  
500 - std::vector<QPDFObjectHandle>& part6,  
501 - std::vector<QPDFObjectHandle>& part7,  
502 - std::vector<QPDFObjectHandle>& part8,  
503 - std::vector<QPDFObjectHandle>& part9)  
504 - {  
505 - qpdf.getLinearizedParts(obj, part4, part6, part7, part8, part9);  
506 - }  
507 -  
508 - static void  
509 - generateHintStream(  
510 - QPDF& qpdf,  
511 - QPDFWriter::NewObjTable const& new_obj,  
512 - QPDFWriter::ObjTable const& obj,  
513 - std::shared_ptr<Buffer>& hint_stream,  
514 - int& S,  
515 - int& O,  
516 - bool compressed)  
517 - {  
518 - return qpdf.generateHintStream(new_obj, obj, hint_stream, S, O, compressed);  
519 - }  
520 -  
521 - static std::vector<QPDFObjGen>  
522 - getCompressibleObjGens(QPDF& qpdf)  
523 - {  
524 - return qpdf.objects().compressible_vector();  
525 - }  
526 -  
527 - static std::vector<bool>  
528 - getCompressibleObjSet(QPDF& qpdf)  
529 - {  
530 - return qpdf.objects().compressible_set();  
531 - }  
532 -  
533 - static Objects::Xref_table const&  
534 - getXRefTable(QPDF& qpdf)  
535 - {  
536 - return qpdf.objects().xref_table();  
537 - }  
538 -  
539 - static size_t  
540 - tableSize(QPDF& qpdf)  
541 - {  
542 - return qpdf.objects().table_size();  
543 - }  
544 -};  
545 -  
546 -#endif // QPDF_PRIVATE_HH  
libqpdf/qpdf/qpdf-c_impl.hh
@@ -16,7 +16,7 @@ struct _qpdf_data @@ -16,7 +16,7 @@ struct _qpdf_data
16 _qpdf_data() = default; 16 _qpdf_data() = default;
17 17
18 _qpdf_data(std::unique_ptr<QPDF>&& qpdf) : 18 _qpdf_data(std::unique_ptr<QPDF>&& qpdf) :
19 - qpdf(std::move(qpdf)){}; 19 + qpdf(std::move(qpdf)) {};
20 20
21 ~_qpdf_data() = default; 21 ~_qpdf_data() = default;
22 22
qpdf/qpdf.testcov
@@ -48,6 +48,7 @@ QPDFWriter encrypted hint stream 0 @@ -48,6 +48,7 @@ QPDFWriter encrypted hint stream 0
48 QPDF opt inherited scalar 0 48 QPDF opt inherited scalar 0
49 QPDF xref reused object 0 49 QPDF xref reused object 0
50 QPDF xref gen > 0 1 50 QPDF xref gen > 0 1
  51 +QPDF xref size mismatch 0
51 QPDF not a pdf file 0 52 QPDF not a pdf file 0
52 QPDF can't find startxref 0 53 QPDF can't find startxref 0
53 QPDF invalid xref 0 54 QPDF invalid xref 0
@@ -55,7 +56,6 @@ QPDF invalid xref entry 0 @@ -55,7 +56,6 @@ QPDF invalid xref entry 0
55 QPDF missing trailer 0 56 QPDF missing trailer 0
56 QPDF trailer lacks size 0 57 QPDF trailer lacks size 0
57 QPDF trailer size not integer 0 58 QPDF trailer size not integer 0
58 -QPDF trailer size impossibly large 0  
59 QPDF trailer prev not integer 0 59 QPDF trailer prev not integer 0
60 QPDFParser bad brace 0 60 QPDFParser bad brace 0
61 QPDFParser bad brace in parseRemainder 0 61 QPDFParser bad brace in parseRemainder 0
@@ -105,6 +105,7 @@ QPDFWriter not recompressing /FlateDecode 0 @@ -105,6 +105,7 @@ QPDFWriter not recompressing /FlateDecode 0
105 QPDF_encryption xref stream from encrypted file 0 105 QPDF_encryption xref stream from encrypted file 0
106 QPDFJob unable to filter 0 106 QPDFJob unable to filter 0
107 QUtil non-trivial UTF-16 0 107 QUtil non-trivial UTF-16 0
  108 +QPDF xref overwrite object 0
108 QPDF xref overwrite invalid objgen 0 109 QPDF xref overwrite invalid objgen 0
109 QPDF decoding error warning 0 110 QPDF decoding error warning 0
110 qpdf-c called qpdf_init 0 111 qpdf-c called qpdf_init 0
@@ -436,6 +437,7 @@ QPDF xref skipped space 0 @@ -436,6 +437,7 @@ QPDF xref skipped space 0
436 QPDF eof skipping spaces before xref 1 437 QPDF eof skipping spaces before xref 1
437 QPDF_encryption user matches owner V < 5 0 438 QPDF_encryption user matches owner V < 5 0
438 QPDF_encryption same password 1 439 QPDF_encryption same password 1
  440 +QPDFWriter stream in ostream 0
439 QPDFParser duplicate dict key 0 441 QPDFParser duplicate dict key 0
440 QPDFWriter no encryption sig contents 0 442 QPDFWriter no encryption sig contents 0
441 QPDFPageObjectHelper colorspace lookup 0 443 QPDFPageObjectHelper colorspace lookup 0
qpdf/qtest/qpdf/bad12-recover.out
  1 +WARNING: bad12.pdf: reported number of objects (9) is not one plus the highest object number (7)
1 WARNING: bad12.pdf (object 2 0, offset 128): expected endobj 2 WARNING: bad12.pdf (object 2 0, offset 128): expected endobj
2 /QTest is implicit 3 /QTest is implicit
3 /QTest is direct and has type null (2) 4 /QTest is direct and has type null (2)
qpdf/qtest/qpdf/bad12.out
  1 +WARNING: bad12.pdf: reported number of objects (9) is not one plus the highest object number (7)
1 WARNING: bad12.pdf (object 2 0, offset 128): expected endobj 2 WARNING: bad12.pdf (object 2 0, offset 128): expected endobj
2 /QTest is implicit 3 /QTest is implicit
3 /QTest is direct and has type null (2) 4 /QTest is direct and has type null (2)
qpdf/qtest/qpdf/fuzz-16214.out
@@ -11,9 +11,11 @@ WARNING: fuzz-16214.pdf (object 1 0, offset 7189): expected n n obj @@ -11,9 +11,11 @@ WARNING: fuzz-16214.pdf (object 1 0, offset 7189): expected n n obj
11 WARNING: fuzz-16214.pdf: Attempting to reconstruct cross-reference table 11 WARNING: fuzz-16214.pdf: Attempting to reconstruct cross-reference table
12 WARNING: fuzz-16214.pdf (offset 7207): error decoding stream data for object 2 0: stream inflate: inflate: data: invalid code lengths set 12 WARNING: fuzz-16214.pdf (offset 7207): error decoding stream data for object 2 0: stream inflate: inflate: data: invalid code lengths set
13 WARNING: fuzz-16214.pdf (offset 7207): getStreamData called on unfilterable stream 13 WARNING: fuzz-16214.pdf (offset 7207): getStreamData called on unfilterable stream
14 -WARNING: fuzz-16214.pdf (object 7 0, offset 7207): supposed object stream 5 has wrong type  
15 -WARNING: fuzz-16214.pdf (object 7 0, offset 7207): object stream 5 has incorrect keys 14 +WARNING: fuzz-16214.pdf (object 8 0, offset 7207): supposed object stream 5 has wrong type
  15 +WARNING: fuzz-16214.pdf (object 8 0, offset 7207): object stream 5 has incorrect keys
16 WARNING: fuzz-16214.pdf (object 21 0, offset 3639): expected endstream 16 WARNING: fuzz-16214.pdf (object 21 0, offset 3639): expected endstream
17 WARNING: fuzz-16214.pdf (object 21 0, offset 3112): attempting to recover stream length 17 WARNING: fuzz-16214.pdf (object 21 0, offset 3112): attempting to recover stream length
18 WARNING: fuzz-16214.pdf (object 21 0, offset 3112): recovered stream length: 340 18 WARNING: fuzz-16214.pdf (object 21 0, offset 3112): recovered stream length: 340
  19 +WARNING: fuzz-16214.pdf, stream object 8 0: stream found inside object stream; treating as null
  20 +WARNING: fuzz-16214.pdf, stream object 8 0: stream found inside object stream; treating as null
19 qpdf: operation succeeded with warnings; resulting file may have some problems 21 qpdf: operation succeeded with warnings; resulting file may have some problems
qpdf/qtest/qpdf/issue-147.out
@@ -2,6 +2,6 @@ WARNING: issue-147.pdf: can't find PDF header @@ -2,6 +2,6 @@ WARNING: issue-147.pdf: can't find PDF header
2 WARNING: issue-147.pdf: file is damaged 2 WARNING: issue-147.pdf: file is damaged
3 WARNING: issue-147.pdf: can't find startxref 3 WARNING: issue-147.pdf: can't find startxref
4 WARNING: issue-147.pdf: Attempting to reconstruct cross-reference table 4 WARNING: issue-147.pdf: Attempting to reconstruct cross-reference table
5 -WARNING: issue-147.pdf: ignoring object with impossibly large id 62  
6 WARNING: issue-147.pdf (trailer, offset 9): expected dictionary key but found non-name object; inserting key /QPDFFake1 5 WARNING: issue-147.pdf (trailer, offset 9): expected dictionary key but found non-name object; inserting key /QPDFFake1
7 -qpdf: issue-147.pdf: unable to find /Root dictionary 6 +WARNING: issue-147.pdf: ignoring object with impossibly large id 62
  7 +qpdf: issue-147.pdf: unable to find objects while recovering damaged file
qpdf/qtest/qpdf/issue-335b.out
1 WARNING: issue-335b.pdf: can't find PDF header 1 WARNING: issue-335b.pdf: can't find PDF header
2 WARNING: issue-335b.pdf: file is damaged 2 WARNING: issue-335b.pdf: file is damaged
3 -WARNING: issue-335b.pdf (xref table, offset 11): xref table subsection header contains impossibly large entry 3 +WARNING: issue-335b.pdf (xref table, offset 23): invalid xref entry (obj=6)
4 WARNING: issue-335b.pdf: Attempting to reconstruct cross-reference table 4 WARNING: issue-335b.pdf: Attempting to reconstruct cross-reference table
5 qpdf: issue-335b.pdf: unable to find trailer dictionary while recovering damaged file 5 qpdf: issue-335b.pdf: unable to find trailer dictionary while recovering damaged file
qpdf/qtest/qpdf/issue-fuzz.out deleted
1 -WARNING: issue-fuzz.pdf: can't find PDF header  
2 -WARNING: issue-fuzz.pdf (xref table, offset 19): accepting invalid xref table entry  
3 -WARNING: issue-fuzz.pdf (trailer, offset 36): unknown token while reading object; treating as string  
4 -WARNING: issue-fuzz.pdf (trailer, offset 53): unexpected >  
5 -WARNING: issue-fuzz.pdf (trailer, offset 54): unknown token while reading object; treating as string  
6 -WARNING: issue-fuzz.pdf (trailer, offset 58): unknown token while reading object; treating as string  
7 -WARNING: issue-fuzz.pdf (trailer, offset 72): unknown token while reading object; treating as string  
8 -WARNING: issue-fuzz.pdf (trailer, offset 36): dictionary ended prematurely; using null as value for last key  
9 -WARNING: issue-fuzz.pdf (trailer, offset 36): expected dictionary key but found non-name object; inserting key /QPDFFake1  
10 -WARNING: issue-fuzz.pdf (trailer, offset 36): expected dictionary key but found non-name object; inserting key /QPDFFake2  
11 -WARNING: issue-fuzz.pdf (trailer, offset 36): expected dictionary key but found non-name object; inserting key /QPDFFake3  
12 -WARNING: issue-fuzz.pdf (trailer, offset 36): expected dictionary key but found non-name object; inserting key /QPDFFake4  
13 -WARNING: issue-fuzz.pdf (trailer, offset 36): expected dictionary key but found non-name object; inserting key /QPDFFake5  
14 -WARNING: issue-fuzz.pdf (trailer, offset 36): expected dictionary key but found non-name object; inserting key /QPDFFake6  
15 -WARNING: issue-fuzz.pdf (trailer, offset 36): expected dictionary key but found non-name object; inserting key /QPDFFake7  
16 -WARNING: issue-fuzz.pdf: file is damaged  
17 -WARNING: issue-fuzz.pdf (trailer, offset 32): /Size key in trailer dictionary is impossibly large  
18 -WARNING: issue-fuzz.pdf: Attempting to reconstruct cross-reference table  
19 -qpdf: issue-fuzz.pdf: unable to find /Root dictionary  
qpdf/qtest/qpdf/issue-fuzz.pdf deleted
No preview for this file type
qpdf/qtest/qpdf/recover-xref-stream.out
1 WARNING: recover-xref-stream.pdf: file is damaged 1 WARNING: recover-xref-stream.pdf: file is damaged
2 WARNING: recover-xref-stream.pdf: can't find startxref 2 WARNING: recover-xref-stream.pdf: can't find startxref
3 WARNING: recover-xref-stream.pdf: Attempting to reconstruct cross-reference table 3 WARNING: recover-xref-stream.pdf: Attempting to reconstruct cross-reference table
  4 +WARNING: recover-xref-stream.pdf: reported number of objects (14) is not one plus the highest object number (15)
4 qpdf: operation succeeded with warnings; resulting file may have some problems 5 qpdf: operation succeeded with warnings; resulting file may have some problems
qpdf/qtest/qpdf/recover-xref-stream.pdf
No preview for this file type
qpdf/qtest/qpdf/xref-errors.out
@@ -3,11 +3,6 @@ WARNING: xref-errors.pdf (xref table, offset 606): accepting invalid xref table @@ -3,11 +3,6 @@ WARNING: xref-errors.pdf (xref table, offset 606): accepting invalid xref table
3 WARNING: xref-errors.pdf (xref table, offset 627): accepting invalid xref table entry 3 WARNING: xref-errors.pdf (xref table, offset 627): accepting invalid xref table entry
4 WARNING: xref-errors.pdf (xref table, offset 648): accepting invalid xref table entry 4 WARNING: xref-errors.pdf (xref table, offset 648): accepting invalid xref table entry
5 WARNING: xref-errors.pdf (xref table, offset 667): accepting invalid xref table entry 5 WARNING: xref-errors.pdf (xref table, offset 667): accepting invalid xref table entry
6 -WARNING: xref-errors.pdf (xref table, offset 585): accepting invalid xref table entry  
7 -WARNING: xref-errors.pdf (xref table, offset 606): accepting invalid xref table entry  
8 -WARNING: xref-errors.pdf (xref table, offset 627): accepting invalid xref table entry  
9 -WARNING: xref-errors.pdf (xref table, offset 648): accepting invalid xref table entry  
10 -WARNING: xref-errors.pdf (xref table, offset 667): accepting invalid xref table entry  
11 checking xref-errors.pdf 6 checking xref-errors.pdf
12 PDF Version: 1.3 7 PDF Version: 1.3
13 File is not encrypted 8 File is not encrypted
qpdf/qtest/specific-bugs.test
@@ -16,7 +16,7 @@ my $td = new TestDriver('specific-bugs'); @@ -16,7 +16,7 @@ my $td = new TestDriver('specific-bugs');
16 16
17 # The number is the github issue number in which the bug was reported. 17 # The number is the github issue number in which the bug was reported.
18 my @bug_tests = ( 18 my @bug_tests = (
19 -# ["51", "resolve loop", 2], 19 + ["51", "resolve loop", 2],
20 ["99", "object 0", 2], 20 ["99", "object 0", 2],
21 ["99b", "object 0", 2], 21 ["99b", "object 0", 2],
22 ["100", "xref reconstruction loop", 2], 22 ["100", "xref reconstruction loop", 2],
@@ -28,7 +28,7 @@ my @bug_tests = ( @@ -28,7 +28,7 @@ my @bug_tests = (
28 ["106", "zlib data error", 3], 28 ["106", "zlib data error", 3],
29 ["141a", "/W entry size 0", 2], 29 ["141a", "/W entry size 0", 2],
30 ["141b", "/W entry size 0", 2], 30 ["141b", "/W entry size 0", 2],
31 -# ["143", "self-referential ostream", 2, "--preserve-unreferenced"], 31 + ["143", "self-referential ostream", 2, "--preserve-unreferenced"],
32 ["146", "very deeply nested array", 2], 32 ["146", "very deeply nested array", 2],
33 ["147", "previously caused memory error", 2], 33 ["147", "previously caused memory error", 2],
34 ["148", "free memory on bad flate", 2], 34 ["148", "free memory on bad flate", 2],
@@ -38,8 +38,7 @@ my @bug_tests = ( @@ -38,8 +38,7 @@ my @bug_tests = (
38 ["263", "empty xref stream", 2], 38 ["263", "empty xref stream", 2],
39 ["335a", "ozz-fuzz-12152", 2], 39 ["335a", "ozz-fuzz-12152", 2],
40 ["335b", "ozz-fuzz-14845", 2], 40 ["335b", "ozz-fuzz-14845", 2],
41 - ["fuzz", "impossibly large trailer /Size"],  
42 -# ["fuzz-16214", "stream in object stream", 3, "--preserve-unreferenced"], 41 + ["fuzz-16214", "stream in object stream", 3, "--preserve-unreferenced"],
43 # When adding to this list, consider adding to CORPUS_FROM_TEST in 42 # When adding to this list, consider adding to CORPUS_FROM_TEST in
44 # fuzz/CMakeLists.txt and updating the count in 43 # fuzz/CMakeLists.txt and updating the count in
45 # fuzz/qtest/fuzz.test. 44 # fuzz/qtest/fuzz.test.