Commit 44cd31b96291ba114bc48870be0928b0f75e2dc1
Committed by
m-holger
1 parent
f2ccff52
Add Doxygen-style comments for all Parser methods
Co-authored-by: m-holger <34626170+m-holger@users.noreply.github.com>
Showing
1 changed file
with
155 additions
and
35 deletions
libqpdf/qpdf/QPDFParser.hh
| @@ -15,9 +15,16 @@ using namespace qpdf::global; | @@ -15,9 +15,16 @@ using namespace qpdf::global; | ||
| 15 | 15 | ||
| 16 | namespace qpdf::impl | 16 | namespace qpdf::impl |
| 17 | { | 17 | { |
| 18 | + /// @class Parser | ||
| 19 | + /// @brief Internal parser for PDF objects and content streams. | ||
| 20 | + /// @par | ||
| 21 | + /// The Parser class provides static methods for parsing PDF objects from input sources. | ||
| 22 | + /// It handles tokenization, error recovery, and object construction with proper offset | ||
| 23 | + /// tracking and description for error reporting. | ||
| 18 | class Parser | 24 | class Parser |
| 19 | { | 25 | { |
| 20 | public: | 26 | public: |
| 27 | + /// @brief Exception thrown when the parser encounters an unrecoverable error. | ||
| 21 | class Error: public std::exception | 28 | class Error: public std::exception |
| 22 | { | 29 | { |
| 23 | public: | 30 | public: |
| @@ -25,16 +32,34 @@ namespace qpdf::impl | @@ -25,16 +32,34 @@ namespace qpdf::impl | ||
| 25 | virtual ~Error() noexcept = default; | 32 | virtual ~Error() noexcept = default; |
| 26 | }; | 33 | }; |
| 27 | 34 | ||
| 35 | + /// @brief Parse a PDF object from an input source. | ||
| 36 | + /// @param input The input source to read from. | ||
| 37 | + /// @param object_description Description of the object for error messages. | ||
| 38 | + /// @param context The QPDF context, or nullptr if parsing standalone. | ||
| 39 | + /// @return The parsed QPDFObjectHandle, or null if parsing fails. | ||
| 28 | static QPDFObjectHandle | 40 | static QPDFObjectHandle |
| 29 | parse(InputSource& input, std::string const& object_description, QPDF* context); | 41 | parse(InputSource& input, std::string const& object_description, QPDF* context); |
| 30 | 42 | ||
| 43 | + /// @brief Parse a content stream from an input source. | ||
| 44 | + /// @param input The input source to read from. | ||
| 45 | + /// @param sp_description Shared pointer to object description. | ||
| 46 | + /// @param tokenizer The tokenizer to use for parsing. | ||
| 47 | + /// @param context The QPDF context. | ||
| 48 | + /// @return The parsed QPDFObjectHandle, or uninitialized handle on EOF. | ||
| 31 | static QPDFObjectHandle parse_content( | 49 | static QPDFObjectHandle parse_content( |
| 32 | InputSource& input, | 50 | InputSource& input, |
| 33 | std::shared_ptr<QPDFObject::Description> sp_description, | 51 | std::shared_ptr<QPDFObject::Description> sp_description, |
| 34 | qpdf::Tokenizer& tokenizer, | 52 | qpdf::Tokenizer& tokenizer, |
| 35 | QPDF* context); | 53 | QPDF* context); |
| 36 | 54 | ||
| 37 | - // For use by deprecated QPDFObjectHandle::parse. | 55 | + /// @brief Parse a PDF object (interface for deprecated QPDFObjectHandle::parse). |
| 56 | + /// @param input The input source to read from. | ||
| 57 | + /// @param object_description Description of the object for error messages. | ||
| 58 | + /// @param tokenizer The tokenizer to use for parsing. | ||
| 59 | + /// @param empty Output parameter indicating if object was empty. | ||
| 60 | + /// @param decrypter String decrypter for encrypted strings, or nullptr. | ||
| 61 | + /// @param context The QPDF context, or nullptr if parsing standalone. | ||
| 62 | + /// @return The parsed QPDFObjectHandle. | ||
| 38 | static QPDFObjectHandle parse( | 63 | static QPDFObjectHandle parse( |
| 39 | InputSource& input, | 64 | InputSource& input, |
| 40 | std::string const& object_description, | 65 | std::string const& object_description, |
| @@ -43,7 +68,14 @@ namespace qpdf::impl | @@ -43,7 +68,14 @@ namespace qpdf::impl | ||
| 43 | QPDFObjectHandle::StringDecrypter* decrypter, | 68 | QPDFObjectHandle::StringDecrypter* decrypter, |
| 44 | QPDF* context); | 69 | QPDF* context); |
| 45 | 70 | ||
| 46 | - // For use by QPDF. | 71 | + /// @brief Parse a PDF object for use by QPDF. |
| 72 | + /// @param input The input source to read from. | ||
| 73 | + /// @param object_description Description of the object for error messages. | ||
| 74 | + /// @param tokenizer The tokenizer to use for parsing. | ||
| 75 | + /// @param decrypter String decrypter for encrypted strings, or nullptr. | ||
| 76 | + /// @param context The QPDF context. | ||
| 77 | + /// @param sanity_checks Enable additional sanity checks during parsing. | ||
| 78 | + /// @return The parsed QPDFObjectHandle. | ||
| 47 | static QPDFObjectHandle parse( | 79 | static QPDFObjectHandle parse( |
| 48 | InputSource& input, | 80 | InputSource& input, |
| 49 | std::string const& object_description, | 81 | std::string const& object_description, |
| @@ -52,6 +84,13 @@ namespace qpdf::impl | @@ -52,6 +84,13 @@ namespace qpdf::impl | ||
| 52 | QPDF& context, | 84 | QPDF& context, |
| 53 | bool sanity_checks); | 85 | bool sanity_checks); |
| 54 | 86 | ||
| 87 | + /// @brief Parse an object from an object stream. | ||
| 88 | + /// @param input The offset buffer containing the object data. | ||
| 89 | + /// @param stream_id The object stream number. | ||
| 90 | + /// @param obj_id The object ID within the stream. | ||
| 91 | + /// @param tokenizer The tokenizer to use for parsing. | ||
| 92 | + /// @param context The QPDF context. | ||
| 93 | + /// @return The parsed QPDFObjectHandle. | ||
| 55 | static QPDFObjectHandle parse( | 94 | static QPDFObjectHandle parse( |
| 56 | qpdf::is::OffsetBuffer& input, | 95 | qpdf::is::OffsetBuffer& input, |
| 57 | int stream_id, | 96 | int stream_id, |
| @@ -59,6 +98,10 @@ namespace qpdf::impl | @@ -59,6 +98,10 @@ namespace qpdf::impl | ||
| 59 | qpdf::Tokenizer& tokenizer, | 98 | qpdf::Tokenizer& tokenizer, |
| 60 | QPDF& context); | 99 | QPDF& context); |
| 61 | 100 | ||
| 101 | + /// @brief Create a description for a parsed object. | ||
| 102 | + /// @param input_name The name of the input source. | ||
| 103 | + /// @param object_description Description of the object being parsed. | ||
| 104 | + /// @return Shared pointer to object description with offset placeholder. | ||
| 62 | static std::shared_ptr<QPDFObject::Description> | 105 | static std::shared_ptr<QPDFObject::Description> |
| 63 | make_description(std::string const& input_name, std::string const& object_description) | 106 | make_description(std::string const& input_name, std::string const& object_description) |
| 64 | { | 107 | { |
| @@ -68,6 +111,17 @@ namespace qpdf::impl | @@ -68,6 +111,17 @@ namespace qpdf::impl | ||
| 68 | } | 111 | } |
| 69 | 112 | ||
| 70 | private: | 113 | private: |
| 114 | + /// @brief Construct a parser instance. | ||
| 115 | + /// @param input The input source to read from. | ||
| 116 | + /// @param sp_description Shared pointer to object description. | ||
| 117 | + /// @param object_description Description string for error messages. | ||
| 118 | + /// @param tokenizer The tokenizer to use for parsing. | ||
| 119 | + /// @param decrypter String decrypter for encrypted content. | ||
| 120 | + /// @param context The QPDF context. | ||
| 121 | + /// @param parse_pdf Whether parsing PDF objects (vs content streams). | ||
| 122 | + /// @param stream_id Object stream ID for object stream parsing. | ||
| 123 | + /// @param obj_id Object ID within object stream. | ||
| 124 | + /// @param sanity_checks Enable additional sanity checks. | ||
| 71 | Parser( | 125 | Parser( |
| 72 | InputSource& input, | 126 | InputSource& input, |
| 73 | std::shared_ptr<QPDFObject::Description> sp_description, | 127 | std::shared_ptr<QPDFObject::Description> sp_description, |
| @@ -92,11 +146,11 @@ namespace qpdf::impl | @@ -92,11 +146,11 @@ namespace qpdf::impl | ||
| 92 | { | 146 | { |
| 93 | } | 147 | } |
| 94 | 148 | ||
| 95 | - // Parser state. Note: | ||
| 96 | - // state <= st_dictionary_value == (state = st_dictionary_key || state = | ||
| 97 | - // st_dictionary_value) | 149 | + /// @brief Parser state enumeration. |
| 150 | + /// @note state <= st_dictionary_value means state is st_dictionary_key or st_dictionary_value (relies on enumerator order). | ||
| 98 | enum parser_state_e { st_dictionary_key, st_dictionary_value, st_array }; | 151 | enum parser_state_e { st_dictionary_key, st_dictionary_value, st_array }; |
| 99 | 152 | ||
| 153 | + /// @brief Stack frame for tracking nested arrays and dictionaries. | ||
| 100 | struct StackFrame | 154 | struct StackFrame |
| 101 | { | 155 | { |
| 102 | StackFrame(InputSource& input, parser_state_e state) : | 156 | StackFrame(InputSource& input, parser_state_e state) : |
| @@ -105,63 +159,129 @@ namespace qpdf::impl | @@ -105,63 +159,129 @@ namespace qpdf::impl | ||
| 105 | { | 159 | { |
| 106 | } | 160 | } |
| 107 | 161 | ||
| 108 | - std::vector<QPDFObjectHandle> olist; | ||
| 109 | - std::map<std::string, QPDFObjectHandle> dict; | ||
| 110 | - parser_state_e state; | ||
| 111 | - std::string key; | ||
| 112 | - qpdf_offset_t offset; | ||
| 113 | - std::string contents_string; | ||
| 114 | - qpdf_offset_t contents_offset{-1}; | ||
| 115 | - int null_count{0}; | 162 | + std::vector<QPDFObjectHandle> olist; ///< Object list for arrays/dict values |
| 163 | + std::map<std::string, QPDFObjectHandle> dict; ///< Dictionary entries | ||
| 164 | + parser_state_e state; ///< Current parser state | ||
| 165 | + std::string key; ///< Current dictionary key | ||
| 166 | + qpdf_offset_t offset; ///< Offset of container start | ||
| 167 | + std::string contents_string; ///< For /Contents field in signatures | ||
| 168 | + qpdf_offset_t contents_offset{-1}; ///< Offset of /Contents value | ||
| 169 | + int null_count{0}; ///< Count of null values in container | ||
| 116 | }; | 170 | }; |
| 117 | 171 | ||
| 172 | + /// @brief Parse an object, handling exceptions and returning null on error. | ||
| 173 | + /// @param content_stream True if parsing a content stream. | ||
| 174 | + /// @return The parsed object handle, or null/uninitialized on error. | ||
| 118 | QPDFObjectHandle parse(bool content_stream = false); | 175 | QPDFObjectHandle parse(bool content_stream = false); |
| 176 | + | ||
| 177 | + /// @brief Parse the first token and dispatch to appropriate handler. | ||
| 178 | + /// @param content_stream True if parsing a content stream. | ||
| 179 | + /// @return The parsed object handle. | ||
| 119 | QPDFObjectHandle parse_first(bool content_stream); | 180 | QPDFObjectHandle parse_first(bool content_stream); |
| 181 | + | ||
| 182 | + /// @brief Parse the remainder of a composite object (array/dict/reference). | ||
| 183 | + /// @param content_stream True if parsing a content stream. | ||
| 184 | + /// @return The completed object handle. | ||
| 120 | QPDFObjectHandle parse_remainder(bool content_stream); | 185 | QPDFObjectHandle parse_remainder(bool content_stream); |
| 186 | + | ||
| 187 | + /// @brief Add an object to the current container. | ||
| 188 | + /// @param obj The object to add. | ||
| 121 | void add(std::shared_ptr<QPDFObject>&& obj); | 189 | void add(std::shared_ptr<QPDFObject>&& obj); |
| 190 | + | ||
| 191 | + /// @brief Add a null object to the current container. | ||
| 122 | void add_null(); | 192 | void add_null(); |
| 193 | + | ||
| 194 | + /// @brief Add a null with a warning message. | ||
| 195 | + /// @param msg Warning message describing the error. | ||
| 123 | void add_bad_null(std::string const& msg); | 196 | void add_bad_null(std::string const& msg); |
| 197 | + | ||
| 198 | + /// @brief Add a buffered integer from int_buffer_. | ||
| 199 | + /// @param count Selects which buffered integer in int_buffer_ to add (the buffer has two slots, 0 and 1; confirm the mapping in the definition). | ||
| 124 | void add_int(int count); | 200 | void add_int(int count); |
| 201 | + | ||
| 202 | + /// @brief Create and add a scalar object to the current container. | ||
| 203 | + /// @tparam T The scalar object type (e.g., QPDF_Integer, QPDF_String). | ||
| 204 | + /// @tparam Args Constructor argument types. | ||
| 205 | + /// @param args Arguments to forward to the object constructor. | ||
| 125 | template <typename T, typename... Args> | 206 | template <typename T, typename... Args> |
| 126 | void add_scalar(Args&&... args); | 207 | void add_scalar(Args&&... args); |
| 208 | + | ||
| 209 | + /// @brief Check if too many bad tokens have been encountered and throw if so. | ||
| 127 | void check_too_many_bad_tokens(); | 210 | void check_too_many_bad_tokens(); |
| 211 | + | ||
| 212 | + /// @brief Issue a warning about a duplicate dictionary key. | ||
| 128 | void warn_duplicate_key(); | 213 | void warn_duplicate_key(); |
| 214 | + | ||
| 215 | + /// @brief Fix dictionaries with missing keys by generating fake keys. | ||
| 129 | void fix_missing_keys(); | 216 | void fix_missing_keys(); |
| 217 | + | ||
| 218 | + /// @brief Report a limits error and throw. | ||
| 219 | + /// @param limit The limit identifier. | ||
| 220 | + /// @param msg Error message. | ||
| 130 | [[noreturn]] void limits_error(std::string const& limit, std::string const& msg); | 221 | [[noreturn]] void limits_error(std::string const& limit, std::string const& msg); |
| 222 | + | ||
| 223 | + /// @brief Issue a warning at a specific offset. | ||
| 224 | + /// @param offset File offset for the warning. | ||
| 225 | + /// @param msg Warning message. | ||
| 131 | void warn(qpdf_offset_t offset, std::string const& msg) const; | 226 | void warn(qpdf_offset_t offset, std::string const& msg) const; |
| 227 | + | ||
| 228 | + /// @brief Issue a warning at the current offset. | ||
| 229 | + /// @param msg Warning message. | ||
| 132 | void warn(std::string const& msg) const; | 230 | void warn(std::string const& msg) const; |
| 133 | - void warn(QPDFExc const&) const; | 231 | + |
| 232 | + /// @brief Issue a warning from a QPDFExc exception. | ||
| 233 | + /// @param e The exception to report. | ||
| 234 | + void warn(QPDFExc const& e) const; | ||
| 235 | + | ||
| 236 | + /// @brief Create a scalar object with description and parsed offset. | ||
| 237 | + /// @tparam T The scalar object type. | ||
| 238 | + /// @tparam Args Constructor argument types. | ||
| 239 | + /// @param args Arguments to forward to the object constructor. | ||
| 240 | + /// @return Object handle with description and offset set. | ||
| 241 | + /// @note The offset includes any leading whitespace. | ||
| 134 | template <typename T, typename... Args> | 242 | template <typename T, typename... Args> |
| 135 | - // Create a new scalar object complete with parsed offset and description. | ||
| 136 | - // NB the offset includes any leading whitespace. | ||
| 137 | QPDFObjectHandle with_description(Args&&... args); | 243 | QPDFObjectHandle with_description(Args&&... args); |
| 244 | + | ||
| 245 | + /// @brief Set the description and offset on an existing object. | ||
| 246 | + /// @param obj The object to update. | ||
| 247 | + /// @param parsed_offset The file offset where the object was parsed. | ||
| 138 | void set_description(std::shared_ptr<QPDFObject>& obj, qpdf_offset_t parsed_offset); | 248 | void set_description(std::shared_ptr<QPDFObject>& obj, qpdf_offset_t parsed_offset); |
| 139 | - InputSource& input_; | ||
| 140 | - std::string const& object_description_; | ||
| 141 | - qpdf::Tokenizer& tokenizer_; | ||
| 142 | - QPDFObjectHandle::StringDecrypter* decrypter_; | ||
| 143 | - QPDF* context_; | ||
| 144 | - std::shared_ptr<QPDFObject::Description> description_; | ||
| 145 | - bool parse_pdf_{false}; | ||
| 146 | - int stream_id_{0}; | ||
| 147 | - int obj_id_{0}; | ||
| 148 | - bool sanity_checks_{false}; | ||
| 149 | - | ||
| 150 | - std::vector<StackFrame> stack_; | ||
| 151 | - StackFrame* frame_{nullptr}; | ||
| 152 | - // Number of recent bad tokens. This will always be > 0 once a bad token has been | ||
| 153 | - // encountered as it only gets incremented or reset when a bad token is encountered. | 249 | + |
| 250 | + // Core parsing state | ||
| 251 | + InputSource& input_; ///< Input source to read from | ||
| 252 | + std::string const& object_description_; ///< Description for error messages | ||
| 253 | + qpdf::Tokenizer& tokenizer_; ///< Tokenizer for lexical analysis | ||
| 254 | + QPDFObjectHandle::StringDecrypter* decrypter_; ///< Decrypter for encrypted strings | ||
| 255 | + QPDF* context_; ///< QPDF context for object resolution | ||
| 256 | + std::shared_ptr<QPDFObject::Description> description_; ///< Shared description for objects | ||
| 257 | + bool parse_pdf_{false}; ///< True if parsing PDF objects vs content streams | ||
| 258 | + int stream_id_{0}; ///< Object stream ID (for object stream parsing) | ||
| 259 | + int obj_id_{0}; ///< Object ID within object stream | ||
| 260 | + bool sanity_checks_{false}; ///< Enable additional validation checks | ||
| 261 | + | ||
| 262 | + // Composite object parsing state | ||
| 263 | + std::vector<StackFrame> stack_; ///< Stack of nested containers | ||
| 264 | + StackFrame* frame_{nullptr}; ///< Current stack frame pointer | ||
| 265 | + | ||
| 266 | + // Error tracking state | ||
| 267 | + /// Number of recent bad tokens. Always > 0 once a bad token has been seen, as it is only incremented or reset when a bad token is encountered. | ||
| 154 | int bad_count_{0}; | 268 | int bad_count_{0}; |
| 155 | - // Number of bad tokens (remaining) before giving up. | 269 | + /// Number of bad tokens remaining before giving up. |
| 156 | uint32_t max_bad_count_{Limits::parser_max_errors()}; | 270 | uint32_t max_bad_count_{Limits::parser_max_errors()}; |
| 157 | - // Number of good tokens since last bad token. Irrelevant if bad_count == 0. | 271 | + /// Number of good tokens since last bad token. Irrelevant if bad_count_ == 0. |
| 158 | int good_count_{0}; | 272 | int good_count_{0}; |
| 159 | - // Start offset including any leading whitespace. | 273 | + |
| 274 | + // Token buffering state | ||
| 275 | + /// Start offset of current object, including any leading whitespace. | ||
| 160 | qpdf_offset_t start_{0}; | 276 | qpdf_offset_t start_{0}; |
| 161 | - // Number of successive integer tokens. | 277 | + /// Number of successive integer tokens (for indirect reference detection). |
| 162 | int int_count_{0}; | 278 | int int_count_{0}; |
| 279 | + /// Buffer for up to 2 integer tokens. | ||
| 163 | long long int_buffer_[2]{0, 0}; | 280 | long long int_buffer_[2]{0, 0}; |
| 281 | + /// Offsets corresponding to buffered integers. | ||
| 164 | qpdf_offset_t last_offset_buffer_[2]{0, 0}; | 282 | qpdf_offset_t last_offset_buffer_[2]{0, 0}; |
| 283 | + | ||
| 284 | + /// True if object was empty (endobj without content). | ||
| 165 | bool empty_{false}; | 285 | bool empty_{false}; |
| 166 | }; | 286 | }; |
| 167 | } // namespace qpdf::impl | 287 | } // namespace qpdf::impl |