Commit 44cd31b96291ba114bc48870be0928b0f75e2dc1
Committed by
m-holger
1 parent
f2ccff52
Add Doxygen-style comments for all Parser methods
Co-authored-by: m-holger <34626170+m-holger@users.noreply.github.com>
Showing
1 changed file
with
155 additions
and
35 deletions
libqpdf/qpdf/QPDFParser.hh
| ... | ... | @@ -15,9 +15,16 @@ using namespace qpdf::global; |
| 15 | 15 | |
| 16 | 16 | namespace qpdf::impl |
| 17 | 17 | { |
| 18 | + /// @class Parser | |
| 19 | + /// @brief Internal parser for PDF objects and content streams. | |
| 20 | + /// @par | |
| 21 | + /// The Parser class provides static methods for parsing PDF objects from input sources. | |
| 22 | + /// It handles tokenization, error recovery, and object construction with proper offset | |
| 23 | + /// tracking and description for error reporting. | |
| 18 | 24 | class Parser |
| 19 | 25 | { |
| 20 | 26 | public: |
| 27 | + /// @brief Exception thrown when parser encounters an unrecoverable error. | |
| 21 | 28 | class Error: public std::exception |
| 22 | 29 | { |
| 23 | 30 | public: |
| ... | ... | @@ -25,16 +32,34 @@ namespace qpdf::impl |
| 25 | 32 | virtual ~Error() noexcept = default; |
| 26 | 33 | }; |
| 27 | 34 | |
| 35 | + /// @brief Parse a PDF object from an input source. | |
| 36 | + /// @param input The input source to read from. | |
| 37 | + /// @param object_description Description of the object for error messages. | |
| 38 | + /// @param context The QPDF context, or nullptr if parsing standalone. | |
| 39 | + /// @return The parsed QPDFObjectHandle, or null if parsing fails. | |
| 28 | 40 | static QPDFObjectHandle |
| 29 | 41 | parse(InputSource& input, std::string const& object_description, QPDF* context); |
| 30 | 42 | |
| 43 | + /// @brief Parse a content stream from an input source. | |
| 44 | + /// @param input The input source to read from. | |
| 45 | + /// @param sp_description Shared pointer to object description. | |
| 46 | + /// @param tokenizer The tokenizer to use for parsing. | |
| 47 | + /// @param context The QPDF context. | |
| 48 | + /// @return The parsed QPDFObjectHandle, or uninitialized handle on EOF. | |
| 31 | 49 | static QPDFObjectHandle parse_content( |
| 32 | 50 | InputSource& input, |
| 33 | 51 | std::shared_ptr<QPDFObject::Description> sp_description, |
| 34 | 52 | qpdf::Tokenizer& tokenizer, |
| 35 | 53 | QPDF* context); |
| 36 | 54 | |
| 37 | - // For use by deprecated QPDFObjectHandle::parse. | |
| 55 | + /// @brief Parse a PDF object (interface for deprecated QPDFObjectHandle::parse). | |
| 56 | + /// @param input The input source to read from. | |
| 57 | + /// @param object_description Description of the object for error messages. | |
| 58 | + /// @param tokenizer The tokenizer to use for parsing. | |
| 59 | + /// @param empty Output parameter indicating if object was empty. | |
| 60 | + /// @param decrypter String decrypter for encrypted strings, or nullptr. | |
| 61 | + /// @param context The QPDF context, or nullptr if parsing standalone. | |
| 62 | + /// @return The parsed QPDFObjectHandle. | |
| 38 | 63 | static QPDFObjectHandle parse( |
| 39 | 64 | InputSource& input, |
| 40 | 65 | std::string const& object_description, |
| ... | ... | @@ -43,7 +68,14 @@ namespace qpdf::impl |
| 43 | 68 | QPDFObjectHandle::StringDecrypter* decrypter, |
| 44 | 69 | QPDF* context); |
| 45 | 70 | |
| 46 | - // For use by QPDF. | |
| 71 | + /// @brief Parse a PDF object for use by QPDF. | |
| 72 | + /// @param input The input source to read from. | |
| 73 | + /// @param object_description Description of the object for error messages. | |
| 74 | + /// @param tokenizer The tokenizer to use for parsing. | |
| 75 | + /// @param decrypter String decrypter for encrypted strings, or nullptr. | |
| 76 | + /// @param context The QPDF context. | |
| 77 | + /// @param sanity_checks Enable additional sanity checks during parsing. | |
| 78 | + /// @return The parsed QPDFObjectHandle. | |
| 47 | 79 | static QPDFObjectHandle parse( |
| 48 | 80 | InputSource& input, |
| 49 | 81 | std::string const& object_description, |
| ... | ... | @@ -52,6 +84,13 @@ namespace qpdf::impl |
| 52 | 84 | QPDF& context, |
| 53 | 85 | bool sanity_checks); |
| 54 | 86 | |
| 87 | + /// @brief Parse an object from an object stream. | |
| 88 | + /// @param input The offset buffer containing the object data. | |
| 89 | + /// @param stream_id The object stream number. | |
| 90 | + /// @param obj_id The object ID within the stream. | |
| 91 | + /// @param tokenizer The tokenizer to use for parsing. | |
| 92 | + /// @param context The QPDF context. | |
| 93 | + /// @return The parsed QPDFObjectHandle. | |
| 55 | 94 | static QPDFObjectHandle parse( |
| 56 | 95 | qpdf::is::OffsetBuffer& input, |
| 57 | 96 | int stream_id, |
| ... | ... | @@ -59,6 +98,10 @@ namespace qpdf::impl |
| 59 | 98 | qpdf::Tokenizer& tokenizer, |
| 60 | 99 | QPDF& context); |
| 61 | 100 | |
| 101 | + /// @brief Create a description for a parsed object. | |
| 102 | + /// @param input_name The name of the input source. | |
| 103 | + /// @param object_description Description of the object being parsed. | |
| 104 | + /// @return Shared pointer to object description with offset placeholder. | |
| 62 | 105 | static std::shared_ptr<QPDFObject::Description> |
| 63 | 106 | make_description(std::string const& input_name, std::string const& object_description) |
| 64 | 107 | { |
| ... | ... | @@ -68,6 +111,17 @@ namespace qpdf::impl |
| 68 | 111 | } |
| 69 | 112 | |
| 70 | 113 | private: |
| 114 | + /// @brief Construct a parser instance. | |
| 115 | + /// @param input The input source to read from. | |
| 116 | + /// @param sp_description Shared pointer to object description. | |
| 117 | + /// @param object_description Description string for error messages. | |
| 118 | + /// @param tokenizer The tokenizer to use for parsing. | |
| 119 | + /// @param decrypter String decrypter for encrypted content. | |
| 120 | + /// @param context The QPDF context. | |
| 121 | + /// @param parse_pdf Whether parsing PDF objects (vs content streams). | |
| 122 | + /// @param stream_id Object stream ID for object stream parsing. | |
| 123 | + /// @param obj_id Object ID within object stream. | |
| 124 | + /// @param sanity_checks Enable additional sanity checks. | |
| 71 | 125 | Parser( |
| 72 | 126 | InputSource& input, |
| 73 | 127 | std::shared_ptr<QPDFObject::Description> sp_description, |
| ... | ... | @@ -92,11 +146,11 @@ namespace qpdf::impl |
| 92 | 146 | { |
| 93 | 147 | } |
| 94 | 148 | |
| 95 | - // Parser state. Note: | |
| 96 | - // state <= st_dictionary_value == (state = st_dictionary_key || state = | |
| 97 | - // st_dictionary_value) | |
| 149 | + /// @brief Parser state enumeration. | |
| 150 | + /// @note state <= st_dictionary_value holds exactly when state is st_dictionary_key or st_dictionary_value. | |
| 98 | 151 | enum parser_state_e { st_dictionary_key, st_dictionary_value, st_array }; |
| 99 | 152 | |
| 153 | + /// @brief Stack frame for tracking nested arrays and dictionaries. | |
| 100 | 154 | struct StackFrame |
| 101 | 155 | { |
| 102 | 156 | StackFrame(InputSource& input, parser_state_e state) : |
| ... | ... | @@ -105,63 +159,129 @@ namespace qpdf::impl |
| 105 | 159 | { |
| 106 | 160 | } |
| 107 | 161 | |
| 108 | - std::vector<QPDFObjectHandle> olist; | |
| 109 | - std::map<std::string, QPDFObjectHandle> dict; | |
| 110 | - parser_state_e state; | |
| 111 | - std::string key; | |
| 112 | - qpdf_offset_t offset; | |
| 113 | - std::string contents_string; | |
| 114 | - qpdf_offset_t contents_offset{-1}; | |
| 115 | - int null_count{0}; | |
| 162 | + std::vector<QPDFObjectHandle> olist; ///< Object list for arrays/dict values | |
| 163 | + std::map<std::string, QPDFObjectHandle> dict; ///< Dictionary entries | |
| 164 | + parser_state_e state; ///< Current parser state | |
| 165 | + std::string key; ///< Current dictionary key | |
| 166 | + qpdf_offset_t offset; ///< Offset of container start | |
| 167 | + std::string contents_string; ///< For /Contents field in signatures | |
| 168 | + qpdf_offset_t contents_offset{-1}; ///< Offset of /Contents value | |
| 169 | + int null_count{0}; ///< Count of null values in container | |
| 116 | 170 | }; |
| 117 | 171 | |
| 172 | + /// @brief Parse an object, handling exceptions and returning null on error. | |
| 173 | + /// @param content_stream True if parsing a content stream. | |
| 174 | + /// @return The parsed object handle, or null/uninitialized on error. | |
| 118 | 175 | QPDFObjectHandle parse(bool content_stream = false); |
| 176 | + | |
| 177 | + /// @brief Parse the first token and dispatch to appropriate handler. | |
| 178 | + /// @param content_stream True if parsing a content stream. | |
| 179 | + /// @return The parsed object handle. | |
| 119 | 180 | QPDFObjectHandle parse_first(bool content_stream); |
| 181 | + | |
| 182 | + /// @brief Parse the remainder of a composite object (array/dict/reference). | |
| 183 | + /// @param content_stream True if parsing a content stream. | |
| 184 | + /// @return The completed object handle. | |
| 120 | 185 | QPDFObjectHandle parse_remainder(bool content_stream); |
| 186 | + | |
| 187 | + /// @brief Add an object to the current container. | |
| 188 | + /// @param obj The object to add. | |
| 121 | 189 | void add(std::shared_ptr<QPDFObject>&& obj); |
| 190 | + | |
| 191 | + /// @brief Add a null object to the current container. | |
| 122 | 192 | void add_null(); |
| 193 | + | |
| 194 | + /// @brief Add a null with a warning message. | |
| 195 | + /// @param msg Warning message describing the error. | |
| 123 | 196 | void add_bad_null(std::string const& msg); |
| 197 | + | |
| 198 | + /// @brief Add a buffered integer from int_buffer_. | |
| 199 | + /// @param count Selects which of the two buffered integers in int_buffer_ to add. | |
| 124 | 200 | void add_int(int count); |
| 201 | + | |
| 202 | + /// @brief Create and add a scalar object to the current container. | |
| 203 | + /// @tparam T The scalar object type (e.g., QPDF_Integer, QPDF_String). | |
| 204 | + /// @tparam Args Constructor argument types. | |
| 205 | + /// @param args Arguments to forward to the object constructor. | |
| 125 | 206 | template <typename T, typename... Args> |
| 126 | 207 | void add_scalar(Args&&... args); |
| 208 | + | |
| 209 | + /// @brief Check if too many bad tokens have been encountered and throw if so. | |
| 127 | 210 | void check_too_many_bad_tokens(); |
| 211 | + | |
| 212 | + /// @brief Issue a warning about a duplicate dictionary key. | |
| 128 | 213 | void warn_duplicate_key(); |
| 214 | + | |
| 215 | + /// @brief Fix dictionaries with missing keys by generating fake keys. | |
| 129 | 216 | void fix_missing_keys(); |
| 217 | + | |
| 218 | + /// @brief Report a limits error and throw. | |
| 219 | + /// @param limit The limit identifier. | |
| 220 | + /// @param msg Error message. | |
| 130 | 221 | [[noreturn]] void limits_error(std::string const& limit, std::string const& msg); |
| 222 | + | |
| 223 | + /// @brief Issue a warning at a specific offset. | |
| 224 | + /// @param offset File offset for the warning. | |
| 225 | + /// @param msg Warning message. | |
| 131 | 226 | void warn(qpdf_offset_t offset, std::string const& msg) const; |
| 227 | + | |
| 228 | + /// @brief Issue a warning at the current offset. | |
| 229 | + /// @param msg Warning message. | |
| 132 | 230 | void warn(std::string const& msg) const; |
| 133 | - void warn(QPDFExc const&) const; | |
| 231 | + | |
| 232 | + /// @brief Issue a warning from a QPDFExc exception. | |
| 233 | + /// @param e The exception to report. | |
| 234 | + void warn(QPDFExc const& e) const; | |
| 235 | + | |
| 236 | + /// @brief Create a scalar object with description and parsed offset. | |
| 237 | + /// @tparam T The scalar object type. | |
| 238 | + /// @tparam Args Constructor argument types. | |
| 239 | + /// @param args Arguments to forward to the object constructor. | |
| 240 | + /// @return Object handle with description and offset set. | |
| 241 | + /// @note The offset includes any leading whitespace. | |
| 134 | 242 | template <typename T, typename... Args> |
| 135 | - // Create a new scalar object complete with parsed offset and description. | |
| 136 | - // NB the offset includes any leading whitespace. | |
| 137 | 243 | QPDFObjectHandle with_description(Args&&... args); |
| 244 | + | |
| 245 | + /// @brief Set the description and offset on an existing object. | |
| 246 | + /// @param obj The object to update. | |
| 247 | + /// @param parsed_offset The file offset where the object was parsed. | |
| 138 | 248 | void set_description(std::shared_ptr<QPDFObject>& obj, qpdf_offset_t parsed_offset); |
| 139 | - InputSource& input_; | |
| 140 | - std::string const& object_description_; | |
| 141 | - qpdf::Tokenizer& tokenizer_; | |
| 142 | - QPDFObjectHandle::StringDecrypter* decrypter_; | |
| 143 | - QPDF* context_; | |
| 144 | - std::shared_ptr<QPDFObject::Description> description_; | |
| 145 | - bool parse_pdf_{false}; | |
| 146 | - int stream_id_{0}; | |
| 147 | - int obj_id_{0}; | |
| 148 | - bool sanity_checks_{false}; | |
| 149 | - | |
| 150 | - std::vector<StackFrame> stack_; | |
| 151 | - StackFrame* frame_{nullptr}; | |
| 152 | - // Number of recent bad tokens. This will always be > 0 once a bad token has been | |
| 153 | - // encountered as it only gets incremented or reset when a bad token is encountered. | |
| 249 | + | |
| 250 | + // Core parsing state | |
| 251 | + InputSource& input_; ///< Input source to read from | |
| 252 | + std::string const& object_description_; ///< Description for error messages | |
| 253 | + qpdf::Tokenizer& tokenizer_; ///< Tokenizer for lexical analysis | |
| 254 | + QPDFObjectHandle::StringDecrypter* decrypter_; ///< Decrypter for encrypted strings | |
| 255 | + QPDF* context_; ///< QPDF context for object resolution | |
| 256 | + std::shared_ptr<QPDFObject::Description> description_; ///< Shared description for objects | |
| 257 | + bool parse_pdf_{false}; ///< True if parsing PDF objects vs content streams | |
| 258 | + int stream_id_{0}; ///< Object stream ID (for object stream parsing) | |
| 259 | + int obj_id_{0}; ///< Object ID within object stream | |
| 260 | + bool sanity_checks_{false}; ///< Enable additional validation checks | |
| 261 | + | |
| 262 | + // Composite object parsing state | |
| 263 | + std::vector<StackFrame> stack_; ///< Stack of nested containers | |
| 264 | + StackFrame* frame_{nullptr}; ///< Current stack frame pointer | |
| 265 | + | |
| 266 | + // Error tracking state | |
| 267 | + /// Number of recent bad tokens. Always > 0 once a bad token has been encountered. | |
| 154 | 268 | int bad_count_{0}; |
| 155 | - // Number of bad tokens (remaining) before giving up. | |
| 269 | + /// Number of bad tokens remaining before giving up. | |
| 156 | 270 | uint32_t max_bad_count_{Limits::parser_max_errors()}; |
| 157 | - // Number of good tokens since last bad token. Irrelevant if bad_count == 0. | |
| 271 | + /// Number of good tokens since the last bad token. Irrelevant if bad_count_ == 0. | |
| 158 | 272 | int good_count_{0}; |
| 159 | - // Start offset including any leading whitespace. | |
| 273 | + | |
| 274 | + // Token buffering state | |
| 275 | + /// Start offset of current object, including any leading whitespace. | |
| 160 | 276 | qpdf_offset_t start_{0}; |
| 161 | - // Number of successive integer tokens. | |
| 277 | + /// Number of successive integer tokens (for indirect reference detection). | |
| 162 | 278 | int int_count_{0}; |
| 279 | + /// Buffer for up to 2 integer tokens. | |
| 163 | 280 | long long int_buffer_[2]{0, 0}; |
| 281 | + /// Offsets corresponding to buffered integers. | |
| 164 | 282 | qpdf_offset_t last_offset_buffer_[2]{0, 0}; |
| 283 | + | |
| 284 | + /// True if object was empty (endobj without content). | |
| 165 | 285 | bool empty_{false}; |
| 166 | 286 | }; |
| 167 | 287 | } // namespace qpdf::impl | ... | ... |