Commit 68e4aec054dd735c0a808133acdf2fbca6c648c7
1 parent
c62e8e2b
Clarify qpdf's representation of names in the API
Clarify that names are to appear in canonical form with PDF escaping resolved when used in non-parsing QPDFObjectHandle APIs and their C API counterparts. See https://github.com/qpdf/qpdf/discussions/625.
Showing
2 changed files
with
60 additions
and
19 deletions
include/qpdf/QPDFObjectHandle.hh
| ... | ... | @@ -145,18 +145,26 @@ class QPDFObjectHandle |
| 145 | 145 | // TokenFilters. |
| 146 | 146 | // |
| 147 | 147 | // Please note that when you call token.getValue() on a token of |
| 148 | - // type tt_string, you get the string value without any | |
| 149 | - // delimiters. token.getRawValue() will return something suitable | |
| 150 | - // for being written to output, or calling writeToken with a | |
| 151 | - // string token will also work. The correct way to construct a | |
| 152 | - // string token that would write the literal value (str) is | |
| 153 | - // QPDFTokenizer::Token(QPDFTokenizer::tt_string, "str"). A | |
| 154 | - // similar situation exists with tt_name. token.getValue() returns | |
| 155 | - // a normalized name with # codes resolved into characters, and | |
| 156 | - // may not be suitable for writing. You can pass it to | |
| 157 | - // QPDF_Name::normalizeName first, or you can use writeToken with | |
| 158 | - // a name token. The correct way to create a name token is | |
| 159 | - // QPDFTokenizer::Token(QPDFTokenizer::tt_name, "/Name"). | |
| 148 | + // type tt_string or tt_name, you get the canonical, "parsed" | |
| 149 | + // representation of the token. For a string, this means that | |
| 150 | + // there are no delimiters, and for a name, it means that all | |
| 151 | + // escaping (# followed by two hex digits) has been resolved. | |
| 152 | + // qpdf's internal representation of a name includes the leading | |
| 153 | + // slash. As such, you can't write the value of token.getValue() | |
| 154 | + // directly to output that is supposed to be valid PDF syntax. If | |
| 155 | + // you want to do that, you need to call writeToken() instead, or | |
| 156 | + // you can retrieve the token as it appeared in the input with | |
| 157 | + // token.getRawValue(). To construct a new string or name token | |
| 158 | + // from a canonical representation, use | |
| 159 | + // QPDFTokenizer::Token(QPDFTokenizer::tt_string, "parsed-str") or | |
| 160 | + // QPDFTokenizer::Token(QPDFTokenizer::tt_name, | |
| 161 | + // "/Canonical-Name"). Tokens created this way won't have a | |
| 162 | + // PDF-syntax raw value, but you can still write them with | |
| 163 | + // writeToken(). Example: | |
| 164 | + // writeToken(QPDFTokenizer::Token(QPDFTokenizer::tt_name, "/text/plain")) | |
| 165 | + // would write `/text#2fplain`, and | |
| 166 | + // writeToken(QPDFTokenizer::Token(QPDFTokenizer::tt_string, "a\\(b")) | |
| 167 | + // would write `(a\(b)` | |
| 160 | 168 | class QPDF_DLL_CLASS TokenFilter |
| 161 | 169 | { |
| 162 | 170 | public: |
| ... | ... | @@ -519,6 +527,22 @@ class QPDFObjectHandle |
| 519 | 527 | QPDF_DLL |
| 520 | 528 | static QPDFObjectHandle newReal(double value, int decimal_places, |
| 521 | 529 | bool trim_trailing_zeroes); |
| 530 | + // Note about name objects: qpdf's internal representation of a | |
| 531 | + // PDF name is a sequence of bytes, excluding the NUL character, | |
| 532 | + // and starting with a slash. Name objects as represented in the | |
| 533 | + // PDF specification can contain characters escaped with #, but | |
| 534 | + // such escaping is not of concern when calling QPDFObjectHandle | |
| 535 | + // methods not directly relating to parsing. For example, | |
| 536 | + // newName("/text/plain").getName() and | |
| 537 | + // parse("/text#2fplain").getName() both return "/text/plain", | |
| 538 | + // while newName("/text/plain").unparse() and | |
| 539 | + // parse("/text#2fplain").unparse() both return "/text#2fplain". | |
| 540 | + // When working with the qpdf API for creating, retrieving, and | |
| 541 | + // modifying objects, you want to work with the internal, | |
| 542 | + // canonical representation. For names containing only alphanumeric | |
| 543 | + // characters, dashes, and underscores, there is no difference | |
| 544 | + // between the two representations. For a lengthy discussion, see | |
| 545 | + // https://github.com/qpdf/qpdf/discussions/625. | |
| 522 | 546 | QPDF_DLL |
| 523 | 547 | static QPDFObjectHandle newName(std::string const& name); |
| 524 | 548 | QPDF_DLL |
| ... | ... | @@ -719,7 +743,9 @@ class QPDFObjectHandle |
| 719 | 743 | QPDF_DLL |
| 720 | 744 | bool getValueAsNumber(double&); |
| 721 | 745 | |
| 722 | - // Methods for name objects; see also name and array objects | |
| 746 | + // Methods for name objects. The returned name value is in qpdf's | |
| 747 | + // canonical form with all escaping resolved. See comments for | |
| 748 | + // newName() for details. | |
| 723 | 749 | QPDF_DLL |
| 724 | 750 | std::string getName(); |
| 725 | 751 | QPDF_DLL |
| ... | ... | @@ -789,7 +815,10 @@ class QPDFObjectHandle |
| 789 | 815 | QPDF_DLL |
| 790 | 816 | Matrix getArrayAsMatrix(); |
| 791 | 817 | |
| 792 | - // Methods for dictionary objects. | |
| 818 | + // Methods for dictionary objects. In all dictionary methods, keys | |
| 819 | + // are specified/represented as canonical name strings starting | |
| 820 | + // with a leading slash and not containing any PDF syntax | |
| 821 | + // escaping. See comments for getName() for details. | |
| 793 | 822 | |
| 794 | 823 | // Return an object that enables iteration over members. You can |
| 795 | 824 | // do |
| ... | ... | @@ -824,7 +853,9 @@ class QPDFObjectHandle |
| 824 | 853 | QPDF_DLL |
| 825 | 854 | std::map<std::string, QPDFObjectHandle> getDictAsMap(); |
| 826 | 855 | |
| 827 | - // Methods for name and array objects | |
| 856 | + // Methods for name and array objects. The name value is in qpdf's | |
| 857 | + // canonical form with all escaping resolved. See comments for | |
| 858 | + // newName() for details. | |
| 828 | 859 | QPDF_DLL |
| 829 | 860 | bool isOrHasName(std::string const&); |
| 830 | 861 | |
| ... | ... | @@ -1237,8 +1268,8 @@ class QPDFObjectHandle |
| 1237 | 1268 | |
| 1238 | 1269 | // Return encoded as JSON. For most object types, there is an |
| 1239 | 1270 | // obvious mapping. The JSON is generated as follows: |
| 1240 | - // * Names are encoded as strings representing the normalized value of | |
| 1241 | - // getName() | |
| 1271 | + // * Names are encoded as strings representing the normalized name | |
| 1272 | + // in PDF syntax as returned by unparse() | |
| 1242 | 1273 | // * Indirect references are encoded as strings containing "obj gen R" |
| 1243 | 1274 | // * Strings are encoded as UTF-8 strings with unrepresentable binary |
| 1244 | 1275 | // characters encoded as \uHHHH | ... | ... |
include/qpdf/qpdf-c.h
| ... | ... | @@ -654,7 +654,10 @@ extern "C" { |
| 654 | 654 | /* Wrappers around QPDFObjectHandle methods. Be sure to read |
| 655 | 655 | * corresponding comments in QPDFObjectHandle.hh to understand |
| 656 | 656 | * what each function does and what kinds of objects it applies |
| 657 | - * to. | |
| 657 | + * to. Note that names are to appear in a canonicalized form | |
| 658 | + * starting with a leading slash and with all PDF escaping | |
| 659 | + * resolved. See comments for getName() in QPDFObjectHandle.hh for | |
| 660 | + * details. | |
| 658 | 661 | */ |
| 659 | 662 | |
| 660 | 663 | QPDF_DLL |
| ... | ... | @@ -790,6 +793,12 @@ extern "C" { |
| 790 | 793 | QPDF_DLL |
| 791 | 794 | qpdf_oh qpdf_oh_get_array_item(qpdf_data qpdf, qpdf_oh oh, int n); |
| 792 | 795 | |
| 796 | + /* In all dictionary APIs, keys are specified/represented as | |
| 797 | + * canonicalized name strings starting with / and with all PDF | |
| 798 | + * escaping resolved. See comments for getName() in | |
| 799 | + * QPDFObjectHandle.hh for details. | |
| 800 | + */ | |
| 801 | + | |
| 793 | 802 | /* "C"-specific dictionary key iteration */ |
| 794 | 803 | |
| 795 | 804 | /* Iteration is allowed on only one dictionary at a time. */ |
| ... | ... | @@ -813,7 +822,8 @@ extern "C" { |
| 813 | 822 | QPDF_DLL |
| 814 | 823 | qpdf_oh qpdf_oh_get_key(qpdf_data qpdf, qpdf_oh oh, char const* key); |
| 815 | 824 | QPDF_DLL |
| 816 | - qpdf_oh qpdf_oh_get_key_if_dict(qpdf_data qpdf, qpdf_oh oh, char const* key); | |
| 825 | + qpdf_oh qpdf_oh_get_key_if_dict( | |
| 826 | + qpdf_data qpdf, qpdf_oh oh, char const* key); | |
| 817 | 827 | |
| 818 | 828 | QPDF_DLL |
| 819 | 829 | QPDF_BOOL qpdf_oh_is_or_has_name( | ... | ... |