Commit 68e4aec054dd735c0a808133acdf2fbca6c648c7
1 parent
c62e8e2b
Clarify qpdf's representation of names in the API
Clarify that names are to appear in canonical form with PDF escaping resolved when used in non-parsing QPDFObjectHandle APIs and their C API counterparts. See https://github.com/qpdf/qpdf/discussions/625.
Showing
2 changed files
with
60 additions
and
19 deletions
include/qpdf/QPDFObjectHandle.hh
| @@ -145,18 +145,26 @@ class QPDFObjectHandle | @@ -145,18 +145,26 @@ class QPDFObjectHandle | ||
| 145 | // TokenFilters. | 145 | // TokenFilters. |
| 146 | // | 146 | // |
| 147 | // Please note that when you call token.getValue() on a token of | 147 | // Please note that when you call token.getValue() on a token of |
| 148 | - // type tt_string, you get the string value without any | ||
| 149 | - // delimiters. token.getRawValue() will return something suitable | ||
| 150 | - // for being written to output, or calling writeToken with a | ||
| 151 | - // string token will also work. The correct way to construct a | ||
| 152 | - // string token that would write the literal value (str) is | ||
| 153 | - // QPDFTokenizer::Token(QPDFTokenizer::tt_string, "str"). A | ||
| 154 | - // similar situation exists with tt_name. token.getValue() returns | ||
| 155 | - // a normalized name with # codes resolved into characters, and | ||
| 156 | - // may not be suitable for writing. You can pass it to | ||
| 157 | - // QPDF_Name::normalizeName first, or you can use writeToken with | ||
| 158 | - // a name token. The correct way to create a name token is | ||
| 159 | - // QPDFTokenizer::Token(QPDFTokenizer::tt_name, "/Name"). | 148 | + // type tt_string or tt_name, you get the canonical, "parsed" |
| 149 | + // representation of the token. For a string, this means that | ||
| 150 | + // there are no delimiters, and for a name, it means that all | ||
| 151 | + // escaping (# followed by two hex digits) has been resolved. | ||
| 152 | + // qpdf's internal representation of a name includes the leading | ||
| 153 | + // slash. As such, you can't write the value of token.getValue() | ||
| 154 | + // directly to output that is supposed to be valid PDF syntax. If | ||
| 155 | + // you want to do that, you need to call writeToken() instead, or | ||
| 156 | + // you can retrieve the token as it appeared in the input with | ||
| 157 | + // token.getRawValue(). To construct a new string or name token | ||
| 158 | + // from a canonical representation, use | ||
| 159 | + // QPDFTokenizer::Token(QPDFTokenizer::tt_string, "parsed-str") or | ||
| 160 | + // QPDFTokenizer::Token(QPDFTokenizer::tt_name, | ||
| 161 | + // "/Canonical-Name"). Tokens created this way won't have a | ||
| 162 | + // PDF-syntax raw value, but you can still write them with | ||
| 163 | + // writeToken(). Example: | ||
| 164 | + // writeToken(QPDFTokenizer::Token(QPDFTokenizer::tt_name, "/text/plain")) | ||
| 165 | + // would write `/text#2fplain`, and | ||
| 166 | + // writeToken(QPDFTokenizer::Token(QPDFTokenizer::tt_string, "a\\(b")) | ||
| 167 | + // would write `(a\(b)` | ||
| 160 | class QPDF_DLL_CLASS TokenFilter | 168 | class QPDF_DLL_CLASS TokenFilter |
| 161 | { | 169 | { |
| 162 | public: | 170 | public: |
| @@ -519,6 +527,22 @@ class QPDFObjectHandle | @@ -519,6 +527,22 @@ class QPDFObjectHandle | ||
| 519 | QPDF_DLL | 527 | QPDF_DLL |
| 520 | static QPDFObjectHandle newReal(double value, int decimal_places, | 528 | static QPDFObjectHandle newReal(double value, int decimal_places, |
| 521 | bool trim_trailing_zeroes); | 529 | bool trim_trailing_zeroes); |
| 530 | + // Note about name objects: qpdf's internal representation of a | ||
| 531 | + // PDF name is a sequence of bytes, excluding the NUL character, | ||
| 532 | + // and starting with a slash. Name objects as represented in the | ||
| 533 | + // PDF specification can contain characters escaped with #, but | ||
| 534 | + // such escaping is not of concern when calling QPDFObjectHandle | ||
| 535 | + // methods not directly relating to parsing. For example, | ||
| 536 | + // newName("/text/plain").getName() and | ||
| 537 | + // parse("/text#2fplain").getName() both return "/text/plain", | ||
| 538 | + // while newName("/text/plain").unparse() and | ||
| 539 | + // parse("/text#2fplain").unparse() both return "/text#2fplain". | ||
| 540 | + // When working with the qpdf API for creating, retrieving, and | ||
| 541 | + // modifying objects, you want to work with the internal, | ||
| 542 | + // canonical representation. For names containing only alphanumeric | ||
| 543 | + // characters, dashes, and underscores, there is no difference | ||
| 544 | + // between the two representations. For a lengthy discussion, see | ||
| 545 | + // https://github.com/qpdf/qpdf/discussions/625. | ||
| 522 | QPDF_DLL | 546 | QPDF_DLL |
| 523 | static QPDFObjectHandle newName(std::string const& name); | 547 | static QPDFObjectHandle newName(std::string const& name); |
| 524 | QPDF_DLL | 548 | QPDF_DLL |
| @@ -719,7 +743,9 @@ class QPDFObjectHandle | @@ -719,7 +743,9 @@ class QPDFObjectHandle | ||
| 719 | QPDF_DLL | 743 | QPDF_DLL |
| 720 | bool getValueAsNumber(double&); | 744 | bool getValueAsNumber(double&); |
| 721 | 745 | ||
| 722 | - // Methods for name objects; see also name and array objects | 746 | + // Methods for name objects. The returned name value is in qpdf's |
| 747 | + // canonical form with all escaping resolved. See comments for | ||
| 748 | + // newName() for details. | ||
| 723 | QPDF_DLL | 749 | QPDF_DLL |
| 724 | std::string getName(); | 750 | std::string getName(); |
| 725 | QPDF_DLL | 751 | QPDF_DLL |
| @@ -789,7 +815,10 @@ class QPDFObjectHandle | @@ -789,7 +815,10 @@ class QPDFObjectHandle | ||
| 789 | QPDF_DLL | 815 | QPDF_DLL |
| 790 | Matrix getArrayAsMatrix(); | 816 | Matrix getArrayAsMatrix(); |
| 791 | 817 | ||
| 792 | - // Methods for dictionary objects. | 818 | + // Methods for dictionary objects. In all dictionary methods, keys |
| 819 | + // are specified/represented as canonical name strings starting | ||
| 820 | + // with a leading slash and not containing any PDF syntax | ||
| 821 | + // escaping. See comments for newName() for details. | ||
| 793 | 822 | ||
| 794 | // Return an object that enables iteration over members. You can | 823 | // Return an object that enables iteration over members. You can |
| 795 | // do | 824 | // do |
| @@ -824,7 +853,9 @@ class QPDFObjectHandle | @@ -824,7 +853,9 @@ class QPDFObjectHandle | ||
| 824 | QPDF_DLL | 853 | QPDF_DLL |
| 825 | std::map<std::string, QPDFObjectHandle> getDictAsMap(); | 854 | std::map<std::string, QPDFObjectHandle> getDictAsMap(); |
| 826 | 855 | ||
| 827 | - // Methods for name and array objects | 856 | + // Methods for name and array objects. The name value is in qpdf's |
| 857 | + // canonical form with all escaping resolved. See comments for | ||
| 858 | + // newName() for details. | ||
| 828 | QPDF_DLL | 859 | QPDF_DLL |
| 829 | bool isOrHasName(std::string const&); | 860 | bool isOrHasName(std::string const&); |
| 830 | 861 | ||
| @@ -1237,8 +1268,8 @@ class QPDFObjectHandle | @@ -1237,8 +1268,8 @@ class QPDFObjectHandle | ||
| 1237 | 1268 | ||
| 1238 | // Return encoded as JSON. For most object types, there is an | 1269 | // Return encoded as JSON. For most object types, there is an |
| 1239 | // obvious mapping. The JSON is generated as follows: | 1270 | // obvious mapping. The JSON is generated as follows: |
| 1240 | - // * Names are encoded as strings representing the normalized value of | ||
| 1241 | - // getName() | 1271 | + // * Names are encoded as strings representing the normalized name |
| 1272 | + // in PDF syntax as returned by unparse() | ||
| 1242 | // * Indirect references are encoded as strings containing "obj gen R" | 1273 | // * Indirect references are encoded as strings containing "obj gen R" |
| 1243 | // * Strings are encoded as UTF-8 strings with unrepresentable binary | 1274 | // * Strings are encoded as UTF-8 strings with unrepresentable binary |
| 1244 | // characters encoded as \uHHHH | 1275 | // characters encoded as \uHHHH |
include/qpdf/qpdf-c.h
| @@ -654,7 +654,10 @@ extern "C" { | @@ -654,7 +654,10 @@ extern "C" { | ||
| 654 | /* Wrappers around QPDFObjectHandle methods. Be sure to read | 654 | /* Wrappers around QPDFObjectHandle methods. Be sure to read |
| 655 | * corresponding comments in QPDFObjectHandle.hh to understand | 655 | * corresponding comments in QPDFObjectHandle.hh to understand |
| 656 | * what each function does and what kinds of objects it applies | 656 | * what each function does and what kinds of objects it applies |
| 657 | - * to. | 657 | + * to. Note that names are to appear in a canonicalized form |
| 658 | + * starting with a leading slash and with all PDF escaping | ||
| 659 | + * resolved. See comments for newName() in QPDFObjectHandle.hh for | ||
| 660 | + * details. | ||
| 658 | */ | 661 | */ |
| 659 | 662 | ||
| 660 | QPDF_DLL | 663 | QPDF_DLL |
| @@ -790,6 +793,12 @@ extern "C" { | @@ -790,6 +793,12 @@ extern "C" { | ||
| 790 | QPDF_DLL | 793 | QPDF_DLL |
| 791 | qpdf_oh qpdf_oh_get_array_item(qpdf_data qpdf, qpdf_oh oh, int n); | 794 | qpdf_oh qpdf_oh_get_array_item(qpdf_data qpdf, qpdf_oh oh, int n); |
| 792 | 795 | ||
| 796 | + /* In all dictionary APIs, keys are specified/represented as | ||
| 797 | + * canonicalized name strings starting with / and with all PDF | ||
| 798 | + * escaping resolved. See comments for newName() in | ||
| 799 | + * QPDFObjectHandle.hh for details. | ||
| 800 | + */ | ||
| 801 | + | ||
| 793 | /* "C"-specific dictionary key iteration */ | 802 | /* "C"-specific dictionary key iteration */ |
| 794 | 803 | ||
| 795 | /* Iteration is allowed on only one dictionary at a time. */ | 804 | /* Iteration is allowed on only one dictionary at a time. */ |
| @@ -813,7 +822,8 @@ extern "C" { | @@ -813,7 +822,8 @@ extern "C" { | ||
| 813 | QPDF_DLL | 822 | QPDF_DLL |
| 814 | qpdf_oh qpdf_oh_get_key(qpdf_data qpdf, qpdf_oh oh, char const* key); | 823 | qpdf_oh qpdf_oh_get_key(qpdf_data qpdf, qpdf_oh oh, char const* key); |
| 815 | QPDF_DLL | 824 | QPDF_DLL |
| 816 | - qpdf_oh qpdf_oh_get_key_if_dict(qpdf_data qpdf, qpdf_oh oh, char const* key); | 825 | + qpdf_oh qpdf_oh_get_key_if_dict( |
| 826 | + qpdf_data qpdf, qpdf_oh oh, char const* key); | ||
| 817 | 827 | ||
| 818 | QPDF_DLL | 828 | QPDF_DLL |
| 819 | QPDF_BOOL qpdf_oh_is_or_has_name( | 829 | QPDF_BOOL qpdf_oh_is_or_has_name( |