Commit 68e4aec054dd735c0a808133acdf2fbca6c648c7

Authored by Jay Berkenbilt
1 parent c62e8e2b

Clarify qpdf's representation of names in the API

Clarify that names are to appear in canonical form with PDF escaping
resolved when used in non-parsing QPDFObjectHandle APIs and their C
API counterparts. See https://github.com/qpdf/qpdf/discussions/625.
include/qpdf/QPDFObjectHandle.hh
... ... @@ -145,18 +145,26 @@ class QPDFObjectHandle
145 145 // TokenFilters.
146 146 //
147 147 // Please note that when you call token.getValue() on a token of
148   - // type tt_string, you get the string value without any
149   - // delimiters. token.getRawValue() will return something suitable
150   - // for being written to output, or calling writeToken with a
151   - // string token will also work. The correct way to construct a
152   - // string token that would write the literal value (str) is
153   - // QPDFTokenizer::Token(QPDFTokenizer::tt_string, "str"). A
154   - // similar situation exists with tt_name. token.getValue() returns
155   - // a normalized name with # codes resolved into characters, and
156   - // may not be suitable for writing. You can pass it to
157   - // QPDF_Name::normalizeName first, or you can use writeToken with
158   - // a name token. The correct way to create a name token is
159   - // QPDFTokenizer::Token(QPDFTokenizer::tt_name, "/Name").
  148 + // type tt_string or tt_name, you get the canonical, "parsed"
  149 + // representation of the token. For a string, this means that
  150 + // there are no delimiters, and for a name, it means that all
  151 + // escaping (# followed by two hex digits) has been resolved.
  152 + // qpdf's internal representation of a name includes the leading
  153 + // slash. As such, you can't write the value of token.getValue()
  154 + // directly to output that is supposed to be valid PDF syntax. If
  155 + // you want to do that, you need to call writeToken() instead, or
  156 + // you can retrieve the token as it appeared in the input with
  157 + // token.getRawValue(). To construct a new string or name token
  158 + // from a canonical representation, use
  159 + // QPDFTokenizer::Token(QPDFTokenizer::tt_string, "parsed-str") or
  160 + // QPDFTokenizer::Token(QPDFTokenizer::tt_name,
  161 + // "/Canonical-Name"). Tokens created this way won't have a
  162 + // PDF-syntax raw value, but you can still write them with
  163 + // writeToken(). Example:
  164 + // writeToken(QPDFTokenizer::Token(QPDFTokenizer::tt_name, "/text/plain"))
  165 + // would write `/text#2fplain`, and
  166 + // writeToken(QPDFTokenizer::Token(QPDFTokenizer::tt_string, "a\\(b"))
  167 + // would write `(a\(b)`
160 168 class QPDF_DLL_CLASS TokenFilter
161 169 {
162 170 public:
... ... @@ -519,6 +527,22 @@ class QPDFObjectHandle
519 527 QPDF_DLL
520 528 static QPDFObjectHandle newReal(double value, int decimal_places,
521 529 bool trim_trailing_zeroes);
  530 + // Note about name objects: qpdf's internal representation of a
  531 + // PDF name is a sequence of bytes, excluding the NUL character,
  532 + // and starting with a slash. Name objects as represented in the
  533 + // PDF specification can contain characters escaped with #, but
  534 + // such escaping is not of concern when calling QPDFObjectHandle
  535 + // methods not directly relating to parsing. For example,
  536 + // newName("/text/plain").getName() and
  537 + // parse("/text#2fplain").getName() both return "/text/plain",
  538 + // while newName("/text/plain").unparse() and
  539 + // parse("/text#2fplain").unparse() both return "/text#2fplain".
  540 + // When working with the qpdf API for creating, retrieving, and
  541 + // modifying objects, you want to work with the internal,
  542 + // canonical representation. For names containing only alphanumeric
  543 + // characters, dashes, and underscores, there is no difference
  544 + // between the two representations. For a lengthy discussion, see
  545 + // https://github.com/qpdf/qpdf/discussions/625.
522 546 QPDF_DLL
523 547 static QPDFObjectHandle newName(std::string const& name);
524 548 QPDF_DLL
... ... @@ -719,7 +743,9 @@ class QPDFObjectHandle
719 743 QPDF_DLL
720 744 bool getValueAsNumber(double&);
721 745  
722   - // Methods for name objects; see also name and array objects
  746 + // Methods for name objects. The returned name value is in qpdf's
  747 + // canonical form with all escaping resolved. See comments for
  748 + // newName() for details.
723 749 QPDF_DLL
724 750 std::string getName();
725 751 QPDF_DLL
... ... @@ -789,7 +815,10 @@ class QPDFObjectHandle
789 815 QPDF_DLL
790 816 Matrix getArrayAsMatrix();
791 817  
792   - // Methods for dictionary objects.
  818 + // Methods for dictionary objects. In all dictionary methods, keys
  819 + // are specified/represented as canonical name strings starting
  820 + // with a leading slash and not containing any PDF syntax
  821 + // escaping. See comments for getName() for details.
793 822  
794 823 // Return an object that enables iteration over members. You can
795 824 // do
... ... @@ -824,7 +853,9 @@ class QPDFObjectHandle
824 853 QPDF_DLL
825 854 std::map<std::string, QPDFObjectHandle> getDictAsMap();
826 855  
827   - // Methods for name and array objects
  856 + // Methods for name and array objects. The name value is in qpdf's
  857 + // canonical form with all escaping resolved. See comments for
  858 + // newName() for details.
828 859 QPDF_DLL
829 860 bool isOrHasName(std::string const&);
830 861  
... ... @@ -1237,8 +1268,8 @@ class QPDFObjectHandle
1237 1268  
1238 1269 // Return encoded as JSON. For most object types, there is an
1239 1270 // obvious mapping. The JSON is generated as follows:
1240   - // * Names are encoded as strings representing the normalized value of
1241   - // getName()
  1271 + // * Names are encoded as strings representing the normalized name
  1272 + // in PDF syntax as returned by unparse()
1242 1273 // * Indirect references are encoded as strings containing "obj gen R"
1243 1274 // * Strings are encoded as UTF-8 strings with unrepresentable binary
1244 1275 // characters encoded as \uHHHH
... ...
include/qpdf/qpdf-c.h
... ... @@ -654,7 +654,10 @@ extern "C" {
654 654 /* Wrappers around QPDFObjectHandle methods. Be sure to read
655 655 * corresponding comments in QPDFObjectHandle.hh to understand
656 656 * what each function does and what kinds of objects it applies
657   - * to.
  657 + * to. Note that names are to appear in a canonicalized form
  658 + * starting with a leading slash and with all PDF escaping
  659 + * resolved. See comments for getName() in QPDFObjectHandle.hh for
  660 + * details.
658 661 */
659 662  
660 663 QPDF_DLL
... ... @@ -790,6 +793,12 @@ extern "C" {
790 793 QPDF_DLL
791 794 qpdf_oh qpdf_oh_get_array_item(qpdf_data qpdf, qpdf_oh oh, int n);
792 795  
  796 + /* In all dictionary APIs, keys are specified/represented as
  797 + * canonicalized name strings starting with / and with all PDF
  798 + * escaping resolved. See comments for getName() in
  799 + * QPDFObjectHandle.hh for details.
  800 + */
  801 +
793 802 /* "C"-specific dictionary key iteration */
794 803  
795 804 /* Iteration is allowed on only one dictionary at a time. */
... ... @@ -813,7 +822,8 @@ extern "C" {
813 822 QPDF_DLL
814 823 qpdf_oh qpdf_oh_get_key(qpdf_data qpdf, qpdf_oh oh, char const* key);
815 824 QPDF_DLL
816   - qpdf_oh qpdf_oh_get_key_if_dict(qpdf_data qpdf, qpdf_oh oh, char const* key);
  825 + qpdf_oh qpdf_oh_get_key_if_dict(
  826 + qpdf_data qpdf, qpdf_oh oh, char const* key);
817 827  
818 828 QPDF_DLL
819 829 QPDF_BOOL qpdf_oh_is_or_has_name(
... ...