Commit 68e4aec054dd735c0a808133acdf2fbca6c648c7

Authored by Jay Berkenbilt
1 parent c62e8e2b

Clarify qpdf's representation of names in the API

Clarify that names are to appear in canonical form with PDF escaping
resolved when used in non-parsing QPDFObjectHandle APIs and their C
API counterparts. See https://github.com/qpdf/qpdf/discussions/625.
include/qpdf/QPDFObjectHandle.hh
@@ -145,18 +145,26 @@ class QPDFObjectHandle @@ -145,18 +145,26 @@ class QPDFObjectHandle
145 // TokenFilters. 145 // TokenFilters.
146 // 146 //
147 // Please note that when you call token.getValue() on a token of 147 // Please note that when you call token.getValue() on a token of
148 - // type tt_string, you get the string value without any  
149 - // delimiters. token.getRawValue() will return something suitable  
150 - // for being written to output, or calling writeToken with a  
151 - // string token will also work. The correct way to construct a  
152 - // string token that would write the literal value (str) is  
153 - // QPDFTokenizer::Token(QPDFTokenizer::tt_string, "str"). A  
154 - // similar situation exists with tt_name. token.getValue() returns  
155 - // a normalized name with # codes resolved into characters, and  
156 - // may not be suitable for writing. You can pass it to  
157 - // QPDF_Name::normalizeName first, or you can use writeToken with  
158 - // a name token. The correct way to create a name token is  
159 - // QPDFTokenizer::Token(QPDFTokenizer::tt_name, "/Name"). 148 + // type tt_string or tt_name, you get the canonical, "parsed"
  149 + // representation of the token. For a string, this means that
  150 + // there are no delimiters, and for a name, it means that all
  151 + // escaping (# followed by two hex digits) has been resolved.
  152 + // qpdf's internal representation of name includes the leading
  153 + // slash. As such, you can't write the value of token.getValue()
  154 + // directly to output that is supposed to be valid PDF syntax. If
  155 + // you want to do that, you need to call writeToken() instead, or
  156 + // you can retrieve the token as it appeared in the input with
  157 + // token.getRawValue(). To construct a new string or name token
  158 + // from a canonical representation, use
  159 + // QPDFTokenizer::Token(QPDFTokenizer::tt_string, "parsed-str") or
  160 + // QPDFTokenizer::Token(QPDFTokenizer::tt_name,
  161 + // "/Canonical-Name"). Tokens created this way won't have a
  162 + // PDF-syntax raw value, but you can still write them with
  163 + // writeToken(). Example:
  164 + // writeToken(QPDFTokenizer::Token(QPDFTokenizer::tt_name, "/text/plain"))
  165 + // would write `/text#2fplain`, and
  166 + // writeToken(QPDFTokenizer::Token(QPDFTokenizer::tt_string, "a\\(b"))
  167 + // would write `(a\(b)`
160 class QPDF_DLL_CLASS TokenFilter 168 class QPDF_DLL_CLASS TokenFilter
161 { 169 {
162 public: 170 public:
@@ -519,6 +527,22 @@ class QPDFObjectHandle @@ -519,6 +527,22 @@ class QPDFObjectHandle
519 QPDF_DLL 527 QPDF_DLL
520 static QPDFObjectHandle newReal(double value, int decimal_places, 528 static QPDFObjectHandle newReal(double value, int decimal_places,
521 bool trim_trailing_zeroes); 529 bool trim_trailing_zeroes);
  530 + // Note about name objects: qpdf's internal representation of a
  531 + // PDF name is a sequence of bytes, excluding the NUL character,
  532 + // and starting with a slash. Name objects as represented in the
  533 + // PDF specification can contain characters escaped with #, but
  534 + // such escaping is not of concern calling QPDFObjectHandle
  535 + // methods not directly relating to parsing. For example,
  536 + // newName("/text/plain").getName() and
  537 + // parse("/text#2fplain").getName() both return "/text/plain",
  538 + // while newName("/text/plain").unparse() and
  539 + // parse("/text#2fplain").unparse() both return "/text#2fplain".
  540 + // When working with the qpdf API for creating, retrieving, and
  541 + // modifying objects, you want to work with the internal,
  542 + // canonical representation. For names containing alphanumeric
  543 + // characters, dashes, and underscores, there is no difference
  544 + // between the two representations. For a lengthy discussion, see
  545 + // https://github.com/qpdf/qpdf/discussions/625.
522 QPDF_DLL 546 QPDF_DLL
523 static QPDFObjectHandle newName(std::string const& name); 547 static QPDFObjectHandle newName(std::string const& name);
524 QPDF_DLL 548 QPDF_DLL
@@ -719,7 +743,9 @@ class QPDFObjectHandle @@ -719,7 +743,9 @@ class QPDFObjectHandle
719 QPDF_DLL 743 QPDF_DLL
720 bool getValueAsNumber(double&); 744 bool getValueAsNumber(double&);
721 745
722 - // Methods for name objects; see also name and array objects 746 + // Methods for name objects. The returned name value is in qpdf's
  747 + // canonical form with all escaping resolved. See comments for
  748 + // newName() for details.
723 QPDF_DLL 749 QPDF_DLL
724 std::string getName(); 750 std::string getName();
725 QPDF_DLL 751 QPDF_DLL
@@ -789,7 +815,10 @@ class QPDFObjectHandle @@ -789,7 +815,10 @@ class QPDFObjectHandle
789 QPDF_DLL 815 QPDF_DLL
790 Matrix getArrayAsMatrix(); 816 Matrix getArrayAsMatrix();
791 817
792 - // Methods for dictionary objects. 818 + // Methods for dictionary objects. In all dictionary methods, keys
  819 + // are specified/represented as canonical name strings starting
  820 + // with a leading slash and not containing any PDF syntax
  821 + // escaping. See comments for getName() for details.
793 822
794 // Return an object that enables iteration over members. You can 823 // Return an object that enables iteration over members. You can
795 // do 824 // do
@@ -824,7 +853,9 @@ class QPDFObjectHandle @@ -824,7 +853,9 @@ class QPDFObjectHandle
824 QPDF_DLL 853 QPDF_DLL
825 std::map<std::string, QPDFObjectHandle> getDictAsMap(); 854 std::map<std::string, QPDFObjectHandle> getDictAsMap();
826 855
827 - // Methods for name and array objects 856 + // Methods for name and array objects. The name value is in qpdf's
  857 + // canonical form with all escaping resolved. See comments for
  858 + // newName() for details.
828 QPDF_DLL 859 QPDF_DLL
829 bool isOrHasName(std::string const&); 860 bool isOrHasName(std::string const&);
830 861
@@ -1237,8 +1268,8 @@ class QPDFObjectHandle @@ -1237,8 +1268,8 @@ class QPDFObjectHandle
1237 1268
1238 // Return encoded as JSON. For most object types, there is an 1269 // Return encoded as JSON. For most object types, there is an
1239 // obvious mapping. The JSON is generated as follows: 1270 // obvious mapping. The JSON is generated as follows:
1240 - // * Names are encoded as strings representing the normalized value of  
1241 - // getName() 1271 + // * Names are encoded as strings representing the normalized name
  1272 + // in PDF syntax as returned by unparse()
1242 // * Indirect references are encoded as strings containing "obj gen R" 1273 // * Indirect references are encoded as strings containing "obj gen R"
1243 // * Strings are encoded as UTF-8 strings with unrepresentable binary 1274 // * Strings are encoded as UTF-8 strings with unrepresentable binary
1244 // characters encoded as \uHHHH 1275 // characters encoded as \uHHHH
include/qpdf/qpdf-c.h
@@ -654,7 +654,10 @@ extern "C" { @@ -654,7 +654,10 @@ extern "C" {
654 /* Wrappers around QPDFObjectHandle methods. Be sure to read 654 /* Wrappers around QPDFObjectHandle methods. Be sure to read
655 * corresponding comments in QPDFObjectHandle.hh to understand 655 * corresponding comments in QPDFObjectHandle.hh to understand
656 * what each function does and what kinds of objects it applies 656 * what each function does and what kinds of objects it applies
657 - * to. 657 + * to. Note that names are to appear in a canonicalized form
  658 + * starting with a leading slash and with all PDF escaping
  659 + * resolved. See comments for getName() in QPDFObjectHandle.hh for
  660 + * details.
658 */ 661 */
659 662
660 QPDF_DLL 663 QPDF_DLL
@@ -790,6 +793,12 @@ extern "C" { @@ -790,6 +793,12 @@ extern "C" {
790 QPDF_DLL 793 QPDF_DLL
791 qpdf_oh qpdf_oh_get_array_item(qpdf_data qpdf, qpdf_oh oh, int n); 794 qpdf_oh qpdf_oh_get_array_item(qpdf_data qpdf, qpdf_oh oh, int n);
792 795
  796 + /* In all dictionary APIs, keys are specified/represented as
  797 + * canonicalized name strings starting with / and with all PDF
  798 + * escaping resolved. See comments for getName() in
  799 + * QPDFObjectHandle for details.
  800 + */
  801 +
793 /* "C"-specific dictionary key iteration */ 802 /* "C"-specific dictionary key iteration */
794 803
795 /* Iteration is allowed on only one dictionary at a time. */ 804 /* Iteration is allowed on only one dictionary at a time. */
@@ -813,7 +822,8 @@ extern "C" { @@ -813,7 +822,8 @@ extern "C" {
813 QPDF_DLL 822 QPDF_DLL
814 qpdf_oh qpdf_oh_get_key(qpdf_data qpdf, qpdf_oh oh, char const* key); 823 qpdf_oh qpdf_oh_get_key(qpdf_data qpdf, qpdf_oh oh, char const* key);
815 QPDF_DLL 824 QPDF_DLL
816 - qpdf_oh qpdf_oh_get_key_if_dict(qpdf_data qpdf, qpdf_oh oh, char const* key); 825 + qpdf_oh qpdf_oh_get_key_if_dict(
  826 + qpdf_data qpdf, qpdf_oh oh, char const* key);
817 827
818 QPDF_DLL 828 QPDF_DLL
819 QPDF_BOOL qpdf_oh_is_or_has_name( 829 QPDF_BOOL qpdf_oh_is_or_has_name(