Commit 04edfe9fade7e77342f5e4fe694ee071554a119c

Authored by Jay Berkenbilt
1 parent 63e5cb53

QPDFObjectHandle::newUnicodeString to use UTF-16 only when needed

Use the first of ASCII, PDFDocEncoding, or UTF-16 that is capable of
encoding the string.
ChangeLog
  1 +2021-01-23 Jay Berkenbilt <ejb@ql.org>
  2 +
  3 + * Change behavior of QPDFObjectHandle::newUnicodeString so that it
  4 + encodes as ASCII or PDFDocEncoding if those encodings will support
  5 + all the characters in the string, resorting to UTF-16 only if the
  6 + other encodings are insufficient. This is a cleaner implementation
  7 + of the intention of encoding strings for use outside of contents
  8 + and results in fewer instances of ASCII strings being needlessly
  9 + encoded as UTF-16. This change may cause qpdf to generate
  10 + different output from the same input when form field values are
  11 + set using methods from QPDFFormFieldObjectHelper.
  12 +
1 13 2021-01-16 Jay Berkenbilt <ejb@ql.org>
2 14  
3 15 * Add new constructors for QPDFNameTreeObjectHelper and
... ...
examples/qtest/set-form-values/form-out.pdf
No preview for this file type
include/qpdf/QPDFObjectHandle.hh
... ... @@ -480,10 +480,12 @@ class QPDFObjectHandle
480 480 static QPDFObjectHandle newName(std::string const& name);
481 481 QPDF_DLL
482 482 static QPDFObjectHandle newString(std::string const& str);
483   - // Create a string encoded in UTF-16 from the given utf8-encoded
484   - // string. Such strings are appropriately encoded to appear in PDF
485   - // files outside of content streams, such as in document metadata
486   - // form field values, page labels, outlines, and similar locations.
  483 + // Create a string encoded from the given utf8-encoded string
  484 + // appropriately encoded to appear in PDF files outside of content
  485 + // streams, such as in document metadata, form field values, page
  486 + // labels, outlines, and similar locations. We try ASCII first,
  487 + // then PDFDocEncoding, then UTF-16 as needed to successfully
  488 + // encode all the characters.
487 489 QPDF_DLL
488 490 static QPDFObjectHandle newUnicodeString(std::string const& utf8_str);
489 491 QPDF_DLL
... ...
libqpdf/QPDF_String.cc
... ... @@ -31,7 +31,13 @@ QPDF_String::~QPDF_String()
31 31 QPDF_String*
32 32 QPDF_String::new_utf16(std::string const& utf8_val)
33 33 {
34   - return new QPDF_String(QUtil::utf8_to_utf16(utf8_val));
  34 + std::string result;
  35 + if (! (QUtil::utf8_to_ascii(utf8_val, result, '?') ||
  36 + QUtil::utf8_to_pdf_doc(utf8_val, result, '?')))
  37 + {
  38 + result = QUtil::utf8_to_utf16(utf8_val);
  39 + }
  40 + return new QPDF_String(result);
35 41 }
36 42  
37 43 std::string
... ...
manual/qpdf-manual.xml
... ... @@ -4824,6 +4824,26 @@ print &quot;\n&quot;;
4824 4824 <itemizedlist>
4825 4825 <listitem>
4826 4826 <para>
  4827 + Behavior Changes
  4828 + </para>
  4829 + <itemizedlist>
  4830 + <listitem>
  4831 + <para>
  4832 + <function>QPDFObjectHandle::newUnicodeString</function> now
  4833 + uses whichever of ASCII, PDFDocEncoding, or UTF-16 is
  4834 + sufficient to encode all the characters in the string. This
  4835 + reduces needless encoding in UTF-16 of strings that can be
  4836 + encoded in ASCII. This change may cause qpdf to generate
  4837 + different output than before when form field values are set
  4838 + using <classname>QPDFFormFieldObjectHelper</classname>.
  4839 + </para>
  4840 + </listitem>
  4841 + </itemizedlist>
  4842 + </listitem>
  4843 + </itemizedlist>
  4844 + <itemizedlist>
  4845 + <listitem>
  4846 + <para>
4827 4847 Library Enhancements
4828 4848 </para>
4829 4849 <itemizedlist>
... ...
qpdf/qtest/qpdf/appearances-1.pdf
No preview for this file type
qpdf/qtest/qpdf/appearances-11.pdf
No preview for this file type
qpdf/qtest/qpdf/appearances-12.pdf
No preview for this file type
qpdf/qtest/qpdf/appearances-2.pdf
No preview for this file type
qpdf/qtest/qpdf/appearances-quack.pdf
No preview for this file type
qpdf/qtest/qpdf/form-no-need-appearances-filled.pdf
No preview for this file type
qpdf/qtest/qpdf/unicode-errors.out
1   -This file has utf-8 encoding errors and should be edited as a binary file. // <feff0054006800690073002000660069006c006500200068006100730020007500740066002d003800200065006e0063006f00640069006e00670020006500720072006f0072007300200061006e0064002000730068006f0075006c0064002000620065002000650064006900740065006400200061007300200061002000620069006e006100720079002000660069006c0065002e>
2   - // <feff>
  1 +This file has utf-8 encoding errors and should be edited as a binary file. // <546869732066696c6520686173207574662d3820656e636f64696e67206572726f727320616e642073686f756c642062652065646974656420617320612062696e6172792066696c652e>
  2 + // <>
3 3 0: too many bytes: �after // <feff0030003a00200074006f006f0020006d0061006e0079002000620079007400650073003a0020fffd00610066007400650072>
4 4 1: too few bytes: �after // <feff0031003a00200074006f006f0020006600650077002000620079007400650073003a0020fffd00610066007400650072>
5 5 2: invalid codepoint (U+DEAD): �after // <feff0032003a00200069006e00760061006c0069006400200063006f006400650070006f0069006e0074002000280055002b00440045004100440029003a0020fffd00610066007400650072>
6   -3: not enough bytes for character: �!after (! included) // <feff0033003a0020006e006f007400200065006e006f00750067006800200062007900740065007300200066006f00720020006300680061007200610063007400650072003a0020fffd00210061006600740065007200200028002100200069006e0063006c00750064006500640029>
  6 +3: not enough bytes for character: �!after (! included) // <333a206e6f7420656e6f75676820627974657320666f72206368617261637465723a209f21616674657220282120696e636c7564656429>
7 7 4: not enough bytes left in file � // <feff0034003a0020006e006f007400200065006e006f0075006700680020006200790074006500730020006c00650066007400200069006e002000660069006c00650020fffd>
... ...
qpdf/qtest/qpdf/unicode.in
... ... @@ -3,3 +3,5 @@ If you wanted to, you could cook some sweet 🥔 π.
3 3 If you think wwwwww is good, you should try ʬʬʬʬʬʬ.
4 4 బంగాళాదుంప సలాడ్
5 5 𝄞 𝄢 𝄪 𝅂
  6 +This can be encoded in ASCII.
  7 +This can be encoded in PDFDocEncoding (€).
... ...
qpdf/qtest/qpdf/unicode.out
... ... @@ -3,3 +3,5 @@ If you wanted to, you could cook some sweet 🥔 π. // &lt;feff0049006600200079006
3 3 If you think wwwwww is good, you should try ʬʬʬʬʬʬ. // <feff0049006600200079006f00750020007400680069006e006b002000770077007700770077007700200069007300200067006f006f0064002c00200079006f0075002000730068006f0075006c00640020007400720079002002ac02ac02ac02ac02ac02ac002e>
4 4 బంగాళాదుంప సలాడ్ // <feff0c2c0c020c170c3e0c330c3e0c260c410c020c2a00200c380c320c3e0c210c4d>
5 5 𝄞 𝄢 𝄪 𝅂 // <feffd834dd1e0020d834dd220020d834dd2a0020d834dd42>
  6 +This can be encoded in ASCII. // <546869732063616e20626520656e636f64656420696e2041534349492e>
  7 +This can be encoded in PDFDocEncoding (€). // <546869732063616e20626520656e636f64656420696e20504446446f63456e636f64696e672028a0292e>
... ...