Commit 04edfe9fade7e77342f5e4fe694ee071554a119c
1 parent
63e5cb53
QPDFObjectHandle::newUnicodeString now uses UTF-16 only when needed
Use the first of ASCII, PDFDocEncoding, or UTF-16 that is capable of encoding the string.
Showing
14 changed files
with
1007 additions
and
963 deletions
ChangeLog
| 1 | +2021-01-23 Jay Berkenbilt <ejb@ql.org> | |
| 2 | + | |
| 3 | + * Change behavior of QPDFObjectHandle::newUnicodeString so that it | |
| 4 | + encodes as ASCII or PDFDocEncoding if those encodings will support | |
| 5 | + all the characters in the string, resorting to UTF-16 only if the | |
| 6 | + other encodings are insufficient. This is a cleaner implementation | |
| 7 | + of the intention of encoding strings for use outside of contents | |
| 8 | + and results in fewer instances of ASCII strings being needlessly | |
| 9 | + encoded as UTF-16. This change may cause qpdf to generate | |
| 10 | + different output from the same input when form field values are | |
| 11 | + set using methods from QPDFFormFieldObjectHelper. | |
| 12 | + | |
| 1 | 13 | 2021-01-16 Jay Berkenbilt <ejb@ql.org> |
| 2 | 14 | |
| 3 | 15 | * Add new constructors for QPDFNameTreeObjectHelper and | ... | ... |
examples/qtest/set-form-values/form-out.pdf
No preview for this file type
include/qpdf/QPDFObjectHandle.hh
| ... | ... | @@ -480,10 +480,12 @@ class QPDFObjectHandle |
| 480 | 480 | static QPDFObjectHandle newName(std::string const& name); |
| 481 | 481 | QPDF_DLL |
| 482 | 482 | static QPDFObjectHandle newString(std::string const& str); |
| 483 | - // Create a string encoded in UTF-16 from the given utf8-encoded | |
| 484 | - // string. Such strings are appropriately encoded to appear in PDF | |
| 485 | - // files outside of content streams, such as in document metadata | |
| 486 | - // form field values, page labels, outlines, and similar locations. | |
| 483 | + // Create a string encoded from the given utf8-encoded string | |
| 484 | + // appropriately encoded to appear in PDF files outside of content | |
| 485 | + // streams, such as in document metadata, form field values, page | |
| 486 | + // labels, outlines, and similar locations. We try ASCII first, | |
| 487 | + // then PDFDocEncoding, then UTF-16 as needed to successfully | |
| 488 | + // encode all the characters. | |
| 487 | 489 | QPDF_DLL |
| 488 | 490 | static QPDFObjectHandle newUnicodeString(std::string const& utf8_str); |
| 489 | 491 | QPDF_DLL | ... | ... |
libqpdf/QPDF_String.cc
| ... | ... | @@ -31,7 +31,13 @@ QPDF_String::~QPDF_String() |
| 31 | 31 | QPDF_String* |
| 32 | 32 | QPDF_String::new_utf16(std::string const& utf8_val) |
| 33 | 33 | { |
| 34 | - return new QPDF_String(QUtil::utf8_to_utf16(utf8_val)); | |
| 34 | + std::string result; | |
| 35 | + if (! (QUtil::utf8_to_ascii(utf8_val, result, '?') || | |
| 36 | + QUtil::utf8_to_pdf_doc(utf8_val, result, '?'))) | |
| 37 | + { | |
| 38 | + result = QUtil::utf8_to_utf16(utf8_val); | |
| 39 | + } | |
| 40 | + return new QPDF_String(result); | |
| 35 | 41 | } |
| 36 | 42 | |
| 37 | 43 | std::string | ... | ... |
manual/qpdf-manual.xml
| ... | ... | @@ -4824,6 +4824,26 @@ print "\n"; |
| 4824 | 4824 | <itemizedlist> |
| 4825 | 4825 | <listitem> |
| 4826 | 4826 | <para> |
| 4827 | + Behavior Changes | |
| 4828 | + </para> | |
| 4829 | + <itemizedlist> | |
| 4830 | + <listitem> | |
| 4831 | + <para> | |
| 4832 | + <function>QPDFObjectHandle::newUnicodeString</function> now | |
| 4833 | + uses whichever of ASCII, PDFDocEncoding, or UTF-16 is | |
| 4834 | + sufficient to encode all the characters in the string. This | |
| 4835 | + reduces needless encoding in UTF-16 of strings that can be | |
| 4836 | + encoded in ASCII. This change may cause qpdf to generate | |
| 4837 | + different output than before when form field values are set | |
| 4838 | + using <classname>QPDFFormFieldObjectHelper</classname>. | |
| 4839 | + </para> | |
| 4840 | + </listitem> | |
| 4841 | + </itemizedlist> | |
| 4842 | + </listitem> | |
| 4843 | + </itemizedlist> | |
| 4844 | + <itemizedlist> | |
| 4845 | + <listitem> | |
| 4846 | + <para> | |
| 4827 | 4847 | Library Enhancements |
| 4828 | 4848 | </para> |
| 4829 | 4849 | <itemizedlist> | ... | ... |
qpdf/qtest/qpdf/appearances-1.pdf
No preview for this file type
qpdf/qtest/qpdf/appearances-11.pdf
No preview for this file type
qpdf/qtest/qpdf/appearances-12.pdf
No preview for this file type
qpdf/qtest/qpdf/appearances-2.pdf
No preview for this file type
qpdf/qtest/qpdf/appearances-quack.pdf
No preview for this file type
qpdf/qtest/qpdf/form-no-need-appearances-filled.pdf
No preview for this file type
qpdf/qtest/qpdf/unicode-errors.out
| 1 | -This file has utf-8 encoding errors and should be edited as a binary file. // <feff0054006800690073002000660069006c006500200068006100730020007500740066002d003800200065006e0063006f00640069006e00670020006500720072006f0072007300200061006e0064002000730068006f0075006c0064002000620065002000650064006900740065006400200061007300200061002000620069006e006100720079002000660069006c0065002e> | |
| 2 | - // <feff> | |
| 1 | +This file has utf-8 encoding errors and should be edited as a binary file. // <546869732066696c6520686173207574662d3820656e636f64696e67206572726f727320616e642073686f756c642062652065646974656420617320612062696e6172792066696c652e> | |
| 2 | + // <> | |
| 3 | 3 | 0: too many bytes: �after // <feff0030003a00200074006f006f0020006d0061006e0079002000620079007400650073003a0020fffd00610066007400650072> |
| 4 | 4 | 1: too few bytes: �after // <feff0031003a00200074006f006f0020006600650077002000620079007400650073003a0020fffd00610066007400650072> |
| 5 | 5 | 2: invalid codepoint (U+DEAD): �after // <feff0032003a00200069006e00760061006c0069006400200063006f006400650070006f0069006e0074002000280055002b00440045004100440029003a0020fffd00610066007400650072> |
| 6 | -3: not enough bytes for character: �!after (! included) // <feff0033003a0020006e006f007400200065006e006f00750067006800200062007900740065007300200066006f00720020006300680061007200610063007400650072003a0020fffd00210061006600740065007200200028002100200069006e0063006c00750064006500640029> | |
| 6 | +3: not enough bytes for character: �!after (! included) // <333a206e6f7420656e6f75676820627974657320666f72206368617261637465723a209f21616674657220282120696e636c7564656429> | |
| 7 | 7 | 4: not enough bytes left in file � // <feff0034003a0020006e006f007400200065006e006f0075006700680020006200790074006500730020006c00650066007400200069006e002000660069006c00650020fffd> | ... | ... |
qpdf/qtest/qpdf/unicode.in
qpdf/qtest/qpdf/unicode.out
| ... | ... | @@ -3,3 +3,5 @@ If you wanted to, you could cook some sweet 🥔 π. // <feff0049006600200079006 |
| 3 | 3 | If you think wwwwww is good, you should try ʬʬʬʬʬʬ. // <feff0049006600200079006f00750020007400680069006e006b002000770077007700770077007700200069007300200067006f006f0064002c00200079006f0075002000730068006f0075006c00640020007400720079002002ac02ac02ac02ac02ac02ac002e> |
| 4 | 4 | బంగాళాదుంప సలాడ్ // <feff0c2c0c020c170c3e0c330c3e0c260c410c020c2a00200c380c320c3e0c210c4d> |
| 5 | 5 | 𝄞 𝄢 𝄪 𝅂 // <feffd834dd1e0020d834dd220020d834dd2a0020d834dd42> |
| 6 | +This can be encoded in ASCII. // <546869732063616e20626520656e636f64656420696e2041534349492e> | |
| 7 | +This can be encoded in PDFDocEncoding (€). // <546869732063616e20626520656e636f64656420696e20504446446f63456e636f64696e672028a0292e> | ... | ... |