Commit 952a665a4ed51400b5925e7cd69f08f0aeb374fe
1 parent
e44c395c
Better support for creating Unicode strings
Showing
12 changed files
with
172 additions
and
2 deletions
ChangeLog
| 1 | +2018-06-21 Jay Berkenbilt <ejb@ql.org> | |
| 2 | + | |
| 3 | + * Added QPDFObjectHandle::newUnicodeString and QPDFObjectHandle::unparseBinary | |
| 4 | + to allow for more convenient creation of strings that are | |
| 5 | + explicitly encoded in UTF-16 BE. This is useful for creating | |
| 6 | + Unicode strings that appear outside of content streams, such as in | |
| 7 | + page labels, outlines, form field values, etc. | |
| 8 | + | |
| 1 | 9 | 2018-06-20 Jay Berkenbilt <ejb@ql.org> |
| 2 | 10 | |
| 3 | 11 | * Added new classes QPDFAcroFormDocumentHelper, | ... | ... |
include/qpdf/QPDFObjectHandle.hh
| ... | ... | @@ -344,6 +344,12 @@ class QPDFObjectHandle |
| 344 | 344 | static QPDFObjectHandle newName(std::string const& name); |
| 345 | 345 | QPDF_DLL |
| 346 | 346 | static QPDFObjectHandle newString(std::string const& str); |
| 347 | + // Create a string encoded in UTF-16 from the given utf8-encoded | |
| 348 | + // string. Such strings are appropriately encoded to appear in PDF | |
| 349 | + // files outside of content streams, such as in document metadata, | |
| 350 | + // form field values, page labels, outlines, and similar locations. | |
| 351 | + QPDF_DLL | |
| 352 | + static QPDFObjectHandle newUnicodeString(std::string const& utf8_str); | |
| 347 | 353 | QPDF_DLL |
| 348 | 354 | static QPDFObjectHandle newOperator(std::string const&); |
| 349 | 355 | QPDF_DLL |
| ... | ... | @@ -715,6 +721,10 @@ class QPDFObjectHandle |
| 715 | 721 | std::string unparse(); |
| 716 | 722 | QPDF_DLL |
| 717 | 723 | std::string unparseResolved(); |
| 724 | + // For strings only, force binary representation. Otherwise, same | |
| 725 | + // as unparse. | |
| 726 | + QPDF_DLL | |
| 727 | + std::string unparseBinary(); | |
| 718 | 728 | |
| 719 | 729 | // Legacy helper methods for commonly performed operations on |
| 720 | 730 | // pages. Newer code should use QPDFPageObjectHelper instead. The | ... | ... |
libqpdf/QPDFObjectHandle.cc
| ... | ... | @@ -1221,6 +1221,20 @@ QPDFObjectHandle::unparseResolved() |
| 1221 | 1221 | return this->m->obj->unparse(); |
| 1222 | 1222 | } |
| 1223 | 1223 | |
| 1224 | +std::string | |
| 1225 | +QPDFObjectHandle::unparseBinary() | |
| 1226 | +{ | |
| 1227 | + if (this->isString()) | |
| 1228 | + { | |
| 1229 | + return dynamic_cast<QPDF_String*>( | |
| 1230 | + this->m->obj.getPointer())->unparse(true); | |
| 1231 | + } | |
| 1232 | + else | |
| 1233 | + { | |
| 1234 | + return unparse(); | |
| 1235 | + } | |
| 1236 | +} | |
| 1237 | + | |
| 1224 | 1238 | QPDFObjectHandle |
| 1225 | 1239 | QPDFObjectHandle::parse(std::string const& object_str, |
| 1226 | 1240 | std::string const& object_description) |
| ... | ... | @@ -1846,6 +1860,12 @@ QPDFObjectHandle::newString(std::string const& str) |
| 1846 | 1860 | } |
| 1847 | 1861 | |
| 1848 | 1862 | QPDFObjectHandle |
| 1863 | +QPDFObjectHandle::newUnicodeString(std::string const& utf8_str) | |
| 1864 | +{ | |
| 1865 | + return QPDFObjectHandle(QPDF_String::new_utf16(utf8_str)); | |
| 1866 | +} | |
| 1867 | + | |
| 1868 | +QPDFObjectHandle | |
| 1849 | 1869 | QPDFObjectHandle::newOperator(std::string const& value) |
| 1850 | 1870 | { |
| 1851 | 1871 | return QPDFObjectHandle(new QPDF_Operator(value)); | ... | ... |
libqpdf/QPDF_String.cc
| ... | ... | @@ -64,6 +64,58 @@ QPDF_String::~QPDF_String() |
| 64 | 64 | { |
| 65 | 65 | } |
| 66 | 66 | |
| 67 | +QPDF_String* | |
| 68 | +QPDF_String::new_utf16(std::string const& utf8_val) | |
| 69 | +{ | |
| 70 | + std::string result = "\xfe\xff"; | |
| 71 | + size_t len = utf8_val.length(); | |
| 72 | + for (size_t i = 0; i < len; ++i) | |
| 73 | + { | |
| 74 | + unsigned char ch = static_cast<unsigned char>(utf8_val.at(i)); | |
| 75 | + if (ch < 128) | |
| 76 | + { | |
| 77 | + result += QUtil::toUTF16(ch); | |
| 78 | + } | |
| 79 | + else | |
| 80 | + { | |
| 81 | + size_t bytes_needed = 0; | |
| 82 | + unsigned bit_check = 0x40; | |
| 83 | + unsigned char to_clear = 0x80; | |
| 84 | + while (ch & bit_check) | |
| 85 | + { | |
| 86 | + ++bytes_needed; | |
| 87 | + to_clear |= bit_check; | |
| 88 | + bit_check >>= 1; | |
| 89 | + } | |
| 90 | + | |
| 91 | + if (((bytes_needed > 5) || (bytes_needed < 1)) || | |
| 92 | + ((i + bytes_needed) >= len)) | |
| 93 | + { | |
| 94 | + result += "\xff\xfd"; | |
| 95 | + } | |
| 96 | + else | |
| 97 | + { | |
| 98 | + unsigned long codepoint = (ch & ~to_clear); | |
| 99 | + while (bytes_needed > 0) | |
| 100 | + { | |
| 101 | + --bytes_needed; | |
| 102 | + ch = utf8_val.at(++i); | |
| 103 | + if ((ch & 0xc0) != 0x80) | |
| 104 | + { | |
| 105 | + --i; | |
| 106 | + codepoint = 0xfffd; | |
| 107 | + break; | |
| 108 | + } | |
| 109 | + codepoint <<= 6; | |
| 110 | + codepoint += (ch & 0x3f); | |
| 111 | + } | |
| 112 | + result += QUtil::toUTF16(codepoint); | |
| 113 | + } | |
| 114 | + } | |
| 115 | + } | |
| 116 | + return new QPDF_String(result); | |
| 117 | +} | |
| 118 | + | |
| 67 | 119 | std::string |
| 68 | 120 | QPDF_String::unparse() |
| 69 | 121 | { | ... | ... |
libqpdf/qpdf/QPDF_String.hh
| ... | ... | @@ -9,6 +9,7 @@ class QPDF_String: public QPDFObject |
| 9 | 9 | { |
| 10 | 10 | public: |
| 11 | 11 | QPDF_String(std::string const& val); |
| 12 | + static QPDF_String* new_utf16(std::string const& utf8_val); | |
| 12 | 13 | virtual ~QPDF_String(); |
| 13 | 14 | virtual std::string unparse(); |
| 14 | 15 | virtual QPDFObject::object_type_e getTypeCode() const; | ... | ... |
qpdf/build.mk
qpdf/qtest/qpdf.test
| ... | ... | @@ -84,13 +84,21 @@ flush_tiff_cache(); |
| 84 | 84 | |
| 85 | 85 | show_ntests(); |
| 86 | 86 | # ---------- |
| 87 | -$td->notify("--- PDF Doc Encoding ---"); | |
| 88 | -$n_tests += 1; | |
| 87 | +$td->notify("--- Character Encoding ---"); | |
| 88 | +$n_tests += 3; | |
| 89 | 89 | |
| 90 | 90 | $td->runtest("PDF doc encoding to Unicode", |
| 91 | 91 | {$td->COMMAND => "test_pdf_doc_encoding pdf-doc-to-utf8.in"}, |
| 92 | 92 | {$td->FILE => "pdf-doc-to-utf8.out", $td->EXIT_STATUS => 0}, |
| 93 | 93 | $td->NORMALIZE_NEWLINES); |
| 94 | +$td->runtest("UTF-16 encoding", | |
| 95 | + {$td->COMMAND => "test_pdf_unicode unicode.in"}, | |
| 96 | + {$td->FILE => "unicode.out", $td->EXIT_STATUS => 0}, | |
| 97 | + $td->NORMALIZE_NEWLINES); | |
| 98 | +$td->runtest("UTF-16 encoding errors", | |
| 99 | + {$td->COMMAND => "test_pdf_unicode unicode-errors.in"}, | |
| 100 | + {$td->FILE => "unicode-errors.out", $td->EXIT_STATUS => 0}, | |
| 101 | + $td->NORMALIZE_NEWLINES); | |
| 94 | 102 | |
| 95 | 103 | show_ntests(); |
| 96 | 104 | # ---------- | ... | ... |
qpdf/qtest/qpdf/unicode-errors.in
0 → 100644
qpdf/qtest/qpdf/unicode-errors.out
0 → 100644
| 1 | +This file has utf-8 encoding errors and should be edited as a binary file. // <feff0054006800690073002000660069006c006500200068006100730020007500740066002d003800200065006e0063006f00640069006e00670020006500720072006f0072007300200061006e0064002000730068006f0075006c0064002000620065002000650064006900740065006400200061007300200061002000620069006e006100720079002000660069006c0065002e> | |
| 2 | + // <feff> | |
| 3 | +0: too many bytes: �after // <feff0030003a00200074006f006f0020006d0061006e0079002000620079007400650073003a0020fffd00610066007400650072> | |
| 4 | +1: too few bytes: �after // <feff0031003a00200074006f006f0020006600650077002000620079007400650073003a0020fffd00610066007400650072> | |
| 5 | +2: invalid codepoint (U+DEAD): �after // <feff0032003a00200069006e00760061006c0069006400200063006f006400650070006f0069006e0074002000280055002b00440045004100440029003a0020fffd00610066007400650072> | |
| 6 | +3: not enough bytes for character: �!after (! included) // <feff0033003a0020006e006f007400200065006e006f00750067006800200062007900740065007300200066006f00720020006300680061007200610063007400650072003a0020fffd00210061006600740065007200200028002100200069006e0063006c00750064006500640029> | |
| 7 | +4: not enough bytes left in file � // <feff0034003a0020006e006f007400200065006e006f0075006700680020006200790074006500730020006c00650066007400200069006e002000660069006c00650020fffd> | ... | ... |
qpdf/qtest/qpdf/unicode.in
0 → 100644
qpdf/qtest/qpdf/unicode.out
0 → 100644
| 1 | +This is a potato: 🥔 (u+01f954). // <feff00540068006900730020006900730020006100200070006f007400610074006f003a0020d83edd54002000280075002b0030003100660039003500340029002e> | |
| 2 | +If you wanted to, you could cook some sweet 🥔 π. // <feff0049006600200079006f0075002000770061006e00740065006400200074006f002c00200079006f007500200063006f0075006c006400200063006f006f006b00200073006f006d00650020007300770065006500740020d83edd54002003c0002e> | |
| 3 | +If you think wwwwww is good, you should try ʬʬʬʬʬʬ. // <feff0049006600200079006f00750020007400680069006e006b002000770077007700770077007700200069007300200067006f006f0064002c00200079006f0075002000730068006f0075006c00640020007400720079002002ac02ac02ac02ac02ac02ac002e> | |
| 4 | +బంగాళాదుంప సలాడ్ // <feff0c2c0c020c170c3e0c330c3e0c260c410c020c2a00200c380c320c3e0c210c4d> | |
| 5 | +𝄞 𝄢 𝄪 𝅂 // <feffd834dd1e0020d834dd220020d834dd2a0020d834dd42> | ... | ... |
qpdf/test_pdf_unicode.cc
0 → 100644
| 1 | +#include <qpdf/QUtil.hh> | |
| 2 | +#include <qpdf/QPDFObjectHandle.hh> | |
| 3 | +#include <iostream> | |
| 4 | +#include <stdlib.h> | |
| 5 | +#include <string.h> | |
| 6 | + | |
| 7 | +static char const* whoami = 0; | |
| 8 | + | |
| 9 | +void usage() | |
| 10 | +{ | |
| 11 | + std::cerr << "Usage: " << whoami << " infile" << std::endl; | |
| 12 | + exit(2); | |
| 13 | +} | |
| 14 | + | |
| 15 | +int main(int argc, char* argv[]) | |
| 16 | +{ | |
| 17 | + if ((whoami = strrchr(argv[0], '/')) == NULL) | |
| 18 | + { | |
| 19 | + whoami = argv[0]; | |
| 20 | + } | |
| 21 | + else | |
| 22 | + { | |
| 23 | + ++whoami; | |
| 24 | + } | |
| 25 | + // For libtool's sake.... | |
| 26 | + if (strncmp(whoami, "lt-", 3) == 0) | |
| 27 | + { | |
| 28 | + whoami += 3; | |
| 29 | + } | |
| 30 | + | |
| 31 | + if (argc != 2) | |
| 32 | + { | |
| 33 | + usage(); | |
| 34 | + } | |
| 35 | + char const* infilename = argv[1]; | |
| 36 | + std::list<std::string> lines = | |
| 37 | + QUtil::read_lines_from_file(infilename); | |
| 38 | + for (std::list<std::string>::iterator iter = lines.begin(); | |
| 39 | + iter != lines.end(); ++iter) | |
| 40 | + { | |
| 41 | + QPDFObjectHandle str = QPDFObjectHandle::newUnicodeString(*iter); | |
| 42 | + std::cout << str.getUTF8Value() << " // " | |
| 43 | + << str.unparseBinary() << std::endl; | |
| 44 | + } | |
| 45 | + return 0; | |
| 46 | +} | ... | ... |