Commit 952a665a4ed51400b5925e7cd69f08f0aeb374fe
1 parent
e44c395c
Better support for creating Unicode strings
Showing
12 changed files
with
172 additions
and
2 deletions
ChangeLog
| 1 | +2018-06-21 Jay Berkenbilt <ejb@ql.org> | ||
| 2 | + | ||
| 3 | + * Added QPDFObjectHandle::newUnicodeString and | ||
| 4 | + QPDFObjectHandle::unparseBinary to allow for more convenient creation | ||
| 5 | + of strings that are explicitly encoded in UTF-16 BE. This is useful | ||
| 6 | + for creating Unicode strings that appear outside of content streams, | ||
| 7 | + such as in page labels, outlines, form field values, etc. | ||
| 8 | + | ||
| 1 | 2018-06-20 Jay Berkenbilt <ejb@ql.org> | 9 | 2018-06-20 Jay Berkenbilt <ejb@ql.org> |
| 2 | 10 | ||
| 3 | * Added new classes QPDFAcroFormDocumentHelper, | 11 | * Added new classes QPDFAcroFormDocumentHelper, |
include/qpdf/QPDFObjectHandle.hh
| @@ -344,6 +344,12 @@ class QPDFObjectHandle | @@ -344,6 +344,12 @@ class QPDFObjectHandle | ||
| 344 | static QPDFObjectHandle newName(std::string const& name); | 344 | static QPDFObjectHandle newName(std::string const& name); |
| 345 | QPDF_DLL | 345 | QPDF_DLL |
| 346 | static QPDFObjectHandle newString(std::string const& str); | 346 | static QPDFObjectHandle newString(std::string const& str); |
| 347 | + // Create a string encoded in UTF-16 BE from the given UTF-8-encoded | ||
| 348 | + // string. Such strings are appropriately encoded to appear in PDF | ||
| 349 | + // files outside of content streams, such as in document metadata, | ||
| 350 | + // form field values, page labels, outlines, and similar locations. | ||
| 351 | + QPDF_DLL | ||
| 352 | + static QPDFObjectHandle newUnicodeString(std::string const& utf8_str); | ||
| 347 | QPDF_DLL | 353 | QPDF_DLL |
| 348 | static QPDFObjectHandle newOperator(std::string const&); | 354 | static QPDFObjectHandle newOperator(std::string const&); |
| 349 | QPDF_DLL | 355 | QPDF_DLL |
| @@ -715,6 +721,10 @@ class QPDFObjectHandle | @@ -715,6 +721,10 @@ class QPDFObjectHandle | ||
| 715 | std::string unparse(); | 721 | std::string unparse(); |
| 716 | QPDF_DLL | 722 | QPDF_DLL |
| 717 | std::string unparseResolved(); | 723 | std::string unparseResolved(); |
| 724 | + // For string objects, force binary representation. For all other | ||
| 725 | + // object types, this behaves the same as unparse(). | ||
| 726 | + QPDF_DLL | ||
| 727 | + std::string unparseBinary(); | ||
| 718 | 728 | ||
| 719 | // Legacy helper methods for commonly performed operations on | 729 | // Legacy helper methods for commonly performed operations on |
| 720 | // pages. Newer code should use QPDFPageObjectHelper instead. The | 730 | // pages. Newer code should use QPDFPageObjectHelper instead. The |
libqpdf/QPDFObjectHandle.cc
| @@ -1221,6 +1221,20 @@ QPDFObjectHandle::unparseResolved() | @@ -1221,6 +1221,20 @@ QPDFObjectHandle::unparseResolved() | ||
| 1221 | return this->m->obj->unparse(); | 1221 | return this->m->obj->unparse(); |
| 1222 | } | 1222 | } |
| 1223 | 1223 | ||
| 1224 | +std::string | ||
| 1225 | +QPDFObjectHandle::unparseBinary() | ||
| 1226 | +{ | ||
| 1227 | + if (this->isString()) | ||
| 1228 | + { | ||
| 1229 | + return dynamic_cast<QPDF_String*>( | ||
| 1230 | + this->m->obj.getPointer())->unparse(true); | ||
| 1231 | + } | ||
| 1232 | + else | ||
| 1233 | + { | ||
| 1234 | + return unparse(); | ||
| 1235 | + } | ||
| 1236 | +} | ||
| 1237 | + | ||
| 1224 | QPDFObjectHandle | 1238 | QPDFObjectHandle |
| 1225 | QPDFObjectHandle::parse(std::string const& object_str, | 1239 | QPDFObjectHandle::parse(std::string const& object_str, |
| 1226 | std::string const& object_description) | 1240 | std::string const& object_description) |
| @@ -1846,6 +1860,12 @@ QPDFObjectHandle::newString(std::string const& str) | @@ -1846,6 +1860,12 @@ QPDFObjectHandle::newString(std::string const& str) | ||
| 1846 | } | 1860 | } |
| 1847 | 1861 | ||
| 1848 | QPDFObjectHandle | 1862 | QPDFObjectHandle |
| 1863 | +QPDFObjectHandle::newUnicodeString(std::string const& utf8_str) | ||
| 1864 | +{ | ||
| 1865 | + return QPDFObjectHandle(QPDF_String::new_utf16(utf8_str)); | ||
| 1866 | +} | ||
| 1867 | + | ||
| 1868 | +QPDFObjectHandle | ||
| 1849 | QPDFObjectHandle::newOperator(std::string const& value) | 1869 | QPDFObjectHandle::newOperator(std::string const& value) |
| 1850 | { | 1870 | { |
| 1851 | return QPDFObjectHandle(new QPDF_Operator(value)); | 1871 | return QPDFObjectHandle(new QPDF_Operator(value)); |
libqpdf/QPDF_String.cc
| @@ -64,6 +64,58 @@ QPDF_String::~QPDF_String() | @@ -64,6 +64,58 @@ QPDF_String::~QPDF_String() | ||
| 64 | { | 64 | { |
| 65 | } | 65 | } |
| 66 | 66 | ||
| 67 | +QPDF_String* | ||
| 68 | +QPDF_String::new_utf16(std::string const& utf8_val) | ||
| 69 | +{ | ||
| 70 | + std::string result = "\xfe\xff"; | ||
| 71 | + size_t len = utf8_val.length(); | ||
| 72 | + for (size_t i = 0; i < len; ++i) | ||
| 73 | + { | ||
| 74 | + unsigned char ch = static_cast<unsigned char>(utf8_val.at(i)); | ||
| 75 | + if (ch < 128) | ||
| 76 | + { | ||
| 77 | + result += QUtil::toUTF16(ch); | ||
| 78 | + } | ||
| 79 | + else | ||
| 80 | + { | ||
| 81 | + size_t bytes_needed = 0; | ||
| 82 | + unsigned bit_check = 0x40; | ||
| 83 | + unsigned char to_clear = 0x80; | ||
| 84 | + while (ch & bit_check) | ||
| 85 | + { | ||
| 86 | + ++bytes_needed; | ||
| 87 | + to_clear |= bit_check; | ||
| 88 | + bit_check >>= 1; | ||
| 89 | + } | ||
| 90 | + | ||
| 91 | + if (((bytes_needed > 5) || (bytes_needed < 1)) || | ||
| 92 | + ((i + bytes_needed) >= len)) | ||
| 93 | + { | ||
| 94 | + result += "\xff\xfd"; | ||
| 95 | + } | ||
| 96 | + else | ||
| 97 | + { | ||
| 98 | + unsigned long codepoint = (ch & ~to_clear); | ||
| 99 | + while (bytes_needed > 0) | ||
| 100 | + { | ||
| 101 | + --bytes_needed; | ||
| 102 | + ch = utf8_val.at(++i); | ||
| 103 | + if ((ch & 0xc0) != 0x80) | ||
| 104 | + { | ||
| 105 | + --i; | ||
| 106 | + codepoint = 0xfffd; | ||
| 107 | + break; | ||
| 108 | + } | ||
| 109 | + codepoint <<= 6; | ||
| 110 | + codepoint += (ch & 0x3f); | ||
| 111 | + } | ||
| 112 | + result += QUtil::toUTF16(codepoint); | ||
| 113 | + } | ||
| 114 | + } | ||
| 115 | + } | ||
| 116 | + return new QPDF_String(result); | ||
| 117 | +} | ||
| 118 | + | ||
| 67 | std::string | 119 | std::string |
| 68 | QPDF_String::unparse() | 120 | QPDF_String::unparse() |
| 69 | { | 121 | { |
libqpdf/qpdf/QPDF_String.hh
| @@ -9,6 +9,7 @@ class QPDF_String: public QPDFObject | @@ -9,6 +9,7 @@ class QPDF_String: public QPDFObject | ||
| 9 | { | 9 | { |
| 10 | public: | 10 | public: |
| 11 | QPDF_String(std::string const& val); | 11 | QPDF_String(std::string const& val); |
| 12 | + static QPDF_String* new_utf16(std::string const& utf8_val); | ||
| 12 | virtual ~QPDF_String(); | 13 | virtual ~QPDF_String(); |
| 13 | virtual std::string unparse(); | 14 | virtual std::string unparse(); |
| 14 | virtual QPDFObject::object_type_e getTypeCode() const; | 15 | virtual QPDFObject::object_type_e getTypeCode() const; |
qpdf/build.mk
qpdf/qtest/qpdf.test
| @@ -84,13 +84,21 @@ flush_tiff_cache(); | @@ -84,13 +84,21 @@ flush_tiff_cache(); | ||
| 84 | 84 | ||
| 85 | show_ntests(); | 85 | show_ntests(); |
| 86 | # ---------- | 86 | # ---------- |
| 87 | -$td->notify("--- PDF Doc Encoding ---"); | ||
| 88 | -$n_tests += 1; | 87 | +$td->notify("--- Character Encoding ---"); |
| 88 | +$n_tests += 3; | ||
| 89 | 89 | ||
| 90 | $td->runtest("PDF doc encoding to Unicode", | 90 | $td->runtest("PDF doc encoding to Unicode", |
| 91 | {$td->COMMAND => "test_pdf_doc_encoding pdf-doc-to-utf8.in"}, | 91 | {$td->COMMAND => "test_pdf_doc_encoding pdf-doc-to-utf8.in"}, |
| 92 | {$td->FILE => "pdf-doc-to-utf8.out", $td->EXIT_STATUS => 0}, | 92 | {$td->FILE => "pdf-doc-to-utf8.out", $td->EXIT_STATUS => 0}, |
| 93 | $td->NORMALIZE_NEWLINES); | 93 | $td->NORMALIZE_NEWLINES); |
| 94 | +$td->runtest("UTF-16 encoding", | ||
| 95 | + {$td->COMMAND => "test_pdf_unicode unicode.in"}, | ||
| 96 | + {$td->FILE => "unicode.out", $td->EXIT_STATUS => 0}, | ||
| 97 | + $td->NORMALIZE_NEWLINES); | ||
| 98 | +$td->runtest("UTF-16 encoding errors", | ||
| 99 | + {$td->COMMAND => "test_pdf_unicode unicode-errors.in"}, | ||
| 100 | + {$td->FILE => "unicode-errors.out", $td->EXIT_STATUS => 0}, | ||
| 101 | + $td->NORMALIZE_NEWLINES); | ||
| 94 | 102 | ||
| 95 | show_ntests(); | 103 | show_ntests(); |
| 96 | # ---------- | 104 | # ---------- |
qpdf/qtest/qpdf/unicode-errors.in
0 → 100644
qpdf/qtest/qpdf/unicode-errors.out
0 → 100644
| 1 | +This file has utf-8 encoding errors and should be edited as a binary file. // <feff0054006800690073002000660069006c006500200068006100730020007500740066002d003800200065006e0063006f00640069006e00670020006500720072006f0072007300200061006e0064002000730068006f0075006c0064002000620065002000650064006900740065006400200061007300200061002000620069006e006100720079002000660069006c0065002e> | ||
| 2 | + // <feff> | ||
| 3 | +0: too many bytes: �after // <feff0030003a00200074006f006f0020006d0061006e0079002000620079007400650073003a0020fffd00610066007400650072> | ||
| 4 | +1: too few bytes: �after // <feff0031003a00200074006f006f0020006600650077002000620079007400650073003a0020fffd00610066007400650072> | ||
| 5 | +2: invalid codepoint (U+DEAD): �after // <feff0032003a00200069006e00760061006c0069006400200063006f006400650070006f0069006e0074002000280055002b00440045004100440029003a0020fffd00610066007400650072> | ||
| 6 | +3: not enough bytes for character: �!after (! included) // <feff0033003a0020006e006f007400200065006e006f00750067006800200062007900740065007300200066006f00720020006300680061007200610063007400650072003a0020fffd00210061006600740065007200200028002100200069006e0063006c00750064006500640029> | ||
| 7 | +4: not enough bytes left in file � // <feff0034003a0020006e006f007400200065006e006f0075006700680020006200790074006500730020006c00650066007400200069006e002000660069006c00650020fffd> |
qpdf/qtest/qpdf/unicode.in
0 → 100644
qpdf/qtest/qpdf/unicode.out
0 → 100644
| 1 | +This is a potato: 🥔 (u+01f954). // <feff00540068006900730020006900730020006100200070006f007400610074006f003a0020d83edd54002000280075002b0030003100660039003500340029002e> | ||
| 2 | +If you wanted to, you could cook some sweet 🥔 π. // <feff0049006600200079006f0075002000770061006e00740065006400200074006f002c00200079006f007500200063006f0075006c006400200063006f006f006b00200073006f006d00650020007300770065006500740020d83edd54002003c0002e> | ||
| 3 | +If you think wwwwww is good, you should try ʬʬʬʬʬʬ. // <feff0049006600200079006f00750020007400680069006e006b002000770077007700770077007700200069007300200067006f006f0064002c00200079006f0075002000730068006f0075006c00640020007400720079002002ac02ac02ac02ac02ac02ac002e> | ||
| 4 | +బంగాళాదుంప సలాడ్ // <feff0c2c0c020c170c3e0c330c3e0c260c410c020c2a00200c380c320c3e0c210c4d> | ||
| 5 | +𝄞 𝄢 𝄪 𝅂 // <feffd834dd1e0020d834dd220020d834dd2a0020d834dd42> |
qpdf/test_pdf_unicode.cc
0 → 100644
| 1 | +#include <qpdf/QUtil.hh> | ||
| 2 | +#include <qpdf/QPDFObjectHandle.hh> | ||
| 3 | +#include <iostream> | ||
| 4 | +#include <stdlib.h> | ||
| 5 | +#include <string.h> | ||
| 6 | + | ||
| 7 | +static char const* whoami = 0; | ||
| 8 | + | ||
| 9 | +void usage() | ||
| 10 | +{ | ||
| 11 | + std::cerr << "Usage: " << whoami << " infile" << std::endl; | ||
| 12 | + exit(2); | ||
| 13 | +} | ||
| 14 | + | ||
| 15 | +int main(int argc, char* argv[]) | ||
| 16 | +{ | ||
| 17 | + if ((whoami = strrchr(argv[0], '/')) == NULL) | ||
| 18 | + { | ||
| 19 | + whoami = argv[0]; | ||
| 20 | + } | ||
| 21 | + else | ||
| 22 | + { | ||
| 23 | + ++whoami; | ||
| 24 | + } | ||
| 25 | + // For libtool's sake.... | ||
| 26 | + if (strncmp(whoami, "lt-", 3) == 0) | ||
| 27 | + { | ||
| 28 | + whoami += 3; | ||
| 29 | + } | ||
| 30 | + | ||
| 31 | + if (argc != 2) | ||
| 32 | + { | ||
| 33 | + usage(); | ||
| 34 | + } | ||
| 35 | + char const* infilename = argv[1]; | ||
| 36 | + std::list<std::string> lines = | ||
| 37 | + QUtil::read_lines_from_file(infilename); | ||
| 38 | + for (std::list<std::string>::iterator iter = lines.begin(); | ||
| 39 | + iter != lines.end(); ++iter) | ||
| 40 | + { | ||
| 41 | + QPDFObjectHandle str = QPDFObjectHandle::newUnicodeString(*iter); | ||
| 42 | + std::cout << str.getUTF8Value() << " // " | ||
| 43 | + << str.unparseBinary() << std::endl; | ||
| 44 | + } | ||
| 45 | + return 0; | ||
| 46 | +} |