Commit 952a665a4ed51400b5925e7cd69f08f0aeb374fe

Authored by Jay Berkenbilt
1 parent e44c395c

Better support for creating Unicode strings

ChangeLog
  1 +2018-06-21 Jay Berkenbilt <ejb@ql.org>
  2 +
  3 + * Added QPDFObjectHandle::newUnicodeString and QPDFObjectHandle::unparseBinary
  4 + to allow for more convenient creation of strings that are
  5 + explicitly encoded in UTF-16 BE. This is useful for creating
  6 + Unicode strings that appear outside of content streams, such as in
  7 + page labels, outlines, form field values, etc.
  8 +
1 9 2018-06-20 Jay Berkenbilt <ejb@ql.org>
2 10  
3 11 * Added new classes QPDFAcroFormDocumentHelper,
... ...
include/qpdf/QPDFObjectHandle.hh
... ... @@ -344,6 +344,12 @@ class QPDFObjectHandle
344 344 static QPDFObjectHandle newName(std::string const& name);
345 345 QPDF_DLL
346 346 static QPDFObjectHandle newString(std::string const& str);
  347 + // Create a string encoded in UTF-16 from the given utf8-encoded
  348 + // string. Such strings are appropriately encoded to appear in PDF
  349 + // files outside of content streams, such as in document metadata,
  350 + // form field values, page labels, outlines, and similar locations.
  351 + QPDF_DLL
  352 + static QPDFObjectHandle newUnicodeString(std::string const& utf8_str);
347 353 QPDF_DLL
348 354 static QPDFObjectHandle newOperator(std::string const&);
349 355 QPDF_DLL
... ... @@ -715,6 +721,10 @@ class QPDFObjectHandle
715 721 std::string unparse();
716 722 QPDF_DLL
717 723 std::string unparseResolved();
  724 + // For strings only, force binary representation. Otherwise, same
  725 + // as unparse.
  726 + QPDF_DLL
  727 + std::string unparseBinary();
718 728  
719 729 // Legacy helper methods for commonly performed operations on
720 730 // pages. Newer code should use QPDFPageObjectHelper instead. The
... ...
libqpdf/QPDFObjectHandle.cc
... ... @@ -1221,6 +1221,20 @@ QPDFObjectHandle::unparseResolved()
1221 1221 return this->m->obj->unparse();
1222 1222 }
1223 1223  
  1224 +std::string
  1225 +QPDFObjectHandle::unparseBinary()
  1226 +{
  1227 + if (this->isString())
  1228 + {
  1229 + return dynamic_cast<QPDF_String*>(
  1230 + this->m->obj.getPointer())->unparse(true);
  1231 + }
  1232 + else
  1233 + {
  1234 + return unparse();
  1235 + }
  1236 +}
  1237 +
1224 1238 QPDFObjectHandle
1225 1239 QPDFObjectHandle::parse(std::string const& object_str,
1226 1240 std::string const& object_description)
... ... @@ -1846,6 +1860,12 @@ QPDFObjectHandle::newString(std::string const& str)
1846 1860 }
1847 1861  
1848 1862 QPDFObjectHandle
  1863 +QPDFObjectHandle::newUnicodeString(std::string const& utf8_str)
  1864 +{
  1865 + return QPDFObjectHandle(QPDF_String::new_utf16(utf8_str));
  1866 +}
  1867 +
  1868 +QPDFObjectHandle
1849 1869 QPDFObjectHandle::newOperator(std::string const& value)
1850 1870 {
1851 1871 return QPDFObjectHandle(new QPDF_Operator(value));
... ...
libqpdf/QPDF_String.cc
... ... @@ -64,6 +64,58 @@ QPDF_String::~QPDF_String()
64 64 {
65 65 }
66 66  
  67 +QPDF_String*
  68 +QPDF_String::new_utf16(std::string const& utf8_val)
  69 +{
  70 + std::string result = "\xfe\xff";
  71 + size_t len = utf8_val.length();
  72 + for (size_t i = 0; i < len; ++i)
  73 + {
  74 + unsigned char ch = static_cast<unsigned char>(utf8_val.at(i));
  75 + if (ch < 128)
  76 + {
  77 + result += QUtil::toUTF16(ch);
  78 + }
  79 + else
  80 + {
  81 + size_t bytes_needed = 0;
  82 + unsigned bit_check = 0x40;
  83 + unsigned char to_clear = 0x80;
  84 + while (ch & bit_check)
  85 + {
  86 + ++bytes_needed;
  87 + to_clear |= bit_check;
  88 + bit_check >>= 1;
  89 + }
  90 +
  91 + if (((bytes_needed > 5) || (bytes_needed < 1)) ||
  92 + ((i + bytes_needed) >= len))
  93 + {
  94 + result += "\xff\xfd";
  95 + }
  96 + else
  97 + {
  98 + unsigned long codepoint = (ch & ~to_clear);
  99 + while (bytes_needed > 0)
  100 + {
  101 + --bytes_needed;
  102 + ch = utf8_val.at(++i);
  103 + if ((ch & 0xc0) != 0x80)
  104 + {
  105 + --i;
  106 + codepoint = 0xfffd;
  107 + break;
  108 + }
  109 + codepoint <<= 6;
  110 + codepoint += (ch & 0x3f);
  111 + }
  112 + result += QUtil::toUTF16(codepoint);
  113 + }
  114 + }
  115 + }
  116 + return new QPDF_String(result);
  117 +}
  118 +
67 119 std::string
68 120 QPDF_String::unparse()
69 121 {
... ...
libqpdf/qpdf/QPDF_String.hh
... ... @@ -9,6 +9,7 @@ class QPDF_String: public QPDFObject
9 9 {
10 10 public:
11 11 QPDF_String(std::string const& val);
  12 + static QPDF_String* new_utf16(std::string const& utf8_val);
12 13 virtual ~QPDF_String();
13 14 virtual std::string unparse();
14 15 virtual QPDFObject::object_type_e getTypeCode() const;
... ...
qpdf/build.mk
... ... @@ -4,6 +4,7 @@ BINS_qpdf = \
4 4 test_driver \
5 5 test_large_file \
6 6 test_pdf_doc_encoding \
  7 + test_pdf_unicode \
7 8 test_tokenizer
8 9 CBINS_qpdf = qpdf-ctest
9 10  
... ...
qpdf/qtest/qpdf.test
... ... @@ -84,13 +84,21 @@ flush_tiff_cache();
84 84  
85 85 show_ntests();
86 86 # ----------
87   -$td->notify("--- PDF Doc Encoding ---");
88   -$n_tests += 1;
  87 +$td->notify("--- Character Encoding ---");
  88 +$n_tests += 3;
89 89  
90 90 $td->runtest("PDF doc encoding to Unicode",
91 91 {$td->COMMAND => "test_pdf_doc_encoding pdf-doc-to-utf8.in"},
92 92 {$td->FILE => "pdf-doc-to-utf8.out", $td->EXIT_STATUS => 0},
93 93 $td->NORMALIZE_NEWLINES);
  94 +$td->runtest("UTF-16 encoding",
  95 + {$td->COMMAND => "test_pdf_unicode unicode.in"},
  96 + {$td->FILE => "unicode.out", $td->EXIT_STATUS => 0},
  97 + $td->NORMALIZE_NEWLINES);
  98 +$td->runtest("UTF-16 encoding errors",
  99 + {$td->COMMAND => "test_pdf_unicode unicode-errors.in"},
  100 + {$td->FILE => "unicode-errors.out", $td->EXIT_STATUS => 0},
  101 + $td->NORMALIZE_NEWLINES);
94 102  
95 103 show_ntests();
96 104 # ----------
... ...
qpdf/qtest/qpdf/unicode-errors.in 0 → 100644
  1 +This file has utf-8 encoding errors and should be edited as a binary file.
  2 +
  3 +0: too many bytes: þafter
  4 +1: too few bytes: €after
  5 +2: invalid codepoint (U+DEAD): íº­after
  6 +3: not enough bytes for character: ð„!after (! included)
  7 +4: not enough bytes left in file ð
... ...
qpdf/qtest/qpdf/unicode-errors.out 0 → 100644
  1 +This file has utf-8 encoding errors and should be edited as a binary file. // <feff0054006800690073002000660069006c006500200068006100730020007500740066002d003800200065006e0063006f00640069006e00670020006500720072006f0072007300200061006e0064002000730068006f0075006c0064002000620065002000650064006900740065006400200061007300200061002000620069006e006100720079002000660069006c0065002e>
  2 + // <feff>
  3 +0: too many bytes: �after // <feff0030003a00200074006f006f0020006d0061006e0079002000620079007400650073003a0020fffd00610066007400650072>
  4 +1: too few bytes: �after // <feff0031003a00200074006f006f0020006600650077002000620079007400650073003a0020fffd00610066007400650072>
  5 +2: invalid codepoint (U+DEAD): �after // <feff0032003a00200069006e00760061006c0069006400200063006f006400650070006f0069006e0074002000280055002b00440045004100440029003a0020fffd00610066007400650072>
  6 +3: not enough bytes for character: �!after (! included) // <feff0033003a0020006e006f007400200065006e006f00750067006800200062007900740065007300200066006f00720020006300680061007200610063007400650072003a0020fffd00210061006600740065007200200028002100200069006e0063006c00750064006500640029>
  7 +4: not enough bytes left in file � // <feff0034003a0020006e006f007400200065006e006f0075006700680020006200790074006500730020006c00650066007400200069006e002000660069006c00650020fffd>
... ...
qpdf/qtest/qpdf/unicode.in 0 → 100644
  1 +This is a potato: 🥔 (u+01f954).
  2 +If you wanted to, you could cook some sweet 🥔 π.
  3 +If you think wwwwww is good, you should try ʬʬʬʬʬʬ.
  4 +బంగాళాదుంప సలాడ్
  5 +𝄞 𝄢 𝄪 𝅂
... ...
qpdf/qtest/qpdf/unicode.out 0 → 100644
  1 +This is a potato: 🥔 (u+01f954). // <feff00540068006900730020006900730020006100200070006f007400610074006f003a0020d83edd54002000280075002b0030003100660039003500340029002e>
  2 +If you wanted to, you could cook some sweet 🥔 π. // <feff0049006600200079006f0075002000770061006e00740065006400200074006f002c00200079006f007500200063006f0075006c006400200063006f006f006b00200073006f006d00650020007300770065006500740020d83edd54002003c0002e>
  3 +If you think wwwwww is good, you should try ʬʬʬʬʬʬ. // <feff0049006600200079006f00750020007400680069006e006b002000770077007700770077007700200069007300200067006f006f0064002c00200079006f0075002000730068006f0075006c00640020007400720079002002ac02ac02ac02ac02ac02ac002e>
  4 +బంగాళాదుంప సలాడ్ // <feff0c2c0c020c170c3e0c330c3e0c260c410c020c2a00200c380c320c3e0c210c4d>
  5 +𝄞 𝄢 𝄪 𝅂 // <feffd834dd1e0020d834dd220020d834dd2a0020d834dd42>
... ...
qpdf/test_pdf_unicode.cc 0 → 100644
  1 +#include <qpdf/QUtil.hh>
  2 +#include <qpdf/QPDFObjectHandle.hh>
  3 +#include <iostream>
  4 +#include <stdlib.h>
  5 +#include <string.h>
  6 +
  7 +static char const* whoami = 0;
  8 +
  9 +void usage()
  10 +{
  11 + std::cerr << "Usage: " << whoami << " infile" << std::endl;
  12 + exit(2);
  13 +}
  14 +
  15 +int main(int argc, char* argv[])
  16 +{
  17 + if ((whoami = strrchr(argv[0], '/')) == NULL)
  18 + {
  19 + whoami = argv[0];
  20 + }
  21 + else
  22 + {
  23 + ++whoami;
  24 + }
  25 + // For libtool's sake....
  26 + if (strncmp(whoami, "lt-", 3) == 0)
  27 + {
  28 + whoami += 3;
  29 + }
  30 +
  31 + if (argc != 2)
  32 + {
  33 + usage();
  34 + }
  35 + char const* infilename = argv[1];
  36 + std::list<std::string> lines =
  37 + QUtil::read_lines_from_file(infilename);
  38 + for (std::list<std::string>::iterator iter = lines.begin();
  39 + iter != lines.end(); ++iter)
  40 + {
  41 + QPDFObjectHandle str = QPDFObjectHandle::newUnicodeString(*iter);
  42 + std::cout << str.getUTF8Value() << " // "
  43 + << str.unparseBinary() << std::endl;
  44 + }
  45 + return 0;
  46 +}
... ...