Commit 3246923cf2189554f7c348ebf51c9774c09deec8

Authored by Jay Berkenbilt
1 parent 16f4f94c

Implement JSON v2 for String

Also refine the heuristic for deciding whether to use hexadecimal
notation for a string.
libqpdf/QPDF_String.cc
@@ -45,8 +45,32 @@ QPDF_String::unparse() @@ -45,8 +45,32 @@ QPDF_String::unparse()
45 JSON 45 JSON
46 QPDF_String::getJSON(int json_version) 46 QPDF_String::getJSON(int json_version)
47 { 47 {
48 - // QXXXQ  
49 - return JSON::makeString(getUTF8Val()); 48 + if (json_version == 1) {
  49 + return JSON::makeString(getUTF8Val());
  50 + }
  51 + // See if we can unambiguously represent as Unicode.
  52 + bool is_unicode = false;
  53 + std::string result;
  54 + std::string candidate = getUTF8Val();
  55 + if (QUtil::is_utf16(this->val) || QUtil::is_explicit_utf8(this->val)) {
  56 + is_unicode = true;
  57 + result = candidate;
  58 + } else if (!useHexString()) {
  59 + std::string test;
  60 + if (QUtil::utf8_to_pdf_doc(candidate, test, '?') &&
  61 + (test == this->val)) {
  62 + // This is a PDF-doc string that can be losslessly encoded
  63 + // as Unicode.
  64 + is_unicode = true;
  65 + result = candidate;
  66 + }
  67 + }
  68 + if (is_unicode) {
  69 + result = "u:" + result;
  70 + } else {
  71 + result = "b:" + QUtil::hex_encode(this->val);
  72 + }
  73 + return JSON::makeString(result);
50 } 74 }
51 75
52 QPDFObject::object_type_e 76 QPDFObject::object_type_e
@@ -61,41 +85,32 @@ QPDF_String::getTypeName() const @@ -61,41 +85,32 @@ QPDF_String::getTypeName() const
61 return "string"; 85 return "string";
62 } 86 }
63 87
64 -std::string  
65 -QPDF_String::unparse(bool force_binary) 88 +bool
  89 +QPDF_String::useHexString() const
66 { 90 {
67 - bool use_hexstring = force_binary;  
68 - if (!use_hexstring) {  
69 - unsigned int nonprintable = 0;  
70 - int consecutive_printable = 0;  
71 - for (unsigned int i = 0; i < this->val.length(); ++i) {  
72 - char ch = this->val.at(i);  
73 - // Note: do not use locale to determine printability. The  
74 - // PDF specification accepts arbitrary binary data. Some  
75 - // locales imply multibyte characters. We'll consider  
76 - // something printable if it is printable in 7-bit ASCII.  
77 - // We'll code this manually rather than being rude and  
78 - // setting locale.  
79 - if ((ch == 0) ||  
80 - (!(is_ascii_printable(ch) || strchr("\n\r\t\b\f", ch)))) {  
81 - ++nonprintable;  
82 - consecutive_printable = 0;  
83 - } else {  
84 - if (++consecutive_printable > 5) {  
85 - // If there are more than 5 consecutive printable  
86 - // characters, I want to see them as such.  
87 - nonprintable = 0;  
88 - break;  
89 - } 91 + // Heuristic: use the hexadecimal representation of a string if
  92 + // there are any non-printable (in PDF Doc encoding) characters or
  93 + // if too large of a proportion of the string consists of
  94 + // non-ASCII characters.
  95 + bool nonprintable = false;
  96 + unsigned int non_ascii = 0;
  97 + for (unsigned int i = 0; i < this->val.length(); ++i) {
  98 + char ch = this->val.at(i);
  99 + if ((ch == 0) ||
  100 + (!(is_ascii_printable(ch) || strchr("\n\r\t\b\f", ch)))) {
  101 + if ((ch >= 0) && (ch < 24)) {
  102 + nonprintable = true;
90 } 103 }
91 - }  
92 -  
93 - // Use hex notation if more than 20% of the characters are not  
94 - // printable in plain ASCII.  
95 - if (5 * nonprintable > val.length()) {  
96 - use_hexstring = true; 104 + ++non_ascii;
97 } 105 }
98 } 106 }
  107 + return (nonprintable || (5 * non_ascii > val.length()));
  108 +}
  109 +
  110 +std::string
  111 +QPDF_String::unparse(bool force_binary)
  112 +{
  113 + bool use_hexstring = force_binary || useHexString();
99 std::string result; 114 std::string result;
100 if (use_hexstring) { 115 if (use_hexstring) {
101 result += "<" + QUtil::hex_encode(this->val) + ">"; 116 result += "<" + QUtil::hex_encode(this->val) + ">";
libqpdf/qpdf/QPDF_String.hh
@@ -20,6 +20,7 @@ class QPDF_String: public QPDFObject @@ -20,6 +20,7 @@ class QPDF_String: public QPDFObject
20 std::string getUTF8Val() const; 20 std::string getUTF8Val() const;
21 21
22 private: 22 private:
  23 + bool useHexString() const;
23 std::string val; 24 std::string val;
24 }; 25 };
25 26
qpdf/qtest/qpdf/V4-clearmeta.pdf
No preview for this file type
qpdf/qtest/qpdf/direct-pages-json-objects.out
@@ -65,8 +65,8 @@ @@ -65,8 +65,8 @@
65 ], 65 ],
66 "trailer": { 66 "trailer": {
67 "/ID": [ 67 "/ID": [
68 - "\u0013#¥fi|WzfsU…©6ŸÎ<",  
69 - "7,¿DöÛ‹«`Ù&<\u000f\u000bÒj" 68 + "b:1323a5937c577a66735583a93698ce3c",
  69 + "b:372cbf44f6db88ab60d9263c0f0bd26a"
70 ], 70 ],
71 "/Root": "1 0 R", 71 "/Root": "1 0 R",
72 "/Size": 7 72 "/Size": 7
qpdf/qtest/qpdf/direct-pages-json-pages.out
@@ -89,8 +89,8 @@ @@ -89,8 +89,8 @@
89 }, 89 },
90 "trailer": { 90 "trailer": {
91 "/ID": [ 91 "/ID": [
92 - "\u0013#¥fi|WzfsU…©6ŸÎ<",  
93 - "7,¿DöÛ‹«`Ù&<\u000f\u000bÒj" 92 + "b:1323a5937c577a66735583a93698ce3c",
  93 + "b:372cbf44f6db88ab60d9263c0f0bd26a"
94 ], 94 ],
95 "/Root": "1 0 R", 95 "/Root": "1 0 R",
96 "/Size": 7 96 "/Size": 7
qpdf/qtest/qpdf/good14.out
@@ -9,7 +9,7 @@ three lines @@ -9,7 +9,7 @@ three lines
9 (string with \nCRLF and\nCR and\nLF) 9 (string with \nCRLF and\nCR and\nLF)
10 and another 10 and another
11 indentation 11 indentation
12 -(\001B%DEF)<01> 12 +<014225444546><01>
13 <8a8b> 13 <8a8b>
14 (ab) 14 (ab)
15 <8c><dd> ) > 15 <8c><dd> ) >
qpdf/qtest/qpdf/merge-dict.out
1 { 1 {
2 - "/k1": "scalar1", 2 + "/k1": "u:scalar1",
3 "/k2": 16059, 3 "/k2": 16059,
4 "/k3": { 4 "/k3": {
5 - "/a": "a",  
6 - "/b": "conflict: seen", 5 + "/a": "u:a",
  6 + "/b": "u:conflict: seen",
7 "/c": [ 7 "/c": [
8 2, 8 2,
9 3 9 3
@@ -12,7 +12,7 @@ @@ -12,7 +12,7 @@
12 "/y": 25, 12 "/y": 25,
13 "/z": 26 13 "/z": 26
14 }, 14 },
15 - "/e": "e" 15 + "/e": "u:e"
16 }, 16 },
17 "/k4": { 17 "/k4": {
18 "/A": 65, 18 "/A": 65,
@@ -24,11 +24,11 @@ @@ -24,11 +24,11 @@
24 "/k5": [ 24 "/k5": [
25 "/one", 25 "/one",
26 2, 26 2,
27 - "three", 27 + "u:three",
28 [ 28 [
29 "/four" 29 "/four"
30 ], 30 ],
31 - "two" 31 + "u:two"
32 ] 32 ]
33 } 33 }
34 /A 34 /A
qpdf/qtest/qpdf/page_api_2-json-objects.out
@@ -9,8 +9,8 @@ @@ -9,8 +9,8 @@
9 "/Type": "/Catalog" 9 "/Type": "/Catalog"
10 }, 10 },
11 "2 0 R": { 11 "2 0 R": {
12 - "/CreationDate": "D:20120621124041",  
13 - "/Producer": "Apex PDFWriter" 12 + "/CreationDate": "u:D:20120621124041",
  13 + "/Producer": "u:Apex PDFWriter"
14 }, 14 },
15 "3 0 R": { 15 "3 0 R": {
16 "/Count": 3, 16 "/Count": 3,
@@ -77,8 +77,8 @@ @@ -77,8 +77,8 @@
77 "10 0 R": 47, 77 "10 0 R": 47,
78 "trailer": { 78 "trailer": {
79 "/ID": [ 79 "/ID": [
80 - "û˘·ƒÿ{5⁄\u0005Ú−S*º‘o",  
81 - "÷\u0017ž³QY¿ÔÀ\u000f\u0012−¼ý˜\u0002" 80 + "b:fb18b786ff7b358705da8a532aba8f6f",
  81 + "b:f7179eb35159bfd4c00f128abcfd1f02"
82 ], 82 ],
83 "/Info": "2 0 R", 83 "/Info": "2 0 R",
84 "/Root": "1 0 R", 84 "/Root": "1 0 R",
qpdf/qtest/qpdf/page_api_2-json-pages.out
@@ -41,8 +41,8 @@ @@ -41,8 +41,8 @@
41 "/Type": "/Catalog" 41 "/Type": "/Catalog"
42 }, 42 },
43 "2 0 R": { 43 "2 0 R": {
44 - "/CreationDate": "D:20120621124041",  
45 - "/Producer": "Apex PDFWriter" 44 + "/CreationDate": "u:D:20120621124041",
  45 + "/Producer": "u:Apex PDFWriter"
46 }, 46 },
47 "3 0 R": { 47 "3 0 R": {
48 "/Count": 3, 48 "/Count": 3,
@@ -129,8 +129,8 @@ @@ -129,8 +129,8 @@
129 }, 129 },
130 "trailer": { 130 "trailer": {
131 "/ID": [ 131 "/ID": [
132 - "û˘·ƒÿ{5⁄\u0005Ú−S*º‘o",  
133 - "÷\u0017ž³QY¿ÔÀ\u000f\u0012−¼ý˜\u0002" 132 + "b:fb18b786ff7b358705da8a532aba8f6f",
  133 + "b:f7179eb35159bfd4c00f128abcfd1f02"
134 ], 134 ],
135 "/Info": "2 0 R", 135 "/Info": "2 0 R",
136 "/Root": "1 0 R", 136 "/Root": "1 0 R",