Commit 3246923cf2189554f7c348ebf51c9774c09deec8

Authored by Jay Berkenbilt
1 parent 16f4f94c

Implement JSON v2 for String

Also refine the heuristic for deciding whether to use hexadecimal
notation for a string.
libqpdf/QPDF_String.cc
... ... @@ -45,8 +45,32 @@ QPDF_String::unparse()
45 45 JSON
46 46 QPDF_String::getJSON(int json_version)
47 47 {
48   - // QXXXQ
49   - return JSON::makeString(getUTF8Val());
  48 + if (json_version == 1) {
  49 + return JSON::makeString(getUTF8Val());
  50 + }
  51 + // See if we can unambiguously represent as Unicode.
  52 + bool is_unicode = false;
  53 + std::string result;
  54 + std::string candidate = getUTF8Val();
  55 + if (QUtil::is_utf16(this->val) || QUtil::is_explicit_utf8(this->val)) {
  56 + is_unicode = true;
  57 + result = candidate;
  58 + } else if (!useHexString()) {
  59 + std::string test;
  60 + if (QUtil::utf8_to_pdf_doc(candidate, test, '?') &&
  61 + (test == this->val)) {
  62 + // This is a PDF-doc string that can be losslessly encoded
  63 + // as Unicode.
  64 + is_unicode = true;
  65 + result = candidate;
  66 + }
  67 + }
  68 + if (is_unicode) {
  69 + result = "u:" + result;
  70 + } else {
  71 + result = "b:" + QUtil::hex_encode(this->val);
  72 + }
  73 + return JSON::makeString(result);
50 74 }
51 75  
52 76 QPDFObject::object_type_e
... ... @@ -61,41 +85,32 @@ QPDF_String::getTypeName() const
61 85 return "string";
62 86 }
63 87  
64   -std::string
65   -QPDF_String::unparse(bool force_binary)
  88 +bool
  89 +QPDF_String::useHexString() const
66 90 {
67   - bool use_hexstring = force_binary;
68   - if (!use_hexstring) {
69   - unsigned int nonprintable = 0;
70   - int consecutive_printable = 0;
71   - for (unsigned int i = 0; i < this->val.length(); ++i) {
72   - char ch = this->val.at(i);
73   - // Note: do not use locale to determine printability. The
74   - // PDF specification accepts arbitrary binary data. Some
75   - // locales imply multibyte characters. We'll consider
76   - // something printable if it is printable in 7-bit ASCII.
77   - // We'll code this manually rather than being rude and
78   - // setting locale.
79   - if ((ch == 0) ||
80   - (!(is_ascii_printable(ch) || strchr("\n\r\t\b\f", ch)))) {
81   - ++nonprintable;
82   - consecutive_printable = 0;
83   - } else {
84   - if (++consecutive_printable > 5) {
85   - // If there are more than 5 consecutive printable
86   - // characters, I want to see them as such.
87   - nonprintable = 0;
88   - break;
89   - }
  91 + // Heuristic: use the hexadecimal representation of a string if
  92 + // there are any non-printable (in PDF Doc encoding) characters or
  93 + // if too large of a proportion of the string consists of
  94 + // non-ASCII characters.
  95 + bool nonprintable = false;
  96 + unsigned int non_ascii = 0;
  97 + for (unsigned int i = 0; i < this->val.length(); ++i) {
  98 + char ch = this->val.at(i);
  99 + if ((ch == 0) ||
  100 + (!(is_ascii_printable(ch) || strchr("\n\r\t\b\f", ch)))) {
  101 + if ((ch >= 0) && (ch < 24)) {
  102 + nonprintable = true;
90 103 }
91   - }
92   -
93   - // Use hex notation if more than 20% of the characters are not
94   - // printable in plain ASCII.
95   - if (5 * nonprintable > val.length()) {
96   - use_hexstring = true;
  104 + ++non_ascii;
97 105 }
98 106 }
  107 + return (nonprintable || (5 * non_ascii > val.length()));
  108 +}
  109 +
  110 +std::string
  111 +QPDF_String::unparse(bool force_binary)
  112 +{
  113 + bool use_hexstring = force_binary || useHexString();
99 114 std::string result;
100 115 if (use_hexstring) {
101 116 result += "<" + QUtil::hex_encode(this->val) + ">";
... ...
libqpdf/qpdf/QPDF_String.hh
... ... @@ -20,6 +20,7 @@ class QPDF_String: public QPDFObject
20 20 std::string getUTF8Val() const;
21 21  
22 22 private:
  23 + bool useHexString() const;
23 24 std::string val;
24 25 };
25 26  
... ...
qpdf/qtest/qpdf/V4-clearmeta.pdf
No preview for this file type
qpdf/qtest/qpdf/direct-pages-json-objects.out
... ... @@ -65,8 +65,8 @@
65 65 ],
66 66 "trailer": {
67 67 "/ID": [
68   - "\u0013#¥fi|WzfsU…©6ŸÎ<",
69   - "7,¿DöÛ‹«`Ù&<\u000f\u000bÒj"
  68 + "b:1323a5937c577a66735583a93698ce3c",
  69 + "b:372cbf44f6db88ab60d9263c0f0bd26a"
70 70 ],
71 71 "/Root": "1 0 R",
72 72 "/Size": 7
... ...
qpdf/qtest/qpdf/direct-pages-json-pages.out
... ... @@ -89,8 +89,8 @@
89 89 },
90 90 "trailer": {
91 91 "/ID": [
92   - "\u0013#¥fi|WzfsU…©6ŸÎ<",
93   - "7,¿DöÛ‹«`Ù&<\u000f\u000bÒj"
  92 + "b:1323a5937c577a66735583a93698ce3c",
  93 + "b:372cbf44f6db88ab60d9263c0f0bd26a"
94 94 ],
95 95 "/Root": "1 0 R",
96 96 "/Size": 7
... ...
qpdf/qtest/qpdf/good14.out
... ... @@ -9,7 +9,7 @@ three lines
9 9 (string with \nCRLF and\nCR and\nLF)
10 10 and another
11 11 indentation
12   -(\001B%DEF)<01>
  12 +<014225444546><01>
13 13 <8a8b>
14 14 (ab)
15 15 <8c><dd> ) >
... ...
qpdf/qtest/qpdf/merge-dict.out
1 1 {
2   - "/k1": "scalar1",
  2 + "/k1": "u:scalar1",
3 3 "/k2": 16059,
4 4 "/k3": {
5   - "/a": "a",
6   - "/b": "conflict: seen",
  5 + "/a": "u:a",
  6 + "/b": "u:conflict: seen",
7 7 "/c": [
8 8 2,
9 9 3
... ... @@ -12,7 +12,7 @@
12 12 "/y": 25,
13 13 "/z": 26
14 14 },
15   - "/e": "e"
  15 + "/e": "u:e"
16 16 },
17 17 "/k4": {
18 18 "/A": 65,
... ... @@ -24,11 +24,11 @@
24 24 "/k5": [
25 25 "/one",
26 26 2,
27   - "three",
  27 + "u:three",
28 28 [
29 29 "/four"
30 30 ],
31   - "two"
  31 + "u:two"
32 32 ]
33 33 }
34 34 /A
... ...
qpdf/qtest/qpdf/page_api_2-json-objects.out
... ... @@ -9,8 +9,8 @@
9 9 "/Type": "/Catalog"
10 10 },
11 11 "2 0 R": {
12   - "/CreationDate": "D:20120621124041",
13   - "/Producer": "Apex PDFWriter"
  12 + "/CreationDate": "u:D:20120621124041",
  13 + "/Producer": "u:Apex PDFWriter"
14 14 },
15 15 "3 0 R": {
16 16 "/Count": 3,
... ... @@ -77,8 +77,8 @@
77 77 "10 0 R": 47,
78 78 "trailer": {
79 79 "/ID": [
80   - "û˘·ƒÿ{5⁄\u0005Ú−S*º‘o",
81   - "÷\u0017ž³QY¿ÔÀ\u000f\u0012−¼ý˜\u0002"
  80 + "b:fb18b786ff7b358705da8a532aba8f6f",
  81 + "b:f7179eb35159bfd4c00f128abcfd1f02"
82 82 ],
83 83 "/Info": "2 0 R",
84 84 "/Root": "1 0 R",
... ...
qpdf/qtest/qpdf/page_api_2-json-pages.out
... ... @@ -41,8 +41,8 @@
41 41 "/Type": "/Catalog"
42 42 },
43 43 "2 0 R": {
44   - "/CreationDate": "D:20120621124041",
45   - "/Producer": "Apex PDFWriter"
  44 + "/CreationDate": "u:D:20120621124041",
  45 + "/Producer": "u:Apex PDFWriter"
46 46 },
47 47 "3 0 R": {
48 48 "/Count": 3,
... ... @@ -129,8 +129,8 @@
129 129 },
130 130 "trailer": {
131 131 "/ID": [
132   - "û˘·ƒÿ{5⁄\u0005Ú−S*º‘o",
133   - "÷\u0017ž³QY¿ÔÀ\u000f\u0012−¼ý˜\u0002"
  132 + "b:fb18b786ff7b358705da8a532aba8f6f",
  133 + "b:f7179eb35159bfd4c00f128abcfd1f02"
134 134 ],
135 135 "/Info": "2 0 R",
136 136 "/Root": "1 0 R",
... ...