Commit 3246923cf2189554f7c348ebf51c9774c09deec8
1 parent
16f4f94c
Implement JSON v2 for String
Also refine the heuristic for deciding whether to use hexadecimal notation for a string.
Showing
9 changed files
with
68 additions
and
52 deletions
libqpdf/QPDF_String.cc
| ... | ... | @@ -45,8 +45,32 @@ QPDF_String::unparse() |
| 45 | 45 | JSON |
| 46 | 46 | QPDF_String::getJSON(int json_version) |
| 47 | 47 | { |
| 48 | - // QXXXQ | |
| 49 | - return JSON::makeString(getUTF8Val()); | |
| 48 | + if (json_version == 1) { | |
| 49 | + return JSON::makeString(getUTF8Val()); | |
| 50 | + } | |
| 51 | + // See if we can unambiguously represent as Unicode. | |
| 52 | + bool is_unicode = false; | |
| 53 | + std::string result; | |
| 54 | + std::string candidate = getUTF8Val(); | |
| 55 | + if (QUtil::is_utf16(this->val) || QUtil::is_explicit_utf8(this->val)) { | |
| 56 | + is_unicode = true; | |
| 57 | + result = candidate; | |
| 58 | + } else if (!useHexString()) { | |
| 59 | + std::string test; | |
| 60 | + if (QUtil::utf8_to_pdf_doc(candidate, test, '?') && | |
| 61 | + (test == this->val)) { | |
| 62 | + // This is a PDF-doc string that can be losslessly encoded | |
| 63 | + // as Unicode. | |
| 64 | + is_unicode = true; | |
| 65 | + result = candidate; | |
| 66 | + } | |
| 67 | + } | |
| 68 | + if (is_unicode) { | |
| 69 | + result = "u:" + result; | |
| 70 | + } else { | |
| 71 | + result = "b:" + QUtil::hex_encode(this->val); | |
| 72 | + } | |
| 73 | + return JSON::makeString(result); | |
| 50 | 74 | } |
| 51 | 75 | |
| 52 | 76 | QPDFObject::object_type_e |
| ... | ... | @@ -61,41 +85,32 @@ QPDF_String::getTypeName() const |
| 61 | 85 | return "string"; |
| 62 | 86 | } |
| 63 | 87 | |
| 64 | -std::string | |
| 65 | -QPDF_String::unparse(bool force_binary) | |
| 88 | +bool | |
| 89 | +QPDF_String::useHexString() const | |
| 66 | 90 | { |
| 67 | - bool use_hexstring = force_binary; | |
| 68 | - if (!use_hexstring) { | |
| 69 | - unsigned int nonprintable = 0; | |
| 70 | - int consecutive_printable = 0; | |
| 71 | - for (unsigned int i = 0; i < this->val.length(); ++i) { | |
| 72 | - char ch = this->val.at(i); | |
| 73 | - // Note: do not use locale to determine printability. The | |
| 74 | - // PDF specification accepts arbitrary binary data. Some | |
| 75 | - // locales imply multibyte characters. We'll consider | |
| 76 | - // something printable if it is printable in 7-bit ASCII. | |
| 77 | - // We'll code this manually rather than being rude and | |
| 78 | - // setting locale. | |
| 79 | - if ((ch == 0) || | |
| 80 | - (!(is_ascii_printable(ch) || strchr("\n\r\t\b\f", ch)))) { | |
| 81 | - ++nonprintable; | |
| 82 | - consecutive_printable = 0; | |
| 83 | - } else { | |
| 84 | - if (++consecutive_printable > 5) { | |
| 85 | - // If there are more than 5 consecutive printable | |
| 86 | - // characters, I want to see them as such. | |
| 87 | - nonprintable = 0; | |
| 88 | - break; | |
| 89 | - } | |
| 91 | + // Heuristic: use the hexadecimal representation of a string if | |
| 92 | + // there are any non-printable (in PDF Doc encoding) characters or | |
| 93 | + // if too large of a proportion of the string consists of | |
| 94 | + // non-ASCII characters. | |
| 95 | + bool nonprintable = false; | |
| 96 | + unsigned int non_ascii = 0; | |
| 97 | + for (unsigned int i = 0; i < this->val.length(); ++i) { | |
| 98 | + char ch = this->val.at(i); | |
| 99 | + if ((ch == 0) || | |
| 100 | + (!(is_ascii_printable(ch) || strchr("\n\r\t\b\f", ch)))) { | |
| 101 | + if ((ch >= 0) && (ch < 24)) { | |
| 102 | + nonprintable = true; | |
| 90 | 103 | } |
| 91 | - } | |
| 92 | - | |
| 93 | - // Use hex notation if more than 20% of the characters are not | |
| 94 | - // printable in plain ASCII. | |
| 95 | - if (5 * nonprintable > val.length()) { | |
| 96 | - use_hexstring = true; | |
| 104 | + ++non_ascii; | |
| 97 | 105 | } |
| 98 | 106 | } |
| 107 | + return (nonprintable || (5 * non_ascii > val.length())); | |
| 108 | +} | |
| 109 | + | |
| 110 | +std::string | |
| 111 | +QPDF_String::unparse(bool force_binary) | |
| 112 | +{ | |
| 113 | + bool use_hexstring = force_binary || useHexString(); | |
| 99 | 114 | std::string result; |
| 100 | 115 | if (use_hexstring) { |
| 101 | 116 | result += "<" + QUtil::hex_encode(this->val) + ">"; | ... | ... |
libqpdf/qpdf/QPDF_String.hh
qpdf/qtest/qpdf/V4-clearmeta.pdf
No preview for this file type
qpdf/qtest/qpdf/direct-pages-json-objects.out
qpdf/qtest/qpdf/direct-pages-json-pages.out
qpdf/qtest/qpdf/good14.out
qpdf/qtest/qpdf/merge-dict.out
| 1 | 1 | { |
| 2 | - "/k1": "scalar1", | |
| 2 | + "/k1": "u:scalar1", | |
| 3 | 3 | "/k2": 16059, |
| 4 | 4 | "/k3": { |
| 5 | - "/a": "a", | |
| 6 | - "/b": "conflict: seen", | |
| 5 | + "/a": "u:a", | |
| 6 | + "/b": "u:conflict: seen", | |
| 7 | 7 | "/c": [ |
| 8 | 8 | 2, |
| 9 | 9 | 3 |
| ... | ... | @@ -12,7 +12,7 @@ |
| 12 | 12 | "/y": 25, |
| 13 | 13 | "/z": 26 |
| 14 | 14 | }, |
| 15 | - "/e": "e" | |
| 15 | + "/e": "u:e" | |
| 16 | 16 | }, |
| 17 | 17 | "/k4": { |
| 18 | 18 | "/A": 65, |
| ... | ... | @@ -24,11 +24,11 @@ |
| 24 | 24 | "/k5": [ |
| 25 | 25 | "/one", |
| 26 | 26 | 2, |
| 27 | - "three", | |
| 27 | + "u:three", | |
| 28 | 28 | [ |
| 29 | 29 | "/four" |
| 30 | 30 | ], |
| 31 | - "two" | |
| 31 | + "u:two" | |
| 32 | 32 | ] |
| 33 | 33 | } |
| 34 | 34 | /A | ... | ... |
qpdf/qtest/qpdf/page_api_2-json-objects.out
| ... | ... | @@ -9,8 +9,8 @@ |
| 9 | 9 | "/Type": "/Catalog" |
| 10 | 10 | }, |
| 11 | 11 | "2 0 R": { |
| 12 | - "/CreationDate": "D:20120621124041", | |
| 13 | - "/Producer": "Apex PDFWriter" | |
| 12 | + "/CreationDate": "u:D:20120621124041", | |
| 13 | + "/Producer": "u:Apex PDFWriter" | |
| 14 | 14 | }, |
| 15 | 15 | "3 0 R": { |
| 16 | 16 | "/Count": 3, |
| ... | ... | @@ -77,8 +77,8 @@ |
| 77 | 77 | "10 0 R": 47, |
| 78 | 78 | "trailer": { |
| 79 | 79 | "/ID": [ |
| 80 | - "û˘·ƒÿ{5⁄\u0005Ú−S*º‘o", | |
| 81 | - "÷\u0017ž³QY¿ÔÀ\u000f\u0012−¼ý˜\u0002" | |
| 80 | + "b:fb18b786ff7b358705da8a532aba8f6f", | |
| 81 | + "b:f7179eb35159bfd4c00f128abcfd1f02" | |
| 82 | 82 | ], |
| 83 | 83 | "/Info": "2 0 R", |
| 84 | 84 | "/Root": "1 0 R", | ... | ... |
qpdf/qtest/qpdf/page_api_2-json-pages.out
| ... | ... | @@ -41,8 +41,8 @@ |
| 41 | 41 | "/Type": "/Catalog" |
| 42 | 42 | }, |
| 43 | 43 | "2 0 R": { |
| 44 | - "/CreationDate": "D:20120621124041", | |
| 45 | - "/Producer": "Apex PDFWriter" | |
| 44 | + "/CreationDate": "u:D:20120621124041", | |
| 45 | + "/Producer": "u:Apex PDFWriter" | |
| 46 | 46 | }, |
| 47 | 47 | "3 0 R": { |
| 48 | 48 | "/Count": 3, |
| ... | ... | @@ -129,8 +129,8 @@ |
| 129 | 129 | }, |
| 130 | 130 | "trailer": { |
| 131 | 131 | "/ID": [ |
| 132 | - "û˘·ƒÿ{5⁄\u0005Ú−S*º‘o", | |
| 133 | - "÷\u0017ž³QY¿ÔÀ\u000f\u0012−¼ý˜\u0002" | |
| 132 | + "b:fb18b786ff7b358705da8a532aba8f6f", | |
| 133 | + "b:f7179eb35159bfd4c00f128abcfd1f02" | |
| 134 | 134 | ], |
| 135 | 135 | "/Info": "2 0 R", |
| 136 | 136 | "/Root": "1 0 R", | ... | ... |