Commit 3246923cf2189554f7c348ebf51c9774c09deec8
1 parent
16f4f94c
Implement JSON v2 for String
Also refine the heuristic for deciding whether to use hexadecimal notation for a string.
Showing
9 changed files
with
68 additions
and
52 deletions
libqpdf/QPDF_String.cc
| @@ -45,8 +45,32 @@ QPDF_String::unparse() | @@ -45,8 +45,32 @@ QPDF_String::unparse() | ||
| 45 | JSON | 45 | JSON |
| 46 | QPDF_String::getJSON(int json_version) | 46 | QPDF_String::getJSON(int json_version) |
| 47 | { | 47 | { |
| 48 | - // QXXXQ | ||
| 49 | - return JSON::makeString(getUTF8Val()); | 48 | + if (json_version == 1) { |
| 49 | + return JSON::makeString(getUTF8Val()); | ||
| 50 | + } | ||
| 51 | + // See if we can unambiguously represent as Unicode. | ||
| 52 | + bool is_unicode = false; | ||
| 53 | + std::string result; | ||
| 54 | + std::string candidate = getUTF8Val(); | ||
| 55 | + if (QUtil::is_utf16(this->val) || QUtil::is_explicit_utf8(this->val)) { | ||
| 56 | + is_unicode = true; | ||
| 57 | + result = candidate; | ||
| 58 | + } else if (!useHexString()) { | ||
| 59 | + std::string test; | ||
| 60 | + if (QUtil::utf8_to_pdf_doc(candidate, test, '?') && | ||
| 61 | + (test == this->val)) { | ||
| 62 | + // This is a PDF-doc string that can be losslessly encoded | ||
| 63 | + // as Unicode. | ||
| 64 | + is_unicode = true; | ||
| 65 | + result = candidate; | ||
| 66 | + } | ||
| 67 | + } | ||
| 68 | + if (is_unicode) { | ||
| 69 | + result = "u:" + result; | ||
| 70 | + } else { | ||
| 71 | + result = "b:" + QUtil::hex_encode(this->val); | ||
| 72 | + } | ||
| 73 | + return JSON::makeString(result); | ||
| 50 | } | 74 | } |
| 51 | 75 | ||
| 52 | QPDFObject::object_type_e | 76 | QPDFObject::object_type_e |
| @@ -61,41 +85,32 @@ QPDF_String::getTypeName() const | @@ -61,41 +85,32 @@ QPDF_String::getTypeName() const | ||
| 61 | return "string"; | 85 | return "string"; |
| 62 | } | 86 | } |
| 63 | 87 | ||
| 64 | -std::string | ||
| 65 | -QPDF_String::unparse(bool force_binary) | 88 | +bool |
| 89 | +QPDF_String::useHexString() const | ||
| 66 | { | 90 | { |
| 67 | - bool use_hexstring = force_binary; | ||
| 68 | - if (!use_hexstring) { | ||
| 69 | - unsigned int nonprintable = 0; | ||
| 70 | - int consecutive_printable = 0; | ||
| 71 | - for (unsigned int i = 0; i < this->val.length(); ++i) { | ||
| 72 | - char ch = this->val.at(i); | ||
| 73 | - // Note: do not use locale to determine printability. The | ||
| 74 | - // PDF specification accepts arbitrary binary data. Some | ||
| 75 | - // locales imply multibyte characters. We'll consider | ||
| 76 | - // something printable if it is printable in 7-bit ASCII. | ||
| 77 | - // We'll code this manually rather than being rude and | ||
| 78 | - // setting locale. | ||
| 79 | - if ((ch == 0) || | ||
| 80 | - (!(is_ascii_printable(ch) || strchr("\n\r\t\b\f", ch)))) { | ||
| 81 | - ++nonprintable; | ||
| 82 | - consecutive_printable = 0; | ||
| 83 | - } else { | ||
| 84 | - if (++consecutive_printable > 5) { | ||
| 85 | - // If there are more than 5 consecutive printable | ||
| 86 | - // characters, I want to see them as such. | ||
| 87 | - nonprintable = 0; | ||
| 88 | - break; | ||
| 89 | - } | 91 | + // Heuristic: use the hexadecimal representation of a string if |
| 92 | + // there are any non-printable (in PDF Doc encoding) characters or | ||
| 93 | + // if too large of a proportion of the string consists of | ||
| 94 | + // non-ASCII characters. | ||
| 95 | + bool nonprintable = false; | ||
| 96 | + unsigned int non_ascii = 0; | ||
| 97 | + for (unsigned int i = 0; i < this->val.length(); ++i) { | ||
| 98 | + char ch = this->val.at(i); | ||
| 99 | + if ((ch == 0) || | ||
| 100 | + (!(is_ascii_printable(ch) || strchr("\n\r\t\b\f", ch)))) { | ||
| 101 | + if ((ch >= 0) && (ch < 24)) { | ||
| 102 | + nonprintable = true; | ||
| 90 | } | 103 | } |
| 91 | - } | ||
| 92 | - | ||
| 93 | - // Use hex notation if more than 20% of the characters are not | ||
| 94 | - // printable in plain ASCII. | ||
| 95 | - if (5 * nonprintable > val.length()) { | ||
| 96 | - use_hexstring = true; | 104 | + ++non_ascii; |
| 97 | } | 105 | } |
| 98 | } | 106 | } |
| 107 | + return (nonprintable || (5 * non_ascii > val.length())); | ||
| 108 | +} | ||
| 109 | + | ||
| 110 | +std::string | ||
| 111 | +QPDF_String::unparse(bool force_binary) | ||
| 112 | +{ | ||
| 113 | + bool use_hexstring = force_binary || useHexString(); | ||
| 99 | std::string result; | 114 | std::string result; |
| 100 | if (use_hexstring) { | 115 | if (use_hexstring) { |
| 101 | result += "<" + QUtil::hex_encode(this->val) + ">"; | 116 | result += "<" + QUtil::hex_encode(this->val) + ">"; |
libqpdf/qpdf/QPDF_String.hh
qpdf/qtest/qpdf/V4-clearmeta.pdf
No preview for this file type
qpdf/qtest/qpdf/direct-pages-json-objects.out
| @@ -65,8 +65,8 @@ | @@ -65,8 +65,8 @@ | ||
| 65 | ], | 65 | ], |
| 66 | "trailer": { | 66 | "trailer": { |
| 67 | "/ID": [ | 67 | "/ID": [ |
| 68 | - "\u0013#¥fi|WzfsU…©6ŸÎ<", | ||
| 69 | - "7,¿DöÛ‹«`Ù&<\u000f\u000bÒj" | 68 | + "b:1323a5937c577a66735583a93698ce3c", |
| 69 | + "b:372cbf44f6db88ab60d9263c0f0bd26a" | ||
| 70 | ], | 70 | ], |
| 71 | "/Root": "1 0 R", | 71 | "/Root": "1 0 R", |
| 72 | "/Size": 7 | 72 | "/Size": 7 |
qpdf/qtest/qpdf/direct-pages-json-pages.out
| @@ -89,8 +89,8 @@ | @@ -89,8 +89,8 @@ | ||
| 89 | }, | 89 | }, |
| 90 | "trailer": { | 90 | "trailer": { |
| 91 | "/ID": [ | 91 | "/ID": [ |
| 92 | - "\u0013#¥fi|WzfsU…©6ŸÎ<", | ||
| 93 | - "7,¿DöÛ‹«`Ù&<\u000f\u000bÒj" | 92 | + "b:1323a5937c577a66735583a93698ce3c", |
| 93 | + "b:372cbf44f6db88ab60d9263c0f0bd26a" | ||
| 94 | ], | 94 | ], |
| 95 | "/Root": "1 0 R", | 95 | "/Root": "1 0 R", |
| 96 | "/Size": 7 | 96 | "/Size": 7 |
qpdf/qtest/qpdf/good14.out
| @@ -9,7 +9,7 @@ three lines | @@ -9,7 +9,7 @@ three lines | ||
| 9 | (string with \nCRLF and\nCR and\nLF) | 9 | (string with \nCRLF and\nCR and\nLF) |
| 10 | and another | 10 | and another |
| 11 | indentation | 11 | indentation |
| 12 | -(\001B%DEF)<01> | 12 | +<014225444546><01> |
| 13 | <8a8b> | 13 | <8a8b> |
| 14 | (ab) | 14 | (ab) |
| 15 | <8c><dd> ) > | 15 | <8c><dd> ) > |
qpdf/qtest/qpdf/merge-dict.out
| 1 | { | 1 | { |
| 2 | - "/k1": "scalar1", | 2 | + "/k1": "u:scalar1", |
| 3 | "/k2": 16059, | 3 | "/k2": 16059, |
| 4 | "/k3": { | 4 | "/k3": { |
| 5 | - "/a": "a", | ||
| 6 | - "/b": "conflict: seen", | 5 | + "/a": "u:a", |
| 6 | + "/b": "u:conflict: seen", | ||
| 7 | "/c": [ | 7 | "/c": [ |
| 8 | 2, | 8 | 2, |
| 9 | 3 | 9 | 3 |
| @@ -12,7 +12,7 @@ | @@ -12,7 +12,7 @@ | ||
| 12 | "/y": 25, | 12 | "/y": 25, |
| 13 | "/z": 26 | 13 | "/z": 26 |
| 14 | }, | 14 | }, |
| 15 | - "/e": "e" | 15 | + "/e": "u:e" |
| 16 | }, | 16 | }, |
| 17 | "/k4": { | 17 | "/k4": { |
| 18 | "/A": 65, | 18 | "/A": 65, |
| @@ -24,11 +24,11 @@ | @@ -24,11 +24,11 @@ | ||
| 24 | "/k5": [ | 24 | "/k5": [ |
| 25 | "/one", | 25 | "/one", |
| 26 | 2, | 26 | 2, |
| 27 | - "three", | 27 | + "u:three", |
| 28 | [ | 28 | [ |
| 29 | "/four" | 29 | "/four" |
| 30 | ], | 30 | ], |
| 31 | - "two" | 31 | + "u:two" |
| 32 | ] | 32 | ] |
| 33 | } | 33 | } |
| 34 | /A | 34 | /A |
qpdf/qtest/qpdf/page_api_2-json-objects.out
| @@ -9,8 +9,8 @@ | @@ -9,8 +9,8 @@ | ||
| 9 | "/Type": "/Catalog" | 9 | "/Type": "/Catalog" |
| 10 | }, | 10 | }, |
| 11 | "2 0 R": { | 11 | "2 0 R": { |
| 12 | - "/CreationDate": "D:20120621124041", | ||
| 13 | - "/Producer": "Apex PDFWriter" | 12 | + "/CreationDate": "u:D:20120621124041", |
| 13 | + "/Producer": "u:Apex PDFWriter" | ||
| 14 | }, | 14 | }, |
| 15 | "3 0 R": { | 15 | "3 0 R": { |
| 16 | "/Count": 3, | 16 | "/Count": 3, |
| @@ -77,8 +77,8 @@ | @@ -77,8 +77,8 @@ | ||
| 77 | "10 0 R": 47, | 77 | "10 0 R": 47, |
| 78 | "trailer": { | 78 | "trailer": { |
| 79 | "/ID": [ | 79 | "/ID": [ |
| 80 | - "û˘·ƒÿ{5⁄\u0005Ú−S*º‘o", | ||
| 81 | - "÷\u0017ž³QY¿ÔÀ\u000f\u0012−¼ý˜\u0002" | 80 | + "b:fb18b786ff7b358705da8a532aba8f6f", |
| 81 | + "b:f7179eb35159bfd4c00f128abcfd1f02" | ||
| 82 | ], | 82 | ], |
| 83 | "/Info": "2 0 R", | 83 | "/Info": "2 0 R", |
| 84 | "/Root": "1 0 R", | 84 | "/Root": "1 0 R", |
qpdf/qtest/qpdf/page_api_2-json-pages.out
| @@ -41,8 +41,8 @@ | @@ -41,8 +41,8 @@ | ||
| 41 | "/Type": "/Catalog" | 41 | "/Type": "/Catalog" |
| 42 | }, | 42 | }, |
| 43 | "2 0 R": { | 43 | "2 0 R": { |
| 44 | - "/CreationDate": "D:20120621124041", | ||
| 45 | - "/Producer": "Apex PDFWriter" | 44 | + "/CreationDate": "u:D:20120621124041", |
| 45 | + "/Producer": "u:Apex PDFWriter" | ||
| 46 | }, | 46 | }, |
| 47 | "3 0 R": { | 47 | "3 0 R": { |
| 48 | "/Count": 3, | 48 | "/Count": 3, |
| @@ -129,8 +129,8 @@ | @@ -129,8 +129,8 @@ | ||
| 129 | }, | 129 | }, |
| 130 | "trailer": { | 130 | "trailer": { |
| 131 | "/ID": [ | 131 | "/ID": [ |
| 132 | - "û˘·ƒÿ{5⁄\u0005Ú−S*º‘o", | ||
| 133 | - "÷\u0017ž³QY¿ÔÀ\u000f\u0012−¼ý˜\u0002" | 132 | + "b:fb18b786ff7b358705da8a532aba8f6f", |
| 133 | + "b:f7179eb35159bfd4c00f128abcfd1f02" | ||
| 134 | ], | 134 | ], |
| 135 | "/Info": "2 0 R", | 135 | "/Info": "2 0 R", |
| 136 | "/Root": "1 0 R", | 136 | "/Root": "1 0 R", |