Commit e87d149918ed6ed211f733f932df3b62ab445c12
1 parent
997f4ab6
Add QUtil::possible_repaired_encodings
Showing
5 changed files
with
158 additions
and
0 deletions
ChangeLog
| ... | ... | @@ -14,6 +14,14 @@ |
| 14 | 14 | the first bug in qpdf's history that could result in silent loss |
| 15 | 15 | of data when processing a correct input file. Fixes #276. |
| 16 | 16 | |
| 17 | +2019-01-15 Jay Berkenbilt <ejb@ql.org> | |
| 18 | + | |
| 19 | + * Add QUtil::possible_repaired_encodings which, given a string, | |
| 20 | + generates other strings that represent re-interpretation of the | |
| 21 | + bytes in a different coding system. This is used to help recover | |
| 22 | + passwords if the password string was improperly encoded on a | |
| 23 | + different system due to user error or a software bug. | |
| 24 | + | |
| 17 | 25 | 2019-01-14 Jay Berkenbilt <ejb@ql.org> |
| 18 | 26 | |
| 19 | 27 | * Add new CLI flags to 128-bit and 256-bit encryption: --assemble, | ... | ... |
include/qpdf/QUtil.hh
| ... | ... | @@ -223,6 +223,28 @@ namespace QUtil |
| 223 | 223 | bool& is_valid_utf8, |
| 224 | 224 | bool& is_utf16); |
| 225 | 225 | |
| 226 | + // Try to compensate for previously incorrectly encoded strings. | |
| 227 | + // We want to compensate for the following errors: | |
| 228 | + // | |
| 229 | + // * The string was supposed to be UTF-8 but was one of the | |
| 230 | + // single-byte encodings | |
| 231 | + // * The string was supposed to be PDF Doc but was either UTF-8 or | |
| 232 | + // one of the other single-byte encodings | |
| 233 | + // | |
| 234 | + // The returned vector always contains the original string first, | |
| 235 | + // and then it contains what the correct string would be in the | |
| 236 | + // event that the original string was the result of any of the | |
| 237 | + // above errors. | |
| 238 | + // | |
| 239 | + // This method is useful for attempting to recover a password that | |
| 240 | + // may have been previously incorrectly encoded. For example, if the | |
| 241 | + // password was supposed to be UTF-8 but the previous application | |
| 242 | + // used a password encoded in WinAnsi, or if the previous password | |
| 243 | + // was supposed to be PDFDoc but was actually given as UTF-8 or | |
| 244 | + // WinAnsi, this method would find the correct password. | |
| 245 | + QPDF_DLL | |
| 246 | + std::vector<std::string> possible_repaired_encodings(std::string); | |
| 247 | + | |
| 226 | 248 | // If secure random number generation is supported on your |
| 227 | 249 | // platform and qpdf was not compiled with insecure random number |
| 228 | 250 | // generation, this returns a cryptographically secure random | ... | ... |
libqpdf/QUtil.cc
| ... | ... | @@ -15,6 +15,7 @@ |
| 15 | 15 | #include <sstream> |
| 16 | 16 | #include <fstream> |
| 17 | 17 | #include <stdexcept> |
| 18 | +#include <set> | |
| 18 | 19 | #include <stdio.h> |
| 19 | 20 | #include <errno.h> |
| 20 | 21 | #include <ctype.h> |
| ... | ... | @@ -1992,3 +1993,95 @@ QUtil::analyze_encoding(std::string const& val, |
| 1992 | 1993 | is_valid_utf8 = true; |
| 1993 | 1994 | } |
| 1994 | 1995 | } |
| 1996 | + | |
| 1997 | +std::vector<std::string> | |
| 1998 | +QUtil::possible_repaired_encodings(std::string supplied) | |
| 1999 | +{ | |
| 2000 | + std::vector<std::string> result; | |
| 2001 | + // Always include the original string | |
| 2002 | + result.push_back(supplied); | |
| 2003 | + bool has_8bit_chars = false; | |
| 2004 | + bool is_valid_utf8 = false; | |
| 2005 | + bool is_utf16 = false; | |
| 2006 | + analyze_encoding(supplied, has_8bit_chars, is_valid_utf8, is_utf16); | |
| 2007 | + if (! has_8bit_chars) | |
| 2008 | + { | |
| 2009 | + return result; | |
| 2010 | + } | |
| 2011 | + if (is_utf16) | |
| 2012 | + { | |
| 2013 | + // Convert to UTF-8 and pretend we got a UTF-8 string. | |
| 2014 | + is_utf16 = false; | |
| 2015 | + is_valid_utf8 = true; | |
| 2016 | + supplied = utf16_to_utf8(supplied); | |
| 2017 | + } | |
| 2018 | + std::string output; | |
| 2019 | + if (is_valid_utf8) | |
| 2020 | + { | |
| 2021 | + // Maybe we were given UTF-8 but wanted one of the single-byte | |
| 2022 | + // encodings. | |
| 2023 | + if (utf8_to_pdf_doc(supplied, output)) | |
| 2024 | + { | |
| 2025 | + result.push_back(output); | |
| 2026 | + } | |
| 2027 | + if (utf8_to_win_ansi(supplied, output)) | |
| 2028 | + { | |
| 2029 | + result.push_back(output); | |
| 2030 | + } | |
| 2031 | + if (utf8_to_mac_roman(supplied, output)) | |
| 2032 | + { | |
| 2033 | + result.push_back(output); | |
| 2034 | + } | |
| 2035 | + } | |
| 2036 | + else | |
| 2037 | + { | |
| 2038 | + // Maybe we were given one of the single-byte encodings but | |
| 2039 | + // wanted UTF-8. | |
| 2040 | + std::string from_pdf_doc(pdf_doc_to_utf8(supplied)); | |
| 2041 | + result.push_back(from_pdf_doc); | |
| 2042 | + std::string from_win_ansi(win_ansi_to_utf8(supplied)); | |
| 2043 | + result.push_back(from_win_ansi); | |
| 2044 | + std::string from_mac_roman(mac_roman_to_utf8(supplied)); | |
| 2045 | + result.push_back(from_mac_roman); | |
| 2046 | + | |
| 2047 | + // Maybe we were given one of the other single-byte encodings | |
| 2048 | + // but wanted one of the other ones. | |
| 2049 | + if (utf8_to_win_ansi(from_pdf_doc, output)) | |
| 2050 | + { | |
| 2051 | + result.push_back(output); | |
| 2052 | + } | |
| 2053 | + if (utf8_to_mac_roman(from_pdf_doc, output)) | |
| 2054 | + { | |
| 2055 | + result.push_back(output); | |
| 2056 | + } | |
| 2057 | + if (utf8_to_pdf_doc(from_win_ansi, output)) | |
| 2058 | + { | |
| 2059 | + result.push_back(output); | |
| 2060 | + } | |
| 2061 | + if (utf8_to_mac_roman(from_win_ansi, output)) | |
| 2062 | + { | |
| 2063 | + result.push_back(output); | |
| 2064 | + } | |
| 2065 | + if (utf8_to_pdf_doc(from_mac_roman, output)) | |
| 2066 | + { | |
| 2067 | + result.push_back(output); | |
| 2068 | + } | |
| 2069 | + if (utf8_to_win_ansi(from_mac_roman, output)) | |
| 2070 | + { | |
| 2071 | + result.push_back(output); | |
| 2072 | + } | |
| 2073 | + } | |
| 2074 | + // De-duplicate | |
| 2075 | + std::vector<std::string> t; | |
| 2076 | + std::set<std::string> seen; | |
| 2077 | + for (std::vector<std::string>::iterator iter = result.begin(); | |
| 2078 | + iter != result.end(); ++iter) | |
| 2079 | + { | |
| 2080 | + if (! seen.count(*iter)) | |
| 2081 | + { | |
| 2082 | + seen.insert(*iter); | |
| 2083 | + t.push_back(*iter); | |
| 2084 | + } | |
| 2085 | + } | |
| 2086 | + return t; | |
| 2087 | +} | ... | ... |
libtests/qtest/qutil/qutil.out
| ... | ... | @@ -58,6 +58,19 @@ bidirectional pdf doc done |
| 58 | 58 | bidirectional win ansi done |
| 59 | 59 | bidirectional mac roman done |
| 60 | 60 | analysis done |
| 61 | +alternatives | |
| 62 | +0: 86a9e99e | |
| 63 | +1: c692c2a9c3a9c5be | |
| 64 | +2: e280a0c2a9c3a9c5be | |
| 65 | +3: c39cc2a9c388c3bb | |
| 66 | +4: 83a9e99e | |
| 67 | +5: 81a9e99e | |
| 68 | +6: dca9c8fb | |
| 69 | +0: c692c2a9c3a9c5be | |
| 70 | +1: 86a9e99e | |
| 71 | +2: 83a9e99e | |
| 72 | +0: 717561636b | |
| 73 | +done alternatives | |
| 61 | 74 | ---- whoami |
| 62 | 75 | quack1 |
| 63 | 76 | quack2 | ... | ... |
libtests/qutil.cc
| ... | ... | @@ -276,6 +276,16 @@ void check_analyze(std::string const& str, bool has8bit, bool utf8, bool utf16) |
| 276 | 276 | } |
| 277 | 277 | } |
| 278 | 278 | |
| 279 | +void print_alternatives(std::string const& str) | |
| 280 | +{ | |
| 281 | + std::vector<std::string> result = QUtil::possible_repaired_encodings(str); | |
| 282 | + size_t n = result.size(); | |
| 283 | + for (size_t i = 0; i < n; ++i) | |
| 284 | + { | |
| 285 | + std::cout << i << ": " << QUtil::hex_encode(result.at(i)) << std::endl; | |
| 286 | + } | |
| 287 | +} | |
| 288 | + | |
| 279 | 289 | void transcoding_test() |
| 280 | 290 | { |
| 281 | 291 | transcoding_test(&QUtil::pdf_doc_to_utf8, |
| ... | ... | @@ -308,6 +318,18 @@ void transcoding_test() |
| 308 | 318 | assert(QUtil::utf8_to_pdf_doc(input1, output)); |
| 309 | 319 | assert(! QUtil::utf8_to_pdf_doc(input2, output)); |
| 310 | 320 | assert(QUtil::utf8_to_pdf_doc(input3, output)); |
| 321 | + std::cout << "alternatives" << std::endl; | |
| 322 | + // char name mac win pdf-doc | |
| 323 | + // U+0192 florin 304 203 206 | |
| 324 | + // U+00A9 copyright 251 251 251 | |
| 325 | + // U+00E9 eacute 216 351 351 | |
| 326 | + // U+017E zcaron - 236 236 | |
| 327 | + std::string pdfdoc = "\206\251\351\236"; | |
| 328 | + std::string utf8 = QUtil::pdf_doc_to_utf8(pdfdoc); | |
| 329 | + print_alternatives(pdfdoc); | |
| 330 | + print_alternatives(utf8); | |
| 331 | + print_alternatives("quack"); | |
| 332 | + std::cout << "done alternatives" << std::endl; | |
| 311 | 333 | } |
| 312 | 334 | |
| 313 | 335 | void print_whoami(char const* str) | ... | ... |