Commit e87d149918ed6ed211f733f932df3b62ab445c12

Authored by Jay Berkenbilt
1 parent 997f4ab6

Add QUtil::possible_repaired_encodings

ChangeLog
@@ -14,6 +14,14 @@ @@ -14,6 +14,14 @@
14 the first bug in qpdf's history that could result in silent loss 14 the first bug in qpdf's history that could result in silent loss
15 of data when processing a correct input file. Fixes #276. 15 of data when processing a correct input file. Fixes #276.
16 16
  17 +2019-01-15 Jay Berkenbilt <ejb@ql.org>
  18 +
  19 + * Add QUtil::possible_repaired_encodings which, given a string,
  20 + generates other strings that represent re-interpretation of the
  21 + bytes in a different coding system. This is used to help recover
  22 + passwords if the password string was improperly encoded on a
  23 + different system due to user error or a software bug.
  24 +
17 2019-01-14 Jay Berkenbilt <ejb@ql.org> 25 2019-01-14 Jay Berkenbilt <ejb@ql.org>
18 26
19 * Add new CLI flags to 128-bit and 256-bit encryption: --assemble, 27 * Add new CLI flags to 128-bit and 256-bit encryption: --assemble,
include/qpdf/QUtil.hh
@@ -223,6 +223,28 @@ namespace QUtil @@ -223,6 +223,28 @@ namespace QUtil
223 bool& is_valid_utf8, 223 bool& is_valid_utf8,
224 bool& is_utf16); 224 bool& is_utf16);
225 225
  226 + // Try to compensate for previously incorrectly encoded strings.
  227 + // We want to compensate for the following errors:
  228 + //
  229 + // * The string was supposed to be UTF-8 but was one of the
  230 + // single-byte encodings
  231 + // * The string was supposed to be PDF Doc but was either UTF-8 or
  232 + // one of the other single-byte encodings
  233 + //
  234 + // The returned vector always contains the original string first,
  235 + // and then it contains what the correct string would be in the
  236 + // event that the original string was the result of any of the
  237 + // above errors.
  238 + //
  239 + // This method is useful for attempting to recover a password that
  240 + // may have been previously incorrectly encoded. For example, if
  241 + // the password was supposed to be UTF-8 but the previous application
  242 + // used a password encoded in WinAnsi, or if the previous password
  243 + // was supposed to be PDFDoc but was actually given as UTF-8 or
  244 + // WinAnsi, this method would find the correct password.
  245 + QPDF_DLL
  246 + std::vector<std::string> possible_repaired_encodings(std::string);
  247 +
226 // If secure random number generation is supported on your 248 // If secure random number generation is supported on your
227 // platform and qpdf was not compiled with insecure random number 249 // platform and qpdf was not compiled with insecure random number
228 // generation, this returns a cryptographically secure random 250 // generation, this returns a cryptographically secure random
libqpdf/QUtil.cc
@@ -15,6 +15,7 @@ @@ -15,6 +15,7 @@
15 #include <sstream> 15 #include <sstream>
16 #include <fstream> 16 #include <fstream>
17 #include <stdexcept> 17 #include <stdexcept>
  18 +#include <set>
18 #include <stdio.h> 19 #include <stdio.h>
19 #include <errno.h> 20 #include <errno.h>
20 #include <ctype.h> 21 #include <ctype.h>
@@ -1992,3 +1993,95 @@ QUtil::analyze_encoding(std::string const& val, @@ -1992,3 +1993,95 @@ QUtil::analyze_encoding(std::string const& val,
1992 is_valid_utf8 = true; 1993 is_valid_utf8 = true;
1993 } 1994 }
1994 } 1995 }
  1996 +
  1997 +std::vector<std::string>
  1998 +QUtil::possible_repaired_encodings(std::string supplied)
  1999 +{
  2000 + std::vector<std::string> result;
  2001 + // Always include the original string
  2002 + result.push_back(supplied);
  2003 + bool has_8bit_chars = false;
  2004 + bool is_valid_utf8 = false;
  2005 + bool is_utf16 = false;
  2006 + analyze_encoding(supplied, has_8bit_chars, is_valid_utf8, is_utf16);
  2007 + if (! has_8bit_chars)
  2008 + {
  2009 + return result;
  2010 + }
  2011 + if (is_utf16)
  2012 + {
  2013 + // Convert to UTF-8 and pretend we got a UTF-8 string.
  2014 + is_utf16 = false;
  2015 + is_valid_utf8 = true;
  2016 + supplied = utf16_to_utf8(supplied);
  2017 + }
  2018 + std::string output;
  2019 + if (is_valid_utf8)
  2020 + {
  2021 + // Maybe we were given UTF-8 but wanted one of the single-byte
  2022 + // encodings.
  2023 + if (utf8_to_pdf_doc(supplied, output))
  2024 + {
  2025 + result.push_back(output);
  2026 + }
  2027 + if (utf8_to_win_ansi(supplied, output))
  2028 + {
  2029 + result.push_back(output);
  2030 + }
  2031 + if (utf8_to_mac_roman(supplied, output))
  2032 + {
  2033 + result.push_back(output);
  2034 + }
  2035 + }
  2036 + else
  2037 + {
  2038 + // Maybe we were given one of the single-byte encodings but
  2039 + // wanted UTF-8.
  2040 + std::string from_pdf_doc(pdf_doc_to_utf8(supplied));
  2041 + result.push_back(from_pdf_doc);
  2042 + std::string from_win_ansi(win_ansi_to_utf8(supplied));
  2043 + result.push_back(from_win_ansi);
  2044 + std::string from_mac_roman(mac_roman_to_utf8(supplied));
  2045 + result.push_back(from_mac_roman);
  2046 +
  2047 + // Maybe we were given one of the other single-byte encodings
  2048 + // but wanted one of the other ones.
  2049 + if (utf8_to_win_ansi(from_pdf_doc, output))
  2050 + {
  2051 + result.push_back(output);
  2052 + }
  2053 + if (utf8_to_mac_roman(from_pdf_doc, output))
  2054 + {
  2055 + result.push_back(output);
  2056 + }
  2057 + if (utf8_to_pdf_doc(from_win_ansi, output))
  2058 + {
  2059 + result.push_back(output);
  2060 + }
  2061 + if (utf8_to_mac_roman(from_win_ansi, output))
  2062 + {
  2063 + result.push_back(output);
  2064 + }
  2065 + if (utf8_to_pdf_doc(from_mac_roman, output))
  2066 + {
  2067 + result.push_back(output);
  2068 + }
  2069 + if (utf8_to_win_ansi(from_mac_roman, output))
  2070 + {
  2071 + result.push_back(output);
  2072 + }
  2073 + }
  2074 + // De-duplicate
  2075 + std::vector<std::string> t;
  2076 + std::set<std::string> seen;
  2077 + for (std::vector<std::string>::iterator iter = result.begin();
  2078 + iter != result.end(); ++iter)
  2079 + {
  2080 + if (! seen.count(*iter))
  2081 + {
  2082 + seen.insert(*iter);
  2083 + t.push_back(*iter);
  2084 + }
  2085 + }
  2086 + return t;
  2087 +}
libtests/qtest/qutil/qutil.out
@@ -58,6 +58,19 @@ bidirectional pdf doc done @@ -58,6 +58,19 @@ bidirectional pdf doc done
58 bidirectional win ansi done 58 bidirectional win ansi done
59 bidirectional mac roman done 59 bidirectional mac roman done
60 analysis done 60 analysis done
  61 +alternatives
  62 +0: 86a9e99e
  63 +1: c692c2a9c3a9c5be
  64 +2: e280a0c2a9c3a9c5be
  65 +3: c39cc2a9c388c3bb
  66 +4: 83a9e99e
  67 +5: 81a9e99e
  68 +6: dca9c8fb
  69 +0: c692c2a9c3a9c5be
  70 +1: 86a9e99e
  71 +2: 83a9e99e
  72 +0: 717561636b
  73 +done alternatives
61 ---- whoami 74 ---- whoami
62 quack1 75 quack1
63 quack2 76 quack2
libtests/qutil.cc
@@ -276,6 +276,16 @@ void check_analyze(std::string const& str, bool has8bit, bool utf8, bool utf16) @@ -276,6 +276,16 @@ void check_analyze(std::string const& str, bool has8bit, bool utf8, bool utf16)
276 } 276 }
277 } 277 }
278 278
  279 +void print_alternatives(std::string const& str)
  280 +{
  281 + std::vector<std::string> result = QUtil::possible_repaired_encodings(str);
  282 + size_t n = result.size();
  283 + for (size_t i = 0; i < n; ++i)
  284 + {
  285 + std::cout << i << ": " << QUtil::hex_encode(result.at(i)) << std::endl;
  286 + }
  287 +}
  288 +
279 void transcoding_test() 289 void transcoding_test()
280 { 290 {
281 transcoding_test(&QUtil::pdf_doc_to_utf8, 291 transcoding_test(&QUtil::pdf_doc_to_utf8,
@@ -308,6 +318,18 @@ void transcoding_test() @@ -308,6 +318,18 @@ void transcoding_test()
308 assert(QUtil::utf8_to_pdf_doc(input1, output)); 318 assert(QUtil::utf8_to_pdf_doc(input1, output));
309 assert(! QUtil::utf8_to_pdf_doc(input2, output)); 319 assert(! QUtil::utf8_to_pdf_doc(input2, output));
310 assert(QUtil::utf8_to_pdf_doc(input3, output)); 320 assert(QUtil::utf8_to_pdf_doc(input3, output));
  321 + std::cout << "alternatives" << std::endl;
  322 + // char name mac win pdf-doc
  323 + // U+0192 florin 304 203 206
  324 + // U+00A9 copyright 251 251 251
  325 + // U+00E9 eacute 216 351 351
  326 + // U+017E zcaron - 236 236
  327 + std::string pdfdoc = "\206\251\351\236";
  328 + std::string utf8 = QUtil::pdf_doc_to_utf8(pdfdoc);
  329 + print_alternatives(pdfdoc);
  330 + print_alternatives(utf8);
  331 + print_alternatives("quack");
  332 + std::cout << "done alternatives" << std::endl;
311 } 333 }
312 334
313 void print_whoami(char const* str) 335 void print_whoami(char const* str)