Commit e87d149918ed6ed211f733f932df3b62ab445c12

Authored by Jay Berkenbilt
1 parent 997f4ab6

Add QUtil::possible_repaired_encodings

ChangeLog
... ... @@ -14,6 +14,14 @@
14 14 the first bug in qpdf's history that could result in silent loss
15 15 of data when processing a correct input file. Fixes #276.
16 16  
  17 +2019-01-15 Jay Berkenbilt <ejb@ql.org>
  18 +
  19 + * Add QUtil::possible_repaired_encodings which, given a string,
  20 + generates other strings that represent re-interpretation of the
  21 + bytes in a different coding system. This is used to help recover
  22 + passwords if the password string was improperly encoded on a
  23 + different system due to user error or a software bug.
  24 +
17 25 2019-01-14 Jay Berkenbilt <ejb@ql.org>
18 26  
19 27 * Add new CLI flags to 128-bit and 256-bit encryption: --assemble,
... ...
include/qpdf/QUtil.hh
... ... @@ -223,6 +223,28 @@ namespace QUtil
223 223 bool& is_valid_utf8,
224 224 bool& is_utf16);
225 225  
  226 + // Try to compensate for previously incorrectly encoded strings.
  227 + // We want to compensate for the following errors:
  228 + //
  229 + // * The string was supposed to be UTF-8 but was one of the
  230 + // single-byte encodings
  231 + // * The string was supposed to be PDF Doc but was either UTF-8 or
  232 + // one of the other single-byte encodings
  233 + //
  234 + // The returned vector always contains the original string first,
  235 + // and then it contains what the correct string would be in the
  236 + // event that the original string was the result of any of the
  237 + // above errors.
  238 + //
  239 + // This method is useful for attempting to recover a password that
  240 + // may have been previously incorrectly encoded. For example, if the
  241 + // password was supposed to be UTF-8 but the previous application
  242 + // used a password encoded in WinAnsi, or if the previous password
  243 + // was supposed to be PDFDoc but was actually given as UTF-8 or
  244 + // WinAnsi, this method would find the correct password.
  245 + QPDF_DLL
  246 + std::vector<std::string> possible_repaired_encodings(std::string);
  247 +
226 248 // If secure random number generation is supported on your
227 249 // platform and qpdf was not compiled with insecure random number
228 250 // generation, this returns a cryptographically secure random
... ...
libqpdf/QUtil.cc
... ... @@ -15,6 +15,7 @@
15 15 #include <sstream>
16 16 #include <fstream>
17 17 #include <stdexcept>
  18 +#include <set>
18 19 #include <stdio.h>
19 20 #include <errno.h>
20 21 #include <ctype.h>
... ... @@ -1992,3 +1993,95 @@ QUtil::analyze_encoding(std::string const& val,
1992 1993 is_valid_utf8 = true;
1993 1994 }
1994 1995 }
  1996 +
  1997 +std::vector<std::string>
  1998 +QUtil::possible_repaired_encodings(std::string supplied)
  1999 +{
  2000 + std::vector<std::string> result;
  2001 + // Always include the original string
  2002 + result.push_back(supplied);
  2003 + bool has_8bit_chars = false;
  2004 + bool is_valid_utf8 = false;
  2005 + bool is_utf16 = false;
  2006 + analyze_encoding(supplied, has_8bit_chars, is_valid_utf8, is_utf16);
  2007 + if (! has_8bit_chars)
  2008 + {
  2009 + return result;
  2010 + }
  2011 + if (is_utf16)
  2012 + {
  2013 + // Convert to UTF-8 and pretend we got a UTF-8 string.
  2014 + is_utf16 = false;
  2015 + is_valid_utf8 = true;
  2016 + supplied = utf16_to_utf8(supplied);
  2017 + }
  2018 + std::string output;
  2019 + if (is_valid_utf8)
  2020 + {
  2021 + // Maybe we were given UTF-8 but wanted one of the single-byte
  2022 + // encodings.
  2023 + if (utf8_to_pdf_doc(supplied, output))
  2024 + {
  2025 + result.push_back(output);
  2026 + }
  2027 + if (utf8_to_win_ansi(supplied, output))
  2028 + {
  2029 + result.push_back(output);
  2030 + }
  2031 + if (utf8_to_mac_roman(supplied, output))
  2032 + {
  2033 + result.push_back(output);
  2034 + }
  2035 + }
  2036 + else
  2037 + {
  2038 + // Maybe we were given one of the single-byte encodings but
  2039 + // wanted UTF-8.
  2040 + std::string from_pdf_doc(pdf_doc_to_utf8(supplied));
  2041 + result.push_back(from_pdf_doc);
  2042 + std::string from_win_ansi(win_ansi_to_utf8(supplied));
  2043 + result.push_back(from_win_ansi);
  2044 + std::string from_mac_roman(mac_roman_to_utf8(supplied));
  2045 + result.push_back(from_mac_roman);
  2046 +
  2047 + // Maybe we were given one of the other single-byte encodings
  2048 + // but wanted one of the other ones.
  2049 + if (utf8_to_win_ansi(from_pdf_doc, output))
  2050 + {
  2051 + result.push_back(output);
  2052 + }
  2053 + if (utf8_to_mac_roman(from_pdf_doc, output))
  2054 + {
  2055 + result.push_back(output);
  2056 + }
  2057 + if (utf8_to_pdf_doc(from_win_ansi, output))
  2058 + {
  2059 + result.push_back(output);
  2060 + }
  2061 + if (utf8_to_mac_roman(from_win_ansi, output))
  2062 + {
  2063 + result.push_back(output);
  2064 + }
  2065 + if (utf8_to_pdf_doc(from_mac_roman, output))
  2066 + {
  2067 + result.push_back(output);
  2068 + }
  2069 + if (utf8_to_win_ansi(from_mac_roman, output))
  2070 + {
  2071 + result.push_back(output);
  2072 + }
  2073 + }
  2074 + // De-duplicate
  2075 + std::vector<std::string> t;
  2076 + std::set<std::string> seen;
  2077 + for (std::vector<std::string>::iterator iter = result.begin();
  2078 + iter != result.end(); ++iter)
  2079 + {
  2080 + if (! seen.count(*iter))
  2081 + {
  2082 + seen.insert(*iter);
  2083 + t.push_back(*iter);
  2084 + }
  2085 + }
  2086 + return t;
  2087 +}
... ...
libtests/qtest/qutil/qutil.out
... ... @@ -58,6 +58,19 @@ bidirectional pdf doc done
58 58 bidirectional win ansi done
59 59 bidirectional mac roman done
60 60 analysis done
  61 +alternatives
  62 +0: 86a9e99e
  63 +1: c692c2a9c3a9c5be
  64 +2: e280a0c2a9c3a9c5be
  65 +3: c39cc2a9c388c3bb
  66 +4: 83a9e99e
  67 +5: 81a9e99e
  68 +6: dca9c8fb
  69 +0: c692c2a9c3a9c5be
  70 +1: 86a9e99e
  71 +2: 83a9e99e
  72 +0: 717561636b
  73 +done alternatives
61 74 ---- whoami
62 75 quack1
63 76 quack2
... ...
libtests/qutil.cc
... ... @@ -276,6 +276,16 @@ void check_analyze(std::string const& str, bool has8bit, bool utf8, bool utf16)
276 276 }
277 277 }
278 278  
  279 +void print_alternatives(std::string const& str)
  280 +{
  281 + std::vector<std::string> result = QUtil::possible_repaired_encodings(str);
  282 + size_t n = result.size();
  283 + for (size_t i = 0; i < n; ++i)
  284 + {
  285 + std::cout << i << ": " << QUtil::hex_encode(result.at(i)) << std::endl;
  286 + }
  287 +}
  288 +
279 289 void transcoding_test()
280 290 {
281 291 transcoding_test(&QUtil::pdf_doc_to_utf8,
... ... @@ -308,6 +318,18 @@ void transcoding_test()
308 318 assert(QUtil::utf8_to_pdf_doc(input1, output));
309 319 assert(! QUtil::utf8_to_pdf_doc(input2, output));
310 320 assert(QUtil::utf8_to_pdf_doc(input3, output));
  321 + std::cout << "alternatives" << std::endl;
  322 + // char name mac win pdf-doc
  323 + // U+0192 florin 304 203 206
  324 + // U+00A9 copyright 251 251 251
  325 + // U+00E9 eacute 216 351 351
  326 + // U+017E zcaron - 236 236
  327 + std::string pdfdoc = "\206\251\351\236";
  328 + std::string utf8 = QUtil::pdf_doc_to_utf8(pdfdoc);
  329 + print_alternatives(pdfdoc);
  330 + print_alternatives(utf8);
  331 + print_alternatives("quack");
  332 + std::cout << "done alternatives" << std::endl;
311 333 }
312 334  
313 335 void print_whoami(char const* str)
... ...