Commit 6d4115b7c565b6750ba4649d120446a1bd2b5af2

Authored by Jay Berkenbilt
1 parent 986d2485

Detect overlong UTF-8 strings

ChangeLog
  1 +2023-12-25 Jay Berkenbilt <ejb@ql.org>
  2 +
  3 + * Detect overlong UTF-8 in the UTF-8 decoder, and fix detection of
  4 + 8-bit characters in erroneous UTF-8 strings.
  5 +
1 2023-12-24 Jay Berkenbilt <ejb@ql.org> 6 2023-12-24 Jay Berkenbilt <ejb@ql.org>
2 7
3 * 11.7.0: release 8 * 11.7.0: release
libqpdf/QUtil.cc
@@ -1485,6 +1485,7 @@ encode_pdfdoc(unsigned long codepoint) @@ -1485,6 +1485,7 @@ encode_pdfdoc(unsigned long codepoint)
1485 unsigned long 1485 unsigned long
1486 QUtil::get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& error) 1486 QUtil::get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& error)
1487 { 1487 {
  1488 + auto o_pos = pos;
1488 size_t len = utf8_val.length(); 1489 size_t len = utf8_val.length();
1489 unsigned char ch = static_cast<unsigned char>(utf8_val.at(pos++)); 1490 unsigned char ch = static_cast<unsigned char>(utf8_val.at(pos++));
1490 error = false; 1491 error = false;
@@ -1505,7 +1506,7 @@ QUtil::get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& e @@ -1505,7 +1506,7 @@ QUtil::get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& e
1505 return 0xfffd; 1506 return 0xfffd;
1506 } 1507 }
1507 1508
1508 - unsigned long codepoint = static_cast<unsigned long>(ch & ~to_clear); 1509 + auto codepoint = static_cast<unsigned long>(ch & ~to_clear);
1509 while (bytes_needed > 0) { 1510 while (bytes_needed > 0) {
1510 --bytes_needed; 1511 --bytes_needed;
1511 ch = static_cast<unsigned char>(utf8_val.at(pos++)); 1512 ch = static_cast<unsigned char>(utf8_val.at(pos++));
@@ -1517,6 +1518,31 @@ QUtil::get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& e @@ -1517,6 +1518,31 @@ QUtil::get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& e
1517 codepoint <<= 6; 1518 codepoint <<= 6;
1518 codepoint += (ch & 0x3f); 1519 codepoint += (ch & 0x3f);
1519 } 1520 }
  1521 + unsigned long lower_bound = 0;
  1522 + switch (pos - o_pos) {
  1523 + case 2:
  1524 + lower_bound = 1 << 7;
  1525 + break;
  1526 + case 3:
  1527 + lower_bound = 1 << 11;
  1528 + break;
  1529 + case 4:
  1530 + lower_bound = 1 << 16;
  1531 + break;
  1532 + case 5:
  1533 + lower_bound = 1 << 21;
  1534 + break;
  1535 + case 6:
  1536 + lower_bound = 1 << 26;
  1537 + break;
  1538 + default:
  1539 + lower_bound = 0;
  1540 + }
  1541 +
  1542 + if (lower_bound > 0 && codepoint < lower_bound) {
  1543 + // Too many bytes were used, but return whatever character was encoded.
  1544 + error = true;
  1545 + }
1520 return codepoint; 1546 return codepoint;
1521 } 1547 }
1522 1548
@@ -1799,11 +1825,16 @@ QUtil::analyze_encoding( @@ -1799,11 +1825,16 @@ QUtil::analyze_encoding(
1799 bool any_errors = false; 1825 bool any_errors = false;
1800 while (pos < len) { 1826 while (pos < len) {
1801 bool error = false; 1827 bool error = false;
  1828 + auto old_pos = pos;
1802 unsigned long codepoint = get_next_utf8_codepoint(val, pos, error); 1829 unsigned long codepoint = get_next_utf8_codepoint(val, pos, error);
1803 if (error) { 1830 if (error) {
1804 any_errors = true; 1831 any_errors = true;
1805 - }  
1806 - if (codepoint >= 128) { 1832 + for (auto p = old_pos; p < pos; p++) {
  1833 + if (static_cast<unsigned char>(val.at(p)) >= 128) {
  1834 + has_8bit_chars = true;
  1835 + }
  1836 + }
  1837 + } else if (codepoint >= 128) {
1807 has_8bit_chars = true; 1838 has_8bit_chars = true;
1808 } 1839 }
1809 } 1840 }
libtests/qutil.cc
@@ -266,6 +266,23 @@ to_utf8_test() @@ -266,6 +266,23 @@ to_utf8_test()
266 } catch (std::runtime_error& e) { 266 } catch (std::runtime_error& e) {
267 std::cout << "0x80000000: " << e.what() << std::endl; 267 std::cout << "0x80000000: " << e.what() << std::endl;
268 } 268 }
  269 +
  270 + // Overlong characters: characters represented by more bytes than necessary.
  271 + size_t pos = 0;
  272 + std::string utf8 = "\xC0\x80" // 1 << 7
  273 + "\xE0\x80\x80" // 1 << 11
  274 + "\xF0\x80\x80\x80" // 1 << 16
  275 + "\xF8\x80\x80\x80\x80" // 1 << 21
  276 + "\xFC\x80\x80\x80\x80\x80"; // 1 << 26
  277 + auto check = [&pos, &utf8](unsigned long wanted_pos) {
  278 + bool error = false;
  279 + assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0 && error && pos == wanted_pos);
  280 + };
  281 + check(2);
  282 + check(5);
  283 + check(9);
  284 + check(14);
  285 + check(20);
269 } 286 }
270 287
271 static void 288 static void