Commit 6d4115b7c565b6750ba4649d120446a1bd2b5af2

Authored by Jay Berkenbilt
1 parent 986d2485

Detect overlong UTF-8 strings

ChangeLog
  1 +2023-12-25 Jay Berkenbilt <ejb@ql.org>
  2 +
  3 + * Detect overlong UTF-8 in the UTF-8 decoder, and fix detection of
  4 + 8-bit characters in erroneous UTF-8 strings.
  5 +
1 6 2023-12-24 Jay Berkenbilt <ejb@ql.org>
2 7  
3 8 * 11.7.0: release
... ...
libqpdf/QUtil.cc
... ... @@ -1485,6 +1485,7 @@ encode_pdfdoc(unsigned long codepoint)
1485 1485 unsigned long
1486 1486 QUtil::get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& error)
1487 1487 {
  1488 + auto o_pos = pos;
1488 1489 size_t len = utf8_val.length();
1489 1490 unsigned char ch = static_cast<unsigned char>(utf8_val.at(pos++));
1490 1491 error = false;
... ... @@ -1505,7 +1506,7 @@ QUtil::get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& e
1505 1506 return 0xfffd;
1506 1507 }
1507 1508  
1508   - unsigned long codepoint = static_cast<unsigned long>(ch & ~to_clear);
  1509 + auto codepoint = static_cast<unsigned long>(ch & ~to_clear);
1509 1510 while (bytes_needed > 0) {
1510 1511 --bytes_needed;
1511 1512 ch = static_cast<unsigned char>(utf8_val.at(pos++));
... ... @@ -1517,6 +1518,31 @@ QUtil::get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& e
1517 1518 codepoint <<= 6;
1518 1519 codepoint += (ch & 0x3f);
1519 1520 }
  1521 + unsigned long lower_bound = 0;
  1522 + switch (pos - o_pos) {
  1523 + case 2:
  1524 + lower_bound = 1 << 7;
  1525 + break;
  1526 + case 3:
  1527 + lower_bound = 1 << 11;
  1528 + break;
  1529 + case 4:
  1530 + lower_bound = 1 << 16;
  1531 + break;
  1532 + case 5:
  1533 + lower_bound = 1 << 21;
  1534 + break;
  1535 + case 6:
  1536 + lower_bound = 1 << 26;
  1537 + break;
  1538 + default:
  1539 + lower_bound = 0;
  1540 + }
  1541 +
  1542 + if (lower_bound > 0 && codepoint < lower_bound) {
  1543 + // Too many bytes were used, but return whatever character was encoded.
  1544 + error = true;
  1545 + }
1520 1546 return codepoint;
1521 1547 }
1522 1548  
... ... @@ -1799,11 +1825,16 @@ QUtil::analyze_encoding(
1799 1825 bool any_errors = false;
1800 1826 while (pos < len) {
1801 1827 bool error = false;
  1828 + auto old_pos = pos;
1802 1829 unsigned long codepoint = get_next_utf8_codepoint(val, pos, error);
1803 1830 if (error) {
1804 1831 any_errors = true;
1805   - }
1806   - if (codepoint >= 128) {
  1832 + for (auto p = old_pos; p < pos; p++) {
  1833 + if (static_cast<unsigned char>(val.at(p)) >= 128) {
  1834 + has_8bit_chars = true;
  1835 + }
  1836 + }
  1837 + } else if (codepoint >= 128) {
1807 1838 has_8bit_chars = true;
1808 1839 }
1809 1840 }
... ...
libtests/qutil.cc
... ... @@ -266,6 +266,23 @@ to_utf8_test()
266 266 } catch (std::runtime_error& e) {
267 267 std::cout << "0x80000000: " << e.what() << std::endl;
268 268 }
  269 +
  270 + // Overlong characters: characters represented by more bytes than necessary.
  271 + size_t pos = 0;
  272 + std::string utf8 = "\xC0\x80" // 1 << 7
  273 + "\xE0\x80\x80" // 1 << 11
  274 + "\xF0\x80\x80\x80" // 1 << 16
  275 + "\xF8\x80\x80\x80\x80" // 1 << 21
  276 + "\xFC\x80\x80\x80\x80\x80"; // 1 << 26
  277 + auto check = [&pos, &utf8](unsigned long wanted_pos) {
  278 + bool error = false;
  279 + assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0 && error && pos == wanted_pos);
  280 + };
  281 + check(2);
  282 + check(5);
  283 + check(9);
  284 + check(14);
  285 + check(20);
269 286 }
270 287  
271 288 static void
... ...