Commit 6d4115b7c565b6750ba4649d120446a1bd2b5af2
1 parent
986d2485
Detect overlong UTF-8 strings
Showing
3 changed files
with
56 additions
and
3 deletions
ChangeLog
libqpdf/QUtil.cc
| @@ -1485,6 +1485,7 @@ encode_pdfdoc(unsigned long codepoint) | @@ -1485,6 +1485,7 @@ encode_pdfdoc(unsigned long codepoint) | ||
| 1485 | unsigned long | 1485 | unsigned long |
| 1486 | QUtil::get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& error) | 1486 | QUtil::get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& error) |
| 1487 | { | 1487 | { |
| 1488 | + auto o_pos = pos; | ||
| 1488 | size_t len = utf8_val.length(); | 1489 | size_t len = utf8_val.length(); |
| 1489 | unsigned char ch = static_cast<unsigned char>(utf8_val.at(pos++)); | 1490 | unsigned char ch = static_cast<unsigned char>(utf8_val.at(pos++)); |
| 1490 | error = false; | 1491 | error = false; |
| @@ -1505,7 +1506,7 @@ QUtil::get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& e | @@ -1505,7 +1506,7 @@ QUtil::get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& e | ||
| 1505 | return 0xfffd; | 1506 | return 0xfffd; |
| 1506 | } | 1507 | } |
| 1507 | 1508 | ||
| 1508 | - unsigned long codepoint = static_cast<unsigned long>(ch & ~to_clear); | 1509 | + auto codepoint = static_cast<unsigned long>(ch & ~to_clear); |
| 1509 | while (bytes_needed > 0) { | 1510 | while (bytes_needed > 0) { |
| 1510 | --bytes_needed; | 1511 | --bytes_needed; |
| 1511 | ch = static_cast<unsigned char>(utf8_val.at(pos++)); | 1512 | ch = static_cast<unsigned char>(utf8_val.at(pos++)); |
| @@ -1517,6 +1518,31 @@ QUtil::get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& e | @@ -1517,6 +1518,31 @@ QUtil::get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& e | ||
| 1517 | codepoint <<= 6; | 1518 | codepoint <<= 6; |
| 1518 | codepoint += (ch & 0x3f); | 1519 | codepoint += (ch & 0x3f); |
| 1519 | } | 1520 | } |
| 1521 | + unsigned long lower_bound = 0; | ||
| 1522 | + switch (pos - o_pos) { | ||
| 1523 | + case 2: | ||
| 1524 | + lower_bound = 1 << 7; | ||
| 1525 | + break; | ||
| 1526 | + case 3: | ||
| 1527 | + lower_bound = 1 << 11; | ||
| 1528 | + break; | ||
| 1529 | + case 4: | ||
| 1530 | + lower_bound = 1 << 16; | ||
| 1531 | + break; | ||
| 1532 | + case 5: | ||
| 1533 | + lower_bound = 1 << 21; | ||
| 1534 | + break; | ||
| 1535 | + case 6: | ||
| 1536 | + lower_bound = 1 << 26; | ||
| 1537 | + break; | ||
| 1538 | + default: | ||
| 1539 | + lower_bound = 0; | ||
| 1540 | + } | ||
| 1541 | + | ||
| 1542 | + if (lower_bound > 0 && codepoint < lower_bound) { | ||
| 1543 | + // Too many bytes were used, but return whatever character was encoded. | ||
| 1544 | + error = true; | ||
| 1545 | + } | ||
| 1520 | return codepoint; | 1546 | return codepoint; |
| 1521 | } | 1547 | } |
| 1522 | 1548 | ||
| @@ -1799,11 +1825,16 @@ QUtil::analyze_encoding( | @@ -1799,11 +1825,16 @@ QUtil::analyze_encoding( | ||
| 1799 | bool any_errors = false; | 1825 | bool any_errors = false; |
| 1800 | while (pos < len) { | 1826 | while (pos < len) { |
| 1801 | bool error = false; | 1827 | bool error = false; |
| 1828 | + auto old_pos = pos; | ||
| 1802 | unsigned long codepoint = get_next_utf8_codepoint(val, pos, error); | 1829 | unsigned long codepoint = get_next_utf8_codepoint(val, pos, error); |
| 1803 | if (error) { | 1830 | if (error) { |
| 1804 | any_errors = true; | 1831 | any_errors = true; |
| 1805 | - } | ||
| 1806 | - if (codepoint >= 128) { | 1832 | + for (auto p = old_pos; p < pos; p++) { |
| 1833 | + if (static_cast<unsigned char>(val.at(p)) >= 128) { | ||
| 1834 | + has_8bit_chars = true; | ||
| 1835 | + } | ||
| 1836 | + } | ||
| 1837 | + } else if (codepoint >= 128) { | ||
| 1807 | has_8bit_chars = true; | 1838 | has_8bit_chars = true; |
| 1808 | } | 1839 | } |
| 1809 | } | 1840 | } |
libtests/qutil.cc
| @@ -266,6 +266,23 @@ to_utf8_test() | @@ -266,6 +266,23 @@ to_utf8_test() | ||
| 266 | } catch (std::runtime_error& e) { | 266 | } catch (std::runtime_error& e) { |
| 267 | std::cout << "0x80000000: " << e.what() << std::endl; | 267 | std::cout << "0x80000000: " << e.what() << std::endl; |
| 268 | } | 268 | } |
| 269 | + | ||
| 270 | + // Overlong characters: characters represented by more bytes than necessary. | ||
| 271 | + size_t pos = 0; | ||
| 272 | + std::string utf8 = "\xC0\x80" // 1 << 7 | ||
| 273 | + "\xE0\x80\x80" // 1 << 11 | ||
| 274 | + "\xF0\x80\x80\x80" // 1 << 16 | ||
| 275 | + "\xF8\x80\x80\x80\x80" // 1 << 21 | ||
| 276 | + "\xFC\x80\x80\x80\x80\x80"; // 1 << 26 | ||
| 277 | + auto check = [&pos, &utf8](unsigned long wanted_pos) { | ||
| 278 | + bool error = false; | ||
| 279 | + assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0 && error && pos == wanted_pos); | ||
| 280 | + }; | ||
| 281 | + check(2); | ||
| 282 | + check(5); | ||
| 283 | + check(9); | ||
| 284 | + check(14); | ||
| 285 | + check(20); | ||
| 269 | } | 286 | } |
| 270 | 287 | ||
| 271 | static void | 288 | static void |