Commit 6d4115b7c565b6750ba4649d120446a1bd2b5af2
1 parent
986d2485
Detect overlong UTF-8 strings
Showing
3 changed files
with
56 additions
and
3 deletions
ChangeLog
libqpdf/QUtil.cc
| ... | ... | @@ -1485,6 +1485,7 @@ encode_pdfdoc(unsigned long codepoint) |
| 1485 | 1485 | unsigned long |
| 1486 | 1486 | QUtil::get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& error) |
| 1487 | 1487 | { |
| 1488 | + auto o_pos = pos; | |
| 1488 | 1489 | size_t len = utf8_val.length(); |
| 1489 | 1490 | unsigned char ch = static_cast<unsigned char>(utf8_val.at(pos++)); |
| 1490 | 1491 | error = false; |
| ... | ... | @@ -1505,7 +1506,7 @@ QUtil::get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& e |
| 1505 | 1506 | return 0xfffd; |
| 1506 | 1507 | } |
| 1507 | 1508 | |
| 1508 | - unsigned long codepoint = static_cast<unsigned long>(ch & ~to_clear); | |
| 1509 | + auto codepoint = static_cast<unsigned long>(ch & ~to_clear); | |
| 1509 | 1510 | while (bytes_needed > 0) { |
| 1510 | 1511 | --bytes_needed; |
| 1511 | 1512 | ch = static_cast<unsigned char>(utf8_val.at(pos++)); |
| ... | ... | @@ -1517,6 +1518,31 @@ QUtil::get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& e |
| 1517 | 1518 | codepoint <<= 6; |
| 1518 | 1519 | codepoint += (ch & 0x3f); |
| 1519 | 1520 | } |
| 1521 | + unsigned long lower_bound = 0; | |
| 1522 | + switch (pos - o_pos) { | |
| 1523 | + case 2: | |
| 1524 | + lower_bound = 1 << 7; | |
| 1525 | + break; | |
| 1526 | + case 3: | |
| 1527 | + lower_bound = 1 << 11; | |
| 1528 | + break; | |
| 1529 | + case 4: | |
| 1530 | + lower_bound = 1 << 16; | |
| 1531 | + break; | |
| 1532 | + case 5: | |
| 1533 | + lower_bound = 1 << 21; | |
| 1534 | + break; | |
| 1535 | + case 6: | |
| 1536 | + lower_bound = 1 << 26; | |
| 1537 | + break; | |
| 1538 | + default: | |
| 1539 | + lower_bound = 0; | |
| 1540 | + } | |
| 1541 | + | |
| 1542 | + if (lower_bound > 0 && codepoint < lower_bound) { | |
| 1543 | + // Too many bytes were used, but return whatever character was encoded. | |
| 1544 | + error = true; | |
| 1545 | + } | |
| 1520 | 1546 | return codepoint; |
| 1521 | 1547 | } |
| 1522 | 1548 | |
| ... | ... | @@ -1799,11 +1825,16 @@ QUtil::analyze_encoding( |
| 1799 | 1825 | bool any_errors = false; |
| 1800 | 1826 | while (pos < len) { |
| 1801 | 1827 | bool error = false; |
| 1828 | + auto old_pos = pos; | |
| 1802 | 1829 | unsigned long codepoint = get_next_utf8_codepoint(val, pos, error); |
| 1803 | 1830 | if (error) { |
| 1804 | 1831 | any_errors = true; |
| 1805 | - } | |
| 1806 | - if (codepoint >= 128) { | |
| 1832 | + for (auto p = old_pos; p < pos; p++) { | |
| 1833 | + if (static_cast<unsigned char>(val.at(p)) >= 128) { | |
| 1834 | + has_8bit_chars = true; | |
| 1835 | + } | |
| 1836 | + } | |
| 1837 | + } else if (codepoint >= 128) { | |
| 1807 | 1838 | has_8bit_chars = true; |
| 1808 | 1839 | } |
| 1809 | 1840 | } | ... | ... |
libtests/qutil.cc
| ... | ... | @@ -266,6 +266,23 @@ to_utf8_test() |
| 266 | 266 | } catch (std::runtime_error& e) { |
| 267 | 267 | std::cout << "0x80000000: " << e.what() << std::endl; |
| 268 | 268 | } |
| 269 | + | |
| 270 | + // Overlong characters: characters represented by more bytes than necessary. | |
| 271 | + size_t pos = 0; | |
| 272 | + std::string utf8 = "\xC0\x80" // 1 << 7 | |
| 273 | + "\xE0\x80\x80" // 1 << 11 | |
| 274 | + "\xF0\x80\x80\x80" // 1 << 16 | |
| 275 | + "\xF8\x80\x80\x80\x80" // 1 << 21 | |
| 276 | + "\xFC\x80\x80\x80\x80\x80"; // 1 << 26 | |
| 277 | + auto check = [&pos, &utf8](unsigned long wanted_pos) { | |
| 278 | + bool error = false; | |
| 279 | + assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0 && error && pos == wanted_pos); | |
| 280 | + }; | |
| 281 | + check(2); | |
| 282 | + check(5); | |
| 283 | + check(9); | |
| 284 | + check(14); | |
| 285 | + check(20); | |
| 269 | 286 | } |
| 270 | 287 | |
| 271 | 288 | static void | ... | ... |