From 3dbab5a7f7ff34e8cbe5e30acf177088a6af69eb Mon Sep 17 00:00:00 2001 From: Wiebe Cazemier Date: Sun, 2 May 2021 18:33:00 +0200 Subject: [PATCH] Optimize UTF-8 check and add tests --- FlashMQTests/tst_maintests.cpp | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ threaddata.cpp | 2 ++ utils.cpp | 13 +++++++------ 3 files changed, 76 insertions(+), 6 deletions(-) diff --git a/FlashMQTests/tst_maintests.cpp b/FlashMQTests/tst_maintests.cpp index bf96ce4..996b739 100644 --- a/FlashMQTests/tst_maintests.cpp +++ b/FlashMQTests/tst_maintests.cpp @@ -78,6 +78,8 @@ private slots: void test_sse_split(); + void test_validUtf8(); + }; MainTests::MainTests() @@ -608,6 +610,8 @@ void MainTests::test_sse_split() topics.push_back("//1234567890abcdef/1234567890abcdefg/koe/"); topics.push_back("//1234567890abcdef/1234567890abcdefg/koe//"); topics.push_back("//1234567890abcdef/1234567890abcdef/"); + topics.push_back("/"); + topics.push_back(""); for (const std::string &t : topics) { @@ -615,6 +619,69 @@ void MainTests::test_sse_split() } } +void MainTests::test_validUtf8() +{ + char m[16]; + + QVERIFY(isValidUtf8("")); + QVERIFY(isValidUtf8("Hello")); + + std::memset(m, 0, 16); + QVERIFY(!isValidUtf8(std::string(m, 16))); + + QVERIFY(isValidUtf8("Straƀe")); // two byte chars + QVERIFY(isValidUtf8("StraƀeHelloHelloHelloHelloHelloHello")); // two byte chars + QVERIFY(isValidUtf8("HelloHelloHelloHelloHelloHelloHelloHelloStraƀeHelloHelloHelloHelloHelloHello")); // two byte chars + + std::memset(m, 0, 16); + m[0] = 'a'; + m[1] = 13; // is \r + QVERIFY(!isValidUtf8(std::string(m, 16))); + + const std::string unicode_ballet_shoes("🩰"); + QVERIFY(unicode_ballet_shoes.length() == 4); + QVERIFY(isValidUtf8(unicode_ballet_shoes)); + + const std::string unicode_ballot_box("☐"); + QVERIFY(unicode_ballot_box.length() == 3); + QVERIFY(isValidUtf8(unicode_ballot_box)); + + std::memset(m, 0, 16); + m[0] = 0b11000001; // Start 2 byte char + m[1] = 0b00000001; // Next byte 
doesn't start with 1, which is wrong + std::string a(m, 2); + QVERIFY(!isValidUtf8(a)); + + std::memset(m, 0, 16); + m[0] = 0b11100001; // Start 3 byte char + m[1] = 0b10100001; + m[2] = 0b00000001; // Next byte doesn't start with 1, which is wrong + std::string b(m, 3); + QVERIFY(!isValidUtf8(b)); + + std::memset(m, 0, 16); + m[0] = 0b11110001; // Start 4 byte char + m[1] = 0b10100001; + m[2] = 0b10100001; + m[3] = 0b00000001; // Next byte doesn't start with 1, which is wrong + std::string c(m, 4); + QVERIFY(!isValidUtf8(c)); + + std::memset(m, 0, 16); + m[0] = 0b11110001; // Start 4 byte char + m[1] = 0b10100001; + m[2] = 0b00100001; // Doesn't start with 1: invalid. + m[3] = 0b10000001; + std::string d(m, 4); + QVERIFY(!isValidUtf8(d)); + + // Upper ASCII, invalid + std::memset(m, 0, 16); + m[0] = 127; + std::string e(m, 1); + QVERIFY(!isValidUtf8(e)); +} + QTEST_GUILESS_MAIN(MainTests) #include "tst_maintests.moc" diff --git a/threaddata.cpp b/threaddata.cpp index d231724..9b56c7b 100644 --- a/threaddata.cpp +++ b/threaddata.cpp @@ -18,6 +18,7 @@ License along with FlashMQ. If not, see <https://www.gnu.org/licenses/>. 
#include "threaddata.h" #include #include +#include #define TOPIC_MEMORY_LENGTH 65560 @@ -181,6 +182,7 @@ std::vector *ThreadData::splitTopic(const std::string &topic) __m128i loaded = _mm_loadu_si128((__m128i*)i); int len_left = s - n; + assert(len_left >= 0); int index = _mm_cmpestri(slashes, 1, loaded, len_left, 0); std::memcpy(&subtopicParseMem[carryi], i, index); carryi += std::min(index, len_left); diff --git a/utils.cpp b/utils.cpp index bdb9a3d..92b6832 100644 --- a/utils.cpp +++ b/utils.cpp @@ -83,11 +83,8 @@ bool isValidUtf8(const std::string &s, bool alsoCheckInvalidPublishChars) { int multibyte_remain = 0; int cur_code_point = 0; - for(const char &x : s) + for(const char x : s) { - if (x == 0) - return false; - if (alsoCheckInvalidPublishChars && (x == '#' || x == '+')) return false; @@ -95,7 +92,11 @@ bool isValidUtf8(const std::string &s, bool alsoCheckInvalidPublishChars) { cur_code_point = 0; - if((x & 0b11100000) == 0b11000000) // 2 byte char + if ((x & 0b10000000) == 0) // when the MSB is 0, it's ASCII, most common case + { + cur_code_point += (x & 0b01111111); + } + else if((x & 0b11100000) == 0b11000000) // 2 byte char { multibyte_remain = 1; cur_code_point += ((x & 0b00011111) << 6); @@ -128,7 +129,7 @@ bool isValidUtf8(const std::string &s, bool alsoCheckInvalidPublishChars) // Invalid range for MQTT. [MQTT-1.5.3-1] if (cur_code_point >= 0xD800 && cur_code_point <= 0xDFFF) // Dec 55296-57343 return false; - if (cur_code_point >= 0x0001 && cur_code_point <= 0x001F) + if (cur_code_point <= 0x001F) return false; if (cur_code_point >= 0x007F && cur_code_point <= 0x009F) return false; -- libgit2 0.21.4