diff --git a/FlashMQTests/tst_maintests.cpp b/FlashMQTests/tst_maintests.cpp
index bf96ce4..996b739 100644
--- a/FlashMQTests/tst_maintests.cpp
+++ b/FlashMQTests/tst_maintests.cpp
@@ -78,6 +78,8 @@ private slots:
void test_sse_split();
+ void test_validUtf8();
+
};
MainTests::MainTests()
@@ -608,6 +610,8 @@ void MainTests::test_sse_split()
topics.push_back("//1234567890abcdef/1234567890abcdefg/koe/");
topics.push_back("//1234567890abcdef/1234567890abcdefg/koe//");
topics.push_back("//1234567890abcdef/1234567890abcdef/");
+ topics.push_back("/");
+ topics.push_back("");
for (const std::string &t : topics)
{
@@ -615,6 +619,69 @@ void MainTests::test_sse_split()
}
}
+void MainTests::test_validUtf8() // Checks isValidUtf8() on valid strings and on hand-built invalid byte sequences.
+{
+ char m[16]; // Scratch buffer; zeroed with memset before each hand-built sequence.
+
+ QVERIFY(isValidUtf8("")); // The empty string is accepted.
+ QVERIFY(isValidUtf8("Hello")); // Plain ASCII is accepted.
+
+ std::memset(m, 0, 16);
+ QVERIFY(!isValidUtf8(std::string(m, 16))); // A string of 16 NUL bytes must be rejected.
+
+ QVERIFY(isValidUtf8("Straƀe")); // contains a two-byte char, short string
+ QVERIFY(isValidUtf8("StraƀeHelloHelloHelloHelloHelloHello")); // same two-byte char, followed by a long ASCII tail
+ QVERIFY(isValidUtf8("HelloHelloHelloHelloHelloHelloHelloHelloStraƀeHelloHelloHelloHelloHelloHello")); // same two-byte char, surrounded by long ASCII runs
+
+ std::memset(m, 0, 16);
+ m[0] = 'a';
+ m[1] = 13; // 13 is \r, a control character
+ QVERIFY(!isValidUtf8(std::string(m, 16))); // NOTE(review): the 16-byte string also carries 14 trailing NULs; std::string(m, 2) would isolate the \r case.
+
+ const std::string unicode_ballet_shoes("🩰");
+ QVERIFY(unicode_ballet_shoes.length() == 4); // Sanity check: the literal is stored as 4 bytes.
+ QVERIFY(isValidUtf8(unicode_ballet_shoes)); // A four-byte sequence is accepted.
+
+ const std::string unicode_ballot_box("☐");
+ QVERIFY(unicode_ballot_box.length() == 3); // Sanity check: the literal is stored as 3 bytes.
+ QVERIFY(isValidUtf8(unicode_ballot_box)); // A three-byte sequence is accepted.
+
+ std::memset(m, 0, 16);
+ m[0] = 0b11000001; // Start 2 byte char
+ m[1] = 0b00000001; // Continuation bytes must look like 10xxxxxx; this one has the MSB clear, which is wrong
+ std::string a(m, 2);
+ QVERIFY(!isValidUtf8(a));
+
+ std::memset(m, 0, 16);
+ m[0] = 0b11100001; // Start 3 byte char
+ m[1] = 0b10100001; // Well-formed continuation byte (10xxxxxx)
+ m[2] = 0b00000001; // Second continuation byte has the MSB clear, which is wrong
+ std::string b(m, 3);
+ QVERIFY(!isValidUtf8(b));
+
+ std::memset(m, 0, 16);
+ m[0] = 0b11110001; // Start 4 byte char
+ m[1] = 0b10100001; // Well-formed continuation byte
+ m[2] = 0b10100001; // Well-formed continuation byte
+ m[3] = 0b00000001; // Last continuation byte has the MSB clear, which is wrong
+ std::string c(m, 4);
+ QVERIFY(!isValidUtf8(c));
+
+ std::memset(m, 0, 16);
+ m[0] = 0b11110001; // Start 4 byte char
+ m[1] = 0b10100001; // Well-formed continuation byte
+ m[2] = 0b00100001; // Middle continuation byte has the MSB clear: invalid, even though the next byte is well-formed.
+ m[3] = 0b10000001;
+ std::string d(m, 4);
+ QVERIFY(!isValidUtf8(d));
+
+ // 127 (0x7F, DEL) is a control character, invalid
+ std::memset(m, 0, 16);
+ m[0] = 127;
+ std::string e(m, 1);
+ QVERIFY(!isValidUtf8(e));
+}
+
QTEST_GUILESS_MAIN(MainTests)
#include "tst_maintests.moc"
diff --git a/threaddata.cpp b/threaddata.cpp
index d231724..9b56c7b 100644
--- a/threaddata.cpp
+++ b/threaddata.cpp
@@ -18,6 +18,7 @@ License along with FlashMQ. If not, see .
#include "threaddata.h"
#include
#include
+#include
#define TOPIC_MEMORY_LENGTH 65560
@@ -181,6 +182,7 @@ std::vector *ThreadData::splitTopic(const std::string &topic)
__m128i loaded = _mm_loadu_si128((__m128i*)i);
int len_left = s - n;
+ assert(len_left >= 0);
int index = _mm_cmpestri(slashes, 1, loaded, len_left, 0);
std::memcpy(&subtopicParseMem[carryi], i, index);
carryi += std::min(index, len_left);
diff --git a/utils.cpp b/utils.cpp
index bdb9a3d..92b6832 100644
--- a/utils.cpp
+++ b/utils.cpp
@@ -83,11 +83,8 @@ bool isValidUtf8(const std::string &s, bool alsoCheckInvalidPublishChars)
{
int multibyte_remain = 0;
int cur_code_point = 0;
- for(const char &x : s)
+ for(const char x : s)
{
- if (x == 0)
- return false;
-
if (alsoCheckInvalidPublishChars && (x == '#' || x == '+'))
return false;
@@ -95,7 +92,11 @@ bool isValidUtf8(const std::string &s, bool alsoCheckInvalidPublishChars)
{
cur_code_point = 0;
- if((x & 0b11100000) == 0b11000000) // 2 byte char
+ if ((x & 0b10000000) == 0) // when the MSB is 0, it's ASCII, most common case
+ {
+ cur_code_point += (x & 0b01111111);
+ }
+ else if((x & 0b11100000) == 0b11000000) // 2 byte char
{
multibyte_remain = 1;
cur_code_point += ((x & 0b00011111) << 6);
@@ -128,7 +129,7 @@ bool isValidUtf8(const std::string &s, bool alsoCheckInvalidPublishChars)
// Invalid range for MQTT. [MQTT-1.5.3-1]
if (cur_code_point >= 0xD800 && cur_code_point <= 0xDFFF) // Dec 55296-57343
return false;
- if (cur_code_point >= 0x0001 && cur_code_point <= 0x001F)
+ if (cur_code_point <= 0x001F)
return false;
if (cur_code_point >= 0x007F && cur_code_point <= 0x009F)
return false;