Commit 4bfef9db006a58c8fc1a94a96db78f30b05a2d12
1 parent
af190a63
Fix several deadlocks
One fix is client destruction happening on the correct thread (when kicking another one off with existing client ID). This caused deadlocks on the subscriptions lock during a race condition when doKeepAliveCheck() also ran. A related deadlock was that the queued functions were executed while holding the lock taskQueueMutex. Together with the subscriptions lock, that was executed in the $SYS topic function, this also caused deadlocks.
Showing
7 changed files
with
123 additions
and
18 deletions
client.cpp
| ... | ... | @@ -222,7 +222,7 @@ void Client::writeMqttPacketAndBlameThisClient(const MqttPacket &packet, const c |
| 222 | 222 | } |
| 223 | 223 | catch (std::exception &ex) |
| 224 | 224 | { |
| 225 | - threadData->removeClient(fd); | |
| 225 | + threadData->removeClientQueued(fd); | |
| 226 | 226 | } |
| 227 | 227 | } |
| 228 | 228 | |
| ... | ... | @@ -322,6 +322,11 @@ void Client::resetBuffersIfEligible() |
| 322 | 322 | writebuf.resetSizeIfEligable(initialBufferSize); |
| 323 | 323 | } |
| 324 | 324 | |
| 325 | +void Client::setCleanSession(bool val) | |
| 326 | +{ | |
| 327 | + this->cleanSession = val; | |
| 328 | +} | |
| 329 | + | |
| 325 | 330 | #ifndef NDEBUG |
| 326 | 331 | /** |
| 327 | 332 | * @brief IoWrapper::setFakeUpgraded(). | ... | ... |
client.h
mqttpacket.cpp
| ... | ... | @@ -412,7 +412,7 @@ void MqttPacket::handleDisconnect() |
| 412 | 412 | sender->setDisconnectReason("MQTT Disconnect received."); |
| 413 | 413 | sender->markAsDisconnecting(); |
| 414 | 414 | sender->clearWill(); |
| 415 | - sender->getThreadData()->removeClient(sender); | |
| 415 | + sender->getThreadData()->removeClientQueued(sender); | |
| 416 | 416 | } |
| 417 | 417 | |
| 418 | 418 | void MqttPacket::handleSubscribe() | ... | ... |
subscriptionstore.cpp
| ... | ... | @@ -202,6 +202,7 @@ void SubscriptionStore::registerClientAndKickExistingOne(std::shared_ptr<Client> |
| 202 | 202 | if (client->getClientId().empty()) |
| 203 | 203 | throw ProtocolError("Trying to store client without an ID."); |
| 204 | 204 | |
| 205 | + bool originalClientDemandsSessionDestruction = false; | |
| 205 | 206 | std::shared_ptr<Session> session; |
| 206 | 207 | auto session_it = sessionsById.find(client->getClientId()); |
| 207 | 208 | if (session_it != sessionsById.end()) |
| ... | ... | @@ -215,14 +216,22 @@ void SubscriptionStore::registerClientAndKickExistingOne(std::shared_ptr<Client> |
| 215 | 216 | if (cl) |
| 216 | 217 | { |
| 217 | 218 | logger->logf(LOG_NOTICE, "Disconnecting existing client with id '%s'", cl->getClientId().c_str()); |
| 219 | + cl->setDisconnectReason("Another client with this ID connected"); | |
| 220 | + | |
| 221 | + // We have to set session to false, because it's no longer up to the destruction of that client | |
| 222 | + // to destroy the session. We either do it in this function, or not at all. | |
| 223 | + originalClientDemandsSessionDestruction = cl->getCleanSession(); | |
| 224 | + cl->setCleanSession(false); | |
| 225 | + | |
| 218 | 226 | cl->setReadyForDisconnect(); |
| 219 | - cl->getThreadData()->removeClient(cl); | |
| 227 | + cl->getThreadData()->removeClientQueued(cl); | |
| 220 | 228 | cl->markAsDisconnecting(); |
| 221 | 229 | } |
| 230 | + | |
| 222 | 231 | } |
| 223 | 232 | } |
| 224 | 233 | |
| 225 | - if (!session || client->getCleanSession()) | |
| 234 | + if (!session || client->getCleanSession() || originalClientDemandsSessionDestruction) | |
| 226 | 235 | { |
| 227 | 236 | session.reset(new Session()); |
| 228 | 237 | ... | ... |
threaddata.cpp
| ... | ... | @@ -131,6 +131,36 @@ void ThreadData::publishStat(const std::string &topic, uint64_t n) |
| 131 | 131 | subscriptionStore->setRetainedMessage(topic, subtopics, payload, 0); |
| 132 | 132 | } |
| 133 | 133 | |
| 134 | +void ThreadData::removeQueuedClients() | |
| 135 | +{ | |
| 136 | + std::vector<int> fds; | |
| 137 | + fds.reserve(1024); // 1024 is arbitrary... | |
| 138 | + | |
| 139 | + { | |
| 140 | + std::lock_guard<std::mutex> lck2(clientsToRemoveMutex); | |
| 141 | + | |
| 142 | + for (const std::weak_ptr<Client> &c : clientsQueuedForRemoving) | |
| 143 | + { | |
| 144 | + std::shared_ptr<Client> client = c.lock(); | |
| 145 | + if (client) | |
| 146 | + { | |
| 147 | + int fd = client->getFd(); | |
| 148 | + fds.push_back(fd); | |
| 149 | + } | |
| 150 | + } | |
| 151 | + | |
| 152 | + clientsQueuedForRemoving.clear(); | |
| 153 | + } | |
| 154 | + | |
| 155 | + { | |
| 156 | + std::lock_guard<std::mutex> lck(clients_by_fd_mutex); | |
| 157 | + for(int fd : fds) | |
| 158 | + { | |
| 159 | + clients_by_fd.erase(fd); | |
| 160 | + } | |
| 161 | + } | |
| 162 | +} | |
| 163 | + | |
| 134 | 164 | void ThreadData::giveClient(std::shared_ptr<Client> client) |
| 135 | 165 | { |
| 136 | 166 | clients_by_fd_mutex.lock(); |
| ... | ... | @@ -151,25 +181,70 @@ std::shared_ptr<Client> ThreadData::getClient(int fd) |
| 151 | 181 | return this->clients_by_fd[fd]; |
| 152 | 182 | } |
| 153 | 183 | |
| 154 | -void ThreadData::removeClient(std::shared_ptr<Client> client) | |
| 184 | +void ThreadData::removeClientQueued(const std::shared_ptr<Client> &client) | |
| 155 | 185 | { |
| 156 | - client->markAsDisconnecting(); | |
| 186 | + bool wakeUpNeeded = true; | |
| 157 | 187 | |
| 158 | - std::lock_guard<std::mutex> lck(clients_by_fd_mutex); | |
| 159 | - clients_by_fd.erase(client->getFd()); | |
| 188 | + { | |
| 189 | + std::lock_guard<std::mutex> locker(clientsToRemoveMutex); | |
| 190 | + wakeUpNeeded = clientsQueuedForRemoving.empty(); | |
| 191 | + clientsQueuedForRemoving.push_front(client); | |
| 192 | + } | |
| 193 | + | |
| 194 | + if (wakeUpNeeded) | |
| 195 | + { | |
| 196 | + auto f = std::bind(&ThreadData::removeQueuedClients, this); | |
| 197 | + std::lock_guard<std::mutex> lockertaskQueue(taskQueueMutex); | |
| 198 | + taskQueue.push_front(f); | |
| 199 | + | |
| 200 | + wakeUpThread(); | |
| 201 | + } | |
| 160 | 202 | } |
| 161 | 203 | |
| 162 | -void ThreadData::removeClient(int fd) | |
| 204 | +void ThreadData::removeClientQueued(int fd) | |
| 163 | 205 | { |
| 164 | - std::lock_guard<std::mutex> lck(clients_by_fd_mutex); | |
| 165 | - auto client_it = this->clients_by_fd.find(fd); | |
| 166 | - if (client_it != this->clients_by_fd.end()) | |
| 206 | + bool wakeUpNeeded = true; | |
| 207 | + std::shared_ptr<Client> clientFound; | |
| 208 | + | |
| 209 | + { | |
| 210 | + std::lock_guard<std::mutex> lck(clients_by_fd_mutex); | |
| 211 | + auto client_it = this->clients_by_fd.find(fd); | |
| 212 | + if (client_it != this->clients_by_fd.end()) | |
| 213 | + { | |
| 214 | + clientFound = client_it->second; | |
| 215 | + } | |
| 216 | + } | |
| 217 | + | |
| 218 | + if (clientFound) | |
| 167 | 219 | { |
| 168 | - client_it->second->markAsDisconnecting(); | |
| 169 | - this->clients_by_fd.erase(fd); | |
| 220 | + { | |
| 221 | + std::lock_guard<std::mutex> locker(clientsToRemoveMutex); | |
| 222 | + wakeUpNeeded = clientsQueuedForRemoving.empty(); | |
| 223 | + clientsQueuedForRemoving.push_front(clientFound); | |
| 224 | + } | |
| 225 | + | |
| 226 | + if (wakeUpNeeded) | |
| 227 | + { | |
| 228 | + auto f = std::bind(&ThreadData::removeQueuedClients, this); | |
| 229 | + std::lock_guard<std::mutex> lockertaskQueue(taskQueueMutex); | |
| 230 | + taskQueue.push_front(f); | |
| 231 | + | |
| 232 | + wakeUpThread(); | |
| 233 | + } | |
| 170 | 234 | } |
| 171 | 235 | } |
| 172 | 236 | |
| 237 | +void ThreadData::removeClient(std::shared_ptr<Client> client) | |
| 238 | +{ | |
| 239 | + // This function is only for same-thread calling. | |
| 240 | + assert(pthread_self() == thread.native_handle()); | |
| 241 | + | |
| 242 | + client->markAsDisconnecting(); | |
| 243 | + | |
| 244 | + std::lock_guard<std::mutex> lck(clients_by_fd_mutex); | |
| 245 | + clients_by_fd.erase(client->getFd()); | |
| 246 | +} | |
| 247 | + | |
| 173 | 248 | std::shared_ptr<SubscriptionStore> &ThreadData::getSubscriptionStore() |
| 174 | 249 | { |
| 175 | 250 | return subscriptionStore; | ... | ... |
threaddata.h
| ... | ... | @@ -56,6 +56,8 @@ class ThreadData |
| 56 | 56 | uint64_t sentMessageCountPrevious = 0; |
| 57 | 57 | std::chrono::time_point<std::chrono::steady_clock> sentMessagePreviousTime = std::chrono::steady_clock::now(); |
| 58 | 58 | |
| 59 | + std::mutex clientsToRemoveMutex; | |
| 60 | + std::forward_list<std::weak_ptr<Client>> clientsQueuedForRemoving; | |
| 59 | 61 | |
| 60 | 62 | void reload(std::shared_ptr<Settings> settings); |
| 61 | 63 | void wakeUpThread(); |
| ... | ... | @@ -64,6 +66,8 @@ class ThreadData |
| 64 | 66 | void publishStatsOnDollarTopic(std::vector<std::shared_ptr<ThreadData>> &threads); |
| 65 | 67 | void publishStat(const std::string &topic, uint64_t n); |
| 66 | 68 | |
| 69 | + void removeQueuedClients(); | |
| 70 | + | |
| 67 | 71 | public: |
| 68 | 72 | Settings settingsLocalCopy; // Is updated on reload, within the thread loop. |
| 69 | 73 | Authentication authentication; |
| ... | ... | @@ -84,8 +88,9 @@ public: |
| 84 | 88 | |
| 85 | 89 | void giveClient(std::shared_ptr<Client> client); |
| 86 | 90 | std::shared_ptr<Client> getClient(int fd); |
| 91 | + void removeClientQueued(const std::shared_ptr<Client> &client); | |
| 92 | + void removeClientQueued(int fd); | |
| 87 | 93 | void removeClient(std::shared_ptr<Client> client); |
| 88 | - void removeClient(int fd); | |
| 89 | 94 | std::shared_ptr<SubscriptionStore> &getSubscriptionStore(); |
| 90 | 95 | |
| 91 | 96 | void initAuthPlugin(); | ... | ... |
threadloop.cpp
| ... | ... | @@ -64,12 +64,21 @@ void do_thread_work(ThreadData *threadData) |
| 64 | 64 | uint64_t eventfd_value = 0; |
| 65 | 65 | check<std::runtime_error>(read(fd, &eventfd_value, sizeof(uint64_t))); |
| 66 | 66 | |
| 67 | - std::lock_guard<std::mutex> locker(threadData->taskQueueMutex); | |
| 68 | - for(auto &f : threadData->taskQueue) | |
| 67 | + std::forward_list<std::function<void()>> copiedTasks; | |
| 68 | + | |
| 69 | + { | |
| 70 | + std::lock_guard<std::mutex> locker(threadData->taskQueueMutex); | |
| 71 | + for(auto &f : threadData->taskQueue) | |
| 72 | + { | |
| 73 | + copiedTasks.push_front(std::move(f)); | |
| 74 | + } | |
| 75 | + threadData->taskQueue.clear(); | |
| 76 | + } | |
| 77 | + | |
| 78 | + for(auto &f : copiedTasks) | |
| 69 | 79 | { |
| 70 | 80 | f(); |
| 71 | 81 | } |
| 72 | - threadData->taskQueue.clear(); | |
| 73 | 82 | |
| 74 | 83 | continue; |
| 75 | 84 | } | ... | ... |