Skip to content

Commit

Permalink
Fix crash when freeing newly created node when nodeIp2String fail
Browse files Browse the repository at this point in the history
In #1441, we found a assert, and decided remove this assert and instead just
free the newly created node and close the link, since if we cannot get the
IP from the link it probably means the connection was closed.
```
=== VALKEY BUG REPORT START: Cut & paste starting from here ===
17847:M 19 Dec 2024 00:15:58.021 # === ASSERTION FAILED ===
17847:M 19 Dec 2024 00:15:58.021 # ==> cluster_legacy.c:3252 'nodeIp2String(node->ip, link, hdr->myip) == C_OK' is not true

------ STACK TRACE ------

17847 valkey-server *
src/valkey-server 127.0.0.1:27131 [cluster](clusterProcessPacket+0x1304) [0x4e5634]
src/valkey-server 127.0.0.1:27131 [cluster](clusterReadHandler+0x11e) [0x4e59de]
/__w/valkey/valkey/src/valkey-tls.so(+0x2f1e) [0x7f083983ff1e]
src/valkey-server 127.0.0.1:27131 [cluster](aeMain+0x8a) [0x41afea]
src/valkey-server 127.0.0.1:27131 [cluster](main+0x4d7) [0x40f547]
/lib64/libc.so.6(+0x40c8) [0x7f083985a0c8]
/lib64/libc.so.6(__libc_start_main+0x8b) [0x7f083985a18b]
src/valkey-server 127.0.0.1:27131 [cluster](_start+0x25) [0x410ef5]
```

But it also introduces another assert. The reason is that this new node is
not added to the cluster nodes dict, we should just free it.
```
17128:M 08 Jan 2025 10:51:44.061 # === ASSERTION FAILED ===
17128:M 08 Jan 2025 10:51:44.061 # ==> cluster_legacy.c:1693 'dictDelete(server.cluster->nodes, nodename) == DICT_OK' is not true

------ STACK TRACE ------

17128 valkey-server *
src/valkey-server 127.0.0.1:28627 [cluster][0x4ebdc4]
src/valkey-server 127.0.0.1:28627 [cluster][0x4e81d2]
src/valkey-server 127.0.0.1:28627 [cluster](clusterReadHandler+0x268)[0x4e8618]
/__w/valkey/valkey/src/valkey-tls.so(+0xb278)[0x7f109480b278]
src/valkey-server 127.0.0.1:28627 [cluster](aeMain+0x89)[0x592b09]
src/valkey-server 127.0.0.1:28627 [cluster](main+0x4b3)[0x453e23]
/lib64/libc.so.6(__libc_start_main+0xe5)[0x7f10958bf7e5]
src/valkey-server 127.0.0.1:28627 [cluster](_start+0x2e)[0x454a5e]
```

This closes #1527.

Signed-off-by: Binbin <[email protected]>
  • Loading branch information
enjoy-binbin committed Jan 9, 2025
1 parent b207b42 commit 7aba5e4
Showing 1 changed file with 19 additions and 10 deletions.
29 changes: 19 additions & 10 deletions src/cluster_legacy.c
Original file line number Diff line number Diff line change
Expand Up @@ -1676,8 +1676,12 @@ int clusterCountNonFailingReplicas(clusterNode *n) {
return ok_replicas;
}

/* Low level cleanup of the node structure. Only called by clusterDelNode(). */
void freeClusterNode(clusterNode *n) {
/* Low level cleanup of the node structure.
*
* When delete = 1 is passed, it is called by clusterDelNode and we will delete
* the node from the cluster nodes dict. When delete = 0 is passed, we only free
* the node resources because the node is not added to the cluster nodes dict. */
void freeClusterNode(clusterNode *n, int delete) {
sds nodename;
int j;

Expand All @@ -1689,9 +1693,11 @@ void freeClusterNode(clusterNode *n) {
if (nodeIsReplica(n) && n->replicaof) clusterNodeRemoveReplica(n->replicaof, n);

/* Unlink from the set of nodes. */
nodename = sdsnewlen(n->name, CLUSTER_NAMELEN);
serverAssert(dictDelete(server.cluster->nodes, nodename) == DICT_OK);
sdsfree(nodename);
if (delete) {
nodename = sdsnewlen(n->name, CLUSTER_NAMELEN);
serverAssert(dictDelete(server.cluster->nodes, nodename) == DICT_OK);
sdsfree(nodename);
}
sdsfree(n->hostname);
sdsfree(n->human_nodename);
sdsfree(n->announce_client_ipv4);
Expand Down Expand Up @@ -1754,7 +1760,7 @@ void clusterDelNode(clusterNode *delnode) {
clusterRemoveNodeFromShard(delnode);

/* 4) Free the node, unlinking it from the cluster. */
freeClusterNode(delnode);
freeClusterNode(delnode, 1);
}

/* Node lookup by name */
Expand Down Expand Up @@ -3254,11 +3260,14 @@ int clusterProcessPacket(clusterLink *link) {
* to reduce the amount of time needed to stabilize the shard ID. */
clusterNode *node = createClusterNode(NULL, CLUSTER_NODE_HANDSHAKE);
if (nodeIp2String(node->ip, link, hdr->myip) != C_OK) {
/* We cannot get the IP info from the link, it probably means the connection is closed. */
serverLog(LL_NOTICE, "Closing link even though we received a MEET packet on it, "
"because the connection has an error");
/* Unable to retrieve the node's IP address from the connection. Without a
* valid IP, the node becomes unusable in the cluster. This failure might be
* due to the connection being closed. To avoid leaving the cluster in an
* inconsistent state, we close the link and free the node. */
serverLog(LL_NOTICE, "Closing link and freeing node due to failure to retrieve IP "
"from the connection, possibly caused by a closed connection.");
freeClusterLink(link);
freeClusterNode(node);
freeClusterNode(node, 0);
return 0;
}
getClientPortFromClusterMsg(hdr, &node->tls_port, &node->tcp_port);
Expand Down

0 comments on commit 7aba5e4

Please sign in to comment.