From 0ffddafc3a3970ef7013696e7f36b3d378bc4c16 Mon Sep 17 00:00:00 2001 From: Wang Hai Date: Mon, 15 Jun 2020 11:25:33 +0800 Subject: dlm: Fix kobject memleak Currently the error return path from kobject_init_and_add() is not followed by a call to kobject_put() - which means we are leaking the kobject. Set do_unreg = 1 before kobject_init_and_add() to ensure that kobject_put() can be called in its error patch. Fixes: 901195ed7f4b ("Kobject: change GFS2 to use kobject_init_and_add") Reported-by: Hulk Robot Signed-off-by: Wang Hai Signed-off-by: David Teigland --- fs/dlm/lockspace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index e93670ecfae5..624617c12250 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -622,6 +622,9 @@ static int new_lockspace(const char *name, const char *cluster, wait_event(ls->ls_recover_lock_wait, test_bit(LSFL_RECOVER_LOCK, &ls->ls_flags)); + /* let kobject handle freeing of ls if there's an error */ + do_unreg = 1; + ls->ls_kobj.kset = dlm_kset; error = kobject_init_and_add(&ls->ls_kobj, &dlm_ktype, NULL, "%s", ls->ls_name); @@ -629,9 +632,6 @@ static int new_lockspace(const char *name, const char *cluster, goto out_recoverd; kobject_uevent(&ls->ls_kobj, KOBJ_ADD); - /* let kobject handle freeing of ls if there's an error */ - do_unreg = 1; - /* This uevent triggers dlm_controld in userspace to add us to the group of nodes that are members of this lockspace (managed by the cluster infrastructure.) Once it's done that, it tells us who the -- cgit v1.2.3 From a5b7ab6352bfaab6eec4df6618a135341d72c247 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Fri, 26 Jun 2020 13:26:49 -0400 Subject: fs: dlm: set skb mark for listen socket This patch adds support to set the skb mark value for the DLM listen tcp and sctp sockets. The mark value will be offered as cluster configuration. At creation time of the listen socket it will be set as socket option. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/config.c | 6 ++++++ fs/dlm/config.h | 1 + fs/dlm/lowcomms.c | 3 +++ 3 files changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/dlm/config.c b/fs/dlm/config.c index 3b21082e1b55..d74655cd6cd3 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -73,6 +73,7 @@ struct dlm_cluster { unsigned int cl_log_debug; unsigned int cl_log_info; unsigned int cl_protocol; + unsigned int cl_mark; unsigned int cl_timewarn_cs; unsigned int cl_waitwarn_us; unsigned int cl_new_rsb_count; @@ -96,6 +97,7 @@ enum { CLUSTER_ATTR_LOG_DEBUG, CLUSTER_ATTR_LOG_INFO, CLUSTER_ATTR_PROTOCOL, + CLUSTER_ATTR_MARK, CLUSTER_ATTR_TIMEWARN_CS, CLUSTER_ATTR_WAITWARN_US, CLUSTER_ATTR_NEW_RSB_COUNT, @@ -168,6 +170,7 @@ CLUSTER_ATTR(scan_secs, 1); CLUSTER_ATTR(log_debug, 0); CLUSTER_ATTR(log_info, 0); CLUSTER_ATTR(protocol, 0); +CLUSTER_ATTR(mark, 0); CLUSTER_ATTR(timewarn_cs, 1); CLUSTER_ATTR(waitwarn_us, 0); CLUSTER_ATTR(new_rsb_count, 0); @@ -183,6 +186,7 @@ static struct configfs_attribute *cluster_attrs[] = { [CLUSTER_ATTR_LOG_DEBUG] = &cluster_attr_log_debug, [CLUSTER_ATTR_LOG_INFO] = &cluster_attr_log_info, [CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol, + [CLUSTER_ATTR_MARK] = &cluster_attr_mark, [CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs, [CLUSTER_ATTR_WAITWARN_US] = &cluster_attr_waitwarn_us, [CLUSTER_ATTR_NEW_RSB_COUNT] = &cluster_attr_new_rsb_count, @@ -855,6 +859,7 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num) #define DEFAULT_LOG_DEBUG 0 #define DEFAULT_LOG_INFO 1 #define DEFAULT_PROTOCOL 0 +#define DEFAULT_MARK 0 #define DEFAULT_TIMEWARN_CS 500 /* 5 sec = 500 centiseconds */ #define DEFAULT_WAITWARN_US 0 #define DEFAULT_NEW_RSB_COUNT 128 @@ -871,6 +876,7 @@ struct dlm_config_info dlm_config = { .ci_log_debug = DEFAULT_LOG_DEBUG, .ci_log_info = DEFAULT_LOG_INFO, .ci_protocol = DEFAULT_PROTOCOL, + .ci_mark = DEFAULT_MARK, .ci_timewarn_cs = DEFAULT_TIMEWARN_CS, .ci_waitwarn_us = DEFAULT_WAITWARN_US, .ci_new_rsb_count = DEFAULT_NEW_RSB_COUNT, diff --git a/fs/dlm/config.h b/fs/dlm/config.h index 2b471aae4e61..13508ec3ff5e 100644 --- a/fs/dlm/config.h +++ b/fs/dlm/config.h @@ -31,6 +31,7 @@ struct dlm_config_info { int ci_log_debug; int ci_log_info; int ci_protocol; + int ci_mark; int ci_timewarn_cs; int ci_waitwarn_us; int ci_new_rsb_count; diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 3543a8fec907..eaedad7d069a 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1111,6 +1111,8 @@ static struct socket *tcp_create_listen_sock(struct connection *con, goto create_out; } + sock_set_mark(sock->sk, dlm_config.ci_mark); + /* Turn off Nagle's algorithm */ tcp_sock_set_nodelay(sock->sk); @@ -1185,6 +1187,7 @@ static int sctp_listen_for_all(void) } sock_set_rcvbuf(sock->sk, NEEDED_RMEM); + sock_set_mark(sock->sk, dlm_config.ci_mark); sctp_sock_set_nodelay(sock->sk); write_lock_bh(&sock->sk->sk_callback_lock); -- cgit v1.2.3 From 9c9f168f5b145986535f727d259ef75f6ea26990 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Fri, 26 Jun 2020 13:26:50 -0400 Subject: fs: dlm: set skb mark per peer socket This patch adds support to set the skb mark value for the DLM tcp and sctp socket per peer. The mark value will be offered as per comm value of configfs. At creation time of the peer socket it will be set as socket option. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/config.c | 38 ++++++++++++++++++++++++++++++++++++++ fs/dlm/config.h | 1 + fs/dlm/lowcomms.c | 16 ++++++++++++++++ 3 files changed, 55 insertions(+) (limited to 'fs') diff --git a/fs/dlm/config.c b/fs/dlm/config.c index d74655cd6cd3..47f0b98b707f 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -200,6 +200,7 @@ enum { COMM_ATTR_LOCAL, COMM_ATTR_ADDR, COMM_ATTR_ADDR_LIST, + COMM_ATTR_MARK, }; enum { @@ -232,6 +233,7 @@ struct dlm_comm { int nodeid; int local; int addr_count; + unsigned int mark; struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT]; }; @@ -469,6 +471,7 @@ static struct config_item *make_comm(struct config_group *g, const char *name) cm->nodeid = -1; cm->local = 0; cm->addr_count = 0; + cm->mark = 0; return &cm->item; } @@ -664,8 +667,28 @@ static ssize_t comm_addr_list_show(struct config_item *item, char *buf) return 4096 - allowance; } +static ssize_t comm_mark_show(struct config_item *item, char *buf) +{ + return sprintf(buf, "%u\n", config_item_to_comm(item)->mark); +} + +static ssize_t comm_mark_store(struct config_item *item, const char *buf, + size_t len) +{ + unsigned int mark; + int rc; + + rc = kstrtouint(buf, 0, &mark); + if (rc) + return rc; + + config_item_to_comm(item)->mark = mark; + return len; +} + CONFIGFS_ATTR(comm_, nodeid); CONFIGFS_ATTR(comm_, local); +CONFIGFS_ATTR(comm_, mark); CONFIGFS_ATTR_WO(comm_, addr); CONFIGFS_ATTR_RO(comm_, addr_list); @@ -674,6 +697,7 @@ static struct configfs_attribute *comm_attrs[] = { [COMM_ATTR_LOCAL] = &comm_attr_local, [COMM_ATTR_ADDR] = &comm_attr_addr, [COMM_ATTR_ADDR_LIST] = &comm_attr_addr_list, + [COMM_ATTR_MARK] = &comm_attr_mark, NULL, }; @@ -833,6 +857,20 @@ int dlm_comm_seq(int nodeid, uint32_t *seq) return 0; } +int dlm_comm_mark(int nodeid, unsigned int *mark) +{ + struct dlm_comm *cm; + + cm = get_comm(nodeid); + if (!cm) + return -ENOENT; + + *mark = cm->mark; + put_comm(cm); + + return 0; +} + int dlm_our_nodeid(void) { return local_comm ? local_comm->nodeid : 0; diff --git a/fs/dlm/config.h b/fs/dlm/config.h index 13508ec3ff5e..f62996cad561 100644 --- a/fs/dlm/config.h +++ b/fs/dlm/config.h @@ -46,6 +46,7 @@ void dlm_config_exit(void); int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out, int *count_out); int dlm_comm_seq(int nodeid, uint32_t *seq); +int dlm_comm_mark(int nodeid, unsigned int *mark); int dlm_our_nodeid(void); int dlm_our_addr(struct sockaddr_storage *addr, int num); diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index eaedad7d069a..3fa1b93dbbc7 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -914,6 +914,7 @@ static void sctp_connect_to_sock(struct connection *con) int result; int addr_len; struct socket *sock; + unsigned int mark; if (con->nodeid == 0) { log_print("attempt to connect sock 0 foiled"); @@ -944,6 +945,13 @@ static void sctp_connect_to_sock(struct connection *con) if (result < 0) goto socket_err; + /* set skb mark */ + result = dlm_comm_mark(con->nodeid, &mark); + if (result < 0) + goto bind_err; + + sock_set_mark(sock->sk, mark); + con->rx_action = receive_from_sock; con->connect_action = sctp_connect_to_sock; add_sock(sock, con); @@ -1006,6 +1014,7 @@ static void tcp_connect_to_sock(struct connection *con) struct sockaddr_storage saddr, src_addr; int addr_len; struct socket *sock = NULL; + unsigned int mark; int result; if (con->nodeid == 0) { @@ -1027,6 +1036,13 @@ static void tcp_connect_to_sock(struct connection *con) if (result < 0) goto out_err; + /* set skb mark */ + result = dlm_comm_mark(con->nodeid, &mark); + if (result < 0) + goto out_err; + + sock_set_mark(sock->sk, mark); + memset(&saddr, 0, sizeof(saddr)); result = nodeid_to_addr(con->nodeid, &saddr, NULL, false); if (result < 0) { -- cgit v1.2.3 From 0ea47e4d2109efd61e9949d014b37ea835f20861 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Mon, 27 Jul 2020 09:13:36 -0400 Subject: fs: dlm: don't close socket on invalid message This patch doesn't close sockets when there is an invalid dlm message received. The connection will probably reconnect anyway so. To not close the connection will reduce the number of possible failtures. As we don't have a different strategy to react on such scenario just keep going the connection and ignore the message. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 3fa1b93dbbc7..9e6acbb47bb9 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -685,14 +685,14 @@ static int receive_from_sock(struct connection *con) page_address(con->rx_page), con->cb.base, con->cb.len, PAGE_SIZE); - if (ret == -EBADMSG) { - log_print("lowcomms: addr=%p, base=%u, len=%u, read=%d", - page_address(con->rx_page), con->cb.base, + if (ret < 0) { + log_print("lowcomms err %d: addr=%p, base=%u, len=%u, read=%d", + ret, page_address(con->rx_page), con->cb.base, con->cb.len, r); + cbuf_eat(&con->cb, r); + } else { + cbuf_eat(&con->cb, ret); } - if (ret < 0) - goto out_close; - cbuf_eat(&con->cb, ret); if (cbuf_empty(&con->cb) && !call_again_soon) { __free_page(con->rx_page); -- cgit v1.2.3 From ba3ab3ca68caafb7700c4abae357b7fb7538df11 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Mon, 27 Jul 2020 09:13:37 -0400 Subject: fs: dlm: change handling of reconnects This patch changes the handling of reconnects. At first we only close the connection related to the communication failure. If we get a new connection for an already existing connection we close the existing connection and take the new one. This patch improves significantly the stability of tcp connections while running "tcpkill -9 -i $IFACE port 21064" while generating a lot of dlm messages e.g. on a gfs2 mount with many files. My test setup shows that a deadlock is "more" unlikely. Before this patch I wasn't able to get not a deadlock after 5 seconds. After this patch my observation is that it's more likely to survive after 5 seconds and more, but still a deadlock occurs after certain time. My guess is that there are still "segments" inside the tcp writequeue or retransmit queue which get dropped when receiving a tcp reset [1]. Hard to reproduce because the right message need to be inside these queues, which might even be in the 5 first seconds with this patch. [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/net/ipv4/tcp_input.c?h=v5.8-rc6#n4122 Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 9e6acbb47bb9..289439fdca99 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -713,7 +713,7 @@ out_resched: out_close: mutex_unlock(&con->sock_mutex); if (ret != -EAGAIN) { - close_connection(con, true, true, false); + close_connection(con, false, true, false); /* Reconnect when there is something to send */ } /* Don't return success if we really got EOF */ @@ -804,21 +804,16 @@ static int accept_from_sock(struct connection *con) INIT_WORK(&othercon->swork, process_send_sockets); INIT_WORK(&othercon->rwork, process_recv_sockets); set_bit(CF_IS_OTHERCON, &othercon->flags); + } else { + /* close other sock con if we have something new */ + close_connection(othercon, false, true, false); } + mutex_lock_nested(&othercon->sock_mutex, 2); - if (!othercon->sock) { - newcon->othercon = othercon; - add_sock(newsock, othercon); - addcon = othercon; - mutex_unlock(&othercon->sock_mutex); - } - else { - printk("Extra connection from node %d attempted\n", nodeid); - result = -EAGAIN; - mutex_unlock(&othercon->sock_mutex); - mutex_unlock(&newcon->sock_mutex); - goto accept_err; - } + newcon->othercon = othercon; + add_sock(newsock, othercon); + addcon = othercon; + mutex_unlock(&othercon->sock_mutex); } else { newcon->rx_action = receive_from_sock; @@ -1415,7 +1410,7 @@ out: send_error: mutex_unlock(&con->sock_mutex); - close_connection(con, true, false, true); + close_connection(con, false, false, true); /* Requeue the send work. When the work daemon runs again, it will try a new connection, then call this function again. */ queue_work(send_workqueue, &con->swork); -- cgit v1.2.3 From 055923bf6b48659755b5f0169e34107ee2cb9b68 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Mon, 27 Jul 2020 09:13:38 -0400 Subject: fs: dlm: implement tcp graceful shutdown During my code inspection I saw there is no implementation of a graceful shutdown for tcp. This patch will introduce a graceful shutdown for tcp connections. The shutdown is implemented synchronized as dlm_lowcomms_stop() is called to end all dlm communication. After shutdown is done, a lot of flush and closing functionality will be called. However I don't see a problem with that. The waitqueue for synchronize the shutdown has a timeout of 10 seconds, if timeout a force close will be exectued. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 72 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 289439fdca99..5050fe05769b 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -63,6 +63,7 @@ /* Number of messages to send before rescheduling */ #define MAX_SEND_MSG_COUNT 25 +#define DLM_SHUTDOWN_WAIT_TIMEOUT msecs_to_jiffies(10000) struct cbuf { unsigned int base; @@ -110,10 +111,12 @@ struct connection { #define CF_CLOSE 6 #define CF_APP_LIMITED 7 #define CF_CLOSING 8 +#define CF_SHUTDOWN 9 struct list_head writequeue; /* List of outgoing writequeue_entries */ spinlock_t writequeue_lock; int (*rx_action) (struct connection *); /* What to do when active */ void (*connect_action) (struct connection *); /* What to do to connect */ + void (*shutdown_action)(struct connection *con); /* What to do to shutdown */ struct page *rx_page; struct cbuf cb; int retries; @@ -122,6 +125,7 @@ struct connection { struct connection *othercon; struct work_struct rwork; /* Receive workqueue */ struct work_struct swork; /* Send workqueue */ + wait_queue_head_t shutdown_wait; /* wait for graceful shutdown */ }; #define sock2con(x) ((struct connection *)(x)->sk_user_data) @@ -218,6 +222,7 @@ static struct connection *__nodeid2con(int nodeid, gfp_t alloc) spin_lock_init(&con->writequeue_lock); INIT_WORK(&con->swork, process_send_sockets); INIT_WORK(&con->rwork, process_recv_sockets); + init_waitqueue_head(&con->shutdown_wait); /* Setup action pointers for child sockets */ if (con->nodeid) { @@ -619,6 +624,54 @@ static void close_connection(struct connection *con, bool and_other, clear_bit(CF_CLOSING, &con->flags); } +static void shutdown_connection(struct connection *con) +{ + int ret; + + if (cancel_work_sync(&con->swork)) { + log_print("canceled swork for node %d", con->nodeid); + clear_bit(CF_WRITE_PENDING, &con->flags); + } + + mutex_lock(&con->sock_mutex); + /* nothing to shutdown */ + if (!con->sock) { + mutex_unlock(&con->sock_mutex); + return; + } + + set_bit(CF_SHUTDOWN, &con->flags); + ret = kernel_sock_shutdown(con->sock, SHUT_WR); + mutex_unlock(&con->sock_mutex); + if (ret) { + log_print("Connection %p failed to shutdown: %d will force close", + con, ret); + goto force_close; + } else { + ret = wait_event_timeout(con->shutdown_wait, + !test_bit(CF_SHUTDOWN, &con->flags), + DLM_SHUTDOWN_WAIT_TIMEOUT); + if (ret == 0) { + log_print("Connection %p shutdown timed out, will force close", + con); + goto force_close; + } + } + + return; + +force_close: + clear_bit(CF_SHUTDOWN, &con->flags); + close_connection(con, false, true, true); +} + +static void dlm_tcp_shutdown(struct connection *con) +{ + if (con->othercon) + shutdown_connection(con->othercon); + shutdown_connection(con); +} + /* Data received from remote end */ static int receive_from_sock(struct connection *con) { @@ -713,13 +766,18 @@ out_resched: out_close: mutex_unlock(&con->sock_mutex); if (ret != -EAGAIN) { - close_connection(con, false, true, false); /* Reconnect when there is something to send */ + close_connection(con, false, true, false); + if (ret == 0) { + log_print("connection %p got EOF from %d", + con, con->nodeid); + /* handling for tcp shutdown */ + clear_bit(CF_SHUTDOWN, &con->flags); + wake_up(&con->shutdown_wait); + /* signal to breaking receive worker */ + ret = -1; + } } - /* Don't return success if we really got EOF */ - if (ret == 0) - ret = -EAGAIN; - return ret; } @@ -803,6 +861,7 @@ static int accept_from_sock(struct connection *con) spin_lock_init(&othercon->writequeue_lock); INIT_WORK(&othercon->swork, process_send_sockets); INIT_WORK(&othercon->rwork, process_recv_sockets); + init_waitqueue_head(&othercon->shutdown_wait); set_bit(CF_IS_OTHERCON, &othercon->flags); } else { /* close other sock con if we have something new */ @@ -1047,6 +1106,7 @@ static void tcp_connect_to_sock(struct connection *con) con->rx_action = receive_from_sock; con->connect_action = tcp_connect_to_sock; + con->shutdown_action = dlm_tcp_shutdown; add_sock(sock, con); /* Bind to our cluster-known address connecting to avoid @@ -1542,6 +1602,12 @@ static void stop_conn(struct connection *con) _stop_conn(con, true); } +static void shutdown_conn(struct connection *con) +{ + if (con->shutdown_action) + con->shutdown_action(con); +} + static void free_conn(struct connection *con) { close_connection(con, true, true, true); @@ -1593,6 +1659,7 @@ void dlm_lowcomms_stop(void) mutex_lock(&connections_lock); dlm_allow_conn = 0; mutex_unlock(&connections_lock); + foreach_conn(shutdown_conn); work_flush(); clean_writequeues(); foreach_conn(free_conn); -- cgit v1.2.3