diff options
author | David S. Miller <davem@davemloft.net> | 2016-11-17 21:35:19 +0300 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2016-11-17 21:35:19 +0300 |
commit | fcd2b0da730fab9b125296e859716c9652614721 (patch) | |
tree | 234b4827c3c99e6d911bd4b7ea787a1185e695df | |
parent | b3e51069627e2b9439757590d7b82be0d22c7779 (diff) | |
parent | 1a0e100fb2c9667cea2a7d755faaa83569942f05 (diff) | |
download | linux-fcd2b0da730fab9b125296e859716c9652614721.tar.xz |
Merge branch 'rds-ha-failover-fixes'
Sowmini Varadhan says:
====================
RDS: TCP: HA/Failover fixes
This series contains a set of fixes for bugs exposed when
we ran the following in a loop between a test machine pair:
while (1); do
# modprobe rds-tcp on test nodes
# run rds-stress in bi-dir mode between test machine pair
# modprobe -r rds-tcp on test nodes
done
rds-stress in bi-dir mode will cause both nodes to initiate
RDS-TCP connections at almost the same instant, exposing the
bugs fixed in this series.
Without the fixes, rds-stress reports sporadic packet drops,
and packets arriving out of sequence. After the fixes, we have
been able to run the test overnight, without any issues.
Each patch has a detailed description of the root-cause fixed
by the patch.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | net/rds/af_rds.c | 4 | ||||
-rw-r--r-- | net/rds/connection.c | 3 | ||||
-rw-r--r-- | net/rds/message.c | 1 | ||||
-rw-r--r-- | net/rds/rds.h | 8 | ||||
-rw-r--r-- | net/rds/recv.c | 36 | ||||
-rw-r--r-- | net/rds/send.c | 9 | ||||
-rw-r--r-- | net/rds/tcp_connect.c | 14 | ||||
-rw-r--r-- | net/rds/tcp_listen.c | 29 | ||||
-rw-r--r-- | net/rds/tcp_send.c | 3 |
9 files changed, 86 insertions, 21 deletions
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index 6beaeb1138f3..2ac1e6194be3 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -605,10 +605,14 @@ static void rds_exit(void) } module_exit(rds_exit); +u32 rds_gen_num; + static int rds_init(void) { int ret; + net_get_random_once(&rds_gen_num, sizeof(rds_gen_num)); + ret = rds_bind_lock_init(); if (ret) goto out; diff --git a/net/rds/connection.c b/net/rds/connection.c index 13f459dad4ef..fe9d31c0b22d 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -269,6 +269,8 @@ static struct rds_connection *__rds_conn_create(struct net *net, kmem_cache_free(rds_conn_slab, conn); conn = found; } else { + conn->c_my_gen_num = rds_gen_num; + conn->c_peer_gen_num = 0; hlist_add_head_rcu(&conn->c_hash_node, head); rds_cong_add_conn(conn); rds_conn_count++; @@ -681,6 +683,7 @@ void rds_conn_path_connect_if_down(struct rds_conn_path *cp) !test_and_set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags)) queue_delayed_work(rds_wq, &cp->cp_conn_w, 0); } +EXPORT_SYMBOL_GPL(rds_conn_path_connect_if_down); void rds_conn_connect_if_down(struct rds_connection *conn) { diff --git a/net/rds/message.c b/net/rds/message.c index 6cb91061556a..49bfb512d808 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -42,6 +42,7 @@ static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = { [RDS_EXTHDR_RDMA] = sizeof(struct rds_ext_header_rdma), [RDS_EXTHDR_RDMA_DEST] = sizeof(struct rds_ext_header_rdma_dest), [RDS_EXTHDR_NPATHS] = sizeof(u16), +[RDS_EXTHDR_GEN_NUM] = sizeof(u32), }; diff --git a/net/rds/rds.h b/net/rds/rds.h index 4121e1862444..ebbf909b87ec 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -151,6 +151,9 @@ struct rds_connection { struct rds_conn_path c_path[RDS_MPATH_WORKERS]; wait_queue_head_t c_hs_waitq; /* handshake waitq */ + + u32 c_my_gen_num; + u32 c_peer_gen_num; }; static inline @@ -243,7 +246,8 @@ struct rds_ext_header_rdma_dest { /* Extension header announcing number of paths. * Implicit length = 2 bytes. 
*/ -#define RDS_EXTHDR_NPATHS 4 +#define RDS_EXTHDR_NPATHS 5 +#define RDS_EXTHDR_GEN_NUM 6 #define __RDS_EXTHDR_MAX 16 /* for now */ @@ -338,6 +342,7 @@ static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie) #define RDS_MSG_RETRANSMITTED 5 #define RDS_MSG_MAPPED 6 #define RDS_MSG_PAGEVEC 7 +#define RDS_MSG_FLUSH 8 struct rds_message { atomic_t m_refcount; @@ -664,6 +669,7 @@ void rds_cong_exit(void); struct rds_message *rds_cong_update_alloc(struct rds_connection *conn); /* conn.c */ +extern u32 rds_gen_num; int rds_conn_init(void); void rds_conn_exit(void); struct rds_connection *rds_conn_create(struct net *net, diff --git a/net/rds/recv.c b/net/rds/recv.c index cbfabdf3ff48..9d0666e5fe35 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -120,6 +120,36 @@ static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk, /* do nothing if no change in cong state */ } +static void rds_conn_peer_gen_update(struct rds_connection *conn, + u32 peer_gen_num) +{ + int i; + struct rds_message *rm, *tmp; + unsigned long flags; + + WARN_ON(conn->c_trans->t_type != RDS_TRANS_TCP); + if (peer_gen_num != 0) { + if (conn->c_peer_gen_num != 0 && + peer_gen_num != conn->c_peer_gen_num) { + for (i = 0; i < RDS_MPATH_WORKERS; i++) { + struct rds_conn_path *cp; + + cp = &conn->c_path[i]; + spin_lock_irqsave(&cp->cp_lock, flags); + cp->cp_next_tx_seq = 1; + cp->cp_next_rx_seq = 0; + list_for_each_entry_safe(rm, tmp, + &cp->cp_retrans, + m_conn_item) { + set_bit(RDS_MSG_FLUSH, &rm->m_flags); + } + spin_unlock_irqrestore(&cp->cp_lock, flags); + } + } + conn->c_peer_gen_num = peer_gen_num; + } +} + /* * Process all extension headers that come with this message. 
*/ @@ -163,7 +193,9 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr, union { struct rds_ext_header_version version; u16 rds_npaths; + u32 rds_gen_num; } buffer; + u32 new_peer_gen_num = 0; while (1) { len = sizeof(buffer); @@ -176,6 +208,9 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr, conn->c_npaths = min_t(int, RDS_MPATH_WORKERS, buffer.rds_npaths); break; + case RDS_EXTHDR_GEN_NUM: + new_peer_gen_num = buffer.rds_gen_num; + break; default: pr_warn_ratelimited("ignoring unknown exthdr type " "0x%x\n", type); @@ -183,6 +218,7 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr, } /* if RDS_EXTHDR_NPATHS was not found, default to a single-path */ conn->c_npaths = max_t(int, conn->c_npaths, 1); + rds_conn_peer_gen_update(conn, new_peer_gen_num); } /* rds_start_mprds() will synchronously start multiple paths when appropriate. diff --git a/net/rds/send.c b/net/rds/send.c index 896626b9a0ef..77c8c6e613ad 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -259,8 +259,9 @@ restart: * connection. * Therefore, we never retransmit messages with RDMA ops. 
*/ - if (rm->rdma.op_active && - test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) { + if (test_bit(RDS_MSG_FLUSH, &rm->m_flags) || + (rm->rdma.op_active && + test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags))) { spin_lock_irqsave(&cp->cp_lock, flags); if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) list_move(&rm->m_conn_item, &to_be_dropped); @@ -1209,6 +1210,10 @@ rds_send_probe(struct rds_conn_path *cp, __be16 sport, rds_message_add_extension(&rm->m_inc.i_hdr, RDS_EXTHDR_NPATHS, &npaths, sizeof(npaths)); + rds_message_add_extension(&rm->m_inc.i_hdr, + RDS_EXTHDR_GEN_NUM, + &cp->cp_conn->c_my_gen_num, + sizeof(u32)); } spin_unlock_irqrestore(&cp->cp_lock, flags); diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index 05f61c533ed3..d6839d96d539 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -60,7 +60,19 @@ void rds_tcp_state_change(struct sock *sk) case TCP_SYN_RECV: break; case TCP_ESTABLISHED: - rds_connect_path_complete(cp, RDS_CONN_CONNECTING); + /* Force the peer to reconnect so that we have the + * TCP ports going from <smaller-ip>.<transient> to + * <larger-ip>.<RDS_TCP_PORT>. We avoid marking the + * RDS connection as RDS_CONN_UP until the reconnect, + * to avoid RDS datagram loss. 
+ */ + if (cp->cp_conn->c_laddr > cp->cp_conn->c_faddr && + rds_conn_path_transition(cp, RDS_CONN_CONNECTING, + RDS_CONN_ERROR)) { + rds_conn_path_drop(cp); + } else { + rds_connect_path_complete(cp, RDS_CONN_CONNECTING); + } break; case TCP_CLOSE_WAIT: case TCP_CLOSE: diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index c9c496844cd7..f74bab3ecdca 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -83,25 +83,20 @@ struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn) { int i; bool peer_is_smaller = (conn->c_faddr < conn->c_laddr); - int npaths = conn->c_npaths; - - if (npaths <= 1) { - struct rds_conn_path *cp = &conn->c_path[0]; - int ret; - - ret = rds_conn_path_transition(cp, RDS_CONN_DOWN, - RDS_CONN_CONNECTING); - if (!ret) - rds_conn_path_transition(cp, RDS_CONN_ERROR, - RDS_CONN_CONNECTING); - return cp->cp_transport_data; - } + int npaths = max_t(int, 1, conn->c_npaths); - /* for mprds, paths with cp_index > 0 MUST be initiated by the peer + /* for mprds, all paths MUST be initiated by the peer * with the smaller address. */ - if (!peer_is_smaller) + if (!peer_is_smaller) { + /* Make sure we initiate at least one path if this + * has not already been done; rds_start_mprds() will + * take care of additional paths, if necessary. + */ + if (npaths == 1) + rds_conn_path_connect_if_down(&conn->c_path[0]); return NULL; + } for (i = 0; i < npaths; i++) { struct rds_conn_path *cp = &conn->c_path[i]; @@ -171,8 +166,8 @@ int rds_tcp_accept_one(struct socket *sock) mutex_lock(&rs_tcp->t_conn_path_lock); cp = rs_tcp->t_cpath; conn_state = rds_conn_path_state(cp); - if (conn_state != RDS_CONN_CONNECTING && conn_state != RDS_CONN_UP && - conn_state != RDS_CONN_ERROR) + WARN_ON(conn_state == RDS_CONN_UP); + if (conn_state != RDS_CONN_CONNECTING && conn_state != RDS_CONN_ERROR) goto rst_nsk; if (rs_tcp->t_sock) { /* Need to resolve a duelling SYN between peers. 
diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c index 89d09b481f47..dcf4742083ea 100644 --- a/net/rds/tcp_send.c +++ b/net/rds/tcp_send.c @@ -100,6 +100,9 @@ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, set_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags); tc->t_last_expected_una = rm->m_ack_seq + 1; + if (test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) + rm->m_inc.i_hdr.h_flags |= RDS_FLAG_RETRANSMITTED; + rdsdebug("rm %p tcp nxt %u ack_seq %llu\n", rm, rds_tcp_snd_nxt(tc), (unsigned long long)rm->m_ack_seq); |