summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2016-11-17 21:35:19 +0300
committerDavid S. Miller <davem@davemloft.net>2016-11-17 21:35:19 +0300
commitfcd2b0da730fab9b125296e859716c9652614721 (patch)
tree234b4827c3c99e6d911bd4b7ea787a1185e695df
parentb3e51069627e2b9439757590d7b82be0d22c7779 (diff)
parent1a0e100fb2c9667cea2a7d755faaa83569942f05 (diff)
downloadlinux-fcd2b0da730fab9b125296e859716c9652614721.tar.xz
Merge branch 'rds-ha-failover-fixes'
Sowmini Varadhan says: ==================== RDS: TCP: HA/Failover fixes This series contains a set of fixes for bugs exposed when we ran the following in a loop between a test machine pair: while (1); do # modprobe rds-tcp on test nodes # run rds-stress in bi-dir mode between test machine pair # modprobe -r rds-tcp on test nodes done rds-stress in bi-dir mode will cause both nodes to initiate RDS-TCP connections at almost the same instant, exposing the bugs fixed in this series. Without the fixes, rds-stress reports sporadic packet drops, and packets arriving out of sequence. After the fixes,we have been able to run the test overnight, without any issues. Each patch has a detailed description of the root-cause fixed by the patch. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--net/rds/af_rds.c4
-rw-r--r--net/rds/connection.c3
-rw-r--r--net/rds/message.c1
-rw-r--r--net/rds/rds.h8
-rw-r--r--net/rds/recv.c36
-rw-r--r--net/rds/send.c9
-rw-r--r--net/rds/tcp_connect.c14
-rw-r--r--net/rds/tcp_listen.c29
-rw-r--r--net/rds/tcp_send.c3
9 files changed, 86 insertions, 21 deletions
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index 6beaeb1138f3..2ac1e6194be3 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -605,10 +605,14 @@ static void rds_exit(void)
}
module_exit(rds_exit);
+u32 rds_gen_num;
+
static int rds_init(void)
{
int ret;
+ net_get_random_once(&rds_gen_num, sizeof(rds_gen_num));
+
ret = rds_bind_lock_init();
if (ret)
goto out;
diff --git a/net/rds/connection.c b/net/rds/connection.c
index 13f459dad4ef..fe9d31c0b22d 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -269,6 +269,8 @@ static struct rds_connection *__rds_conn_create(struct net *net,
kmem_cache_free(rds_conn_slab, conn);
conn = found;
} else {
+ conn->c_my_gen_num = rds_gen_num;
+ conn->c_peer_gen_num = 0;
hlist_add_head_rcu(&conn->c_hash_node, head);
rds_cong_add_conn(conn);
rds_conn_count++;
@@ -681,6 +683,7 @@ void rds_conn_path_connect_if_down(struct rds_conn_path *cp)
!test_and_set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags))
queue_delayed_work(rds_wq, &cp->cp_conn_w, 0);
}
+EXPORT_SYMBOL_GPL(rds_conn_path_connect_if_down);
void rds_conn_connect_if_down(struct rds_connection *conn)
{
diff --git a/net/rds/message.c b/net/rds/message.c
index 6cb91061556a..49bfb512d808 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -42,6 +42,7 @@ static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = {
[RDS_EXTHDR_RDMA] = sizeof(struct rds_ext_header_rdma),
[RDS_EXTHDR_RDMA_DEST] = sizeof(struct rds_ext_header_rdma_dest),
[RDS_EXTHDR_NPATHS] = sizeof(u16),
+[RDS_EXTHDR_GEN_NUM] = sizeof(u32),
};
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 4121e1862444..ebbf909b87ec 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -151,6 +151,9 @@ struct rds_connection {
struct rds_conn_path c_path[RDS_MPATH_WORKERS];
wait_queue_head_t c_hs_waitq; /* handshake waitq */
+
+ u32 c_my_gen_num;
+ u32 c_peer_gen_num;
};
static inline
@@ -243,7 +246,8 @@ struct rds_ext_header_rdma_dest {
/* Extension header announcing number of paths.
* Implicit length = 2 bytes.
*/
-#define RDS_EXTHDR_NPATHS 4
+#define RDS_EXTHDR_NPATHS 5
+#define RDS_EXTHDR_GEN_NUM 6
#define __RDS_EXTHDR_MAX 16 /* for now */
@@ -338,6 +342,7 @@ static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie)
#define RDS_MSG_RETRANSMITTED 5
#define RDS_MSG_MAPPED 6
#define RDS_MSG_PAGEVEC 7
+#define RDS_MSG_FLUSH 8
struct rds_message {
atomic_t m_refcount;
@@ -664,6 +669,7 @@ void rds_cong_exit(void);
struct rds_message *rds_cong_update_alloc(struct rds_connection *conn);
/* conn.c */
+extern u32 rds_gen_num;
int rds_conn_init(void);
void rds_conn_exit(void);
struct rds_connection *rds_conn_create(struct net *net,
diff --git a/net/rds/recv.c b/net/rds/recv.c
index cbfabdf3ff48..9d0666e5fe35 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -120,6 +120,36 @@ static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk,
/* do nothing if no change in cong state */
}
+static void rds_conn_peer_gen_update(struct rds_connection *conn,
+ u32 peer_gen_num)
+{
+ int i;
+ struct rds_message *rm, *tmp;
+ unsigned long flags;
+
+ WARN_ON(conn->c_trans->t_type != RDS_TRANS_TCP);
+ if (peer_gen_num != 0) {
+ if (conn->c_peer_gen_num != 0 &&
+ peer_gen_num != conn->c_peer_gen_num) {
+ for (i = 0; i < RDS_MPATH_WORKERS; i++) {
+ struct rds_conn_path *cp;
+
+ cp = &conn->c_path[i];
+ spin_lock_irqsave(&cp->cp_lock, flags);
+ cp->cp_next_tx_seq = 1;
+ cp->cp_next_rx_seq = 0;
+ list_for_each_entry_safe(rm, tmp,
+ &cp->cp_retrans,
+ m_conn_item) {
+ set_bit(RDS_MSG_FLUSH, &rm->m_flags);
+ }
+ spin_unlock_irqrestore(&cp->cp_lock, flags);
+ }
+ }
+ conn->c_peer_gen_num = peer_gen_num;
+ }
+}
+
/*
* Process all extension headers that come with this message.
*/
@@ -163,7 +193,9 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr,
union {
struct rds_ext_header_version version;
u16 rds_npaths;
+ u32 rds_gen_num;
} buffer;
+ u32 new_peer_gen_num = 0;
while (1) {
len = sizeof(buffer);
@@ -176,6 +208,9 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr,
conn->c_npaths = min_t(int, RDS_MPATH_WORKERS,
buffer.rds_npaths);
break;
+ case RDS_EXTHDR_GEN_NUM:
+ new_peer_gen_num = buffer.rds_gen_num;
+ break;
default:
pr_warn_ratelimited("ignoring unknown exthdr type "
"0x%x\n", type);
@@ -183,6 +218,7 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr,
}
/* if RDS_EXTHDR_NPATHS was not found, default to a single-path */
conn->c_npaths = max_t(int, conn->c_npaths, 1);
+ rds_conn_peer_gen_update(conn, new_peer_gen_num);
}
/* rds_start_mprds() will synchronously start multiple paths when appropriate.
diff --git a/net/rds/send.c b/net/rds/send.c
index 896626b9a0ef..77c8c6e613ad 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -259,8 +259,9 @@ restart:
* connection.
* Therefore, we never retransmit messages with RDMA ops.
*/
- if (rm->rdma.op_active &&
- test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) {
+ if (test_bit(RDS_MSG_FLUSH, &rm->m_flags) ||
+ (rm->rdma.op_active &&
+ test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags))) {
spin_lock_irqsave(&cp->cp_lock, flags);
if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags))
list_move(&rm->m_conn_item, &to_be_dropped);
@@ -1209,6 +1210,10 @@ rds_send_probe(struct rds_conn_path *cp, __be16 sport,
rds_message_add_extension(&rm->m_inc.i_hdr,
RDS_EXTHDR_NPATHS, &npaths,
sizeof(npaths));
+ rds_message_add_extension(&rm->m_inc.i_hdr,
+ RDS_EXTHDR_GEN_NUM,
+ &cp->cp_conn->c_my_gen_num,
+ sizeof(u32));
}
spin_unlock_irqrestore(&cp->cp_lock, flags);
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
index 05f61c533ed3..d6839d96d539 100644
--- a/net/rds/tcp_connect.c
+++ b/net/rds/tcp_connect.c
@@ -60,7 +60,19 @@ void rds_tcp_state_change(struct sock *sk)
case TCP_SYN_RECV:
break;
case TCP_ESTABLISHED:
- rds_connect_path_complete(cp, RDS_CONN_CONNECTING);
+ /* Force the peer to reconnect so that we have the
+ * TCP ports going from <smaller-ip>.<transient> to
+ * <larger-ip>.<RDS_TCP_PORT>. We avoid marking the
+ * RDS connection as RDS_CONN_UP until the reconnect,
+ * to avoid RDS datagram loss.
+ */
+ if (cp->cp_conn->c_laddr > cp->cp_conn->c_faddr &&
+ rds_conn_path_transition(cp, RDS_CONN_CONNECTING,
+ RDS_CONN_ERROR)) {
+ rds_conn_path_drop(cp);
+ } else {
+ rds_connect_path_complete(cp, RDS_CONN_CONNECTING);
+ }
break;
case TCP_CLOSE_WAIT:
case TCP_CLOSE:
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index c9c496844cd7..f74bab3ecdca 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -83,25 +83,20 @@ struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn)
{
int i;
bool peer_is_smaller = (conn->c_faddr < conn->c_laddr);
- int npaths = conn->c_npaths;
-
- if (npaths <= 1) {
- struct rds_conn_path *cp = &conn->c_path[0];
- int ret;
-
- ret = rds_conn_path_transition(cp, RDS_CONN_DOWN,
- RDS_CONN_CONNECTING);
- if (!ret)
- rds_conn_path_transition(cp, RDS_CONN_ERROR,
- RDS_CONN_CONNECTING);
- return cp->cp_transport_data;
- }
+ int npaths = max_t(int, 1, conn->c_npaths);
- /* for mprds, paths with cp_index > 0 MUST be initiated by the peer
+ /* for mprds, all paths MUST be initiated by the peer
* with the smaller address.
*/
- if (!peer_is_smaller)
+ if (!peer_is_smaller) {
+ /* Make sure we initiate at least one path if this
+ * has not already been done; rds_start_mprds() will
+ * take care of additional paths, if necessary.
+ */
+ if (npaths == 1)
+ rds_conn_path_connect_if_down(&conn->c_path[0]);
return NULL;
+ }
for (i = 0; i < npaths; i++) {
struct rds_conn_path *cp = &conn->c_path[i];
@@ -171,8 +166,8 @@ int rds_tcp_accept_one(struct socket *sock)
mutex_lock(&rs_tcp->t_conn_path_lock);
cp = rs_tcp->t_cpath;
conn_state = rds_conn_path_state(cp);
- if (conn_state != RDS_CONN_CONNECTING && conn_state != RDS_CONN_UP &&
- conn_state != RDS_CONN_ERROR)
+ WARN_ON(conn_state == RDS_CONN_UP);
+ if (conn_state != RDS_CONN_CONNECTING && conn_state != RDS_CONN_ERROR)
goto rst_nsk;
if (rs_tcp->t_sock) {
/* Need to resolve a duelling SYN between peers.
diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c
index 89d09b481f47..dcf4742083ea 100644
--- a/net/rds/tcp_send.c
+++ b/net/rds/tcp_send.c
@@ -100,6 +100,9 @@ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
set_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags);
tc->t_last_expected_una = rm->m_ack_seq + 1;
+ if (test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags))
+ rm->m_inc.i_hdr.h_flags |= RDS_FLAG_RETRANSMITTED;
+
rdsdebug("rm %p tcp nxt %u ack_seq %llu\n",
rm, rds_tcp_snd_nxt(tc),
(unsigned long long)rm->m_ack_seq);