summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--net/mptcp/protocol.c123
-rw-r--r--net/mptcp/protocol.h7
-rw-r--r--net/mptcp/subflow.c5
-rwxr-xr-xtools/testing/selftests/net/mptcp/mptcp_connect.sh52
4 files changed, 166 insertions, 21 deletions
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 28ec26d97f96..fa137a9c42d1 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -179,13 +179,6 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
return false;
}
- if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
- int rcvbuf = max(ssk->sk_rcvbuf, sk->sk_rcvbuf);
-
- if (rcvbuf > sk->sk_rcvbuf)
- sk->sk_rcvbuf = rcvbuf;
- }
-
tp = tcp_sk(ssk);
do {
u32 map_remaining, offset;
@@ -916,6 +909,100 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
return copied;
}
+/* receive buffer autotuning. See tcp_rcv_space_adjust for more information.
+ *
+ * Only difference: Use highest rtt estimate of the subflows in use.
+ */
+static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
+{
+ struct mptcp_subflow_context *subflow;
+ struct sock *sk = (struct sock *)msk;
+ u32 time, advmss = 1;
+ u64 rtt_us, mstamp;
+
+ sock_owned_by_me(sk);
+
+ if (copied <= 0)
+ return;
+
+ msk->rcvq_space.copied += copied;
+
+ mstamp = div_u64(tcp_clock_ns(), NSEC_PER_USEC);
+ time = tcp_stamp_us_delta(mstamp, msk->rcvq_space.time);
+
+ rtt_us = msk->rcvq_space.rtt_us;
+ if (rtt_us && time < (rtt_us >> 3))
+ return;
+
+ rtt_us = 0;
+ mptcp_for_each_subflow(msk, subflow) {
+ const struct tcp_sock *tp;
+ u64 sf_rtt_us;
+ u32 sf_advmss;
+
+ tp = tcp_sk(mptcp_subflow_tcp_sock(subflow));
+
+ sf_rtt_us = READ_ONCE(tp->rcv_rtt_est.rtt_us);
+ sf_advmss = READ_ONCE(tp->advmss);
+
+ rtt_us = max(sf_rtt_us, rtt_us);
+ advmss = max(sf_advmss, advmss);
+ }
+
+ msk->rcvq_space.rtt_us = rtt_us;
+ if (time < (rtt_us >> 3) || rtt_us == 0)
+ return;
+
+ if (msk->rcvq_space.copied <= msk->rcvq_space.space)
+ goto new_measure;
+
+ if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf &&
+ !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
+ int rcvmem, rcvbuf;
+ u64 rcvwin, grow;
+
+ rcvwin = ((u64)msk->rcvq_space.copied << 1) + 16 * advmss;
+
+ grow = rcvwin * (msk->rcvq_space.copied - msk->rcvq_space.space);
+
+ do_div(grow, msk->rcvq_space.space);
+ rcvwin += (grow << 1);
+
+ rcvmem = SKB_TRUESIZE(advmss + MAX_TCP_HEADER);
+ while (tcp_win_from_space(sk, rcvmem) < advmss)
+ rcvmem += 128;
+
+ do_div(rcvwin, advmss);
+ rcvbuf = min_t(u64, rcvwin * rcvmem,
+ sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
+
+ if (rcvbuf > sk->sk_rcvbuf) {
+ u32 window_clamp;
+
+ window_clamp = tcp_win_from_space(sk, rcvbuf);
+ WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
+
+ /* Make subflows follow along. If we do not do this, we
+ * get drops at subflow level if skbs can't be moved to
+ * the mptcp rx queue fast enough (announced rcv_win can
+ * exceed ssk->sk_rcvbuf).
+ */
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk;
+
+ ssk = mptcp_subflow_tcp_sock(subflow);
+ WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf);
+ tcp_sk(ssk)->window_clamp = window_clamp;
+ }
+ }
+ }
+
+ msk->rcvq_space.space = msk->rcvq_space.copied;
+new_measure:
+ msk->rcvq_space.copied = 0;
+ msk->rcvq_space.time = mstamp;
+}
+
static bool __mptcp_move_skbs(struct mptcp_sock *msk)
{
unsigned int moved = 0;
@@ -1028,6 +1115,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
set_bit(MPTCP_DATA_READY, &msk->flags);
}
out_err:
+ mptcp_rcv_space_adjust(msk, copied);
+
release_sock(sk);
return copied;
}
@@ -1241,6 +1330,7 @@ static int mptcp_init_sock(struct sock *sk)
return ret;
sk_sockets_allocated_inc(sk);
+ sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1];
sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[2];
return 0;
@@ -1423,6 +1513,22 @@ struct sock *mptcp_sk_clone(const struct sock *sk,
return nsk;
}
+void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk)
+{
+ const struct tcp_sock *tp = tcp_sk(ssk);
+
+ msk->rcvq_space.copied = 0;
+ msk->rcvq_space.rtt_us = 0;
+
+ msk->rcvq_space.time = tp->tcp_mstamp;
+
+ /* initial rcv_space offering made to peer */
+ msk->rcvq_space.space = min_t(u32, tp->rcv_wnd,
+ TCP_INIT_CWND * tp->advmss);
+ if (msk->rcvq_space.space == 0)
+ msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT;
+}
+
static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
bool kern)
{
@@ -1471,6 +1577,7 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
list_add(&subflow->node, &msk->conn_list);
inet_sk_state_store(newsk, TCP_ESTABLISHED);
+ mptcp_rcv_space_init(msk, ssk);
bh_unlock_sock(new_mptcp_sock);
__MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVEACK);
@@ -1631,6 +1738,8 @@ void mptcp_finish_connect(struct sock *ssk)
atomic64_set(&msk->snd_una, msk->write_seq);
mptcp_pm_new_connection(msk, 0);
+
+ mptcp_rcv_space_init(msk, ssk);
}
static void mptcp_sock_graft(struct sock *sk, struct socket *parent)
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 1d05d9841b5c..a6412ff0fddb 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -209,6 +209,12 @@ struct mptcp_sock {
struct socket *subflow; /* outgoing connect/listener/!mp_capable */
struct sock *first;
struct mptcp_pm_data pm;
+ struct {
+ u32 space; /* bytes copied in last measurement window */
+ u32 copied; /* bytes copied in this measurement window */
+ u64 time; /* start time of measurement window */
+ u64 rtt_us; /* last maximum rtt of subflows */
+ } rcvq_space;
};
#define mptcp_for_each_subflow(__msk, __subflow) \
@@ -369,6 +375,7 @@ void mptcp_get_options(const struct sk_buff *skb,
struct mptcp_options_received *mp_opt);
void mptcp_finish_connect(struct sock *sk);
+void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk);
void mptcp_data_ready(struct sock *sk, struct sock *ssk);
bool mptcp_finish_join(struct sock *sk);
void mptcp_data_acked(struct sock *sk);
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 664aa9158363..e1e19c76e267 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -225,8 +225,10 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
pr_fallback(mptcp_sk(subflow->conn));
}
- if (mptcp_check_fallback(sk))
+ if (mptcp_check_fallback(sk)) {
+ mptcp_rcv_space_init(mptcp_sk(parent), sk);
return;
+ }
if (subflow->mp_capable) {
pr_debug("subflow=%p, remote_key=%llu", mptcp_subflow_ctx(sk),
@@ -1118,6 +1120,7 @@ static void subflow_state_change(struct sock *sk)
if (subflow_simultaneous_connect(sk)) {
mptcp_do_fallback(sk);
+ mptcp_rcv_space_init(mptcp_sk(parent), sk);
pr_fallback(mptcp_sk(parent));
subflow->conn_finished = 1;
if (inet_sk_state_load(parent) == TCP_SYN_SENT) {
diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh
index acf02e156d20..8f7145c413b9 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh
@@ -3,7 +3,7 @@
time_start=$(date +%s)
-optstring="S:R:d:e:l:r:h4cm:"
+optstring="S:R:d:e:l:r:h4cm:f:t"
ret=0
sin=""
sout=""
@@ -21,6 +21,8 @@ testmode=""
sndbuf=0
rcvbuf=0
options_log=true
+do_tcp=0
+filesize=0
if [ $tc_loss -eq 100 ];then
tc_loss=1%
@@ -40,9 +42,11 @@ usage() {
echo -e "\t-e: ethtool features to disable, e.g.: \"-e tso -e gso\" (default: randomly disable any of tso/gso/gro)"
echo -e "\t-4: IPv4 only: disable IPv6 tests (default: test both IPv4 and IPv6)"
echo -e "\t-c: capture packets for each test using tcpdump (default: no capture)"
+ echo -e "\t-f: size of file to transfer in bytes (default random)"
echo -e "\t-S: set sndbuf value (default: use kernel default)"
echo -e "\t-R: set rcvbuf value (default: use kernel default)"
echo -e "\t-m: test mode (poll, sendfile; default: poll)"
+ echo -e "\t-t: also run tests with TCP (use twice to non-fallback tcp)"
}
while getopts "$optstring" option;do
@@ -94,6 +98,12 @@ while getopts "$optstring" option;do
"m")
testmode="$OPTARG"
;;
+ "f")
+ filesize="$OPTARG"
+ ;;
+ "t")
+ do_tcp=$((do_tcp+1))
+ ;;
"?")
usage $0
exit 1
@@ -449,20 +459,25 @@ make_file()
{
local name=$1
local who=$2
+ local SIZE=$filesize
+ local ksize
+ local rem
- local SIZE TSIZE
- SIZE=$((RANDOM % (1024 * 8)))
- TSIZE=$((SIZE * 1024))
+ if [ $SIZE -eq 0 ]; then
+ local MAXSIZE=$((1024 * 1024 * 8))
+ local MINSIZE=$((1024 * 256))
- dd if=/dev/urandom of="$name" bs=1024 count=$SIZE 2> /dev/null
+ SIZE=$(((RANDOM * RANDOM + MINSIZE) % MAXSIZE))
+ fi
- SIZE=$((RANDOM % 1024))
- SIZE=$((SIZE + 128))
- TSIZE=$((TSIZE + SIZE))
- dd if=/dev/urandom conv=notrunc of="$name" bs=1 count=$SIZE 2> /dev/null
+ ksize=$((SIZE / 1024))
+ rem=$((SIZE - (ksize * 1024)))
+
+ dd if=/dev/urandom of="$name" bs=1024 count=$ksize 2> /dev/null
+ dd if=/dev/urandom conv=notrunc of="$name" bs=1 count=$rem 2> /dev/null
echo -e "\nMPTCP_TEST_FILE_END_MARKER" >> "$name"
- echo "Created $name (size $TSIZE) containing data sent by $who"
+ echo "Created $name (size $(du -b "$name")) containing data sent by $who"
}
run_tests_lo()
@@ -497,9 +512,11 @@ run_tests_lo()
return 1
fi
- # don't bother testing fallback tcp except for loopback case.
- if [ ${listener_ns} != ${connector_ns} ]; then
- return 0
+ if [ $do_tcp -eq 0 ]; then
+ # don't bother testing fallback tcp except for loopback case.
+ if [ ${listener_ns} != ${connector_ns} ]; then
+ return 0
+ fi
fi
do_transfer ${listener_ns} ${connector_ns} MPTCP TCP ${connect_addr} ${local_addr}
@@ -516,6 +533,15 @@ run_tests_lo()
return 1
fi
+ if [ $do_tcp -gt 1 ] ;then
+ do_transfer ${listener_ns} ${connector_ns} TCP TCP ${connect_addr} ${local_addr}
+ lret=$?
+ if [ $lret -ne 0 ]; then
+ ret=$lret
+ return 1
+ fi
+ fi
+
return 0
}