summaryrefslogtreecommitdiff
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/devinet.c16
-rw-r--r--net/ipv4/fib_frontend.c4
-rw-r--r--net/ipv4/ip_gre.c2
-rw-r--r--net/ipv4/ip_output.c2
-rw-r--r--net/ipv4/ip_sockglue.c6
-rw-r--r--net/ipv4/ip_tunnel.c7
-rw-r--r--net/ipv4/tcp.c55
-rw-r--r--net/ipv4/tcp_input.c46
-rw-r--r--net/ipv4/tcp_ipv4.c6
-rw-r--r--net/ipv4/tcp_output.c2
-rw-r--r--net/ipv4/udp.c2
-rw-r--r--net/ipv4/udp_tunnel_core.c1
12 files changed, 84 insertions, 65 deletions
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 92b778e423df..e8b9a9202fec 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -2682,23 +2682,27 @@ static __net_init int devinet_init_net(struct net *net)
#endif
if (!net_eq(net, &init_net)) {
- if (IS_ENABLED(CONFIG_SYSCTL) &&
- sysctl_devconf_inherit_init_net == 3) {
+ switch (net_inherit_devconf()) {
+ case 3:
/* copy from the current netns */
memcpy(all, current->nsproxy->net_ns->ipv4.devconf_all,
sizeof(ipv4_devconf));
memcpy(dflt,
current->nsproxy->net_ns->ipv4.devconf_dflt,
sizeof(ipv4_devconf_dflt));
- } else if (!IS_ENABLED(CONFIG_SYSCTL) ||
- sysctl_devconf_inherit_init_net != 2) {
- /* inherit == 0 or 1: copy from init_net */
+ break;
+ case 0:
+ case 1:
+ /* copy from init_net */
memcpy(all, init_net.ipv4.devconf_all,
sizeof(ipv4_devconf));
memcpy(dflt, init_net.ipv4.devconf_dflt,
sizeof(ipv4_devconf_dflt));
+ break;
+ case 2:
+ /* use compiled values */
+ break;
}
- /* else inherit == 2: use compiled values */
}
#ifdef CONFIG_SYSCTL
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index f361d3d56be2..943edf4ad4db 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -389,7 +389,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
dev_match = dev_match || (res.type == RTN_LOCAL &&
dev == net->loopback_dev);
if (dev_match) {
- ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST;
+ ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_LINK;
return ret;
}
if (no_addr)
@@ -401,7 +401,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
ret = 0;
if (fib_lookup(net, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE) == 0) {
if (res.type == RTN_UNICAST)
- ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST;
+ ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_LINK;
}
return ret;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 5c58e21f724e..f866d6282b2b 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -609,7 +609,7 @@ static int gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
ip_tunnel_init_flow(&fl4, IPPROTO_GRE, key->u.ipv4.dst, key->u.ipv4.src,
tunnel_id_to_key32(key->tun_id),
key->tos & ~INET_ECN_MASK, dev_net(dev), 0,
- skb->mark, skb_get_hash(skb));
+ skb->mark, skb_get_hash(skb), key->flow_flags);
rt = ip_route_output_key(dev_net(dev), &fl4);
if (IS_ERR(rt))
return PTR_ERR(rt);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index d7bd1daf022b..04e2034f2f8e 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1730,7 +1730,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
sk->sk_protocol = ip_hdr(skb)->protocol;
sk->sk_bound_dev_if = arg->bound_dev_if;
- sk->sk_sndbuf = sysctl_wmem_default;
+ sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default);
ipc.sockc.mark = fl4.flowi4_mark;
err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base,
len, 0, &ipc, &rt, MSG_DONTWAIT);
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index a8a323ecbb54..e49a61a053a6 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -772,7 +772,7 @@ static int ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval, int optlen)
if (optlen < GROUP_FILTER_SIZE(0))
return -EINVAL;
- if (optlen > sysctl_optmem_max)
+ if (optlen > READ_ONCE(sysctl_optmem_max))
return -ENOBUFS;
gsf = memdup_sockptr(optval, optlen);
@@ -808,7 +808,7 @@ static int compat_ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval,
if (optlen < size0)
return -EINVAL;
- if (optlen > sysctl_optmem_max - 4)
+ if (optlen > READ_ONCE(sysctl_optmem_max) - 4)
return -ENOBUFS;
p = kmalloc(optlen + 4, GFP_KERNEL);
@@ -1233,7 +1233,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, int optname,
if (optlen < IP_MSFILTER_SIZE(0))
goto e_inval;
- if (optlen > sysctl_optmem_max) {
+ if (optlen > READ_ONCE(sysctl_optmem_max)) {
err = -ENOBUFS;
break;
}
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index e65e948cab9f..019f3b0839c5 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -295,7 +295,7 @@ static int ip_tunnel_bind_dev(struct net_device *dev)
ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
iph->saddr, tunnel->parms.o_key,
RT_TOS(iph->tos), dev_net(dev),
- tunnel->parms.link, tunnel->fwmark, 0);
+ tunnel->parms.link, tunnel->fwmark, 0, 0);
rt = ip_route_output_key(tunnel->net, &fl4);
if (!IS_ERR(rt)) {
@@ -570,7 +570,8 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
}
ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
tunnel_id_to_key32(key->tun_id), RT_TOS(tos),
- dev_net(dev), 0, skb->mark, skb_get_hash(skb));
+ dev_net(dev), 0, skb->mark, skb_get_hash(skb),
+ key->flow_flags);
if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
goto tx_error;
@@ -729,7 +730,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
tunnel->parms.o_key, RT_TOS(tos),
dev_net(dev), tunnel->parms.link,
- tunnel->fwmark, skb_get_hash(skb));
+ tunnel->fwmark, skb_get_hash(skb), 0);
if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
goto tx_error;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 970e9a2cca4a..6cdfce6f2867 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1000,7 +1000,7 @@ new_segment:
i = skb_shinfo(skb)->nr_frags;
can_coalesce = skb_can_coalesce(skb, i, page, offset);
- if (!can_coalesce && i >= sysctl_max_skb_frags) {
+ if (!can_coalesce && i >= READ_ONCE(sysctl_max_skb_frags)) {
tcp_mark_push(tp, skb);
goto new_segment;
}
@@ -1015,7 +1015,7 @@ new_segment:
skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
} else {
get_page(page);
- skb_fill_page_desc(skb, i, page, offset, copy);
+ skb_fill_page_desc_noacc(skb, i, page, offset, copy);
}
if (!(flags & MSG_NO_SHARED_FRAGS))
@@ -1354,7 +1354,7 @@ new_segment:
if (!skb_can_coalesce(skb, i, pfrag->page,
pfrag->offset)) {
- if (i >= sysctl_max_skb_frags) {
+ if (i >= READ_ONCE(sysctl_max_skb_frags)) {
tcp_mark_push(tp, skb);
goto new_segment;
}
@@ -1567,17 +1567,11 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
* calculation of whether or not we must ACK for the sake of
* a window update.
*/
-void tcp_cleanup_rbuf(struct sock *sk, int copied)
+static void __tcp_cleanup_rbuf(struct sock *sk, int copied)
{
struct tcp_sock *tp = tcp_sk(sk);
bool time_to_ack = false;
- struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
-
- WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
- "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
- tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
-
if (inet_csk_ack_scheduled(sk)) {
const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1623,6 +1617,17 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
tcp_send_ack(sk);
}
+void tcp_cleanup_rbuf(struct sock *sk, int copied)
+{
+ struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
+ "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
+ tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
+ __tcp_cleanup_rbuf(sk, copied);
+}
+
static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb)
{
__skb_unlink(skb, &sk->sk_receive_queue);
@@ -1756,34 +1761,26 @@ int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
if (sk->sk_state == TCP_LISTEN)
return -ENOTCONN;
- while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
- int used;
-
- __skb_unlink(skb, &sk->sk_receive_queue);
- used = recv_actor(sk, skb);
- if (used <= 0) {
- if (!copied)
- copied = used;
- break;
- }
- seq += used;
- copied += used;
+ skb = tcp_recv_skb(sk, seq, &offset);
+ if (!skb)
+ return 0;
- if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
- consume_skb(skb);
+ __skb_unlink(skb, &sk->sk_receive_queue);
+ WARN_ON(!skb_set_owner_sk_safe(skb, sk));
+ copied = recv_actor(sk, skb);
+ if (copied >= 0) {
+ seq += copied;
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
++seq;
- break;
- }
- consume_skb(skb);
- break;
}
+ consume_skb(skb);
WRITE_ONCE(tp->copied_seq, seq);
tcp_rcv_space_adjust(sk);
/* Clean up data we have read: This will do ACK frames. */
if (copied > 0)
- tcp_cleanup_rbuf(sk, copied);
+ __tcp_cleanup_rbuf(sk, copied);
return copied;
}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index ab5f0ea166f1..bc2ea12221f9 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2513,6 +2513,21 @@ static inline bool tcp_may_undo(const struct tcp_sock *tp)
return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp));
}
+static bool tcp_is_non_sack_preventing_reopen(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) {
+ /* Hold old state until something *above* high_seq
+ * is ACKed. For Reno it is MUST to prevent false
+ * fast retransmits (RFC2582). SACK TCP is safe. */
+ if (!tcp_any_retrans_done(sk))
+ tp->retrans_stamp = 0;
+ return true;
+ }
+ return false;
+}
+
/* People celebrate: "We love our President!" */
static bool tcp_try_undo_recovery(struct sock *sk)
{
@@ -2535,14 +2550,8 @@ static bool tcp_try_undo_recovery(struct sock *sk)
} else if (tp->rack.reo_wnd_persist) {
tp->rack.reo_wnd_persist--;
}
- if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) {
- /* Hold old state until something *above* high_seq
- * is ACKed. For Reno it is MUST to prevent false
- * fast retransmits (RFC2582). SACK TCP is safe. */
- if (!tcp_any_retrans_done(sk))
- tp->retrans_stamp = 0;
+ if (tcp_is_non_sack_preventing_reopen(sk))
return true;
- }
tcp_set_ca_state(sk, TCP_CA_Open);
tp->is_sack_reneg = 0;
return false;
@@ -2578,6 +2587,8 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPSPURIOUSRTOS);
inet_csk(sk)->icsk_retransmits = 0;
+ if (tcp_is_non_sack_preventing_reopen(sk))
+ return true;
if (frto_undo || tcp_is_sack(tp)) {
tcp_set_ca_state(sk, TCP_CA_Open);
tp->is_sack_reneg = 0;
@@ -3614,12 +3625,9 @@ bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb,
/* RFC 5961 7 [ACK Throttling] */
static void tcp_send_challenge_ack(struct sock *sk)
{
- /* unprotected vars, we dont care of overwrites */
- static u32 challenge_timestamp;
- static unsigned int challenge_count;
struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk);
- u32 count, now;
+ u32 count, now, ack_limit;
/* First check our per-socket dupack rate limit. */
if (__tcp_oow_rate_limited(net,
@@ -3627,18 +3635,22 @@ static void tcp_send_challenge_ack(struct sock *sk)
&tp->last_oow_ack_time))
return;
+ ack_limit = READ_ONCE(net->ipv4.sysctl_tcp_challenge_ack_limit);
+ if (ack_limit == INT_MAX)
+ goto send_ack;
+
/* Then check host-wide RFC 5961 rate limit. */
now = jiffies / HZ;
- if (now != challenge_timestamp) {
- u32 ack_limit = READ_ONCE(net->ipv4.sysctl_tcp_challenge_ack_limit);
+ if (now != READ_ONCE(net->ipv4.tcp_challenge_timestamp)) {
u32 half = (ack_limit + 1) >> 1;
- challenge_timestamp = now;
- WRITE_ONCE(challenge_count, half + prandom_u32_max(ack_limit));
+ WRITE_ONCE(net->ipv4.tcp_challenge_timestamp, now);
+ WRITE_ONCE(net->ipv4.tcp_challenge_count, half + prandom_u32_max(ack_limit));
}
- count = READ_ONCE(challenge_count);
+ count = READ_ONCE(net->ipv4.tcp_challenge_count);
if (count > 0) {
- WRITE_ONCE(challenge_count, count - 1);
+ WRITE_ONCE(net->ipv4.tcp_challenge_count, count - 1);
+send_ack:
NET_INC_STATS(net, LINUX_MIB_TCPCHALLENGEACK);
tcp_send_ack(sk);
}
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 0c83780dc9bf..5b019ba2b9d2 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -3139,8 +3139,10 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.sysctl_tcp_tso_win_divisor = 3;
/* Default TSQ limit of 16 TSO segments */
net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
- /* rfc5961 challenge ack rate limiting */
- net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
+
+ /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
+ net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
+
net->ipv4.sysctl_tcp_min_tso_segs = 2;
net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */
net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 78b654ff421b..290019de766d 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -239,7 +239,7 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
if (wscale_ok) {
/* Set window scaling on max possible window */
space = max_t(u32, space, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
- space = max_t(u32, space, sysctl_rmem_max);
+ space = max_t(u32, space, READ_ONCE(sysctl_rmem_max));
space = min_t(u32, space, *window_clamp);
*rcv_wscale = clamp_t(int, ilog2(space) - 15,
0, TCP_MAX_WSCALE);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 34eda973bbf1..cd72158e953a 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -783,6 +783,8 @@ int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
*/
if (tunnel) {
/* ...not for tunnels though: we don't have a sending socket */
+ if (udp_sk(sk)->encap_err_rcv)
+ udp_sk(sk)->encap_err_rcv(sk, skb, iph->ihl << 2);
goto out;
}
if (!inet->recverr) {
diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c
index 8efaf8c3fe2a..8242c8947340 100644
--- a/net/ipv4/udp_tunnel_core.c
+++ b/net/ipv4/udp_tunnel_core.c
@@ -72,6 +72,7 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock,
udp_sk(sk)->encap_type = cfg->encap_type;
udp_sk(sk)->encap_rcv = cfg->encap_rcv;
+ udp_sk(sk)->encap_err_rcv = cfg->encap_err_rcv;
udp_sk(sk)->encap_err_lookup = cfg->encap_err_lookup;
udp_sk(sk)->encap_destroy = cfg->encap_destroy;
udp_sk(sk)->gro_receive = cfg->gro_receive;