Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/icmp.c          6
-rw-r--r--  net/ipv4/ip_fragment.c   6
-rw-r--r--  net/ipv4/ipmr.c          6
-rw-r--r--  net/ipv4/ping.c         14
-rw-r--r--  net/ipv4/route.c         8
-rw-r--r--  net/ipv4/tcp.c           9
-rw-r--r--  net/ipv4/tcp_input.c    15
-rw-r--r--  net/ipv4/tcp_metrics.c   6
8 files changed, 45 insertions(+), 25 deletions(-)
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index c48c572f024d..1be0d91620a3 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -318,17 +318,17 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
                 return true;
 
         /* No rate limit on loopback */
-        dev = dst_dev(dst);
+        rcu_read_lock();
+        dev = dst_dev_rcu(dst);
         if (dev && (dev->flags & IFF_LOOPBACK))
                 goto out;
 
-        rcu_read_lock();
         peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr,
                                l3mdev_master_ifindex_rcu(dev));
         rc = inet_peer_xrlim_allow(peer,
                                    READ_ONCE(net->ipv4.sysctl_icmp_ratelimit));
-        rcu_read_unlock();
 out:
+        rcu_read_unlock();
         if (!rc)
                 __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITHOST);
         else
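
The point of this hunk: the device behind a dst may only be dereferenced inside an RCU read-side critical section, so the critical section is widened to cover the loopback test, and both exit paths now funnel through a single unlock. A minimal sketch of the resulting shape, using the same names as the hunk (the elided body stands for the rate-limit work):

        rcu_read_lock();
        dev = dst_dev_rcu(dst);         /* pointer valid only until unlock */
        if (dev && (dev->flags & IFF_LOOPBACK))
                goto out;               /* early exit stays under the lock */

        /* ... work that also needs RCU protection ... */
out:
        rcu_read_unlock();              /* one unlock serves both paths */
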
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index b2584cce90ae..f7012479713b 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -476,14 +476,16 @@ out_fail:
 /* Process an incoming IP datagram fragment. */
 int ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
 {
-        struct net_device *dev = skb->dev ? : skb_dst_dev(skb);
-        int vif = l3mdev_master_ifindex_rcu(dev);
+        struct net_device *dev;
         struct ipq *qp;
+        int vif;
 
         __IP_INC_STATS(net, IPSTATS_MIB_REASMREQDS);
 
         /* Lookup (or create) queue header */
         rcu_read_lock();
+        dev = skb->dev ? : skb_dst_dev_rcu(skb);
+        vif = l3mdev_master_ifindex_rcu(dev);
         qp = ip_find(net, ip_hdr(skb), user, vif);
         if (qp) {
                 int ret, refs = 0;
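
The bug pattern here hides in an initializer: the device was fetched at declaration time, before rcu_read_lock() was taken a few lines later. Splitting the declaration from the assignment moves the dereference inside the critical section. A reduced before/after sketch of just that pattern:

        /* Before: the fetch runs outside any RCU read-side section. */
        struct net_device *dev = skb->dev ? : skb_dst_dev(skb);

        rcu_read_lock();
        /* ... */

        /* After: the fetch happens under the lock. */
        struct net_device *dev;

        rcu_read_lock();
        dev = skb->dev ? : skb_dst_dev_rcu(skb);
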
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index e86a8a862c41..8c568fbddb5f 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1904,7 +1904,7 @@ static int ipmr_prepare_xmit(struct net *net, struct mr_table *mrt,
return -1;
}
- encap += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
+ encap += LL_RESERVED_SPACE(dst_dev_rcu(&rt->dst)) + rt->dst.header_len;
if (skb_cow(skb, encap)) {
ip_rt_put(rt);
@@ -1957,7 +1957,7 @@ static void ipmr_queue_fwd_xmit(struct net *net, struct mr_table *mrt,
* result in receiving multiple packets.
*/
NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD,
- net, NULL, skb, skb->dev, rt->dst.dev,
+ net, NULL, skb, skb->dev, dst_dev_rcu(&rt->dst),
ipmr_forward_finish);
return;
@@ -2301,7 +2301,7 @@ int ip_mr_output(struct net *net, struct sock *sk, struct sk_buff *skb)
guard(rcu)();
- dev = rt->dst.dev;
+ dev = dst_dev_rcu(&rt->dst);
if (IPCB(skb)->flags & IPSKB_FORWARDED)
goto mc_output;
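
The third hunk can call dst_dev_rcu() without an explicit lock because ip_mr_output() takes the scoped guard from <linux/cleanup.h>: guard(rcu)() acquires rcu_read_lock() immediately and releases it automatically at every scope exit. An illustrative function (hypothetical, not taken from this diff) showing the idiom:

        #include <linux/cleanup.h>
        #include <net/route.h>

        static int example_ifindex(struct rtable *rt)
        {
                guard(rcu)();   /* rcu_read_lock() now, auto-unlock on return */

                struct net_device *dev = dst_dev_rcu(&rt->dst);

                if (!dev)
                        return -ENODEV; /* no explicit rcu_read_unlock() needed */
                return dev->ifindex;
        }
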
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 031df4c19fcc..d2c3480df8f7 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -77,6 +77,7 @@ static inline struct hlist_head *ping_hashslot(struct ping_table *table,
 
 int ping_get_port(struct sock *sk, unsigned short ident)
 {
+        struct net *net = sock_net(sk);
         struct inet_sock *isk, *isk2;
         struct hlist_head *hlist;
         struct sock *sk2 = NULL;
@@ -90,9 +91,10 @@ int ping_get_port(struct sock *sk, unsigned short ident)
                 for (i = 0; i < (1L << 16); i++, result++) {
                         if (!result)
                                 result++; /* avoid zero */
-                        hlist = ping_hashslot(&ping_table, sock_net(sk),
-                                              result);
+                        hlist = ping_hashslot(&ping_table, net, result);
                         sk_for_each(sk2, hlist) {
+                                if (!net_eq(sock_net(sk2), net))
+                                        continue;
                                 isk2 = inet_sk(sk2);
 
                                 if (isk2->inet_num == result)
@@ -108,8 +110,10 @@ next_port:
                 if (i >= (1L << 16))
                         goto fail;
         } else {
-                hlist = ping_hashslot(&ping_table, sock_net(sk), ident);
+                hlist = ping_hashslot(&ping_table, net, ident);
                 sk_for_each(sk2, hlist) {
+                        if (!net_eq(sock_net(sk2), net))
+                                continue;
                         isk2 = inet_sk(sk2);
 
                         /* BUG? Why is this reuse and not reuseaddr? ping.c
@@ -129,7 +133,7 @@ next_port:
                 pr_debug("was not hashed\n");
                 sk_add_node_rcu(sk, hlist);
                 sock_set_flag(sk, SOCK_RCU_FREE);
-                sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+                sock_prot_inuse_add(net, sk->sk_prot, 1);
         }
         spin_unlock(&ping_table.lock);
         return 0;
@@ -188,6 +192,8 @@ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident)
         }
 
         sk_for_each_rcu(sk, hslot) {
+                if (!net_eq(sock_net(sk), net))
+                        continue;
                 isk = inet_sk(sk);
 
                 pr_debug("iterate\n");
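
ping_table is one global hash shared by every network namespace, so even when the slot choice is salted with the namespace, hash collisions can still land sockets from different namespaces on the same chain; each walk therefore has to filter with net_eq(). Hoisting sock_net(sk) into a local also avoids recomputing it on every iteration. The distilled pattern these hunks apply:

        struct net *net = sock_net(sk);

        sk_for_each(sk2, hlist) {
                if (!net_eq(sock_net(sk2), net))
                        continue;       /* foreign namespace: not a conflict */
                /* ... only now compare idents, addresses, ... */
        }
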
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index baa43e5966b1..5582ccd673ee 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -413,11 +413,11 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                            const void *daddr)
 {
         const struct rtable *rt = container_of(dst, struct rtable, dst);
-        struct net_device *dev = dst_dev(dst);
+        struct net_device *dev;
         struct neighbour *n;
 
         rcu_read_lock();
-
+        dev = dst_dev_rcu(dst);
         if (likely(rt->rt_gw_family == AF_INET)) {
                 n = ip_neigh_gw4(dev, rt->rt_gw4);
         } else if (rt->rt_gw_family == AF_INET6) {
@@ -1026,7 +1026,7 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
                 return;
 
         rcu_read_lock();
-        net = dev_net_rcu(dst_dev(dst));
+        net = dst_dev_net_rcu(dst);
         if (mtu < net->ipv4.ip_rt_min_pmtu) {
                 lock = true;
                 mtu = min(old_mtu, net->ipv4.ip_rt_min_pmtu);
@@ -1326,7 +1326,7 @@ static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
         struct net *net;
 
         rcu_read_lock();
-        net = dev_net_rcu(dst_dev(dst));
+        net = dst_dev_net_rcu(dst);
         advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
                        net->ipv4.ip_rt_min_advmss);
         rcu_read_unlock();
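
dst_dev_net_rcu() replaces the two-step dev_net_rcu(dst_dev(dst)), and its name suggests it folds both RCU-safe hops, dst to device and device to netns, into one helper. A plausible definition, stated as an assumption rather than read from this diff:

        /* Assumed shape of the helper used above (illustrative only). */
        static inline struct net *dst_dev_net_rcu(const struct dst_entry *dst)
        {
                return dev_net_rcu(dst_dev_rcu(dst));
        }
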
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index ad76556800f2..89040007c7b7 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3099,8 +3099,8 @@ bool tcp_check_oom(const struct sock *sk, int shift)
 
 void __tcp_close(struct sock *sk, long timeout)
 {
+        bool data_was_unread = false;
         struct sk_buff *skb;
-        int data_was_unread = 0;
         int state;
 
         WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
@@ -3119,11 +3119,12 @@ void __tcp_close(struct sock *sk, long timeout)
          *  reader process may not have drained the data yet!
          */
         while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
-                u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq;
+                u32 end_seq = TCP_SKB_CB(skb)->end_seq;
 
                 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
-                        len--;
-                data_was_unread += len;
+                        end_seq--;
+                if (after(end_seq, tcp_sk(sk)->copied_seq))
+                        data_was_unread = true;
                 __kfree_skb(skb);
         }
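
Judging by this hunk, data_was_unread is ultimately consumed as a yes/no, which is what lets the byte counter become a bool. The interesting change is the test itself: an skb can sit on the receive queue with part of its payload already consumed, so summing queued bytes overstates what the reader missed; comparing the skb's final sequence number (FIN excluded) against copied_seq with the wrap-safe after() macro asks the right question. A small worked example of why after() rather than plain ">" (values chosen to straddle the 2^32 wrap; not from the diff):

        u32 copied_seq = 0xfffffff0u;   /* reader consumed up to here */
        u32 end_seq    = 0x00000008u;   /* 24 bytes later, past the wrap */

        /* Plain comparison gets the wrap wrong:                        */
        /*   end_seq > copied_seq              -> false                 */
        /* after() uses the signed 32-bit difference instead:           */
        /*   (s32)(end_seq - copied_seq) == 24 -> true                  */
        if (after(end_seq, copied_seq))
                /* correctly reports unread data */;
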
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 71b76e98371a..64f93668a845 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4890,12 +4890,23 @@ static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb);
 
 /* Check if this incoming skb can be added to socket receive queues
  * while satisfying sk->sk_rcvbuf limit.
+ *
+ * In theory we should use skb->truesize, but this can cause problems
+ * when applications use too small SO_RCVBUF values.
+ * When LRO / hw gro is used, the socket might have a high tp->scaling_ratio,
+ * allowing RWIN to be close to available space.
+ * Whenever the receive queue gets full, we can receive a small packet
+ * filling RWIN, but with a high skb->truesize, because most NIC use 4K page
+ * plus sk_buff metadata even when receiving less than 1500 bytes of payload.
+ *
+ * Note that we use skb->len to decide to accept or drop this packet,
+ * but sk->sk_rmem_alloc is the sum of all skb->truesize.
  */
 static bool tcp_can_ingest(const struct sock *sk, const struct sk_buff *skb)
 {
-        unsigned int new_mem = atomic_read(&sk->sk_rmem_alloc) + skb->truesize;
+        unsigned int rmem = atomic_read(&sk->sk_rmem_alloc);
 
-        return new_mem <= sk->sk_rcvbuf;
+        return rmem + skb->len <= sk->sk_rcvbuf;
 }
 
 static int tcp_try_rmem_schedule(struct sock *sk, const struct sk_buff *skb,
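
Back-of-envelope numbers make the new comment concrete (figures assumed for illustration, not taken from the diff): a 1,200-byte segment received into a page-backed buffer typically carries a truesize of roughly one 4 KB page plus a few hundred bytes of sk_buff and skb_shared_info overhead, so with a 16 KB SO_RCVBUF a handful of such packets exhaust the truesize budget while only a fraction of the advertised window has been used. Charging skb->len keeps the accept/drop decision aligned with what the window invited:

        /* Illustrative accounting for one received segment:           */
        /*   payload (skb->len)                   1200 bytes           */
        /*   skb->truesize ~ 4096 (page) + ~512 (metadata) = ~4608     */
        unsigned int rmem = atomic_read(&sk->sk_rmem_alloc);

        /* Old test: rmem + ~4608 vs sk_rcvbuf -- can drop a packet    */
        /* the advertised window allowed.                              */
        /* New test: rmem + 1200 vs sk_rcvbuf -- matches the window.   */
        return rmem + skb->len <= sk->sk_rcvbuf;
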
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 03c068ea27b6..10e86f1008e9 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -170,7 +170,7 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
         struct net *net;
 
         spin_lock_bh(&tcp_metrics_lock);
-        net = dev_net_rcu(dst_dev(dst));
+        net = dst_dev_net_rcu(dst);
 
         /* While waiting for the spin-lock the cache might have been populated
          * with this entry and so we have to check again.
@@ -273,7 +273,7 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,
                 return NULL;
         }
 
-        net = dev_net_rcu(dst_dev(dst));
+        net = dst_dev_net_rcu(dst);
         hash ^= net_hash_mix(net);
         hash = hash_32(hash, tcp_metrics_hash_log);
 
@@ -318,7 +318,7 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk,
         else
                 return NULL;
 
-        net = dev_net_rcu(dst_dev(dst));
+        net = dst_dev_net_rcu(dst);
         hash ^= net_hash_mix(net);
         hash = hash_32(hash, tcp_metrics_hash_log);
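
All three hunks feed the same lookup: the netns obtained through the dst's device salts the per-destination hash before it is folded down to the table size, so metrics for the same peer address in different namespaces land in different buckets. A sketch of that bucket selection, with names following the file (the address-hash input and the chain dereference are assumptions for illustration):

        unsigned int hash = addr_hash;          /* per-destination hash (assumed) */
        struct net *net;

        net = dst_dev_net_rcu(dst);             /* netns via the dst's device */
        hash ^= net_hash_mix(net);              /* salt with namespace identity */
        hash = hash_32(hash, tcp_metrics_hash_log);     /* fold to table size */
        tm = rcu_dereference(tcp_metrics_hash[hash].chain);
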