summaryrefslogtreecommitdiff
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/af_inet.c3
-rw-r--r--net/ipv4/cipso_ipv4.c3
-rw-r--r--net/ipv4/fou.c48
-rw-r--r--net/ipv4/inet_connection_sock.c3
-rw-r--r--net/ipv4/inet_diag.c10
-rw-r--r--net/ipv4/inet_hashtables.c80
-rw-r--r--net/ipv4/ip_sockglue.c13
-rw-r--r--net/ipv4/ip_tunnel_core.c16
-rw-r--r--net/ipv4/ping.c7
-rw-r--r--net/ipv4/raw.c13
-rw-r--r--net/ipv4/tcp.c22
-rw-r--r--net/ipv4/tcp_input.c54
-rw-r--r--net/ipv4/tcp_ipv4.c82
-rw-r--r--net/ipv4/tcp_minisocks.c5
-rw-r--r--net/ipv4/udp.c354
-rw-r--r--net/ipv4/udp_diag.c18
-rw-r--r--net/ipv4/udp_offload.c113
-rw-r--r--net/ipv4/udp_tunnel.c2
18 files changed, 328 insertions, 518 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 9e481992dbae..8217cd22f921 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -948,6 +948,7 @@ const struct proto_ops inet_dgram_ops = {
.recvmsg = inet_recvmsg,
.mmap = sock_no_mmap,
.sendpage = inet_sendpage,
+ .set_peek_off = sk_set_peek_off,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_sock_common_setsockopt,
.compat_getsockopt = compat_sock_common_getsockopt,
@@ -1106,7 +1107,7 @@ static int inet_sk_reselect_saddr(struct sock *sk)
struct ip_options_rcu *inet_opt;
inet_opt = rcu_dereference_protected(inet->inet_opt,
- sock_owned_by_user(sk));
+ lockdep_sock_is_held(sk));
if (inet_opt && inet_opt->opt.srr)
daddr = inet_opt->opt.faddr;
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index bdb2a07ec363..40d6b87713a1 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -1933,7 +1933,8 @@ int cipso_v4_sock_setattr(struct sock *sk,
sk_inet = inet_sk(sk);
- old = rcu_dereference_protected(sk_inet->inet_opt, sock_owned_by_user(sk));
+ old = rcu_dereference_protected(sk_inet->inet_opt,
+ lockdep_sock_is_held(sk));
if (sk_inet->is_icsk) {
sk_conn = inet_csk(sk);
if (old)
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index a39068b4a4d9..d039f8fff57f 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -22,7 +22,6 @@ struct fou {
u8 flags;
__be16 port;
u16 type;
- struct udp_offload udp_offloads;
struct list_head list;
struct rcu_head rcu;
};
@@ -186,13 +185,13 @@ drop:
return 0;
}
-static struct sk_buff **fou_gro_receive(struct sk_buff **head,
- struct sk_buff *skb,
- struct udp_offload *uoff)
+static struct sk_buff **fou_gro_receive(struct sock *sk,
+ struct sk_buff **head,
+ struct sk_buff *skb)
{
const struct net_offload *ops;
struct sk_buff **pp = NULL;
- u8 proto = NAPI_GRO_CB(skb)->proto;
+ u8 proto = fou_from_sock(sk)->protocol;
const struct net_offload **offloads;
/* We can clear the encap_mark for FOU as we are essentially doing
@@ -220,11 +219,11 @@ out_unlock:
return pp;
}
-static int fou_gro_complete(struct sk_buff *skb, int nhoff,
- struct udp_offload *uoff)
+static int fou_gro_complete(struct sock *sk, struct sk_buff *skb,
+ int nhoff)
{
const struct net_offload *ops;
- u8 proto = NAPI_GRO_CB(skb)->proto;
+ u8 proto = fou_from_sock(sk)->protocol;
int err = -ENOSYS;
const struct net_offload **offloads;
@@ -267,9 +266,9 @@ static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off,
return guehdr;
}
-static struct sk_buff **gue_gro_receive(struct sk_buff **head,
- struct sk_buff *skb,
- struct udp_offload *uoff)
+static struct sk_buff **gue_gro_receive(struct sock *sk,
+ struct sk_buff **head,
+ struct sk_buff *skb)
{
const struct net_offload **offloads;
const struct net_offload *ops;
@@ -280,7 +279,7 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head,
void *data;
u16 doffset = 0;
int flush = 1;
- struct fou *fou = container_of(uoff, struct fou, udp_offloads);
+ struct fou *fou = fou_from_sock(sk);
struct gro_remcsum grc;
skb_gro_remcsum_init(&grc);
@@ -392,8 +391,7 @@ out:
return pp;
}
-static int gue_gro_complete(struct sk_buff *skb, int nhoff,
- struct udp_offload *uoff)
+static int gue_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff)
{
const struct net_offload **offloads;
struct guehdr *guehdr = (struct guehdr *)(skb->data + nhoff);
@@ -441,10 +439,7 @@ static int fou_add_to_port_list(struct net *net, struct fou *fou)
static void fou_release(struct fou *fou)
{
struct socket *sock = fou->sock;
- struct sock *sk = sock->sk;
- if (sk->sk_family == AF_INET)
- udp_del_offload(&fou->udp_offloads);
list_del(&fou->list);
udp_tunnel_sock_release(sock);
@@ -454,11 +449,9 @@ static void fou_release(struct fou *fou)
static int fou_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg *cfg)
{
udp_sk(sk)->encap_rcv = fou_udp_recv;
- fou->protocol = cfg->protocol;
- fou->udp_offloads.callbacks.gro_receive = fou_gro_receive;
- fou->udp_offloads.callbacks.gro_complete = fou_gro_complete;
- fou->udp_offloads.port = cfg->udp_config.local_udp_port;
- fou->udp_offloads.ipproto = cfg->protocol;
+ udp_sk(sk)->gro_receive = fou_gro_receive;
+ udp_sk(sk)->gro_complete = fou_gro_complete;
+ fou_from_sock(sk)->protocol = cfg->protocol;
return 0;
}
@@ -466,9 +459,8 @@ static int fou_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg *cfg)
static int gue_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg *cfg)
{
udp_sk(sk)->encap_rcv = gue_udp_recv;
- fou->udp_offloads.callbacks.gro_receive = gue_gro_receive;
- fou->udp_offloads.callbacks.gro_complete = gue_gro_complete;
- fou->udp_offloads.port = cfg->udp_config.local_udp_port;
+ udp_sk(sk)->gro_receive = gue_gro_receive;
+ udp_sk(sk)->gro_complete = gue_gro_complete;
return 0;
}
@@ -527,12 +519,6 @@ static int fou_create(struct net *net, struct fou_cfg *cfg,
sk->sk_allocation = GFP_ATOMIC;
- if (cfg->udp_config.family == AF_INET) {
- err = udp_add_offload(net, &fou->udp_offloads);
- if (err)
- goto error;
- }
-
err = fou_add_to_port_list(net, fou);
if (err)
goto error;
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index bc5196ea1bdf..ab69da2d2a77 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -661,6 +661,9 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
inet_sk(newsk)->inet_sport = htons(inet_rsk(req)->ir_num);
newsk->sk_write_space = sk_stream_write_space;
+ /* listeners have SOCK_RCU_FREE, not the children */
+ sock_reset_flag(newsk, SOCK_RCU_FREE);
+
newsk->sk_mark = inet_rsk(req)->ir_mark;
atomic64_set(&newsk->sk_cookie,
atomic64_read(&inet_rsk(req)->ir_cookie));
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 5fdb02f5598e..bd591eb81ec9 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -356,6 +356,7 @@ struct sock *inet_diag_find_one_icsk(struct net *net,
{
struct sock *sk;
+ rcu_read_lock();
if (req->sdiag_family == AF_INET)
sk = inet_lookup(net, hashinfo, NULL, 0, req->id.idiag_dst[0],
req->id.idiag_dport, req->id.idiag_src[0],
@@ -376,9 +377,11 @@ struct sock *inet_diag_find_one_icsk(struct net *net,
req->id.idiag_if);
}
#endif
- else
+ else {
+ rcu_read_unlock();
return ERR_PTR(-EINVAL);
-
+ }
+ rcu_read_unlock();
if (!sk)
return ERR_PTR(-ENOENT);
@@ -772,13 +775,12 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
struct inet_listen_hashbucket *ilb;
- struct hlist_nulls_node *node;
struct sock *sk;
num = 0;
ilb = &hashinfo->listening_hash[i];
spin_lock_bh(&ilb->lock);
- sk_nulls_for_each(sk, node, &ilb->head) {
+ sk_for_each(sk, &ilb->head) {
struct inet_sock *inet = inet_sk(sk);
if (!net_eq(sock_net(sk), net))
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index bc68eced0105..fcadb670f50b 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -198,13 +198,13 @@ static inline int compute_score(struct sock *sk, struct net *net,
}
/*
- * Don't inline this cruft. Here are some nice properties to exploit here. The
- * BSD API does not allow a listening sock to specify the remote port nor the
+ * Here are some nice properties to exploit here. The BSD API
+ * does not allow a listening sock to specify the remote port nor the
* remote address for the connection. So always assume those are both
* wildcarded during the search since they can never be otherwise.
*/
-
+/* called with rcu_read_lock() : No refcount taken on the socket */
struct sock *__inet_lookup_listener(struct net *net,
struct inet_hashinfo *hashinfo,
struct sk_buff *skb, int doff,
@@ -212,38 +212,27 @@ struct sock *__inet_lookup_listener(struct net *net,
const __be32 daddr, const unsigned short hnum,
const int dif)
{
- struct sock *sk, *result;
- struct hlist_nulls_node *node;
unsigned int hash = inet_lhashfn(net, hnum);
struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
- int score, hiscore, matches = 0, reuseport = 0;
- bool select_ok = true;
+ int score, hiscore = 0, matches = 0, reuseport = 0;
+ struct sock *sk, *result = NULL;
u32 phash = 0;
- rcu_read_lock();
-begin:
- result = NULL;
- hiscore = 0;
- sk_nulls_for_each_rcu(sk, node, &ilb->head) {
+ sk_for_each_rcu(sk, &ilb->head) {
score = compute_score(sk, net, hnum, daddr, dif);
if (score > hiscore) {
- result = sk;
- hiscore = score;
reuseport = sk->sk_reuseport;
if (reuseport) {
phash = inet_ehashfn(net, daddr, hnum,
saddr, sport);
- if (select_ok) {
- struct sock *sk2;
- sk2 = reuseport_select_sock(sk, phash,
- skb, doff);
- if (sk2) {
- result = sk2;
- goto found;
- }
- }
+ result = reuseport_select_sock(sk, phash,
+ skb, doff);
+ if (result)
+ return result;
matches = 1;
}
+ result = sk;
+ hiscore = score;
} else if (score == hiscore && reuseport) {
matches++;
if (reciprocal_scale(phash, matches) == 0)
@@ -251,25 +240,6 @@ begin:
phash = next_pseudo_random32(phash);
}
}
- /*
- * if the nulls value we got at the end of this lookup is
- * not the expected one, we must restart lookup.
- * We probably met an item that was moved to another chain.
- */
- if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE)
- goto begin;
- if (result) {
-found:
- if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
- result = NULL;
- else if (unlikely(compute_score(result, net, hnum, daddr,
- dif) < hiscore)) {
- sock_put(result);
- select_ok = false;
- goto begin;
- }
- }
- rcu_read_unlock();
return result;
}
EXPORT_SYMBOL_GPL(__inet_lookup_listener);
@@ -312,7 +282,6 @@ struct sock *__inet_lookup_established(struct net *net,
unsigned int slot = hash & hashinfo->ehash_mask;
struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
- rcu_read_lock();
begin:
sk_nulls_for_each_rcu(sk, node, &head->chain) {
if (sk->sk_hash != hash)
@@ -339,7 +308,6 @@ begin:
out:
sk = NULL;
found:
- rcu_read_unlock();
return sk;
}
EXPORT_SYMBOL_GPL(__inet_lookup_established);
@@ -471,10 +439,9 @@ static int inet_reuseport_add_sock(struct sock *sk,
bool match_wildcard))
{
struct sock *sk2;
- struct hlist_nulls_node *node;
kuid_t uid = sock_i_uid(sk);
- sk_nulls_for_each_rcu(sk2, node, &ilb->head) {
+ sk_for_each_rcu(sk2, &ilb->head) {
if (sk2 != sk &&
sk2->sk_family == sk->sk_family &&
ipv6_only_sock(sk2) == ipv6_only_sock(sk) &&
@@ -512,7 +479,8 @@ int __inet_hash(struct sock *sk, struct sock *osk,
if (err)
goto unlock;
}
- __sk_nulls_add_node_rcu(sk, &ilb->head);
+ hlist_add_head_rcu(&sk->sk_node, &ilb->head);
+ sock_set_flag(sk, SOCK_RCU_FREE);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
unlock:
spin_unlock(&ilb->lock);
@@ -539,20 +507,25 @@ void inet_unhash(struct sock *sk)
{
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
spinlock_t *lock;
+ bool listener = false;
int done;
if (sk_unhashed(sk))
return;
- if (sk->sk_state == TCP_LISTEN)
+ if (sk->sk_state == TCP_LISTEN) {
lock = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)].lock;
- else
+ listener = true;
+ } else {
lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
-
+ }
spin_lock_bh(lock);
if (rcu_access_pointer(sk->sk_reuseport_cb))
reuseport_detach_sock(sk);
- done = __sk_nulls_del_node_init_rcu(sk);
+ if (listener)
+ done = __sk_del_node_init(sk);
+ else
+ done = __sk_nulls_del_node_init_rcu(sk);
if (done)
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
spin_unlock_bh(lock);
@@ -688,9 +661,8 @@ void inet_hashinfo_init(struct inet_hashinfo *h)
for (i = 0; i < INET_LHTABLE_SIZE; i++) {
spin_lock_init(&h->listening_hash[i].lock);
- INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].head,
- i + LISTENING_NULLS_BASE);
- }
+ INIT_HLIST_HEAD(&h->listening_hash[i].head);
+ }
}
EXPORT_SYMBOL_GPL(inet_hashinfo_init);
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 035ad645a8d9..89b5f3bd6694 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -219,11 +219,12 @@ void ip_cmsg_recv_offset(struct msghdr *msg, struct sk_buff *skb,
}
EXPORT_SYMBOL(ip_cmsg_recv_offset);
-int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc,
+int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc,
bool allow_ipv6)
{
int err, val;
struct cmsghdr *cmsg;
+ struct net *net = sock_net(sk);
for_each_cmsghdr(cmsg, msg) {
if (!CMSG_OK(msg, cmsg))
@@ -244,6 +245,12 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc,
continue;
}
#endif
+ if (cmsg->cmsg_level == SOL_SOCKET) {
+ if (__sock_cmsg_send(sk, msg, cmsg, &ipc->sockc))
+ return -EINVAL;
+ continue;
+ }
+
if (cmsg->cmsg_level != SOL_IP)
continue;
switch (cmsg->cmsg_type) {
@@ -635,7 +642,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
if (err)
break;
old = rcu_dereference_protected(inet->inet_opt,
- sock_owned_by_user(sk));
+ lockdep_sock_is_held(sk));
if (inet->is_icsk) {
struct inet_connection_sock *icsk = inet_csk(sk);
#if IS_ENABLED(CONFIG_IPV6)
@@ -1295,7 +1302,7 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
struct ip_options_rcu *inet_opt;
inet_opt = rcu_dereference_protected(inet->inet_opt,
- sock_owned_by_user(sk));
+ lockdep_sock_is_held(sk));
opt->optlen = 0;
if (inet_opt)
memcpy(optbuf, &inet_opt->opt,
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 6165f30c4d72..43445df61efd 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -86,15 +86,15 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
}
EXPORT_SYMBOL_GPL(iptunnel_xmit);
-int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto,
- bool xnet)
+int __iptunnel_pull_header(struct sk_buff *skb, int hdr_len,
+ __be16 inner_proto, bool raw_proto, bool xnet)
{
if (unlikely(!pskb_may_pull(skb, hdr_len)))
return -ENOMEM;
skb_pull_rcsum(skb, hdr_len);
- if (inner_proto == htons(ETH_P_TEB)) {
+ if (!raw_proto && inner_proto == htons(ETH_P_TEB)) {
struct ethhdr *eh;
if (unlikely(!pskb_may_pull(skb, ETH_HLEN)))
@@ -117,7 +117,7 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto,
return iptunnel_pull_offloads(skb);
}
-EXPORT_SYMBOL_GPL(iptunnel_pull_header);
+EXPORT_SYMBOL_GPL(__iptunnel_pull_header);
struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md,
gfp_t flags)
@@ -247,10 +247,10 @@ static int ip_tun_build_state(struct net_device *dev, struct nlattr *attr,
tun_info->key.tun_id = nla_get_be64(tb[LWTUNNEL_IP_ID]);
if (tb[LWTUNNEL_IP_DST])
- tun_info->key.u.ipv4.dst = nla_get_be32(tb[LWTUNNEL_IP_DST]);
+ tun_info->key.u.ipv4.dst = nla_get_in_addr(tb[LWTUNNEL_IP_DST]);
if (tb[LWTUNNEL_IP_SRC])
- tun_info->key.u.ipv4.src = nla_get_be32(tb[LWTUNNEL_IP_SRC]);
+ tun_info->key.u.ipv4.src = nla_get_in_addr(tb[LWTUNNEL_IP_SRC]);
if (tb[LWTUNNEL_IP_TTL])
tun_info->key.ttl = nla_get_u8(tb[LWTUNNEL_IP_TTL]);
@@ -275,8 +275,8 @@ static int ip_tun_fill_encap_info(struct sk_buff *skb,
struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate);
if (nla_put_be64(skb, LWTUNNEL_IP_ID, tun_info->key.tun_id) ||
- nla_put_be32(skb, LWTUNNEL_IP_DST, tun_info->key.u.ipv4.dst) ||
- nla_put_be32(skb, LWTUNNEL_IP_SRC, tun_info->key.u.ipv4.src) ||
+ nla_put_in_addr(skb, LWTUNNEL_IP_DST, tun_info->key.u.ipv4.dst) ||
+ nla_put_in_addr(skb, LWTUNNEL_IP_SRC, tun_info->key.u.ipv4.src) ||
nla_put_u8(skb, LWTUNNEL_IP_TOS, tun_info->key.tos) ||
nla_put_u8(skb, LWTUNNEL_IP_TTL, tun_info->key.ttl) ||
nla_put_be16(skb, LWTUNNEL_IP_FLAGS, tun_info->key.tun_flags))
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index cf9700b1a106..66ddcb60519a 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -737,6 +737,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
/* no remote port */
}
+ ipc.sockc.tsflags = sk->sk_tsflags;
ipc.addr = inet->inet_saddr;
ipc.opt = NULL;
ipc.oif = sk->sk_bound_dev_if;
@@ -744,10 +745,8 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
ipc.ttl = 0;
ipc.tos = -1;
- sock_tx_timestamp(sk, &ipc.tx_flags);
-
if (msg->msg_controllen) {
- err = ip_cmsg_send(sock_net(sk), msg, &ipc, false);
+ err = ip_cmsg_send(sk, msg, &ipc, false);
if (unlikely(err)) {
kfree(ipc.opt);
return err;
@@ -768,6 +767,8 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
rcu_read_unlock();
}
+ sock_tx_timestamp(sk, ipc.sockc.tsflags, &ipc.tx_flags);
+
saddr = ipc.addr;
ipc.addr = faddr = daddr;
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 8d22de74080c..438f50c1a676 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -339,8 +339,8 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb)
static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
struct msghdr *msg, size_t length,
- struct rtable **rtp,
- unsigned int flags)
+ struct rtable **rtp, unsigned int flags,
+ const struct sockcm_cookie *sockc)
{
struct inet_sock *inet = inet_sk(sk);
struct net *net = sock_net(sk);
@@ -379,7 +379,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
skb->ip_summed = CHECKSUM_NONE;
- sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
+ sock_tx_timestamp(sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags);
skb->transport_header = skb->network_header;
err = -EFAULT;
@@ -540,6 +540,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
daddr = inet->inet_daddr;
}
+ ipc.sockc.tsflags = sk->sk_tsflags;
ipc.addr = inet->inet_saddr;
ipc.opt = NULL;
ipc.tx_flags = 0;
@@ -548,7 +549,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
ipc.oif = sk->sk_bound_dev_if;
if (msg->msg_controllen) {
- err = ip_cmsg_send(net, msg, &ipc, false);
+ err = ip_cmsg_send(sk, msg, &ipc, false);
if (unlikely(err)) {
kfree(ipc.opt);
goto out;
@@ -638,10 +639,10 @@ back_from_confirm:
if (inet->hdrincl)
err = raw_send_hdrinc(sk, &fl4, msg, len,
- &rt, msg->msg_flags);
+ &rt, msg->msg_flags, &ipc.sockc);
else {
- sock_tx_timestamp(sk, &ipc.tx_flags);
+ sock_tx_timestamp(sk, ipc.sockc.tsflags, &ipc.tx_flags);
if (!ipc.addr)
ipc.addr = fl4.daddr;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 08b8b960a8ed..4d73858991af 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -428,14 +428,16 @@ void tcp_init_sock(struct sock *sk)
}
EXPORT_SYMBOL(tcp_init_sock);
-static void tcp_tx_timestamp(struct sock *sk, struct sk_buff *skb)
+static void tcp_tx_timestamp(struct sock *sk, u16 tsflags, struct sk_buff *skb)
{
- if (sk->sk_tsflags) {
+ if (sk->sk_tsflags || tsflags) {
struct skb_shared_info *shinfo = skb_shinfo(skb);
+ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
- sock_tx_timestamp(sk, &shinfo->tx_flags);
+ sock_tx_timestamp(sk, tsflags, &shinfo->tx_flags);
if (shinfo->tx_flags & SKBTX_ANY_TSTAMP)
shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;
+ tcb->txstamp_ack = !!(shinfo->tx_flags & SKBTX_ACK_TSTAMP);
}
}
@@ -957,7 +959,7 @@ new_segment:
offset += copy;
size -= copy;
if (!size) {
- tcp_tx_timestamp(sk, skb);
+ tcp_tx_timestamp(sk, sk->sk_tsflags, skb);
goto out;
}
@@ -1077,6 +1079,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
+ struct sockcm_cookie sockc;
int flags, err, copied = 0;
int mss_now = 0, size_goal, copied_syn = 0;
bool sg;
@@ -1119,6 +1122,15 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
/* 'common' sending to sendq */
}
+ sockc.tsflags = sk->sk_tsflags;
+ if (msg->msg_controllen) {
+ err = sock_cmsg_send(sk, msg, &sockc);
+ if (unlikely(err)) {
+ err = -EINVAL;
+ goto out_err;
+ }
+ }
+
/* This should be in poll */
sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
@@ -1237,7 +1249,7 @@ new_segment:
copied += copy;
if (!msg_data_left(msg)) {
- tcp_tx_timestamp(sk, skb);
+ tcp_tx_timestamp(sk, sockc.tsflags, skb);
goto out;
}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index e6e65f79ade8..983f04c11177 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2252,16 +2252,6 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
}
}
-/* CWND moderation, preventing bursts due to too big ACKs
- * in dubious situations.
- */
-static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
-{
- tp->snd_cwnd = min(tp->snd_cwnd,
- tcp_packets_in_flight(tp) + tcp_max_burst(tp));
- tp->snd_cwnd_stamp = tcp_time_stamp;
-}
-
static bool tcp_tsopt_ecr_before(const struct tcp_sock *tp, u32 when)
{
return tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
@@ -2410,7 +2400,6 @@ static bool tcp_try_undo_recovery(struct sock *sk)
/* Hold old state until something *above* high_seq
* is ACKed. For Reno it is MUST to prevent false
* fast retransmits (RFC2582). SACK TCP is safe. */
- tcp_moderate_cwnd(tp);
if (!tcp_any_retrans_done(sk))
tp->retrans_stamp = 0;
return true;
@@ -3093,7 +3082,7 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
const struct skb_shared_info *shinfo;
/* Avoid cache line misses to get skb_shinfo() and shinfo->tx_flags */
- if (likely(!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_ACK)))
+ if (likely(!TCP_SKB_CB(skb)->txstamp_ack))
return;
shinfo = skb_shinfo(skb);
@@ -4318,6 +4307,12 @@ static bool tcp_try_coalesce(struct sock *sk,
return true;
}
+static void tcp_drop(struct sock *sk, struct sk_buff *skb)
+{
+ sk_drops_add(sk, skb);
+ __kfree_skb(skb);
+}
+
/* This one checks to see if we can put data from the
* out_of_order queue into the receive_queue.
*/
@@ -4342,7 +4337,7 @@ static void tcp_ofo_queue(struct sock *sk)
__skb_unlink(skb, &tp->out_of_order_queue);
if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
SOCK_DEBUG(sk, "ofo packet was already received\n");
- __kfree_skb(skb);
+ tcp_drop(sk, skb);
continue;
}
SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
@@ -4394,7 +4389,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP);
- __kfree_skb(skb);
+ tcp_drop(sk, skb);
return;
}
@@ -4458,7 +4453,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
/* All the bits are present. Drop. */
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
- __kfree_skb(skb);
+ tcp_drop(sk, skb);
skb = NULL;
tcp_dsack_set(sk, seq, end_seq);
goto add_sack;
@@ -4497,7 +4492,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
TCP_SKB_CB(skb1)->end_seq);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
- __kfree_skb(skb1);
+ tcp_drop(sk, skb1);
}
add_sack:
@@ -4580,12 +4575,13 @@ err:
static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
- int eaten = -1;
bool fragstolen = false;
+ int eaten = -1;
- if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq)
- goto drop;
-
+ if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
+ __kfree_skb(skb);
+ return;
+ }
skb_dst_drop(skb);
__skb_pull(skb, tcp_hdr(skb)->doff * 4);
@@ -4667,7 +4663,7 @@ out_of_window:
tcp_enter_quickack_mode(sk);
inet_csk_schedule_ack(sk);
drop:
- __kfree_skb(skb);
+ tcp_drop(sk, skb);
return;
}
@@ -5244,7 +5240,7 @@ syn_challenge:
return true;
discard:
- __kfree_skb(skb);
+ tcp_drop(sk, skb);
return false;
}
@@ -5462,7 +5458,7 @@ csum_error:
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
discard:
- __kfree_skb(skb);
+ tcp_drop(sk, skb);
}
EXPORT_SYMBOL(tcp_rcv_established);
@@ -5693,7 +5689,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
TCP_DELACK_MAX, TCP_RTO_MAX);
discard:
- __kfree_skb(skb);
+ tcp_drop(sk, skb);
return 0;
} else {
tcp_send_ack(sk);
@@ -6054,7 +6050,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
if (!queued) {
discard:
- __kfree_skb(skb);
+ tcp_drop(sk, skb);
}
return 0;
}
@@ -6343,8 +6339,10 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
af_ops->send_synack(sk, dst, &fl, req,
&foc, !want_cookie);
- if (want_cookie)
- goto drop_and_free;
+ if (want_cookie) {
+ reqsk_free(req);
+ return 0;
+ }
}
reqsk_put(req);
return 0;
@@ -6354,7 +6352,7 @@ drop_and_release:
drop_and_free:
reqsk_free(req);
drop:
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
+ tcp_listendrop(sk);
return 0;
}
EXPORT_SYMBOL(tcp_conn_request);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index ad450509029b..f4f2a0a3849d 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -157,7 +157,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
nexthop = daddr = usin->sin_addr.s_addr;
inet_opt = rcu_dereference_protected(inet->inet_opt,
- sock_owned_by_user(sk));
+ lockdep_sock_is_held(sk));
if (inet_opt && inet_opt->opt.srr) {
if (!daddr)
return -EINVAL;
@@ -329,7 +329,7 @@ void tcp_req_err(struct sock *sk, u32 seq, bool abort)
* errors returned from accept().
*/
inet_csk_reqsk_queue_drop(req->rsk_listener, req);
- NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS);
+ tcp_listendrop(req->rsk_listener);
}
reqsk_put(req);
}
@@ -628,6 +628,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
+ rcu_read_lock();
hash_location = tcp_parse_md5sig_option(th);
if (sk && sk_fullsock(sk)) {
key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
@@ -646,16 +647,18 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
ntohs(th->source), inet_iif(skb));
/* don't send rst if it can't find key */
if (!sk1)
- return;
- rcu_read_lock();
+ goto out;
+
key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
&ip_hdr(skb)->saddr, AF_INET);
if (!key)
- goto release_sk1;
+ goto out;
+
genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
if (genhash || memcmp(hash_location, newhash, 16) != 0)
- goto release_sk1;
+ goto out;
+
}
if (key) {
@@ -698,11 +701,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
#ifdef CONFIG_TCP_MD5SIG
-release_sk1:
- if (sk1) {
- rcu_read_unlock();
- sock_put(sk1);
- }
+out:
+ rcu_read_unlock();
#endif
}
@@ -882,8 +882,7 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
/* caller either holds rcu_read_lock() or socket lock */
md5sig = rcu_dereference_check(tp->md5sig_info,
- sock_owned_by_user(sk) ||
- lockdep_is_held((spinlock_t *)&sk->sk_lock.slock));
+ lockdep_sock_is_held(sk));
if (!md5sig)
return NULL;
#if IS_ENABLED(CONFIG_IPV6)
@@ -928,8 +927,7 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
}
md5sig = rcu_dereference_protected(tp->md5sig_info,
- sock_owned_by_user(sk) ||
- lockdep_is_held(&sk->sk_lock.slock));
+ lockdep_sock_is_held(sk));
if (!md5sig) {
md5sig = kmalloc(sizeof(*md5sig), gfp);
if (!md5sig)
@@ -1246,7 +1244,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
&tcp_request_sock_ipv4_ops, sk, skb);
drop:
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
+ tcp_listendrop(sk);
return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);
@@ -1348,7 +1346,7 @@ exit_overflow:
exit_nonewsk:
dst_release(dst);
exit:
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
+ tcp_listendrop(sk);
return NULL;
put_and_exit:
inet_csk_prepare_forced_close(newsk);
@@ -1538,11 +1536,12 @@ EXPORT_SYMBOL(tcp_prequeue);
int tcp_v4_rcv(struct sk_buff *skb)
{
+ struct net *net = dev_net(skb->dev);
const struct iphdr *iph;
const struct tcphdr *th;
+ bool refcounted;
struct sock *sk;
int ret;
- struct net *net = dev_net(skb->dev);
if (skb->pkt_type != PACKET_HOST)
goto discard_it;
@@ -1588,7 +1587,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
lookup:
sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
- th->dest);
+ th->dest, &refcounted);
if (!sk)
goto no_tcp_socket;
@@ -1609,7 +1608,11 @@ process:
inet_csk_reqsk_queue_drop_and_put(sk, req);
goto lookup;
}
+ /* We own a reference on the listener, increase it again
+ * as we might lose it too soon.
+ */
sock_hold(sk);
+ refcounted = true;
nsk = tcp_check_req(sk, skb, req, false);
if (!nsk) {
reqsk_put(req);
@@ -1665,7 +1668,8 @@ process:
bh_unlock_sock(sk);
put_and_return:
- sock_put(sk);
+ if (refcounted)
+ sock_put(sk);
return ret;
@@ -1688,7 +1692,9 @@ discard_it:
return 0;
discard_and_relse:
- sock_put(sk);
+ sk_drops_add(sk, skb);
+ if (refcounted)
+ sock_put(sk);
goto discard_it;
do_time_wait:
@@ -1712,6 +1718,7 @@ do_time_wait:
if (sk2) {
inet_twsk_deschedule_put(inet_twsk(sk));
sk = sk2;
+ refcounted = false;
goto process;
}
/* Fall through to ACK */
@@ -1845,17 +1852,17 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock);
*/
static void *listening_get_next(struct seq_file *seq, void *cur)
{
- struct inet_connection_sock *icsk;
- struct hlist_nulls_node *node;
- struct sock *sk = cur;
- struct inet_listen_hashbucket *ilb;
struct tcp_iter_state *st = seq->private;
struct net *net = seq_file_net(seq);
+ struct inet_listen_hashbucket *ilb;
+ struct inet_connection_sock *icsk;
+ struct sock *sk = cur;
if (!sk) {
+get_head:
ilb = &tcp_hashinfo.listening_hash[st->bucket];
spin_lock_bh(&ilb->lock);
- sk = sk_nulls_head(&ilb->head);
+ sk = sk_head(&ilb->head);
st->offset = 0;
goto get_sk;
}
@@ -1863,28 +1870,20 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
++st->num;
++st->offset;
- sk = sk_nulls_next(sk);
+ sk = sk_next(sk);
get_sk:
- sk_nulls_for_each_from(sk, node) {
+ sk_for_each_from(sk) {
if (!net_eq(sock_net(sk), net))
continue;
- if (sk->sk_family == st->family) {
- cur = sk;
- goto out;
- }
+ if (sk->sk_family == st->family)
+ return sk;
icsk = inet_csk(sk);
}
spin_unlock_bh(&ilb->lock);
st->offset = 0;
- if (++st->bucket < INET_LHTABLE_SIZE) {
- ilb = &tcp_hashinfo.listening_hash[st->bucket];
- spin_lock_bh(&ilb->lock);
- sk = sk_nulls_head(&ilb->head);
- goto get_sk;
- }
- cur = NULL;
-out:
- return cur;
+ if (++st->bucket < INET_LHTABLE_SIZE)
+ goto get_head;
+ return NULL;
}
static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
@@ -2383,6 +2382,7 @@ static int __net_init tcp_sk_init(struct net *net)
IPPROTO_TCP, net);
if (res)
goto fail;
+ sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
}
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index acb366dd61e6..4c53e7c86586 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -704,7 +704,10 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rsk_rcv_wnd)) {
/* Out of window: send ACK and drop. */
- if (!(flg & TCP_FLAG_RST))
+ if (!(flg & TCP_FLAG_RST) &&
+ !tcp_oow_rate_limited(sock_net(sk), skb,
+ LINUX_MIB_TCPACKSKIPPEDSYNRECV,
+ &tcp_rsk(req)->last_oow_ack_time))
req->rsk_ops->send_ack(sk, skb, req);
if (paws_reject)
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 08eed5e16df0..3563788d064f 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -143,10 +143,9 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
unsigned int log)
{
struct sock *sk2;
- struct hlist_nulls_node *node;
kuid_t uid = sock_i_uid(sk);
- sk_nulls_for_each(sk2, node, &hslot->head) {
+ sk_for_each(sk2, &hslot->head) {
if (net_eq(sock_net(sk2), net) &&
sk2 != sk &&
(bitmap || udp_sk(sk2)->udp_port_hash == num) &&
@@ -177,12 +176,11 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
bool match_wildcard))
{
struct sock *sk2;
- struct hlist_nulls_node *node;
kuid_t uid = sock_i_uid(sk);
int res = 0;
spin_lock(&hslot2->lock);
- udp_portaddr_for_each_entry(sk2, node, &hslot2->head) {
+ udp_portaddr_for_each_entry(sk2, &hslot2->head) {
if (net_eq(sock_net(sk2), net) &&
sk2 != sk &&
(udp_sk(sk2)->udp_port_hash == num) &&
@@ -207,11 +205,10 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot,
bool match_wildcard))
{
struct net *net = sock_net(sk);
- struct hlist_nulls_node *node;
kuid_t uid = sock_i_uid(sk);
struct sock *sk2;
- sk_nulls_for_each(sk2, node, &hslot->head) {
+ sk_for_each(sk2, &hslot->head) {
if (net_eq(sock_net(sk2), net) &&
sk2 != sk &&
sk2->sk_family == sk->sk_family &&
@@ -333,17 +330,18 @@ found:
goto fail_unlock;
}
- sk_nulls_add_node_rcu(sk, &hslot->head);
+ sk_add_node_rcu(sk, &hslot->head);
hslot->count++;
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
spin_lock(&hslot2->lock);
- hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
+ hlist_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
&hslot2->head);
hslot2->count++;
spin_unlock(&hslot2->lock);
}
+ sock_set_flag(sk, SOCK_RCU_FREE);
error = 0;
fail_unlock:
spin_unlock_bh(&hslot->lock);
@@ -497,37 +495,27 @@ static struct sock *udp4_lib_lookup2(struct net *net,
struct sk_buff *skb)
{
struct sock *sk, *result;
- struct hlist_nulls_node *node;
int score, badness, matches = 0, reuseport = 0;
- bool select_ok = true;
u32 hash = 0;
-begin:
result = NULL;
badness = 0;
- udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) {
+ udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
score = compute_score2(sk, net, saddr, sport,
daddr, hnum, dif);
if (score > badness) {
- result = sk;
- badness = score;
reuseport = sk->sk_reuseport;
if (reuseport) {
hash = udp_ehashfn(net, daddr, hnum,
saddr, sport);
- if (select_ok) {
- struct sock *sk2;
-
- sk2 = reuseport_select_sock(sk, hash, skb,
+ result = reuseport_select_sock(sk, hash, skb,
sizeof(struct udphdr));
- if (sk2) {
- result = sk2;
- select_ok = false;
- goto found;
- }
- }
+ if (result)
+ return result;
matches = 1;
}
+ badness = score;
+ result = sk;
} else if (score == badness && reuseport) {
matches++;
if (reciprocal_scale(hash, matches) == 0)
@@ -535,23 +523,6 @@ begin:
hash = next_pseudo_random32(hash);
}
}
- /*
- * if the nulls value we got at the end of this lookup is
- * not the expected one, we must restart lookup.
- * We probably met an item that was moved to another chain.
- */
- if (get_nulls_value(node) != slot2)
- goto begin;
- if (result) {
-found:
- if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
- result = NULL;
- else if (unlikely(compute_score2(result, net, saddr, sport,
- daddr, hnum, dif) < badness)) {
- sock_put(result);
- goto begin;
- }
- }
return result;
}
@@ -563,15 +534,12 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
int dif, struct udp_table *udptable, struct sk_buff *skb)
{
struct sock *sk, *result;
- struct hlist_nulls_node *node;
unsigned short hnum = ntohs(dport);
unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
int score, badness, matches = 0, reuseport = 0;
- bool select_ok = true;
u32 hash = 0;
- rcu_read_lock();
if (hslot->count > 10) {
hash2 = udp4_portaddr_hash(net, daddr, hnum);
slot2 = hash2 & udptable->mask;
@@ -593,35 +561,27 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
htonl(INADDR_ANY), hnum, dif,
hslot2, slot2, skb);
}
- rcu_read_unlock();
return result;
}
begin:
result = NULL;
badness = 0;
- sk_nulls_for_each_rcu(sk, node, &hslot->head) {
+ sk_for_each_rcu(sk, &hslot->head) {
score = compute_score(sk, net, saddr, hnum, sport,
daddr, dport, dif);
if (score > badness) {
- result = sk;
- badness = score;
reuseport = sk->sk_reuseport;
if (reuseport) {
hash = udp_ehashfn(net, daddr, hnum,
saddr, sport);
- if (select_ok) {
- struct sock *sk2;
-
- sk2 = reuseport_select_sock(sk, hash, skb,
+ result = reuseport_select_sock(sk, hash, skb,
sizeof(struct udphdr));
- if (sk2) {
- result = sk2;
- select_ok = false;
- goto found;
- }
- }
+ if (result)
+ return result;
matches = 1;
}
+ result = sk;
+ badness = score;
} else if (score == badness && reuseport) {
matches++;
if (reciprocal_scale(hash, matches) == 0)
@@ -629,25 +589,6 @@ begin:
hash = next_pseudo_random32(hash);
}
}
- /*
- * if the nulls value we got at the end of this lookup is
- * not the expected one, we must restart lookup.
- * We probably met an item that was moved to another chain.
- */
- if (get_nulls_value(node) != slot)
- goto begin;
-
- if (result) {
-found:
- if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
- result = NULL;
- else if (unlikely(compute_score(result, net, saddr, hnum, sport,
- daddr, dport, dif) < badness)) {
- sock_put(result);
- goto begin;
- }
- }
- rcu_read_unlock();
return result;
}
EXPORT_SYMBOL_GPL(__udp4_lib_lookup);
@@ -663,13 +604,37 @@ static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
udptable, skb);
}
+struct sock *udp4_lib_lookup_skb(struct sk_buff *skb,
+ __be16 sport, __be16 dport)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ const struct net_device *dev =
+ skb_dst(skb) ? skb_dst(skb)->dev : skb->dev;
+
+ return __udp4_lib_lookup(dev_net(dev), iph->saddr, sport,
+ iph->daddr, dport, inet_iif(skb),
+ &udp_table, skb);
+}
+EXPORT_SYMBOL_GPL(udp4_lib_lookup_skb);
+
+/* Must be called under rcu_read_lock().
+ * Does increment socket refcount.
+ */
+#if IS_ENABLED(CONFIG_NETFILTER_XT_MATCH_SOCKET) || \
+ IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TPROXY)
struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
__be32 daddr, __be16 dport, int dif)
{
- return __udp4_lib_lookup(net, saddr, sport, daddr, dport, dif,
- &udp_table, NULL);
+ struct sock *sk;
+
+ sk = __udp4_lib_lookup(net, saddr, sport, daddr, dport,
+ dif, &udp_table, NULL);
+ if (sk && !atomic_inc_not_zero(&sk->sk_refcnt))
+ sk = NULL;
+ return sk;
}
EXPORT_SYMBOL_GPL(udp4_lib_lookup);
+#endif
static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk,
__be16 loc_port, __be32 loc_addr,
@@ -771,7 +736,7 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
sk->sk_err = err;
sk->sk_error_report(sk);
out:
- sock_put(sk);
+ return;
}
void udp_err(struct sk_buff *skb, u32 info)
@@ -1027,15 +992,13 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
*/
connected = 1;
}
- ipc.addr = inet->inet_saddr;
+ ipc.sockc.tsflags = sk->sk_tsflags;
+ ipc.addr = inet->inet_saddr;
ipc.oif = sk->sk_bound_dev_if;
- sock_tx_timestamp(sk, &ipc.tx_flags);
-
if (msg->msg_controllen) {
- err = ip_cmsg_send(sock_net(sk), msg, &ipc,
- sk->sk_family == AF_INET6);
+ err = ip_cmsg_send(sk, msg, &ipc, sk->sk_family == AF_INET6);
if (unlikely(err)) {
kfree(ipc.opt);
return err;
@@ -1060,6 +1023,8 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
saddr = ipc.addr;
ipc.addr = faddr = daddr;
+ sock_tx_timestamp(sk, ipc.sockc.tsflags, &ipc.tx_flags);
+
if (ipc.opt && ipc.opt->opt.srr) {
if (!daddr)
return -EINVAL;
@@ -1342,7 +1307,7 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
struct sk_buff *skb;
unsigned int ulen, copied;
- int peeked, off = 0;
+ int peeked, peeking, off;
int err;
int is_udplite = IS_UDPLITE(sk);
bool checksum_valid = false;
@@ -1352,15 +1317,16 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
return ip_recv_error(sk, msg, len, addr_len);
try_again:
+ peeking = off = sk_peek_offset(sk, flags);
skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
&peeked, &off, &err);
if (!skb)
- goto out;
+ return err;
- ulen = skb->len - sizeof(struct udphdr);
+ ulen = skb->len;
copied = len;
- if (copied > ulen)
- copied = ulen;
+ if (copied > ulen - off)
+ copied = ulen - off;
else if (copied < ulen)
msg->msg_flags |= MSG_TRUNC;
@@ -1370,18 +1336,16 @@ try_again:
* coverage checksum (UDP-Lite), do it before the copy.
*/
- if (copied < ulen || UDP_SKB_CB(skb)->partial_cov) {
+ if (copied < ulen || UDP_SKB_CB(skb)->partial_cov || peeking) {
checksum_valid = !udp_lib_checksum_complete(skb);
if (!checksum_valid)
goto csum_copy_err;
}
if (checksum_valid || skb_csum_unnecessary(skb))
- err = skb_copy_datagram_msg(skb, sizeof(struct udphdr),
- msg, copied);
+ err = skb_copy_datagram_msg(skb, off, msg, copied);
else {
- err = skb_copy_and_csum_datagram_msg(skb, sizeof(struct udphdr),
- msg);
+ err = skb_copy_and_csum_datagram_msg(skb, off, msg);
if (err == -EINVAL)
goto csum_copy_err;
@@ -1394,7 +1358,8 @@ try_again:
UDP_INC_STATS_USER(sock_net(sk),
UDP_MIB_INERRORS, is_udplite);
}
- goto out_free;
+ skb_free_datagram_locked(sk, skb);
+ return err;
}
if (!peeked)
@@ -1418,9 +1383,7 @@ try_again:
if (flags & MSG_TRUNC)
err = ulen;
-out_free:
- skb_free_datagram_locked(sk, skb);
-out:
+ __skb_free_datagram_locked(sk, skb, peeking ? -err : err);
return err;
csum_copy_err:
@@ -1474,13 +1437,13 @@ void udp_lib_unhash(struct sock *sk)
spin_lock_bh(&hslot->lock);
if (rcu_access_pointer(sk->sk_reuseport_cb))
reuseport_detach_sock(sk);
- if (sk_nulls_del_node_init_rcu(sk)) {
+ if (sk_del_node_init_rcu(sk)) {
hslot->count--;
inet_sk(sk)->inet_num = 0;
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
spin_lock(&hslot2->lock);
- hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
+ hlist_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
hslot2->count--;
spin_unlock(&hslot2->lock);
}
@@ -1513,12 +1476,12 @@ void udp_lib_rehash(struct sock *sk, u16 newhash)
if (hslot2 != nhslot2) {
spin_lock(&hslot2->lock);
- hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
+ hlist_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
hslot2->count--;
spin_unlock(&hslot2->lock);
spin_lock(&nhslot2->lock);
- hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
+ hlist_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
&nhslot2->head);
nhslot2->count++;
spin_unlock(&nhslot2->lock);
@@ -1548,7 +1511,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
sk_incoming_cpu_update(sk);
}
- rc = sock_queue_rcv_skb(sk, skb);
+ rc = __sock_queue_rcv_skb(sk, skb);
if (rc < 0) {
int is_udplite = IS_UDPLITE(sk);
@@ -1664,10 +1627,14 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
}
}
- if (rcu_access_pointer(sk->sk_filter) &&
- udp_lib_checksum_complete(skb))
- goto csum_error;
+ if (rcu_access_pointer(sk->sk_filter)) {
+ if (udp_lib_checksum_complete(skb))
+ goto csum_error;
+ if (sk_filter(sk, skb))
+ goto drop;
+ }
+ udp_csum_pull_header(skb);
if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
is_udplite);
@@ -1697,35 +1664,6 @@ drop:
return -1;
}
-static void flush_stack(struct sock **stack, unsigned int count,
- struct sk_buff *skb, unsigned int final)
-{
- unsigned int i;
- struct sk_buff *skb1 = NULL;
- struct sock *sk;
-
- for (i = 0; i < count; i++) {
- sk = stack[i];
- if (likely(!skb1))
- skb1 = (i == final) ? skb : skb_clone(skb, GFP_ATOMIC);
-
- if (!skb1) {
- atomic_inc(&sk->sk_drops);
- UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
- IS_UDPLITE(sk));
- UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS,
- IS_UDPLITE(sk));
- }
-
- if (skb1 && udp_queue_rcv_skb(sk, skb1) <= 0)
- skb1 = NULL;
-
- sock_put(sk);
- }
- if (unlikely(skb1))
- kfree_skb(skb1);
-}
-
/* For TCP sockets, sk_rx_dst is protected by socket lock
* For UDP, we use xchg() to guard against concurrent changes.
*/
@@ -1749,14 +1687,14 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
struct udp_table *udptable,
int proto)
{
- struct sock *sk, *stack[256 / sizeof(struct sock *)];
- struct hlist_nulls_node *node;
+ struct sock *sk, *first = NULL;
unsigned short hnum = ntohs(uh->dest);
struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum);
- int dif = skb->dev->ifindex;
- unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node);
unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
- bool inner_flushed = false;
+ unsigned int offset = offsetof(typeof(*sk), sk_node);
+ int dif = skb->dev->ifindex;
+ struct hlist_node *node;
+ struct sk_buff *nskb;
if (use_hash2) {
hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) &
@@ -1767,23 +1705,28 @@ start_lookup:
offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
}
- spin_lock(&hslot->lock);
- sk_nulls_for_each_entry_offset(sk, node, &hslot->head, offset) {
- if (__udp_is_mcast_sock(net, sk,
- uh->dest, daddr,
- uh->source, saddr,
- dif, hnum)) {
- if (unlikely(count == ARRAY_SIZE(stack))) {
- flush_stack(stack, count, skb, ~0);
- inner_flushed = true;
- count = 0;
- }
- stack[count++] = sk;
- sock_hold(sk);
+ sk_for_each_entry_offset_rcu(sk, node, &hslot->head, offset) {
+ if (!__udp_is_mcast_sock(net, sk, uh->dest, daddr,
+ uh->source, saddr, dif, hnum))
+ continue;
+
+ if (!first) {
+ first = sk;
+ continue;
}
- }
+ nskb = skb_clone(skb, GFP_ATOMIC);
- spin_unlock(&hslot->lock);
+ if (unlikely(!nskb)) {
+ atomic_inc(&sk->sk_drops);
+ UDP_INC_STATS_BH(net, UDP_MIB_RCVBUFERRORS,
+ IS_UDPLITE(sk));
+ UDP_INC_STATS_BH(net, UDP_MIB_INERRORS,
+ IS_UDPLITE(sk));
+ continue;
+ }
+ if (udp_queue_rcv_skb(sk, nskb) > 0)
+ consume_skb(nskb);
+ }
/* Also lookup *:port if we are using hash2 and haven't done so yet. */
if (use_hash2 && hash2 != hash2_any) {
@@ -1791,16 +1734,13 @@ start_lookup:
goto start_lookup;
}
- /*
- * do the slow work with no lock held
- */
- if (count) {
- flush_stack(stack, count, skb, count - 1);
+ if (first) {
+ if (udp_queue_rcv_skb(first, skb) > 0)
+ consume_skb(skb);
} else {
- if (!inner_flushed)
- UDP_INC_STATS_BH(net, UDP_MIB_IGNOREDMULTI,
- proto == IPPROTO_UDPLITE);
- consume_skb(skb);
+ kfree_skb(skb);
+ UDP_INC_STATS_BH(net, UDP_MIB_IGNOREDMULTI,
+ proto == IPPROTO_UDPLITE);
}
return 0;
}
@@ -1897,7 +1837,6 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
inet_compute_pseudo);
ret = udp_queue_rcv_skb(sk, skb);
- sock_put(sk);
/* a return value > 0 means to resubmit the input, but
* it wants the return to be -protocol, or 0
@@ -1958,49 +1897,24 @@ static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net,
int dif)
{
struct sock *sk, *result;
- struct hlist_nulls_node *node;
unsigned short hnum = ntohs(loc_port);
- unsigned int count, slot = udp_hashfn(net, hnum, udp_table.mask);
+ unsigned int slot = udp_hashfn(net, hnum, udp_table.mask);
struct udp_hslot *hslot = &udp_table.hash[slot];
/* Do not bother scanning a too big list */
if (hslot->count > 10)
return NULL;
- rcu_read_lock();
-begin:
- count = 0;
result = NULL;
- sk_nulls_for_each_rcu(sk, node, &hslot->head) {
- if (__udp_is_mcast_sock(net, sk,
- loc_port, loc_addr,
- rmt_port, rmt_addr,
- dif, hnum)) {
+ sk_for_each_rcu(sk, &hslot->head) {
+ if (__udp_is_mcast_sock(net, sk, loc_port, loc_addr,
+ rmt_port, rmt_addr, dif, hnum)) {
+ if (result)
+ return NULL;
result = sk;
- ++count;
- }
- }
- /*
- * if the nulls value we got at the end of this lookup is
- * not the expected one, we must restart lookup.
- * We probably met an item that was moved to another chain.
- */
- if (get_nulls_value(node) != slot)
- goto begin;
-
- if (result) {
- if (count != 1 ||
- unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
- result = NULL;
- else if (unlikely(!__udp_is_mcast_sock(net, result,
- loc_port, loc_addr,
- rmt_port, rmt_addr,
- dif, hnum))) {
- sock_put(result);
- result = NULL;
}
}
- rcu_read_unlock();
+
return result;
}
@@ -2013,37 +1927,22 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net,
__be16 rmt_port, __be32 rmt_addr,
int dif)
{
- struct sock *sk, *result;
- struct hlist_nulls_node *node;
unsigned short hnum = ntohs(loc_port);
unsigned int hash2 = udp4_portaddr_hash(net, loc_addr, hnum);
unsigned int slot2 = hash2 & udp_table.mask;
struct udp_hslot *hslot2 = &udp_table.hash2[slot2];
INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr);
const __portpair ports = INET_COMBINED_PORTS(rmt_port, hnum);
+ struct sock *sk;
- rcu_read_lock();
- result = NULL;
- udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) {
- if (INET_MATCH(sk, net, acookie,
- rmt_addr, loc_addr, ports, dif))
- result = sk;
+ udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
+ if (INET_MATCH(sk, net, acookie, rmt_addr,
+ loc_addr, ports, dif))
+ return sk;
/* Only check first socket in chain */
break;
}
-
- if (result) {
- if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
- result = NULL;
- else if (unlikely(!INET_MATCH(sk, net, acookie,
- rmt_addr, loc_addr,
- ports, dif))) {
- sock_put(result);
- result = NULL;
- }
- }
- rcu_read_unlock();
- return result;
+ return NULL;
}
void udp_v4_early_demux(struct sk_buff *skb)
@@ -2051,7 +1950,7 @@ void udp_v4_early_demux(struct sk_buff *skb)
struct net *net = dev_net(skb->dev);
const struct iphdr *iph;
const struct udphdr *uh;
- struct sock *sk;
+ struct sock *sk = NULL;
struct dst_entry *dst;
int dif = skb->dev->ifindex;
int ours;
@@ -2083,11 +1982,9 @@ void udp_v4_early_demux(struct sk_buff *skb)
} else if (skb->pkt_type == PACKET_HOST) {
sk = __udp4_lib_demux_lookup(net, uh->dest, iph->daddr,
uh->source, iph->saddr, dif);
- } else {
- return;
}
- if (!sk)
+ if (!sk || !atomic_inc_not_zero_hint(&sk->sk_refcnt, 2))
return;
skb->sk = sk;
@@ -2387,14 +2284,13 @@ static struct sock *udp_get_first(struct seq_file *seq, int start)
for (state->bucket = start; state->bucket <= state->udp_table->mask;
++state->bucket) {
- struct hlist_nulls_node *node;
struct udp_hslot *hslot = &state->udp_table->hash[state->bucket];
- if (hlist_nulls_empty(&hslot->head))
+ if (hlist_empty(&hslot->head))
continue;
spin_lock_bh(&hslot->lock);
- sk_nulls_for_each(sk, node, &hslot->head) {
+ sk_for_each(sk, &hslot->head) {
if (!net_eq(sock_net(sk), net))
continue;
if (sk->sk_family == state->family)
@@ -2413,7 +2309,7 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
struct net *net = seq_file_net(seq);
do {
- sk = sk_nulls_next(sk);
+ sk = sk_next(sk);
} while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family));
if (!sk) {
@@ -2622,12 +2518,12 @@ void __init udp_table_init(struct udp_table *table, const char *name)
table->hash2 = table->hash + (table->mask + 1);
for (i = 0; i <= table->mask; i++) {
- INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i);
+ INIT_HLIST_HEAD(&table->hash[i].head);
table->hash[i].count = 0;
spin_lock_init(&table->hash[i].lock);
}
for (i = 0; i <= table->mask; i++) {
- INIT_HLIST_NULLS_HEAD(&table->hash2[i].head, i);
+ INIT_HLIST_HEAD(&table->hash2[i].head);
table->hash2[i].count = 0;
spin_lock_init(&table->hash2[i].lock);
}
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
index df1966f3b6ec..3d5ccf4b1412 100644
--- a/net/ipv4/udp_diag.c
+++ b/net/ipv4/udp_diag.c
@@ -36,10 +36,11 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
const struct inet_diag_req_v2 *req)
{
int err = -EINVAL;
- struct sock *sk;
+ struct sock *sk = NULL;
struct sk_buff *rep;
struct net *net = sock_net(in_skb->sk);
+ rcu_read_lock();
if (req->sdiag_family == AF_INET)
sk = __udp4_lib_lookup(net,
req->id.idiag_src[0], req->id.idiag_sport,
@@ -54,9 +55,9 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
req->id.idiag_dport,
req->id.idiag_if, tbl, NULL);
#endif
- else
- goto out_nosk;
-
+ if (sk && !atomic_inc_not_zero(&sk->sk_refcnt))
+ sk = NULL;
+ rcu_read_unlock();
err = -ENOENT;
if (!sk)
goto out_nosk;
@@ -96,24 +97,23 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb,
struct netlink_callback *cb,
const struct inet_diag_req_v2 *r, struct nlattr *bc)
{
- int num, s_num, slot, s_slot;
struct net *net = sock_net(skb->sk);
+ int num, s_num, slot, s_slot;
s_slot = cb->args[0];
num = s_num = cb->args[1];
for (slot = s_slot; slot <= table->mask; s_num = 0, slot++) {
- struct sock *sk;
- struct hlist_nulls_node *node;
struct udp_hslot *hslot = &table->hash[slot];
+ struct sock *sk;
num = 0;
- if (hlist_nulls_empty(&hslot->head))
+ if (hlist_empty(&hslot->head))
continue;
spin_lock_bh(&hslot->lock);
- sk_nulls_for_each(sk, node, &hslot->head) {
+ sk_for_each(sk, &hslot->head) {
struct inet_sock *inet = inet_sk(sk);
if (!net_eq(sock_net(sk), net))
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 0ed2dafb7cc4..6230cf4b0d2d 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -14,18 +14,6 @@
#include <net/udp.h>
#include <net/protocol.h>
-static DEFINE_SPINLOCK(udp_offload_lock);
-static struct udp_offload_priv __rcu *udp_offload_base __read_mostly;
-
-#define udp_deref_protected(X) rcu_dereference_protected(X, lockdep_is_held(&udp_offload_lock))
-
-struct udp_offload_priv {
- struct udp_offload *offload;
- possible_net_t net;
- struct rcu_head rcu;
- struct udp_offload_priv __rcu *next;
-};
-
static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
netdev_features_t features,
struct sk_buff *(*gso_inner_segment)(struct sk_buff *skb,
@@ -179,6 +167,7 @@ out_unlock:
return segs;
}
+EXPORT_SYMBOL(skb_udp_tunnel_segment);
static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
netdev_features_t features)
@@ -253,64 +242,14 @@ out:
return segs;
}
-int udp_add_offload(struct net *net, struct udp_offload *uo)
-{
- struct udp_offload_priv *new_offload = kzalloc(sizeof(*new_offload), GFP_ATOMIC);
-
- if (!new_offload)
- return -ENOMEM;
-
- write_pnet(&new_offload->net, net);
- new_offload->offload = uo;
-
- spin_lock(&udp_offload_lock);
- new_offload->next = udp_offload_base;
- rcu_assign_pointer(udp_offload_base, new_offload);
- spin_unlock(&udp_offload_lock);
-
- return 0;
-}
-EXPORT_SYMBOL(udp_add_offload);
-
-static void udp_offload_free_routine(struct rcu_head *head)
-{
- struct udp_offload_priv *ou_priv = container_of(head, struct udp_offload_priv, rcu);
- kfree(ou_priv);
-}
-
-void udp_del_offload(struct udp_offload *uo)
-{
- struct udp_offload_priv __rcu **head = &udp_offload_base;
- struct udp_offload_priv *uo_priv;
-
- spin_lock(&udp_offload_lock);
-
- uo_priv = udp_deref_protected(*head);
- for (; uo_priv != NULL;
- uo_priv = udp_deref_protected(*head)) {
- if (uo_priv->offload == uo) {
- rcu_assign_pointer(*head,
- udp_deref_protected(uo_priv->next));
- goto unlock;
- }
- head = &uo_priv->next;
- }
- pr_warn("udp_del_offload: didn't find offload for port %d\n", ntohs(uo->port));
-unlock:
- spin_unlock(&udp_offload_lock);
- if (uo_priv)
- call_rcu(&uo_priv->rcu, udp_offload_free_routine);
-}
-EXPORT_SYMBOL(udp_del_offload);
-
struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,
- struct udphdr *uh)
+ struct udphdr *uh, udp_lookup_t lookup)
{
- struct udp_offload_priv *uo_priv;
struct sk_buff *p, **pp = NULL;
struct udphdr *uh2;
unsigned int off = skb_gro_offset(skb);
int flush = 1;
+ struct sock *sk;
if (NAPI_GRO_CB(skb)->encap_mark ||
(skb->ip_summed != CHECKSUM_PARTIAL &&
@@ -322,13 +261,10 @@ struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,
NAPI_GRO_CB(skb)->encap_mark = 1;
rcu_read_lock();
- uo_priv = rcu_dereference(udp_offload_base);
- for (; uo_priv != NULL; uo_priv = rcu_dereference(uo_priv->next)) {
- if (net_eq(read_pnet(&uo_priv->net), dev_net(skb->dev)) &&
- uo_priv->offload->port == uh->dest &&
- uo_priv->offload->callbacks.gro_receive)
- goto unflush;
- }
+ sk = (*lookup)(skb, uh->source, uh->dest);
+
+ if (sk && udp_sk(sk)->gro_receive)
+ goto unflush;
goto out_unlock;
unflush:
@@ -352,9 +288,7 @@ unflush:
skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */
skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr));
- NAPI_GRO_CB(skb)->proto = uo_priv->offload->ipproto;
- pp = uo_priv->offload->callbacks.gro_receive(head, skb,
- uo_priv->offload);
+ pp = udp_sk(sk)->gro_receive(sk, head, skb);
out_unlock:
rcu_read_unlock();
@@ -362,6 +296,7 @@ out:
NAPI_GRO_CB(skb)->flush |= flush;
return pp;
}
+EXPORT_SYMBOL(udp_gro_receive);
static struct sk_buff **udp4_gro_receive(struct sk_buff **head,
struct sk_buff *skb)
@@ -383,39 +318,28 @@ static struct sk_buff **udp4_gro_receive(struct sk_buff **head,
inet_gro_compute_pseudo);
skip:
NAPI_GRO_CB(skb)->is_ipv6 = 0;
- return udp_gro_receive(head, skb, uh);
+ return udp_gro_receive(head, skb, uh, udp4_lib_lookup_skb);
flush:
NAPI_GRO_CB(skb)->flush = 1;
return NULL;
}
-int udp_gro_complete(struct sk_buff *skb, int nhoff)
+int udp_gro_complete(struct sk_buff *skb, int nhoff,
+ udp_lookup_t lookup)
{
- struct udp_offload_priv *uo_priv;
__be16 newlen = htons(skb->len - nhoff);
struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
int err = -ENOSYS;
+ struct sock *sk;
uh->len = newlen;
rcu_read_lock();
-
- uo_priv = rcu_dereference(udp_offload_base);
- for (; uo_priv != NULL; uo_priv = rcu_dereference(uo_priv->next)) {
- if (net_eq(read_pnet(&uo_priv->net), dev_net(skb->dev)) &&
- uo_priv->offload->port == uh->dest &&
- uo_priv->offload->callbacks.gro_complete)
- break;
- }
-
- if (uo_priv) {
- NAPI_GRO_CB(skb)->proto = uo_priv->offload->ipproto;
- err = uo_priv->offload->callbacks.gro_complete(skb,
- nhoff + sizeof(struct udphdr),
- uo_priv->offload);
- }
-
+ sk = (*lookup)(skb, uh->source, uh->dest);
+ if (sk && udp_sk(sk)->gro_complete)
+ err = udp_sk(sk)->gro_complete(sk, skb,
+ nhoff + sizeof(struct udphdr));
rcu_read_unlock();
if (skb->remcsum_offload)
@@ -426,6 +350,7 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff)
return err;
}
+EXPORT_SYMBOL(udp_gro_complete);
static int udp4_gro_complete(struct sk_buff *skb, int nhoff)
{
@@ -440,7 +365,7 @@ static int udp4_gro_complete(struct sk_buff *skb, int nhoff)
skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL;
}
- return udp_gro_complete(skb, nhoff);
+ return udp_gro_complete(skb, nhoff, udp4_lib_lookup_skb);
}
static const struct net_offload udpv4_offload = {
diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c
index 96599d1a1318..47f12c73d959 100644
--- a/net/ipv4/udp_tunnel.c
+++ b/net/ipv4/udp_tunnel.c
@@ -69,6 +69,8 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock,
udp_sk(sk)->encap_type = cfg->encap_type;
udp_sk(sk)->encap_rcv = cfg->encap_rcv;
udp_sk(sk)->encap_destroy = cfg->encap_destroy;
+ udp_sk(sk)->gro_receive = cfg->gro_receive;
+ udp_sk(sk)->gro_complete = cfg->gro_complete;
udp_tunnel_encap_enable(sock);
}