diff options
Diffstat (limited to 'net/ipv4')
-rw-r--r-- | net/ipv4/af_inet.c | 3 | ||||
-rw-r--r-- | net/ipv4/cipso_ipv4.c | 3 | ||||
-rw-r--r-- | net/ipv4/fou.c | 48 | ||||
-rw-r--r-- | net/ipv4/inet_connection_sock.c | 3 | ||||
-rw-r--r-- | net/ipv4/inet_diag.c | 10 | ||||
-rw-r--r-- | net/ipv4/inet_hashtables.c | 80 | ||||
-rw-r--r-- | net/ipv4/ip_sockglue.c | 13 | ||||
-rw-r--r-- | net/ipv4/ip_tunnel_core.c | 16 | ||||
-rw-r--r-- | net/ipv4/ping.c | 7 | ||||
-rw-r--r-- | net/ipv4/raw.c | 13 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 22 | ||||
-rw-r--r-- | net/ipv4/tcp_input.c | 54 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 82 | ||||
-rw-r--r-- | net/ipv4/tcp_minisocks.c | 5 | ||||
-rw-r--r-- | net/ipv4/udp.c | 354 | ||||
-rw-r--r-- | net/ipv4/udp_diag.c | 18 | ||||
-rw-r--r-- | net/ipv4/udp_offload.c | 113 | ||||
-rw-r--r-- | net/ipv4/udp_tunnel.c | 2 |
18 files changed, 328 insertions, 518 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 9e481992dbae..8217cd22f921 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -948,6 +948,7 @@ const struct proto_ops inet_dgram_ops = { .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, .sendpage = inet_sendpage, + .set_peek_off = sk_set_peek_off, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_sock_common_setsockopt, .compat_getsockopt = compat_sock_common_getsockopt, @@ -1106,7 +1107,7 @@ static int inet_sk_reselect_saddr(struct sock *sk) struct ip_options_rcu *inet_opt; inet_opt = rcu_dereference_protected(inet->inet_opt, - sock_owned_by_user(sk)); + lockdep_sock_is_held(sk)); if (inet_opt && inet_opt->opt.srr) daddr = inet_opt->opt.faddr; diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index bdb2a07ec363..40d6b87713a1 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -1933,7 +1933,8 @@ int cipso_v4_sock_setattr(struct sock *sk, sk_inet = inet_sk(sk); - old = rcu_dereference_protected(sk_inet->inet_opt, sock_owned_by_user(sk)); + old = rcu_dereference_protected(sk_inet->inet_opt, + lockdep_sock_is_held(sk)); if (sk_inet->is_icsk) { sk_conn = inet_csk(sk); if (old) diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index a39068b4a4d9..d039f8fff57f 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -22,7 +22,6 @@ struct fou { u8 flags; __be16 port; u16 type; - struct udp_offload udp_offloads; struct list_head list; struct rcu_head rcu; }; @@ -186,13 +185,13 @@ drop: return 0; } -static struct sk_buff **fou_gro_receive(struct sk_buff **head, - struct sk_buff *skb, - struct udp_offload *uoff) +static struct sk_buff **fou_gro_receive(struct sock *sk, + struct sk_buff **head, + struct sk_buff *skb) { const struct net_offload *ops; struct sk_buff **pp = NULL; - u8 proto = NAPI_GRO_CB(skb)->proto; + u8 proto = fou_from_sock(sk)->protocol; const struct net_offload **offloads; /* We can clear the encap_mark for FOU as we are essentially doing @@ -220,11 +219,11 @@ out_unlock: return pp; } -static int fou_gro_complete(struct sk_buff *skb, int nhoff, - struct udp_offload *uoff) +static int fou_gro_complete(struct sock *sk, struct sk_buff *skb, + int nhoff) { const struct net_offload *ops; - u8 proto = NAPI_GRO_CB(skb)->proto; + u8 proto = fou_from_sock(sk)->protocol; int err = -ENOSYS; const struct net_offload **offloads; @@ -267,9 +266,9 @@ static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off, return guehdr; } -static struct sk_buff **gue_gro_receive(struct sk_buff **head, - struct sk_buff *skb, - struct udp_offload *uoff) +static struct sk_buff **gue_gro_receive(struct sock *sk, + struct sk_buff **head, + struct sk_buff *skb) { const struct net_offload **offloads; const struct net_offload *ops; @@ -280,7 +279,7 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head, void *data; u16 doffset = 0; int flush = 1; - struct fou *fou = container_of(uoff, struct fou, udp_offloads); + struct fou *fou = fou_from_sock(sk); struct gro_remcsum grc; skb_gro_remcsum_init(&grc); @@ -392,8 +391,7 @@ out: return pp; } -static int gue_gro_complete(struct sk_buff *skb, int nhoff, - struct udp_offload *uoff) +static int gue_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff) { const struct net_offload **offloads; struct guehdr *guehdr = (struct guehdr *)(skb->data + nhoff); @@ -441,10 +439,7 @@ static int fou_add_to_port_list(struct net *net, struct fou *fou) static void fou_release(struct fou *fou) { struct socket *sock = fou->sock; - struct sock *sk = sock->sk; - if (sk->sk_family == AF_INET) - udp_del_offload(&fou->udp_offloads); list_del(&fou->list); udp_tunnel_sock_release(sock); @@ -454,11 +449,9 @@ static void fou_release(struct fou *fou) static int fou_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg *cfg) { udp_sk(sk)->encap_rcv = fou_udp_recv; - fou->protocol = cfg->protocol; - fou->udp_offloads.callbacks.gro_receive = fou_gro_receive; - fou->udp_offloads.callbacks.gro_complete = fou_gro_complete; - fou->udp_offloads.port = cfg->udp_config.local_udp_port; - fou->udp_offloads.ipproto = cfg->protocol; + udp_sk(sk)->gro_receive = fou_gro_receive; + udp_sk(sk)->gro_complete = fou_gro_complete; + fou_from_sock(sk)->protocol = cfg->protocol; return 0; } @@ -466,9 +459,8 @@ static int fou_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg *cfg) static int gue_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg *cfg) { udp_sk(sk)->encap_rcv = gue_udp_recv; - fou->udp_offloads.callbacks.gro_receive = gue_gro_receive; - fou->udp_offloads.callbacks.gro_complete = gue_gro_complete; - fou->udp_offloads.port = cfg->udp_config.local_udp_port; + udp_sk(sk)->gro_receive = gue_gro_receive; + udp_sk(sk)->gro_complete = gue_gro_complete; return 0; } @@ -527,12 +519,6 @@ static int fou_create(struct net *net, struct fou_cfg *cfg, sk->sk_allocation = GFP_ATOMIC; - if (cfg->udp_config.family == AF_INET) { - err = udp_add_offload(net, &fou->udp_offloads); - if (err) - goto error; - } - err = fou_add_to_port_list(net, fou); if (err) goto error; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index bc5196ea1bdf..ab69da2d2a77 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -661,6 +661,9 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, inet_sk(newsk)->inet_sport = htons(inet_rsk(req)->ir_num); newsk->sk_write_space = sk_stream_write_space; + /* listeners have SOCK_RCU_FREE, not the children */ + sock_reset_flag(newsk, SOCK_RCU_FREE); + newsk->sk_mark = inet_rsk(req)->ir_mark; atomic64_set(&newsk->sk_cookie, atomic64_read(&inet_rsk(req)->ir_cookie)); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 5fdb02f5598e..bd591eb81ec9 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -356,6 +356,7 @@ struct sock *inet_diag_find_one_icsk(struct net *net, { struct sock *sk; + rcu_read_lock(); if (req->sdiag_family == AF_INET) sk = inet_lookup(net, hashinfo, NULL, 0, req->id.idiag_dst[0], req->id.idiag_dport, req->id.idiag_src[0], @@ -376,9 +377,11 @@ struct sock *inet_diag_find_one_icsk(struct net *net, req->id.idiag_if); } #endif - else + else { + rcu_read_unlock(); return ERR_PTR(-EINVAL); - + } + rcu_read_unlock(); if (!sk) return ERR_PTR(-ENOENT); @@ -772,13 +775,12 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, for (i = s_i; i < INET_LHTABLE_SIZE; i++) { struct inet_listen_hashbucket *ilb; - struct hlist_nulls_node *node; struct sock *sk; num = 0; ilb = &hashinfo->listening_hash[i]; spin_lock_bh(&ilb->lock); - sk_nulls_for_each(sk, node, &ilb->head) { + sk_for_each(sk, &ilb->head) { struct inet_sock *inet = inet_sk(sk); if (!net_eq(sock_net(sk), net)) diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index bc68eced0105..fcadb670f50b 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -198,13 +198,13 @@ static inline int compute_score(struct sock *sk, struct net *net, } /* - * Don't inline this cruft. Here are some nice properties to exploit here. The - * BSD API does not allow a listening sock to specify the remote port nor the + * Here are some nice properties to exploit here. The BSD API + * does not allow a listening sock to specify the remote port nor the * remote address for the connection. So always assume those are both * wildcarded during the search since they can never be otherwise. */ - +/* called with rcu_read_lock() : No refcount taken on the socket */ struct sock *__inet_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, @@ -212,38 +212,27 @@ struct sock *__inet_lookup_listener(struct net *net, const __be32 daddr, const unsigned short hnum, const int dif) { - struct sock *sk, *result; - struct hlist_nulls_node *node; unsigned int hash = inet_lhashfn(net, hnum); struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; - int score, hiscore, matches = 0, reuseport = 0; - bool select_ok = true; + int score, hiscore = 0, matches = 0, reuseport = 0; + struct sock *sk, *result = NULL; u32 phash = 0; - rcu_read_lock(); -begin: - result = NULL; - hiscore = 0; - sk_nulls_for_each_rcu(sk, node, &ilb->head) { + sk_for_each_rcu(sk, &ilb->head) { score = compute_score(sk, net, hnum, daddr, dif); if (score > hiscore) { - result = sk; - hiscore = score; reuseport = sk->sk_reuseport; if (reuseport) { phash = inet_ehashfn(net, daddr, hnum, saddr, sport); - if (select_ok) { - struct sock *sk2; - sk2 = reuseport_select_sock(sk, phash, - skb, doff); - if (sk2) { - result = sk2; - goto found; - } - } + result = reuseport_select_sock(sk, phash, + skb, doff); + if (result) + return result; matches = 1; } + result = sk; + hiscore = score; } else if (score == hiscore && reuseport) { matches++; if (reciprocal_scale(phash, matches) == 0) @@ -251,25 +240,6 @@ begin: phash = next_pseudo_random32(phash); } } - /* - * if the nulls value we got at the end of this lookup is - * not the expected one, we must restart lookup. - * We probably met an item that was moved to another chain. - */ - if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE) - goto begin; - if (result) { -found: - if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt))) - result = NULL; - else if (unlikely(compute_score(result, net, hnum, daddr, - dif) < hiscore)) { - sock_put(result); - select_ok = false; - goto begin; - } - } - rcu_read_unlock(); return result; } EXPORT_SYMBOL_GPL(__inet_lookup_listener); @@ -312,7 +282,6 @@ struct sock *__inet_lookup_established(struct net *net, unsigned int slot = hash & hashinfo->ehash_mask; struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; - rcu_read_lock(); begin: sk_nulls_for_each_rcu(sk, node, &head->chain) { if (sk->sk_hash != hash) @@ -339,7 +308,6 @@ begin: out: sk = NULL; found: - rcu_read_unlock(); return sk; } EXPORT_SYMBOL_GPL(__inet_lookup_established); @@ -471,10 +439,9 @@ static int inet_reuseport_add_sock(struct sock *sk, bool match_wildcard)) { struct sock *sk2; - struct hlist_nulls_node *node; kuid_t uid = sock_i_uid(sk); - sk_nulls_for_each_rcu(sk2, node, &ilb->head) { + sk_for_each_rcu(sk2, &ilb->head) { if (sk2 != sk && sk2->sk_family == sk->sk_family && ipv6_only_sock(sk2) == ipv6_only_sock(sk) && @@ -512,7 +479,8 @@ int __inet_hash(struct sock *sk, struct sock *osk, if (err) goto unlock; } - __sk_nulls_add_node_rcu(sk, &ilb->head); + hlist_add_head_rcu(&sk->sk_node, &ilb->head); + sock_set_flag(sk, SOCK_RCU_FREE); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); unlock: spin_unlock(&ilb->lock); @@ -539,20 +507,25 @@ void inet_unhash(struct sock *sk) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; spinlock_t *lock; + bool listener = false; int done; if (sk_unhashed(sk)) return; - if (sk->sk_state == TCP_LISTEN) + if (sk->sk_state == TCP_LISTEN) { lock = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)].lock; - else + listener = true; + } else { lock = inet_ehash_lockp(hashinfo, sk->sk_hash); - + } spin_lock_bh(lock); if (rcu_access_pointer(sk->sk_reuseport_cb)) reuseport_detach_sock(sk); - done = __sk_nulls_del_node_init_rcu(sk); + if (listener) + done = __sk_del_node_init(sk); + else + done = __sk_nulls_del_node_init_rcu(sk); if (done) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); spin_unlock_bh(lock); @@ -688,9 +661,8 @@ void inet_hashinfo_init(struct inet_hashinfo *h) for (i = 0; i < INET_LHTABLE_SIZE; i++) { spin_lock_init(&h->listening_hash[i].lock); - INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].head, - i + LISTENING_NULLS_BASE); - } + INIT_HLIST_HEAD(&h->listening_hash[i].head); + } } EXPORT_SYMBOL_GPL(inet_hashinfo_init); diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 035ad645a8d9..89b5f3bd6694 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -219,11 +219,12 @@ void ip_cmsg_recv_offset(struct msghdr *msg, struct sk_buff *skb, } EXPORT_SYMBOL(ip_cmsg_recv_offset); -int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc, +int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc, bool allow_ipv6) { int err, val; struct cmsghdr *cmsg; + struct net *net = sock_net(sk); for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) @@ -244,6 +245,12 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc, continue; } #endif + if (cmsg->cmsg_level == SOL_SOCKET) { + if (__sock_cmsg_send(sk, msg, cmsg, &ipc->sockc)) + return -EINVAL; + continue; + } + if (cmsg->cmsg_level != SOL_IP) continue; switch (cmsg->cmsg_type) { @@ -635,7 +642,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, if (err) break; old = rcu_dereference_protected(inet->inet_opt, - sock_owned_by_user(sk)); + lockdep_sock_is_held(sk)); if (inet->is_icsk) { struct inet_connection_sock *icsk = inet_csk(sk); #if IS_ENABLED(CONFIG_IPV6) @@ -1295,7 +1302,7 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, struct ip_options_rcu *inet_opt; inet_opt = rcu_dereference_protected(inet->inet_opt, - sock_owned_by_user(sk)); + lockdep_sock_is_held(sk)); opt->optlen = 0; if (inet_opt) memcpy(optbuf, &inet_opt->opt, diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 6165f30c4d72..43445df61efd 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -86,15 +86,15 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, } EXPORT_SYMBOL_GPL(iptunnel_xmit); -int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto, - bool xnet) +int __iptunnel_pull_header(struct sk_buff *skb, int hdr_len, + __be16 inner_proto, bool raw_proto, bool xnet) { if (unlikely(!pskb_may_pull(skb, hdr_len))) return -ENOMEM; skb_pull_rcsum(skb, hdr_len); - if (inner_proto == htons(ETH_P_TEB)) { + if (!raw_proto && inner_proto == htons(ETH_P_TEB)) { struct ethhdr *eh; if (unlikely(!pskb_may_pull(skb, ETH_HLEN))) @@ -117,7 +117,7 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto, return iptunnel_pull_offloads(skb); } -EXPORT_SYMBOL_GPL(iptunnel_pull_header); +EXPORT_SYMBOL_GPL(__iptunnel_pull_header); struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, gfp_t flags) @@ -247,10 +247,10 @@ static int ip_tun_build_state(struct net_device *dev, struct nlattr *attr, tun_info->key.tun_id = nla_get_be64(tb[LWTUNNEL_IP_ID]); if (tb[LWTUNNEL_IP_DST]) - tun_info->key.u.ipv4.dst = nla_get_be32(tb[LWTUNNEL_IP_DST]); + tun_info->key.u.ipv4.dst = nla_get_in_addr(tb[LWTUNNEL_IP_DST]); if (tb[LWTUNNEL_IP_SRC]) - tun_info->key.u.ipv4.src = nla_get_be32(tb[LWTUNNEL_IP_SRC]); + tun_info->key.u.ipv4.src = nla_get_in_addr(tb[LWTUNNEL_IP_SRC]); if (tb[LWTUNNEL_IP_TTL]) tun_info->key.ttl = nla_get_u8(tb[LWTUNNEL_IP_TTL]); @@ -275,8 +275,8 @@ static int ip_tun_fill_encap_info(struct sk_buff *skb, struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate); if (nla_put_be64(skb, LWTUNNEL_IP_ID, tun_info->key.tun_id) || - nla_put_be32(skb, LWTUNNEL_IP_DST, tun_info->key.u.ipv4.dst) || - nla_put_be32(skb, LWTUNNEL_IP_SRC, tun_info->key.u.ipv4.src) || + nla_put_in_addr(skb, LWTUNNEL_IP_DST, tun_info->key.u.ipv4.dst) || + nla_put_in_addr(skb, LWTUNNEL_IP_SRC, tun_info->key.u.ipv4.src) || nla_put_u8(skb, LWTUNNEL_IP_TOS, tun_info->key.tos) || nla_put_u8(skb, LWTUNNEL_IP_TTL, tun_info->key.ttl) || nla_put_be16(skb, LWTUNNEL_IP_FLAGS, tun_info->key.tun_flags)) diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index cf9700b1a106..66ddcb60519a 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -737,6 +737,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) /* no remote port */ } + ipc.sockc.tsflags = sk->sk_tsflags; ipc.addr = inet->inet_saddr; ipc.opt = NULL; ipc.oif = sk->sk_bound_dev_if; @@ -744,10 +745,8 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ipc.ttl = 0; ipc.tos = -1; - sock_tx_timestamp(sk, &ipc.tx_flags); - if (msg->msg_controllen) { - err = ip_cmsg_send(sock_net(sk), msg, &ipc, false); + err = ip_cmsg_send(sk, msg, &ipc, false); if (unlikely(err)) { kfree(ipc.opt); return err; @@ -768,6 +767,8 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) rcu_read_unlock(); } + sock_tx_timestamp(sk, ipc.sockc.tsflags, &ipc.tx_flags); + saddr = ipc.addr; ipc.addr = faddr = daddr; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 8d22de74080c..438f50c1a676 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -339,8 +339,8 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb) static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, struct msghdr *msg, size_t length, - struct rtable **rtp, - unsigned int flags) + struct rtable **rtp, unsigned int flags, + const struct sockcm_cookie *sockc) { struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); @@ -379,7 +379,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, skb->ip_summed = CHECKSUM_NONE; - sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags); + sock_tx_timestamp(sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags); skb->transport_header = skb->network_header; err = -EFAULT; @@ -540,6 +540,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) daddr = inet->inet_daddr; } + ipc.sockc.tsflags = sk->sk_tsflags; ipc.addr = inet->inet_saddr; ipc.opt = NULL; ipc.tx_flags = 0; @@ -548,7 +549,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ipc.oif = sk->sk_bound_dev_if; if (msg->msg_controllen) { - err = ip_cmsg_send(net, msg, &ipc, false); + err = ip_cmsg_send(sk, msg, &ipc, false); if (unlikely(err)) { kfree(ipc.opt); goto out; @@ -638,10 +639,10 @@ back_from_confirm: if (inet->hdrincl) err = raw_send_hdrinc(sk, &fl4, msg, len, - &rt, msg->msg_flags); + &rt, msg->msg_flags, &ipc.sockc); else { - sock_tx_timestamp(sk, &ipc.tx_flags); + sock_tx_timestamp(sk, ipc.sockc.tsflags, &ipc.tx_flags); if (!ipc.addr) ipc.addr = fl4.daddr; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 08b8b960a8ed..4d73858991af 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -428,14 +428,16 @@ void tcp_init_sock(struct sock *sk) } EXPORT_SYMBOL(tcp_init_sock); -static void tcp_tx_timestamp(struct sock *sk, struct sk_buff *skb) +static void tcp_tx_timestamp(struct sock *sk, u16 tsflags, struct sk_buff *skb) { - if (sk->sk_tsflags) { + if (sk->sk_tsflags || tsflags) { struct skb_shared_info *shinfo = skb_shinfo(skb); + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); - sock_tx_timestamp(sk, &shinfo->tx_flags); + sock_tx_timestamp(sk, tsflags, &shinfo->tx_flags); if (shinfo->tx_flags & SKBTX_ANY_TSTAMP) shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1; + tcb->txstamp_ack = !!(shinfo->tx_flags & SKBTX_ACK_TSTAMP); } } @@ -957,7 +959,7 @@ new_segment: offset += copy; size -= copy; if (!size) { - tcp_tx_timestamp(sk, skb); + tcp_tx_timestamp(sk, sk->sk_tsflags, skb); goto out; } @@ -1077,6 +1079,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; + struct sockcm_cookie sockc; int flags, err, copied = 0; int mss_now = 0, size_goal, copied_syn = 0; bool sg; @@ -1119,6 +1122,15 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) /* 'common' sending to sendq */ } + sockc.tsflags = sk->sk_tsflags; + if (msg->msg_controllen) { + err = sock_cmsg_send(sk, msg, &sockc); + if (unlikely(err)) { + err = -EINVAL; + goto out_err; + } + } + /* This should be in poll */ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); @@ -1237,7 +1249,7 @@ new_segment: copied += copy; if (!msg_data_left(msg)) { - tcp_tx_timestamp(sk, skb); + tcp_tx_timestamp(sk, sockc.tsflags, skb); goto out; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e6e65f79ade8..983f04c11177 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2252,16 +2252,6 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit) } } -/* CWND moderation, preventing bursts due to too big ACKs - * in dubious situations. - */ -static inline void tcp_moderate_cwnd(struct tcp_sock *tp) -{ - tp->snd_cwnd = min(tp->snd_cwnd, - tcp_packets_in_flight(tp) + tcp_max_burst(tp)); - tp->snd_cwnd_stamp = tcp_time_stamp; -} - static bool tcp_tsopt_ecr_before(const struct tcp_sock *tp, u32 when) { return tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && @@ -2410,7 +2400,6 @@ static bool tcp_try_undo_recovery(struct sock *sk) /* Hold old state until something *above* high_seq * is ACKed. For Reno it is MUST to prevent false * fast retransmits (RFC2582). SACK TCP is safe. */ - tcp_moderate_cwnd(tp); if (!tcp_any_retrans_done(sk)) tp->retrans_stamp = 0; return true; @@ -3093,7 +3082,7 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, const struct skb_shared_info *shinfo; /* Avoid cache line misses to get skb_shinfo() and shinfo->tx_flags */ - if (likely(!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_ACK))) + if (likely(!TCP_SKB_CB(skb)->txstamp_ack)) return; shinfo = skb_shinfo(skb); @@ -4318,6 +4307,12 @@ static bool tcp_try_coalesce(struct sock *sk, return true; } +static void tcp_drop(struct sock *sk, struct sk_buff *skb) +{ + sk_drops_add(sk, skb); + __kfree_skb(skb); +} + /* This one checks to see if we can put data from the * out_of_order queue into the receive_queue. */ @@ -4342,7 +4337,7 @@ static void tcp_ofo_queue(struct sock *sk) __skb_unlink(skb, &tp->out_of_order_queue); if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { SOCK_DEBUG(sk, "ofo packet was already received\n"); - __kfree_skb(skb); + tcp_drop(sk, skb); continue; } SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n", @@ -4394,7 +4389,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP); - __kfree_skb(skb); + tcp_drop(sk, skb); return; } @@ -4458,7 +4453,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { /* All the bits are present. Drop. */ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE); - __kfree_skb(skb); + tcp_drop(sk, skb); skb = NULL; tcp_dsack_set(sk, seq, end_seq); goto add_sack; @@ -4497,7 +4492,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE); - __kfree_skb(skb1); + tcp_drop(sk, skb1); } add_sack: @@ -4580,12 +4575,13 @@ err: static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); - int eaten = -1; bool fragstolen = false; + int eaten = -1; - if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) - goto drop; - + if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) { + __kfree_skb(skb); + return; + } skb_dst_drop(skb); __skb_pull(skb, tcp_hdr(skb)->doff * 4); @@ -4667,7 +4663,7 @@ out_of_window: tcp_enter_quickack_mode(sk); inet_csk_schedule_ack(sk); drop: - __kfree_skb(skb); + tcp_drop(sk, skb); return; } @@ -5244,7 +5240,7 @@ syn_challenge: return true; discard: - __kfree_skb(skb); + tcp_drop(sk, skb); return false; } @@ -5462,7 +5458,7 @@ csum_error: TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); discard: - __kfree_skb(skb); + tcp_drop(sk, skb); } EXPORT_SYMBOL(tcp_rcv_established); @@ -5693,7 +5689,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, TCP_DELACK_MAX, TCP_RTO_MAX); discard: - __kfree_skb(skb); + tcp_drop(sk, skb); return 0; } else { tcp_send_ack(sk); @@ -6054,7 +6050,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) if (!queued) { discard: - __kfree_skb(skb); + tcp_drop(sk, skb); } return 0; } @@ -6343,8 +6339,10 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); af_ops->send_synack(sk, dst, &fl, req, &foc, !want_cookie); - if (want_cookie) - goto drop_and_free; + if (want_cookie) { + reqsk_free(req); + return 0; + } } reqsk_put(req); return 0; @@ -6354,7 +6352,7 @@ drop_and_release: drop_and_free: reqsk_free(req); drop: - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); + tcp_listendrop(sk); return 0; } EXPORT_SYMBOL(tcp_conn_request); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ad450509029b..f4f2a0a3849d 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -157,7 +157,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) nexthop = daddr = usin->sin_addr.s_addr; inet_opt = rcu_dereference_protected(inet->inet_opt, - sock_owned_by_user(sk)); + lockdep_sock_is_held(sk)); if (inet_opt && inet_opt->opt.srr) { if (!daddr) return -EINVAL; @@ -329,7 +329,7 @@ void tcp_req_err(struct sock *sk, u32 seq, bool abort) * errors returned from accept(). */ inet_csk_reqsk_queue_drop(req->rsk_listener, req); - NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS); + tcp_listendrop(req->rsk_listener); } reqsk_put(req); } @@ -628,6 +628,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); #ifdef CONFIG_TCP_MD5SIG + rcu_read_lock(); hash_location = tcp_parse_md5sig_option(th); if (sk && sk_fullsock(sk)) { key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *) @@ -646,16 +647,18 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) ntohs(th->source), inet_iif(skb)); /* don't send rst if it can't find key */ if (!sk1) - return; - rcu_read_lock(); + goto out; + key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *) &ip_hdr(skb)->saddr, AF_INET); if (!key) - goto release_sk1; + goto out; + genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb); if (genhash || memcmp(hash_location, newhash, 16) != 0) - goto release_sk1; + goto out; + } if (key) { @@ -698,11 +701,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS); #ifdef CONFIG_TCP_MD5SIG -release_sk1: - if (sk1) { - rcu_read_unlock(); - sock_put(sk1); - } +out: + rcu_read_unlock(); #endif } @@ -882,8 +882,7 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk, /* caller either holds rcu_read_lock() or socket lock */ md5sig = rcu_dereference_check(tp->md5sig_info, - sock_owned_by_user(sk) || - lockdep_is_held((spinlock_t *)&sk->sk_lock.slock)); + lockdep_sock_is_held(sk)); if (!md5sig) return NULL; #if IS_ENABLED(CONFIG_IPV6) @@ -928,8 +927,7 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, } md5sig = rcu_dereference_protected(tp->md5sig_info, - sock_owned_by_user(sk) || - lockdep_is_held(&sk->sk_lock.slock)); + lockdep_sock_is_held(sk)); if (!md5sig) { md5sig = kmalloc(sizeof(*md5sig), gfp); if (!md5sig) @@ -1246,7 +1244,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) &tcp_request_sock_ipv4_ops, sk, skb); drop: - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); + tcp_listendrop(sk); return 0; } EXPORT_SYMBOL(tcp_v4_conn_request); @@ -1348,7 +1346,7 @@ exit_overflow: exit_nonewsk: dst_release(dst); exit: - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); + tcp_listendrop(sk); return NULL; put_and_exit: inet_csk_prepare_forced_close(newsk); @@ -1538,11 +1536,12 @@ EXPORT_SYMBOL(tcp_prequeue); int tcp_v4_rcv(struct sk_buff *skb) { + struct net *net = dev_net(skb->dev); const struct iphdr *iph; const struct tcphdr *th; + bool refcounted; struct sock *sk; int ret; - struct net *net = dev_net(skb->dev); if (skb->pkt_type != PACKET_HOST) goto discard_it; @@ -1588,7 +1587,7 @@ int tcp_v4_rcv(struct sk_buff *skb) lookup: sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source, - th->dest); + th->dest, &refcounted); if (!sk) goto no_tcp_socket; @@ -1609,7 +1608,11 @@ process: inet_csk_reqsk_queue_drop_and_put(sk, req); goto lookup; } + /* We own a reference on the listener, increase it again + * as we might lose it too soon. + */ sock_hold(sk); + refcounted = true; nsk = tcp_check_req(sk, skb, req, false); if (!nsk) { reqsk_put(req); @@ -1665,7 +1668,8 @@ process: bh_unlock_sock(sk); put_and_return: - sock_put(sk); + if (refcounted) + sock_put(sk); return ret; @@ -1688,7 +1692,9 @@ discard_it: return 0; discard_and_relse: - sock_put(sk); + sk_drops_add(sk, skb); + if (refcounted) + sock_put(sk); goto discard_it; do_time_wait: @@ -1712,6 +1718,7 @@ do_time_wait: if (sk2) { inet_twsk_deschedule_put(inet_twsk(sk)); sk = sk2; + refcounted = false; goto process; } /* Fall through to ACK */ @@ -1845,17 +1852,17 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock); */ static void *listening_get_next(struct seq_file *seq, void *cur) { - struct inet_connection_sock *icsk; - struct hlist_nulls_node *node; - struct sock *sk = cur; - struct inet_listen_hashbucket *ilb; struct tcp_iter_state *st = seq->private; struct net *net = seq_file_net(seq); + struct inet_listen_hashbucket *ilb; + struct inet_connection_sock *icsk; + struct sock *sk = cur; if (!sk) { +get_head: ilb = &tcp_hashinfo.listening_hash[st->bucket]; spin_lock_bh(&ilb->lock); - sk = sk_nulls_head(&ilb->head); + sk = sk_head(&ilb->head); st->offset = 0; goto get_sk; } @@ -1863,28 +1870,20 @@ static void *listening_get_next(struct seq_file *seq, void *cur) ++st->num; ++st->offset; - sk = sk_nulls_next(sk); + sk = sk_next(sk); get_sk: - sk_nulls_for_each_from(sk, node) { + sk_for_each_from(sk) { if (!net_eq(sock_net(sk), net)) continue; - if (sk->sk_family == st->family) { - cur = sk; - goto out; - } + if (sk->sk_family == st->family) + return sk; icsk = inet_csk(sk); } spin_unlock_bh(&ilb->lock); st->offset = 0; - if (++st->bucket < INET_LHTABLE_SIZE) { - ilb = &tcp_hashinfo.listening_hash[st->bucket]; - spin_lock_bh(&ilb->lock); - sk = sk_nulls_head(&ilb->head); - goto get_sk; - } - cur = NULL; -out: - return cur; + if (++st->bucket < INET_LHTABLE_SIZE) + goto get_head; + return NULL; } static void *listening_get_idx(struct seq_file *seq, loff_t *pos) @@ -2383,6 +2382,7 @@ static int __net_init tcp_sk_init(struct net *net) IPPROTO_TCP, net); if (res) goto fail; + sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk; } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index acb366dd61e6..4c53e7c86586 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -704,7 +704,10 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rsk_rcv_wnd)) { /* Out of window: send ACK and drop. */ - if (!(flg & TCP_FLAG_RST)) + if (!(flg & TCP_FLAG_RST) && + !tcp_oow_rate_limited(sock_net(sk), skb, + LINUX_MIB_TCPACKSKIPPEDSYNRECV, + &tcp_rsk(req)->last_oow_ack_time)) req->rsk_ops->send_ack(sk, skb, req); if (paws_reject) NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 08eed5e16df0..3563788d064f 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -143,10 +143,9 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num, unsigned int log) { struct sock *sk2; - struct hlist_nulls_node *node; kuid_t uid = sock_i_uid(sk); - sk_nulls_for_each(sk2, node, &hslot->head) { + sk_for_each(sk2, &hslot->head) { if (net_eq(sock_net(sk2), net) && sk2 != sk && (bitmap || udp_sk(sk2)->udp_port_hash == num) && @@ -177,12 +176,11 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num, bool match_wildcard)) { struct sock *sk2; - struct hlist_nulls_node *node; kuid_t uid = sock_i_uid(sk); int res = 0; spin_lock(&hslot2->lock); - udp_portaddr_for_each_entry(sk2, node, &hslot2->head) { + udp_portaddr_for_each_entry(sk2, &hslot2->head) { if (net_eq(sock_net(sk2), net) && sk2 != sk && (udp_sk(sk2)->udp_port_hash == num) && @@ -207,11 +205,10 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot, bool match_wildcard)) { struct net *net = sock_net(sk); - struct hlist_nulls_node *node; kuid_t uid = sock_i_uid(sk); struct sock *sk2; - sk_nulls_for_each(sk2, node, &hslot->head) { + sk_for_each(sk2, &hslot->head) { if (net_eq(sock_net(sk2), net) && sk2 != sk && sk2->sk_family == sk->sk_family && @@ -333,17 +330,18 @@ found: goto fail_unlock; } - sk_nulls_add_node_rcu(sk, &hslot->head); + sk_add_node_rcu(sk, &hslot->head); hslot->count++; sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); spin_lock(&hslot2->lock); - hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, + hlist_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, &hslot2->head); hslot2->count++; spin_unlock(&hslot2->lock); } + sock_set_flag(sk, SOCK_RCU_FREE); error = 0; fail_unlock: spin_unlock_bh(&hslot->lock); @@ -497,37 +495,27 @@ static struct sock *udp4_lib_lookup2(struct net *net, struct sk_buff *skb) { struct sock *sk, *result; - struct hlist_nulls_node *node; int score, badness, matches = 0, reuseport = 0; - bool select_ok = true; u32 hash = 0; -begin: result = NULL; badness = 0; - udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) { + udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { score = compute_score2(sk, net, saddr, sport, daddr, hnum, dif); if (score > badness) { - result = sk; - badness = score; reuseport = sk->sk_reuseport; if (reuseport) { hash = udp_ehashfn(net, daddr, hnum, saddr, sport); - if (select_ok) { - struct sock *sk2; - - sk2 = reuseport_select_sock(sk, hash, skb, + result = reuseport_select_sock(sk, hash, skb, sizeof(struct udphdr)); - if (sk2) { - result = sk2; - select_ok = false; - goto found; - } - } + if (result) + return result; matches = 1; } + badness = score; + result = sk; } else if (score == badness && reuseport) { matches++; if (reciprocal_scale(hash, matches) == 0) @@ -535,23 +523,6 @@ begin: hash = next_pseudo_random32(hash); } } - /* - * if the nulls value we got at the end of this lookup is - * not the expected one, we must restart lookup. - * We probably met an item that was moved to another chain. - */ - if (get_nulls_value(node) != slot2) - goto begin; - if (result) { -found: - if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) - result = NULL; - else if (unlikely(compute_score2(result, net, saddr, sport, - daddr, hnum, dif) < badness)) { - sock_put(result); - goto begin; - } - } return result; } @@ -563,15 +534,12 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, int dif, struct udp_table *udptable, struct sk_buff *skb) { struct sock *sk, *result; - struct hlist_nulls_node *node; unsigned short hnum = ntohs(dport); unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask); struct udp_hslot *hslot2, *hslot = &udptable->hash[slot]; int score, badness, matches = 0, reuseport = 0; - bool select_ok = true; u32 hash = 0; - rcu_read_lock(); if (hslot->count > 10) { hash2 = udp4_portaddr_hash(net, daddr, hnum); slot2 = hash2 & udptable->mask; @@ -593,35 +561,27 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, htonl(INADDR_ANY), hnum, dif, hslot2, slot2, skb); } - rcu_read_unlock(); return result; } begin: result = NULL; badness = 0; - sk_nulls_for_each_rcu(sk, node, &hslot->head) { + sk_for_each_rcu(sk, &hslot->head) { score = compute_score(sk, net, saddr, hnum, sport, daddr, dport, dif); if (score > badness) { - result = sk; - badness = score; reuseport = sk->sk_reuseport; if (reuseport) { hash = udp_ehashfn(net, daddr, hnum, saddr, sport); - if (select_ok) { - struct sock *sk2; - - sk2 = reuseport_select_sock(sk, hash, skb, + result = reuseport_select_sock(sk, hash, skb, sizeof(struct udphdr)); - if (sk2) { - result = sk2; - select_ok = false; - goto found; - } - } + if (result) + return result; matches = 1; } + result = sk; + badness = score; } else if (score == badness && reuseport) { matches++; if (reciprocal_scale(hash, matches) == 0) @@ -629,25 +589,6 @@ begin: hash = next_pseudo_random32(hash); } } - /* - * if the nulls value we got at the end of this lookup is - * not the expected one, we must restart lookup. - * We probably met an item that was moved to another chain. - */ - if (get_nulls_value(node) != slot) - goto begin; - - if (result) { -found: - if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) - result = NULL; - else if (unlikely(compute_score(result, net, saddr, hnum, sport, - daddr, dport, dif) < badness)) { - sock_put(result); - goto begin; - } - } - rcu_read_unlock(); return result; } EXPORT_SYMBOL_GPL(__udp4_lib_lookup); @@ -663,13 +604,37 @@ static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb, udptable, skb); } +struct sock *udp4_lib_lookup_skb(struct sk_buff *skb, + __be16 sport, __be16 dport) +{ + const struct iphdr *iph = ip_hdr(skb); + const struct net_device *dev = + skb_dst(skb) ? skb_dst(skb)->dev : skb->dev; + + return __udp4_lib_lookup(dev_net(dev), iph->saddr, sport, + iph->daddr, dport, inet_iif(skb), + &udp_table, skb); +} +EXPORT_SYMBOL_GPL(udp4_lib_lookup_skb); + +/* Must be called under rcu_read_lock(). + * Does increment socket refcount. + */ +#if IS_ENABLED(CONFIG_NETFILTER_XT_MATCH_SOCKET) || \ + IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TPROXY) struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif) { - return __udp4_lib_lookup(net, saddr, sport, daddr, dport, dif, - &udp_table, NULL); + struct sock *sk; + + sk = __udp4_lib_lookup(net, saddr, sport, daddr, dport, + dif, &udp_table, NULL); + if (sk && !atomic_inc_not_zero(&sk->sk_refcnt)) + sk = NULL; + return sk; } EXPORT_SYMBOL_GPL(udp4_lib_lookup); +#endif static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk, __be16 loc_port, __be32 loc_addr, @@ -771,7 +736,7 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) sk->sk_err = err; sk->sk_error_report(sk); out: - sock_put(sk); + return; } void udp_err(struct sk_buff *skb, u32 info) @@ -1027,15 +992,13 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) */ connected = 1; } - ipc.addr = inet->inet_saddr; + ipc.sockc.tsflags = sk->sk_tsflags; + ipc.addr = inet->inet_saddr; ipc.oif = sk->sk_bound_dev_if; - sock_tx_timestamp(sk, &ipc.tx_flags); - if (msg->msg_controllen) { - err = ip_cmsg_send(sock_net(sk), msg, &ipc, - sk->sk_family == AF_INET6); + err = ip_cmsg_send(sk, msg, &ipc, sk->sk_family == AF_INET6); if (unlikely(err)) { kfree(ipc.opt); return err; @@ -1060,6 +1023,8 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) saddr = ipc.addr; ipc.addr = faddr = daddr; + sock_tx_timestamp(sk, ipc.sockc.tsflags, &ipc.tx_flags); + if (ipc.opt && ipc.opt->opt.srr) { if (!daddr) return -EINVAL; @@ -1342,7 +1307,7 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); struct sk_buff *skb; unsigned int ulen, copied; - int peeked, off = 0; + int peeked, peeking, off; int err; int is_udplite = IS_UDPLITE(sk); bool checksum_valid = false; @@ -1352,15 +1317,16 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, return ip_recv_error(sk, msg, len, addr_len); try_again: + peeking = off = sk_peek_offset(sk, flags); skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0), &peeked, &off, &err); if (!skb) - goto out; + return err; - ulen = skb->len - sizeof(struct udphdr); + ulen = skb->len; copied = len; - if (copied > ulen) - copied = ulen; + if (copied > ulen - off) + copied = ulen - off; else if (copied < ulen) msg->msg_flags |= MSG_TRUNC; @@ -1370,18 +1336,16 @@ try_again: * coverage checksum (UDP-Lite), do it before the copy. */ - if (copied < ulen || UDP_SKB_CB(skb)->partial_cov) { + if (copied < ulen || UDP_SKB_CB(skb)->partial_cov || peeking) { checksum_valid = !udp_lib_checksum_complete(skb); if (!checksum_valid) goto csum_copy_err; } if (checksum_valid || skb_csum_unnecessary(skb)) - err = skb_copy_datagram_msg(skb, sizeof(struct udphdr), - msg, copied); + err = skb_copy_datagram_msg(skb, off, msg, copied); else { - err = skb_copy_and_csum_datagram_msg(skb, sizeof(struct udphdr), - msg); + err = skb_copy_and_csum_datagram_msg(skb, off, msg); if (err == -EINVAL) goto csum_copy_err; @@ -1394,7 +1358,8 @@ try_again: UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_INERRORS, is_udplite); } - goto out_free; + skb_free_datagram_locked(sk, skb); + return err; } if (!peeked) @@ -1418,9 +1383,7 @@ try_again: if (flags & MSG_TRUNC) err = ulen; -out_free: - skb_free_datagram_locked(sk, skb); -out: + __skb_free_datagram_locked(sk, skb, peeking ? -err : err); return err; csum_copy_err: @@ -1474,13 +1437,13 @@ void udp_lib_unhash(struct sock *sk) spin_lock_bh(&hslot->lock); if (rcu_access_pointer(sk->sk_reuseport_cb)) reuseport_detach_sock(sk); - if (sk_nulls_del_node_init_rcu(sk)) { + if (sk_del_node_init_rcu(sk)) { hslot->count--; inet_sk(sk)->inet_num = 0; sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); spin_lock(&hslot2->lock); - hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node); + hlist_del_init_rcu(&udp_sk(sk)->udp_portaddr_node); hslot2->count--; spin_unlock(&hslot2->lock); } @@ -1513,12 +1476,12 @@ void udp_lib_rehash(struct sock *sk, u16 newhash) if (hslot2 != nhslot2) { spin_lock(&hslot2->lock); - hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node); + hlist_del_init_rcu(&udp_sk(sk)->udp_portaddr_node); hslot2->count--; spin_unlock(&hslot2->lock); spin_lock(&nhslot2->lock); - hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, + hlist_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, &nhslot2->head); nhslot2->count++; spin_unlock(&nhslot2->lock); @@ -1548,7 +1511,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) sk_incoming_cpu_update(sk); } - rc = sock_queue_rcv_skb(sk, skb); + rc = __sock_queue_rcv_skb(sk, skb); if (rc < 0) { int is_udplite = IS_UDPLITE(sk); @@ -1664,10 +1627,14 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) } } - if (rcu_access_pointer(sk->sk_filter) && - udp_lib_checksum_complete(skb)) - goto csum_error; + if (rcu_access_pointer(sk->sk_filter)) { + if (udp_lib_checksum_complete(skb)) + goto csum_error; + if (sk_filter(sk, skb)) + goto drop; + } + udp_csum_pull_header(skb); if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) { UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS, is_udplite); @@ -1697,35 +1664,6 @@ drop: return -1; } -static void flush_stack(struct sock **stack, unsigned int count, - struct sk_buff *skb, unsigned int final) -{ - unsigned int i; - struct sk_buff *skb1 = NULL; - struct sock *sk; - - for (i = 0; i < count; i++) { - sk = stack[i]; - if (likely(!skb1)) - skb1 = (i == final) ? skb : skb_clone(skb, GFP_ATOMIC); - - if (!skb1) { - atomic_inc(&sk->sk_drops); - UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS, - IS_UDPLITE(sk)); - UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, - IS_UDPLITE(sk)); - } - - if (skb1 && udp_queue_rcv_skb(sk, skb1) <= 0) - skb1 = NULL; - - sock_put(sk); - } - if (unlikely(skb1)) - kfree_skb(skb1); -} - /* For TCP sockets, sk_rx_dst is protected by socket lock * For UDP, we use xchg() to guard against concurrent changes. */ @@ -1749,14 +1687,14 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, struct udp_table *udptable, int proto) { - struct sock *sk, *stack[256 / sizeof(struct sock *)]; - struct hlist_nulls_node *node; + struct sock *sk, *first = NULL; unsigned short hnum = ntohs(uh->dest); struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum); - int dif = skb->dev->ifindex; - unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node); unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10); - bool inner_flushed = false; + unsigned int offset = offsetof(typeof(*sk), sk_node); + int dif = skb->dev->ifindex; + struct hlist_node *node; + struct sk_buff *nskb; if (use_hash2) { hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) & @@ -1767,23 +1705,28 @@ start_lookup: offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node); } - spin_lock(&hslot->lock); - sk_nulls_for_each_entry_offset(sk, node, &hslot->head, offset) { - if (__udp_is_mcast_sock(net, sk, - uh->dest, daddr, - uh->source, saddr, - dif, hnum)) { - if (unlikely(count == ARRAY_SIZE(stack))) { - flush_stack(stack, count, skb, ~0); - inner_flushed = true; - count = 0; - } - stack[count++] = sk; - sock_hold(sk); + sk_for_each_entry_offset_rcu(sk, node, &hslot->head, offset) { + if (!__udp_is_mcast_sock(net, sk, uh->dest, daddr, + uh->source, saddr, dif, hnum)) + continue; + + if (!first) { + first = sk; + continue; } - } + nskb = skb_clone(skb, GFP_ATOMIC); - spin_unlock(&hslot->lock); + if (unlikely(!nskb)) { + atomic_inc(&sk->sk_drops); + UDP_INC_STATS_BH(net, UDP_MIB_RCVBUFERRORS, + IS_UDPLITE(sk)); + UDP_INC_STATS_BH(net, UDP_MIB_INERRORS, + IS_UDPLITE(sk)); + continue; + } + if (udp_queue_rcv_skb(sk, nskb) > 0) + consume_skb(nskb); + } /* Also lookup *:port if we are using hash2 and haven't done so yet. */ if (use_hash2 && hash2 != hash2_any) { @@ -1791,16 +1734,13 @@ start_lookup: goto start_lookup; } - /* - * do the slow work with no lock held - */ - if (count) { - flush_stack(stack, count, skb, count - 1); + if (first) { + if (udp_queue_rcv_skb(first, skb) > 0) + consume_skb(skb); } else { - if (!inner_flushed) - UDP_INC_STATS_BH(net, UDP_MIB_IGNOREDMULTI, - proto == IPPROTO_UDPLITE); - consume_skb(skb); + kfree_skb(skb); + UDP_INC_STATS_BH(net, UDP_MIB_IGNOREDMULTI, + proto == IPPROTO_UDPLITE); } return 0; } @@ -1897,7 +1837,6 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, inet_compute_pseudo); ret = udp_queue_rcv_skb(sk, skb); - sock_put(sk); /* a return value > 0 means to resubmit the input, but * it wants the return to be -protocol, or 0 @@ -1958,49 +1897,24 @@ static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net, int dif) { struct sock *sk, *result; - struct hlist_nulls_node *node; unsigned short hnum = ntohs(loc_port); - unsigned int count, slot = udp_hashfn(net, hnum, udp_table.mask); + unsigned int slot = udp_hashfn(net, hnum, udp_table.mask); struct udp_hslot *hslot = &udp_table.hash[slot]; /* Do not bother scanning a too big list */ if (hslot->count > 10) return NULL; - rcu_read_lock(); -begin: - count = 0; result = NULL; - sk_nulls_for_each_rcu(sk, node, &hslot->head) { - if (__udp_is_mcast_sock(net, sk, - loc_port, loc_addr, - rmt_port, rmt_addr, - dif, hnum)) { + sk_for_each_rcu(sk, &hslot->head) { + if (__udp_is_mcast_sock(net, sk, loc_port, loc_addr, + rmt_port, rmt_addr, dif, hnum)) { + if (result) + return NULL; result = sk; - ++count; - } - } - /* - * if the nulls value we got at the end of this lookup is - * not the expected one, we must restart lookup. - * We probably met an item that was moved to another chain. - */ - if (get_nulls_value(node) != slot) - goto begin; - - if (result) { - if (count != 1 || - unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) - result = NULL; - else if (unlikely(!__udp_is_mcast_sock(net, result, - loc_port, loc_addr, - rmt_port, rmt_addr, - dif, hnum))) { - sock_put(result); - result = NULL; } } - rcu_read_unlock(); + return result; } @@ -2013,37 +1927,22 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net, __be16 rmt_port, __be32 rmt_addr, int dif) { - struct sock *sk, *result; - struct hlist_nulls_node *node; unsigned short hnum = ntohs(loc_port); unsigned int hash2 = udp4_portaddr_hash(net, loc_addr, hnum); unsigned int slot2 = hash2 & udp_table.mask; struct udp_hslot *hslot2 = &udp_table.hash2[slot2]; INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr); const __portpair ports = INET_COMBINED_PORTS(rmt_port, hnum); + struct sock *sk; - rcu_read_lock(); - result = NULL; - udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) { - if (INET_MATCH(sk, net, acookie, - rmt_addr, loc_addr, ports, dif)) - result = sk; + udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { + if (INET_MATCH(sk, net, acookie, rmt_addr, + loc_addr, ports, dif)) + return sk; /* Only check first socket in chain */ break; } - - if (result) { - if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) - result = NULL; - else if (unlikely(!INET_MATCH(sk, net, acookie, - rmt_addr, loc_addr, - ports, dif))) { - sock_put(result); - result = NULL; - } - } - rcu_read_unlock(); - return result; + return NULL; } void udp_v4_early_demux(struct sk_buff *skb) @@ -2051,7 +1950,7 @@ void udp_v4_early_demux(struct sk_buff *skb) struct net *net = dev_net(skb->dev); const struct iphdr *iph; const struct udphdr *uh; - struct sock *sk; + struct sock *sk = NULL; struct dst_entry *dst; int dif = skb->dev->ifindex; int ours; @@ -2083,11 +1982,9 @@ void udp_v4_early_demux(struct sk_buff *skb) } else if (skb->pkt_type == PACKET_HOST) { sk = __udp4_lib_demux_lookup(net, uh->dest, iph->daddr, uh->source, iph->saddr, dif); - } else { - return; } - if (!sk) + if (!sk || !atomic_inc_not_zero_hint(&sk->sk_refcnt, 2)) return; skb->sk = sk; @@ -2387,14 +2284,13 @@ static struct sock *udp_get_first(struct seq_file *seq, int start) for (state->bucket = start; state->bucket <= state->udp_table->mask; ++state->bucket) { - struct hlist_nulls_node *node; struct udp_hslot *hslot = &state->udp_table->hash[state->bucket]; - if (hlist_nulls_empty(&hslot->head)) + if (hlist_empty(&hslot->head)) continue; spin_lock_bh(&hslot->lock); - sk_nulls_for_each(sk, node, &hslot->head) { + sk_for_each(sk, &hslot->head) { if (!net_eq(sock_net(sk), net)) continue; if (sk->sk_family == state->family) @@ -2413,7 +2309,7 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk) struct net *net = seq_file_net(seq); do { - sk = sk_nulls_next(sk); + sk = sk_next(sk); } while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family)); if (!sk) { @@ -2622,12 +2518,12 @@ void __init udp_table_init(struct udp_table *table, const char *name) table->hash2 = table->hash + (table->mask + 1); for (i = 0; i <= table->mask; i++) { - INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i); + INIT_HLIST_HEAD(&table->hash[i].head); table->hash[i].count = 0; spin_lock_init(&table->hash[i].lock); } for (i = 0; i <= table->mask; i++) { - INIT_HLIST_NULLS_HEAD(&table->hash2[i].head, i); + INIT_HLIST_HEAD(&table->hash2[i].head); table->hash2[i].count = 0; spin_lock_init(&table->hash2[i].lock); } diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index df1966f3b6ec..3d5ccf4b1412 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -36,10 +36,11 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, const struct inet_diag_req_v2 *req) { int err = -EINVAL; - struct sock *sk; + struct sock *sk = NULL; struct sk_buff *rep; struct net *net = sock_net(in_skb->sk); + rcu_read_lock(); if (req->sdiag_family == AF_INET) sk = __udp4_lib_lookup(net, req->id.idiag_src[0], req->id.idiag_sport, @@ -54,9 +55,9 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, req->id.idiag_dport, req->id.idiag_if, tbl, NULL); #endif - else - goto out_nosk; - + if (sk && !atomic_inc_not_zero(&sk->sk_refcnt)) + sk = NULL; + rcu_read_unlock(); err = -ENOENT; if (!sk) goto out_nosk; @@ -96,24 +97,23 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r, struct nlattr *bc) { - int num, s_num, slot, s_slot; struct net *net = sock_net(skb->sk); + int num, s_num, slot, s_slot; s_slot = cb->args[0]; num = s_num = cb->args[1]; for (slot = s_slot; slot <= table->mask; s_num = 0, slot++) { - struct sock *sk; - struct hlist_nulls_node *node; struct udp_hslot *hslot = &table->hash[slot]; + struct sock *sk; num = 0; - if (hlist_nulls_empty(&hslot->head)) + if (hlist_empty(&hslot->head)) continue; spin_lock_bh(&hslot->lock); - sk_nulls_for_each(sk, node, &hslot->head) { + sk_for_each(sk, &hslot->head) { struct inet_sock *inet = inet_sk(sk); if (!net_eq(sock_net(sk), net)) diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 0ed2dafb7cc4..6230cf4b0d2d 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -14,18 +14,6 @@ #include <net/udp.h> #include <net/protocol.h> -static DEFINE_SPINLOCK(udp_offload_lock); -static struct udp_offload_priv __rcu *udp_offload_base __read_mostly; - -#define udp_deref_protected(X) rcu_dereference_protected(X, lockdep_is_held(&udp_offload_lock)) - -struct udp_offload_priv { - struct udp_offload *offload; - possible_net_t net; - struct rcu_head rcu; - struct udp_offload_priv __rcu *next; -}; - static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, netdev_features_t features, struct sk_buff *(*gso_inner_segment)(struct sk_buff *skb, @@ -179,6 +167,7 @@ out_unlock: return segs; } +EXPORT_SYMBOL(skb_udp_tunnel_segment); static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, netdev_features_t features) @@ -253,64 +242,14 @@ out: return segs; } -int udp_add_offload(struct net *net, struct udp_offload *uo) -{ - struct udp_offload_priv *new_offload = kzalloc(sizeof(*new_offload), GFP_ATOMIC); - - if (!new_offload) - return -ENOMEM; - - write_pnet(&new_offload->net, net); - new_offload->offload = uo; - - spin_lock(&udp_offload_lock); - new_offload->next = udp_offload_base; - rcu_assign_pointer(udp_offload_base, new_offload); - spin_unlock(&udp_offload_lock); - - return 0; -} -EXPORT_SYMBOL(udp_add_offload); - -static void udp_offload_free_routine(struct rcu_head *head) -{ - struct udp_offload_priv *ou_priv = container_of(head, struct udp_offload_priv, rcu); - kfree(ou_priv); -} - -void udp_del_offload(struct udp_offload *uo) -{ - struct udp_offload_priv __rcu **head = &udp_offload_base; - struct udp_offload_priv *uo_priv; - - spin_lock(&udp_offload_lock); - - uo_priv = udp_deref_protected(*head); - for (; uo_priv != NULL; - uo_priv = udp_deref_protected(*head)) { - if (uo_priv->offload == uo) { - rcu_assign_pointer(*head, - udp_deref_protected(uo_priv->next)); - goto unlock; - } - head = &uo_priv->next; - } - pr_warn("udp_del_offload: didn't find offload for port %d\n", ntohs(uo->port)); -unlock: - spin_unlock(&udp_offload_lock); - if (uo_priv) - call_rcu(&uo_priv->rcu, udp_offload_free_routine); -} -EXPORT_SYMBOL(udp_del_offload); - struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb, - struct udphdr *uh) + struct udphdr *uh, udp_lookup_t lookup) { - struct udp_offload_priv *uo_priv; struct sk_buff *p, **pp = NULL; struct udphdr *uh2; unsigned int off = skb_gro_offset(skb); int flush = 1; + struct sock *sk; if (NAPI_GRO_CB(skb)->encap_mark || (skb->ip_summed != CHECKSUM_PARTIAL && @@ -322,13 +261,10 @@ struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb, NAPI_GRO_CB(skb)->encap_mark = 1; rcu_read_lock(); - uo_priv = rcu_dereference(udp_offload_base); - for (; uo_priv != NULL; uo_priv = rcu_dereference(uo_priv->next)) { - if (net_eq(read_pnet(&uo_priv->net), dev_net(skb->dev)) && - uo_priv->offload->port == uh->dest && - uo_priv->offload->callbacks.gro_receive) - goto unflush; - } + sk = (*lookup)(skb, uh->source, uh->dest); + + if (sk && udp_sk(sk)->gro_receive) + goto unflush; goto out_unlock; unflush: @@ -352,9 +288,7 @@ unflush: skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */ skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr)); - NAPI_GRO_CB(skb)->proto = uo_priv->offload->ipproto; - pp = uo_priv->offload->callbacks.gro_receive(head, skb, - uo_priv->offload); + pp = udp_sk(sk)->gro_receive(sk, head, skb); out_unlock: rcu_read_unlock(); @@ -362,6 +296,7 @@ out: NAPI_GRO_CB(skb)->flush |= flush; return pp; } +EXPORT_SYMBOL(udp_gro_receive); static struct sk_buff **udp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) @@ -383,39 +318,28 @@ static struct sk_buff **udp4_gro_receive(struct sk_buff **head, inet_gro_compute_pseudo); skip: NAPI_GRO_CB(skb)->is_ipv6 = 0; - return udp_gro_receive(head, skb, uh); + return udp_gro_receive(head, skb, uh, udp4_lib_lookup_skb); flush: NAPI_GRO_CB(skb)->flush = 1; return NULL; } -int udp_gro_complete(struct sk_buff *skb, int nhoff) +int udp_gro_complete(struct sk_buff *skb, int nhoff, + udp_lookup_t lookup) { - struct udp_offload_priv *uo_priv; __be16 newlen = htons(skb->len - nhoff); struct udphdr *uh = (struct udphdr *)(skb->data + nhoff); int err = -ENOSYS; + struct sock *sk; uh->len = newlen; rcu_read_lock(); - - uo_priv = rcu_dereference(udp_offload_base); - for (; uo_priv != NULL; uo_priv = rcu_dereference(uo_priv->next)) { - if (net_eq(read_pnet(&uo_priv->net), dev_net(skb->dev)) && - uo_priv->offload->port == uh->dest && - uo_priv->offload->callbacks.gro_complete) - break; - } - - if (uo_priv) { - NAPI_GRO_CB(skb)->proto = uo_priv->offload->ipproto; - err = uo_priv->offload->callbacks.gro_complete(skb, - nhoff + sizeof(struct udphdr), - uo_priv->offload); - } - + sk = (*lookup)(skb, uh->source, uh->dest); + if (sk && udp_sk(sk)->gro_complete) + err = udp_sk(sk)->gro_complete(sk, skb, + nhoff + sizeof(struct udphdr)); rcu_read_unlock(); if (skb->remcsum_offload) @@ -426,6 +350,7 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff) return err; } +EXPORT_SYMBOL(udp_gro_complete); static int udp4_gro_complete(struct sk_buff *skb, int nhoff) { @@ -440,7 +365,7 @@ static int udp4_gro_complete(struct sk_buff *skb, int nhoff) skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL; } - return udp_gro_complete(skb, nhoff); + return udp_gro_complete(skb, nhoff, udp4_lib_lookup_skb); } static const struct net_offload udpv4_offload = { diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index 96599d1a1318..47f12c73d959 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel.c @@ -69,6 +69,8 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock, udp_sk(sk)->encap_type = cfg->encap_type; udp_sk(sk)->encap_rcv = cfg->encap_rcv; udp_sk(sk)->encap_destroy = cfg->encap_destroy; + udp_sk(sk)->gro_receive = cfg->gro_receive; + udp_sk(sk)->gro_complete = cfg->gro_complete; udp_tunnel_encap_enable(sock); } |