Diffstat (limited to 'net/ipv4/udp.c')
-rw-r--r--  net/ipv4/udp.c  217
1 file changed, 176 insertions(+), 41 deletions(-)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 6a320a614e54..9592fe3e444a 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -129,7 +129,12 @@ DEFINE_PER_CPU(int, udp_memory_per_cpu_fw_alloc);
EXPORT_PER_CPU_SYMBOL_GPL(udp_memory_per_cpu_fw_alloc);
#define MAX_UDP_PORTS 65536
-#define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN)
+#define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN_PERNET)
+
+static struct udp_table *udp_get_table_prot(struct sock *sk)
+{
+ return sk->sk_prot->h.udp_table ? : sock_net(sk)->ipv4.udp_table;
+}
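
The new udp_get_table_prot() helper picks the protocol's statically assigned
hash table if one is set, and otherwise falls back to the per-netns table;
the "? :" form is the GNU elvis extension. A minimal userspace sketch of the
same fallback pattern (all names invented for illustration):

	/* Sketch, not kernel code: prefer an explicit table, else a default.
	 * The GNU "a ?: b" extension evaluates "a" once and yields it if
	 * non-NULL, else "b". Builds with gcc/clang in their default GNU mode. */
	#include <stdio.h>

	struct table { const char *name; };

	static struct table global_table = { "global" };

	static struct table *get_table(struct table *override)
	{
		return override ?: &global_table;	/* GNU ?: extension */
	}

	int main(void)
	{
		struct table pernet = { "pernet" };

		printf("%s\n", get_table(NULL)->name);    /* "global" */
		printf("%s\n", get_table(&pernet)->name); /* "pernet" */
		return 0;
	}
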
static int udp_lib_lport_inuse(struct net *net, __u16 num,
const struct udp_hslot *hslot,
@@ -232,16 +237,16 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot)
int udp_lib_get_port(struct sock *sk, unsigned short snum,
unsigned int hash2_nulladdr)
{
+ struct udp_table *udptable = udp_get_table_prot(sk);
struct udp_hslot *hslot, *hslot2;
- struct udp_table *udptable = sk->sk_prot->h.udp_table;
- int error = 1;
struct net *net = sock_net(sk);
+ int error = -EADDRINUSE;
if (!snum) {
+ DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN);
+ unsigned short first, last;
int low, high, remaining;
unsigned int rand;
- unsigned short first, last;
- DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN);
inet_get_local_port_range(net, &low, &high);
remaining = (high - low) + 1;
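
With the table size now variable per netns, PORTS_PER_CHAIN is derived from
the per-netns minimum so the autobind bitmap still fits on the stack. A
standalone sketch of that bitmap scan, assuming a minimum of 128 slots
(65536 / 128 = 512 bits, i.e. 64 bytes per chain):

	/* Sketch of the per-chain free-port scan in udp_lib_get_port():
	 * mark every port already bound in the chain, then probe for an
	 * unset bit. 512 bits assumes 65536 ports / 128 minimum slots. */
	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	#define PORTS_PER_CHAIN 512	/* assumed: 65536 / 128 */

	static bool test_bit(const uint64_t *map, unsigned int bit)
	{
		return map[bit / 64] & (1ULL << (bit % 64));
	}

	static void set_bit(uint64_t *map, unsigned int bit)
	{
		map[bit / 64] |= 1ULL << (bit % 64);
	}

	int main(void)
	{
		uint64_t bitmap[PORTS_PER_CHAIN / 64] = { 0 };	/* fits on the stack */
		unsigned int port;

		set_bit(bitmap, 7);	/* pretend chain offset 7 is taken */

		for (port = 0; port < PORTS_PER_CHAIN; port++) {
			if (!test_bit(bitmap, port)) {
				printf("first free chain offset: %u\n", port);
				break;
			}
		}
		return 0;
	}
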
@@ -467,7 +472,7 @@ static struct sock *udp4_lookup_run_bpf(struct net *net,
struct sock *sk, *reuse_sk;
bool no_reuseport;
- if (udptable != &udp_table)
+ if (udptable != net->ipv4.udp_table)
return NULL; /* only UDP is supported */
no_reuseport = bpf_sk_lookup_run_v4(net, IPPROTO_UDP, saddr, sport,
@@ -548,10 +553,11 @@ struct sock *udp4_lib_lookup_skb(const struct sk_buff *skb,
__be16 sport, __be16 dport)
{
const struct iphdr *iph = ip_hdr(skb);
+ struct net *net = dev_net(skb->dev);
- return __udp4_lib_lookup(dev_net(skb->dev), iph->saddr, sport,
+ return __udp4_lib_lookup(net, iph->saddr, sport,
iph->daddr, dport, inet_iif(skb),
- inet_sdif(skb), &udp_table, NULL);
+ inet_sdif(skb), net->ipv4.udp_table, NULL);
}
/* Must be called under rcu_read_lock().
@@ -564,7 +570,7 @@ struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
struct sock *sk;
sk = __udp4_lib_lookup(net, saddr, sport, daddr, dport,
- dif, 0, &udp_table, NULL);
+ dif, 0, net->ipv4.udp_table, NULL);
if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
sk = NULL;
return sk;
@@ -784,7 +790,8 @@ int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
if (tunnel) {
/* ...not for tunnels though: we don't have a sending socket */
if (udp_sk(sk)->encap_err_rcv)
- udp_sk(sk)->encap_err_rcv(sk, skb, iph->ihl << 2);
+ udp_sk(sk)->encap_err_rcv(sk, skb, err, uh->dest, info,
+ (u8 *)(uh+1));
goto out;
}
if (!inet->recverr) {
@@ -801,7 +808,7 @@ out:
int udp_err(struct sk_buff *skb, u32 info)
{
- return __udp4_lib_err(skb, info, &udp_table);
+ return __udp4_lib_err(skb, info, dev_net(skb->dev)->ipv4.udp_table);
}
/*
@@ -1448,7 +1455,7 @@ static void udp_rmem_release(struct sock *sk, int size, int partial,
if (likely(partial)) {
up->forward_deficit += size;
size = up->forward_deficit;
- if (size < (sk->sk_rcvbuf >> 2) &&
+ if (size < READ_ONCE(up->forward_threshold) &&
!skb_queue_empty(&up->reader_queue))
return;
} else {
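
udp_rmem_release() now compares the accumulated deficit against a cached
forward_threshold instead of recomputing sk_rcvbuf >> 2 on every call. An
illustrative userspace sketch of the batching idea, with invented names:

	/* Sketch only: small releases accumulate in a deficit counter and
	 * are flushed once they cross a cached threshold (a quarter of the
	 * receive buffer in the kernel code). */
	#include <stdio.h>

	struct rmem {
		int deficit;
		int threshold;	/* cached rcvbuf >> 2 */
	};

	static int rmem_release(struct rmem *r, int size, int partial)
	{
		if (partial) {
			r->deficit += size;
			if (r->deficit < r->threshold)
				return 0;	/* defer the expensive release */
			size = r->deficit;
		}
		r->deficit = 0;
		return size;		/* amount actually released */
	}

	int main(void)
	{
		struct rmem r = { .deficit = 0, .threshold = 4096 };

		printf("%d\n", rmem_release(&r, 1000, 1));	/* 0: deferred */
		printf("%d\n", rmem_release(&r, 4000, 1));	/* 5000: flushed */
		return 0;
	}
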
@@ -1622,7 +1629,7 @@ static void udp_destruct_sock(struct sock *sk)
int udp_init_sock(struct sock *sk)
{
- skb_queue_head_init(&udp_sk(sk)->reader_queue);
+ udp_lib_init_sock(sk);
sk->sk_destruct = udp_destruct_sock;
set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
return 0;
@@ -1998,7 +2005,7 @@ EXPORT_SYMBOL(udp_disconnect);
void udp_lib_unhash(struct sock *sk)
{
if (sk_hashed(sk)) {
- struct udp_table *udptable = sk->sk_prot->h.udp_table;
+ struct udp_table *udptable = udp_get_table_prot(sk);
struct udp_hslot *hslot, *hslot2;
hslot = udp_hashslot(udptable, sock_net(sk),
@@ -2029,7 +2036,7 @@ EXPORT_SYMBOL(udp_lib_unhash);
void udp_lib_rehash(struct sock *sk, u16 newhash)
{
if (sk_hashed(sk)) {
- struct udp_table *udptable = sk->sk_prot->h.udp_table;
+ struct udp_table *udptable = udp_get_table_prot(sk);
struct udp_hslot *hslot, *hslot2, *nhslot2;
hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
@@ -2518,10 +2525,14 @@ static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net,
__be16 rmt_port, __be32 rmt_addr,
int dif, int sdif)
{
- struct sock *sk, *result;
+ struct udp_table *udptable = net->ipv4.udp_table;
unsigned short hnum = ntohs(loc_port);
- unsigned int slot = udp_hashfn(net, hnum, udp_table.mask);
- struct udp_hslot *hslot = &udp_table.hash[slot];
+ struct sock *sk, *result;
+ struct udp_hslot *hslot;
+ unsigned int slot;
+
+ slot = udp_hashfn(net, hnum, udptable->mask);
+ hslot = &udptable->hash[slot];
/* Do not bother scanning a too big list */
if (hslot->count > 10)
@@ -2549,14 +2560,19 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net,
__be16 rmt_port, __be32 rmt_addr,
int dif, int sdif)
{
- unsigned short hnum = ntohs(loc_port);
- unsigned int hash2 = ipv4_portaddr_hash(net, loc_addr, hnum);
- unsigned int slot2 = hash2 & udp_table.mask;
- struct udp_hslot *hslot2 = &udp_table.hash2[slot2];
+ struct udp_table *udptable = net->ipv4.udp_table;
INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr);
- const __portpair ports = INET_COMBINED_PORTS(rmt_port, hnum);
+ unsigned short hnum = ntohs(loc_port);
+ unsigned int hash2, slot2;
+ struct udp_hslot *hslot2;
+ __portpair ports;
struct sock *sk;
+ hash2 = ipv4_portaddr_hash(net, loc_addr, hnum);
+ slot2 = hash2 & udptable->mask;
+ hslot2 = &udptable->hash2[slot2];
+ ports = INET_COMBINED_PORTS(rmt_port, hnum);
+
udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
if (inet_match(net, sk, acookie, ports, dif, sdif))
return sk;
@@ -2636,7 +2652,7 @@ int udp_v4_early_demux(struct sk_buff *skb)
int udp_rcv(struct sk_buff *skb)
{
- return __udp4_lib_rcv(skb, &udp_table, IPPROTO_UDP);
+ return __udp4_lib_rcv(skb, dev_net(skb->dev)->ipv4.udp_table, IPPROTO_UDP);
}
void udp_destroy_sock(struct sock *sk)
@@ -2672,6 +2688,18 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
int err = 0;
int is_udplite = IS_UDPLITE(sk);
+ if (level == SOL_SOCKET) {
+ err = sk_setsockopt(sk, level, optname, optval, optlen);
+
+ if (optname == SO_RCVBUF || optname == SO_RCVBUFFORCE) {
+ sockopt_lock_sock(sk);
+ /* paired with READ_ONCE in udp_rmem_release() */
+ WRITE_ONCE(up->forward_threshold, sk->sk_rcvbuf >> 2);
+ sockopt_release_sock(sk);
+ }
+ return err;
+ }
+
if (optlen < sizeof(int))
return -EINVAL;
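
The SOL_SOCKET branch lets udp_lib_setsockopt() intercept SO_RCVBUF and
SO_RCVBUFFORCE and refresh the cached threshold after sk_setsockopt() has
updated sk_rcvbuf. From userspace nothing changes; a plain SO_RCVBUF call
like the one below now also takes this path (sketch, not part of the patch):

	#include <stdio.h>
	#include <sys/socket.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = socket(AF_INET, SOCK_DGRAM, 0);
		int rcvbuf = 1 << 20;	/* ask for a 1 MiB receive buffer */
		socklen_t len = sizeof(rcvbuf);

		if (fd < 0)
			return 1;
		if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf, sizeof(rcvbuf)))
			perror("setsockopt");
		if (!getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf, &len))
			printf("effective SO_RCVBUF: %d\n", rcvbuf); /* doubled by the kernel */
		close(fd);
		return 0;
	}
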
@@ -2785,7 +2813,7 @@ EXPORT_SYMBOL(udp_lib_setsockopt);
int udp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
unsigned int optlen)
{
- if (level == SOL_UDP || level == SOL_UDPLITE)
+ if (level == SOL_UDP || level == SOL_UDPLITE || level == SOL_SOCKET)
return udp_lib_setsockopt(sk, level, optname,
optval, optlen,
udp_push_pending_frames);
@@ -2947,7 +2975,7 @@ struct proto udp_prot = {
.sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min),
.sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min),
.obj_size = sizeof(struct udp_sock),
- .h.udp_table = &udp_table,
+ .h.udp_table = NULL,
.diag_destroy = udp_abort,
};
EXPORT_SYMBOL(udp_prot);
@@ -2955,21 +2983,30 @@ EXPORT_SYMBOL(udp_prot);
/* ------------------------------------------------------------------------ */
#ifdef CONFIG_PROC_FS
+static struct udp_table *udp_get_table_afinfo(struct udp_seq_afinfo *afinfo,
+ struct net *net)
+{
+ return afinfo->udp_table ? : net->ipv4.udp_table;
+}
+
static struct sock *udp_get_first(struct seq_file *seq, int start)
{
- struct sock *sk;
- struct udp_seq_afinfo *afinfo;
struct udp_iter_state *state = seq->private;
struct net *net = seq_file_net(seq);
+ struct udp_seq_afinfo *afinfo;
+ struct udp_table *udptable;
+ struct sock *sk;
if (state->bpf_seq_afinfo)
afinfo = state->bpf_seq_afinfo;
else
afinfo = pde_data(file_inode(seq->file));
- for (state->bucket = start; state->bucket <= afinfo->udp_table->mask;
+ udptable = udp_get_table_afinfo(afinfo, net);
+
+ for (state->bucket = start; state->bucket <= udptable->mask;
++state->bucket) {
- struct udp_hslot *hslot = &afinfo->udp_table->hash[state->bucket];
+ struct udp_hslot *hslot = &udptable->hash[state->bucket];
if (hlist_empty(&hslot->head))
continue;
@@ -2991,9 +3028,10 @@ found:
static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
{
- struct udp_seq_afinfo *afinfo;
struct udp_iter_state *state = seq->private;
struct net *net = seq_file_net(seq);
+ struct udp_seq_afinfo *afinfo;
+ struct udp_table *udptable;
if (state->bpf_seq_afinfo)
afinfo = state->bpf_seq_afinfo;
@@ -3007,8 +3045,11 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
sk->sk_family != afinfo->family)));
if (!sk) {
- if (state->bucket <= afinfo->udp_table->mask)
- spin_unlock_bh(&afinfo->udp_table->hash[state->bucket].lock);
+ udptable = udp_get_table_afinfo(afinfo, net);
+
+ if (state->bucket <= udptable->mask)
+ spin_unlock_bh(&udptable->hash[state->bucket].lock);
+
return udp_get_first(seq, state->bucket + 1);
}
return sk;
@@ -3049,16 +3090,19 @@ EXPORT_SYMBOL(udp_seq_next);
void udp_seq_stop(struct seq_file *seq, void *v)
{
- struct udp_seq_afinfo *afinfo;
struct udp_iter_state *state = seq->private;
+ struct udp_seq_afinfo *afinfo;
+ struct udp_table *udptable;
if (state->bpf_seq_afinfo)
afinfo = state->bpf_seq_afinfo;
else
afinfo = pde_data(file_inode(seq->file));
- if (state->bucket <= afinfo->udp_table->mask)
- spin_unlock_bh(&afinfo->udp_table->hash[state->bucket].lock);
+ udptable = udp_get_table_afinfo(afinfo, seq_file_net(seq));
+
+ if (state->bucket <= udptable->mask)
+ spin_unlock_bh(&udptable->hash[state->bucket].lock);
}
EXPORT_SYMBOL(udp_seq_stop);
@@ -3171,7 +3215,7 @@ EXPORT_SYMBOL(udp_seq_ops);
static struct udp_seq_afinfo udp4_seq_afinfo = {
.family = AF_INET,
- .udp_table = &udp_table,
+ .udp_table = NULL,
};
static int __net_init udp4_proc_init_net(struct net *net)
@@ -3233,7 +3277,7 @@ void __init udp_table_init(struct udp_table *table, const char *name)
&table->log,
&table->mask,
UDP_HTABLE_SIZE_MIN,
- 64 * 1024);
+ UDP_HTABLE_SIZE_MAX);
table->hash2 = table->hash + (table->mask + 1);
for (i = 0; i <= table->mask; i++) {
@@ -3258,7 +3302,7 @@ u32 udp_flow_hashrnd(void)
}
EXPORT_SYMBOL(udp_flow_hashrnd);
-static int __net_init udp_sysctl_init(struct net *net)
+static void __net_init udp_sysctl_init(struct net *net)
{
net->ipv4.sysctl_udp_rmem_min = PAGE_SIZE;
net->ipv4.sysctl_udp_wmem_min = PAGE_SIZE;
@@ -3266,12 +3310,103 @@ static int __net_init udp_sysctl_init(struct net *net)
#ifdef CONFIG_NET_L3_MASTER_DEV
net->ipv4.sysctl_udp_l3mdev_accept = 0;
#endif
+}
+
+static struct udp_table __net_init *udp_pernet_table_alloc(unsigned int hash_entries)
+{
+ struct udp_table *udptable;
+ int i;
+
+ udptable = kmalloc(sizeof(*udptable), GFP_KERNEL);
+ if (!udptable)
+ goto out;
+
+ udptable->hash = vmalloc_huge(hash_entries * 2 * sizeof(struct udp_hslot),
+ GFP_KERNEL_ACCOUNT);
+ if (!udptable->hash)
+ goto free_table;
+
+ udptable->hash2 = udptable->hash + hash_entries;
+ udptable->mask = hash_entries - 1;
+ udptable->log = ilog2(hash_entries);
+
+ for (i = 0; i < hash_entries; i++) {
+ INIT_HLIST_HEAD(&udptable->hash[i].head);
+ udptable->hash[i].count = 0;
+ spin_lock_init(&udptable->hash[i].lock);
+
+ INIT_HLIST_HEAD(&udptable->hash2[i].head);
+ udptable->hash2[i].count = 0;
+ spin_lock_init(&udptable->hash2[i].lock);
+ }
+
+ return udptable;
+
+free_table:
+ kfree(udptable);
+out:
+ return NULL;
+}
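
udp_pernet_table_alloc() makes a single vmalloc_huge() allocation of
2 * hash_entries slots and points hash2 at the second half, which is why the
free path below only needs one kvfree() plus the kfree() of the table struct.
A simplified userspace sketch of that layout:

	/* Sketch: one buffer of 2 * entries slots, with hash2 aliasing the
	 * second half. Types are stand-ins for struct udp_hslot. */
	#include <stdio.h>
	#include <stdlib.h>

	struct hslot { int count; };

	int main(void)
	{
		unsigned int entries = 128;
		struct hslot *hash, *hash2;
		unsigned int i;

		hash = calloc(entries * 2, sizeof(*hash));	/* one allocation */
		if (!hash)
			return 1;
		hash2 = hash + entries;				/* second half */

		for (i = 0; i < entries; i++) {
			hash[i].count = 0;
			hash2[i].count = 0;
		}
		printf("hash=%p hash2=%p\n", (void *)hash, (void *)hash2);
		free(hash);	/* frees both halves; cf. kvfree() in the kernel */
		return 0;
	}
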
+
+static void __net_exit udp_pernet_table_free(struct net *net)
+{
+ struct udp_table *udptable = net->ipv4.udp_table;
+
+ if (udptable == &udp_table)
+ return;
+
+ kvfree(udptable->hash);
+ kfree(udptable);
+}
+
+static void __net_init udp_set_table(struct net *net)
+{
+ struct udp_table *udptable;
+ unsigned int hash_entries;
+ struct net *old_net;
+
+ if (net_eq(net, &init_net))
+ goto fallback;
+
+ old_net = current->nsproxy->net_ns;
+ hash_entries = READ_ONCE(old_net->ipv4.sysctl_udp_child_hash_entries);
+ if (!hash_entries)
+ goto fallback;
+
+ /* Set min to keep the bitmap on stack in udp_lib_get_port() */
+ if (hash_entries < UDP_HTABLE_SIZE_MIN_PERNET)
+ hash_entries = UDP_HTABLE_SIZE_MIN_PERNET;
+ else
+ hash_entries = roundup_pow_of_two(hash_entries);
+
+ udptable = udp_pernet_table_alloc(hash_entries);
+ if (udptable) {
+ net->ipv4.udp_table = udptable;
+ } else {
+ pr_warn("Failed to allocate UDP hash table (entries: %u) "
+ "for a netns, fallback to the global one\n",
+ hash_entries);
+fallback:
+ net->ipv4.udp_table = &udp_table;
+ }
+}
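
udp_set_table() reads the creating namespace's udp_child_hash_entries sysctl:
zero keeps the global table, anything else is clamped up to the per-netns
minimum or rounded up to a power of two so the mask arithmetic works. A
standalone sketch of the sizing rule (the minimum of 128 is an assumption
here):

	#include <stdio.h>

	#define HTABLE_SIZE_MIN_PERNET 128	/* assumed value */

	static unsigned int roundup_pow_of_two(unsigned int n)
	{
		unsigned int p = 1;

		while (p < n)
			p <<= 1;
		return p;
	}

	static unsigned int size_table(unsigned int hash_entries)
	{
		if (hash_entries < HTABLE_SIZE_MIN_PERNET)
			return HTABLE_SIZE_MIN_PERNET;
		return roundup_pow_of_two(hash_entries);
	}

	int main(void)
	{
		printf("%u %u %u\n", size_table(1), size_table(129), size_table(512));
		/* prints: 128 256 512 */
		return 0;
	}

In practice, setting the sysctl in the parent namespace before creating a
child netns gives that child its own table; the init netns always keeps the
global one via the fallback label.
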
+
+static int __net_init udp_pernet_init(struct net *net)
+{
+ udp_sysctl_init(net);
+ udp_set_table(net);
return 0;
}
+static void __net_exit udp_pernet_exit(struct net *net)
+{
+ udp_pernet_table_free(net);
+}
+
static struct pernet_operations __net_initdata udp_sysctl_ops = {
- .init = udp_sysctl_init,
+ .init = udp_pernet_init,
+ .exit = udp_pernet_exit,
};
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
@@ -3289,7 +3424,7 @@ static int bpf_iter_init_udp(void *priv_data, struct bpf_iter_aux_info *aux)
return -ENOMEM;
afinfo->family = AF_UNSPEC;
- afinfo->udp_table = &udp_table;
+ afinfo->udp_table = NULL;
st->bpf_seq_afinfo = afinfo;
ret = bpf_iter_init_seq_net(priv_data, aux);
if (ret)