summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2018-02-10 02:34:18 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2018-02-10 02:34:18 +0300
commitc839682c719f0e3dc851951c9e2eeb8a41cd9609 (patch)
treeec10374627a2f2446e14506cb545b713ab2909fe /kernel
parent82f0a41e1980318ea4cdae20cdce7b33cb9c8946 (diff)
parent2fa56a494484f19e06bf4f3464b2155a92beafac (diff)
downloadlinux-c839682c719f0e3dc851951c9e2eeb8a41cd9609.tar.xz
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net
Pull networking fixes from David Miller: 1) Make allocations less aggressive in x_tables, from Minchal Hocko. 2) Fix netfilter flowtable Kconfig deps, from Pablo Neira Ayuso. 3) Fix connection loss problems in rtlwifi, from Larry Finger. 4) Correct DRAM dump length for some chips in ath10k driver, from Yu Wang. 5) Fix ABORT handling in rxrpc, from David Howells. 6) Add SPDX tags to Sun networking drivers, from Shannon Nelson. 7) Some ipv6 onlink handling fixes, from David Ahern. 8) Netem packet scheduler interval calcualtion fix from Md. Islam. 9) Don't put crypto buffers on-stack in rxrpc, from David Howells. 10) Fix handling of error non-delivery status in netlink multicast delivery over multiple namespaces, from Nicolas Dichtel. 11) Missing xdp flush in tuntap driver, from Jason Wang. 12) Synchonize RDS protocol netns/module teardown with rds object management, from Sowini Varadhan. 13) Add nospec annotations to mpls, from Dan Williams. 14) Fix SKB truesize handling in TIPC, from Hoang Le. 15) Interrupt masking fixes in stammc from Niklas Cassel. 16) Don't allow ptr_ring objects to be sized outside of kmalloc's limits, from Jason Wang. 17) Don't allow SCTP chunks to be built which will have a length exceeding the chunk header's 16-bit length field, from Alexey Kodanev. * git://git.kernel.org/pub/scm/linux/kernel/git/davem/net: (82 commits) ibmvnic: Remove skb->protocol checks in ibmvnic_xmit bpf: fix rlimit in reuseport net selftest sctp: verify size of a new chunk in _sctp_make_chunk() s390/qeth: fix SETIP command handling s390/qeth: fix underestimated count of buffer elements ptr_ring: try vmalloc() when kmalloc() fails ptr_ring: fail early if queue occupies more than KMALLOC_MAX_SIZE net: stmmac: remove redundant enable of PMT irq net: stmmac: rename GMAC_INT_DEFAULT_MASK for dwmac4 net: stmmac: discard disabled flags in interrupt status register ibmvnic: Reset long term map ID counter tools/libbpf: handle issues with bpf ELF objects containing .eh_frames selftests/bpf: add selftest that use test_libbpf_open selftests/bpf: add test program for loading BPF ELF files tools/libbpf: improve the pr_debug statements to contain section numbers bpf: Sync kernel ABI header with tooling header for bpf_common.h net: phy: fix phy_start to consider PHY_IGNORE_INTERRUPT net: thunder: change q_len's type to handle max ring size tipc: fix skb truesize/datasize ratio control net/sched: cls_u32: fix cls_u32 on filter replace ...
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/sockmap.c187
1 files changed, 115 insertions, 72 deletions
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 0314d1783d77..48c33417d13c 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -86,9 +86,10 @@ struct smap_psock {
struct work_struct tx_work;
struct work_struct gc_work;
+ struct proto *sk_proto;
+ void (*save_close)(struct sock *sk, long timeout);
void (*save_data_ready)(struct sock *sk);
void (*save_write_space)(struct sock *sk);
- void (*save_state_change)(struct sock *sk);
};
static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
@@ -96,12 +97,102 @@ static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
return rcu_dereference_sk_user_data(sk);
}
+static struct proto tcp_bpf_proto;
+static int bpf_tcp_init(struct sock *sk)
+{
+ struct smap_psock *psock;
+
+ rcu_read_lock();
+ psock = smap_psock_sk(sk);
+ if (unlikely(!psock)) {
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+
+ if (unlikely(psock->sk_proto)) {
+ rcu_read_unlock();
+ return -EBUSY;
+ }
+
+ psock->save_close = sk->sk_prot->close;
+ psock->sk_proto = sk->sk_prot;
+ sk->sk_prot = &tcp_bpf_proto;
+ rcu_read_unlock();
+ return 0;
+}
+
+static void bpf_tcp_release(struct sock *sk)
+{
+ struct smap_psock *psock;
+
+ rcu_read_lock();
+ psock = smap_psock_sk(sk);
+
+ if (likely(psock)) {
+ sk->sk_prot = psock->sk_proto;
+ psock->sk_proto = NULL;
+ }
+ rcu_read_unlock();
+}
+
+static void smap_release_sock(struct smap_psock *psock, struct sock *sock);
+
+static void bpf_tcp_close(struct sock *sk, long timeout)
+{
+ void (*close_fun)(struct sock *sk, long timeout);
+ struct smap_psock_map_entry *e, *tmp;
+ struct smap_psock *psock;
+ struct sock *osk;
+
+ rcu_read_lock();
+ psock = smap_psock_sk(sk);
+ if (unlikely(!psock)) {
+ rcu_read_unlock();
+ return sk->sk_prot->close(sk, timeout);
+ }
+
+ /* The psock may be destroyed anytime after exiting the RCU critial
+ * section so by the time we use close_fun the psock may no longer
+ * be valid. However, bpf_tcp_close is called with the sock lock
+ * held so the close hook and sk are still valid.
+ */
+ close_fun = psock->save_close;
+
+ write_lock_bh(&sk->sk_callback_lock);
+ list_for_each_entry_safe(e, tmp, &psock->maps, list) {
+ osk = cmpxchg(e->entry, sk, NULL);
+ if (osk == sk) {
+ list_del(&e->list);
+ smap_release_sock(psock, sk);
+ }
+ }
+ write_unlock_bh(&sk->sk_callback_lock);
+ rcu_read_unlock();
+ close_fun(sk, timeout);
+}
+
enum __sk_action {
__SK_DROP = 0,
__SK_PASS,
__SK_REDIRECT,
};
+static struct tcp_ulp_ops bpf_tcp_ulp_ops __read_mostly = {
+ .name = "bpf_tcp",
+ .uid = TCP_ULP_BPF,
+ .user_visible = false,
+ .owner = NULL,
+ .init = bpf_tcp_init,
+ .release = bpf_tcp_release,
+};
+
+static int bpf_tcp_ulp_register(void)
+{
+ tcp_bpf_proto = tcp_prot;
+ tcp_bpf_proto.close = bpf_tcp_close;
+ return tcp_register_ulp(&bpf_tcp_ulp_ops);
+}
+
static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
{
struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict);
@@ -166,68 +257,6 @@ static void smap_report_sk_error(struct smap_psock *psock, int err)
sk->sk_error_report(sk);
}
-static void smap_release_sock(struct smap_psock *psock, struct sock *sock);
-
-/* Called with lock_sock(sk) held */
-static void smap_state_change(struct sock *sk)
-{
- struct smap_psock_map_entry *e, *tmp;
- struct smap_psock *psock;
- struct socket_wq *wq;
- struct sock *osk;
-
- rcu_read_lock();
-
- /* Allowing transitions into an established syn_recv states allows
- * for early binding sockets to a smap object before the connection
- * is established.
- */
- switch (sk->sk_state) {
- case TCP_SYN_SENT:
- case TCP_SYN_RECV:
- case TCP_ESTABLISHED:
- break;
- case TCP_CLOSE_WAIT:
- case TCP_CLOSING:
- case TCP_LAST_ACK:
- case TCP_FIN_WAIT1:
- case TCP_FIN_WAIT2:
- case TCP_LISTEN:
- break;
- case TCP_CLOSE:
- /* Only release if the map entry is in fact the sock in
- * question. There is a case where the operator deletes
- * the sock from the map, but the TCP sock is closed before
- * the psock is detached. Use cmpxchg to verify correct
- * sock is removed.
- */
- psock = smap_psock_sk(sk);
- if (unlikely(!psock))
- break;
- write_lock_bh(&sk->sk_callback_lock);
- list_for_each_entry_safe(e, tmp, &psock->maps, list) {
- osk = cmpxchg(e->entry, sk, NULL);
- if (osk == sk) {
- list_del(&e->list);
- smap_release_sock(psock, sk);
- }
- }
- write_unlock_bh(&sk->sk_callback_lock);
- break;
- default:
- psock = smap_psock_sk(sk);
- if (unlikely(!psock))
- break;
- smap_report_sk_error(psock, EPIPE);
- break;
- }
-
- wq = rcu_dereference(sk->sk_wq);
- if (skwq_has_sleeper(wq))
- wake_up_interruptible_all(&wq->wait);
- rcu_read_unlock();
-}
-
static void smap_read_sock_strparser(struct strparser *strp,
struct sk_buff *skb)
{
@@ -322,10 +351,8 @@ static void smap_stop_sock(struct smap_psock *psock, struct sock *sk)
return;
sk->sk_data_ready = psock->save_data_ready;
sk->sk_write_space = psock->save_write_space;
- sk->sk_state_change = psock->save_state_change;
psock->save_data_ready = NULL;
psock->save_write_space = NULL;
- psock->save_state_change = NULL;
strp_stop(&psock->strp);
psock->strp_enabled = false;
}
@@ -350,6 +377,7 @@ static void smap_release_sock(struct smap_psock *psock, struct sock *sock)
if (psock->refcnt)
return;
+ tcp_cleanup_ulp(sock);
smap_stop_sock(psock, sock);
clear_bit(SMAP_TX_RUNNING, &psock->state);
rcu_assign_sk_user_data(sock, NULL);
@@ -427,10 +455,8 @@ static void smap_start_sock(struct smap_psock *psock, struct sock *sk)
return;
psock->save_data_ready = sk->sk_data_ready;
psock->save_write_space = sk->sk_write_space;
- psock->save_state_change = sk->sk_state_change;
sk->sk_data_ready = smap_data_ready;
sk->sk_write_space = smap_write_space;
- sk->sk_state_change = smap_state_change;
psock->strp_enabled = true;
}
@@ -509,6 +535,10 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
if (attr->value_size > KMALLOC_MAX_SIZE)
return ERR_PTR(-E2BIG);
+ err = bpf_tcp_ulp_register();
+ if (err && err != -EEXIST)
+ return ERR_PTR(err);
+
stab = kzalloc(sizeof(*stab), GFP_USER);
if (!stab)
return ERR_PTR(-ENOMEM);
@@ -590,11 +620,6 @@ static void sock_map_free(struct bpf_map *map)
}
rcu_read_unlock();
- if (stab->bpf_verdict)
- bpf_prog_put(stab->bpf_verdict);
- if (stab->bpf_parse)
- bpf_prog_put(stab->bpf_parse);
-
sock_map_remove_complete(stab);
}
@@ -754,6 +779,10 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
goto out_progs;
}
+ err = tcp_set_ulp_id(sock, TCP_ULP_BPF);
+ if (err)
+ goto out_progs;
+
set_bit(SMAP_TX_RUNNING, &psock->state);
}
@@ -866,6 +895,19 @@ static int sock_map_update_elem(struct bpf_map *map,
return err;
}
+static void sock_map_release(struct bpf_map *map, struct file *map_file)
+{
+ struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+ struct bpf_prog *orig;
+
+ orig = xchg(&stab->bpf_parse, NULL);
+ if (orig)
+ bpf_prog_put(orig);
+ orig = xchg(&stab->bpf_verdict, NULL);
+ if (orig)
+ bpf_prog_put(orig);
+}
+
const struct bpf_map_ops sock_map_ops = {
.map_alloc = sock_map_alloc,
.map_free = sock_map_free,
@@ -873,6 +915,7 @@ const struct bpf_map_ops sock_map_ops = {
.map_get_next_key = sock_map_get_next_key,
.map_update_elem = sock_map_update_elem,
.map_delete_elem = sock_map_delete_elem,
+ .map_release = sock_map_release,
};
BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,