From 02ec6cafd78c2052283516afc74c309745d20271 Mon Sep 17 00:00:00 2001 From: Hoang Le Date: Tue, 19 Mar 2019 18:49:48 +0700 Subject: tipc: support broadcast/replicast configurable for bc-link Currently, a multicast stream uses either broadcast or replicast as transmission method, based on the ratio between number of actual destinations nodes and cluster size. However, when an L2 interface (e.g., VXLAN) provides pseudo broadcast support, this becomes very inefficient, as it blindly replicates multicast packets to all cluster/subnet nodes, irrespective of whether they host actual target sockets or not. The TIPC multicast algorithm is able to distinguish real destination nodes from other nodes, and hence provides a smarter and more efficient method for transferring multicast messages than pseudo broadcast can do. Because of this, we now make it possible for users to force the broadcast link to permanently switch to using replicast, irrespective of which capabilities the bearer provides, or pretend to provide. Conversely, we also make it possible to force the broadcast link to always use true broadcast. While maybe less useful in deployed systems, this may at least be useful for testing the broadcast algorithm in small clusters. We retain the current AUTOSELECT ability, i.e., to let the broadcast link automatically select which algorithm to use, and to switch back and forth between broadcast and replicast as the ratio between destination node number and cluster size changes. This remains the default method. Furthermore, we make it possible to configure the threshold ratio for such switches. The default ratio is now set to 10%, down from 25% in the earlier implementation. Acked-by: Jon Maloy Signed-off-by: Hoang Le Signed-off-by: David S. Miller --- include/uapi/linux/tipc_netlink.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/tipc_netlink.h b/include/uapi/linux/tipc_netlink.h index 0ebe02ef1a86..efb958fd167d 100644 --- a/include/uapi/linux/tipc_netlink.h +++ b/include/uapi/linux/tipc_netlink.h @@ -281,6 +281,8 @@ enum { TIPC_NLA_PROP_TOL, /* u32 */ TIPC_NLA_PROP_WIN, /* u32 */ TIPC_NLA_PROP_MTU, /* u32 */ + TIPC_NLA_PROP_BROADCAST, /* u32 */ + TIPC_NLA_PROP_BROADCAST_RATIO, /* u32 */ __TIPC_NLA_PROP_MAX, TIPC_NLA_PROP_MAX = __TIPC_NLA_PROP_MAX - 1 -- cgit v1.2.3 From f295b3ae9f5927e084bd5decdff82390e3471801 Mon Sep 17 00:00:00 2001 From: Vakul Garg Date: Wed, 20 Mar 2019 02:03:36 +0000 Subject: net/tls: Add support of AES128-CCM based ciphers Added support for AES128-CCM based record encryption. AES128-CCM is similar to AES128-GCM. Both of them have same salt/iv/mac size. The notable difference between the two is that while invoking AES128-CCM operation, the salt||nonce (which is passed as IV) has to be prefixed with a hardcoded value '2'. Further, CCM implementation in kernel requires IV passed in crypto_aead_request() to be full '16' bytes. Therefore, the record structure 'struct tls_rec' has been modified to reserve '16' bytes for IV. This works for both GCM and CCM based cipher. Signed-off-by: Vakul Garg Signed-off-by: David S. Miller --- include/net/tls.h | 15 +++++++++-- include/uapi/linux/tls.h | 15 +++++++++++ net/tls/tls_main.c | 31 ++++++++++++---------- net/tls/tls_sw.c | 67 ++++++++++++++++++++++++++++++++++++------------ 4 files changed, 97 insertions(+), 31 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/tls.h b/include/net/tls.h index a5a938583295..3ce71d78414c 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -60,6 +60,17 @@ #define TLS_AAD_SPACE_SIZE 13 #define TLS_DEVICE_NAME_MAX 32 +#define MAX_IV_SIZE 16 + +/* For AES-CCM, the full 16-bytes of IV is made of '4' fields of given sizes. + * + * IV[16] = b0[1] || implicit nonce[4] || explicit nonce[8] || length[3] + * + * The field 'length' is encoded in field 'b0' as '(length width - 1)'. + * Hence b0 contains (3 - 1) = 2. + */ +#define TLS_AES_CCM_IV_B0_BYTE 2 + /* * This structure defines the routines for Inline TLS driver. * The following routines are optional and filled with a @@ -123,8 +134,7 @@ struct tls_rec { struct scatterlist sg_content_type; char aad_space[TLS_AAD_SPACE_SIZE]; - u8 iv_data[TLS_CIPHER_AES_GCM_128_IV_SIZE + - TLS_CIPHER_AES_GCM_128_SALT_SIZE]; + u8 iv_data[MAX_IV_SIZE]; struct aead_request aead_req; u8 aead_req_ctx[]; }; @@ -219,6 +229,7 @@ struct tls_prot_info { u16 tag_size; u16 overhead_size; u16 iv_size; + u16 salt_size; u16 rec_seq_size; u16 aad_size; u16 tail_size; diff --git a/include/uapi/linux/tls.h b/include/uapi/linux/tls.h index 401d6f01de6a..5b9c26753e46 100644 --- a/include/uapi/linux/tls.h +++ b/include/uapi/linux/tls.h @@ -70,6 +70,13 @@ #define TLS_CIPHER_AES_GCM_256_TAG_SIZE 16 #define TLS_CIPHER_AES_GCM_256_REC_SEQ_SIZE 8 +#define TLS_CIPHER_AES_CCM_128 53 +#define TLS_CIPHER_AES_CCM_128_IV_SIZE 8 +#define TLS_CIPHER_AES_CCM_128_KEY_SIZE 16 +#define TLS_CIPHER_AES_CCM_128_SALT_SIZE 4 +#define TLS_CIPHER_AES_CCM_128_TAG_SIZE 16 +#define TLS_CIPHER_AES_CCM_128_REC_SEQ_SIZE 8 + #define TLS_SET_RECORD_TYPE 1 #define TLS_GET_RECORD_TYPE 2 @@ -94,4 +101,12 @@ struct tls12_crypto_info_aes_gcm_256 { unsigned char rec_seq[TLS_CIPHER_AES_GCM_256_REC_SEQ_SIZE]; }; +struct tls12_crypto_info_aes_ccm_128 { + struct tls_crypto_info info; + unsigned char iv[TLS_CIPHER_AES_CCM_128_IV_SIZE]; + unsigned char key[TLS_CIPHER_AES_CCM_128_KEY_SIZE]; + unsigned char salt[TLS_CIPHER_AES_CCM_128_SALT_SIZE]; + unsigned char rec_seq[TLS_CIPHER_AES_CCM_128_REC_SEQ_SIZE]; +}; + #endif /* _UAPI_LINUX_TLS_H */ diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index df921a2904b9..0e24edab2535 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -469,27 +469,32 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval, switch (crypto_info->cipher_type) { case TLS_CIPHER_AES_GCM_128: + optsize = sizeof(struct tls12_crypto_info_aes_gcm_128); + break; case TLS_CIPHER_AES_GCM_256: { - optsize = crypto_info->cipher_type == TLS_CIPHER_AES_GCM_128 ? - sizeof(struct tls12_crypto_info_aes_gcm_128) : - sizeof(struct tls12_crypto_info_aes_gcm_256); - if (optlen != optsize) { - rc = -EINVAL; - goto err_crypto_info; - } - rc = copy_from_user(crypto_info + 1, optval + sizeof(*crypto_info), - optlen - sizeof(*crypto_info)); - if (rc) { - rc = -EFAULT; - goto err_crypto_info; - } + optsize = sizeof(struct tls12_crypto_info_aes_gcm_256); break; } + case TLS_CIPHER_AES_CCM_128: + optsize = sizeof(struct tls12_crypto_info_aes_ccm_128); + break; default: rc = -EINVAL; goto err_crypto_info; } + if (optlen != optsize) { + rc = -EINVAL; + goto err_crypto_info; + } + + rc = copy_from_user(crypto_info + 1, optval + sizeof(*crypto_info), + optlen - sizeof(*crypto_info)); + if (rc) { + rc = -EFAULT; + goto err_crypto_info; + } + if (tx) { #ifdef CONFIG_TLS_DEVICE rc = tls_set_device_offload(sk, ctx); diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 425351ac2a9b..f635c103581e 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -42,8 +42,6 @@ #include #include -#define MAX_IV_SIZE TLS_CIPHER_AES_GCM_128_IV_SIZE - static int __skb_nsg(struct sk_buff *skb, int offset, int len, unsigned int recursion_level) { @@ -479,11 +477,18 @@ static int tls_do_encryption(struct sock *sk, struct tls_rec *rec = ctx->open_rec; struct sk_msg *msg_en = &rec->msg_encrypted; struct scatterlist *sge = sk_msg_elem(msg_en, start); - int rc; + int rc, iv_offset = 0; + + /* For CCM based ciphers, first byte of IV is a constant */ + if (prot->cipher_type == TLS_CIPHER_AES_CCM_128) { + rec->iv_data[0] = TLS_AES_CCM_IV_B0_BYTE; + iv_offset = 1; + } + + memcpy(&rec->iv_data[iv_offset], tls_ctx->tx.iv, + prot->iv_size + prot->salt_size); - memcpy(rec->iv_data, tls_ctx->tx.iv, sizeof(rec->iv_data)); - xor_iv_with_seq(prot->version, rec->iv_data, - tls_ctx->tx.rec_seq); + xor_iv_with_seq(prot->version, rec->iv_data, tls_ctx->tx.rec_seq); sge->offset += prot->prepend_size; sge->length -= prot->prepend_size; @@ -1344,6 +1349,7 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb, struct scatterlist *sgout = NULL; const int data_len = rxm->full_len - prot->overhead_size + prot->tail_size; + int iv_offset = 0; if (*zc && (out_iov || out_sg)) { if (out_iov) @@ -1386,18 +1392,25 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb, aad = (u8 *)(sgout + n_sgout); iv = aad + prot->aad_size; + /* For CCM based ciphers, first byte of nonce+iv is always '2' */ + if (prot->cipher_type == TLS_CIPHER_AES_CCM_128) { + iv[0] = 2; + iv_offset = 1; + } + /* Prepare IV */ err = skb_copy_bits(skb, rxm->offset + TLS_HEADER_SIZE, - iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, + iv + iv_offset + prot->salt_size, prot->iv_size); if (err < 0) { kfree(mem); return err; } if (prot->version == TLS_1_3_VERSION) - memcpy(iv, tls_ctx->rx.iv, crypto_aead_ivsize(ctx->aead_recv)); + memcpy(iv + iv_offset, tls_ctx->rx.iv, + crypto_aead_ivsize(ctx->aead_recv)); else - memcpy(iv, tls_ctx->rx.iv, TLS_CIPHER_AES_GCM_128_SALT_SIZE); + memcpy(iv + iv_offset, tls_ctx->rx.iv, prot->salt_size); xor_iv_with_seq(prot->version, iv, tls_ctx->rx.rec_seq); @@ -2152,14 +2165,15 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) struct tls_crypto_info *crypto_info; struct tls12_crypto_info_aes_gcm_128 *gcm_128_info; struct tls12_crypto_info_aes_gcm_256 *gcm_256_info; + struct tls12_crypto_info_aes_ccm_128 *ccm_128_info; struct tls_sw_context_tx *sw_ctx_tx = NULL; struct tls_sw_context_rx *sw_ctx_rx = NULL; struct cipher_context *cctx; struct crypto_aead **aead; struct strp_callbacks cb; - u16 nonce_size, tag_size, iv_size, rec_seq_size; + u16 nonce_size, tag_size, iv_size, rec_seq_size, salt_size; struct crypto_tfm *tfm; - char *iv, *rec_seq, *key, *salt; + char *iv, *rec_seq, *key, *salt, *cipher_name; size_t keysize; int rc = 0; @@ -2224,6 +2238,8 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) keysize = TLS_CIPHER_AES_GCM_128_KEY_SIZE; key = gcm_128_info->key; salt = gcm_128_info->salt; + salt_size = TLS_CIPHER_AES_GCM_128_SALT_SIZE; + cipher_name = "gcm(aes)"; break; } case TLS_CIPHER_AES_GCM_256: { @@ -2239,6 +2255,25 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) keysize = TLS_CIPHER_AES_GCM_256_KEY_SIZE; key = gcm_256_info->key; salt = gcm_256_info->salt; + salt_size = TLS_CIPHER_AES_GCM_256_SALT_SIZE; + cipher_name = "gcm(aes)"; + break; + } + case TLS_CIPHER_AES_CCM_128: { + nonce_size = TLS_CIPHER_AES_CCM_128_IV_SIZE; + tag_size = TLS_CIPHER_AES_CCM_128_TAG_SIZE; + iv_size = TLS_CIPHER_AES_CCM_128_IV_SIZE; + iv = ((struct tls12_crypto_info_aes_ccm_128 *)crypto_info)->iv; + rec_seq_size = TLS_CIPHER_AES_CCM_128_REC_SEQ_SIZE; + rec_seq = + ((struct tls12_crypto_info_aes_ccm_128 *)crypto_info)->rec_seq; + ccm_128_info = + (struct tls12_crypto_info_aes_ccm_128 *)crypto_info; + keysize = TLS_CIPHER_AES_CCM_128_KEY_SIZE; + key = ccm_128_info->key; + salt = ccm_128_info->salt; + salt_size = TLS_CIPHER_AES_CCM_128_SALT_SIZE; + cipher_name = "ccm(aes)"; break; } default: @@ -2268,16 +2303,16 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) prot->overhead_size = prot->prepend_size + prot->tag_size + prot->tail_size; prot->iv_size = iv_size; - cctx->iv = kmalloc(iv_size + TLS_CIPHER_AES_GCM_128_SALT_SIZE, - GFP_KERNEL); + prot->salt_size = salt_size; + cctx->iv = kmalloc(iv_size + salt_size, GFP_KERNEL); if (!cctx->iv) { rc = -ENOMEM; goto free_priv; } /* Note: 128 & 256 bit salt are the same size */ - memcpy(cctx->iv, salt, TLS_CIPHER_AES_GCM_128_SALT_SIZE); - memcpy(cctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv, iv_size); prot->rec_seq_size = rec_seq_size; + memcpy(cctx->iv, salt, salt_size); + memcpy(cctx->iv + salt_size, iv, iv_size); cctx->rec_seq = kmemdup(rec_seq, rec_seq_size, GFP_KERNEL); if (!cctx->rec_seq) { rc = -ENOMEM; @@ -2285,7 +2320,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) } if (!*aead) { - *aead = crypto_alloc_aead("gcm(aes)", 0, 0); + *aead = crypto_alloc_aead(cipher_name, 0, 0); if (IS_ERR(*aead)) { rc = PTR_ERR(*aead); *aead = NULL; -- cgit v1.2.3 From 0c3e0e3bb623c3735b8c9ab8aa8332f944f83a9f Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Wed, 20 Mar 2019 12:16:42 +0300 Subject: tun: Add ioctl() TUNGETDEVNETNS cmd to allow obtaining real net ns of tun device In commit f2780d6d7475 "tun: Add ioctl() SIOCGSKNS cmd to allow obtaining net ns of tun device" it was missed that tun may change its net ns, while net ns of socket remains the same as it was created initially. SIOCGSKNS returns net ns of socket, so it is not suitable for obtaining net ns of device. We may have two tun devices with the same names in two net ns, and in this case it's not possible to determ, which of them fd refers to (TUNGETIFF will return the same name). This patch adds new ioctl() cmd for obtaining net ns of a device. Reported-by: Harald Albrecht Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- drivers/net/tun.c | 8 ++++++++ include/uapi/linux/if_tun.h | 1 + 2 files changed, 9 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index ef3b65514976..60d0a4e60ec5 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -3102,6 +3102,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, tun_debug(KERN_INFO, tun, "tun_chr_ioctl cmd %u\n", cmd); + net = dev_net(tun->dev); ret = 0; switch (cmd) { case TUNGETIFF: @@ -3327,6 +3328,13 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, ret = tun_net_change_carrier(tun->dev, (bool)carrier); break; + case TUNGETDEVNETNS: + ret = -EPERM; + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + goto unlock; + ret = open_related_ns(&net->ns, get_net_ns); + break; + default: ret = -EINVAL; break; diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h index 23a6753b37df..454ae31b93c7 100644 --- a/include/uapi/linux/if_tun.h +++ b/include/uapi/linux/if_tun.h @@ -60,6 +60,7 @@ #define TUNSETSTEERINGEBPF _IOR('T', 224, int) #define TUNSETFILTEREBPF _IOR('T', 225, int) #define TUNSETCARRIER _IOW('T', 226, int) +#define TUNGETDEVNETNS _IO('T', 227) /* TUNSETIFF ifr flags */ #define IFF_TUN 0x0001 -- cgit v1.2.3 From edbf8c01de5a104a71ed6df2bf6421ceb2836a8e Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Fri, 22 Mar 2019 09:54:01 +0800 Subject: bpf: add skc_lookup_tcp helper Allow looking up a sock_common. This gives eBPF programs access to timewait and request sockets. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 20 ++++++- kernel/bpf/verifier.c | 3 +- net/core/filter.c | 144 +++++++++++++++++++++++++++++++++++++++-------- 3 files changed, 143 insertions(+), 24 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 929c8e537a14..fab05317f5e7 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2431,6 +2431,23 @@ union bpf_attr { * Return * A **struct bpf_sock** pointer on success, or **NULL** in * case of failure. + * + * struct bpf_sock *bpf_skc_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) + * Description + * Look for TCP socket matching *tuple*, optionally in a child + * network namespace *netns*. The return value must be checked, + * and if non-**NULL**, released via **bpf_sk_release**\ (). + * + * This function is identical to bpf_sk_lookup_tcp, except that it + * also returns timewait or request sockets. Use bpf_sk_fullsock + * or bpf_tcp_socket to access the full structure. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_NET** configuration option. + * Return + * Pointer to **struct bpf_sock**, or **NULL** in case of failure. + * For sockets with reuseport option, the **struct bpf_sock** + * result is from **reuse->socks**\ [] using the hash of the tuple. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2531,7 +2548,8 @@ union bpf_attr { FN(sk_fullsock), \ FN(tcp_sock), \ FN(skb_ecn_set_ce), \ - FN(get_listener_sock), + FN(get_listener_sock), \ + FN(skc_lookup_tcp), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a476e13201d6..dffeec3706ce 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -369,7 +369,8 @@ static bool is_release_function(enum bpf_func_id func_id) static bool is_acquire_function(enum bpf_func_id func_id) { return func_id == BPF_FUNC_sk_lookup_tcp || - func_id == BPF_FUNC_sk_lookup_udp; + func_id == BPF_FUNC_sk_lookup_udp || + func_id == BPF_FUNC_skc_lookup_tcp; } static bool is_ptr_cast_function(enum bpf_func_id func_id) diff --git a/net/core/filter.c b/net/core/filter.c index 647c63a7b25b..b6d83ba97621 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5156,15 +5156,15 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, return sk; } -/* bpf_sk_lookup performs the core lookup for different types of sockets, +/* bpf_skc_lookup performs the core lookup for different types of sockets, * taking a reference on the socket if it doesn't have the flag SOCK_RCU_FREE. * Returns the socket as an 'unsigned long' to simplify the casting in the * callers to satisfy BPF_CALL declarations. */ -static unsigned long -__bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, - struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id, - u64 flags) +static struct sock * +__bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, + struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id, + u64 flags) { struct sock *sk = NULL; u8 family = AF_UNSPEC; @@ -5192,15 +5192,27 @@ __bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, put_net(net); } +out: + return sk; +} + +static struct sock * +__bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, + struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id, + u64 flags) +{ + struct sock *sk = __bpf_skc_lookup(skb, tuple, len, caller_net, + ifindex, proto, netns_id, flags); + if (sk) sk = sk_to_full_sk(sk); -out: - return (unsigned long) sk; + + return sk; } -static unsigned long -bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, - u8 proto, u64 netns_id, u64 flags) +static struct sock * +bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, + u8 proto, u64 netns_id, u64 flags) { struct net *caller_net; int ifindex; @@ -5213,14 +5225,47 @@ bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, ifindex = 0; } - return __bpf_sk_lookup(skb, tuple, len, caller_net, ifindex, - proto, netns_id, flags); + return __bpf_skc_lookup(skb, tuple, len, caller_net, ifindex, proto, + netns_id, flags); } +static struct sock * +bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, + u8 proto, u64 netns_id, u64 flags) +{ + struct sock *sk = bpf_skc_lookup(skb, tuple, len, proto, netns_id, + flags); + + if (sk) + sk = sk_to_full_sk(sk); + + return sk; +} + +BPF_CALL_5(bpf_skc_lookup_tcp, struct sk_buff *, skb, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + return (unsigned long)bpf_skc_lookup(skb, tuple, len, IPPROTO_TCP, + netns_id, flags); +} + +static const struct bpf_func_proto bpf_skc_lookup_tcp_proto = { + .func = bpf_skc_lookup_tcp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + BPF_CALL_5(bpf_sk_lookup_tcp, struct sk_buff *, skb, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { - return bpf_sk_lookup(skb, tuple, len, IPPROTO_TCP, netns_id, flags); + return (unsigned long)bpf_sk_lookup(skb, tuple, len, IPPROTO_TCP, + netns_id, flags); } static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = { @@ -5238,7 +5283,8 @@ static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = { BPF_CALL_5(bpf_sk_lookup_udp, struct sk_buff *, skb, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { - return bpf_sk_lookup(skb, tuple, len, IPPROTO_UDP, netns_id, flags); + return (unsigned long)bpf_sk_lookup(skb, tuple, len, IPPROTO_UDP, + netns_id, flags); } static const struct bpf_func_proto bpf_sk_lookup_udp_proto = { @@ -5273,8 +5319,9 @@ BPF_CALL_5(bpf_xdp_sk_lookup_udp, struct xdp_buff *, ctx, struct net *caller_net = dev_net(ctx->rxq->dev); int ifindex = ctx->rxq->dev->ifindex; - return __bpf_sk_lookup(NULL, tuple, len, caller_net, ifindex, - IPPROTO_UDP, netns_id, flags); + return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, caller_net, + ifindex, IPPROTO_UDP, netns_id, + flags); } static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = { @@ -5289,14 +5336,38 @@ static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = { .arg5_type = ARG_ANYTHING, }; +BPF_CALL_5(bpf_xdp_skc_lookup_tcp, struct xdp_buff *, ctx, + struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags) +{ + struct net *caller_net = dev_net(ctx->rxq->dev); + int ifindex = ctx->rxq->dev->ifindex; + + return (unsigned long)__bpf_skc_lookup(NULL, tuple, len, caller_net, + ifindex, IPPROTO_TCP, netns_id, + flags); +} + +static const struct bpf_func_proto bpf_xdp_skc_lookup_tcp_proto = { + .func = bpf_xdp_skc_lookup_tcp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + BPF_CALL_5(bpf_xdp_sk_lookup_tcp, struct xdp_buff *, ctx, struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags) { struct net *caller_net = dev_net(ctx->rxq->dev); int ifindex = ctx->rxq->dev->ifindex; - return __bpf_sk_lookup(NULL, tuple, len, caller_net, ifindex, - IPPROTO_TCP, netns_id, flags); + return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, caller_net, + ifindex, IPPROTO_TCP, netns_id, + flags); } static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = { @@ -5311,11 +5382,31 @@ static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = { .arg5_type = ARG_ANYTHING, }; +BPF_CALL_5(bpf_sock_addr_skc_lookup_tcp, struct bpf_sock_addr_kern *, ctx, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + return (unsigned long)__bpf_skc_lookup(NULL, tuple, len, + sock_net(ctx->sk), 0, + IPPROTO_TCP, netns_id, flags); +} + +static const struct bpf_func_proto bpf_sock_addr_skc_lookup_tcp_proto = { + .func = bpf_sock_addr_skc_lookup_tcp, + .gpl_only = false, + .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + BPF_CALL_5(bpf_sock_addr_sk_lookup_tcp, struct bpf_sock_addr_kern *, ctx, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { - return __bpf_sk_lookup(NULL, tuple, len, sock_net(ctx->sk), 0, - IPPROTO_TCP, netns_id, flags); + return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, + sock_net(ctx->sk), 0, IPPROTO_TCP, + netns_id, flags); } static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = { @@ -5332,8 +5423,9 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = { BPF_CALL_5(bpf_sock_addr_sk_lookup_udp, struct bpf_sock_addr_kern *, ctx, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { - return __bpf_sk_lookup(NULL, tuple, len, sock_net(ctx->sk), 0, - IPPROTO_UDP, netns_id, flags); + return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, + sock_net(ctx->sk), 0, IPPROTO_UDP, + netns_id, flags); } static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = { @@ -5586,6 +5678,8 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sock_addr_sk_lookup_udp_proto; case BPF_FUNC_sk_release: return &bpf_sk_release_proto; + case BPF_FUNC_skc_lookup_tcp: + return &bpf_sock_addr_skc_lookup_tcp_proto; #endif /* CONFIG_INET */ default: return bpf_base_func_proto(func_id); @@ -5719,6 +5813,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_tcp_sock_proto; case BPF_FUNC_get_listener_sock: return &bpf_get_listener_sock_proto; + case BPF_FUNC_skc_lookup_tcp: + return &bpf_skc_lookup_tcp_proto; #endif default: return bpf_base_func_proto(func_id); @@ -5754,6 +5850,8 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_xdp_sk_lookup_tcp_proto; case BPF_FUNC_sk_release: return &bpf_sk_release_proto; + case BPF_FUNC_skc_lookup_tcp: + return &bpf_xdp_skc_lookup_tcp_proto; #endif default: return bpf_base_func_proto(func_id); @@ -5846,6 +5944,8 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_lookup_udp_proto; case BPF_FUNC_sk_release: return &bpf_sk_release_proto; + case BPF_FUNC_skc_lookup_tcp: + return &bpf_skc_lookup_tcp_proto; #endif default: return bpf_base_func_proto(func_id); -- cgit v1.2.3 From 399040847084a69f345e0a52fd62f04654e0fce3 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Fri, 22 Mar 2019 09:54:02 +0800 Subject: bpf: add helper to check for a valid SYN cookie Using bpf_skc_lookup_tcp it's possible to ascertain whether a packet belongs to a known connection. However, there is one corner case: no sockets are created if SYN cookies are active. This means that the final ACK in the 3WHS is misclassified. Using the helper, we can look up the listening socket via bpf_skc_lookup_tcp and then check whether a packet is a valid SYN cookie ACK. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 18 +++++++++++- net/core/filter.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index fab05317f5e7..3c04410137d9 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2448,6 +2448,21 @@ union bpf_attr { * Pointer to **struct bpf_sock**, or **NULL** in case of failure. * For sockets with reuseport option, the **struct bpf_sock** * result is from **reuse->socks**\ [] using the hash of the tuple. + * + * int bpf_tcp_check_syncookie(struct bpf_sock *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) + * Description + * Check whether iph and th contain a valid SYN cookie ACK for + * the listening socket in sk. + * + * iph points to the start of the IPv4 or IPv6 header, while + * iph_len contains sizeof(struct iphdr) or sizeof(struct ip6hdr). + * + * th points to the start of the TCP header, while th_len contains + * sizeof(struct tcphdr). + * + * Return + * 0 if iph and th are a valid SYN cookie ACK, or a negative error + * otherwise. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2549,7 +2564,8 @@ union bpf_attr { FN(tcp_sock), \ FN(skb_ecn_set_ce), \ FN(get_listener_sock), \ - FN(skc_lookup_tcp), + FN(skc_lookup_tcp), \ + FN(tcp_check_syncookie), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/net/core/filter.c b/net/core/filter.c index b6d83ba97621..d2511fe46db3 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5553,6 +5553,74 @@ static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; + +BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len, + struct tcphdr *, th, u32, th_len) +{ +#ifdef CONFIG_SYN_COOKIES + u32 cookie; + int ret; + + if (unlikely(th_len < sizeof(*th))) + return -EINVAL; + + /* sk_listener() allows TCP_NEW_SYN_RECV, which makes no sense here. */ + if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN) + return -EINVAL; + + if (!sock_net(sk)->ipv4.sysctl_tcp_syncookies) + return -EINVAL; + + if (!th->ack || th->rst || th->syn) + return -ENOENT; + + if (tcp_synq_no_recent_overflow(sk)) + return -ENOENT; + + cookie = ntohl(th->ack_seq) - 1; + + switch (sk->sk_family) { + case AF_INET: + if (unlikely(iph_len < sizeof(struct iphdr))) + return -EINVAL; + + ret = __cookie_v4_check((struct iphdr *)iph, th, cookie); + break; + +#if IS_BUILTIN(CONFIG_IPV6) + case AF_INET6: + if (unlikely(iph_len < sizeof(struct ipv6hdr))) + return -EINVAL; + + ret = __cookie_v6_check((struct ipv6hdr *)iph, th, cookie); + break; +#endif /* CONFIG_IPV6 */ + + default: + return -EPROTONOSUPPORT; + } + + if (ret > 0) + return 0; + + return -ENOENT; +#else + return -ENOTSUPP; +#endif +} + +static const struct bpf_func_proto bpf_tcp_check_syncookie_proto = { + .func = bpf_tcp_check_syncookie, + .gpl_only = true, + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_SOCK_COMMON, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + #endif /* CONFIG_INET */ bool bpf_helper_changes_pkt_data(void *func) @@ -5815,6 +5883,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_listener_sock_proto; case BPF_FUNC_skc_lookup_tcp: return &bpf_skc_lookup_tcp_proto; + case BPF_FUNC_tcp_check_syncookie: + return &bpf_tcp_check_syncookie_proto; #endif default: return bpf_base_func_proto(func_id); @@ -5852,6 +5922,8 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_release_proto; case BPF_FUNC_skc_lookup_tcp: return &bpf_xdp_skc_lookup_tcp_proto; + case BPF_FUNC_tcp_check_syncookie: + return &bpf_tcp_check_syncookie_proto; #endif default: return bpf_base_func_proto(func_id); -- cgit v1.2.3 From 14aa31929b724b70fb63a9b0e7877da325b25cfe Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 22 Mar 2019 14:32:54 -0400 Subject: bpf: add bpf_skb_adjust_room mode BPF_ADJ_ROOM_MAC bpf_skb_adjust_room net allows inserting room in an skb. Existing mode BPF_ADJ_ROOM_NET inserts room after the network header by pulling the skb, moving the network header forward and zeroing the new space. Add new mode BPF_ADJUST_ROOM_MAC that inserts room after the mac header. This allows inserting tunnel headers in front of the network header without having to recreate the network header in the original space, avoiding two copies. Signed-off-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 6 +++++- net/core/filter.c | 38 ++++++++++++++++++++------------------ 2 files changed, 25 insertions(+), 19 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3c04410137d9..7c8fd0647070 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1478,7 +1478,10 @@ union bpf_attr { * Grow or shrink the room for data in the packet associated to * *skb* by *len_diff*, and according to the selected *mode*. * - * There is a single supported mode at this time: + * There are two supported modes at this time: + * + * * **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer + * (room space is added or removed below the layer 2 header). * * * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer * (room space is added or removed below the layer 3 header). @@ -2627,6 +2630,7 @@ enum bpf_func_id { /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, + BPF_ADJ_ROOM_MAC, }; /* Mode for BPF_FUNC_skb_load_bytes_relative helper. */ diff --git a/net/core/filter.c b/net/core/filter.c index d21e1acdde29..e7b7720b18e9 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2963,9 +2963,8 @@ static u32 bpf_skb_net_base_len(const struct sk_buff *skb) } } -static int bpf_skb_net_grow(struct sk_buff *skb, u32 len_diff) +static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff) { - u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb); int ret; if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) @@ -2992,9 +2991,8 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 len_diff) return 0; } -static int bpf_skb_net_shrink(struct sk_buff *skb, u32 len_diff) +static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff) { - u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb); int ret; if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) @@ -3027,7 +3025,8 @@ static u32 __bpf_skb_max_len(const struct sk_buff *skb) SKB_MAX_ALLOC; } -static int bpf_skb_adjust_net(struct sk_buff *skb, s32 len_diff) +BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, + u32, mode, u64, flags) { bool trans_same = skb->transport_header == skb->network_header; u32 len_cur, len_diff_abs = abs(len_diff); @@ -3035,14 +3034,28 @@ static int bpf_skb_adjust_net(struct sk_buff *skb, s32 len_diff) u32 len_max = __bpf_skb_max_len(skb); __be16 proto = skb->protocol; bool shrink = len_diff < 0; + u32 off; int ret; + if (unlikely(flags)) + return -EINVAL; if (unlikely(len_diff_abs > 0xfffU)) return -EFAULT; if (unlikely(proto != htons(ETH_P_IP) && proto != htons(ETH_P_IPV6))) return -ENOTSUPP; + off = skb_mac_header_len(skb); + switch (mode) { + case BPF_ADJ_ROOM_NET: + off += bpf_skb_net_base_len(skb); + break; + case BPF_ADJ_ROOM_MAC: + break; + default: + return -ENOTSUPP; + } + len_cur = skb->len - skb_network_offset(skb); if (skb_transport_header_was_set(skb) && !trans_same) len_cur = skb_network_header_len(skb); @@ -3052,24 +3065,13 @@ static int bpf_skb_adjust_net(struct sk_buff *skb, s32 len_diff) !skb_is_gso(skb)))) return -ENOTSUPP; - ret = shrink ? bpf_skb_net_shrink(skb, len_diff_abs) : - bpf_skb_net_grow(skb, len_diff_abs); + ret = shrink ? bpf_skb_net_shrink(skb, off, len_diff_abs) : + bpf_skb_net_grow(skb, off, len_diff_abs); bpf_compute_data_pointers(skb); return ret; } -BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, - u32, mode, u64, flags) -{ - if (unlikely(flags)) - return -EINVAL; - if (likely(mode == BPF_ADJ_ROOM_NET)) - return bpf_skb_adjust_net(skb, len_diff); - - return -ENOTSUPP; -} - static const struct bpf_func_proto bpf_skb_adjust_room_proto = { .func = bpf_skb_adjust_room, .gpl_only = false, -- cgit v1.2.3 From 2278f6cc151a8bef6ba0b3fe3009d14dc3c51c4a Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 22 Mar 2019 14:32:55 -0400 Subject: bpf: add bpf_skb_adjust_room flag BPF_F_ADJ_ROOM_FIXED_GSO bpf_skb_adjust_room adjusts gso_size of gso packets to account for the pushed or popped header room. This is not allowed with UDP, where gso_size delineates datagrams. Add an option to avoid these updates and allow this call for datagrams. It can also be used with TCP, when MSS is known to allow headroom, e.g., through MSS clamping or route MTU. Changes v1->v2: - document flag BPF_F_ADJ_ROOM_FIXED_GSO - do not expose BPF_F_ADJ_ROOM_MASK through uapi, as it may change. Link: https://patchwork.ozlabs.org/patch/1052497/ Signed-off-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 9 +++++++-- net/core/filter.c | 38 +++++++++++++++++++++++++++----------- 2 files changed, 34 insertions(+), 13 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7c8fd0647070..4f157d0ec571 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1486,8 +1486,10 @@ union bpf_attr { * * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer * (room space is added or removed below the layer 3 header). * - * All values for *flags* are reserved for future usage, and must - * be left at zero. + * There is one supported flag at this time: + * + * * **BPF_F_ADJ_ROOM_FIXED_GSO**: Do not adjust gso_size. + * Adjusting mss in this way is not allowed for datagrams. * * A call to this helper is susceptible to change the underlaying * packet buffer. Therefore, at load time, all checks on pointers @@ -2627,6 +2629,9 @@ enum bpf_func_id { /* Current network namespace */ #define BPF_F_CURRENT_NETNS (-1L) +/* BPF_FUNC_skb_adjust_room flags. */ +#define BPF_F_ADJ_ROOM_FIXED_GSO (1ULL << 0) + /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, diff --git a/net/core/filter.c b/net/core/filter.c index e7b7720b18e9..d3240a0a0eeb 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2963,12 +2963,19 @@ static u32 bpf_skb_net_base_len(const struct sk_buff *skb) } } -static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff) +#define BPF_F_ADJ_ROOM_MASK (BPF_F_ADJ_ROOM_FIXED_GSO) + +static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, + u64 flags) { int ret; - if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) - return -ENOTSUPP; + if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) { + /* udp gso_size delineates datagrams, only allow if fixed */ + if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) || + !(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) + return -ENOTSUPP; + } ret = skb_cow_head(skb, len_diff); if (unlikely(ret < 0)) @@ -2982,7 +2989,9 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff) struct skb_shared_info *shinfo = skb_shinfo(skb); /* Due to header grow, MSS needs to be downgraded. */ - skb_decrease_gso_size(shinfo, len_diff); + if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) + skb_decrease_gso_size(shinfo, len_diff); + /* Header must be checked, and gso_segs recomputed. */ shinfo->gso_type |= SKB_GSO_DODGY; shinfo->gso_segs = 0; @@ -2991,12 +3000,17 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff) return 0; } -static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff) +static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff, + u64 flags) { int ret; - if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) - return -ENOTSUPP; + if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) { + /* udp gso_size delineates datagrams, only allow if fixed */ + if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) || + !(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) + return -ENOTSUPP; + } ret = skb_unclone(skb, GFP_ATOMIC); if (unlikely(ret < 0)) @@ -3010,7 +3024,9 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff) struct skb_shared_info *shinfo = skb_shinfo(skb); /* Due to header shrink, MSS can be upgraded. */ - skb_increase_gso_size(shinfo, len_diff); + if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) + skb_increase_gso_size(shinfo, len_diff); + /* Header must be checked, and gso_segs recomputed. */ shinfo->gso_type |= SKB_GSO_DODGY; shinfo->gso_segs = 0; @@ -3037,7 +3053,7 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, u32 off; int ret; - if (unlikely(flags)) + if (unlikely(flags & ~BPF_F_ADJ_ROOM_MASK)) return -EINVAL; if (unlikely(len_diff_abs > 0xfffU)) return -EFAULT; @@ -3065,8 +3081,8 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, !skb_is_gso(skb)))) return -ENOTSUPP; - ret = shrink ? bpf_skb_net_shrink(skb, off, len_diff_abs) : - bpf_skb_net_grow(skb, off, len_diff_abs); + ret = shrink ? bpf_skb_net_shrink(skb, off, len_diff_abs, flags) : + bpf_skb_net_grow(skb, off, len_diff_abs, flags); bpf_compute_data_pointers(skb); return ret; -- cgit v1.2.3 From 868d523535c2d00b696753ece606e641a816e91e Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 22 Mar 2019 14:32:56 -0400 Subject: bpf: add bpf_skb_adjust_room encap flags When pushing tunnel headers, annotate skbs in the same way as tunnel devices. For GSO packets, the network stack requires certain fields set to segment packets with tunnel headers. gro_gse_segment depends on transport and inner mac header, for instance. Add an option to pass this information. Remove the restriction on len_diff to network header length, which is too short, e.g., for GRE protocols. Changes v1->v2: - document new flags - BPF_F_ADJ_ROOM_MASK moved v2->v3: - BPF_F_ADJ_ROOM_ENCAP_L3_MASK moved Signed-off-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 16 +++++++++++- net/core/filter.c | 66 ++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 76 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4f157d0ec571..837024512baf 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1486,11 +1486,20 @@ union bpf_attr { * * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer * (room space is added or removed below the layer 3 header). * - * There is one supported flag at this time: + * The following flags are supported at this time: * * * **BPF_F_ADJ_ROOM_FIXED_GSO**: Do not adjust gso_size. * Adjusting mss in this way is not allowed for datagrams. * + * * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 **: + * * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 **: + * Any new space is reserved to hold a tunnel header. + * Configure skb offsets and other fields accordingly. + * + * * **BPF_F_ADJ_ROOM_ENCAP_L4_GRE **: + * * **BPF_F_ADJ_ROOM_ENCAP_L4_UDP **: + * Use with ENCAP_L3 flags to further specify the tunnel type. + * * A call to this helper is susceptible to change the underlaying * packet buffer. Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be @@ -2632,6 +2641,11 @@ enum bpf_func_id { /* BPF_FUNC_skb_adjust_room flags. */ #define BPF_F_ADJ_ROOM_FIXED_GSO (1ULL << 0) +#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 (1ULL << 1) +#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 (1ULL << 2) +#define BPF_F_ADJ_ROOM_ENCAP_L4_GRE (1ULL << 3) +#define BPF_F_ADJ_ROOM_ENCAP_L4_UDP (1ULL << 4) + /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, diff --git a/net/core/filter.c b/net/core/filter.c index d3240a0a0eeb..c1d19b074d6c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2963,11 +2963,20 @@ static u32 bpf_skb_net_base_len(const struct sk_buff *skb) } } -#define BPF_F_ADJ_ROOM_MASK (BPF_F_ADJ_ROOM_FIXED_GSO) +#define BPF_F_ADJ_ROOM_ENCAP_L3_MASK (BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 | \ + BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) + +#define BPF_F_ADJ_ROOM_MASK (BPF_F_ADJ_ROOM_FIXED_GSO | \ + BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \ + BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \ + BPF_F_ADJ_ROOM_ENCAP_L4_UDP) static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, u64 flags) { + bool encap = flags & BPF_F_ADJ_ROOM_ENCAP_L3_MASK; + unsigned int gso_type = SKB_GSO_DODGY; + u16 mac_len, inner_net, inner_trans; int ret; if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) { @@ -2981,10 +2990,60 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, if (unlikely(ret < 0)) return ret; + if (encap) { + if (skb->protocol != htons(ETH_P_IP) && + skb->protocol != htons(ETH_P_IPV6)) + return -ENOTSUPP; + + if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 && + flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) + return -EINVAL; + + if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE && + flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) + return -EINVAL; + + if (skb->encapsulation) + return -EALREADY; + + mac_len = skb->network_header - skb->mac_header; + inner_net = skb->network_header; + inner_trans = skb->transport_header; + } + ret = bpf_skb_net_hdr_push(skb, off, len_diff); if (unlikely(ret < 0)) return ret; + if (encap) { + /* inner mac == inner_net on l3 encap */ + skb->inner_mac_header = inner_net; + skb->inner_network_header = inner_net; + skb->inner_transport_header = inner_trans; + skb_set_inner_protocol(skb, skb->protocol); + + skb->encapsulation = 1; + skb_set_network_header(skb, mac_len); + + if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) + gso_type |= SKB_GSO_UDP_TUNNEL; + else if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE) + gso_type |= SKB_GSO_GRE; + else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) + gso_type |= SKB_GSO_IPXIP6; + else + gso_type |= SKB_GSO_IPXIP4; + + if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE || + flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) { + int nh_len = flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 ? + sizeof(struct ipv6hdr) : + sizeof(struct iphdr); + + skb_set_transport_header(skb, mac_len + nh_len); + } + } + if (skb_is_gso(skb)) { struct skb_shared_info *shinfo = skb_shinfo(skb); @@ -2993,7 +3052,7 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, skb_decrease_gso_size(shinfo, len_diff); /* Header must be checked, and gso_segs recomputed. */ - shinfo->gso_type |= SKB_GSO_DODGY; + shinfo->gso_type |= gso_type; shinfo->gso_segs = 0; } @@ -3044,7 +3103,6 @@ static u32 __bpf_skb_max_len(const struct sk_buff *skb) BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, u32, mode, u64, flags) { - bool trans_same = skb->transport_header == skb->network_header; u32 len_cur, len_diff_abs = abs(len_diff); u32 len_min = bpf_skb_net_base_len(skb); u32 len_max = __bpf_skb_max_len(skb); @@ -3073,8 +3131,6 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, } len_cur = skb->len - skb_network_offset(skb); - if (skb_transport_header_was_set(skb) && !trans_same) - len_cur = skb_network_header_len(skb); if ((shrink && (len_diff_abs >= len_cur || len_cur - len_diff_abs < len_min)) || (!shrink && (skb->len + len_diff_abs > len_max && -- cgit v1.2.3 From 576fd2f7cac3daa36025f0039f9e7cb75b4b4ae0 Mon Sep 17 00:00:00 2001 From: Soheil Hassas Yeganeh Date: Fri, 22 Mar 2019 10:59:47 -0400 Subject: tcp: add documentation for tcp_ca_state Add documentation to the tcp_ca_state enum, since this enum is exposed in uapi. Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Cc: Sowmini Varadhan Acked-by: Sowmini Varadhan Signed-off-by: David S. Miller --- include/uapi/linux/tcp.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 8bb6cc5f3235..b521464ea962 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -160,15 +160,42 @@ enum { #define TCPI_OPT_ECN_SEEN 16 /* we received at least one packet with ECT */ #define TCPI_OPT_SYN_DATA 32 /* SYN-ACK acked data in SYN sent or rcvd */ +/* + * Sender's congestion state indicating normal or abnormal situations + * in the last round of packets sent. The state is driven by the ACK + * information and timer events. + */ enum tcp_ca_state { + /* + * Nothing bad has been observed recently. + * No apparent reordering, packet loss, or ECN marks. + */ TCP_CA_Open = 0, #define TCPF_CA_Open (1< Date: Sun, 3 Mar 2019 15:52:07 +0100 Subject: batman-adv: Drop license boilerplate All files got a SPDX-License-Identifier with commit 7db7d9f369a4 ("batman-adv: Add SPDX license identifier above copyright header"). All the required information about the license conditions can be found in LICENSES/. Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- include/uapi/linux/batadv_packet.h | 12 ------------ include/uapi/linux/batman_adv.h | 18 ------------------ net/batman-adv/Kconfig | 12 ------------ net/batman-adv/Makefile | 13 ------------- net/batman-adv/bat_algo.c | 12 ------------ net/batman-adv/bat_algo.h | 12 ------------ net/batman-adv/bat_iv_ogm.c | 12 ------------ net/batman-adv/bat_iv_ogm.h | 12 ------------ net/batman-adv/bat_v.c | 12 ------------ net/batman-adv/bat_v.h | 12 ------------ net/batman-adv/bat_v_elp.c | 12 ------------ net/batman-adv/bat_v_elp.h | 12 ------------ net/batman-adv/bat_v_ogm.c | 12 ------------ net/batman-adv/bat_v_ogm.h | 12 ------------ net/batman-adv/bitarray.c | 12 ------------ net/batman-adv/bitarray.h | 12 ------------ net/batman-adv/bridge_loop_avoidance.c | 12 ------------ net/batman-adv/bridge_loop_avoidance.h | 12 ------------ net/batman-adv/debugfs.c | 12 ------------ net/batman-adv/debugfs.h | 12 ------------ net/batman-adv/distributed-arp-table.c | 12 ------------ net/batman-adv/distributed-arp-table.h | 12 ------------ net/batman-adv/fragmentation.c | 12 ------------ net/batman-adv/fragmentation.h | 12 ------------ net/batman-adv/gateway_client.c | 12 ------------ net/batman-adv/gateway_client.h | 12 ------------ net/batman-adv/gateway_common.c | 12 ------------ net/batman-adv/gateway_common.h | 12 ------------ net/batman-adv/hard-interface.c | 12 ------------ net/batman-adv/hard-interface.h | 12 ------------ net/batman-adv/hash.c | 12 ------------ net/batman-adv/hash.h | 12 ------------ net/batman-adv/icmp_socket.c | 12 ------------ net/batman-adv/icmp_socket.h | 12 ------------ net/batman-adv/log.c | 12 ------------ net/batman-adv/log.h | 12 ------------ net/batman-adv/main.c | 12 ------------ net/batman-adv/main.h | 12 ------------ net/batman-adv/multicast.c | 12 ------------ net/batman-adv/multicast.h | 12 ------------ net/batman-adv/netlink.c | 12 ------------ net/batman-adv/netlink.h | 12 ------------ net/batman-adv/network-coding.c | 12 ------------ net/batman-adv/network-coding.h | 12 ------------ net/batman-adv/originator.c | 12 ------------ net/batman-adv/originator.h | 12 ------------ net/batman-adv/routing.c | 12 ------------ net/batman-adv/routing.h | 12 ------------ net/batman-adv/send.c | 12 ------------ net/batman-adv/send.h | 12 ------------ net/batman-adv/soft-interface.c | 12 ------------ net/batman-adv/soft-interface.h | 12 ------------ net/batman-adv/sysfs.c | 12 ------------ net/batman-adv/sysfs.h | 12 ------------ net/batman-adv/tp_meter.c | 12 ------------ net/batman-adv/tp_meter.h | 12 ------------ net/batman-adv/trace.c | 12 ------------ net/batman-adv/trace.h | 12 ------------ net/batman-adv/translation-table.c | 12 ------------ net/batman-adv/translation-table.h | 12 ------------ net/batman-adv/tvlv.c | 12 ------------ net/batman-adv/tvlv.h | 12 ------------ net/batman-adv/types.h | 12 ------------ 63 files changed, 763 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/batadv_packet.h b/include/uapi/linux/batadv_packet.h index c99336f4eefe..4ebc2135e950 100644 --- a/include/uapi/linux/batadv_packet.h +++ b/include/uapi/linux/batadv_packet.h @@ -2,18 +2,6 @@ /* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #ifndef _UAPI_LINUX_BATADV_PACKET_H_ diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h index 305bf316dd03..e53f2b5e7ee7 100644 --- a/include/uapi/linux/batman_adv.h +++ b/include/uapi/linux/batman_adv.h @@ -2,24 +2,6 @@ /* Copyright (C) 2016-2019 B.A.T.M.A.N. contributors: * * Matthias Schiffer - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS IN THE SOFTWARE. */ #ifndef _UAPI_LINUX_BATMAN_ADV_H_ diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig index a31db5e9ac8e..17595ec0961a 100644 --- a/net/batman-adv/Kconfig +++ b/net/batman-adv/Kconfig @@ -2,18 +2,6 @@ # Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: # # Marek Lindner, Simon Wunderlich -# -# This program is free software; you can redistribute it and/or -# modify it under the terms of version 2 of the GNU General Public -# License as published by the Free Software Foundation. -# -# This program is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, see . # # B.A.T.M.A.N meshing protocol diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile index a887ecc3efa1..1bf7acfea17a 100644 --- a/net/batman-adv/Makefile +++ b/net/batman-adv/Makefile @@ -2,19 +2,6 @@ # Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: # # Marek Lindner, Simon Wunderlich -# -# This program is free software; you can redistribute it and/or -# modify it under the terms of version 2 of the GNU General Public -# License as published by the Free Software Foundation. -# -# This program is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, see . -# obj-$(CONFIG_BATMAN_ADV) += batman-adv.o batman-adv-y += bat_algo.o diff --git a/net/batman-adv/bat_algo.c b/net/batman-adv/bat_algo.c index 7b7e15641fef..fa39eaaab9d7 100644 --- a/net/batman-adv/bat_algo.c +++ b/net/batman-adv/bat_algo.c @@ -2,18 +2,6 @@ /* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #include "main.h" diff --git a/net/batman-adv/bat_algo.h b/net/batman-adv/bat_algo.h index 25e7bb51928c..cb7d57d16c9d 100644 --- a/net/batman-adv/bat_algo.h +++ b/net/batman-adv/bat_algo.h @@ -2,18 +2,6 @@ /* Copyright (C) 2011-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Linus Lüssing - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #ifndef _NET_BATMAN_ADV_BAT_ALGO_H_ diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index de61091af666..bd4138ddf7e0 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -2,18 +2,6 @@ /* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #include "bat_iv_ogm.h" diff --git a/net/batman-adv/bat_iv_ogm.h b/net/batman-adv/bat_iv_ogm.h index 785f6666273c..c7a9ba305bfc 100644 --- a/net/batman-adv/bat_iv_ogm.h +++ b/net/batman-adv/bat_iv_ogm.h @@ -2,18 +2,6 @@ /* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #ifndef _NET_BATMAN_ADV_BAT_IV_OGM_H_ diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index 445594ed58af..231b4aab4d8d 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -2,18 +2,6 @@ /* Copyright (C) 2013-2019 B.A.T.M.A.N. contributors: * * Linus Lüssing, Marek Lindner - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #include "bat_v.h" diff --git a/net/batman-adv/bat_v.h b/net/batman-adv/bat_v.h index 465a4fc23354..37833db098e6 100644 --- a/net/batman-adv/bat_v.h +++ b/net/batman-adv/bat_v.h @@ -2,18 +2,6 @@ /* Copyright (C) 2011-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Linus Lüssing - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #ifndef _NET_BATMAN_ADV_BAT_V_H_ diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index a9b7919c9de5..13b9ab860a25 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -2,18 +2,6 @@ /* Copyright (C) 2011-2019 B.A.T.M.A.N. contributors: * * Linus Lüssing, Marek Lindner - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #include "bat_v_elp.h" diff --git a/net/batman-adv/bat_v_elp.h b/net/batman-adv/bat_v_elp.h index 75f189ee4a1c..bb3d40f73bfe 100644 --- a/net/batman-adv/bat_v_elp.h +++ b/net/batman-adv/bat_v_elp.h @@ -2,18 +2,6 @@ /* Copyright (C) 2013-2019 B.A.T.M.A.N. contributors: * * Linus Lüssing, Marek Lindner - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #ifndef _NET_BATMAN_ADV_BAT_V_ELP_H_ diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index c9698ad41854..fad95ef64e01 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -2,18 +2,6 @@ /* Copyright (C) 2013-2019 B.A.T.M.A.N. contributors: * * Antonio Quartulli - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #include "bat_v_ogm.h" diff --git a/net/batman-adv/bat_v_ogm.h b/net/batman-adv/bat_v_ogm.h index f67cf7ee06b2..616bf2ea8755 100644 --- a/net/batman-adv/bat_v_ogm.h +++ b/net/batman-adv/bat_v_ogm.h @@ -2,18 +2,6 @@ /* Copyright (C) 2013-2019 B.A.T.M.A.N. contributors: * * Antonio Quartulli - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #ifndef _NET_BATMAN_ADV_BAT_V_OGM_H_ diff --git a/net/batman-adv/bitarray.c b/net/batman-adv/bitarray.c index 63e134e763e3..7f04a6acf14e 100644 --- a/net/batman-adv/bitarray.c +++ b/net/batman-adv/bitarray.c @@ -2,18 +2,6 @@ /* Copyright (C) 2006-2019 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #include "bitarray.h" diff --git a/net/batman-adv/bitarray.h b/net/batman-adv/bitarray.h index f3a05ad9afad..84ad2d2b6ac9 100644 --- a/net/batman-adv/bitarray.h +++ b/net/batman-adv/bitarray.h @@ -2,18 +2,6 @@ /* Copyright (C) 2006-2019 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #ifndef _NET_BATMAN_ADV_BITARRAY_H_ diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index ef39aabdb694..ee92bbc25058 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -2,18 +2,6 @@ /* Copyright (C) 2011-2019 B.A.T.M.A.N. contributors: * * Simon Wunderlich - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #include "bridge_loop_avoidance.h" diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h index 31771c751efb..012d72c8d064 100644 --- a/net/batman-adv/bridge_loop_avoidance.h +++ b/net/batman-adv/bridge_loop_avoidance.h @@ -2,18 +2,6 @@ /* Copyright (C) 2011-2019 B.A.T.M.A.N. contributors: * * Simon Wunderlich - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #ifndef _NET_BATMAN_ADV_BLA_H_ diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c index 3b9d1ad2f467..d38d70ccdd5a 100644 --- a/net/batman-adv/debugfs.c +++ b/net/batman-adv/debugfs.c @@ -2,18 +2,6 @@ /* Copyright (C) 2010-2019 B.A.T.M.A.N. contributors: * * Marek Lindner - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #include "debugfs.h" diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h index c0b8694041ec..7fac680cf740 100644 --- a/net/batman-adv/debugfs.h +++ b/net/batman-adv/debugfs.h @@ -2,18 +2,6 @@ /* Copyright (C) 2010-2019 B.A.T.M.A.N. contributors: * * Marek Lindner - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #ifndef _NET_BATMAN_ADV_DEBUGFS_H_ diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index 310a4f353008..c14faaa32ca4 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -2,18 +2,6 @@ /* Copyright (C) 2011-2019 B.A.T.M.A.N. contributors: * * Antonio Quartulli - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #include "distributed-arp-table.h" diff --git a/net/batman-adv/distributed-arp-table.h b/net/batman-adv/distributed-arp-table.h index 68c0ff321acd..110c27447d70 100644 --- a/net/batman-adv/distributed-arp-table.h +++ b/net/batman-adv/distributed-arp-table.h @@ -2,18 +2,6 @@ /* Copyright (C) 2011-2019 B.A.T.M.A.N. contributors: * * Antonio Quartulli - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #ifndef _NET_BATMAN_ADV_DISTRIBUTED_ARP_TABLE_H_ diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index b506d15b8230..385fccdcf69d 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -2,18 +2,6 @@ /* Copyright (C) 2013-2019 B.A.T.M.A.N. contributors: * * Martin Hundebøll - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #include "fragmentation.h" diff --git a/net/batman-adv/fragmentation.h b/net/batman-adv/fragmentation.h index abdac26579bf..d6074ba2ada7 100644 --- a/net/batman-adv/fragmentation.h +++ b/net/batman-adv/fragmentation.h @@ -2,18 +2,6 @@ /* Copyright (C) 2013-2019 B.A.T.M.A.N. contributors: * * Martin Hundebøll - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #ifndef _NET_BATMAN_ADV_FRAGMENTATION_H_ diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index f5811f61aa92..be63d6706659 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -2,18 +2,6 @@ /* Copyright (C) 2009-2019 B.A.T.M.A.N. contributors: * * Marek Lindner - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #include "gateway_client.h" diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h index b5732c8be81a..0e14026feebd 100644 --- a/net/batman-adv/gateway_client.h +++ b/net/batman-adv/gateway_client.h @@ -2,18 +2,6 @@ /* Copyright (C) 2009-2019 B.A.T.M.A.N. contributors: * * Marek Lindner - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #ifndef _NET_BATMAN_ADV_GATEWAY_CLIENT_H_ diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c index e064de45e22c..dac097f9be03 100644 --- a/net/batman-adv/gateway_common.c +++ b/net/batman-adv/gateway_common.c @@ -2,18 +2,6 @@ /* Copyright (C) 2009-2019 B.A.T.M.A.N. contributors: * * Marek Lindner - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #include "gateway_common.h" diff --git a/net/batman-adv/gateway_common.h b/net/batman-adv/gateway_common.h index 128467a0fb89..5cf50736c635 100644 --- a/net/batman-adv/gateway_common.h +++ b/net/batman-adv/gateway_common.h @@ -2,18 +2,6 @@ /* Copyright (C) 2009-2019 B.A.T.M.A.N. contributors: * * Marek Lindner - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #ifndef _NET_BATMAN_ADV_GATEWAY_COMMON_H_ diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 96ef7c70b4d9..79d1731b8306 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -2,18 +2,6 @@ /* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #include "hard-interface.h" diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h index 48de28c83401..c8ef6aa0e865 100644 --- a/net/batman-adv/hard-interface.h +++ b/net/batman-adv/hard-interface.h @@ -2,18 +2,6 @@ /* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #ifndef _NET_BATMAN_ADV_HARD_INTERFACE_H_ diff --git a/net/batman-adv/hash.c b/net/batman-adv/hash.c index 56a08ce193d5..a9d4e176f4de 100644 --- a/net/batman-adv/hash.c +++ b/net/batman-adv/hash.c @@ -2,18 +2,6 @@ /* Copyright (C) 2006-2019 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #include "hash.h" diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h index 37507b6d4006..ceef171f7f98 100644 --- a/net/batman-adv/hash.h +++ b/net/batman-adv/hash.h @@ -2,18 +2,6 @@ /* Copyright (C) 2006-2019 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #ifndef _NET_BATMAN_ADV_HASH_H_ diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c index 9859ababb82e..de81b5ecad91 100644 --- a/net/batman-adv/icmp_socket.c +++ b/net/batman-adv/icmp_socket.c @@ -2,18 +2,6 @@ /* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #include "icmp_socket.h" diff --git a/net/batman-adv/icmp_socket.h b/net/batman-adv/icmp_socket.h index 5f8926522ff0..35eecbfd2e65 100644 --- a/net/batman-adv/icmp_socket.h +++ b/net/batman-adv/icmp_socket.h @@ -2,18 +2,6 @@ /* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #ifndef _NET_BATMAN_ADV_ICMP_SOCKET_H_ diff --git a/net/batman-adv/log.c b/net/batman-adv/log.c index 3e610df8debf..60ce11e16a90 100644 --- a/net/batman-adv/log.c +++ b/net/batman-adv/log.c @@ -2,18 +2,6 @@ /* Copyright (C) 2010-2019 B.A.T.M.A.N. contributors: * * Marek Lindner - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #include "log.h" diff --git a/net/batman-adv/log.h b/net/batman-adv/log.h index 660e9bcc85a2..5504637e63d8 100644 --- a/net/batman-adv/log.h +++ b/net/batman-adv/log.h @@ -2,18 +2,6 @@ /* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #ifndef _NET_BATMAN_ADV_LOG_H_ diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index 75750870cf04..33b9b38b82da 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -2,18 +2,6 @@ /* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #include "main.h" diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 3ed669d7dc6b..c5de987778d1 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -2,18 +2,6 @@ /* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #ifndef _NET_BATMAN_ADV_MAIN_H_ diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index f91b1b6265cf..4d6e89e04aa0 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -2,18 +2,6 @@ /* Copyright (C) 2014-2019 B.A.T.M.A.N. contributors: * * Linus Lüssing - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #include "multicast.h" diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h index 466013fe88af..34fb922a4566 100644 --- a/net/batman-adv/multicast.h +++ b/net/batman-adv/multicast.h @@ -2,18 +2,6 @@ /* Copyright (C) 2014-2019 B.A.T.M.A.N. contributors: * * Linus Lüssing - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #ifndef _NET_BATMAN_ADV_MULTICAST_H_ diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index 67a58da2e6a0..8e82e656b870 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -2,18 +2,6 @@ /* Copyright (C) 2016-2019 B.A.T.M.A.N. contributors: * * Matthias Schiffer - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #include "netlink.h" diff --git a/net/batman-adv/netlink.h b/net/batman-adv/netlink.h index 7273368544fc..d1e0681b8743 100644 --- a/net/batman-adv/netlink.h +++ b/net/batman-adv/netlink.h @@ -2,18 +2,6 @@ /* Copyright (C) 2016-2019 B.A.T.M.A.N. contributors: * * Matthias Schiffer - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #ifndef _NET_BATMAN_ADV_NETLINK_H_ diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index 278762bd94c6..c5e7906045f3 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -2,18 +2,6 @@ /* Copyright (C) 2012-2019 B.A.T.M.A.N. contributors: * * Martin Hundebøll, Jeppe Ledet-Pedersen - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #include "network-coding.h" diff --git a/net/batman-adv/network-coding.h b/net/batman-adv/network-coding.h index 96ef0a511fc7..74f56113a5d0 100644 --- a/net/batman-adv/network-coding.h +++ b/net/batman-adv/network-coding.h @@ -2,18 +2,6 @@ /* Copyright (C) 2012-2019 B.A.T.M.A.N. contributors: * * Martin Hundebøll, Jeppe Ledet-Pedersen - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #ifndef _NET_BATMAN_ADV_NETWORK_CODING_H_ diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index e5cdf89ef63c..45db798a7297 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -2,18 +2,6 @@ /* Copyright (C) 2009-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #include "originator.h" diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h index dca1e4a34ec6..3829e26f9c5d 100644 --- a/net/batman-adv/originator.h +++ b/net/batman-adv/originator.h @@ -2,18 +2,6 @@ /* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #ifndef _NET_BATMAN_ADV_ORIGINATOR_H_ diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index cae0e5dd0768..f0f864820dea 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -2,18 +2,6 @@ /* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #include "routing.h" diff --git a/net/batman-adv/routing.h b/net/batman-adv/routing.h index 0102d69d345c..b96c6d06d188 100644 --- a/net/batman-adv/routing.h +++ b/net/batman-adv/routing.h @@ -2,18 +2,6 @@ /* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #ifndef _NET_BATMAN_ADV_ROUTING_H_ diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 66a8b3e44501..3ce5f7bad369 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -2,18 +2,6 @@ /* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #include "send.h" diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h index 1f6132922e60..5921ee4e107c 100644 --- a/net/batman-adv/send.h +++ b/net/batman-adv/send.h @@ -2,18 +2,6 @@ /* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #ifndef _NET_BATMAN_ADV_SEND_H_ diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 2e367230376b..f8fcdd6de656 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -2,18 +2,6 @@ /* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #include "soft-interface.h" diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h index 538bb661878c..275442a7acb6 100644 --- a/net/batman-adv/soft-interface.h +++ b/net/batman-adv/soft-interface.h @@ -2,18 +2,6 @@ /* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #ifndef _NET_BATMAN_ADV_SOFT_INTERFACE_H_ diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c index 0b4b3fb778a6..4fc9f7305174 100644 --- a/net/batman-adv/sysfs.c +++ b/net/batman-adv/sysfs.c @@ -2,18 +2,6 @@ /* Copyright (C) 2010-2019 B.A.T.M.A.N. contributors: * * Marek Lindner - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #include "sysfs.h" diff --git a/net/batman-adv/sysfs.h b/net/batman-adv/sysfs.h index 705ffbe763f4..2d13f59efb1a 100644 --- a/net/batman-adv/sysfs.h +++ b/net/batman-adv/sysfs.h @@ -2,18 +2,6 @@ /* Copyright (C) 2010-2019 B.A.T.M.A.N. contributors: * * Marek Lindner - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #ifndef _NET_BATMAN_ADV_SYSFS_H_ diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c index 500109bbd551..820392146249 100644 --- a/net/batman-adv/tp_meter.c +++ b/net/batman-adv/tp_meter.c @@ -2,18 +2,6 @@ /* Copyright (C) 2012-2019 B.A.T.M.A.N. contributors: * * Edo Monticelli, Antonio Quartulli - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #include "tp_meter.h" diff --git a/net/batman-adv/tp_meter.h b/net/batman-adv/tp_meter.h index 6b4d0f733896..604b3799c972 100644 --- a/net/batman-adv/tp_meter.h +++ b/net/batman-adv/tp_meter.h @@ -2,18 +2,6 @@ /* Copyright (C) 2012-2019 B.A.T.M.A.N. contributors: * * Edo Monticelli, Antonio Quartulli - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #ifndef _NET_BATMAN_ADV_TP_METER_H_ diff --git a/net/batman-adv/trace.c b/net/batman-adv/trace.c index f77c917ed20d..3cedd2c36528 100644 --- a/net/batman-adv/trace.c +++ b/net/batman-adv/trace.c @@ -2,18 +2,6 @@ /* Copyright (C) 2010-2019 B.A.T.M.A.N. contributors: * * Sven Eckelmann - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #define CREATE_TRACE_POINTS diff --git a/net/batman-adv/trace.h b/net/batman-adv/trace.h index 5e5579051400..d8f764521c0b 100644 --- a/net/batman-adv/trace.h +++ b/net/batman-adv/trace.h @@ -2,18 +2,6 @@ /* Copyright (C) 2010-2019 B.A.T.M.A.N. contributors: * * Sven Eckelmann - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #if !defined(_NET_BATMAN_ADV_TRACE_H_) || defined(TRACE_HEADER_MULTI_READ) diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index f73d79139ae7..842e7634d20f 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -2,18 +2,6 @@ /* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich, Antonio Quartulli - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #include "translation-table.h" diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h index 61bca75e5911..a328979836b2 100644 --- a/net/batman-adv/translation-table.h +++ b/net/batman-adv/translation-table.h @@ -2,18 +2,6 @@ /* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich, Antonio Quartulli - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #ifndef _NET_BATMAN_ADV_TRANSLATION_TABLE_H_ diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c index 7e947b01919d..aae63f0d21eb 100644 --- a/net/batman-adv/tvlv.c +++ b/net/batman-adv/tvlv.c @@ -2,18 +2,6 @@ /* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #include "main.h" diff --git a/net/batman-adv/tvlv.h b/net/batman-adv/tvlv.h index c0f033b1acb8..114ac01e06af 100644 --- a/net/batman-adv/tvlv.h +++ b/net/batman-adv/tvlv.h @@ -2,18 +2,6 @@ /* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #ifndef _NET_BATMAN_ADV_TVLV_H_ diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index a21b34ed6548..e19fdb5c1281 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -2,18 +2,6 @@ /* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #ifndef _NET_BATMAN_ADV_TYPES_H_ -- cgit v1.2.3 From 32e727449c792b689c2a06a8b4cc9fef6270c5a7 Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Sat, 23 Mar 2019 05:47:41 +0100 Subject: batman-adv: Add multicast-to-unicast support for multiple targets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With this patch multicast packets with a limited number of destinations (current default: 16) will be split and transmitted by the originator as individual unicast transmissions. Wifi broadcasts with their low bitrate are still a costly undertaking. In a mesh network this cost multiplies with the overall size of the mesh network. Therefore using multiple unicast transmissions instead of broadcast flooding is almost always less burdensome for the mesh network. The maximum amount of unicast packets can be configured via the newly introduced multicast_fanout parameter. If this limit is exceeded distribution will fall back to classic broadcast flooding. The multicast-to-unicast conversion is performed on the initial multicast sender node and counts on a final destination node, mesh-wide basis (and not next hop, neighbor node basis). Signed-off-by: Linus Lüssing Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- include/uapi/linux/batman_adv.h | 7 ++ net/batman-adv/multicast.c | 199 ++++++++++++++++++++++++++++++++++++- net/batman-adv/multicast.h | 18 ++++ net/batman-adv/netlink.c | 11 ++ net/batman-adv/soft-interface.c | 8 +- net/batman-adv/translation-table.c | 5 +- net/batman-adv/translation-table.h | 4 + net/batman-adv/types.h | 6 ++ 8 files changed, 252 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h index e53f2b5e7ee7..67f4636758af 100644 --- a/include/uapi/linux/batman_adv.h +++ b/include/uapi/linux/batman_adv.h @@ -473,6 +473,13 @@ enum batadv_nl_attrs { */ BATADV_ATTR_THROUGHPUT_OVERRIDE, + /** + * @BATADV_ATTR_MULTICAST_FANOUT: defines the maximum number of packet + * copies that may be generated for a multicast-to-unicast conversion. + * Once this limit is exceeded distribution will fall back to broadcast. + */ + BATADV_ATTR_MULTICAST_FANOUT, + /* add attributes above here, update the policy in netlink.c */ /** diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index 4d6e89e04aa0..3feb9435b715 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -54,6 +54,7 @@ #include "hash.h" #include "log.h" #include "netlink.h" +#include "send.h" #include "soft-interface.h" #include "translation-table.h" #include "tvlv.h" @@ -979,6 +980,7 @@ batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, { int ret, tt_count, ip_count, unsnoop_count, total_count; bool is_unsnoopable = false; + unsigned int mcast_fanout; struct ethhdr *ethhdr; ret = batadv_mcast_forw_mode_check(bat_priv, skb, &is_unsnoopable); @@ -1013,8 +1015,203 @@ batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, case 0: return BATADV_FORW_NONE; default: - return BATADV_FORW_ALL; + mcast_fanout = atomic_read(&bat_priv->multicast_fanout); + + if (!unsnoop_count && total_count <= mcast_fanout) + return BATADV_FORW_SOME; + } + + return BATADV_FORW_ALL; +} + +/** + * batadv_mcast_forw_tt() - forwards a packet to multicast listeners + * @bat_priv: the bat priv with all the soft interface information + * @skb: the multicast packet to transmit + * @vid: the vlan identifier + * + * Sends copies of a frame with multicast destination to any multicast + * listener registered in the translation table. A transmission is performed + * via a batman-adv unicast packet for each such destination node. + * + * Return: NET_XMIT_DROP on memory allocation failure, NET_XMIT_SUCCESS + * otherwise. + */ +static int +batadv_mcast_forw_tt(struct batadv_priv *bat_priv, struct sk_buff *skb, + unsigned short vid) +{ + int ret = NET_XMIT_SUCCESS; + struct sk_buff *newskb; + + struct batadv_tt_orig_list_entry *orig_entry; + + struct batadv_tt_global_entry *tt_global; + const u8 *addr = eth_hdr(skb)->h_dest; + + tt_global = batadv_tt_global_hash_find(bat_priv, addr, vid); + if (!tt_global) + goto out; + + rcu_read_lock(); + hlist_for_each_entry_rcu(orig_entry, &tt_global->orig_list, list) { + newskb = skb_copy(skb, GFP_ATOMIC); + if (!newskb) { + ret = NET_XMIT_DROP; + break; + } + + batadv_send_skb_unicast(bat_priv, newskb, BATADV_UNICAST, 0, + orig_entry->orig_node, vid); + } + rcu_read_unlock(); + + batadv_tt_global_entry_put(tt_global); + +out: + return ret; +} + +/** + * batadv_mcast_forw_want_all_ipv4() - forward to nodes with want-all-ipv4 + * @bat_priv: the bat priv with all the soft interface information + * @skb: the multicast packet to transmit + * @vid: the vlan identifier + * + * Sends copies of a frame with multicast destination to any node with a + * BATADV_MCAST_WANT_ALL_IPV4 flag set. A transmission is performed via a + * batman-adv unicast packet for each such destination node. + * + * Return: NET_XMIT_DROP on memory allocation failure, NET_XMIT_SUCCESS + * otherwise. + */ +static int +batadv_mcast_forw_want_all_ipv4(struct batadv_priv *bat_priv, + struct sk_buff *skb, unsigned short vid) +{ + struct batadv_orig_node *orig_node; + int ret = NET_XMIT_SUCCESS; + struct sk_buff *newskb; + + rcu_read_lock(); + hlist_for_each_entry_rcu(orig_node, + &bat_priv->mcast.want_all_ipv4_list, + mcast_want_all_ipv4_node) { + newskb = skb_copy(skb, GFP_ATOMIC); + if (!newskb) { + ret = NET_XMIT_DROP; + break; + } + + batadv_send_skb_unicast(bat_priv, newskb, BATADV_UNICAST, 0, + orig_node, vid); + } + rcu_read_unlock(); + return ret; +} + +/** + * batadv_mcast_forw_want_all_ipv6() - forward to nodes with want-all-ipv6 + * @bat_priv: the bat priv with all the soft interface information + * @skb: The multicast packet to transmit + * @vid: the vlan identifier + * + * Sends copies of a frame with multicast destination to any node with a + * BATADV_MCAST_WANT_ALL_IPV6 flag set. A transmission is performed via a + * batman-adv unicast packet for each such destination node. + * + * Return: NET_XMIT_DROP on memory allocation failure, NET_XMIT_SUCCESS + * otherwise. + */ +static int +batadv_mcast_forw_want_all_ipv6(struct batadv_priv *bat_priv, + struct sk_buff *skb, unsigned short vid) +{ + struct batadv_orig_node *orig_node; + int ret = NET_XMIT_SUCCESS; + struct sk_buff *newskb; + + rcu_read_lock(); + hlist_for_each_entry_rcu(orig_node, + &bat_priv->mcast.want_all_ipv6_list, + mcast_want_all_ipv6_node) { + newskb = skb_copy(skb, GFP_ATOMIC); + if (!newskb) { + ret = NET_XMIT_DROP; + break; + } + + batadv_send_skb_unicast(bat_priv, newskb, BATADV_UNICAST, 0, + orig_node, vid); } + rcu_read_unlock(); + return ret; +} + +/** + * batadv_mcast_forw_want_all() - forward packet to nodes in a want-all list + * @bat_priv: the bat priv with all the soft interface information + * @skb: the multicast packet to transmit + * @vid: the vlan identifier + * + * Sends copies of a frame with multicast destination to any node with a + * BATADV_MCAST_WANT_ALL_IPV4 or BATADV_MCAST_WANT_ALL_IPV6 flag set. A + * transmission is performed via a batman-adv unicast packet for each such + * destination node. + * + * Return: NET_XMIT_DROP on memory allocation failure or if the protocol family + * is neither IPv4 nor IPv6. NET_XMIT_SUCCESS otherwise. + */ +static int +batadv_mcast_forw_want_all(struct batadv_priv *bat_priv, + struct sk_buff *skb, unsigned short vid) +{ + switch (ntohs(eth_hdr(skb)->h_proto)) { + case ETH_P_IP: + return batadv_mcast_forw_want_all_ipv4(bat_priv, skb, vid); + case ETH_P_IPV6: + return batadv_mcast_forw_want_all_ipv6(bat_priv, skb, vid); + default: + /* we shouldn't be here... */ + return NET_XMIT_DROP; + } +} + +/** + * batadv_mcast_forw_send() - send packet to any detected multicast recpient + * @bat_priv: the bat priv with all the soft interface information + * @skb: the multicast packet to transmit + * @vid: the vlan identifier + * + * Sends copies of a frame with multicast destination to any node that signaled + * interest in it, that is either via the translation table or the according + * want-all flags. A transmission is performed via a batman-adv unicast packet + * for each such destination node. + * + * The given skb is consumed/freed. + * + * Return: NET_XMIT_DROP on memory allocation failure or if the protocol family + * is neither IPv4 nor IPv6. NET_XMIT_SUCCESS otherwise. + */ +int batadv_mcast_forw_send(struct batadv_priv *bat_priv, struct sk_buff *skb, + unsigned short vid) +{ + int ret; + + ret = batadv_mcast_forw_tt(bat_priv, skb, vid); + if (ret != NET_XMIT_SUCCESS) { + kfree_skb(skb); + return ret; + } + + ret = batadv_mcast_forw_want_all(bat_priv, skb, vid); + if (ret != NET_XMIT_SUCCESS) { + kfree_skb(skb); + return ret; + } + + consume_skb(skb); + return ret; } /** diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h index 34fb922a4566..653b9b76fabe 100644 --- a/net/batman-adv/multicast.h +++ b/net/batman-adv/multicast.h @@ -23,6 +23,13 @@ enum batadv_forw_mode { */ BATADV_FORW_ALL, + /** + * @BATADV_FORW_SOME: forward the packet to some nodes (currently via + * a multicast-to-unicast conversion and the BATMAN unicast routing + * protocol) + */ + BATADV_FORW_SOME, + /** * @BATADV_FORW_SINGLE: forward the packet to a single node (currently * via the BATMAN unicast routing protocol) @@ -39,6 +46,9 @@ enum batadv_forw_mode batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, struct batadv_orig_node **mcast_single_orig); +int batadv_mcast_forw_send(struct batadv_priv *bat_priv, struct sk_buff *skb, + unsigned short vid); + void batadv_mcast_init(struct batadv_priv *bat_priv); int batadv_mcast_flags_seq_print_text(struct seq_file *seq, void *offset); @@ -61,6 +71,14 @@ batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, return BATADV_FORW_ALL; } +static inline int +batadv_mcast_forw_send(struct batadv_priv *bat_priv, struct sk_buff *skb, + unsigned short vid) +{ + kfree_skb(skb); + return NET_XMIT_DROP; +} + static inline int batadv_mcast_init(struct batadv_priv *bat_priv) { return 0; diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index 8e82e656b870..daf56933223d 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -145,6 +145,7 @@ static const struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = { [BATADV_ATTR_HOP_PENALTY] = { .type = NLA_U8 }, [BATADV_ATTR_LOG_LEVEL] = { .type = NLA_U32 }, [BATADV_ATTR_MULTICAST_FORCEFLOOD_ENABLED] = { .type = NLA_U8 }, + [BATADV_ATTR_MULTICAST_FANOUT] = { .type = NLA_U32 }, [BATADV_ATTR_NETWORK_CODING_ENABLED] = { .type = NLA_U8 }, [BATADV_ATTR_ORIG_INTERVAL] = { .type = NLA_U32 }, [BATADV_ATTR_ELP_INTERVAL] = { .type = NLA_U32 }, @@ -341,6 +342,10 @@ static int batadv_netlink_mesh_fill(struct sk_buff *msg, if (nla_put_u8(msg, BATADV_ATTR_MULTICAST_FORCEFLOOD_ENABLED, !atomic_read(&bat_priv->multicast_mode))) goto nla_put_failure; + + if (nla_put_u32(msg, BATADV_ATTR_MULTICAST_FANOUT, + atomic_read(&bat_priv->multicast_fanout))) + goto nla_put_failure; #endif /* CONFIG_BATMAN_ADV_MCAST */ #ifdef CONFIG_BATMAN_ADV_NC @@ -580,6 +585,12 @@ static int batadv_netlink_set_mesh(struct sk_buff *skb, struct genl_info *info) atomic_set(&bat_priv->multicast_mode, !nla_get_u8(attr)); } + + if (info->attrs[BATADV_ATTR_MULTICAST_FANOUT]) { + attr = info->attrs[BATADV_ATTR_MULTICAST_FANOUT]; + + atomic_set(&bat_priv->multicast_fanout, nla_get_u32(attr)); + } #endif /* CONFIG_BATMAN_ADV_MCAST */ #ifdef CONFIG_BATMAN_ADV_NC diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index f8fcdd6de656..a7677e1d000f 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -197,7 +197,7 @@ static netdev_tx_t batadv_interface_tx(struct sk_buff *skb, unsigned short vid; u32 seqno; int gw_mode; - enum batadv_forw_mode forw_mode; + enum batadv_forw_mode forw_mode = BATADV_FORW_SINGLE; struct batadv_orig_node *mcast_single_orig = NULL; int network_offset = ETH_HLEN; __be16 proto; @@ -305,7 +305,8 @@ send: if (forw_mode == BATADV_FORW_NONE) goto dropped; - if (forw_mode == BATADV_FORW_SINGLE) + if (forw_mode == BATADV_FORW_SINGLE || + forw_mode == BATADV_FORW_SOME) do_bcast = false; } } @@ -365,6 +366,8 @@ send: ret = batadv_send_skb_unicast(bat_priv, skb, BATADV_UNICAST, 0, mcast_single_orig, vid); + } else if (forw_mode == BATADV_FORW_SOME) { + ret = batadv_mcast_forw_send(bat_priv, skb, vid); } else { if (batadv_dat_snoop_outgoing_arp_request(bat_priv, skb)) @@ -806,6 +809,7 @@ static int batadv_softif_init_late(struct net_device *dev) bat_priv->mcast.querier_ipv6.shadowing = false; bat_priv->mcast.flags = BATADV_NO_FLAGS; atomic_set(&bat_priv->multicast_mode, 1); + atomic_set(&bat_priv->multicast_fanout, 16); atomic_set(&bat_priv->mcast.num_want_all_unsnoopables, 0); atomic_set(&bat_priv->mcast.num_want_all_ipv4, 0); atomic_set(&bat_priv->mcast.num_want_all_ipv6, 0); diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 842e7634d20f..5d8bf8048e4e 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -193,7 +193,7 @@ batadv_tt_local_hash_find(struct batadv_priv *bat_priv, const u8 *addr, * Return: a pointer to the corresponding tt_global_entry struct if the client * is found, NULL otherwise. */ -static struct batadv_tt_global_entry * +struct batadv_tt_global_entry * batadv_tt_global_hash_find(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid) { @@ -288,8 +288,7 @@ static void batadv_tt_global_entry_release(struct kref *ref) * possibly release it * @tt_global_entry: tt_global_entry to be free'd */ -static void -batadv_tt_global_entry_put(struct batadv_tt_global_entry *tt_global_entry) +void batadv_tt_global_entry_put(struct batadv_tt_global_entry *tt_global_entry) { kref_put(&tt_global_entry->common.refcount, batadv_tt_global_entry_release); diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h index a328979836b2..c8c48d62a430 100644 --- a/net/batman-adv/translation-table.h +++ b/net/batman-adv/translation-table.h @@ -29,6 +29,10 @@ int batadv_tt_global_dump(struct sk_buff *msg, struct netlink_callback *cb); void batadv_tt_global_del_orig(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, s32 match_vid, const char *message); +struct batadv_tt_global_entry * +batadv_tt_global_hash_find(struct batadv_priv *bat_priv, const u8 *addr, + unsigned short vid); +void batadv_tt_global_entry_put(struct batadv_tt_global_entry *tt_global_entry); int batadv_tt_global_hash_count(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid); struct batadv_orig_node *batadv_transtable_search(struct batadv_priv *bat_priv, diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index e19fdb5c1281..357ca119329a 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -1553,6 +1553,12 @@ struct batadv_priv { * node's sender/originating side */ atomic_t multicast_mode; + + /** + * @multicast_fanout: Maximum number of packet copies to generate for a + * multicast-to-unicast conversion + */ + atomic_t multicast_fanout; #endif /** @orig_interval: OGM broadcast interval in milliseconds */ -- cgit v1.2.3 From 1713cb37bf671e5d98919536941a8b56337874fd Mon Sep 17 00:00:00 2001 From: Kristian Evensen Date: Wed, 27 Mar 2019 11:16:03 +0100 Subject: fou: Support binding FoU socket MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit An FoU socket is currently bound to the wildcard-address. While this works fine, there are several use-cases where the use of the wildcard-address is not desirable. For example, I use FoU on some multi-homed servers and would like to use FoU on only one of the interfaces. This commit adds support for binding FoU sockets to a given source address/interface, as well as connecting the socket to a given destination address/port. udp_tunnel already provides the required infrastructure, so most of the code added is for exposing and setting the different attributes (local address, peer address, etc.). The lookups performed when we add, delete or get an FoU-socket has also been updated to compare all the attributes a user can set. Since the comparison now involves several elements, I have added a separate comparison-function instead of open-coding. In order to test the code and ensure that the new comparison code works correctly, I started by creating a wildcard socket bound to port 1234 on my machine. I then tried to create a non-wildcarded socket bound to the same port, as well as fetching and deleting the socket (including source address, peer address or interface index in the netlink request). Both the create, fetch and delete request failed. Deleting/fetching the socket was only successful when my netlink request attributes matched those used to create the socket. I then repeated the tests, but with a socket bound to a local ip address, a socket bound to a local address + interface, and a bound socket that was also «connected» to a peer. Add only worked when no socket with the matching source address/interface (or wildcard) existed, while fetch/delete was only successful when all attributes matched. In addition to testing that the new code work, I also checked that the current behavior is kept. If none of the new attributes are provided, then an FoU-socket is configured as before (i.e., wildcarded). If any of the new attributes are provided, the FoU-socket is configured as expected. v1->v2: * Fixed building with IPv6 disabled (kbuild). * Fixed a return type warning and make the ugly comparison function more readable (kbuild). * Describe more in detail what has been tested (thanks David Miller). * Make peer port required if peer address is specified. Signed-off-by: Kristian Evensen Signed-off-by: David S. Miller --- include/uapi/linux/fou.h | 6 +++ net/ipv4/fou.c | 138 +++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 128 insertions(+), 16 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/fou.h b/include/uapi/linux/fou.h index f2ea833a2812..87c2c9f08803 100644 --- a/include/uapi/linux/fou.h +++ b/include/uapi/linux/fou.h @@ -16,6 +16,12 @@ enum { FOU_ATTR_IPPROTO, /* u8 */ FOU_ATTR_TYPE, /* u8 */ FOU_ATTR_REMCSUM_NOPARTIAL, /* flag */ + FOU_ATTR_LOCAL_V4, /* u32 */ + FOU_ATTR_LOCAL_V6, /* in6_addr */ + FOU_ATTR_PEER_V4, /* u32 */ + FOU_ATTR_PEER_V6, /* in6_addr */ + FOU_ATTR_PEER_PORT, /* u16 */ + FOU_ATTR_IFINDEX, /* s32 */ __FOU_ATTR_MAX, }; diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index a23fbb52d265..100e63f57ea6 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -499,15 +499,45 @@ out_unlock: return err; } -static int fou_add_to_port_list(struct net *net, struct fou *fou) +static bool fou_cfg_cmp(struct fou *fou, struct fou_cfg *cfg) +{ + struct sock *sk = fou->sock->sk; + struct udp_port_cfg *udp_cfg = &cfg->udp_config; + + if (fou->family != udp_cfg->family || + fou->port != udp_cfg->local_udp_port || + sk->sk_dport != udp_cfg->peer_udp_port || + sk->sk_bound_dev_if != udp_cfg->bind_ifindex) + return false; + + if (fou->family == AF_INET) { + if (sk->sk_rcv_saddr != udp_cfg->local_ip.s_addr || + sk->sk_daddr != udp_cfg->peer_ip.s_addr) + return false; + else + return true; +#if IS_ENABLED(CONFIG_IPV6) + } else { + if (ipv6_addr_cmp(&sk->sk_v6_rcv_saddr, &udp_cfg->local_ip6) || + ipv6_addr_cmp(&sk->sk_v6_daddr, &udp_cfg->peer_ip6)) + return false; + else + return true; +#endif + } + + return false; +} + +static int fou_add_to_port_list(struct net *net, struct fou *fou, + struct fou_cfg *cfg) { struct fou_net *fn = net_generic(net, fou_net_id); struct fou *fout; mutex_lock(&fn->fou_lock); list_for_each_entry(fout, &fn->fou_list, list) { - if (fou->port == fout->port && - fou->family == fout->family) { + if (fou_cfg_cmp(fout, cfg)) { mutex_unlock(&fn->fou_lock); return -EALREADY; } @@ -585,7 +615,7 @@ static int fou_create(struct net *net, struct fou_cfg *cfg, sk->sk_allocation = GFP_ATOMIC; - err = fou_add_to_port_list(net, fou); + err = fou_add_to_port_list(net, fou, cfg); if (err) goto error; @@ -605,14 +635,12 @@ error: static int fou_destroy(struct net *net, struct fou_cfg *cfg) { struct fou_net *fn = net_generic(net, fou_net_id); - __be16 port = cfg->udp_config.local_udp_port; - u8 family = cfg->udp_config.family; int err = -EINVAL; struct fou *fou; mutex_lock(&fn->fou_lock); list_for_each_entry(fou, &fn->fou_list, list) { - if (fou->port == port && fou->family == family) { + if (fou_cfg_cmp(fou, cfg)) { fou_release(fou); err = 0; break; @@ -626,16 +654,27 @@ static int fou_destroy(struct net *net, struct fou_cfg *cfg) static struct genl_family fou_nl_family; static const struct nla_policy fou_nl_policy[FOU_ATTR_MAX + 1] = { - [FOU_ATTR_PORT] = { .type = NLA_U16, }, - [FOU_ATTR_AF] = { .type = NLA_U8, }, - [FOU_ATTR_IPPROTO] = { .type = NLA_U8, }, - [FOU_ATTR_TYPE] = { .type = NLA_U8, }, - [FOU_ATTR_REMCSUM_NOPARTIAL] = { .type = NLA_FLAG, }, + [FOU_ATTR_PORT] = { .type = NLA_U16, }, + [FOU_ATTR_AF] = { .type = NLA_U8, }, + [FOU_ATTR_IPPROTO] = { .type = NLA_U8, }, + [FOU_ATTR_TYPE] = { .type = NLA_U8, }, + [FOU_ATTR_REMCSUM_NOPARTIAL] = { .type = NLA_FLAG, }, + [FOU_ATTR_LOCAL_V4] = { .type = NLA_U32, }, + [FOU_ATTR_PEER_V4] = { .type = NLA_U32, }, + [FOU_ATTR_LOCAL_V6] = { .type = sizeof(struct in6_addr), }, + [FOU_ATTR_PEER_V6] = { .type = sizeof(struct in6_addr), }, + [FOU_ATTR_PEER_PORT] = { .type = NLA_U16, }, + [FOU_ATTR_IFINDEX] = { .type = NLA_S32, }, }; static int parse_nl_config(struct genl_info *info, struct fou_cfg *cfg) { + bool has_local = false, has_peer = false; + struct nlattr *attr; + int ifindex; + __be16 port; + memset(cfg, 0, sizeof(*cfg)); cfg->udp_config.family = AF_INET; @@ -657,8 +696,7 @@ static int parse_nl_config(struct genl_info *info, } if (info->attrs[FOU_ATTR_PORT]) { - __be16 port = nla_get_be16(info->attrs[FOU_ATTR_PORT]); - + port = nla_get_be16(info->attrs[FOU_ATTR_PORT]); cfg->udp_config.local_udp_port = port; } @@ -671,6 +709,52 @@ static int parse_nl_config(struct genl_info *info, if (info->attrs[FOU_ATTR_REMCSUM_NOPARTIAL]) cfg->flags |= FOU_F_REMCSUM_NOPARTIAL; + if (cfg->udp_config.family == AF_INET) { + if (info->attrs[FOU_ATTR_LOCAL_V4]) { + attr = info->attrs[FOU_ATTR_LOCAL_V4]; + cfg->udp_config.local_ip.s_addr = nla_get_in_addr(attr); + has_local = true; + } + + if (info->attrs[FOU_ATTR_PEER_V4]) { + attr = info->attrs[FOU_ATTR_PEER_V4]; + cfg->udp_config.peer_ip.s_addr = nla_get_in_addr(attr); + has_peer = true; + } +#if IS_ENABLED(CONFIG_IPV6) + } else { + if (info->attrs[FOU_ATTR_LOCAL_V6]) { + attr = info->attrs[FOU_ATTR_LOCAL_V6]; + cfg->udp_config.local_ip6 = nla_get_in6_addr(attr); + has_local = true; + } + + if (info->attrs[FOU_ATTR_PEER_V6]) { + attr = info->attrs[FOU_ATTR_PEER_V6]; + cfg->udp_config.peer_ip6 = nla_get_in6_addr(attr); + has_peer = true; + } +#endif + } + + if (has_peer) { + if (info->attrs[FOU_ATTR_PEER_PORT]) { + port = nla_get_be16(info->attrs[FOU_ATTR_PEER_PORT]); + cfg->udp_config.peer_udp_port = port; + } else { + return -EINVAL; + } + } + + if (info->attrs[FOU_ATTR_IFINDEX]) { + if (!has_local) + return -EINVAL; + + ifindex = nla_get_s32(info->attrs[FOU_ATTR_IFINDEX]); + + cfg->udp_config.bind_ifindex = ifindex; + } + return 0; } @@ -702,15 +786,37 @@ static int fou_nl_cmd_rm_port(struct sk_buff *skb, struct genl_info *info) static int fou_fill_info(struct fou *fou, struct sk_buff *msg) { + struct sock *sk = fou->sock->sk; + if (nla_put_u8(msg, FOU_ATTR_AF, fou->sock->sk->sk_family) || nla_put_be16(msg, FOU_ATTR_PORT, fou->port) || + nla_put_be16(msg, FOU_ATTR_PEER_PORT, sk->sk_dport) || nla_put_u8(msg, FOU_ATTR_IPPROTO, fou->protocol) || - nla_put_u8(msg, FOU_ATTR_TYPE, fou->type)) + nla_put_u8(msg, FOU_ATTR_TYPE, fou->type) || + nla_put_s32(msg, FOU_ATTR_IFINDEX, sk->sk_bound_dev_if)) return -1; if (fou->flags & FOU_F_REMCSUM_NOPARTIAL) if (nla_put_flag(msg, FOU_ATTR_REMCSUM_NOPARTIAL)) return -1; + + if (fou->sock->sk->sk_family == AF_INET) { + if (nla_put_in_addr(msg, FOU_ATTR_LOCAL_V4, sk->sk_rcv_saddr)) + return -1; + + if (nla_put_in_addr(msg, FOU_ATTR_PEER_V4, sk->sk_daddr)) + return -1; +#if IS_ENABLED(CONFIG_IPV6) + } else { + if (nla_put_in6_addr(msg, FOU_ATTR_LOCAL_V6, + &sk->sk_v6_rcv_saddr)) + return -1; + + if (nla_put_in6_addr(msg, FOU_ATTR_PEER_V6, &sk->sk_v6_daddr)) + return -1; +#endif + } + return 0; } @@ -763,7 +869,7 @@ static int fou_nl_cmd_get_port(struct sk_buff *skb, struct genl_info *info) ret = -ESRCH; mutex_lock(&fn->fou_lock); list_for_each_entry(fout, &fn->fou_list, list) { - if (port == fout->port && family == fout->family) { + if (fou_cfg_cmp(fout, &cfg)) { ret = fou_dump_info(fout, info->snd_portid, info->snd_seq, 0, msg, info->genlhdr->cmd); -- cgit v1.2.3 From 3aeb0803f7ea11ff2fc478f7d58f2b8e713af380 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 25 Mar 2019 19:34:58 +0100 Subject: ethtool: add PHY Fast Link Down support This adds support for Fast Link Down as new PHY tunable. Fast Link Down reduces the time until a link down event is reported for 1000BaseT. According to the standard it's 750ms what is too long for several use cases. v2: - add comment describing the constants Signed-off-by: Heiner Kallweit Reviewed-by: Florian Fainelli Reviewed-by: Michal Kubecek Signed-off-by: David S. Miller --- include/uapi/linux/ethtool.h | 8 ++++++++ net/core/ethtool.c | 2 ++ 2 files changed, 10 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 3652b239dad1..50c76f4fa402 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -252,9 +252,17 @@ struct ethtool_tunable { #define DOWNSHIFT_DEV_DEFAULT_COUNT 0xff #define DOWNSHIFT_DEV_DISABLE 0 +/* Time in msecs after which link is reported as down + * 0 = lowest time supported by the PHY + * 0xff = off, link down detection according to standard + */ +#define ETHTOOL_PHY_FAST_LINK_DOWN_ON 0 +#define ETHTOOL_PHY_FAST_LINK_DOWN_OFF 0xff + enum phy_tunable_id { ETHTOOL_PHY_ID_UNSPEC, ETHTOOL_PHY_DOWNSHIFT, + ETHTOOL_PHY_FAST_LINK_DOWN, /* * Add your fresh new phy tunable attribute above and remember to update * phy_tunable_strings[] in net/core/ethtool.c diff --git a/net/core/ethtool.c b/net/core/ethtool.c index b1eb32419732..387d67eb75ab 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -136,6 +136,7 @@ static const char phy_tunable_strings[__ETHTOOL_PHY_TUNABLE_COUNT][ETH_GSTRING_LEN] = { [ETHTOOL_ID_UNSPEC] = "Unspec", [ETHTOOL_PHY_DOWNSHIFT] = "phy-downshift", + [ETHTOOL_PHY_FAST_LINK_DOWN] = "phy-fast-link-down", }; static int ethtool_get_features(struct net_device *dev, void __user *useraddr) @@ -2432,6 +2433,7 @@ static int ethtool_phy_tunable_valid(const struct ethtool_tunable *tuna) { switch (tuna->id) { case ETHTOOL_PHY_DOWNSHIFT: + case ETHTOOL_PHY_FAST_LINK_DOWN: if (tuna->len != sizeof(u8) || tuna->type_id != ETHTOOL_TUNABLE_U8) return -EINVAL; -- cgit v1.2.3 From 4d5ec89fc8d14dcdab7214a0c13a1c7321dc6ea9 Mon Sep 17 00:00:00 2001 From: Numan Siddique Date: Tue, 26 Mar 2019 06:13:46 +0530 Subject: net: openvswitch: Add a new action check_pkt_len This patch adds a new action - 'check_pkt_len' which checks the packet length and executes a set of actions if the packet length is greater than the specified length or executes another set of actions if the packet length is lesser or equal to. This action takes below nlattrs * OVS_CHECK_PKT_LEN_ATTR_PKT_LEN - 'pkt_len' to check for * OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER - Nested actions to apply if the packet length is greater than the specified 'pkt_len' * OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL - Nested actions to apply if the packet length is lesser or equal to the specified 'pkt_len'. The main use case for adding this action is to solve the packet drops because of MTU mismatch in OVN virtual networking solution. When a VM (which belongs to a logical switch of OVN) sends a packet destined to go via the gateway router and if the nic which provides external connectivity, has a lesser MTU, OVS drops the packet if the packet length is greater than this MTU. With the help of this action, OVN will check the packet length and if it is greater than the MTU size, it will generate an ICMP packet (type 3, code 4) and includes the next hop mtu in it so that the sender can fragment the packets. Reported-at: https://mail.openvswitch.org/pipermail/ovs-discuss/2018-July/047039.html Suggested-by: Ben Pfaff Signed-off-by: Numan Siddique CC: Gregory Rose CC: Pravin B Shelar Acked-by: Pravin B Shelar Tested-by: Greg Rose Reviewed-by: Greg Rose Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 42 ++++++++++ net/openvswitch/actions.c | 48 +++++++++++ net/openvswitch/flow_netlink.c | 171 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 261 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index dbe0cbe4f1b7..dfabacee6903 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -798,6 +798,44 @@ struct ovs_action_push_eth { struct ovs_key_ethernet addresses; }; +/* + * enum ovs_check_pkt_len_attr - Attributes for %OVS_ACTION_ATTR_CHECK_PKT_LEN. + * + * @OVS_CHECK_PKT_LEN_ATTR_PKT_LEN: u16 Packet length to check for. + * @OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER: Nested OVS_ACTION_ATTR_* + * actions to apply if the packer length is greater than the specified + * length in the attr - OVS_CHECK_PKT_LEN_ATTR_PKT_LEN. + * @OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL - Nested OVS_ACTION_ATTR_* + * actions to apply if the packer length is lesser or equal to the specified + * length in the attr - OVS_CHECK_PKT_LEN_ATTR_PKT_LEN. + */ +enum ovs_check_pkt_len_attr { + OVS_CHECK_PKT_LEN_ATTR_UNSPEC, + OVS_CHECK_PKT_LEN_ATTR_PKT_LEN, + OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER, + OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL, + __OVS_CHECK_PKT_LEN_ATTR_MAX, + +#ifdef __KERNEL__ + OVS_CHECK_PKT_LEN_ATTR_ARG /* struct check_pkt_len_arg */ +#endif +}; + +#define OVS_CHECK_PKT_LEN_ATTR_MAX (__OVS_CHECK_PKT_LEN_ATTR_MAX - 1) + +#ifdef __KERNEL__ +struct check_pkt_len_arg { + u16 pkt_len; /* Same value as OVS_CHECK_PKT_LEN_ATTR_PKT_LEN'. */ + bool exec_for_greater; /* When true, actions in IF_GREATER will + * not change flow keys. False otherwise. + */ + bool exec_for_lesser_equal; /* When true, actions in IF_LESS_EQUAL + * will not change flow keys. False + * otherwise. + */ +}; +#endif + /** * enum ovs_action_attr - Action types. * @@ -842,6 +880,9 @@ struct ovs_action_push_eth { * packet, or modify the packet (e.g., change the DSCP field). * @OVS_ACTION_ATTR_CLONE: make a copy of the packet and execute a list of * actions without affecting the original packet and key. + * @OVS_ACTION_ATTR_CHECK_PKT_LEN: Check the packet length and execute a set + * of actions if greater than the specified packet length, else execute + * another set of actions. * * Only a single header can be set with a single %OVS_ACTION_ATTR_SET. Not all * fields within a header are modifiable, e.g. the IPv4 protocol and fragment @@ -876,6 +917,7 @@ enum ovs_action_attr { OVS_ACTION_ATTR_POP_NSH, /* No argument. */ OVS_ACTION_ATTR_METER, /* u32 meter ID. */ OVS_ACTION_ATTR_CLONE, /* Nested OVS_CLONE_ATTR_*. */ + OVS_ACTION_ATTR_CHECK_PKT_LEN, /* Nested OVS_CHECK_PKT_LEN_ATTR_*. */ __OVS_ACTION_ATTR_MAX, /* Nothing past this will be accepted * from userspace. */ diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index e47ebbbe71b8..2c151bb322c1 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -169,6 +169,10 @@ static int clone_execute(struct datapath *dp, struct sk_buff *skb, const struct nlattr *actions, int len, bool last, bool clone_flow_key); +static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, + struct sw_flow_key *key, + const struct nlattr *attr, int len); + static void update_ethertype(struct sk_buff *skb, struct ethhdr *hdr, __be16 ethertype) { @@ -1213,6 +1217,40 @@ static int execute_recirc(struct datapath *dp, struct sk_buff *skb, return clone_execute(dp, skb, key, recirc_id, NULL, 0, last, true); } +static int execute_check_pkt_len(struct datapath *dp, struct sk_buff *skb, + struct sw_flow_key *key, + const struct nlattr *attr, bool last) +{ + const struct nlattr *actions, *cpl_arg; + const struct check_pkt_len_arg *arg; + int rem = nla_len(attr); + bool clone_flow_key; + + /* The first netlink attribute in 'attr' is always + * 'OVS_CHECK_PKT_LEN_ATTR_ARG'. + */ + cpl_arg = nla_data(attr); + arg = nla_data(cpl_arg); + + if (skb->len <= arg->pkt_len) { + /* Second netlink attribute in 'attr' is always + * 'OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL'. + */ + actions = nla_next(cpl_arg, &rem); + clone_flow_key = !arg->exec_for_lesser_equal; + } else { + /* Third netlink attribute in 'attr' is always + * 'OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER'. + */ + actions = nla_next(cpl_arg, &rem); + actions = nla_next(actions, &rem); + clone_flow_key = !arg->exec_for_greater; + } + + return clone_execute(dp, skb, key, 0, nla_data(actions), + nla_len(actions), last, clone_flow_key); +} + /* Execute a list of actions against 'skb'. */ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *key, @@ -1374,6 +1412,16 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, break; } + + case OVS_ACTION_ATTR_CHECK_PKT_LEN: { + bool last = nla_is_last(a, rem); + + err = execute_check_pkt_len(dp, skb, key, a, last); + if (last) + return err; + + break; + } } if (unlikely(err)) { diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 691da853bef5..b7543700db87 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -91,6 +91,7 @@ static bool actions_may_change_flow(const struct nlattr *actions) case OVS_ACTION_ATTR_SET: case OVS_ACTION_ATTR_SET_MASKED: case OVS_ACTION_ATTR_METER: + case OVS_ACTION_ATTR_CHECK_PKT_LEN: default: return true; } @@ -2838,6 +2839,87 @@ static int validate_userspace(const struct nlattr *attr) return 0; } +static const struct nla_policy cpl_policy[OVS_CHECK_PKT_LEN_ATTR_MAX + 1] = { + [OVS_CHECK_PKT_LEN_ATTR_PKT_LEN] = {.type = NLA_U16 }, + [OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER] = {.type = NLA_NESTED }, + [OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL] = {.type = NLA_NESTED }, +}; + +static int validate_and_copy_check_pkt_len(struct net *net, + const struct nlattr *attr, + const struct sw_flow_key *key, + struct sw_flow_actions **sfa, + __be16 eth_type, __be16 vlan_tci, + bool log, bool last) +{ + const struct nlattr *acts_if_greater, *acts_if_lesser_eq; + struct nlattr *a[OVS_CHECK_PKT_LEN_ATTR_MAX + 1]; + struct check_pkt_len_arg arg; + int nested_acts_start; + int start, err; + + err = nla_parse_strict(a, OVS_CHECK_PKT_LEN_ATTR_MAX, nla_data(attr), + nla_len(attr), cpl_policy, NULL); + if (err) + return err; + + if (!a[OVS_CHECK_PKT_LEN_ATTR_PKT_LEN] || + !nla_get_u16(a[OVS_CHECK_PKT_LEN_ATTR_PKT_LEN])) + return -EINVAL; + + acts_if_lesser_eq = a[OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL]; + acts_if_greater = a[OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER]; + + /* Both the nested action should be present. */ + if (!acts_if_greater || !acts_if_lesser_eq) + return -EINVAL; + + /* validation done, copy the nested actions. */ + start = add_nested_action_start(sfa, OVS_ACTION_ATTR_CHECK_PKT_LEN, + log); + if (start < 0) + return start; + + arg.pkt_len = nla_get_u16(a[OVS_CHECK_PKT_LEN_ATTR_PKT_LEN]); + arg.exec_for_lesser_equal = + last || !actions_may_change_flow(acts_if_lesser_eq); + arg.exec_for_greater = + last || !actions_may_change_flow(acts_if_greater); + + err = ovs_nla_add_action(sfa, OVS_CHECK_PKT_LEN_ATTR_ARG, &arg, + sizeof(arg), log); + if (err) + return err; + + nested_acts_start = add_nested_action_start(sfa, + OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL, log); + if (nested_acts_start < 0) + return nested_acts_start; + + err = __ovs_nla_copy_actions(net, acts_if_lesser_eq, key, sfa, + eth_type, vlan_tci, log); + + if (err) + return err; + + add_nested_action_end(*sfa, nested_acts_start); + + nested_acts_start = add_nested_action_start(sfa, + OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER, log); + if (nested_acts_start < 0) + return nested_acts_start; + + err = __ovs_nla_copy_actions(net, acts_if_greater, key, sfa, + eth_type, vlan_tci, log); + + if (err) + return err; + + add_nested_action_end(*sfa, nested_acts_start); + add_nested_action_end(*sfa, start); + return 0; +} + static int copy_action(const struct nlattr *from, struct sw_flow_actions **sfa, bool log) { @@ -2884,6 +2966,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, [OVS_ACTION_ATTR_POP_NSH] = 0, [OVS_ACTION_ATTR_METER] = sizeof(u32), [OVS_ACTION_ATTR_CLONE] = (u32)-1, + [OVS_ACTION_ATTR_CHECK_PKT_LEN] = (u32)-1, }; const struct ovs_action_push_vlan *vlan; int type = nla_type(a); @@ -3085,6 +3168,19 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, break; } + case OVS_ACTION_ATTR_CHECK_PKT_LEN: { + bool last = nla_is_last(a, rem); + + err = validate_and_copy_check_pkt_len(net, a, key, sfa, + eth_type, + vlan_tci, log, + last); + if (err) + return err; + skip_copy = true; + break; + } + default: OVS_NLERR(log, "Unknown Action type %d", type); return -EINVAL; @@ -3183,6 +3279,75 @@ static int clone_action_to_attr(const struct nlattr *attr, return err; } +static int check_pkt_len_action_to_attr(const struct nlattr *attr, + struct sk_buff *skb) +{ + struct nlattr *start, *ac_start = NULL; + const struct check_pkt_len_arg *arg; + const struct nlattr *a, *cpl_arg; + int err = 0, rem = nla_len(attr); + + start = nla_nest_start(skb, OVS_ACTION_ATTR_CHECK_PKT_LEN); + if (!start) + return -EMSGSIZE; + + /* The first nested attribute in 'attr' is always + * 'OVS_CHECK_PKT_LEN_ATTR_ARG'. + */ + cpl_arg = nla_data(attr); + arg = nla_data(cpl_arg); + + if (nla_put_u16(skb, OVS_CHECK_PKT_LEN_ATTR_PKT_LEN, arg->pkt_len)) { + err = -EMSGSIZE; + goto out; + } + + /* Second nested attribute in 'attr' is always + * 'OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL'. + */ + a = nla_next(cpl_arg, &rem); + ac_start = nla_nest_start(skb, + OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL); + if (!ac_start) { + err = -EMSGSIZE; + goto out; + } + + err = ovs_nla_put_actions(nla_data(a), nla_len(a), skb); + if (err) { + nla_nest_cancel(skb, ac_start); + goto out; + } else { + nla_nest_end(skb, ac_start); + } + + /* Third nested attribute in 'attr' is always + * OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER. + */ + a = nla_next(a, &rem); + ac_start = nla_nest_start(skb, + OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER); + if (!ac_start) { + err = -EMSGSIZE; + goto out; + } + + err = ovs_nla_put_actions(nla_data(a), nla_len(a), skb); + if (err) { + nla_nest_cancel(skb, ac_start); + goto out; + } else { + nla_nest_end(skb, ac_start); + } + + nla_nest_end(skb, start); + return 0; + +out: + nla_nest_cancel(skb, start); + return err; +} + static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb) { const struct nlattr *ovs_key = nla_data(a); @@ -3277,6 +3442,12 @@ int ovs_nla_put_actions(const struct nlattr *attr, int len, struct sk_buff *skb) return err; break; + case OVS_ACTION_ATTR_CHECK_PKT_LEN: + err = check_pkt_len_action_to_attr(a, skb); + if (err) + return err; + break; + default: if (nla_put(skb, type, nla_len(a), nla_data(a))) return -EMSGSIZE; -- cgit v1.2.3 From 06bd2bdf19d2f3d22731625e1a47fa1dff5ac407 Mon Sep 17 00:00:00 2001 From: Yi-Hung Wei Date: Tue, 26 Mar 2019 11:31:14 -0700 Subject: openvswitch: Add timeout support to ct action Add support for fine-grain timeout support to conntrack action. The new OVS_CT_ATTR_TIMEOUT attribute of the conntrack action specifies a timeout to be associated with this connection. If no timeout is specified, it acts as is, that is the default timeout for the connection will be automatically applied. Example usage: $ nfct timeout add timeout_1 inet tcp syn_sent 100 established 200 $ ovs-ofctl add-flow br0 in_port=1,ip,tcp,action=ct(commit,timeout=timeout_1) CC: Pravin Shelar CC: Pablo Neira Ayuso Signed-off-by: Yi-Hung Wei Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 3 +++ net/openvswitch/conntrack.c | 30 +++++++++++++++++++++++++++++- 2 files changed, 32 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index dfabacee6903..0cac5d802c6a 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -734,6 +734,7 @@ struct ovs_action_hash { * be received on NFNLGRP_CONNTRACK_NEW and NFNLGRP_CONNTRACK_DESTROY groups, * respectively. Remaining bits control the changes for which an event is * delivered on the NFNLGRP_CONNTRACK_UPDATE group. + * @OVS_CT_ATTR_TIMEOUT: Variable length string defining conntrack timeout. */ enum ovs_ct_attr { OVS_CT_ATTR_UNSPEC, @@ -746,6 +747,8 @@ enum ovs_ct_attr { OVS_CT_ATTR_NAT, /* Nested OVS_NAT_ATTR_* */ OVS_CT_ATTR_FORCE_COMMIT, /* No argument */ OVS_CT_ATTR_EVENTMASK, /* u32 mask of IPCT_* events. */ + OVS_CT_ATTR_TIMEOUT, /* Associate timeout with this connection for + * fine-grain timeout tuning. */ __OVS_CT_ATTR_MAX }; diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 845b83598e0d..121b01d4a3c0 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -73,6 +74,7 @@ struct ovs_conntrack_info { u32 eventmask; /* Mask of 1 << IPCT_*. */ struct md_mark mark; struct md_labels labels; + char timeout[CTNL_TIMEOUT_NAME_MAX]; #ifdef CONFIG_NF_NAT_NEEDED struct nf_nat_range2 range; /* Only present for SRC NAT and DST NAT. */ #endif @@ -1471,6 +1473,8 @@ static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = { #endif [OVS_CT_ATTR_EVENTMASK] = { .minlen = sizeof(u32), .maxlen = sizeof(u32) }, + [OVS_CT_ATTR_TIMEOUT] = { .minlen = 1, + .maxlen = CTNL_TIMEOUT_NAME_MAX }, }; static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, @@ -1556,6 +1560,15 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, info->have_eventmask = true; info->eventmask = nla_get_u32(a); break; +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT + case OVS_CT_ATTR_TIMEOUT: + memcpy(info->timeout, nla_data(a), nla_len(a)); + if (!memchr(info->timeout, '\0', nla_len(a))) { + OVS_NLERR(log, "Invalid conntrack helper"); + return -EINVAL; + } + break; +#endif default: OVS_NLERR(log, "Unknown conntrack attr (%d)", @@ -1637,6 +1650,14 @@ int ovs_ct_copy_action(struct net *net, const struct nlattr *attr, OVS_NLERR(log, "Failed to allocate conntrack template"); return -ENOMEM; } + + if (ct_info.timeout[0]) { + if (nf_ct_set_timeout(net, ct_info.ct, family, key->ip.proto, + ct_info.timeout)) + pr_info_ratelimited("Failed to associated timeout " + "policy `%s'\n", ct_info.timeout); + } + if (helper) { err = ovs_ct_add_helper(&ct_info, helper, key, log); if (err) @@ -1757,6 +1778,10 @@ int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info, if (ct_info->have_eventmask && nla_put_u32(skb, OVS_CT_ATTR_EVENTMASK, ct_info->eventmask)) return -EMSGSIZE; + if (ct_info->timeout[0]) { + if (nla_put_string(skb, OVS_CT_ATTR_TIMEOUT, ct_info->timeout)) + return -EMSGSIZE; + } #ifdef CONFIG_NF_NAT_NEEDED if (ct_info->nat && !ovs_ct_nat_to_attr(ct_info, skb)) @@ -1778,8 +1803,11 @@ static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info) { if (ct_info->helper) nf_conntrack_helper_put(ct_info->helper); - if (ct_info->ct) + if (ct_info->ct) { nf_ct_tmpl_free(ct_info->ct); + if (ct_info->timeout[0]) + nf_ct_destroy_timeout(ct_info->ct); + } } #if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT) -- cgit v1.2.3 From 18b6f717483a835fb98de9f0df6c724df9324e78 Mon Sep 17 00:00:00 2001 From: wenxu Date: Thu, 28 Mar 2019 12:43:23 +0800 Subject: openvswitch: Make metadata_dst tunnel work in IP_TUNNEL_INFO_BRIDGE mode There is currently no support for the multicast/broadcast aspects of VXLAN in ovs. In the datapath flow the tun_dst must specific. But in the IP_TUNNEL_INFO_BRIDGE mode the tun_dst can not be specific. And the packet can forward through the fdb table of vxlan devcice. In this mode the broadcast/multicast packet can be sent through the following ways in ovs. ovs-vsctl add-port br0 vxlan -- set in vxlan type=vxlan \ options:key=1000 options:remote_ip=flow ovs-ofctl add-flow br0 in_port=LOCAL,dl_dst=ff:ff:ff:ff:ff:ff, \ action=output:vxlan bridge fdb append ff:ff:ff:ff:ff:ff dev vxlan_sys_4789 dst 172.168.0.1 \ src_vni 1000 vni 1000 self bridge fdb append ff:ff:ff:ff:ff:ff dev vxlan_sys_4789 dst 172.168.0.2 \ src_vni 1000 vni 1000 self Signed-off-by: wenxu Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 1 + net/openvswitch/flow_netlink.c | 46 +++++++++++++++++++++++++++++++--------- 2 files changed, 37 insertions(+), 10 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 0cac5d802c6a..f271f1ec50ae 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -364,6 +364,7 @@ enum ovs_tunnel_key_attr { OVS_TUNNEL_KEY_ATTR_IPV6_DST, /* struct in6_addr dst IPv6 address. */ OVS_TUNNEL_KEY_ATTR_PAD, OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS, /* struct erspan_metadata */ + OVS_TUNNEL_KEY_ATTR_IPV4_INFO_BRIDGE, /* No argument. IPV4_INFO_BRIDGE mode.*/ __OVS_TUNNEL_KEY_ATTR_MAX }; diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index b7543700db87..bd019058fc6f 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -404,6 +404,7 @@ static const struct ovs_len_tbl ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] [OVS_TUNNEL_KEY_ATTR_IPV6_SRC] = { .len = sizeof(struct in6_addr) }, [OVS_TUNNEL_KEY_ATTR_IPV6_DST] = { .len = sizeof(struct in6_addr) }, [OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS] = { .len = OVS_ATTR_VARIABLE }, + [OVS_TUNNEL_KEY_ATTR_IPV4_INFO_BRIDGE] = { .len = 0 }, }; static const struct ovs_len_tbl @@ -667,6 +668,7 @@ static int ip_tun_from_nlattr(const struct nlattr *attr, bool log) { bool ttl = false, ipv4 = false, ipv6 = false; + bool info_bridge_mode = false; __be16 tun_flags = 0; int opts_type = 0; struct nlattr *a; @@ -783,6 +785,10 @@ static int ip_tun_from_nlattr(const struct nlattr *attr, tun_flags |= TUNNEL_ERSPAN_OPT; opts_type = type; break; + case OVS_TUNNEL_KEY_ATTR_IPV4_INFO_BRIDGE: + info_bridge_mode = true; + ipv4 = true; + break; default: OVS_NLERR(log, "Unknown IP tunnel attribute %d", type); @@ -813,16 +819,29 @@ static int ip_tun_from_nlattr(const struct nlattr *attr, OVS_NLERR(log, "IP tunnel dst address not specified"); return -EINVAL; } - if (ipv4 && !match->key->tun_key.u.ipv4.dst) { - OVS_NLERR(log, "IPv4 tunnel dst address is zero"); - return -EINVAL; + if (ipv4) { + if (info_bridge_mode) { + if (match->key->tun_key.u.ipv4.src || + match->key->tun_key.u.ipv4.dst || + match->key->tun_key.tp_src || + match->key->tun_key.tp_dst || + match->key->tun_key.ttl || + match->key->tun_key.tos || + tun_flags & ~TUNNEL_KEY) { + OVS_NLERR(log, "IPv4 tun info is not correct"); + return -EINVAL; + } + } else if (!match->key->tun_key.u.ipv4.dst) { + OVS_NLERR(log, "IPv4 tunnel dst address is zero"); + return -EINVAL; + } } if (ipv6 && ipv6_addr_any(&match->key->tun_key.u.ipv6.dst)) { OVS_NLERR(log, "IPv6 tunnel dst address is zero"); return -EINVAL; } - if (!ttl) { + if (!ttl && !info_bridge_mode) { OVS_NLERR(log, "IP tunnel TTL not specified."); return -EINVAL; } @@ -851,12 +870,17 @@ static int vxlan_opt_to_nlattr(struct sk_buff *skb, static int __ip_tun_to_nlattr(struct sk_buff *skb, const struct ip_tunnel_key *output, const void *tun_opts, int swkey_tun_opts_len, - unsigned short tun_proto) + unsigned short tun_proto, u8 mode) { if (output->tun_flags & TUNNEL_KEY && nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id, OVS_TUNNEL_KEY_ATTR_PAD)) return -EMSGSIZE; + + if (mode & IP_TUNNEL_INFO_BRIDGE) + return nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_IPV4_INFO_BRIDGE) + ? -EMSGSIZE : 0; + switch (tun_proto) { case AF_INET: if (output->u.ipv4.src && @@ -919,7 +943,7 @@ static int __ip_tun_to_nlattr(struct sk_buff *skb, static int ip_tun_to_nlattr(struct sk_buff *skb, const struct ip_tunnel_key *output, const void *tun_opts, int swkey_tun_opts_len, - unsigned short tun_proto) + unsigned short tun_proto, u8 mode) { struct nlattr *nla; int err; @@ -929,7 +953,7 @@ static int ip_tun_to_nlattr(struct sk_buff *skb, return -EMSGSIZE; err = __ip_tun_to_nlattr(skb, output, tun_opts, swkey_tun_opts_len, - tun_proto); + tun_proto, mode); if (err) return err; @@ -943,7 +967,7 @@ int ovs_nla_put_tunnel_info(struct sk_buff *skb, return __ip_tun_to_nlattr(skb, &tun_info->key, ip_tunnel_info_opts(tun_info), tun_info->options_len, - ip_tunnel_info_af(tun_info)); + ip_tunnel_info_af(tun_info), tun_info->mode); } static int encode_vlan_from_nlattrs(struct sw_flow_match *match, @@ -1981,7 +2005,7 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey, opts = TUN_METADATA_OPTS(output, swkey->tun_opts_len); if (ip_tun_to_nlattr(skb, &output->tun_key, opts, - swkey->tun_opts_len, swkey->tun_proto)) + swkey->tun_opts_len, swkey->tun_proto, 0)) goto nla_put_failure; } @@ -2606,6 +2630,8 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, tun_info->mode = IP_TUNNEL_INFO_TX; if (key.tun_proto == AF_INET6) tun_info->mode |= IP_TUNNEL_INFO_IPV6; + else if (key.tun_proto == AF_INET && key.tun_key.u.ipv4.dst == 0) + tun_info->mode |= IP_TUNNEL_INFO_BRIDGE; tun_info->key = key.tun_key; /* We need to store the options in the action itself since @@ -3367,7 +3393,7 @@ static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb) err = ip_tun_to_nlattr(skb, &tun_info->key, ip_tunnel_info_opts(tun_info), tun_info->options_len, - ip_tunnel_info_af(tun_info)); + ip_tunnel_info_af(tun_info), tun_info->mode); if (err) return err; nla_nest_end(skb, start); -- cgit v1.2.3 From 1e1b11b6a1111cd9e8af1fd6ccda270a9fa3eacf Mon Sep 17 00:00:00 2001 From: vamsi krishna Date: Fri, 1 Feb 2019 18:34:51 +0530 Subject: nl80211/cfg80211: Specify band specific min RSSI thresholds with sched scan This commit adds the support to specify the RSSI thresholds per band for each match set. This enhances the current behavior which specifies a single rssi_threshold across all the bands by introducing the rssi_threshold_per_band. These per band rssi thresholds are referred through NL80211_BAND_* (enum nl80211_band) variables as attribute types. Such attributes/values per each band are nested through NL80211_ATTR_SCHED_SCAN_MIN_RSSI. These band specific rssi thresholds shall take precedence over the current rssi_thold per match set. Drivers indicate this support through %NL80211_EXT_FEATURE_SCHED_SCAN_BAND_SPECIFIC_RSSI_THOLD. These per band rssi attributes/values does not specify "default RSSI filter" as done by NL80211_SCHED_SCAN_MATCH_ATTR_RSSI to stay backward compatible. That said, these per band rssi values have to be specified for the corresponding matchset. Signed-off-by: vamsi krishna Signed-off-by: Srinivas Dasari [rebase on refactoring, add policy] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 8 +++++++ include/uapi/linux/nl80211.h | 13 +++++++++++ net/wireless/nl80211.c | 53 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 74 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index bb307a11ee63..b13234a486e7 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1832,11 +1832,19 @@ static inline void get_random_mask_addr(u8 *buf, const u8 *addr, const u8 *mask) * @bssid: BSSID to be matched; may be all-zero BSSID in case of SSID match * or no match (RSSI only) * @rssi_thold: don't report scan results below this threshold (in s32 dBm) + * @per_band_rssi_thold: Minimum rssi threshold for each band to be applied + * for filtering out scan results received. Drivers advertize this support + * of band specific rssi based filtering through the feature capability + * %NL80211_EXT_FEATURE_SCHED_SCAN_BAND_SPECIFIC_RSSI_THOLD. These band + * specific rssi thresholds take precedence over rssi_thold, if specified. + * If not specified for any band, it will be assigned with rssi_thold of + * corresponding matchset. */ struct cfg80211_match_set { struct cfg80211_ssid ssid; u8 bssid[ETH_ALEN]; s32 rssi_thold; + s32 per_band_rssi_thold[NUM_NL80211_BANDS]; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index dd4f86ee286e..4a9404958fbe 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3638,6 +3638,14 @@ enum nl80211_reg_rule_attr { * value as specified by &struct nl80211_bss_select_rssi_adjust. * @NL80211_SCHED_SCAN_MATCH_ATTR_BSSID: BSSID to be used for matching * (this cannot be used together with SSID). + * @NL80211_SCHED_SCAN_MATCH_PER_BAND_RSSI: Nested attribute that carries the + * band specific minimum rssi thresholds for the bands defined in + * enum nl80211_band. The minimum rssi threshold value(s32) specific to a + * band shall be encapsulated in attribute with type value equals to one + * of the NL80211_BAND_* defined in enum nl80211_band. For example, the + * minimum rssi threshold value for 2.4GHZ band shall be encapsulated + * within an attribute of type NL80211_BAND_2GHZ. And one or more of such + * attributes will be nested within this attribute. * @NL80211_SCHED_SCAN_MATCH_ATTR_MAX: highest scheduled scan filter * attribute number currently defined * @__NL80211_SCHED_SCAN_MATCH_ATTR_AFTER_LAST: internal use @@ -3650,6 +3658,7 @@ enum nl80211_sched_scan_match_attr { NL80211_SCHED_SCAN_MATCH_ATTR_RELATIVE_RSSI, NL80211_SCHED_SCAN_MATCH_ATTR_RSSI_ADJUST, NL80211_SCHED_SCAN_MATCH_ATTR_BSSID, + NL80211_SCHED_SCAN_MATCH_PER_BAND_RSSI, /* keep last */ __NL80211_SCHED_SCAN_MATCH_ATTR_AFTER_LAST, @@ -5343,6 +5352,9 @@ enum nl80211_feature_flags { * @NL80211_EXT_FEATURE_AP_PMKSA_CACHING: Driver/device supports PMKSA caching * (set/del PMKSA operations) in AP mode. * + * @NL80211_EXT_FEATURE_SCHED_SCAN_BAND_SPECIFIC_RSSI_THOLD: Driver supports + * filtering of sched scan results using band specific RSSI thresholds. + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. */ @@ -5384,6 +5396,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_ENABLE_FTM_RESPONDER, NL80211_EXT_FEATURE_AIRTIME_FAIRNESS, NL80211_EXT_FEATURE_AP_PMKSA_CACHING, + NL80211_EXT_FEATURE_SCHED_SCAN_BAND_SPECIFIC_RSSI_THOLD, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 5c49d11fc477..62f96d6c02f0 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -617,12 +617,21 @@ nl80211_rekey_policy[NUM_NL80211_REKEY_DATA] = { [NL80211_REKEY_DATA_REPLAY_CTR] = { .len = NL80211_REPLAY_CTR_LEN }, }; +static const struct nla_policy +nl80211_match_band_rssi_policy[NUM_NL80211_BANDS] = { + [NL80211_BAND_2GHZ] = { .type = NLA_S32 }, + [NL80211_BAND_5GHZ] = { .type = NLA_S32 }, + [NL80211_BAND_60GHZ] = { .type = NLA_S32 }, +}; + static const struct nla_policy nl80211_match_policy[NL80211_SCHED_SCAN_MATCH_ATTR_MAX + 1] = { [NL80211_SCHED_SCAN_MATCH_ATTR_SSID] = { .type = NLA_BINARY, .len = IEEE80211_MAX_SSID_LEN }, [NL80211_SCHED_SCAN_MATCH_ATTR_BSSID] = { .len = ETH_ALEN }, [NL80211_SCHED_SCAN_MATCH_ATTR_RSSI] = { .type = NLA_U32 }, + [NL80211_SCHED_SCAN_MATCH_PER_BAND_RSSI] = + NLA_POLICY_NESTED(nl80211_match_band_rssi_policy), }; static const struct nla_policy @@ -7537,6 +7546,41 @@ nl80211_parse_sched_scan_plans(struct wiphy *wiphy, int n_plans, return 0; } +static int +nl80211_parse_sched_scan_per_band_rssi(struct wiphy *wiphy, + struct cfg80211_match_set *match_sets, + struct nlattr *tb_band_rssi, + s32 rssi_thold) +{ + struct nlattr *attr; + int i, tmp, ret = 0; + + if (!wiphy_ext_feature_isset(wiphy, + NL80211_EXT_FEATURE_SCHED_SCAN_BAND_SPECIFIC_RSSI_THOLD)) { + if (tb_band_rssi) + ret = -EOPNOTSUPP; + else + for (i = 0; i < NUM_NL80211_BANDS; i++) + match_sets->per_band_rssi_thold[i] = + NL80211_SCAN_RSSI_THOLD_OFF; + return ret; + } + + for (i = 0; i < NUM_NL80211_BANDS; i++) + match_sets->per_band_rssi_thold[i] = rssi_thold; + + nla_for_each_nested(attr, tb_band_rssi, tmp) { + enum nl80211_band band = nla_type(attr); + + if (band < 0 || band >= NUM_NL80211_BANDS) + return -EINVAL; + + match_sets->per_band_rssi_thold[band] = nla_get_s32(attr); + } + + return 0; +} + static struct cfg80211_sched_scan_request * nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev, struct nlattr **attrs, int max_match_sets) @@ -7816,6 +7860,15 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev, if (rssi) request->match_sets[i].rssi_thold = nla_get_s32(rssi); + + /* Parse per band RSSI attribute */ + err = nl80211_parse_sched_scan_per_band_rssi(wiphy, + &request->match_sets[i], + tb[NL80211_SCHED_SCAN_MATCH_PER_BAND_RSSI], + request->match_sets[i].rssi_thold); + if (err) + goto out_free; + i++; } -- cgit v1.2.3 From ab60633c7136c300f15a390f3469d7c4be15a055 Mon Sep 17 00:00:00 2001 From: Narayanraddi Masti Date: Thu, 7 Feb 2019 12:16:05 -0800 Subject: mac80211: Add support for NL80211_STA_INFO_AIRTIME_LINK_METRIC Add support for mesh airtime link metric attribute NL80211_STA_INFO_AIRTIME_LINK_METRIC. Signed-off-by: Narayanraddi Masti Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 3 +++ include/uapi/linux/nl80211.h | 2 ++ net/mac80211/mesh.h | 2 ++ net/mac80211/mesh_hwmp.c | 4 ++-- net/mac80211/sta_info.c | 6 ++++++ net/wireless/nl80211.c | 1 + 6 files changed, 16 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index b13234a486e7..5859a5e02454 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1327,6 +1327,7 @@ struct cfg80211_tid_stats { * @fcs_err_count: number of packets (MPDUs) received from this station with * an FCS error. This counter should be incremented only when TA of the * received packet with an FCS error matches the peer MAC address. + * @airtime_link_metric: mesh airtime link metric. */ struct station_info { u64 filled; @@ -1381,6 +1382,8 @@ struct station_info { u32 rx_mpdu_count; u32 fcs_err_count; + + u32 airtime_link_metric; }; #if IS_ENABLED(CONFIG_CFG80211) diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 4a9404958fbe..07457f4aea00 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3139,6 +3139,7 @@ enum nl80211_sta_bss_param { * @NL80211_STA_INFO_TX_DURATION: aggregate PPDU duration for all frames * sent to the station (u64, usec) * @NL80211_STA_INFO_AIRTIME_WEIGHT: current airtime weight for station (u16) + * @NL80211_STA_INFO_AIRTIME_LINK_METRIC: airtime link metric for mesh station * @__NL80211_STA_INFO_AFTER_LAST: internal * @NL80211_STA_INFO_MAX: highest possible station info attribute */ @@ -3184,6 +3185,7 @@ enum nl80211_sta_info { NL80211_STA_INFO_CONNECTED_TO_GATE, NL80211_STA_INFO_TX_DURATION, NL80211_STA_INFO_AIRTIME_WEIGHT, + NL80211_STA_INFO_AIRTIME_LINK_METRIC, /* keep last */ __NL80211_STA_INFO_AFTER_LAST, diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h index 574c3891c4b2..88535a2e62bc 100644 --- a/net/mac80211/mesh.h +++ b/net/mac80211/mesh.h @@ -278,6 +278,8 @@ mesh_path_add(struct ieee80211_sub_if_data *sdata, const u8 *dst); int mesh_path_add_gate(struct mesh_path *mpath); int mesh_path_send_to_gates(struct mesh_path *mpath); int mesh_gate_num(struct ieee80211_sub_if_data *sdata); +u32 airtime_link_metric_get(struct ieee80211_local *local, + struct sta_info *sta); /* Mesh plinks */ void mesh_neighbour_update(struct ieee80211_sub_if_data *sdata, diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index f7517668e77a..c694c0dd907e 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -318,8 +318,8 @@ void ieee80211s_update_metric(struct ieee80211_local *local, cfg80211_calculate_bitrate(&rinfo)); } -static u32 airtime_link_metric_get(struct ieee80211_local *local, - struct sta_info *sta) +u32 airtime_link_metric_get(struct ieee80211_local *local, + struct sta_info *sta) { /* This should be adjusted for each device */ int device_constant = 1 << ARITH_SHIFT; diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 11f058987a54..a81e1279a76d 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -2373,6 +2373,12 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, sinfo->filled |= BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL_AVG); } + + if (ieee80211_vif_is_mesh(&sdata->vif)) { + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_AIRTIME_LINK_METRIC); + sinfo->airtime_link_metric = + airtime_link_metric_get(local, sta); + } } u32 sta_get_expected_throughput(struct sta_info *sta) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 62f96d6c02f0..7556c0479b3c 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -4898,6 +4898,7 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, PUT_SINFO(TX_RETRIES, tx_retries, u32); PUT_SINFO(TX_FAILED, tx_failed, u32); PUT_SINFO(EXPECTED_THROUGHPUT, expected_throughput, u32); + PUT_SINFO(AIRTIME_LINK_METRIC, airtime_link_metric, u32); PUT_SINFO(BEACON_LOSS, beacon_loss_count, u32); PUT_SINFO(LOCAL_PM, local_pm, u32); PUT_SINFO(PEER_PM, peer_pm, u32); -- cgit v1.2.3 From cb74e9775871f8c82a1297cf76209f10ab5bbe3d Mon Sep 17 00:00:00 2001 From: Sunil Dutt Date: Wed, 20 Feb 2019 16:18:07 +0530 Subject: cfg80211/nl80211: Offload OWE processing to user space in AP mode This interface allows the host driver to offload OWE processing to user space. This intends to support OWE (Opportunistic Wireless Encryption) AKM by the drivers that implement SME but rely on the user space for the cryptographic/OWE processing in AP mode. Such drivers are not capable of processing/deriving the DH IE. A new NL80211 command - NL80211_CMD_UPDATE_OWE_INFO is introduced to send the request/event between the host driver and user space. Driver shall provide the OWE info (MAC address and DH IE) of the peer to user space for cryptographic processing of the DH IE through the event. Accordingly, the user space shall update the OWE info/DH IE to the driver. Following is the sequence in AP mode for OWE authentication. Driver passes the OWE info obtained from the peer in the Association Request to the user space through the event cfg80211_update_owe_info_event. User space shall process the OWE info received and generate new OWE info. This OWE info is passed to the driver through NL80211_CMD_UPDATE_OWE_INFO request. Driver eventually uses this OWE info to send the Association Response to the peer. This OWE info in the command interface carries the IEs that include PMKID of the peer if the PMKSA is still valid or an updated DH IE for generating a new PMKSA with the peer. Signed-off-by: Liangwei Dong Signed-off-by: Sunil Dutt Signed-off-by: Srinivas Dasari [remove policy initialization - no longer exists] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 42 ++++++++++++++++++++++++++ include/uapi/linux/nl80211.h | 7 +++++ net/wireless/nl80211.c | 72 ++++++++++++++++++++++++++++++++++++++++++++ net/wireless/rdev-ops.h | 13 ++++++++ net/wireless/trace.h | 38 +++++++++++++++++++++++ 5 files changed, 172 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 5859a5e02454..70432fd638af 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -3110,6 +3110,32 @@ struct cfg80211_pmsr_request { struct cfg80211_pmsr_request_peer peers[]; }; +/** + * struct cfg80211_update_owe_info - OWE Information + * + * This structure provides information needed for the drivers to offload OWE + * (Opportunistic Wireless Encryption) processing to the user space. + * + * Commonly used across update_owe_info request and event interfaces. + * + * @peer: MAC address of the peer device for which the OWE processing + * has to be done. + * @status: status code, %WLAN_STATUS_SUCCESS for successful OWE info + * processing, use %WLAN_STATUS_UNSPECIFIED_FAILURE if user space + * cannot give you the real status code for failures. Used only for + * OWE update request command interface (user space to driver). + * @ie: IEs obtained from the peer or constructed by the user space. These are + * the IEs of the remote peer in the event from the host driver and + * the constructed IEs by the user space in the request interface. + * @ie_len: Length of IEs in octets. + */ +struct cfg80211_update_owe_info { + u8 peer[ETH_ALEN] __aligned(2); + u16 status; + const u8 *ie; + size_t ie_len; +}; + /** * struct cfg80211_ops - backend description for wireless configuration * @@ -3447,6 +3473,10 @@ struct cfg80211_pmsr_request { * Statistics should be cumulative, currently no way to reset is provided. * @start_pmsr: start peer measurement (e.g. FTM) * @abort_pmsr: abort peer measurement + * + * @update_owe_info: Provide updated OWE info to driver. Driver implementing SME + * but offloading OWE processing to the user space will get the updated + * DH IE through this interface. */ struct cfg80211_ops { int (*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow); @@ -3761,6 +3791,8 @@ struct cfg80211_ops { struct cfg80211_pmsr_request *request); void (*abort_pmsr)(struct wiphy *wiphy, struct wireless_dev *wdev, struct cfg80211_pmsr_request *request); + int (*update_owe_info)(struct wiphy *wiphy, struct net_device *dev, + struct cfg80211_update_owe_info *owe_info); }; /* @@ -7219,4 +7251,14 @@ void cfg80211_pmsr_complete(struct wireless_dev *wdev, #define wiphy_WARN(wiphy, format, args...) \ WARN(1, "wiphy: %s\n" format, wiphy_name(wiphy), ##args); +/** + * cfg80211_update_owe_info_event - Notify the peer's OWE info to user space + * @netdev: network device + * @owe_info: peer's owe info + * @gfp: allocation flags + */ +void cfg80211_update_owe_info_event(struct net_device *netdev, + struct cfg80211_update_owe_info *owe_info, + gfp_t gfp); + #endif /* __NET_CFG80211_H */ diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 07457f4aea00..a99d75bef598 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1065,6 +1065,11 @@ * indicated by %NL80211_ATTR_WIPHY_FREQ and other attributes * determining the width and type. * + * @NL80211_CMD_UPDATE_OWE_INFO: This interface allows the host driver to + * offload OWE processing to user space. This intends to support + * OWE AKM by the host drivers that implement SME but rely + * on the user space for the cryptographic/DH IE processing in AP mode. + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -1285,6 +1290,8 @@ enum nl80211_commands { NL80211_CMD_NOTIFY_RADAR, + NL80211_CMD_UPDATE_OWE_INFO, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 7556c0479b3c..0124bab1f8a7 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -13316,6 +13316,31 @@ nla_put_failure: return -ENOBUFS; } +static int nl80211_update_owe_info(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct cfg80211_update_owe_info owe_info; + struct net_device *dev = info->user_ptr[1]; + + if (!rdev->ops->update_owe_info) + return -EOPNOTSUPP; + + if (!info->attrs[NL80211_ATTR_STATUS_CODE] || + !info->attrs[NL80211_ATTR_MAC]) + return -EINVAL; + + memset(&owe_info, 0, sizeof(owe_info)); + owe_info.status = nla_get_u16(info->attrs[NL80211_ATTR_STATUS_CODE]); + nla_memcpy(owe_info.peer, info->attrs[NL80211_ATTR_MAC], ETH_ALEN); + + if (info->attrs[NL80211_ATTR_IE]) { + owe_info.ie = nla_data(info->attrs[NL80211_ATTR_IE]); + owe_info.ie_len = nla_len(info->attrs[NL80211_ATTR_IE]); + } + + return rdev_update_owe_info(rdev, dev, &owe_info); +} + #define NL80211_FLAG_NEED_WIPHY 0x01 #define NL80211_FLAG_NEED_NETDEV 0x02 #define NL80211_FLAG_NEED_RTNL 0x04 @@ -14146,6 +14171,13 @@ static const struct genl_ops nl80211_ops[] = { .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, + { + .cmd = NL80211_CMD_UPDATE_OWE_INFO, + .doit = nl80211_update_owe_info, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, }; static struct genl_family nl80211_fam __ro_after_init = { @@ -16318,6 +16350,46 @@ int cfg80211_external_auth_request(struct net_device *dev, } EXPORT_SYMBOL(cfg80211_external_auth_request); +void cfg80211_update_owe_info_event(struct net_device *netdev, + struct cfg80211_update_owe_info *owe_info, + gfp_t gfp) +{ + struct wiphy *wiphy = netdev->ieee80211_ptr->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); + struct sk_buff *msg; + void *hdr; + + trace_cfg80211_update_owe_info_event(wiphy, netdev, owe_info); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_UPDATE_OWE_INFO); + if (!hdr) + goto nla_put_failure; + + if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || + nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex) || + nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, owe_info->peer)) + goto nla_put_failure; + + if (!owe_info->ie_len || + nla_put(msg, NL80211_ATTR_IE, owe_info->ie_len, owe_info->ie)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0, + NL80211_MCGRP_MLME, gfp); + return; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + nlmsg_free(msg); +} +EXPORT_SYMBOL(cfg80211_update_owe_info_event); + /* initialisation/exit functions */ int __init nl80211_init(void) diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index 5cb48d135fab..c1e3210b09e6 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -1272,4 +1272,17 @@ rdev_abort_pmsr(struct cfg80211_registered_device *rdev, trace_rdev_return_void(&rdev->wiphy); } +static inline int rdev_update_owe_info(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct cfg80211_update_owe_info *oweinfo) +{ + int ret = -EOPNOTSUPP; + + trace_rdev_update_owe_info(&rdev->wiphy, dev, oweinfo); + if (rdev->ops->update_owe_info) + ret = rdev->ops->update_owe_info(&rdev->wiphy, dev, oweinfo); + trace_rdev_return_int(&rdev->wiphy, ret); + return ret; +} + #endif /* __CFG80211_RDEV_OPS */ diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 44b2ce1bb13a..2dda5291fc01 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -3362,6 +3362,44 @@ TRACE_EVENT(cfg80211_pmsr_complete, WIPHY_PR_ARG, WDEV_PR_ARG, (unsigned long long)__entry->cookie) ); + +TRACE_EVENT(rdev_update_owe_info, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + struct cfg80211_update_owe_info *owe_info), + TP_ARGS(wiphy, netdev, owe_info), + TP_STRUCT__entry(WIPHY_ENTRY + NETDEV_ENTRY + MAC_ENTRY(peer) + __field(u16, status) + __dynamic_array(u8, ie, owe_info->ie_len)), + TP_fast_assign(WIPHY_ASSIGN; + NETDEV_ASSIGN; + MAC_ASSIGN(peer, owe_info->peer); + __entry->status = owe_info->status; + memcpy(__get_dynamic_array(ie), + owe_info->ie, owe_info->ie_len);), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", peer: " MAC_PR_FMT + " status %d", WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer), + __entry->status) +); + +TRACE_EVENT(cfg80211_update_owe_info_event, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + struct cfg80211_update_owe_info *owe_info), + TP_ARGS(wiphy, netdev, owe_info), + TP_STRUCT__entry(WIPHY_ENTRY + NETDEV_ENTRY + MAC_ENTRY(peer) + __dynamic_array(u8, ie, owe_info->ie_len)), + TP_fast_assign(WIPHY_ASSIGN; + NETDEV_ASSIGN; + MAC_ASSIGN(peer, owe_info->peer); + memcpy(__get_dynamic_array(ie), owe_info->ie, + owe_info->ie_len);), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", peer: " MAC_PR_FMT, + WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer)) +); + #endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */ #undef TRACE_INCLUDE_PATH -- cgit v1.2.3 From 84c0d5e96f3ae20344fb3a79161eab18905dae56 Mon Sep 17 00:00:00 2001 From: Jacky Hu Date: Tue, 26 Mar 2019 18:31:21 +0800 Subject: ipvs: allow tunneling with gue encapsulation ipip packets are blocked in some public cloud environments, this patch allows gue encapsulation with the tunneling method, which would make tunneling working in those environments. Signed-off-by: Jacky Hu Acked-by: Julian Anastasov Signed-off-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- include/net/ip_vs.h | 5 +++ include/uapi/linux/ip_vs.h | 11 ++++++ net/netfilter/ipvs/ip_vs_ctl.c | 35 ++++++++++++++++- net/netfilter/ipvs/ip_vs_xmit.c | 84 +++++++++++++++++++++++++++++++++++++++-- 4 files changed, 130 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 047f9a5ccaad..2ac40135b576 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -600,6 +600,9 @@ struct ip_vs_dest_user_kern { /* Address family of addr */ u16 af; + + u16 tun_type; /* tunnel type */ + __be16 tun_port; /* tunnel port */ }; @@ -660,6 +663,8 @@ struct ip_vs_dest { atomic_t conn_flags; /* flags to copy to conn */ atomic_t weight; /* server weight */ atomic_t last_weight; /* server latest weight */ + __u16 tun_type; /* tunnel type */ + __be16 tun_port; /* tunnel port */ refcount_t refcnt; /* reference counter */ struct ip_vs_stats stats; /* statistics */ diff --git a/include/uapi/linux/ip_vs.h b/include/uapi/linux/ip_vs.h index 1c916b2f89dc..e34f436fc79d 100644 --- a/include/uapi/linux/ip_vs.h +++ b/include/uapi/linux/ip_vs.h @@ -124,6 +124,13 @@ #define IP_VS_PEDATA_MAXLEN 255 +/* Tunnel types */ +enum { + IP_VS_CONN_F_TUNNEL_TYPE_IPIP = 0, /* IPIP */ + IP_VS_CONN_F_TUNNEL_TYPE_GUE, /* GUE */ + IP_VS_CONN_F_TUNNEL_TYPE_MAX, +}; + /* * The struct ip_vs_service_user and struct ip_vs_dest_user are * used to set IPVS rules through setsockopt. @@ -392,6 +399,10 @@ enum { IPVS_DEST_ATTR_STATS64, /* nested attribute for dest stats */ + IPVS_DEST_ATTR_TUN_TYPE, /* tunnel type */ + + IPVS_DEST_ATTR_TUN_PORT, /* tunnel port */ + __IPVS_DEST_ATTR_MAX, }; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 4b933669fd83..ab119a7540db 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -831,6 +831,10 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK; conn_flags |= IP_VS_CONN_F_INACTIVE; + /* set the tunnel info */ + dest->tun_type = udest->tun_type; + dest->tun_port = udest->tun_port; + /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */ if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) { conn_flags |= IP_VS_CONN_F_NOOUTPUT; @@ -987,6 +991,13 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) return -ERANGE; } + if (udest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { + if (udest->tun_port == 0) { + pr_err("%s(): tunnel port is zero\n", __func__); + return -EINVAL; + } + } + ip_vs_addr_copy(udest->af, &daddr, &udest->addr); /* We use function that requires RCU lock */ @@ -1051,6 +1062,13 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) return -ERANGE; } + if (udest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { + if (udest->tun_port == 0) { + pr_err("%s(): tunnel port is zero\n", __func__); + return -EINVAL; + } + } + ip_vs_addr_copy(udest->af, &daddr, &udest->addr); /* We use function that requires RCU lock */ @@ -2333,6 +2351,7 @@ static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest, udest->u_threshold = udest_compat->u_threshold; udest->l_threshold = udest_compat->l_threshold; udest->af = AF_INET; + udest->tun_type = IP_VS_CONN_F_TUNNEL_TYPE_IPIP; } static int @@ -2890,6 +2909,8 @@ static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = { [IPVS_DEST_ATTR_PERSIST_CONNS] = { .type = NLA_U32 }, [IPVS_DEST_ATTR_STATS] = { .type = NLA_NESTED }, [IPVS_DEST_ATTR_ADDR_FAMILY] = { .type = NLA_U16 }, + [IPVS_DEST_ATTR_TUN_TYPE] = { .type = NLA_U8 }, + [IPVS_DEST_ATTR_TUN_PORT] = { .type = NLA_U16 }, }; static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type, @@ -3193,6 +3214,10 @@ static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest) IP_VS_CONN_F_FWD_MASK)) || nla_put_u32(skb, IPVS_DEST_ATTR_WEIGHT, atomic_read(&dest->weight)) || + nla_put_u8(skb, IPVS_DEST_ATTR_TUN_TYPE, + dest->tun_type) || + nla_put_be16(skb, IPVS_DEST_ATTR_TUN_PORT, + dest->tun_port) || nla_put_u32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold) || nla_put_u32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold) || nla_put_u32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS, @@ -3315,12 +3340,14 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, /* If a full entry was requested, check for the additional fields */ if (full_entry) { struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh, - *nla_l_thresh; + *nla_l_thresh, *nla_tun_type, *nla_tun_port; nla_fwd = attrs[IPVS_DEST_ATTR_FWD_METHOD]; nla_weight = attrs[IPVS_DEST_ATTR_WEIGHT]; nla_u_thresh = attrs[IPVS_DEST_ATTR_U_THRESH]; nla_l_thresh = attrs[IPVS_DEST_ATTR_L_THRESH]; + nla_tun_type = attrs[IPVS_DEST_ATTR_TUN_TYPE]; + nla_tun_port = attrs[IPVS_DEST_ATTR_TUN_PORT]; if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh)) return -EINVAL; @@ -3330,6 +3357,12 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, udest->weight = nla_get_u32(nla_weight); udest->u_threshold = nla_get_u32(nla_u_thresh); udest->l_threshold = nla_get_u32(nla_l_thresh); + + if (nla_tun_type) + udest->tun_type = nla_get_u8(nla_tun_type); + + if (nla_tun_port) + udest->tun_port = nla_get_be16(nla_tun_port); } return 0; diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 175349fcf91f..8d6f94b67772 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -32,6 +32,7 @@ #include #include /* for tcphdr */ #include +#include #include /* for csum_tcpudp_magic */ #include #include /* for icmp_send */ @@ -382,6 +383,10 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, mtu = dst_mtu(&rt->dst); } else { mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); + if (!dest) + goto err_put; + if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) + mtu -= sizeof(struct udphdr) + sizeof(struct guehdr); if (mtu < 68) { IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__); goto err_put; @@ -533,6 +538,10 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, mtu = dst_mtu(&rt->dst); else { mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr); + if (!dest) + goto err_put; + if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) + mtu -= sizeof(struct udphdr) + sizeof(struct guehdr); if (mtu < IPV6_MIN_MTU) { IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__, IPV6_MIN_MTU); @@ -989,6 +998,41 @@ static inline int __tun_gso_type_mask(int encaps_af, int orig_af) } } +static int +ipvs_gue_encap(struct net *net, struct sk_buff *skb, + struct ip_vs_conn *cp, __u8 *next_protocol) +{ + __be16 dport; + __be16 sport = udp_flow_src_port(net, skb, 0, 0, false); + struct udphdr *udph; /* Our new UDP header */ + struct guehdr *gueh; /* Our new GUE header */ + + skb_push(skb, sizeof(struct guehdr)); + + gueh = (struct guehdr *)skb->data; + + gueh->control = 0; + gueh->version = 0; + gueh->hlen = 0; + gueh->flags = 0; + gueh->proto_ctype = *next_protocol; + + skb_push(skb, sizeof(struct udphdr)); + skb_reset_transport_header(skb); + + udph = udp_hdr(skb); + + dport = cp->dest->tun_port; + udph->dest = dport; + udph->source = sport; + udph->len = htons(skb->len); + udph->check = 0; + + *next_protocol = IPPROTO_UDP; + + return 0; +} + /* * IP Tunneling transmitter * @@ -1025,6 +1069,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct iphdr *iph; /* Our new IP header */ unsigned int max_headroom; /* The extra header space needed */ int ret, local; + int tun_type, gso_type; EnterFunction(10); @@ -1046,6 +1091,11 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, */ max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr); + tun_type = cp->dest->tun_type; + + if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) + max_headroom += sizeof(struct udphdr) + sizeof(struct guehdr); + /* We only care about the df field if sysctl_pmtu_disc(ipvs) is set */ dfp = sysctl_pmtu_disc(ipvs) ? &df : NULL; skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom, @@ -1054,11 +1104,20 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, if (IS_ERR(skb)) goto tx_error; - if (iptunnel_handle_offloads(skb, __tun_gso_type_mask(AF_INET, cp->af))) + gso_type = __tun_gso_type_mask(AF_INET, cp->af); + if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) + gso_type |= SKB_GSO_UDP_TUNNEL; + + if (iptunnel_handle_offloads(skb, gso_type)) goto tx_error; skb->transport_header = skb->network_header; + skb_set_inner_ipproto(skb, next_protocol); + + if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) + ipvs_gue_encap(net, skb, cp, &next_protocol); + skb_push(skb, sizeof(struct iphdr)); skb_reset_network_header(skb); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); @@ -1102,6 +1161,8 @@ int ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) { + struct netns_ipvs *ipvs = cp->ipvs; + struct net *net = ipvs->net; struct rt6_info *rt; /* Route to the other host */ struct in6_addr saddr; /* Source for tunnel */ struct net_device *tdev; /* Device to other host */ @@ -1112,10 +1173,11 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct ipv6hdr *iph; /* Our new IP header */ unsigned int max_headroom; /* The extra header space needed */ int ret, local; + int tun_type, gso_type; EnterFunction(10); - local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest, + local = __ip_vs_get_out_rt_v6(ipvs, cp->af, skb, cp->dest, &cp->daddr.in6, &saddr, ipvsh, 1, IP_VS_RT_MODE_LOCAL | @@ -1134,17 +1196,31 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, */ max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr); + tun_type = cp->dest->tun_type; + + if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) + max_headroom += sizeof(struct udphdr) + sizeof(struct guehdr); + skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom, &next_protocol, &payload_len, &dsfield, &ttl, NULL); if (IS_ERR(skb)) goto tx_error; - if (iptunnel_handle_offloads(skb, __tun_gso_type_mask(AF_INET6, cp->af))) + gso_type = __tun_gso_type_mask(AF_INET6, cp->af); + if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) + gso_type |= SKB_GSO_UDP_TUNNEL; + + if (iptunnel_handle_offloads(skb, gso_type)) goto tx_error; skb->transport_header = skb->network_header; + skb_set_inner_ipproto(skb, next_protocol); + + if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) + ipvs_gue_encap(net, skb, cp, &next_protocol); + skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); @@ -1167,7 +1243,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, ret = ip_vs_tunnel_xmit_prepare(skb, cp); if (ret == NF_ACCEPT) - ip6_local_out(cp->ipvs->net, skb->sk, skb); + ip6_local_out(net, skb->sk, skb); else if (ret == NF_DROP) kfree_skb(skb); -- cgit v1.2.3 From 22c7652cdaa8cd33ce78bacceb4e826a3f795873 Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Wed, 27 Mar 2019 11:36:26 +0100 Subject: netfilter: nft_osf: Add version option support Add version option support to the nftables "osf" expression. Signed-off-by: Fernando Fernandez Mancera Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nfnetlink_osf.h | 11 ++++++++--- include/uapi/linux/netfilter/nf_tables.h | 6 ++++++ net/netfilter/nfnetlink_osf.c | 14 +++++++------- net/netfilter/nft_osf.c | 30 +++++++++++++++++++++++++----- 4 files changed, 46 insertions(+), 15 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/netfilter/nfnetlink_osf.h b/include/linux/netfilter/nfnetlink_osf.h index c6000046c966..788613f36935 100644 --- a/include/linux/netfilter/nfnetlink_osf.h +++ b/include/linux/netfilter/nfnetlink_osf.h @@ -21,13 +21,18 @@ struct nf_osf_finger { struct nf_osf_user_finger finger; }; +struct nf_osf_data { + const char *genre; + const char *version; +}; + bool nf_osf_match(const struct sk_buff *skb, u_int8_t family, int hooknum, struct net_device *in, struct net_device *out, const struct nf_osf_info *info, struct net *net, const struct list_head *nf_osf_fingers); -const char *nf_osf_find(const struct sk_buff *skb, - const struct list_head *nf_osf_fingers, - const int ttl_check); +bool nf_osf_find(const struct sk_buff *skb, + const struct list_head *nf_osf_fingers, + const int ttl_check, struct nf_osf_data *data); #endif /* _NFOSF_H */ diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index a66c8de006cc..061bb3eb20c3 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1522,15 +1522,21 @@ enum nft_flowtable_hook_attributes { * * @NFTA_OSF_DREG: destination register (NLA_U32: nft_registers) * @NFTA_OSF_TTL: Value of the TTL osf option (NLA_U8) + * @NFTA_OSF_FLAGS: flags (NLA_U32) */ enum nft_osf_attributes { NFTA_OSF_UNSPEC, NFTA_OSF_DREG, NFTA_OSF_TTL, + NFTA_OSF_FLAGS, __NFTA_OSF_MAX, }; #define NFTA_OSF_MAX (__NFTA_OSF_MAX - 1) +enum nft_osf_flags { + NFT_OSF_F_VERSION = (1 << 0), +}; + /** * enum nft_device_attributes - nf_tables device netlink attributes * diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c index 1f1d90c1716b..7b827bcb412c 100644 --- a/net/netfilter/nfnetlink_osf.c +++ b/net/netfilter/nfnetlink_osf.c @@ -255,9 +255,9 @@ nf_osf_match(const struct sk_buff *skb, u_int8_t family, } EXPORT_SYMBOL_GPL(nf_osf_match); -const char *nf_osf_find(const struct sk_buff *skb, - const struct list_head *nf_osf_fingers, - const int ttl_check) +bool nf_osf_find(const struct sk_buff *skb, + const struct list_head *nf_osf_fingers, + const int ttl_check, struct nf_osf_data *data) { const struct iphdr *ip = ip_hdr(skb); const struct nf_osf_user_finger *f; @@ -265,24 +265,24 @@ const char *nf_osf_find(const struct sk_buff *skb, const struct nf_osf_finger *kf; struct nf_osf_hdr_ctx ctx; const struct tcphdr *tcp; - const char *genre = NULL; memset(&ctx, 0, sizeof(ctx)); tcp = nf_osf_hdr_ctx_init(&ctx, skb, ip, opts); if (!tcp) - return NULL; + return false; list_for_each_entry_rcu(kf, &nf_osf_fingers[ctx.df], finger_entry) { f = &kf->finger; if (!nf_osf_match_one(skb, f, ttl_check, &ctx)) continue; - genre = f->genre; + data->genre = f->genre; + data->version = f->version; break; } - return genre; + return true; } EXPORT_SYMBOL_GPL(nf_osf_find); diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c index b13618c764ec..87b60d6617ef 100644 --- a/net/netfilter/nft_osf.c +++ b/net/netfilter/nft_osf.c @@ -7,11 +7,13 @@ struct nft_osf { enum nft_registers dreg:8; u8 ttl; + u32 flags; }; static const struct nla_policy nft_osf_policy[NFTA_OSF_MAX + 1] = { [NFTA_OSF_DREG] = { .type = NLA_U32 }, [NFTA_OSF_TTL] = { .type = NLA_U8 }, + [NFTA_OSF_FLAGS] = { .type = NLA_U32 }, }; static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs, @@ -20,9 +22,10 @@ static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs, struct nft_osf *priv = nft_expr_priv(expr); u32 *dest = ®s->data[priv->dreg]; struct sk_buff *skb = pkt->skb; + char os_match[NFT_OSF_MAXGENRELEN + 1]; const struct tcphdr *tcp; + struct nf_osf_data data; struct tcphdr _tcph; - const char *os_name; tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), &_tcph); @@ -35,11 +38,17 @@ static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs, return; } - os_name = nf_osf_find(skb, nf_osf_fingers, priv->ttl); - if (!os_name) + if (!nf_osf_find(skb, nf_osf_fingers, priv->ttl, &data)) { strncpy((char *)dest, "unknown", NFT_OSF_MAXGENRELEN); - else - strncpy((char *)dest, os_name, NFT_OSF_MAXGENRELEN); + } else { + if (priv->flags & NFT_OSF_F_VERSION) + snprintf(os_match, NFT_OSF_MAXGENRELEN, "%s:%s", + data.genre, data.version); + else + strlcpy(os_match, data.genre, NFT_OSF_MAXGENRELEN); + + strncpy((char *)dest, os_match, NFT_OSF_MAXGENRELEN); + } } static int nft_osf_init(const struct nft_ctx *ctx, @@ -47,6 +56,7 @@ static int nft_osf_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_osf *priv = nft_expr_priv(expr); + u32 flags; int err; u8 ttl; @@ -57,6 +67,13 @@ static int nft_osf_init(const struct nft_ctx *ctx, priv->ttl = ttl; } + if (tb[NFTA_OSF_FLAGS]) { + flags = ntohl(nla_get_be32(tb[NFTA_OSF_FLAGS])); + if (flags != NFT_OSF_F_VERSION) + return -EINVAL; + priv->flags = flags; + } + priv->dreg = nft_parse_register(tb[NFTA_OSF_DREG]); err = nft_validate_register_store(ctx, priv->dreg, NULL, NFT_DATA_VALUE, NFT_OSF_MAXGENRELEN); @@ -73,6 +90,9 @@ static int nft_osf_dump(struct sk_buff *skb, const struct nft_expr *expr) if (nla_put_u8(skb, NFTA_OSF_TTL, priv->ttl)) goto nla_put_failure; + if (nla_put_be32(skb, NFTA_OSF_FLAGS, ntohl(priv->flags))) + goto nla_put_failure; + if (nft_dump_register(skb, NFTA_OSF_DREG, priv->dreg)) goto nla_put_failure; -- cgit v1.2.3 From d8eca5bbb2be9bc7546f9e733786fa2f1a594c67 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 9 Apr 2019 23:20:03 +0200 Subject: bpf: implement lookup-free direct value access for maps This generic extension to BPF maps allows for directly loading an address residing inside a BPF map value as a single BPF ldimm64 instruction! The idea is similar to what BPF_PSEUDO_MAP_FD does today, which is a special src_reg flag for ldimm64 instruction that indicates that inside the first part of the double insns's imm field is a file descriptor which the verifier then replaces as a full 64bit address of the map into both imm parts. For the newly added BPF_PSEUDO_MAP_VALUE src_reg flag, the idea is the following: the first part of the double insns's imm field is again a file descriptor corresponding to the map, and the second part of the imm field is an offset into the value. The verifier will then replace both imm parts with an address that points into the BPF map value at the given value offset for maps that support this operation. Currently supported is array map with single entry. It is possible to support more than just single map element by reusing both 16bit off fields of the insns as a map index, so full array map lookup could be expressed that way. It hasn't been implemented here due to lack of concrete use case, but could easily be done so in future in a compatible way, since both off fields right now have to be 0 and would correctly denote a map index 0. The BPF_PSEUDO_MAP_VALUE is a distinct flag as otherwise with BPF_PSEUDO_MAP_FD we could not differ offset 0 between load of map pointer versus load of map's value at offset 0, and changing BPF_PSEUDO_MAP_FD's encoding into off by one to differ between regular map pointer and map value pointer would add unnecessary complexity and increases barrier for debugability thus less suitable. Using the second part of the imm field as an offset into the value does /not/ come with limitations since maximum possible value size is in u32 universe anyway. This optimization allows for efficiently retrieving an address to a map value memory area without having to issue a helper call which needs to prepare registers according to calling convention, etc, without needing the extra NULL test, and without having to add the offset in an additional instruction to the value base pointer. The verifier then treats the destination register as PTR_TO_MAP_VALUE with constant reg->off from the user passed offset from the second imm field, and guarantees that this is within bounds of the map value. Any subsequent operations are normally treated as typical map value handling without anything extra needed from verification side. The two map operations for direct value access have been added to array map for now. In future other types could be supported as well depending on the use case. The main use case for this commit is to allow for BPF loader support for global variables that reside in .data/.rodata/.bss sections such that we can directly load the address of them with minimal additional infrastructure required. Loader support has been added in subsequent commits for libbpf library. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 +++ include/linux/bpf_verifier.h | 4 ++ include/uapi/linux/bpf.h | 13 +++++- kernel/bpf/arraymap.c | 32 +++++++++++++++ kernel/bpf/core.c | 3 +- kernel/bpf/disasm.c | 5 ++- kernel/bpf/syscall.c | 28 +++++++++---- kernel/bpf/verifier.c | 86 ++++++++++++++++++++++++++++++--------- tools/bpf/bpftool/xlated_dumper.c | 3 ++ 9 files changed, 149 insertions(+), 31 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a445194b5fb6..bd93a592dd29 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -57,6 +57,12 @@ struct bpf_map_ops { const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type); + + /* Direct value access helpers. */ + int (*map_direct_value_addr)(const struct bpf_map *map, + u64 *imm, u32 off); + int (*map_direct_value_meta)(const struct bpf_map *map, + u64 imm, u32 *off); }; struct bpf_map { diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index fc8254d6b569..b3ab61fe1932 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -224,6 +224,10 @@ struct bpf_insn_aux_data { unsigned long map_state; /* pointer/poison value for maps */ s32 call_imm; /* saved imm field of call insn */ u32 alu_limit; /* limit for add/sub register with pointer */ + struct { + u32 map_index; /* index into used_maps[] */ + u32 map_off; /* offset from value base address */ + }; }; int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ int sanitize_stack_off; /* stack slot to be cleared */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 837024512baf..26cfb5b2c964 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -255,8 +255,19 @@ enum bpf_attach_type { */ #define BPF_F_ANY_ALIGNMENT (1U << 1) -/* when bpf_ldimm64->src_reg == BPF_PSEUDO_MAP_FD, bpf_ldimm64->imm == fd */ +/* When BPF ldimm64's insn[0].src_reg != 0 then this can have + * two extensions: + * + * insn[0].src_reg: BPF_PSEUDO_MAP_FD BPF_PSEUDO_MAP_VALUE + * insn[0].imm: map fd map fd + * insn[1].imm: 0 offset into value + * insn[0].off: 0 0 + * insn[1].off: 0 0 + * ldimm64 rewrite: address of map address of map[0]+offset + * verifier type: CONST_PTR_TO_MAP PTR_TO_MAP_VALUE + */ #define BPF_PSEUDO_MAP_FD 1 +#define BPF_PSEUDO_MAP_VALUE 2 /* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative * offset to another bpf function diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index c72e0d8e1e65..1a6e9861d554 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -160,6 +160,36 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key) return array->value + array->elem_size * (index & array->index_mask); } +static int array_map_direct_value_addr(const struct bpf_map *map, u64 *imm, + u32 off) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + + if (map->max_entries != 1) + return -ENOTSUPP; + if (off >= map->value_size) + return -EINVAL; + + *imm = (unsigned long)array->value; + return 0; +} + +static int array_map_direct_value_meta(const struct bpf_map *map, u64 imm, + u32 *off) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u64 base = (unsigned long)array->value; + u64 range = array->elem_size; + + if (map->max_entries != 1) + return -ENOTSUPP; + if (imm < base || imm >= base + range) + return -ENOENT; + + *off = imm - base; + return 0; +} + /* emit BPF instructions equivalent to C code of array_map_lookup_elem() */ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { @@ -419,6 +449,8 @@ const struct bpf_map_ops array_map_ops = { .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, .map_gen_lookup = array_map_gen_lookup, + .map_direct_value_addr = array_map_direct_value_addr, + .map_direct_value_meta = array_map_direct_value_meta, .map_seq_show_elem = array_map_seq_show_elem, .map_check_btf = array_map_check_btf, }; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 2966cb368bf4..ace8c22c8b0e 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -292,7 +292,8 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) dst[i] = fp->insnsi[i]; if (!was_ld_map && dst[i].code == (BPF_LD | BPF_IMM | BPF_DW) && - dst[i].src_reg == BPF_PSEUDO_MAP_FD) { + (dst[i].src_reg == BPF_PSEUDO_MAP_FD || + dst[i].src_reg == BPF_PSEUDO_MAP_VALUE)) { was_ld_map = true; dst[i].imm = 0; } else if (was_ld_map && diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index de73f55e42fd..d9ce383c0f9c 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -205,10 +205,11 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, * part of the ldimm64 insn is accessible. */ u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; - bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD; + bool is_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD || + insn->src_reg == BPF_PSEUDO_MAP_VALUE; char tmp[64]; - if (map_ptr && !allow_ptr_leaks) + if (is_ptr && !allow_ptr_leaks) imm = 0; verbose(cbs->private_data, "(%02x) r%d = %s\n", diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 1d65e56594db..828518bb947b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2072,13 +2072,26 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr) } static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog, - unsigned long addr) + unsigned long addr, u32 *off, + u32 *type) { + const struct bpf_map *map; int i; - for (i = 0; i < prog->aux->used_map_cnt; i++) - if (prog->aux->used_maps[i] == (void *)addr) - return prog->aux->used_maps[i]; + for (i = 0, *off = 0; i < prog->aux->used_map_cnt; i++) { + map = prog->aux->used_maps[i]; + if (map == (void *)addr) { + *type = BPF_PSEUDO_MAP_FD; + return map; + } + if (!map->ops->map_direct_value_meta) + continue; + if (!map->ops->map_direct_value_meta(map, addr, off)) { + *type = BPF_PSEUDO_MAP_VALUE; + return map; + } + } + return NULL; } @@ -2086,6 +2099,7 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog) { const struct bpf_map *map; struct bpf_insn *insns; + u32 off, type; u64 imm; int i; @@ -2113,11 +2127,11 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog) continue; imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm; - map = bpf_map_from_imm(prog, imm); + map = bpf_map_from_imm(prog, imm, &off, &type); if (map) { - insns[i].src_reg = BPF_PSEUDO_MAP_FD; + insns[i].src_reg = type; insns[i].imm = map->id; - insns[i + 1].imm = 0; + insns[i + 1].imm = off; continue; } } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 48718e1da16d..6ab7a23fc924 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5056,18 +5056,12 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, return 0; } -/* return the map pointer stored inside BPF_LD_IMM64 instruction */ -static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn) -{ - u64 imm64 = ((u64) (u32) insn[0].imm) | ((u64) (u32) insn[1].imm) << 32; - - return (struct bpf_map *) (unsigned long) imm64; -} - /* verify BPF_LD_IMM64 instruction */ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) { + struct bpf_insn_aux_data *aux = cur_aux(env); struct bpf_reg_state *regs = cur_regs(env); + struct bpf_map *map; int err; if (BPF_SIZE(insn->code) != BPF_DW) { @@ -5091,11 +5085,22 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) return 0; } - /* replace_map_fd_with_map_ptr() should have caught bad ld_imm64 */ - BUG_ON(insn->src_reg != BPF_PSEUDO_MAP_FD); + map = env->used_maps[aux->map_index]; + mark_reg_known_zero(env, regs, insn->dst_reg); + regs[insn->dst_reg].map_ptr = map; + + if (insn->src_reg == BPF_PSEUDO_MAP_VALUE) { + regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; + regs[insn->dst_reg].off = aux->map_off; + if (map_value_has_spin_lock(map)) + regs[insn->dst_reg].id = ++env->id_gen; + } else if (insn->src_reg == BPF_PSEUDO_MAP_FD) { + regs[insn->dst_reg].type = CONST_PTR_TO_MAP; + } else { + verbose(env, "bpf verifier is misconfigured\n"); + return -EINVAL; + } - regs[insn->dst_reg].type = CONST_PTR_TO_MAP; - regs[insn->dst_reg].map_ptr = ld_imm64_to_map_ptr(insn); return 0; } @@ -6803,8 +6808,10 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) } if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) { + struct bpf_insn_aux_data *aux; struct bpf_map *map; struct fd f; + u64 addr; if (i == insn_cnt - 1 || insn[1].code != 0 || insn[1].dst_reg != 0 || insn[1].src_reg != 0 || @@ -6813,13 +6820,19 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) return -EINVAL; } - if (insn->src_reg == 0) + if (insn[0].src_reg == 0) /* valid generic load 64-bit imm */ goto next_insn; - if (insn[0].src_reg != BPF_PSEUDO_MAP_FD || - insn[1].imm != 0) { - verbose(env, "unrecognized bpf_ld_imm64 insn\n"); + /* In final convert_pseudo_ld_imm64() step, this is + * converted into regular 64-bit imm load insn. + */ + if ((insn[0].src_reg != BPF_PSEUDO_MAP_FD && + insn[0].src_reg != BPF_PSEUDO_MAP_VALUE) || + (insn[0].src_reg == BPF_PSEUDO_MAP_FD && + insn[1].imm != 0)) { + verbose(env, + "unrecognized bpf_ld_imm64 insn\n"); return -EINVAL; } @@ -6837,16 +6850,47 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) return err; } - /* store map pointer inside BPF_LD_IMM64 instruction */ - insn[0].imm = (u32) (unsigned long) map; - insn[1].imm = ((u64) (unsigned long) map) >> 32; + aux = &env->insn_aux_data[i]; + if (insn->src_reg == BPF_PSEUDO_MAP_FD) { + addr = (unsigned long)map; + } else { + u32 off = insn[1].imm; + + if (off >= BPF_MAX_VAR_OFF) { + verbose(env, "direct value offset of %u is not allowed\n", off); + fdput(f); + return -EINVAL; + } + + if (!map->ops->map_direct_value_addr) { + verbose(env, "no direct value access support for this map type\n"); + fdput(f); + return -EINVAL; + } + + err = map->ops->map_direct_value_addr(map, &addr, off); + if (err) { + verbose(env, "invalid access to map value pointer, value_size=%u off=%u\n", + map->value_size, off); + fdput(f); + return err; + } + + aux->map_off = off; + addr += off; + } + + insn[0].imm = (u32)addr; + insn[1].imm = addr >> 32; /* check whether we recorded this map already */ - for (j = 0; j < env->used_map_cnt; j++) + for (j = 0; j < env->used_map_cnt; j++) { if (env->used_maps[j] == map) { + aux->map_index = j; fdput(f); goto next_insn; } + } if (env->used_map_cnt >= MAX_USED_MAPS) { fdput(f); @@ -6863,6 +6907,8 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) fdput(f); return PTR_ERR(map); } + + aux->map_index = env->used_map_cnt; env->used_maps[env->used_map_cnt++] = map; if (bpf_map_is_cgroup_storage(map) && diff --git a/tools/bpf/bpftool/xlated_dumper.c b/tools/bpf/bpftool/xlated_dumper.c index 7073dbe1ff27..0bb17bf88b18 100644 --- a/tools/bpf/bpftool/xlated_dumper.c +++ b/tools/bpf/bpftool/xlated_dumper.c @@ -195,6 +195,9 @@ static const char *print_imm(void *private_data, if (insn->src_reg == BPF_PSEUDO_MAP_FD) snprintf(dd->scratch_buff, sizeof(dd->scratch_buff), "map[id:%u]", insn->imm); + else if (insn->src_reg == BPF_PSEUDO_MAP_VALUE) + snprintf(dd->scratch_buff, sizeof(dd->scratch_buff), + "map[id:%u][0]+%u", insn->imm, (insn + 1)->imm); else snprintf(dd->scratch_buff, sizeof(dd->scratch_buff), "0x%llx", (unsigned long long)full_imm); -- cgit v1.2.3 From 591fe9888d7809d9ee5c828020b6c6ae27c37229 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 9 Apr 2019 23:20:05 +0200 Subject: bpf: add program side {rd, wr}only support for maps This work adds two new map creation flags BPF_F_RDONLY_PROG and BPF_F_WRONLY_PROG in order to allow for read-only or write-only BPF maps from a BPF program side. Today we have BPF_F_RDONLY and BPF_F_WRONLY, but this only applies to system call side, meaning the BPF program has full read/write access to the map as usual while bpf(2) calls with map fd can either only read or write into the map depending on the flags. BPF_F_RDONLY_PROG and BPF_F_WRONLY_PROG allows for the exact opposite such that verifier is going to reject program loads if write into a read-only map or a read into a write-only map is detected. For read-only map case also some helpers are forbidden for programs that would alter the map state such as map deletion, update, etc. As opposed to the two BPF_F_RDONLY / BPF_F_WRONLY flags, BPF_F_RDONLY_PROG as well as BPF_F_WRONLY_PROG really do correspond to the map lifetime. We've enabled this generic map extension to various non-special maps holding normal user data: array, hash, lru, lpm, local storage, queue and stack. Further generic map types could be followed up in future depending on use-case. Main use case here is to forbid writes into .rodata map values from verifier side. Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 29 +++++++++++++++++++++++++++ include/uapi/linux/bpf.h | 6 +++++- kernel/bpf/arraymap.c | 6 +++++- kernel/bpf/hashtab.c | 6 +++--- kernel/bpf/local_storage.c | 6 +++--- kernel/bpf/lpm_trie.c | 3 ++- kernel/bpf/queue_stack_maps.c | 6 +++--- kernel/bpf/syscall.c | 2 ++ kernel/bpf/verifier.c | 46 +++++++++++++++++++++++++++++++++++++++++-- 9 files changed, 96 insertions(+), 14 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index bd93a592dd29..be20804631b5 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -430,6 +430,35 @@ struct bpf_array { #define BPF_COMPLEXITY_LIMIT_INSNS 1000000 /* yes. 1M insns */ #define MAX_TAIL_CALL_CNT 32 +#define BPF_F_ACCESS_MASK (BPF_F_RDONLY | \ + BPF_F_RDONLY_PROG | \ + BPF_F_WRONLY | \ + BPF_F_WRONLY_PROG) + +#define BPF_MAP_CAN_READ BIT(0) +#define BPF_MAP_CAN_WRITE BIT(1) + +static inline u32 bpf_map_flags_to_cap(struct bpf_map *map) +{ + u32 access_flags = map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG); + + /* Combination of BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG is + * not possible. + */ + if (access_flags & BPF_F_RDONLY_PROG) + return BPF_MAP_CAN_READ; + else if (access_flags & BPF_F_WRONLY_PROG) + return BPF_MAP_CAN_WRITE; + else + return BPF_MAP_CAN_READ | BPF_MAP_CAN_WRITE; +} + +static inline bool bpf_map_flags_access_ok(u32 access_flags) +{ + return (access_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) != + (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG); +} + struct bpf_event_entry { struct perf_event *event; struct file *perf_file; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 26cfb5b2c964..d275446d807c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -294,7 +294,7 @@ enum bpf_attach_type { #define BPF_OBJ_NAME_LEN 16U -/* Flags for accessing BPF object */ +/* Flags for accessing BPF object from syscall side. */ #define BPF_F_RDONLY (1U << 3) #define BPF_F_WRONLY (1U << 4) @@ -304,6 +304,10 @@ enum bpf_attach_type { /* Zero-initialize hash function seed. This should only be used for testing. */ #define BPF_F_ZERO_SEED (1U << 6) +/* Flags for accessing BPF object from program side. */ +#define BPF_F_RDONLY_PROG (1U << 7) +#define BPF_F_WRONLY_PROG (1U << 8) + /* flags for BPF_PROG_QUERY */ #define BPF_F_QUERY_EFFECTIVE (1U << 0) diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 1a6e9861d554..217b10bd9f48 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -22,7 +22,7 @@ #include "map_in_map.h" #define ARRAY_CREATE_FLAG_MASK \ - (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) + (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK) static void bpf_array_free_percpu(struct bpf_array *array) { @@ -63,6 +63,7 @@ int array_map_alloc_check(union bpf_attr *attr) if (attr->max_entries == 0 || attr->key_size != 4 || attr->value_size == 0 || attr->map_flags & ~ARRAY_CREATE_FLAG_MASK || + !bpf_map_flags_access_ok(attr->map_flags) || (percpu && numa_node != NUMA_NO_NODE)) return -EINVAL; @@ -472,6 +473,9 @@ static int fd_array_map_alloc_check(union bpf_attr *attr) /* only file descriptors can be stored in this type of map */ if (attr->value_size != sizeof(u32)) return -EINVAL; + /* Program read-only/write-only not supported for special maps yet. */ + if (attr->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) + return -EINVAL; return array_map_alloc_check(attr); } diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index fed15cf94dca..192d32e77db3 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -23,7 +23,7 @@ #define HTAB_CREATE_FLAG_MASK \ (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \ - BPF_F_RDONLY | BPF_F_WRONLY | BPF_F_ZERO_SEED) + BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED) struct bucket { struct hlist_nulls_head head; @@ -262,8 +262,8 @@ static int htab_map_alloc_check(union bpf_attr *attr) /* Guard against local DoS, and discourage production use. */ return -EPERM; - if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK) - /* reserved bits should not be used */ + if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK || + !bpf_map_flags_access_ok(attr->map_flags)) return -EINVAL; if (!lru && percpu_lru) diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 6b572e2de7fb..980e8f1f6cb5 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -14,7 +14,7 @@ DEFINE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STO #ifdef CONFIG_CGROUP_BPF #define LOCAL_STORAGE_CREATE_FLAG_MASK \ - (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) + (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK) struct bpf_cgroup_storage_map { struct bpf_map map; @@ -282,8 +282,8 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) if (attr->value_size > PAGE_SIZE) return ERR_PTR(-E2BIG); - if (attr->map_flags & ~LOCAL_STORAGE_CREATE_FLAG_MASK) - /* reserved bits should not be used */ + if (attr->map_flags & ~LOCAL_STORAGE_CREATE_FLAG_MASK || + !bpf_map_flags_access_ok(attr->map_flags)) return ERR_PTR(-EINVAL); if (attr->max_entries) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 93a5cbbde421..e61630c2e50b 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -538,7 +538,7 @@ out: #define LPM_KEY_SIZE_MIN LPM_KEY_SIZE(LPM_DATA_SIZE_MIN) #define LPM_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE | \ - BPF_F_RDONLY | BPF_F_WRONLY) + BPF_F_ACCESS_MASK) static struct bpf_map *trie_alloc(union bpf_attr *attr) { @@ -553,6 +553,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) if (attr->max_entries == 0 || !(attr->map_flags & BPF_F_NO_PREALLOC) || attr->map_flags & ~LPM_CREATE_FLAG_MASK || + !bpf_map_flags_access_ok(attr->map_flags) || attr->key_size < LPM_KEY_SIZE_MIN || attr->key_size > LPM_KEY_SIZE_MAX || attr->value_size < LPM_VAL_SIZE_MIN || diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index b384ea9f3254..0b140d236889 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -11,8 +11,7 @@ #include "percpu_freelist.h" #define QUEUE_STACK_CREATE_FLAG_MASK \ - (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) - + (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK) struct bpf_queue_stack { struct bpf_map map; @@ -52,7 +51,8 @@ static int queue_stack_map_alloc_check(union bpf_attr *attr) /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 0 || attr->value_size == 0 || - attr->map_flags & ~QUEUE_STACK_CREATE_FLAG_MASK) + attr->map_flags & ~QUEUE_STACK_CREATE_FLAG_MASK || + !bpf_map_flags_access_ok(attr->map_flags)) return -EINVAL; if (attr->value_size > KMALLOC_MAX_SIZE) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 56b4b0e08b3b..0c9276b54c88 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -501,6 +501,8 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, map->spin_lock_off = btf_find_spin_lock(btf, value_type); if (map_value_has_spin_lock(map)) { + if (map->map_flags & BPF_F_RDONLY_PROG) + return -EACCES; if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_ARRAY && map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6ab7a23fc924..b747434df89c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1439,6 +1439,28 @@ static int check_stack_access(struct bpf_verifier_env *env, return 0; } +static int check_map_access_type(struct bpf_verifier_env *env, u32 regno, + int off, int size, enum bpf_access_type type) +{ + struct bpf_reg_state *regs = cur_regs(env); + struct bpf_map *map = regs[regno].map_ptr; + u32 cap = bpf_map_flags_to_cap(map); + + if (type == BPF_WRITE && !(cap & BPF_MAP_CAN_WRITE)) { + verbose(env, "write into map forbidden, value_size=%d off=%d size=%d\n", + map->value_size, off, size); + return -EACCES; + } + + if (type == BPF_READ && !(cap & BPF_MAP_CAN_READ)) { + verbose(env, "read from map forbidden, value_size=%d off=%d size=%d\n", + map->value_size, off, size); + return -EACCES; + } + + return 0; +} + /* check read/write into map element returned by bpf_map_lookup_elem() */ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, int size, bool zero_size_allowed) @@ -2024,7 +2046,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn verbose(env, "R%d leaks addr into map\n", value_regno); return -EACCES; } - + err = check_map_access_type(env, regno, off, size, t); + if (err) + return err; err = check_map_access(env, regno, off, size, false); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); @@ -2327,6 +2351,10 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return check_packet_access(env, regno, reg->off, access_size, zero_size_allowed); case PTR_TO_MAP_VALUE: + if (check_map_access_type(env, regno, reg->off, access_size, + meta && meta->raw_mode ? BPF_WRITE : + BPF_READ)) + return -EACCES; return check_map_access(env, regno, reg->off, access_size, zero_size_allowed); default: /* scalar_value|ptr_to_stack or invalid ptr */ @@ -3059,6 +3087,7 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, int func_id, int insn_idx) { struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx]; + struct bpf_map *map = meta->map_ptr; if (func_id != BPF_FUNC_tail_call && func_id != BPF_FUNC_map_lookup_elem && @@ -3069,11 +3098,24 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, func_id != BPF_FUNC_map_peek_elem) return 0; - if (meta->map_ptr == NULL) { + if (map == NULL) { verbose(env, "kernel subsystem misconfigured verifier\n"); return -EINVAL; } + /* In case of read-only, some additional restrictions + * need to be applied in order to prevent altering the + * state of the map from program side. + */ + if ((map->map_flags & BPF_F_RDONLY_PROG) && + (func_id == BPF_FUNC_map_delete_elem || + func_id == BPF_FUNC_map_update_elem || + func_id == BPF_FUNC_map_push_elem || + func_id == BPF_FUNC_map_pop_elem)) { + verbose(env, "write into map forbidden\n"); + return -EACCES; + } + if (!BPF_MAP_PTR(aux->map_state)) bpf_map_ptr_store(aux, meta->map_ptr, meta->map_ptr->unpriv_array); -- cgit v1.2.3 From 87df15de441bd4add7876ef584da8cabdd9a042a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 9 Apr 2019 23:20:06 +0200 Subject: bpf: add syscall side map freeze support This patch adds a new BPF_MAP_FREEZE command which allows to "freeze" the map globally as read-only / immutable from syscall side. Map permission handling has been refactored into map_get_sys_perms() and drops FMODE_CAN_WRITE in case of locked map. Main use case is to allow for setting up .rodata sections from the BPF ELF which are loaded into the kernel, meaning BPF loader first allocates map, sets up map value by copying .rodata section into it and once complete, it calls BPF_MAP_FREEZE on the map fd to prevent further modifications. Right now BPF_MAP_FREEZE only takes map fd as argument while remaining bpf_attr members are required to be zero. I didn't add write-only locking here as counterpart since I don't have a concrete use-case for it on my side, and I think it makes probably more sense to wait once there is actually one. In that case bpf_attr can be extended as usual with a flag field and/or others where flag 0 means that we lock the map read-only hence this doesn't prevent to add further extensions to BPF_MAP_FREEZE upon need. A map creation flag like BPF_F_WRONCE was not considered for couple of reasons: i) in case of a generic implementation, a map can consist of more than just one element, thus there could be multiple map updates needed to set the map into a state where it can then be made immutable, ii) WRONCE indicates exact one-time write before it is then set immutable. A generic implementation would set a bit atomically on map update entry (if unset), indicating that every subsequent update from then onwards will need to bail out there. However, map updates can fail, so upon failure that flag would need to be unset again and the update attempt would need to be repeated for it to be eventually made immutable. While this can be made race-free, this approach feels less clean and in combination with reason i), it's not generic enough. A dedicated BPF_MAP_FREEZE command directly sets the flag and caller has the guarantee that map is immutable from syscall side upon successful return for any future syscall invocations that would alter the map state, which is also more intuitive from an API point of view. A command name such as BPF_MAP_LOCK has been avoided as it's too close with BPF map spin locks (which already has BPF_F_LOCK flag). BPF_MAP_FREEZE is so far only enabled for privileged users. Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 ++- include/uapi/linux/bpf.h | 1 + kernel/bpf/syscall.c | 66 +++++++++++++++++++++++++++++++++++++++--------- 3 files changed, 57 insertions(+), 13 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index be20804631b5..65f7094c40b4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -87,7 +87,8 @@ struct bpf_map { struct btf *btf; u32 pages; bool unpriv_array; - /* 51 bytes hole */ + bool frozen; /* write-once */ + /* 48 bytes hole */ /* The 3rd and 4th cacheline with misc members to avoid false sharing * particularly with refcounting. diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d275446d807c..af1cbd951f26 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -105,6 +105,7 @@ enum bpf_cmd { BPF_BTF_GET_FD_BY_ID, BPF_TASK_FD_QUERY, BPF_MAP_LOOKUP_AND_DELETE_ELEM, + BPF_MAP_FREEZE, }; enum bpf_map_type { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0c9276b54c88..b3ce516e5a20 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -355,6 +355,18 @@ static int bpf_map_release(struct inode *inode, struct file *filp) return 0; } +static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f) +{ + fmode_t mode = f.file->f_mode; + + /* Our file permissions may have been overridden by global + * map permissions facing syscall side. + */ + if (READ_ONCE(map->frozen)) + mode &= ~FMODE_CAN_WRITE; + return mode; +} + #ifdef CONFIG_PROC_FS static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) { @@ -376,14 +388,16 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) "max_entries:\t%u\n" "map_flags:\t%#x\n" "memlock:\t%llu\n" - "map_id:\t%u\n", + "map_id:\t%u\n" + "frozen:\t%u\n", map->map_type, map->key_size, map->value_size, map->max_entries, map->map_flags, map->pages * 1ULL << PAGE_SHIFT, - map->id); + map->id, + READ_ONCE(map->frozen)); if (owner_prog_type) { seq_printf(m, "owner_prog_type:\t%u\n", @@ -727,8 +741,7 @@ static int map_lookup_elem(union bpf_attr *attr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - - if (!(f.file->f_mode & FMODE_CAN_READ)) { + if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { err = -EPERM; goto err_put; } @@ -857,8 +870,7 @@ static int map_update_elem(union bpf_attr *attr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - - if (!(f.file->f_mode & FMODE_CAN_WRITE)) { + if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; goto err_put; } @@ -969,8 +981,7 @@ static int map_delete_elem(union bpf_attr *attr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - - if (!(f.file->f_mode & FMODE_CAN_WRITE)) { + if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; goto err_put; } @@ -1021,8 +1032,7 @@ static int map_get_next_key(union bpf_attr *attr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - - if (!(f.file->f_mode & FMODE_CAN_READ)) { + if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { err = -EPERM; goto err_put; } @@ -1089,8 +1099,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - - if (!(f.file->f_mode & FMODE_CAN_WRITE)) { + if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; goto err_put; } @@ -1132,6 +1141,36 @@ err_put: return err; } +#define BPF_MAP_FREEZE_LAST_FIELD map_fd + +static int map_freeze(const union bpf_attr *attr) +{ + int err = 0, ufd = attr->map_fd; + struct bpf_map *map; + struct fd f; + + if (CHECK_ATTR(BPF_MAP_FREEZE)) + return -EINVAL; + + f = fdget(ufd); + map = __bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + if (READ_ONCE(map->frozen)) { + err = -EBUSY; + goto err_put; + } + if (!capable(CAP_SYS_ADMIN)) { + err = -EPERM; + goto err_put; + } + + WRITE_ONCE(map->frozen, true); +err_put: + fdput(f); + return err; +} + static const struct bpf_prog_ops * const bpf_prog_types[] = { #define BPF_PROG_TYPE(_id, _name) \ [_id] = & _name ## _prog_ops, @@ -2735,6 +2774,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_MAP_GET_NEXT_KEY: err = map_get_next_key(&attr); break; + case BPF_MAP_FREEZE: + err = map_freeze(&attr); + break; case BPF_PROG_LOAD: err = bpf_prog_load(&attr, uattr); break; -- cgit v1.2.3 From f063c889c9458354a92b235a51cbb60d30321070 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 9 Apr 2019 23:20:08 +0200 Subject: bpf: add specification for BTF Var and DataSec kinds This adds the BTF specification and UAPI bits for supporting BTF Var and DataSec kinds. This is following LLVM upstream commit ac4082b77e07 ("[BPF] Add BTF Var and DataSec Support") which has been merged recently. Var itself is for describing a global variable and DataSec to describe ELF sections e.g. data/bss/rodata sections that hold one or multiple global variables. Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- Documentation/bpf/btf.rst | 57 +++++++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/btf.h | 32 ++++++++++++++++++++++---- 2 files changed, 85 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/bpf/btf.rst b/Documentation/bpf/btf.rst index 9a60a5d60e38..60d87d7363ec 100644 --- a/Documentation/bpf/btf.rst +++ b/Documentation/bpf/btf.rst @@ -82,6 +82,8 @@ sequentially and type id is assigned to each recognized type starting from id #define BTF_KIND_RESTRICT 11 /* Restrict */ #define BTF_KIND_FUNC 12 /* Function */ #define BTF_KIND_FUNC_PROTO 13 /* Function Proto */ + #define BTF_KIND_VAR 14 /* Variable */ + #define BTF_KIND_DATASEC 15 /* Section */ Note that the type section encodes debug info, not just pure types. ``BTF_KIND_FUNC`` is not a type, and it represents a defined subprogram. @@ -393,6 +395,61 @@ refers to parameter type. If the function has variable arguments, the last parameter is encoded with ``name_off = 0`` and ``type = 0``. +2.2.14 BTF_KIND_VAR +~~~~~~~~~~~~~~~~~~~ + +``struct btf_type`` encoding requirement: + * ``name_off``: offset to a valid C identifier + * ``info.kind_flag``: 0 + * ``info.kind``: BTF_KIND_VAR + * ``info.vlen``: 0 + * ``type``: the type of the variable + +``btf_type`` is followed by a single ``struct btf_variable`` with the +following data:: + + struct btf_var { + __u32 linkage; + }; + +``struct btf_var`` encoding: + * ``linkage``: currently only static variable 0, or globally allocated + variable in ELF sections 1 + +Not all type of global variables are supported by LLVM at this point. +The following is currently available: + + * static variables with or without section attributes + * global variables with section attributes + +The latter is for future extraction of map key/value type id's from a +map definition. + +2.2.15 BTF_KIND_DATASEC +~~~~~~~~~~~~~~~~~~~~~~~ + +``struct btf_type`` encoding requirement: + * ``name_off``: offset to a valid name associated with a variable or + one of .data/.bss/.rodata + * ``info.kind_flag``: 0 + * ``info.kind``: BTF_KIND_DATASEC + * ``info.vlen``: # of variables + * ``size``: total section size in bytes (0 at compilation time, patched + to actual size by BPF loaders such as libbpf) + +``btf_type`` is followed by ``info.vlen`` number of ``struct btf_var_secinfo``.:: + + struct btf_var_secinfo { + __u32 type; + __u32 offset; + __u32 size; + }; + +``struct btf_var_secinfo`` encoding: + * ``type``: the type of the BTF_KIND_VAR variable + * ``offset``: the in-section offset of the variable + * ``size``: the size of the variable in bytes + 3. BTF Kernel API ***************** diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h index 7b7475ef2f17..9310652ca4f9 100644 --- a/include/uapi/linux/btf.h +++ b/include/uapi/linux/btf.h @@ -39,11 +39,11 @@ struct btf_type { * struct, union and fwd */ __u32 info; - /* "size" is used by INT, ENUM, STRUCT and UNION. + /* "size" is used by INT, ENUM, STRUCT, UNION and DATASEC. * "size" tells the size of the type it is describing. * * "type" is used by PTR, TYPEDEF, VOLATILE, CONST, RESTRICT, - * FUNC and FUNC_PROTO. + * FUNC, FUNC_PROTO and VAR. * "type" is a type_id referring to another type. */ union { @@ -70,8 +70,10 @@ struct btf_type { #define BTF_KIND_RESTRICT 11 /* Restrict */ #define BTF_KIND_FUNC 12 /* Function */ #define BTF_KIND_FUNC_PROTO 13 /* Function Proto */ -#define BTF_KIND_MAX 13 -#define NR_BTF_KINDS 14 +#define BTF_KIND_VAR 14 /* Variable */ +#define BTF_KIND_DATASEC 15 /* Section */ +#define BTF_KIND_MAX BTF_KIND_DATASEC +#define NR_BTF_KINDS (BTF_KIND_MAX + 1) /* For some specific BTF_KIND, "struct btf_type" is immediately * followed by extra data. @@ -138,4 +140,26 @@ struct btf_param { __u32 type; }; +enum { + BTF_VAR_STATIC = 0, + BTF_VAR_GLOBAL_ALLOCATED, +}; + +/* BTF_KIND_VAR is followed by a single "struct btf_var" to describe + * additional information related to the variable such as its linkage. + */ +struct btf_var { + __u32 linkage; +}; + +/* BTF_KIND_DATASEC is followed by multiple "struct btf_var_secinfo" + * to describe all BTF_KIND_VAR types it contains along with it's + * in-section offset as well as size. + */ +struct btf_var_secinfo { + __u32 type; + __u32 offset; + __u32 size; +}; + #endif /* _UAPI__LINUX_BTF_H__ */ -- cgit v1.2.3 From b0b9395d865e3060d97658fbc9ba3f77fecc8da1 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 9 Apr 2019 11:49:09 -0700 Subject: bpf: support input __sk_buff context in BPF_PROG_TEST_RUN Add new set of arguments to bpf_attr for BPF_PROG_TEST_RUN: * ctx_in/ctx_size_in - input context * ctx_out/ctx_size_out - output context The intended use case is to pass some meta data to the test runs that operate on skb (this has being brought up on recent LPC). For programs that use bpf_prog_test_run_skb, support __sk_buff input and output. Initially, from input __sk_buff, copy _only_ cb and priority into skb, all other non-zero fields are prohibited (with EINVAL). If the user has set ctx_out/ctx_size_out, copy the potentially modified __sk_buff back to the userspace. We require all fields of input __sk_buff except the ones we explicitly support to be set to zero. The expectation is that in the future we might add support for more fields and we want to fail explicitly if the user runs the program on the kernel where we don't yet support them. The API is intentionally vague (i.e. we don't explicitly add __sk_buff to bpf_attr, but ctx_in) to potentially let other test_run types use this interface in the future (this can be xdp_md for xdp types for example). v4: * don't copy more than allowed in bpf_ctx_init [Martin] v3: * handle case where ctx_in is NULL, but ctx_out is not [Martin] * convert size==0 checks to ptr==NULL checks and add some extra ptr checks [Martin] v2: * Addressed comments from Martin Lau Signed-off-by: Stanislav Fomichev Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 7 +++ kernel/bpf/syscall.c | 10 +++- net/bpf/test_run.c | 143 ++++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 151 insertions(+), 9 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index af1cbd951f26..31a27dd337dc 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -412,6 +412,13 @@ union bpf_attr { __aligned_u64 data_out; __u32 repeat; __u32 duration; + __u32 ctx_size_in; /* input: len of ctx_in */ + __u32 ctx_size_out; /* input/output: len of ctx_out + * returns ENOSPC if ctx_out + * is too small. + */ + __aligned_u64 ctx_in; + __aligned_u64 ctx_out; } test; struct { /* anonymous struct used by BPF_*_GET_*_ID */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 438199e2eca4..d995eedfdd16 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2009,7 +2009,7 @@ static int bpf_prog_query(const union bpf_attr *attr, return cgroup_bpf_prog_query(attr, uattr); } -#define BPF_PROG_TEST_RUN_LAST_FIELD test.duration +#define BPF_PROG_TEST_RUN_LAST_FIELD test.ctx_out static int bpf_prog_test_run(const union bpf_attr *attr, union bpf_attr __user *uattr) @@ -2022,6 +2022,14 @@ static int bpf_prog_test_run(const union bpf_attr *attr, if (CHECK_ATTR(BPF_PROG_TEST_RUN)) return -EINVAL; + if ((attr->test.ctx_size_in && !attr->test.ctx_in) || + (!attr->test.ctx_size_in && attr->test.ctx_in)) + return -EINVAL; + + if ((attr->test.ctx_size_out && !attr->test.ctx_out) || + (!attr->test.ctx_size_out && attr->test.ctx_out)) + return -EINVAL; + prog = bpf_prog_get(attr->test.prog_fd); if (IS_ERR(prog)) return PTR_ERR(prog); diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index fab142b796ef..cbd4fb65aa4f 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -123,12 +123,126 @@ static void *bpf_test_init(const union bpf_attr *kattr, u32 size, return data; } +static void *bpf_ctx_init(const union bpf_attr *kattr, u32 max_size) +{ + void __user *data_in = u64_to_user_ptr(kattr->test.ctx_in); + void __user *data_out = u64_to_user_ptr(kattr->test.ctx_out); + u32 size = kattr->test.ctx_size_in; + void *data; + int err; + + if (!data_in && !data_out) + return NULL; + + data = kzalloc(max_size, GFP_USER); + if (!data) + return ERR_PTR(-ENOMEM); + + if (data_in) { + err = bpf_check_uarg_tail_zero(data_in, max_size, size); + if (err) { + kfree(data); + return ERR_PTR(err); + } + + size = min_t(u32, max_size, size); + if (copy_from_user(data, data_in, size)) { + kfree(data); + return ERR_PTR(-EFAULT); + } + } + return data; +} + +static int bpf_ctx_finish(const union bpf_attr *kattr, + union bpf_attr __user *uattr, const void *data, + u32 size) +{ + void __user *data_out = u64_to_user_ptr(kattr->test.ctx_out); + int err = -EFAULT; + u32 copy_size = size; + + if (!data || !data_out) + return 0; + + if (copy_size > kattr->test.ctx_size_out) { + copy_size = kattr->test.ctx_size_out; + err = -ENOSPC; + } + + if (copy_to_user(data_out, data, copy_size)) + goto out; + if (copy_to_user(&uattr->test.ctx_size_out, &size, sizeof(size))) + goto out; + if (err != -ENOSPC) + err = 0; +out: + return err; +} + +/** + * range_is_zero - test whether buffer is initialized + * @buf: buffer to check + * @from: check from this position + * @to: check up until (excluding) this position + * + * This function returns true if the there is a non-zero byte + * in the buf in the range [from,to). + */ +static inline bool range_is_zero(void *buf, size_t from, size_t to) +{ + return !memchr_inv((u8 *)buf + from, 0, to - from); +} + +static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb) +{ + struct qdisc_skb_cb *cb = (struct qdisc_skb_cb *)skb->cb; + + if (!__skb) + return 0; + + /* make sure the fields we don't use are zeroed */ + if (!range_is_zero(__skb, 0, offsetof(struct __sk_buff, priority))) + return -EINVAL; + + /* priority is allowed */ + + if (!range_is_zero(__skb, offsetof(struct __sk_buff, priority) + + FIELD_SIZEOF(struct __sk_buff, priority), + offsetof(struct __sk_buff, cb))) + return -EINVAL; + + /* cb is allowed */ + + if (!range_is_zero(__skb, offsetof(struct __sk_buff, cb) + + FIELD_SIZEOF(struct __sk_buff, cb), + sizeof(struct __sk_buff))) + return -EINVAL; + + skb->priority = __skb->priority; + memcpy(&cb->data, __skb->cb, QDISC_CB_PRIV_LEN); + + return 0; +} + +static void convert_skb_to___skb(struct sk_buff *skb, struct __sk_buff *__skb) +{ + struct qdisc_skb_cb *cb = (struct qdisc_skb_cb *)skb->cb; + + if (!__skb) + return; + + __skb->priority = skb->priority; + memcpy(__skb->cb, &cb->data, QDISC_CB_PRIV_LEN); +} + int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { bool is_l2 = false, is_direct_pkt_access = false; u32 size = kattr->test.data_size_in; u32 repeat = kattr->test.repeat; + struct __sk_buff *ctx = NULL; u32 retval, duration; int hh_len = ETH_HLEN; struct sk_buff *skb; @@ -141,6 +255,12 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, if (IS_ERR(data)) return PTR_ERR(data); + ctx = bpf_ctx_init(kattr, sizeof(struct __sk_buff)); + if (IS_ERR(ctx)) { + kfree(data); + return PTR_ERR(ctx); + } + switch (prog->type) { case BPF_PROG_TYPE_SCHED_CLS: case BPF_PROG_TYPE_SCHED_ACT: @@ -158,6 +278,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, sk = kzalloc(sizeof(struct sock), GFP_USER); if (!sk) { kfree(data); + kfree(ctx); return -ENOMEM; } sock_net_set(sk, current->nsproxy->net_ns); @@ -166,6 +287,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, skb = build_skb(data, 0); if (!skb) { kfree(data); + kfree(ctx); kfree(sk); return -ENOMEM; } @@ -180,32 +302,37 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, __skb_push(skb, hh_len); if (is_direct_pkt_access) bpf_compute_data_pointers(skb); + ret = convert___skb_to_skb(skb, ctx); + if (ret) + goto out; ret = bpf_test_run(prog, skb, repeat, &retval, &duration); - if (ret) { - kfree_skb(skb); - kfree(sk); - return ret; - } + if (ret) + goto out; if (!is_l2) { if (skb_headroom(skb) < hh_len) { int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb)); if (pskb_expand_head(skb, nhead, 0, GFP_USER)) { - kfree_skb(skb); - kfree(sk); - return -ENOMEM; + ret = -ENOMEM; + goto out; } } memset(__skb_push(skb, hh_len), 0, hh_len); } + convert_skb_to___skb(skb, ctx); size = skb->len; /* bpf program can never convert linear skb to non-linear */ if (WARN_ON_ONCE(skb_is_nonlinear(skb))) size = skb_headlen(skb); ret = bpf_test_finish(kattr, uattr, skb->data, size, retval, duration); + if (!ret) + ret = bpf_ctx_finish(kattr, uattr, ctx, + sizeof(struct __sk_buff)); +out: kfree_skb(skb); kfree(sk); + kfree(ctx); return ret; } -- cgit v1.2.3 From 58dfc900faff6db7eb9bf01555622e0b6c74c262 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Tue, 9 Apr 2019 15:06:41 +0100 Subject: bpf: add layer 2 encap support to bpf_skb_adjust_room commit 868d523535c2 ("bpf: add bpf_skb_adjust_room encap flags") introduced support to bpf_skb_adjust_room for GSO-friendly GRE and UDP encapsulation. For GSO to work for skbs, the inner headers (mac and network) need to be marked. For L3 encapsulation using bpf_skb_adjust_room, the mac and network headers are identical. Here we provide a way of specifying the inner mac header length for cases where L2 encap is desired. Such an approach can support encapsulated ethernet headers, MPLS headers etc. For example to convert from a packet of form [eth][ip][tcp] to [eth][ip][udp][inner mac][ip][tcp], something like the following could be done: headroom = sizeof(iph) + sizeof(struct udphdr) + inner_maclen; ret = bpf_skb_adjust_room(skb, headroom, BPF_ADJ_ROOM_MAC, BPF_F_ADJ_ROOM_ENCAP_L4_UDP | BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 | BPF_F_ADJ_ROOM_ENCAP_L2(inner_maclen)); Signed-off-by: Alan Maguire Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 10 ++++++++++ net/core/filter.c | 12 ++++++++---- 2 files changed, 18 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 31a27dd337dc..2e96d0b4bf65 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1523,6 +1523,10 @@ union bpf_attr { * * **BPF_F_ADJ_ROOM_ENCAP_L4_UDP **: * Use with ENCAP_L3 flags to further specify the tunnel type. * + * * **BPF_F_ADJ_ROOM_ENCAP_L2(len) **: + * Use with ENCAP_L3/L4 flags to further specify the tunnel + * type; **len** is the length of the inner MAC header. + * * A call to this helper is susceptible to change the underlaying * packet buffer. Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be @@ -2664,10 +2668,16 @@ enum bpf_func_id { /* BPF_FUNC_skb_adjust_room flags. */ #define BPF_F_ADJ_ROOM_FIXED_GSO (1ULL << 0) +#define BPF_ADJ_ROOM_ENCAP_L2_MASK 0xff +#define BPF_ADJ_ROOM_ENCAP_L2_SHIFT 56 + #define BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 (1ULL << 1) #define BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 (1ULL << 2) #define BPF_F_ADJ_ROOM_ENCAP_L4_GRE (1ULL << 3) #define BPF_F_ADJ_ROOM_ENCAP_L4_UDP (1ULL << 4) +#define BPF_F_ADJ_ROOM_ENCAP_L2(len) (((__u64)len & \ + BPF_ADJ_ROOM_ENCAP_L2_MASK) \ + << BPF_ADJ_ROOM_ENCAP_L2_SHIFT) /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { diff --git a/net/core/filter.c b/net/core/filter.c index 22eb2edf5573..a1654ef62533 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2969,11 +2969,14 @@ static u32 bpf_skb_net_base_len(const struct sk_buff *skb) #define BPF_F_ADJ_ROOM_MASK (BPF_F_ADJ_ROOM_FIXED_GSO | \ BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \ BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \ - BPF_F_ADJ_ROOM_ENCAP_L4_UDP) + BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \ + BPF_F_ADJ_ROOM_ENCAP_L2( \ + BPF_ADJ_ROOM_ENCAP_L2_MASK)) static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, u64 flags) { + u8 inner_mac_len = flags >> BPF_ADJ_ROOM_ENCAP_L2_SHIFT; bool encap = flags & BPF_F_ADJ_ROOM_ENCAP_L3_MASK; u16 mac_len = 0, inner_net = 0, inner_trans = 0; unsigned int gso_type = SKB_GSO_DODGY; @@ -3008,6 +3011,8 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, mac_len = skb->network_header - skb->mac_header; inner_net = skb->network_header; + if (inner_mac_len > len_diff) + return -EINVAL; inner_trans = skb->transport_header; } @@ -3016,8 +3021,7 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, return ret; if (encap) { - /* inner mac == inner_net on l3 encap */ - skb->inner_mac_header = inner_net; + skb->inner_mac_header = inner_net - inner_mac_len; skb->inner_network_header = inner_net; skb->inner_transport_header = inner_trans; skb_set_inner_protocol(skb, skb->protocol); @@ -3031,7 +3035,7 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, gso_type |= SKB_GSO_GRE; else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) gso_type |= SKB_GSO_IPXIP6; - else + else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4) gso_type |= SKB_GSO_IPXIP4; if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE || -- cgit v1.2.3 From 7b146cebe30cb481b0f70d85779da938da818637 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Wed, 27 Feb 2019 12:59:24 -0800 Subject: bpf: Sysctl hook Containerized applications may run as root and it may create problems for whole host. Specifically such applications may change a sysctl and affect applications in other containers. Furthermore in existing infrastructure it may not be possible to just completely disable writing to sysctl, instead such a process should be gradual with ability to log what sysctl are being changed by a container, investigate, limit the set of writable sysctl to currently used ones (so that new ones can not be changed) and eventually reduce this set to zero. The patch introduces new program type BPF_PROG_TYPE_CGROUP_SYSCTL and attach type BPF_CGROUP_SYSCTL to solve these problems on cgroup basis. New program type has access to following minimal context: struct bpf_sysctl { __u32 write; }; Where @write indicates whether sysctl is being read (= 0) or written (= 1). Helpers to access sysctl name and value will be introduced separately. BPF_CGROUP_SYSCTL attach point is added to sysctl code right before passing control to ctl_table->proc_handler so that BPF program can either allow or deny access to sysctl. Suggested-by: Roman Gushchin Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- fs/proc/proc_sysctl.c | 5 +++ include/linux/bpf-cgroup.h | 18 +++++++++ include/linux/bpf_types.h | 1 + include/linux/filter.h | 8 ++++ include/uapi/linux/bpf.h | 9 +++++ kernel/bpf/cgroup.c | 92 ++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 7 ++++ kernel/bpf/verifier.c | 1 + 8 files changed, 141 insertions(+) (limited to 'include/uapi/linux') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index d65390727541..e01b02150340 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "internal.h" static const struct dentry_operations proc_sys_dentry_operations; @@ -588,6 +589,10 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf, if (!table->proc_handler) goto out; + error = BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write); + if (error) + goto out; + /* careful: calling conventions are nasty here */ res = count; error = table->proc_handler(table, write, buf, &res, ppos); diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index a4c644c1c091..b1c45da20a26 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -17,6 +17,8 @@ struct bpf_map; struct bpf_prog; struct bpf_sock_ops_kern; struct bpf_cgroup_storage; +struct ctl_table; +struct ctl_table_header; #ifdef CONFIG_CGROUP_BPF @@ -109,6 +111,10 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, short access, enum bpf_attach_type type); +int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, + struct ctl_table *table, int write, + enum bpf_attach_type type); + static inline enum bpf_cgroup_storage_type cgroup_storage_type( struct bpf_map *map) { @@ -253,6 +259,17 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, \ __ret; \ }) + + +#define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write) \ +({ \ + int __ret = 0; \ + if (cgroup_bpf_enabled) \ + __ret = __cgroup_bpf_run_filter_sysctl(head, table, write, \ + BPF_CGROUP_SYSCTL); \ + __ret; \ +}) + int cgroup_bpf_prog_attach(const union bpf_attr *attr, enum bpf_prog_type ptype, struct bpf_prog *prog); int cgroup_bpf_prog_detach(const union bpf_attr *attr, @@ -321,6 +338,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, #define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write) ({ 0; }) #define for_each_cgroup_storage_type(stype) for (; false; ) diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 08bf2f1fe553..d26991a16894 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -28,6 +28,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT, raw_tracepoint) #endif #ifdef CONFIG_CGROUP_BPF BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev) +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SYSCTL, cg_sysctl) #endif #ifdef CONFIG_BPF_LIRC_MODE2 BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2) diff --git a/include/linux/filter.h b/include/linux/filter.h index 6074aa064b54..a17732057880 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -33,6 +33,8 @@ struct bpf_prog_aux; struct xdp_rxq_info; struct xdp_buff; struct sock_reuseport; +struct ctl_table; +struct ctl_table_header; /* ArgX, context and stack frame pointer register positions. Note, * Arg1, Arg2, Arg3, etc are used as argument mappings of function @@ -1177,4 +1179,10 @@ struct bpf_sock_ops_kern { */ }; +struct bpf_sysctl_kern { + struct ctl_table_header *head; + struct ctl_table *table; + int write; +}; + #endif /* __LINUX_FILTER_H__ */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2e96d0b4bf65..cc2a2466d5f3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -167,6 +167,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_LIRC_MODE2, BPF_PROG_TYPE_SK_REUSEPORT, BPF_PROG_TYPE_FLOW_DISSECTOR, + BPF_PROG_TYPE_CGROUP_SYSCTL, }; enum bpf_attach_type { @@ -188,6 +189,7 @@ enum bpf_attach_type { BPF_CGROUP_UDP6_SENDMSG, BPF_LIRC_MODE2, BPF_FLOW_DISSECTOR, + BPF_CGROUP_SYSCTL, __MAX_BPF_ATTACH_TYPE }; @@ -3308,4 +3310,11 @@ struct bpf_line_info { struct bpf_spin_lock { __u32 val; }; + +struct bpf_sysctl { + __u32 write; /* Sysctl is being read (= 0) or written (= 1). + * Allows 1,2,4-byte read, but no write. + */ +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index f6cd38746df2..610491b5f0aa 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -11,7 +11,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -768,3 +770,93 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { .get_func_proto = cgroup_dev_func_proto, .is_valid_access = cgroup_dev_is_valid_access, }; + +/** + * __cgroup_bpf_run_filter_sysctl - Run a program on sysctl + * + * @head: sysctl table header + * @table: sysctl table + * @write: sysctl is being read (= 0) or written (= 1) + * @type: type of program to be executed + * + * Program is run when sysctl is being accessed, either read or written, and + * can allow or deny such access. + * + * This function will return %-EPERM if an attached program is found and + * returned value != 1 during execution. In all other cases 0 is returned. + */ +int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, + struct ctl_table *table, int write, + enum bpf_attach_type type) +{ + struct bpf_sysctl_kern ctx = { + .head = head, + .table = table, + .write = write, + }; + struct cgroup *cgrp; + int ret; + + rcu_read_lock(); + cgrp = task_dfl_cgroup(current); + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN); + rcu_read_unlock(); + + return ret == 1 ? 0 : -EPERM; +} +EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl); + +static const struct bpf_func_proto * +sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + return cgroup_base_func_proto(func_id, prog); +} + +static bool sysctl_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + const int size_default = sizeof(__u32); + + if (off < 0 || off + size > sizeof(struct bpf_sysctl) || + off % size || type != BPF_READ) + return false; + + switch (off) { + case offsetof(struct bpf_sysctl, write): + bpf_ctx_record_field_size(info, size_default); + return bpf_ctx_narrow_access_ok(off, size, size_default); + default: + return false; + } +} + +static u32 sysctl_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + + switch (si->off) { + case offsetof(struct bpf_sysctl, write): + *insn++ = BPF_LDX_MEM( + BPF_SIZE(si->code), si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sysctl_kern, write, + FIELD_SIZEOF(struct bpf_sysctl_kern, + write), + target_size)); + break; + } + + return insn - insn_buf; +} + +const struct bpf_verifier_ops cg_sysctl_verifier_ops = { + .get_func_proto = sysctl_func_proto, + .is_valid_access = sysctl_is_valid_access, + .convert_ctx_access = sysctl_convert_ctx_access, +}; + +const struct bpf_prog_ops cg_sysctl_prog_ops = { +}; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index d995eedfdd16..92c9b8a32b50 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1888,6 +1888,9 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_FLOW_DISSECTOR: ptype = BPF_PROG_TYPE_FLOW_DISSECTOR; break; + case BPF_CGROUP_SYSCTL: + ptype = BPF_PROG_TYPE_CGROUP_SYSCTL; + break; default: return -EINVAL; } @@ -1966,6 +1969,9 @@ static int bpf_prog_detach(const union bpf_attr *attr) return lirc_prog_detach(attr); case BPF_FLOW_DISSECTOR: return skb_flow_dissector_bpf_prog_detach(attr); + case BPF_CGROUP_SYSCTL: + ptype = BPF_PROG_TYPE_CGROUP_SYSCTL; + break; default: return -EINVAL; } @@ -1999,6 +2005,7 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_SOCK_OPS: case BPF_CGROUP_DEVICE: + case BPF_CGROUP_SYSCTL: break; case BPF_LIRC_MODE2: return lirc_prog_query(attr, uattr); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f25b7c9c20ba..20808e3c95a8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5267,6 +5267,7 @@ static int check_return_code(struct bpf_verifier_env *env) case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_SOCK_OPS: case BPF_PROG_TYPE_CGROUP_DEVICE: + case BPF_PROG_TYPE_CGROUP_SYSCTL: break; default: return 0; -- cgit v1.2.3 From 808649fb787d918a48a360a668ee4ee9023f0c11 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Wed, 27 Feb 2019 13:28:48 -0800 Subject: bpf: Introduce bpf_sysctl_get_name helper Add bpf_sysctl_get_name() helper to copy sysctl name (/proc/sys/ entry) into provided by BPF_PROG_TYPE_CGROUP_SYSCTL program buffer. By default full name (w/o /proc/sys/) is copied, e.g. "net/ipv4/tcp_mem". If BPF_F_SYSCTL_BASE_NAME flag is set, only base name will be copied, e.g. "tcp_mem". Documentation for the new helper is provided in bpf.h UAPI. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 22 ++++++++++++++- kernel/bpf/cgroup.c | 70 +++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 90 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index cc2a2466d5f3..9c8a2f3ccb9b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2506,6 +2506,22 @@ union bpf_attr { * Return * 0 if iph and th are a valid SYN cookie ACK, or a negative error * otherwise. + * + * int bpf_sysctl_get_name(struct bpf_sysctl *ctx, char *buf, size_t buf_len, u64 flags) + * Description + * Get name of sysctl in /proc/sys/ and copy it into provided by + * program buffer *buf* of size *buf_len*. + * + * The buffer is always NUL terminated, unless it's zero-sized. + * + * If *flags* is zero, full name (e.g. "net/ipv4/tcp_mem") is + * copied. Use **BPF_F_SYSCTL_BASE_NAME** flag to copy base name + * only (e.g. "tcp_mem"). + * Return + * Number of character copied (not including the trailing NUL). + * + * **-E2BIG** if the buffer wasn't big enough (*buf* will contain + * truncated name in this case). */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2608,7 +2624,8 @@ union bpf_attr { FN(skb_ecn_set_ce), \ FN(get_listener_sock), \ FN(skc_lookup_tcp), \ - FN(tcp_check_syncookie), + FN(tcp_check_syncookie), \ + FN(sysctl_get_name), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -2681,6 +2698,9 @@ enum bpf_func_id { BPF_ADJ_ROOM_ENCAP_L2_MASK) \ << BPF_ADJ_ROOM_ENCAP_L2_SHIFT) +/* BPF_FUNC_sysctl_get_name flags. */ +#define BPF_F_SYSCTL_BASE_NAME (1ULL << 0) + /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 610491b5f0aa..a68387043244 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -806,10 +807,77 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl); +static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp, + size_t *lenp) +{ + ssize_t tmp_ret = 0, ret; + + if (dir->header.parent) { + tmp_ret = sysctl_cpy_dir(dir->header.parent, bufp, lenp); + if (tmp_ret < 0) + return tmp_ret; + } + + ret = strscpy(*bufp, dir->header.ctl_table[0].procname, *lenp); + if (ret < 0) + return ret; + *bufp += ret; + *lenp -= ret; + ret += tmp_ret; + + /* Avoid leading slash. */ + if (!ret) + return ret; + + tmp_ret = strscpy(*bufp, "/", *lenp); + if (tmp_ret < 0) + return tmp_ret; + *bufp += tmp_ret; + *lenp -= tmp_ret; + + return ret + tmp_ret; +} + +BPF_CALL_4(bpf_sysctl_get_name, struct bpf_sysctl_kern *, ctx, char *, buf, + size_t, buf_len, u64, flags) +{ + ssize_t tmp_ret = 0, ret; + + if (!buf) + return -EINVAL; + + if (!(flags & BPF_F_SYSCTL_BASE_NAME)) { + if (!ctx->head) + return -EINVAL; + tmp_ret = sysctl_cpy_dir(ctx->head->parent, &buf, &buf_len); + if (tmp_ret < 0) + return tmp_ret; + } + + ret = strscpy(buf, ctx->table->procname, buf_len); + + return ret < 0 ? ret : tmp_ret + ret; +} + +static const struct bpf_func_proto bpf_sysctl_get_name_proto = { + .func = bpf_sysctl_get_name, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { - return cgroup_base_func_proto(func_id, prog); + switch (func_id) { + case BPF_FUNC_sysctl_get_name: + return &bpf_sysctl_get_name_proto; + default: + return cgroup_base_func_proto(func_id, prog); + } } static bool sysctl_is_valid_access(int off, int size, enum bpf_access_type type, -- cgit v1.2.3 From 1d11b3016cec4ed9770b98e82a61708c8f4926e7 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Thu, 28 Feb 2019 19:22:15 -0800 Subject: bpf: Introduce bpf_sysctl_get_current_value helper Add bpf_sysctl_get_current_value() helper to copy current sysctl value into provided by BPF_PROG_TYPE_CGROUP_SYSCTL program buffer. It provides same string as user space can see by reading corresponding file in /proc/sys/, including new line, etc. Documentation for the new helper is provided in bpf.h UAPI. Since current value is kept in ctl_table->data in a parsed form, ctl_table->proc_handler() with write=0 is called to read that data and convert it to a string. Such a string can later be parsed by a program using helpers that will be introduced separately. Unfortunately it's not trivial to provide API to access parsed data due to variety of data representations (string, intvec, uintvec, ulongvec, custom structures, even NULL, etc). Instead it's assumed that user know how to handle specific sysctl they're interested in and appropriate helpers can be used. Since ctl_table->proc_handler() expects __user buffer, conversion to __user happens for kernel allocated one where the value is stored. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 2 ++ include/uapi/linux/bpf.h | 22 +++++++++++++++- kernel/bpf/cgroup.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 88 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index a17732057880..f254ff92819f 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1182,6 +1182,8 @@ struct bpf_sock_ops_kern { struct bpf_sysctl_kern { struct ctl_table_header *head; struct ctl_table *table; + void *cur_val; + size_t cur_len; int write; }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 9c8a2f3ccb9b..063543afa359 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2522,6 +2522,25 @@ union bpf_attr { * * **-E2BIG** if the buffer wasn't big enough (*buf* will contain * truncated name in this case). + * + * int bpf_sysctl_get_current_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len) + * Description + * Get current value of sysctl as it is presented in /proc/sys + * (incl. newline, etc), and copy it as a string into provided + * by program buffer *buf* of size *buf_len*. + * + * The whole value is copied, no matter what file position user + * space issued e.g. sys_read at. + * + * The buffer is always NUL terminated, unless it's zero-sized. + * Return + * Number of character copied (not including the trailing NUL). + * + * **-E2BIG** if the buffer wasn't big enough (*buf* will contain + * truncated name in this case). + * + * **-EINVAL** if current value was unavailable, e.g. because + * sysctl is uninitialized and read returns -EIO for it. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2625,7 +2644,8 @@ union bpf_attr { FN(get_listener_sock), \ FN(skc_lookup_tcp), \ FN(tcp_check_syncookie), \ - FN(sysctl_get_name), + FN(sysctl_get_name), \ + FN(sysctl_get_current_value), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index a68387043244..c6b2cf29a54b 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -794,15 +794,37 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, .head = head, .table = table, .write = write, + .cur_val = NULL, + .cur_len = PAGE_SIZE, }; struct cgroup *cgrp; int ret; + ctx.cur_val = kmalloc_track_caller(ctx.cur_len, GFP_KERNEL); + if (ctx.cur_val) { + mm_segment_t old_fs; + loff_t pos = 0; + + old_fs = get_fs(); + set_fs(KERNEL_DS); + if (table->proc_handler(table, 0, (void __user *)ctx.cur_val, + &ctx.cur_len, &pos)) { + /* Let BPF program decide how to proceed. */ + ctx.cur_len = 0; + } + set_fs(old_fs); + } else { + /* Let BPF program decide how to proceed. */ + ctx.cur_len = 0; + } + rcu_read_lock(); cgrp = task_dfl_cgroup(current); ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN); rcu_read_unlock(); + kfree(ctx.cur_val); + return ret == 1 ? 0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl); @@ -869,12 +891,55 @@ static const struct bpf_func_proto bpf_sysctl_get_name_proto = { .arg4_type = ARG_ANYTHING, }; +static int copy_sysctl_value(char *dst, size_t dst_len, char *src, + size_t src_len) +{ + if (!dst) + return -EINVAL; + + if (!dst_len) + return -E2BIG; + + if (!src || !src_len) { + memset(dst, 0, dst_len); + return -EINVAL; + } + + memcpy(dst, src, min(dst_len, src_len)); + + if (dst_len > src_len) { + memset(dst + src_len, '\0', dst_len - src_len); + return src_len; + } + + dst[dst_len - 1] = '\0'; + + return -E2BIG; +} + +BPF_CALL_3(bpf_sysctl_get_current_value, struct bpf_sysctl_kern *, ctx, + char *, buf, size_t, buf_len) +{ + return copy_sysctl_value(buf, buf_len, ctx->cur_val, ctx->cur_len); +} + +static const struct bpf_func_proto bpf_sysctl_get_current_value_proto = { + .func = bpf_sysctl_get_current_value, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + static const struct bpf_func_proto * sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_sysctl_get_name: return &bpf_sysctl_get_name_proto; + case BPF_FUNC_sysctl_get_current_value: + return &bpf_sysctl_get_current_value_proto; default: return cgroup_base_func_proto(func_id, prog); } -- cgit v1.2.3 From 4e63acdff864654cee0ac5aaeda3913798ee78f6 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Thu, 7 Mar 2019 18:38:43 -0800 Subject: bpf: Introduce bpf_sysctl_{get,set}_new_value helpers Add helpers to work with new value being written to sysctl by user space. bpf_sysctl_get_new_value() copies value being written to sysctl into provided buffer. bpf_sysctl_set_new_value() overrides new value being written by user space with a one from provided buffer. Buffer should contain string representation of the value, similar to what can be seen in /proc/sys/. Both helpers can be used only on sysctl write. File position matters and can be managed by an interface that will be introduced separately. E.g. if user space calls sys_write to a file in /proc/sys/ at file position = X, where X > 0, then the value set by bpf_sysctl_set_new_value() will be written starting from X. If program wants to override whole value with specified buffer, file position has to be set to zero. Documentation for the new helpers is provided in bpf.h UAPI. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- fs/proc/proc_sysctl.c | 22 ++++++++++--- include/linux/bpf-cgroup.h | 8 +++-- include/linux/filter.h | 3 ++ include/uapi/linux/bpf.h | 38 +++++++++++++++++++++- kernel/bpf/cgroup.c | 81 +++++++++++++++++++++++++++++++++++++++++++++- 5 files changed, 142 insertions(+), 10 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index e01b02150340..023101c6f0d7 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -570,8 +570,8 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf, struct inode *inode = file_inode(filp); struct ctl_table_header *head = grab_header(inode); struct ctl_table *table = PROC_I(inode)->sysctl_entry; + void *new_buf = NULL; ssize_t error; - size_t res; if (IS_ERR(head)) return PTR_ERR(head); @@ -589,15 +589,27 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf, if (!table->proc_handler) goto out; - error = BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write); + error = BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, &count, + &new_buf); if (error) goto out; /* careful: calling conventions are nasty here */ - res = count; - error = table->proc_handler(table, write, buf, &res, ppos); + if (new_buf) { + mm_segment_t old_fs; + + old_fs = get_fs(); + set_fs(KERNEL_DS); + error = table->proc_handler(table, write, (void __user *)new_buf, + &count, ppos); + set_fs(old_fs); + kfree(new_buf); + } else { + error = table->proc_handler(table, write, buf, &count, ppos); + } + if (!error) - error = res; + error = count; out: sysctl_head_finish(head); diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index b1c45da20a26..1e97271f9a10 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -113,7 +113,8 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, struct ctl_table *table, int write, - enum bpf_attach_type type); + void __user *buf, size_t *pcount, + void **new_buf, enum bpf_attach_type type); static inline enum bpf_cgroup_storage_type cgroup_storage_type( struct bpf_map *map) @@ -261,11 +262,12 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, }) -#define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write) \ +#define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, count, nbuf) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled) \ __ret = __cgroup_bpf_run_filter_sysctl(head, table, write, \ + buf, count, nbuf, \ BPF_CGROUP_SYSCTL); \ __ret; \ }) @@ -338,7 +340,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, #define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; }) -#define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,nbuf) ({ 0; }) #define for_each_cgroup_storage_type(stype) for (; false; ) diff --git a/include/linux/filter.h b/include/linux/filter.h index f254ff92819f..a23653f9460c 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1184,6 +1184,9 @@ struct bpf_sysctl_kern { struct ctl_table *table; void *cur_val; size_t cur_len; + void *new_val; + size_t new_len; + int new_updated; int write; }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 063543afa359..547b8258d731 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2541,6 +2541,40 @@ union bpf_attr { * * **-EINVAL** if current value was unavailable, e.g. because * sysctl is uninitialized and read returns -EIO for it. + * + * int bpf_sysctl_get_new_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len) + * Description + * Get new value being written by user space to sysctl (before + * the actual write happens) and copy it as a string into + * provided by program buffer *buf* of size *buf_len*. + * + * User space may write new value at file position > 0. + * + * The buffer is always NUL terminated, unless it's zero-sized. + * Return + * Number of character copied (not including the trailing NUL). + * + * **-E2BIG** if the buffer wasn't big enough (*buf* will contain + * truncated name in this case). + * + * **-EINVAL** if sysctl is being read. + * + * int bpf_sysctl_set_new_value(struct bpf_sysctl *ctx, const char *buf, size_t buf_len) + * Description + * Override new value being written by user space to sysctl with + * value provided by program in buffer *buf* of size *buf_len*. + * + * *buf* should contain a string in same form as provided by user + * space on sysctl write. + * + * User space may write new value at file position > 0. To override + * the whole sysctl value file position should be set to zero. + * Return + * 0 on success. + * + * **-E2BIG** if the *buf_len* is too big. + * + * **-EINVAL** if sysctl is being read. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2645,7 +2679,9 @@ union bpf_attr { FN(skc_lookup_tcp), \ FN(tcp_check_syncookie), \ FN(sysctl_get_name), \ - FN(sysctl_get_current_value), + FN(sysctl_get_current_value), \ + FN(sysctl_get_new_value), \ + FN(sysctl_set_new_value), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index c6b2cf29a54b..ba4e21986760 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -778,6 +778,13 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { * @head: sysctl table header * @table: sysctl table * @write: sysctl is being read (= 0) or written (= 1) + * @buf: pointer to buffer passed by user space + * @pcount: value-result argument: value is size of buffer pointed to by @buf, + * result is size of @new_buf if program set new value, initial value + * otherwise + * @new_buf: pointer to pointer to new buffer that will be allocated if program + * overrides new value provided by user space on sysctl write + * NOTE: it's caller responsibility to free *new_buf if it was set * @type: type of program to be executed * * Program is run when sysctl is being accessed, either read or written, and @@ -788,7 +795,8 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { */ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, struct ctl_table *table, int write, - enum bpf_attach_type type) + void __user *buf, size_t *pcount, + void **new_buf, enum bpf_attach_type type) { struct bpf_sysctl_kern ctx = { .head = head, @@ -796,6 +804,9 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, .write = write, .cur_val = NULL, .cur_len = PAGE_SIZE, + .new_val = NULL, + .new_len = 0, + .new_updated = 0, }; struct cgroup *cgrp; int ret; @@ -818,6 +829,18 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, ctx.cur_len = 0; } + if (write && buf && *pcount) { + /* BPF program should be able to override new value with a + * buffer bigger than provided by user. + */ + ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL); + ctx.new_len = min(PAGE_SIZE, *pcount); + if (!ctx.new_val || + copy_from_user(ctx.new_val, buf, ctx.new_len)) + /* Let BPF program decide how to proceed. */ + ctx.new_len = 0; + } + rcu_read_lock(); cgrp = task_dfl_cgroup(current); ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN); @@ -825,6 +848,13 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, kfree(ctx.cur_val); + if (ret == 1 && ctx.new_updated) { + *new_buf = ctx.new_val; + *pcount = ctx.new_len; + } else { + kfree(ctx.new_val); + } + return ret == 1 ? 0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl); @@ -932,6 +962,51 @@ static const struct bpf_func_proto bpf_sysctl_get_current_value_proto = { .arg3_type = ARG_CONST_SIZE, }; +BPF_CALL_3(bpf_sysctl_get_new_value, struct bpf_sysctl_kern *, ctx, char *, buf, + size_t, buf_len) +{ + if (!ctx->write) { + if (buf && buf_len) + memset(buf, '\0', buf_len); + return -EINVAL; + } + return copy_sysctl_value(buf, buf_len, ctx->new_val, ctx->new_len); +} + +static const struct bpf_func_proto bpf_sysctl_get_new_value_proto = { + .func = bpf_sysctl_get_new_value, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + +BPF_CALL_3(bpf_sysctl_set_new_value, struct bpf_sysctl_kern *, ctx, + const char *, buf, size_t, buf_len) +{ + if (!ctx->write || !ctx->new_val || !ctx->new_len || !buf || !buf_len) + return -EINVAL; + + if (buf_len > PAGE_SIZE - 1) + return -E2BIG; + + memcpy(ctx->new_val, buf, buf_len); + ctx->new_len = buf_len; + ctx->new_updated = 1; + + return 0; +} + +static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = { + .func = bpf_sysctl_set_new_value, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + static const struct bpf_func_proto * sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -940,6 +1015,10 @@ sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sysctl_get_name_proto; case BPF_FUNC_sysctl_get_current_value: return &bpf_sysctl_get_current_value_proto; + case BPF_FUNC_sysctl_get_new_value: + return &bpf_sysctl_get_new_value_proto; + case BPF_FUNC_sysctl_set_new_value: + return &bpf_sysctl_set_new_value_proto; default: return cgroup_base_func_proto(func_id, prog); } -- cgit v1.2.3 From e1550bfe0de47e30484ba91de1e50a91ec1c31f5 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Thu, 7 Mar 2019 18:50:52 -0800 Subject: bpf: Add file_pos field to bpf_sysctl ctx Add file_pos field to bpf_sysctl context to read and write sysctl file position at which sysctl is being accessed (read or written). The field can be used to e.g. override whole sysctl value on write to sysctl even when sys_write is called by user space with file_pos > 0. Or BPF program may reject such accesses. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- fs/proc/proc_sysctl.c | 2 +- include/linux/bpf-cgroup.h | 9 ++++---- include/linux/filter.h | 3 +++ include/uapi/linux/bpf.h | 3 +++ kernel/bpf/cgroup.c | 54 +++++++++++++++++++++++++++++++++++++++++++--- 5 files changed, 63 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 023101c6f0d7..2d61e5e8c863 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -590,7 +590,7 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf, goto out; error = BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, &count, - &new_buf); + ppos, &new_buf); if (error) goto out; diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 1e97271f9a10..cb3c6b3b89c8 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -114,7 +114,8 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, struct ctl_table *table, int write, void __user *buf, size_t *pcount, - void **new_buf, enum bpf_attach_type type); + loff_t *ppos, void **new_buf, + enum bpf_attach_type type); static inline enum bpf_cgroup_storage_type cgroup_storage_type( struct bpf_map *map) @@ -262,12 +263,12 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, }) -#define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, count, nbuf) \ +#define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, count, pos, nbuf) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled) \ __ret = __cgroup_bpf_run_filter_sysctl(head, table, write, \ - buf, count, nbuf, \ + buf, count, pos, nbuf, \ BPF_CGROUP_SYSCTL); \ __ret; \ }) @@ -340,7 +341,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, #define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; }) -#define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,nbuf) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,pos,nbuf) ({ 0; }) #define for_each_cgroup_storage_type(stype) for (; false; ) diff --git a/include/linux/filter.h b/include/linux/filter.h index a23653f9460c..fb0edad75971 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1188,6 +1188,9 @@ struct bpf_sysctl_kern { size_t new_len; int new_updated; int write; + loff_t *ppos; + /* Temporary "register" for indirect stores to ppos. */ + u64 tmp_reg; }; #endif /* __LINUX_FILTER_H__ */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 547b8258d731..89976de909af 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3391,6 +3391,9 @@ struct bpf_sysctl { __u32 write; /* Sysctl is being read (= 0) or written (= 1). * Allows 1,2,4-byte read, but no write. */ + __u32 file_pos; /* Sysctl file position to read from, write to. + * Allows 1,2,4-byte read an 4-byte write. + */ }; #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index ba4e21986760..b2adf22139b3 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -782,6 +782,9 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { * @pcount: value-result argument: value is size of buffer pointed to by @buf, * result is size of @new_buf if program set new value, initial value * otherwise + * @ppos: value-result argument: value is position at which read from or write + * to sysctl is happening, result is new position if program overrode it, + * initial value otherwise * @new_buf: pointer to pointer to new buffer that will be allocated if program * overrides new value provided by user space on sysctl write * NOTE: it's caller responsibility to free *new_buf if it was set @@ -796,12 +799,14 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, struct ctl_table *table, int write, void __user *buf, size_t *pcount, - void **new_buf, enum bpf_attach_type type) + loff_t *ppos, void **new_buf, + enum bpf_attach_type type) { struct bpf_sysctl_kern ctx = { .head = head, .table = table, .write = write, + .ppos = ppos, .cur_val = NULL, .cur_len = PAGE_SIZE, .new_val = NULL, @@ -1030,14 +1035,22 @@ static bool sysctl_is_valid_access(int off, int size, enum bpf_access_type type, { const int size_default = sizeof(__u32); - if (off < 0 || off + size > sizeof(struct bpf_sysctl) || - off % size || type != BPF_READ) + if (off < 0 || off + size > sizeof(struct bpf_sysctl) || off % size) return false; switch (off) { case offsetof(struct bpf_sysctl, write): + if (type != BPF_READ) + return false; bpf_ctx_record_field_size(info, size_default); return bpf_ctx_narrow_access_ok(off, size, size_default); + case offsetof(struct bpf_sysctl, file_pos): + if (type == BPF_READ) { + bpf_ctx_record_field_size(info, size_default); + return bpf_ctx_narrow_access_ok(off, size, size_default); + } else { + return size == size_default; + } default: return false; } @@ -1059,6 +1072,41 @@ static u32 sysctl_convert_ctx_access(enum bpf_access_type type, write), target_size)); break; + case offsetof(struct bpf_sysctl, file_pos): + /* ppos is a pointer so it should be accessed via indirect + * loads and stores. Also for stores additional temporary + * register is used since neither src_reg nor dst_reg can be + * overridden. + */ + if (type == BPF_WRITE) { + int treg = BPF_REG_9; + + if (si->src_reg == treg || si->dst_reg == treg) + --treg; + if (si->src_reg == treg || si->dst_reg == treg) + --treg; + *insn++ = BPF_STX_MEM( + BPF_DW, si->dst_reg, treg, + offsetof(struct bpf_sysctl_kern, tmp_reg)); + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos), + treg, si->dst_reg, + offsetof(struct bpf_sysctl_kern, ppos)); + *insn++ = BPF_STX_MEM( + BPF_SIZEOF(u32), treg, si->src_reg, 0); + *insn++ = BPF_LDX_MEM( + BPF_DW, treg, si->dst_reg, + offsetof(struct bpf_sysctl_kern, tmp_reg)); + } else { + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sysctl_kern, ppos)); + *insn++ = BPF_LDX_MEM( + BPF_SIZE(si->code), si->dst_reg, si->dst_reg, 0); + } + *target_size = sizeof(u32); + break; } return insn - insn_buf; -- cgit v1.2.3 From d7a4cb9b6705a89937d12c8158a35a3145dc967a Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Mon, 18 Mar 2019 17:55:26 -0700 Subject: bpf: Introduce bpf_strtol and bpf_strtoul helpers Add bpf_strtol and bpf_strtoul to convert a string to long and unsigned long correspondingly. It's similar to user space strtol(3) and strtoul(3) with a few changes to the API: * instead of NUL-terminated C string the helpers expect buffer and buffer length; * resulting long or unsigned long is returned in a separate result-argument; * return value is used to indicate success or failure, on success number of consumed bytes is returned that can be used to identify position to read next if the buffer is expected to contain multiple integers; * instead of *base* argument, *flags* is used that provides base in 5 LSB, other bits are reserved for future use; * number of supported bases is limited. Documentation for the new helpers is provided in bpf.h UAPI. The helpers are made available to BPF_PROG_TYPE_CGROUP_SYSCTL programs to be able to convert string input to e.g. "ulongvec" output. E.g. "net/ipv4/tcp_mem" consists of three ulong integers. They can be parsed by calling to bpf_strtoul three times. Implementation notes: Implementation includes "../../lib/kstrtox.h" to reuse integer parsing functions. It's done exactly same way as fs/proc/base.c already does. Unfortunately existing kstrtoX function can't be used directly since they fail if any invalid character is present right after integer in the string. Existing simple_strtoX functions can't be used either since they're obsolete and don't handle overflow properly. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 + include/uapi/linux/bpf.h | 51 +++++++++++++++++- kernel/bpf/cgroup.c | 4 ++ kernel/bpf/helpers.c | 131 +++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 187 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index fd06ada941ad..f15432d90728 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -989,6 +989,8 @@ extern const struct bpf_func_proto bpf_sk_redirect_map_proto; extern const struct bpf_func_proto bpf_spin_lock_proto; extern const struct bpf_func_proto bpf_spin_unlock_proto; extern const struct bpf_func_proto bpf_get_local_storage_proto; +extern const struct bpf_func_proto bpf_strtol_proto; +extern const struct bpf_func_proto bpf_strtoul_proto; /* Shared helpers among cBPF and eBPF. */ void bpf_user_rnd_init_once(void); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 89976de909af..c26be24fd5e2 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2575,6 +2575,53 @@ union bpf_attr { * **-E2BIG** if the *buf_len* is too big. * * **-EINVAL** if sysctl is being read. + * + * int bpf_strtol(const char *buf, size_t buf_len, u64 flags, long *res) + * Description + * Convert the initial part of the string from buffer *buf* of + * size *buf_len* to a long integer according to the given base + * and save the result in *res*. + * + * The string may begin with an arbitrary amount of white space + * (as determined by isspace(3)) followed by a single optional '-' + * sign. + * + * Five least significant bits of *flags* encode base, other bits + * are currently unused. + * + * Base must be either 8, 10, 16 or 0 to detect it automatically + * similar to user space strtol(3). + * Return + * Number of characters consumed on success. Must be positive but + * no more than buf_len. + * + * **-EINVAL** if no valid digits were found or unsupported base + * was provided. + * + * **-ERANGE** if resulting value was out of range. + * + * int bpf_strtoul(const char *buf, size_t buf_len, u64 flags, unsigned long *res) + * Description + * Convert the initial part of the string from buffer *buf* of + * size *buf_len* to an unsigned long integer according to the + * given base and save the result in *res*. + * + * The string may begin with an arbitrary amount of white space + * (as determined by isspace(3)). + * + * Five least significant bits of *flags* encode base, other bits + * are currently unused. + * + * Base must be either 8, 10, 16 or 0 to detect it automatically + * similar to user space strtoul(3). + * Return + * Number of characters consumed on success. Must be positive but + * no more than buf_len. + * + * **-EINVAL** if no valid digits were found or unsupported base + * was provided. + * + * **-ERANGE** if resulting value was out of range. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2681,7 +2728,9 @@ union bpf_attr { FN(sysctl_get_name), \ FN(sysctl_get_current_value), \ FN(sysctl_get_new_value), \ - FN(sysctl_set_new_value), + FN(sysctl_set_new_value), \ + FN(strtol), \ + FN(strtoul), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index b2adf22139b3..789d4ab2336e 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1016,6 +1016,10 @@ static const struct bpf_func_proto * sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { + case BPF_FUNC_strtol: + return &bpf_strtol_proto; + case BPF_FUNC_strtoul: + return &bpf_strtoul_proto; case BPF_FUNC_sysctl_get_name: return &bpf_sysctl_get_name_proto; case BPF_FUNC_sysctl_get_current_value: diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index a411fc17d265..4266ffde07ca 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -18,6 +18,9 @@ #include #include #include +#include + +#include "../../lib/kstrtox.h" /* If kernel subsystem is allowing eBPF programs to call this function, * inside its own verifier_ops->get_func_proto() callback it should return @@ -363,4 +366,132 @@ const struct bpf_func_proto bpf_get_local_storage_proto = { .arg2_type = ARG_ANYTHING, }; #endif + +#define BPF_STRTOX_BASE_MASK 0x1F + +static int __bpf_strtoull(const char *buf, size_t buf_len, u64 flags, + unsigned long long *res, bool *is_negative) +{ + unsigned int base = flags & BPF_STRTOX_BASE_MASK; + const char *cur_buf = buf; + size_t cur_len = buf_len; + unsigned int consumed; + size_t val_len; + char str[64]; + + if (!buf || !buf_len || !res || !is_negative) + return -EINVAL; + + if (base != 0 && base != 8 && base != 10 && base != 16) + return -EINVAL; + + if (flags & ~BPF_STRTOX_BASE_MASK) + return -EINVAL; + + while (cur_buf < buf + buf_len && isspace(*cur_buf)) + ++cur_buf; + + *is_negative = (cur_buf < buf + buf_len && *cur_buf == '-'); + if (*is_negative) + ++cur_buf; + + consumed = cur_buf - buf; + cur_len -= consumed; + if (!cur_len) + return -EINVAL; + + cur_len = min(cur_len, sizeof(str) - 1); + memcpy(str, cur_buf, cur_len); + str[cur_len] = '\0'; + cur_buf = str; + + cur_buf = _parse_integer_fixup_radix(cur_buf, &base); + val_len = _parse_integer(cur_buf, base, res); + + if (val_len & KSTRTOX_OVERFLOW) + return -ERANGE; + + if (val_len == 0) + return -EINVAL; + + cur_buf += val_len; + consumed += cur_buf - str; + + return consumed; +} + +static int __bpf_strtoll(const char *buf, size_t buf_len, u64 flags, + long long *res) +{ + unsigned long long _res; + bool is_negative; + int err; + + err = __bpf_strtoull(buf, buf_len, flags, &_res, &is_negative); + if (err < 0) + return err; + if (is_negative) { + if ((long long)-_res > 0) + return -ERANGE; + *res = -_res; + } else { + if ((long long)_res < 0) + return -ERANGE; + *res = _res; + } + return err; +} + +BPF_CALL_4(bpf_strtol, const char *, buf, size_t, buf_len, u64, flags, + long *, res) +{ + long long _res; + int err; + + err = __bpf_strtoll(buf, buf_len, flags, &_res); + if (err < 0) + return err; + if (_res != (long)_res) + return -ERANGE; + *res = _res; + return err; +} + +const struct bpf_func_proto bpf_strtol_proto = { + .func = bpf_strtol, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_CONST_SIZE, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_LONG, +}; + +BPF_CALL_4(bpf_strtoul, const char *, buf, size_t, buf_len, u64, flags, + unsigned long *, res) +{ + unsigned long long _res; + bool is_negative; + int err; + + err = __bpf_strtoull(buf, buf_len, flags, &_res, &is_negative); + if (err < 0) + return err; + if (is_negative) + return -EINVAL; + if (_res != (unsigned long)_res) + return -ERANGE; + *res = _res; + return err; +} + +const struct bpf_func_proto bpf_strtoul_proto = { + .func = bpf_strtoul, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_CONST_SIZE, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_LONG, +}; #endif -- cgit v1.2.3 From bfb35c27c65fce60a12e188135ae1344d1b89e24 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Fri, 12 Apr 2019 12:27:34 +0100 Subject: bpf: fix whitespace for ENCAP_L2 defines in bpf.h replace tab after #define with space in line with rest of definitions Signed-off-by: Alan Maguire Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 6 +++--- tools/include/uapi/linux/bpf.h | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c26be24fd5e2..704bb69514a2 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2792,14 +2792,14 @@ enum bpf_func_id { /* BPF_FUNC_skb_adjust_room flags. */ #define BPF_F_ADJ_ROOM_FIXED_GSO (1ULL << 0) -#define BPF_ADJ_ROOM_ENCAP_L2_MASK 0xff -#define BPF_ADJ_ROOM_ENCAP_L2_SHIFT 56 +#define BPF_ADJ_ROOM_ENCAP_L2_MASK 0xff +#define BPF_ADJ_ROOM_ENCAP_L2_SHIFT 56 #define BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 (1ULL << 1) #define BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 (1ULL << 2) #define BPF_F_ADJ_ROOM_ENCAP_L4_GRE (1ULL << 3) #define BPF_F_ADJ_ROOM_ENCAP_L4_UDP (1ULL << 4) -#define BPF_F_ADJ_ROOM_ENCAP_L2(len) (((__u64)len & \ +#define BPF_F_ADJ_ROOM_ENCAP_L2(len) (((__u64)len & \ BPF_ADJ_ROOM_ENCAP_L2_MASK) \ << BPF_ADJ_ROOM_ENCAP_L2_SHIFT) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index c26be24fd5e2..704bb69514a2 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2792,14 +2792,14 @@ enum bpf_func_id { /* BPF_FUNC_skb_adjust_room flags. */ #define BPF_F_ADJ_ROOM_FIXED_GSO (1ULL << 0) -#define BPF_ADJ_ROOM_ENCAP_L2_MASK 0xff -#define BPF_ADJ_ROOM_ENCAP_L2_SHIFT 56 +#define BPF_ADJ_ROOM_ENCAP_L2_MASK 0xff +#define BPF_ADJ_ROOM_ENCAP_L2_SHIFT 56 #define BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 (1ULL << 1) #define BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 (1ULL << 2) #define BPF_F_ADJ_ROOM_ENCAP_L4_GRE (1ULL << 3) #define BPF_F_ADJ_ROOM_ENCAP_L4_UDP (1ULL << 4) -#define BPF_F_ADJ_ROOM_ENCAP_L2(len) (((__u64)len & \ +#define BPF_F_ADJ_ROOM_ENCAP_L2(len) (((__u64)len & \ BPF_ADJ_ROOM_ENCAP_L2_MASK) \ << BPF_ADJ_ROOM_ENCAP_L2_SHIFT) -- cgit v1.2.3 From 725721a6506eea53bfde81a34e91a06d6162c216 Mon Sep 17 00:00:00 2001 From: Viet Hoang Tran Date: Mon, 15 Apr 2019 09:54:55 +0000 Subject: bpf: allow clearing all sock_ops callback flags The helper function bpf_sock_ops_cb_flags_set() can be used to both set and clear the sock_ops callback flags. However, its current behavior is not consistent. BPF program may clear a flag if more than one were set, or replace a flag with another one, but cannot clear all flags. This patch also updates the documentation to clarify the ability to clear flags of this helper function. Signed-off-by: Hoang Tran Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 9 ++++++++- net/core/filter.c | 3 +-- 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 704bb69514a2..eaf2d3284248 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1737,12 +1737,19 @@ union bpf_attr { * error if an eBPF program tries to set a callback that is not * supported in the current kernel. * - * The supported callback values that *argval* can combine are: + * *argval* is a flag array which can combine these flags: * * * **BPF_SOCK_OPS_RTO_CB_FLAG** (retransmission time out) * * **BPF_SOCK_OPS_RETRANS_CB_FLAG** (retransmission) * * **BPF_SOCK_OPS_STATE_CB_FLAG** (TCP state change) * + * Therefore, this function can be used to clear a callback flag by + * setting the appropriate bit to zero. e.g. to disable the RTO + * callback: + * + * **bpf_sock_ops_cb_flags_set(bpf_sock,** + * **bpf_sock->bpf_sock_ops_cb_flags & ~BPF_SOCK_OPS_RTO_CB_FLAG)** + * * Here are some examples of where one could call such eBPF * program: * diff --git a/net/core/filter.c b/net/core/filter.c index bd1f51907b83..1833926a63fc 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4437,8 +4437,7 @@ BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock, if (!IS_ENABLED(CONFIG_INET) || !sk_fullsock(sk)) return -EINVAL; - if (val) - tcp_sk(sk)->bpf_sock_ops_cb_flags = val; + tcp_sk(sk)->bpf_sock_ops_cb_flags = val; return argval & (~BPF_SOCK_OPS_ALL_CB_FLAGS); } -- cgit v1.2.3 From 0bc199854405543b0debe67c735c0aae94f1d319 Mon Sep 17 00:00:00 2001 From: Stephen Suryaputra Date: Wed, 17 Apr 2019 16:35:49 -0400 Subject: ipv6: Add rate limit mask for ICMPv6 messages To make ICMPv6 closer to ICMPv4, add ratemask parameter. Since the ICMP message types use larger numeric values, a simple bitmask doesn't fit. I use large bitmap. The input and output are the in form of list of ranges. Set the default to rate limit all error messages but Packet Too Big. For Packet Too Big, use ratemask instead of hard-coded. There are functions where icmpv6_xrlim_allow() and icmpv6_global_allow() aren't called. This patch only adds them to icmpv6_echo_reply(). Rate limiting error messages is mandated by RFC 4443 but RFC 4890 says that it is also acceptable to rate limit informational messages. Thus, I removed the current hard-coded behavior of icmpv6_mask_allow() that doesn't rate limit informational messages. v2: Add dummy function proc_do_large_bitmap() if CONFIG_PROC_SYSCTL isn't defined, expand the description in ip-sysctl.txt and remove unnecessary conditional before kfree(). v3: Inline the bitmap instead of dynamically allocated. Still is a pointer to it is needed because of the way proc_do_large_bitmap work. Signed-off-by: Stephen Suryaputra Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 17 ++++++++++++++++- include/net/netns/ipv6.h | 3 +++ include/uapi/linux/icmpv6.h | 4 ++++ kernel/sysctl.c | 6 ++++++ net/ipv6/af_inet6.c | 9 +++++++++ net/ipv6/icmp.c | 31 ++++++++++++++++++++++--------- 6 files changed, 60 insertions(+), 10 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 5eedc6941ce5..8a5e59ba223f 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1913,11 +1913,26 @@ enhanced_dad - BOOLEAN icmp/*: ratelimit - INTEGER - Limit the maximal rates for sending ICMPv6 packets. + Limit the maximal rates for sending ICMPv6 messages. 0 to disable any limiting, otherwise the minimal space between responses in milliseconds. Default: 1000 +ratemask - list of comma separated ranges + For ICMPv6 message types matching the ranges in the ratemask, limit + the sending of the message according to ratelimit parameter. + + The format used for both input and output is a comma separated + list of ranges (e.g. "0-127,129" for ICMPv6 message type 0 to 127 and + 129). Writing to the file will clear all previous ranges of ICMPv6 + message types and update the current list with the input. + + Refer to: https://www.iana.org/assignments/icmpv6-parameters/icmpv6-parameters.xhtml + for numerical values of ICMPv6 message types, e.g. echo request is 128 + and echo reply is 129. + + Default: 0-1,3-127 (rate limit ICMPv6 errors except Packet Too Big) + echo_ignore_all - BOOLEAN If set non-zero, then the kernel will ignore all ICMP ECHO requests sent to it over the IPv6 protocol. diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 64e29b58bb5e..5e61b5a8635d 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -8,6 +8,7 @@ #ifndef __NETNS_IPV6_H__ #define __NETNS_IPV6_H__ #include +#include struct ctl_table_header; @@ -35,6 +36,8 @@ struct netns_sysctl_ipv6 { int icmpv6_echo_ignore_all; int icmpv6_echo_ignore_multicast; int icmpv6_echo_ignore_anycast; + DECLARE_BITMAP(icmpv6_ratemask, ICMPV6_MSG_MAX + 1); + unsigned long *icmpv6_ratemask_ptr; int anycast_src_echo_reply; int ip_nonlocal_bind; int fwmark_reflect; diff --git a/include/uapi/linux/icmpv6.h b/include/uapi/linux/icmpv6.h index 325395f56bfa..2622b5a3e616 100644 --- a/include/uapi/linux/icmpv6.h +++ b/include/uapi/linux/icmpv6.h @@ -90,6 +90,8 @@ struct icmp6hdr { #define ICMPV6_TIME_EXCEED 3 #define ICMPV6_PARAMPROB 4 +#define ICMPV6_ERRMSG_MAX 127 + #define ICMPV6_INFOMSG_MASK 0x80 #define ICMPV6_ECHO_REQUEST 128 @@ -110,6 +112,8 @@ struct icmp6hdr { #define ICMPV6_MRDISC_ADV 151 +#define ICMPV6_MSG_MAX 255 + /* * Codes for Destination Unreachable */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c9ec050bcf46..599510a3355e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -3326,6 +3326,11 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, return -ENOSYS; } +int proc_do_large_bitmap(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} #endif /* CONFIG_PROC_SYSCTL */ @@ -3366,3 +3371,4 @@ EXPORT_SYMBOL(proc_dointvec_ms_jiffies); EXPORT_SYMBOL(proc_dostring); EXPORT_SYMBOL(proc_doulongvec_minmax); EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); +EXPORT_SYMBOL(proc_do_large_bitmap); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index d8587ca4fbeb..3d1de28aaa9e 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -850,6 +850,15 @@ static int __net_init inet6_net_init(struct net *net) net->ipv6.sysctl.icmpv6_echo_ignore_all = 0; net->ipv6.sysctl.icmpv6_echo_ignore_multicast = 0; net->ipv6.sysctl.icmpv6_echo_ignore_anycast = 0; + + /* By default, rate limit error messages. + * Except for pmtu discovery, it would break it. + * proc_do_large_bitmap needs pointer to the bitmap. + */ + bitmap_set(net->ipv6.sysctl.icmpv6_ratemask, 0, ICMPV6_ERRMSG_MAX + 1); + bitmap_clear(net->ipv6.sysctl.icmpv6_ratemask, ICMPV6_PKT_TOOBIG, 1); + net->ipv6.sysctl.icmpv6_ratemask_ptr = net->ipv6.sysctl.icmpv6_ratemask; + net->ipv6.sysctl.flowlabel_consistency = 1; net->ipv6.sysctl.auto_flowlabels = IP6_DEFAULT_AUTO_FLOW_LABELS; net->ipv6.sysctl.idgen_retries = 3; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index cc14b9998941..afb915807cd0 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -168,22 +168,21 @@ static bool is_ineligible(const struct sk_buff *skb) return false; } -static bool icmpv6_mask_allow(int type) +static bool icmpv6_mask_allow(struct net *net, int type) { - /* Informational messages are not limited. */ - if (type & ICMPV6_INFOMSG_MASK) + if (type > ICMPV6_MSG_MAX) return true; - /* Do not limit pmtu discovery, it would break it. */ - if (type == ICMPV6_PKT_TOOBIG) + /* Limit if icmp type is set in ratemask. */ + if (!test_bit(type, net->ipv6.sysctl.icmpv6_ratemask)) return true; return false; } -static bool icmpv6_global_allow(int type) +static bool icmpv6_global_allow(struct net *net, int type) { - if (icmpv6_mask_allow(type)) + if (icmpv6_mask_allow(net, type)) return true; if (icmp_global_allow()) @@ -202,7 +201,7 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, struct dst_entry *dst; bool res = false; - if (icmpv6_mask_allow(type)) + if (icmpv6_mask_allow(net, type)) return true; /* @@ -511,7 +510,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, local_bh_disable(); /* Check global sysctl_icmp_msgs_per_sec ratelimit */ - if (!(skb->dev->flags&IFF_LOOPBACK) && !icmpv6_global_allow(type)) + if (!(skb->dev->flags & IFF_LOOPBACK) && !icmpv6_global_allow(net, type)) goto out_bh_enable; mip6_addr_swap(skb); @@ -731,6 +730,11 @@ static void icmpv6_echo_reply(struct sk_buff *skb) if (IS_ERR(dst)) goto out; + /* Check the ratelimit */ + if ((!(skb->dev->flags & IFF_LOOPBACK) && !icmpv6_global_allow(net, ICMPV6_ECHO_REPLY)) || + !icmpv6_xrlim_allow(sk, ICMPV6_ECHO_REPLY, &fl6)) + goto out_dst_release; + idev = __in6_dev_get(skb->dev); msg.skb = skb; @@ -751,6 +755,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb) icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr, skb->len + sizeof(struct icmp6hdr)); } +out_dst_release: dst_release(dst); out: icmpv6_xmit_unlock(sk); @@ -1137,6 +1142,13 @@ static struct ctl_table ipv6_icmp_table_template[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "ratemask", + .data = &init_net.ipv6.sysctl.icmpv6_ratemask_ptr, + .maxlen = ICMPV6_MSG_MAX + 1, + .mode = 0644, + .proc_handler = proc_do_large_bitmap, + }, { }, }; @@ -1153,6 +1165,7 @@ struct ctl_table * __net_init ipv6_icmp_sysctl_init(struct net *net) table[1].data = &net->ipv6.sysctl.icmpv6_echo_ignore_all; table[2].data = &net->ipv6.sysctl.icmpv6_echo_ignore_multicast; table[3].data = &net->ipv6.sysctl.icmpv6_echo_ignore_anycast; + table[4].data = &net->ipv6.sysctl.icmpv6_ratemask_ptr; } return table; } -- cgit v1.2.3 From 8c8b3458d0b91b2230f76fbe1b0280568f10d19f Mon Sep 17 00:00:00 2001 From: Mike Manning Date: Thu, 18 Apr 2019 18:35:31 +0100 Subject: vlan: support binding link state to vlan member bridge ports In the case of vlan filtering on bridges, the bridge may also have the corresponding vlan devices as upper devices. Currently the link state of vlan devices is transferred from the lower device. So this is up if the bridge is in admin up state and there is at least one bridge port that is up, regardless of the vlan that the port is a member of. The link state of the vlan device may need to track only the state of the subset of ports that are also members of the corresponding vlan, rather than that of all ports. Add a flag to specify a vlan bridge binding mode, by which the link state is no longer automatically transferred from the lower device, but is instead determined by the bridge ports that are members of the vlan. Signed-off-by: Mike Manning Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_vlan.h | 9 +++++---- net/8021q/vlan_dev.c | 3 ++- net/8021q/vlan_netlink.c | 3 ++- 3 files changed, 9 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_vlan.h b/include/uapi/linux/if_vlan.h index 7a0e8bd65b6b..90a2c89afc8f 100644 --- a/include/uapi/linux/if_vlan.h +++ b/include/uapi/linux/if_vlan.h @@ -32,10 +32,11 @@ enum vlan_ioctl_cmds { }; enum vlan_flags { - VLAN_FLAG_REORDER_HDR = 0x1, - VLAN_FLAG_GVRP = 0x2, - VLAN_FLAG_LOOSE_BINDING = 0x4, - VLAN_FLAG_MVRP = 0x8, + VLAN_FLAG_REORDER_HDR = 0x1, + VLAN_FLAG_GVRP = 0x2, + VLAN_FLAG_LOOSE_BINDING = 0x4, + VLAN_FLAG_MVRP = 0x8, + VLAN_FLAG_BRIDGE_BINDING = 0x10, }; enum vlan_name_types { diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 8d77b6ee4477..ed996b500b10 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -223,7 +223,8 @@ int vlan_dev_change_flags(const struct net_device *dev, u32 flags, u32 mask) u32 old_flags = vlan->flags; if (mask & ~(VLAN_FLAG_REORDER_HDR | VLAN_FLAG_GVRP | - VLAN_FLAG_LOOSE_BINDING | VLAN_FLAG_MVRP)) + VLAN_FLAG_LOOSE_BINDING | VLAN_FLAG_MVRP | + VLAN_FLAG_BRIDGE_BINDING)) return -EINVAL; vlan->flags = (old_flags & ~mask) | (flags & mask); diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c index 9b60c1e399e2..a624dccf68fd 100644 --- a/net/8021q/vlan_netlink.c +++ b/net/8021q/vlan_netlink.c @@ -84,7 +84,8 @@ static int vlan_validate(struct nlattr *tb[], struct nlattr *data[], flags = nla_data(data[IFLA_VLAN_FLAGS]); if ((flags->flags & flags->mask) & ~(VLAN_FLAG_REORDER_HDR | VLAN_FLAG_GVRP | - VLAN_FLAG_LOOSE_BINDING | VLAN_FLAG_MVRP)) { + VLAN_FLAG_LOOSE_BINDING | VLAN_FLAG_MVRP | + VLAN_FLAG_BRIDGE_BINDING)) { NL_SET_ERR_MSG_MOD(extack, "Invalid VLAN flags"); return -EINVAL; } -- cgit v1.2.3 From 0768e17073dc527ccd18ed5f96ce85f9985e9115 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 17 Apr 2019 22:56:11 +0200 Subject: net: socket: implement 64-bit timestamps The 'timeval' and 'timespec' data structures used for socket timestamps are going to be redefined in user space based on 64-bit time_t in future versions of the C library to deal with the y2038 overflow problem, which breaks the ABI definition. Unlike many modern ioctl commands, SIOCGSTAMP and SIOCGSTAMPNS do not use the _IOR() macro to encode the size of the transferred data, so it remains ambiguous whether the application uses the old or new layout. The best workaround I could find is rather ugly: we redefine the command code based on the size of the respective data structure with a ternary operator. This lets it get evaluated as late as possible, hopefully after that structure is visible to the caller. We cannot use an #ifdef here, because inux/sockios.h might have been included before any libc header that could determine the size of time_t. The ioctl implementation now interprets the new command codes as always referring to the 64-bit structure on all architectures, while the old architecture specific command code still refers to the old architecture specific layout. The new command number is only used when they are actually different. Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- arch/alpha/include/uapi/asm/sockios.h | 4 ++-- arch/mips/include/uapi/asm/sockios.h | 4 ++-- arch/sh/include/uapi/asm/sockios.h | 5 +++-- arch/xtensa/include/uapi/asm/sockios.h | 4 ++-- include/uapi/asm-generic/sockios.h | 4 ++-- include/uapi/linux/sockios.h | 21 +++++++++++++++++++++ net/socket.c | 24 ++++++++++++++++++------ 7 files changed, 50 insertions(+), 16 deletions(-) (limited to 'include/uapi/linux') diff --git a/arch/alpha/include/uapi/asm/sockios.h b/arch/alpha/include/uapi/asm/sockios.h index ba287e4b01bf..af92bc27c3be 100644 --- a/arch/alpha/include/uapi/asm/sockios.h +++ b/arch/alpha/include/uapi/asm/sockios.h @@ -11,7 +11,7 @@ #define SIOCSPGRP _IOW('s', 8, pid_t) #define SIOCGPGRP _IOR('s', 9, pid_t) -#define SIOCGSTAMP 0x8906 /* Get stamp (timeval) */ -#define SIOCGSTAMPNS 0x8907 /* Get stamp (timespec) */ +#define SIOCGSTAMP_OLD 0x8906 /* Get stamp (timeval) */ +#define SIOCGSTAMPNS_OLD 0x8907 /* Get stamp (timespec) */ #endif /* _ASM_ALPHA_SOCKIOS_H */ diff --git a/arch/mips/include/uapi/asm/sockios.h b/arch/mips/include/uapi/asm/sockios.h index 5b40a88593fa..66f60234f290 100644 --- a/arch/mips/include/uapi/asm/sockios.h +++ b/arch/mips/include/uapi/asm/sockios.h @@ -21,7 +21,7 @@ #define SIOCSPGRP _IOW('s', 8, pid_t) #define SIOCGPGRP _IOR('s', 9, pid_t) -#define SIOCGSTAMP 0x8906 /* Get stamp (timeval) */ -#define SIOCGSTAMPNS 0x8907 /* Get stamp (timespec) */ +#define SIOCGSTAMP_OLD 0x8906 /* Get stamp (timeval) */ +#define SIOCGSTAMPNS_OLD 0x8907 /* Get stamp (timespec) */ #endif /* _ASM_SOCKIOS_H */ diff --git a/arch/sh/include/uapi/asm/sockios.h b/arch/sh/include/uapi/asm/sockios.h index 17313d2c3527..ef18a668456d 100644 --- a/arch/sh/include/uapi/asm/sockios.h +++ b/arch/sh/include/uapi/asm/sockios.h @@ -10,6 +10,7 @@ #define SIOCSPGRP _IOW('s', 8, pid_t) #define SIOCGPGRP _IOR('s', 9, pid_t) -#define SIOCGSTAMP _IOR('s', 100, struct timeval) /* Get stamp (timeval) */ -#define SIOCGSTAMPNS _IOR('s', 101, struct timespec) /* Get stamp (timespec) */ +#define SIOCGSTAMP_OLD _IOR('s', 100, struct timeval) /* Get stamp (timeval) */ +#define SIOCGSTAMPNS_OLD _IOR('s', 101, struct timespec) /* Get stamp (timespec) */ + #endif /* __ASM_SH_SOCKIOS_H */ diff --git a/arch/xtensa/include/uapi/asm/sockios.h b/arch/xtensa/include/uapi/asm/sockios.h index fb8ac3607189..1a1f58f4b75a 100644 --- a/arch/xtensa/include/uapi/asm/sockios.h +++ b/arch/xtensa/include/uapi/asm/sockios.h @@ -26,7 +26,7 @@ #define SIOCSPGRP _IOW('s', 8, pid_t) #define SIOCGPGRP _IOR('s', 9, pid_t) -#define SIOCGSTAMP 0x8906 /* Get stamp (timeval) */ -#define SIOCGSTAMPNS 0x8907 /* Get stamp (timespec) */ +#define SIOCGSTAMP_OLD 0x8906 /* Get stamp (timeval) */ +#define SIOCGSTAMPNS_OLD 0x8907 /* Get stamp (timespec) */ #endif /* _XTENSA_SOCKIOS_H */ diff --git a/include/uapi/asm-generic/sockios.h b/include/uapi/asm-generic/sockios.h index 64f658c7cec2..44fa3ed70483 100644 --- a/include/uapi/asm-generic/sockios.h +++ b/include/uapi/asm-generic/sockios.h @@ -8,7 +8,7 @@ #define FIOGETOWN 0x8903 #define SIOCGPGRP 0x8904 #define SIOCATMARK 0x8905 -#define SIOCGSTAMP 0x8906 /* Get stamp (timeval) */ -#define SIOCGSTAMPNS 0x8907 /* Get stamp (timespec) */ +#define SIOCGSTAMP_OLD 0x8906 /* Get stamp (timeval) */ +#define SIOCGSTAMPNS_OLD 0x8907 /* Get stamp (timespec) */ #endif /* __ASM_GENERIC_SOCKIOS_H */ diff --git a/include/uapi/linux/sockios.h b/include/uapi/linux/sockios.h index d393e9ed3964..7d1bccbbef78 100644 --- a/include/uapi/linux/sockios.h +++ b/include/uapi/linux/sockios.h @@ -19,6 +19,7 @@ #ifndef _LINUX_SOCKIOS_H #define _LINUX_SOCKIOS_H +#include #include /* Linux-specific socket ioctls */ @@ -27,6 +28,26 @@ #define SOCK_IOC_TYPE 0x89 +/* + * the timeval/timespec data structure layout is defined by libc, + * so we need to cover both possible versions on 32-bit. + */ +/* Get stamp (timeval) */ +#define SIOCGSTAMP_NEW _IOR(SOCK_IOC_TYPE, 0x06, long long[2]) +/* Get stamp (timespec) */ +#define SIOCGSTAMPNS_NEW _IOR(SOCK_IOC_TYPE, 0x07, long long[2]) + +#if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__)) +/* on 64-bit and x32, avoid the ?: operator */ +#define SIOCGSTAMP SIOCGSTAMP_OLD +#define SIOCGSTAMPNS SIOCGSTAMPNS_OLD +#else +#define SIOCGSTAMP ((sizeof(struct timeval)) == 8 ? \ + SIOCGSTAMP_OLD : SIOCGSTAMP_NEW) +#define SIOCGSTAMPNS ((sizeof(struct timespec)) == 8 ? \ + SIOCGSTAMPNS_OLD : SIOCGSTAMPNS_NEW) +#endif + /* Routing table calls. */ #define SIOCADDRT 0x890B /* add routing table entry */ #define SIOCDELRT 0x890C /* delete routing table entry */ diff --git a/net/socket.c b/net/socket.c index ab624d42ead5..8d9d4fc7d962 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1164,14 +1164,24 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) err = open_related_ns(&net->ns, get_net_ns); break; - case SIOCGSTAMP: - case SIOCGSTAMPNS: + case SIOCGSTAMP_OLD: + case SIOCGSTAMPNS_OLD: if (!sock->ops->gettstamp) { err = -ENOIOCTLCMD; break; } err = sock->ops->gettstamp(sock, argp, - cmd == SIOCGSTAMP, false); + cmd == SIOCGSTAMP_OLD, + !IS_ENABLED(CONFIG_64BIT)); + case SIOCGSTAMP_NEW: + case SIOCGSTAMPNS_NEW: + if (!sock->ops->gettstamp) { + err = -ENOIOCTLCMD; + break; + } + err = sock->ops->gettstamp(sock, argp, + cmd == SIOCGSTAMP_NEW, + false); break; default: err = sock_do_ioctl(net, sock, cmd, arg); @@ -3324,11 +3334,11 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock, case SIOCADDRT: case SIOCDELRT: return routing_ioctl(net, sock, cmd, argp); - case SIOCGSTAMP: - case SIOCGSTAMPNS: + case SIOCGSTAMP_OLD: + case SIOCGSTAMPNS_OLD: if (!sock->ops->gettstamp) return -ENOIOCTLCMD; - return sock->ops->gettstamp(sock, argp, cmd == SIOCGSTAMP, + return sock->ops->gettstamp(sock, argp, cmd == SIOCGSTAMP_OLD, !COMPAT_USE_64BIT_TIME); case SIOCBONDSLAVEINFOQUERY: @@ -3348,6 +3358,8 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock, case SIOCADDDLCI: case SIOCDELDLCI: case SIOCGSKNS: + case SIOCGSTAMP_NEW: + case SIOCGSTAMPNS_NEW: return sock_ioctl(file, cmd, arg); case SIOCGIFFLAGS: -- cgit v1.2.3 From 42e5425aa0dfd8a6cdd7e177cfd9703df05c7411 Mon Sep 17 00:00:00 2001 From: Tung Nguyen Date: Thu, 18 Apr 2019 21:02:19 +0700 Subject: tipc: introduce new socket option TIPC_SOCK_RECVQ_USED When using TIPC_SOCK_RECVQ_DEPTH for getsockopt(), it returns the number of buffers in receive socket buffer which is not so helpful for user space applications. This commit introduces the new option TIPC_SOCK_RECVQ_USED which returns the current allocated bytes of the receive socket buffer. This helps user space applications dimension its buffer usage to avoid buffer overload issue. Signed-off-by: Tung Nguyen Acked-by: Jon Maloy Signed-off-by: David S. Miller --- include/uapi/linux/tipc.h | 1 + net/tipc/socket.c | 3 +++ 2 files changed, 4 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/tipc.h b/include/uapi/linux/tipc.h index 6b2fd4d9655f..7df026ea6aff 100644 --- a/include/uapi/linux/tipc.h +++ b/include/uapi/linux/tipc.h @@ -190,6 +190,7 @@ struct sockaddr_tipc { #define TIPC_MCAST_REPLICAST 134 /* Default: TIPC selects. No arg */ #define TIPC_GROUP_JOIN 135 /* Takes struct tipc_group_req* */ #define TIPC_GROUP_LEAVE 136 /* No argument */ +#define TIPC_SOCK_RECVQ_USED 137 /* Default: none (read only) */ /* * Flag values diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 8ac8ddf1e324..1385207a301f 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -3070,6 +3070,9 @@ static int tipc_getsockopt(struct socket *sock, int lvl, int opt, case TIPC_SOCK_RECVQ_DEPTH: value = skb_queue_len(&sk->sk_receive_queue); break; + case TIPC_SOCK_RECVQ_USED: + value = sk_rmem_alloc_get(sk); + break; case TIPC_GROUP_JOIN: seq.type = 0; if (tsk->group) -- cgit v1.2.3 From 1d9373329bcbe0cfbb9af1738139292a9b10fe6a Mon Sep 17 00:00:00 2001 From: Shaul Triebitz Date: Fri, 15 Mar 2019 17:38:58 +0200 Subject: nl80211: increase NL80211_MAX_SUPP_REG_RULES The iwlwifi driver creates one rule per channel, thus it needs more rules than normal. To solve this, increase NL80211_MAX_SUPP_REG_RULES so iwlwifi can also fit UHB (ultra high band) channels. Signed-off-by: Shaul Triebitz Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index a99d75bef598..f00dbd82149e 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -11,7 +11,7 @@ * Copyright 2008 Jouni Malinen * Copyright 2008 Colin McCabe * Copyright 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018 Intel Corporation + * Copyright (C) 2018-2019 Intel Corporation * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -2809,7 +2809,7 @@ enum nl80211_attrs { #define NL80211_MAX_SUPP_RATES 32 #define NL80211_MAX_SUPP_HT_RATES 77 -#define NL80211_MAX_SUPP_REG_RULES 64 +#define NL80211_MAX_SUPP_REG_RULES 128 #define NL80211_TKIP_DATA_OFFSET_ENCR_KEY 0 #define NL80211_TKIP_DATA_OFFSET_TX_MIC_KEY 16 #define NL80211_TKIP_DATA_OFFSET_RX_MIC_KEY 24 -- cgit v1.2.3 From 6cdd3979a2bdc16116c5b2eb09475abf54ba9e70 Mon Sep 17 00:00:00 2001 From: Alexander Wetzel Date: Tue, 19 Mar 2019 21:34:07 +0100 Subject: nl80211/cfg80211: Extended Key ID support Add support for IEEE 802.11-2016 "Extended Key ID for Individually Addressed Frames". Extend cfg80211 and nl80211 to allow pairwise keys to be installed for Rx only, enable Tx separately and allow Key ID 1 for pairwise keys. Signed-off-by: Alexander Wetzel [use NLA_POLICY_RANGE() for NL80211_KEY_MODE] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 ++ include/uapi/linux/nl80211.h | 28 ++++++++++++++++++++++++++++ net/wireless/nl80211.c | 32 ++++++++++++++++++++++++++++---- net/wireless/rdev-ops.h | 3 ++- net/wireless/trace.h | 31 ++++++++++++++++++++++++++----- net/wireless/util.c | 21 +++++++++++++++------ 6 files changed, 101 insertions(+), 16 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index f6665f8eba5a..2b039802ae2e 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -485,6 +485,7 @@ struct vif_params { * with the get_key() callback, must be in little endian, * length given by @seq_len. * @seq_len: length of @seq. + * @mode: key install mode (RX_TX, NO_TX or SET_TX) */ struct key_params { const u8 *key; @@ -492,6 +493,7 @@ struct key_params { int key_len; int seq_len; u32 cipher; + enum nl80211_key_mode mode; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index f00dbd82149e..e75615bf4453 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4152,6 +4152,27 @@ enum nl80211_channel_type { NL80211_CHAN_HT40PLUS }; +/** + * enum nl80211_key_mode - Key mode + * + * @NL80211_KEY_RX_TX: (Default) + * Key can be used for Rx and Tx immediately + * + * The following modes can only be selected for unicast keys and when the + * driver supports @NL80211_EXT_FEATURE_EXT_KEY_ID: + * + * @NL80211_KEY_NO_TX: Only allowed in combination with @NL80211_CMD_NEW_KEY: + * Unicast key can only be used for Rx, Tx not allowed, yet + * @NL80211_KEY_SET_TX: Only allowed in combination with @NL80211_CMD_SET_KEY: + * The unicast key identified by idx and mac is cleared for Tx and becomes + * the preferred Tx key for the station. + */ +enum nl80211_key_mode { + NL80211_KEY_RX_TX, + NL80211_KEY_NO_TX, + NL80211_KEY_SET_TX +}; + /** * enum nl80211_chan_width - channel width definitions * @@ -4395,6 +4416,9 @@ enum nl80211_key_default_types { * @NL80211_KEY_DEFAULT_TYPES: A nested attribute containing flags * attributes, specifying what a key should be set as default as. * See &enum nl80211_key_default_types. + * @NL80211_KEY_MODE: the mode from enum nl80211_key_mode. + * Defaults to @NL80211_KEY_RX_TX. + * * @__NL80211_KEY_AFTER_LAST: internal * @NL80211_KEY_MAX: highest key attribute */ @@ -4408,6 +4432,7 @@ enum nl80211_key_attributes { NL80211_KEY_DEFAULT_MGMT, NL80211_KEY_TYPE, NL80211_KEY_DEFAULT_TYPES, + NL80211_KEY_MODE, /* keep last */ __NL80211_KEY_AFTER_LAST, @@ -5353,6 +5378,8 @@ enum nl80211_feature_flags { * able to rekey an in-use key correctly. Userspace must not rekey PTK keys * if this flag is not set. Ignoring this can leak clear text packets and/or * freeze the connection. + * @NL80211_EXT_FEATURE_EXT_KEY_ID: Driver supports "Extended Key ID for + * Individually Addressed Frames" from IEEE802.11-2016. * * @NL80211_EXT_FEATURE_AIRTIME_FAIRNESS: Driver supports getting airtime * fairness for transmitted packets and has enabled airtime fairness @@ -5406,6 +5433,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_AIRTIME_FAIRNESS, NL80211_EXT_FEATURE_AP_PMKSA_CACHING, NL80211_EXT_FEATURE_SCHED_SCAN_BAND_SPECIFIC_RSSI_THOLD, + NL80211_EXT_FEATURE_EXT_KEY_ID, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 0124bab1f8a7..ab9b095f6094 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -553,6 +553,7 @@ static const struct nla_policy nl80211_key_policy[NL80211_KEY_MAX + 1] = { [NL80211_KEY_DEFAULT_MGMT] = { .type = NLA_FLAG }, [NL80211_KEY_TYPE] = NLA_POLICY_MAX(NLA_U32, NUM_NL80211_KEYTYPES - 1), [NL80211_KEY_DEFAULT_TYPES] = { .type = NLA_NESTED }, + [NL80211_KEY_MODE] = NLA_POLICY_RANGE(NLA_U8, 0, NL80211_KEY_SET_TX), }; /* policy for the key default flags */ @@ -967,6 +968,9 @@ static int nl80211_parse_key_new(struct genl_info *info, struct nlattr *key, k->def_multi = kdt[NL80211_KEY_DEFAULT_TYPE_MULTICAST]; } + if (tb[NL80211_KEY_MODE]) + k->p.mode = nla_get_u8(tb[NL80211_KEY_MODE]); + return 0; } @@ -3643,8 +3647,11 @@ static int nl80211_set_key(struct sk_buff *skb, struct genl_info *info) if (key.idx < 0) return -EINVAL; - /* only support setting default key */ - if (!key.def && !key.defmgmt) + /* Only support setting default key and + * Extended Key ID action NL80211_KEY_SET_TX. + */ + if (!key.def && !key.defmgmt && + !(key.p.mode == NL80211_KEY_SET_TX)) return -EINVAL; wdev_lock(dev->ieee80211_ptr); @@ -3668,7 +3675,7 @@ static int nl80211_set_key(struct sk_buff *skb, struct genl_info *info) #ifdef CONFIG_CFG80211_WEXT dev->ieee80211_ptr->wext.default_key = key.idx; #endif - } else { + } else if (key.defmgmt) { if (key.def_uni || !key.def_multi) { err = -EINVAL; goto out; @@ -3690,8 +3697,25 @@ static int nl80211_set_key(struct sk_buff *skb, struct genl_info *info) #ifdef CONFIG_CFG80211_WEXT dev->ieee80211_ptr->wext.default_mgmt_key = key.idx; #endif - } + } else if (key.p.mode == NL80211_KEY_SET_TX && + wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_EXT_KEY_ID)) { + u8 *mac_addr = NULL; + if (info->attrs[NL80211_ATTR_MAC]) + mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]); + + if (!mac_addr || key.idx < 0 || key.idx > 1) { + err = -EINVAL; + goto out; + } + + err = rdev_add_key(rdev, dev, key.idx, + NL80211_KEYTYPE_PAIRWISE, + mac_addr, &key.p); + } else { + err = -EINVAL; + } out: wdev_unlock(dev->ieee80211_ptr); diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index c1e3210b09e6..18437a9deb54 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -77,7 +77,8 @@ static inline int rdev_add_key(struct cfg80211_registered_device *rdev, struct key_params *params) { int ret; - trace_rdev_add_key(&rdev->wiphy, netdev, key_index, pairwise, mac_addr); + trace_rdev_add_key(&rdev->wiphy, netdev, key_index, pairwise, + mac_addr, params->mode); ret = rdev->ops->add_key(&rdev->wiphy, netdev, key_index, pairwise, mac_addr, params); trace_rdev_return_int(&rdev->wiphy, ret); diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 2dda5291fc01..488ef2ce8231 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -430,22 +430,43 @@ DECLARE_EVENT_CLASS(key_handle, BOOL_TO_STR(__entry->pairwise), MAC_PR_ARG(mac_addr)) ); -DEFINE_EVENT(key_handle, rdev_add_key, +DEFINE_EVENT(key_handle, rdev_get_key, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 key_index, bool pairwise, const u8 *mac_addr), TP_ARGS(wiphy, netdev, key_index, pairwise, mac_addr) ); -DEFINE_EVENT(key_handle, rdev_get_key, +DEFINE_EVENT(key_handle, rdev_del_key, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 key_index, bool pairwise, const u8 *mac_addr), TP_ARGS(wiphy, netdev, key_index, pairwise, mac_addr) ); -DEFINE_EVENT(key_handle, rdev_del_key, +TRACE_EVENT(rdev_add_key, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 key_index, - bool pairwise, const u8 *mac_addr), - TP_ARGS(wiphy, netdev, key_index, pairwise, mac_addr) + bool pairwise, const u8 *mac_addr, u8 mode), + TP_ARGS(wiphy, netdev, key_index, pairwise, mac_addr, mode), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + MAC_ENTRY(mac_addr) + __field(u8, key_index) + __field(bool, pairwise) + __field(u8, mode) + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + MAC_ASSIGN(mac_addr, mac_addr); + __entry->key_index = key_index; + __entry->pairwise = pairwise; + __entry->mode = mode; + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", key_index: %u, " + "mode: %u, pairwise: %s, mac addr: " MAC_PR_FMT, + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->key_index, + __entry->mode, BOOL_TO_STR(__entry->pairwise), + MAC_PR_ARG(mac_addr)) ); TRACE_EVENT(rdev_set_default_key, diff --git a/net/wireless/util.c b/net/wireless/util.c index e4b8db5e81ec..6c02c9cf7aa9 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -237,14 +237,23 @@ int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev, case WLAN_CIPHER_SUITE_CCMP_256: case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: - /* Disallow pairwise keys with non-zero index unless it's WEP - * or a vendor specific cipher (because current deployments use - * pairwise WEP keys with non-zero indices and for vendor - * specific ciphers this should be validated in the driver or - * hardware level - but 802.11i clearly specifies to use zero) + /* IEEE802.11-2016 allows only 0 and - when using Extended Key + * ID - 1 as index for pairwise keys. + * @NL80211_KEY_NO_TX is only allowed for pairwise keys when + * the driver supports Extended Key ID. + * @NL80211_KEY_SET_TX can't be set when installing and + * validating a key. */ - if (pairwise && key_idx) + if (params->mode == NL80211_KEY_NO_TX) { + if (!wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_EXT_KEY_ID)) + return -EINVAL; + else if (!pairwise || key_idx < 0 || key_idx > 1) + return -EINVAL; + } else if ((pairwise && key_idx) || + params->mode == NL80211_KEY_SET_TX) { return -EINVAL; + } break; case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: -- cgit v1.2.3 From e96d1cd2635c05efdd01b4eafcfc50c22c40751f Mon Sep 17 00:00:00 2001 From: Ashok Raj Nagarajan Date: Fri, 29 Mar 2019 16:18:21 +0530 Subject: cfg80211: Add support to set tx power for a station associated This patch adds support to set transmit power setting type and transmit power level attributes to NL80211_CMD_SET_STATION in order to facilitate adjusting the transmit power level of a station associated to the AP. The added attributes allow selection of automatic and limited transmit power level, with the level defined in dBm format. Co-developed-by: Balaji Pothunoori Signed-off-by: Ashok Raj Nagarajan Signed-off-by: Balaji Pothunoori Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 22 ++++++++++++++++++++++ include/uapi/linux/nl80211.h | 15 +++++++++++++++ net/wireless/nl80211.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 80 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 2b039802ae2e..2ea04e94b522 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -975,6 +975,27 @@ enum station_parameters_apply_mask { STATION_PARAM_APPLY_UAPSD = BIT(0), STATION_PARAM_APPLY_CAPABILITY = BIT(1), STATION_PARAM_APPLY_PLINK_STATE = BIT(2), + STATION_PARAM_APPLY_STA_TXPOWER = BIT(3), +}; + +/** + * struct sta_txpwr - station txpower configuration + * + * Used to configure txpower for station. + * + * @power: tx power (in dBm) to be used for sending data traffic. If tx power + * is not provided, the default per-interface tx power setting will be + * overriding. Driver should be picking up the lowest tx power, either tx + * power per-interface or per-station. + * @type: In particular if TPC %type is NL80211_TX_POWER_LIMITED then tx power + * will be less than or equal to specified from userspace, whereas if TPC + * %type is NL80211_TX_POWER_AUTOMATIC then it indicates default tx power. + * NL80211_TX_POWER_FIXED is not a valid configuration option for + * per peer TPC. + */ +struct sta_txpwr { + s16 power; + enum nl80211_tx_power_setting type; }; /** @@ -1049,6 +1070,7 @@ struct station_parameters { const struct ieee80211_he_cap_elem *he_capa; u8 he_capa_len; u16 airtime_weight; + struct sta_txpwr txpwr; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index e75615bf4453..25f70dd2b583 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2315,6 +2315,15 @@ enum nl80211_commands { * @NL80211_ATTR_AIRTIME_WEIGHT: Station's weight when scheduled by the airtime * scheduler. * + * @NL80211_ATTR_STA_TX_POWER_SETTING: Transmit power setting type (u8) for + * station associated with the AP. See &enum nl80211_tx_power_setting for + * possible values. + * @NL80211_ATTR_STA_TX_POWER: Transmit power level (s16) in dBm units. This + * allows to set Tx power for a station. If this attribute is not included, + * the default per-interface tx power setting will be overriding. Driver + * should be picking up the lowest tx power, either tx power per-interface + * or per-station. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -2765,6 +2774,8 @@ enum nl80211_attrs { NL80211_ATTR_PEER_MEASUREMENTS, NL80211_ATTR_AIRTIME_WEIGHT, + NL80211_ATTR_STA_TX_POWER_SETTING, + NL80211_ATTR_STA_TX_POWER, /* add attributes here, update the policy in nl80211.c */ @@ -5391,6 +5402,9 @@ enum nl80211_feature_flags { * @NL80211_EXT_FEATURE_SCHED_SCAN_BAND_SPECIFIC_RSSI_THOLD: Driver supports * filtering of sched scan results using band specific RSSI thresholds. * + * @NL80211_EXT_FEATURE_STA_TX_PWR: This driver supports controlling tx power + * to a station. + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. */ @@ -5434,6 +5448,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_AP_PMKSA_CACHING, NL80211_EXT_FEATURE_SCHED_SCAN_BAND_SPECIFIC_RSSI_THOLD, NL80211_EXT_FEATURE_EXT_KEY_ID, + NL80211_EXT_FEATURE_STA_TX_PWR, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 64f191244c67..0524a6fb84ad 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -331,6 +331,11 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { .len = NL80211_MAX_SUPP_RATES }, [NL80211_ATTR_STA_PLINK_ACTION] = NLA_POLICY_MAX(NLA_U8, NUM_NL80211_PLINK_ACTIONS - 1), + [NL80211_ATTR_STA_TX_POWER_SETTING] = + NLA_POLICY_RANGE(NLA_U8, + NL80211_TX_POWER_AUTOMATIC, + NL80211_TX_POWER_FIXED), + [NL80211_ATTR_STA_TX_POWER] = { .type = NLA_S16 }, [NL80211_ATTR_STA_VLAN] = { .type = NLA_U32 }, [NL80211_ATTR_MNTR_FLAGS] = { /* NLA_NESTED can't be empty */ }, [NL80211_ATTR_MESH_ID] = { .type = NLA_BINARY, @@ -5420,6 +5425,36 @@ static int nl80211_set_station_tdls(struct genl_info *info, return nl80211_parse_sta_wme(info, params); } +static int nl80211_parse_sta_txpower_setting(struct genl_info *info, + struct station_parameters *params) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + int idx; + + if (info->attrs[NL80211_ATTR_STA_TX_POWER_SETTING]) { + if (!rdev->ops->set_tx_power || + !wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_STA_TX_PWR)) + return -EOPNOTSUPP; + + idx = NL80211_ATTR_STA_TX_POWER_SETTING; + params->txpwr.type = nla_get_u8(info->attrs[idx]); + + if (params->txpwr.type == NL80211_TX_POWER_LIMITED) { + idx = NL80211_ATTR_STA_TX_POWER; + + if (info->attrs[idx]) + params->txpwr.power = + nla_get_s16(info->attrs[idx]); + else + return -EINVAL; + } + params->sta_modify_mask |= STATION_PARAM_APPLY_STA_TXPOWER; + } + + return 0; +} + static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; @@ -5513,6 +5548,10 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info) NL80211_EXT_FEATURE_AIRTIME_FAIRNESS)) return -EOPNOTSUPP; + err = nl80211_parse_sta_txpower_setting(info, ¶ms); + if (err) + return err; + /* Include parameters for TDLS peer (will check later) */ err = nl80211_set_station_tdls(info, ¶ms); if (err) @@ -5650,6 +5689,10 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) NL80211_EXT_FEATURE_AIRTIME_FAIRNESS)) return -EOPNOTSUPP; + err = nl80211_parse_sta_txpower_setting(info, ¶ms); + if (err) + return err; + err = nl80211_parse_sta_channel_info(info, ¶ms); if (err) return err; -- cgit v1.2.3 From 5ab92e7fe49ad74293b50fb9e6f25be5521e2f68 Mon Sep 17 00:00:00 2001 From: Rajkumar Manoharan Date: Thu, 11 Apr 2019 13:47:24 -0700 Subject: cfg80211: add support to probe unexercised mesh link Adding support to allow mesh HWMP to measure link metrics on unexercised direct mesh path by sending some data frames to other mesh points which are not currently selected as a primary traffic path but only 1 hop away. The absence of the primary path to the chosen node makes it necessary to apply some form of marking on a chosen packet stream so that the packets can be properly steered to the selected node for testing, and not by the regular mesh path lookup. Tested-by: Pradeep Kumar Chitrapu Signed-off-by: Rajkumar Manoharan Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 5 +++++ include/uapi/linux/nl80211.h | 17 ++++++++++++++++ net/wireless/nl80211.c | 48 ++++++++++++++++++++++++++++++++++++++++++++ net/wireless/rdev-ops.h | 13 ++++++++++++ net/wireless/trace.h | 18 +++++++++++++++++ 5 files changed, 101 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 944de1802210..298301525f9f 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -3501,6 +3501,9 @@ struct cfg80211_update_owe_info { * @update_owe_info: Provide updated OWE info to driver. Driver implementing SME * but offloading OWE processing to the user space will get the updated * DH IE through this interface. + * + * @probe_mesh_link: Probe direct Mesh peer's link quality by sending data frame + * and overrule HWMP path selection algorithm. */ struct cfg80211_ops { int (*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow); @@ -3817,6 +3820,8 @@ struct cfg80211_ops { struct cfg80211_pmsr_request *request); int (*update_owe_info)(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_update_owe_info *owe_info); + int (*probe_mesh_link)(struct wiphy *wiphy, struct net_device *dev, + const u8 *buf, size_t len); }; /* diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 25f70dd2b583..6f09d1500960 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1070,6 +1070,21 @@ * OWE AKM by the host drivers that implement SME but rely * on the user space for the cryptographic/DH IE processing in AP mode. * + * @NL80211_CMD_PROBE_MESH_LINK: The requirement for mesh link metric + * refreshing, is that from one mesh point we be able to send some data + * frames to other mesh points which are not currently selected as a + * primary traffic path, but which are only 1 hop away. The absence of + * the primary path to the chosen node makes it necessary to apply some + * form of marking on a chosen packet stream so that the packets can be + * properly steered to the selected node for testing, and not by the + * regular mesh path lookup. Further, the packets must be of type data + * so that the rate control (often embedded in firmware) is used for + * rate selection. + * + * Here attribute %NL80211_ATTR_MAC is used to specify connected mesh + * peer MAC address and %NL80211_ATTR_FRAME is used to specify the frame + * content. The frame is ethernet data. + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -1292,6 +1307,8 @@ enum nl80211_commands { NL80211_CMD_UPDATE_OWE_INFO, + NL80211_CMD_PROBE_MESH_LINK, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 5dfc4dba9e56..3aecdd3d5b07 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -13404,6 +13404,47 @@ static int nl80211_update_owe_info(struct sk_buff *skb, struct genl_info *info) return rdev_update_owe_info(rdev, dev, &owe_info); } +static int nl80211_probe_mesh_link(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct station_info sinfo = {}; + const u8 *buf; + size_t len; + u8 *dest; + int err; + + if (!rdev->ops->probe_mesh_link || !rdev->ops->get_station) + return -EOPNOTSUPP; + + if (!info->attrs[NL80211_ATTR_MAC] || + !info->attrs[NL80211_ATTR_FRAME]) { + GENL_SET_ERR_MSG(info, "Frame or MAC missing"); + return -EINVAL; + } + + if (wdev->iftype != NL80211_IFTYPE_MESH_POINT) + return -EOPNOTSUPP; + + dest = nla_data(info->attrs[NL80211_ATTR_MAC]); + buf = nla_data(info->attrs[NL80211_ATTR_FRAME]); + len = nla_len(info->attrs[NL80211_ATTR_FRAME]); + + if (len < sizeof(struct ethhdr)) + return -EINVAL; + + if (!ether_addr_equal(buf, dest) || is_multicast_ether_addr(buf) || + !ether_addr_equal(buf + ETH_ALEN, dev->dev_addr)) + return -EINVAL; + + err = rdev_get_station(rdev, dev, dest, &sinfo); + if (err) + return err; + + return rdev_probe_mesh_link(rdev, dev, dest, buf, len); +} + #define NL80211_FLAG_NEED_WIPHY 0x01 #define NL80211_FLAG_NEED_NETDEV 0x02 #define NL80211_FLAG_NEED_RTNL 0x04 @@ -14241,6 +14282,13 @@ static const struct genl_ops nl80211_ops[] = { .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, + { + .cmd = NL80211_CMD_PROBE_MESH_LINK, + .doit = nl80211_probe_mesh_link, + .flags = GENL_UNS_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, }; static struct genl_family nl80211_fam __ro_after_init = { diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index 18437a9deb54..e853a4fe6f97 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -1286,4 +1286,17 @@ static inline int rdev_update_owe_info(struct cfg80211_registered_device *rdev, return ret; } +static inline int +rdev_probe_mesh_link(struct cfg80211_registered_device *rdev, + struct net_device *dev, const u8 *dest, + const void *buf, size_t len) +{ + int ret; + + trace_rdev_probe_mesh_link(&rdev->wiphy, dev, dest, buf, len); + ret = rdev->ops->probe_mesh_link(&rdev->wiphy, dev, buf, len); + trace_rdev_return_int(&rdev->wiphy, ret); + return ret; +} + #endif /* __CFG80211_RDEV_OPS */ diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 488ef2ce8231..2abfff925aac 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -3421,6 +3421,24 @@ TRACE_EVENT(cfg80211_update_owe_info_event, WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer)) ); +TRACE_EVENT(rdev_probe_mesh_link, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + const u8 *dest, const u8 *buf, size_t len), + TP_ARGS(wiphy, netdev, dest, buf, len), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + MAC_ENTRY(dest) + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + MAC_ASSIGN(dest, dest); + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT, + WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(dest)) +); + #endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */ #undef TRACE_INCLUDE_PATH -- cgit v1.2.3 From 9df1c28bb75217b244257152ab7d788bb2a386d0 Mon Sep 17 00:00:00 2001 From: Matt Mullins Date: Fri, 26 Apr 2019 11:49:47 -0700 Subject: bpf: add writable context for raw tracepoints This is an opt-in interface that allows a tracepoint to provide a safe buffer that can be written from a BPF_PROG_TYPE_RAW_TRACEPOINT program. The size of the buffer must be a compile-time constant, and is checked before allowing a BPF program to attach to a tracepoint that uses this feature. The pointer to this buffer will be the first argument of tracepoints that opt in; the pointer is valid and can be bpf_probe_read() by both BPF_PROG_TYPE_RAW_TRACEPOINT and BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE programs that attach to such a tracepoint, but the buffer to which it points may only be written by the latter. Signed-off-by: Matt Mullins Acked-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ include/linux/bpf_types.h | 1 + include/linux/tracepoint-defs.h | 1 + include/trace/bpf_probe.h | 27 +++++++++++++++++++++++++-- include/uapi/linux/bpf.h | 1 + kernel/bpf/syscall.c | 8 ++++++-- kernel/bpf/verifier.c | 31 +++++++++++++++++++++++++++++++ kernel/trace/bpf_trace.c | 24 ++++++++++++++++++++++++ 8 files changed, 91 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f15432d90728..cd6341eabd74 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -272,6 +272,7 @@ enum bpf_reg_type { PTR_TO_SOCK_COMMON_OR_NULL, /* reg points to sock_common or NULL */ PTR_TO_TCP_SOCK, /* reg points to struct tcp_sock */ PTR_TO_TCP_SOCK_OR_NULL, /* reg points to struct tcp_sock or NULL */ + PTR_TO_TP_BUFFER, /* reg points to a writable raw tp's buffer */ }; /* The information passed from prog-specific *_is_valid_access @@ -361,6 +362,7 @@ struct bpf_prog_aux { u32 used_map_cnt; u32 max_ctx_offset; u32 max_pkt_offset; + u32 max_tp_access; u32 stack_depth; u32 id; u32 func_cnt; /* used by non-func prog as the number of func progs */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index d26991a16894..a10d37bce364 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -25,6 +25,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe) BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint) BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event) BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT, raw_tracepoint) +BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, raw_tracepoint_writable) #endif #ifdef CONFIG_CGROUP_BPF BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev) diff --git a/include/linux/tracepoint-defs.h b/include/linux/tracepoint-defs.h index 49ba9cde7e4b..b29950a19205 100644 --- a/include/linux/tracepoint-defs.h +++ b/include/linux/tracepoint-defs.h @@ -45,6 +45,7 @@ struct bpf_raw_event_map { struct tracepoint *tp; void *bpf_func; u32 num_args; + u32 writable_size; } __aligned(32); #endif diff --git a/include/trace/bpf_probe.h b/include/trace/bpf_probe.h index 505dae0bed80..d6e556c0a085 100644 --- a/include/trace/bpf_probe.h +++ b/include/trace/bpf_probe.h @@ -69,8 +69,7 @@ __bpf_trace_##call(void *__data, proto) \ * to make sure that if the tracepoint handling changes, the * bpf probe will fail to compile unless it too is updated. */ -#undef DEFINE_EVENT -#define DEFINE_EVENT(template, call, proto, args) \ +#define __DEFINE_EVENT(template, call, proto, args, size) \ static inline void bpf_test_probe_##call(void) \ { \ check_trace_callback_type_##call(__bpf_trace_##template); \ @@ -81,12 +80,36 @@ __bpf_trace_tp_map_##call = { \ .tp = &__tracepoint_##call, \ .bpf_func = (void *)__bpf_trace_##template, \ .num_args = COUNT_ARGS(args), \ + .writable_size = size, \ }; +#define FIRST(x, ...) x + +#undef DEFINE_EVENT_WRITABLE +#define DEFINE_EVENT_WRITABLE(template, call, proto, args, size) \ +static inline void bpf_test_buffer_##call(void) \ +{ \ + /* BUILD_BUG_ON() is ignored if the code is completely eliminated, but \ + * BUILD_BUG_ON_ZERO() uses a different mechanism that is not \ + * dead-code-eliminated. \ + */ \ + FIRST(proto); \ + (void)BUILD_BUG_ON_ZERO(size != sizeof(*FIRST(args))); \ +} \ +__DEFINE_EVENT(template, call, PARAMS(proto), PARAMS(args), size) + +#undef DEFINE_EVENT +#define DEFINE_EVENT(template, call, proto, args) \ + __DEFINE_EVENT(template, call, PARAMS(proto), PARAMS(args), 0) #undef DEFINE_EVENT_PRINT #define DEFINE_EVENT_PRINT(template, name, proto, args, print) \ DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args)) #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) + +#undef DEFINE_EVENT_WRITABLE +#undef __DEFINE_EVENT +#undef FIRST + #endif /* CONFIG_BPF_EVENTS */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index eaf2d3284248..f7fa7a34a62d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -168,6 +168,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_SK_REUSEPORT, BPF_PROG_TYPE_FLOW_DISSECTOR, BPF_PROG_TYPE_CGROUP_SYSCTL, + BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, }; enum bpf_attach_type { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b0de49598341..ae141e745f92 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1789,12 +1789,16 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) } raw_tp->btp = btp; - prog = bpf_prog_get_type(attr->raw_tracepoint.prog_fd, - BPF_PROG_TYPE_RAW_TRACEPOINT); + prog = bpf_prog_get(attr->raw_tracepoint.prog_fd); if (IS_ERR(prog)) { err = PTR_ERR(prog); goto out_free_tp; } + if (prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT && + prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE) { + err = -EINVAL; + goto out_put_prog; + } err = bpf_probe_register(raw_tp->btp, prog); if (err) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 423f242a5efb..2ef442c62c0e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -405,6 +405,7 @@ static const char * const reg_type_str[] = { [PTR_TO_SOCK_COMMON_OR_NULL] = "sock_common_or_null", [PTR_TO_TCP_SOCK] = "tcp_sock", [PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null", + [PTR_TO_TP_BUFFER] = "tp_buffer", }; static char slot_type_char[] = { @@ -1993,6 +1994,32 @@ static int check_ctx_reg(struct bpf_verifier_env *env, return 0; } +static int check_tp_buffer_access(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, + int regno, int off, int size) +{ + if (off < 0) { + verbose(env, + "R%d invalid tracepoint buffer access: off=%d, size=%d", + regno, off, size); + return -EACCES; + } + if (!tnum_is_const(reg->var_off) || reg->var_off.value) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, + "R%d invalid variable buffer offset: off=%d, var_off=%s", + regno, off, tn_buf); + return -EACCES; + } + if (off + size > env->prog->aux->max_tp_access) + env->prog->aux->max_tp_access = off + size; + + return 0; +} + + /* truncate register to smaller size (in bytes) * must be called with size < BPF_REG_SIZE */ @@ -2137,6 +2164,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn err = check_sock_access(env, insn_idx, regno, off, size, t); if (!err && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); + } else if (reg->type == PTR_TO_TP_BUFFER) { + err = check_tp_buffer_access(env, reg, regno, off, size); + if (!err && t == BPF_READ && value_regno >= 0) + mark_reg_unknown(env, regs, value_regno); } else { verbose(env, "R%d invalid mem access '%s'\n", regno, reg_type_str[reg->type]); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 91800be0c8eb..8607aba1d882 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -915,6 +915,27 @@ const struct bpf_verifier_ops raw_tracepoint_verifier_ops = { const struct bpf_prog_ops raw_tracepoint_prog_ops = { }; +static bool raw_tp_writable_prog_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (off == 0) { + if (size != sizeof(u64) || type != BPF_READ) + return false; + info->reg_type = PTR_TO_TP_BUFFER; + } + return raw_tp_prog_is_valid_access(off, size, type, prog, info); +} + +const struct bpf_verifier_ops raw_tracepoint_writable_verifier_ops = { + .get_func_proto = raw_tp_prog_func_proto, + .is_valid_access = raw_tp_writable_prog_is_valid_access, +}; + +const struct bpf_prog_ops raw_tracepoint_writable_prog_ops = { +}; + static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) @@ -1204,6 +1225,9 @@ static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog * if (prog->aux->max_ctx_offset > btp->num_args * sizeof(u64)) return -EINVAL; + if (prog->aux->max_tp_access > btp->writable_size) + return -EINVAL; + return tracepoint_probe_register(tp, (void *)btp->bpf_func, prog); } -- cgit v1.2.3 From 6ac99e8f23d4b10258406ca0dd7bffca5f31da9d Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 26 Apr 2019 16:39:39 -0700 Subject: bpf: Introduce bpf sk local storage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After allowing a bpf prog to - directly read the skb->sk ptr - get the fullsock bpf_sock by "bpf_sk_fullsock()" - get the bpf_tcp_sock by "bpf_tcp_sock()" - get the listener sock by "bpf_get_listener_sock()" - avoid duplicating the fields of "(bpf_)sock" and "(bpf_)tcp_sock" into different bpf running context. this patch is another effort to make bpf's network programming more intuitive to do (together with memory and performance benefit). When bpf prog needs to store data for a sk, the current practice is to define a map with the usual 4-tuples (src/dst ip/port) as the key. If multiple bpf progs require to store different sk data, multiple maps have to be defined. Hence, wasting memory to store the duplicated keys (i.e. 4 tuples here) in each of the bpf map. [ The smallest key could be the sk pointer itself which requires some enhancement in the verifier and it is a separate topic. ] Also, the bpf prog needs to clean up the elem when sk is freed. Otherwise, the bpf map will become full and un-usable quickly. The sk-free tracking currently could be done during sk state transition (e.g. BPF_SOCK_OPS_STATE_CB). The size of the map needs to be predefined which then usually ended-up with an over-provisioned map in production. Even the map was re-sizable, while the sk naturally come and go away already, this potential re-size operation is arguably redundant if the data can be directly connected to the sk itself instead of proxy-ing through a bpf map. This patch introduces sk->sk_bpf_storage to provide local storage space at sk for bpf prog to use. The space will be allocated when the first bpf prog has created data for this particular sk. The design optimizes the bpf prog's lookup (and then optionally followed by an inline update). bpf_spin_lock should be used if the inline update needs to be protected. BPF_MAP_TYPE_SK_STORAGE: ----------------------- To define a bpf "sk-local-storage", a BPF_MAP_TYPE_SK_STORAGE map (new in this patch) needs to be created. Multiple BPF_MAP_TYPE_SK_STORAGE maps can be created to fit different bpf progs' needs. The map enforces BTF to allow printing the sk-local-storage during a system-wise sk dump (e.g. "ss -ta") in the future. The purpose of a BPF_MAP_TYPE_SK_STORAGE map is not for lookup/update/delete a "sk-local-storage" data from a particular sk. Think of the map as a meta-data (or "type") of a "sk-local-storage". This particular "type" of "sk-local-storage" data can then be stored in any sk. The main purposes of this map are mostly: 1. Define the size of a "sk-local-storage" type. 2. Provide a similar syscall userspace API as the map (e.g. lookup/update, map-id, map-btf...etc.) 3. Keep track of all sk's storages of this "type" and clean them up when the map is freed. sk->sk_bpf_storage: ------------------ The main lookup/update/delete is done on sk->sk_bpf_storage (which is a "struct bpf_sk_storage"). When doing a lookup, the "map" pointer is now used as the "key" to search on the sk_storage->list. The "map" pointer is actually serving as the "type" of the "sk-local-storage" that is being requested. To allow very fast lookup, it should be as fast as looking up an array at a stable-offset. At the same time, it is not ideal to set a hard limit on the number of sk-local-storage "type" that the system can have. Hence, this patch takes a cache approach. The last search result from sk_storage->list is cached in sk_storage->cache[] which is a stable sized array. Each "sk-local-storage" type has a stable offset to the cache[] array. In the future, a map's flag could be introduced to do cache opt-out/enforcement if it became necessary. The cache size is 16 (i.e. 16 types of "sk-local-storage"). Programs can share map. On the program side, having a few bpf_progs running in the networking hotpath is already a lot. The bpf_prog should have already consolidated the existing sock-key-ed map usage to minimize the map lookup penalty. 16 has enough runway to grow. All sk-local-storage data will be removed from sk->sk_bpf_storage during sk destruction. bpf_sk_storage_get() and bpf_sk_storage_delete(): ------------------------------------------------ Instead of using bpf_map_(lookup|update|delete)_elem(), the bpf prog needs to use the new helper bpf_sk_storage_get() and bpf_sk_storage_delete(). The verifier can then enforce the ARG_PTR_TO_SOCKET argument. The bpf_sk_storage_get() also allows to "create" new elem if one does not exist in the sk. It is done by the new BPF_SK_STORAGE_GET_F_CREATE flag. An optional value can also be provided as the initial value during BPF_SK_STORAGE_GET_F_CREATE. The BPF_MAP_TYPE_SK_STORAGE also supports bpf_spin_lock. Together, it has eliminated the potential use cases for an equivalent bpf_map_update_elem() API (for bpf_prog) in this patch. Misc notes: ---------- 1. map_get_next_key is not supported. From the userspace syscall perspective, the map has the socket fd as the key while the map can be shared by pinned-file or map-id. Since btf is enforced, the existing "ss" could be enhanced to pretty print the local-storage. Supporting a kernel defined btf with 4 tuples as the return key could be explored later also. 2. The sk->sk_lock cannot be acquired. Atomic operations is used instead. e.g. cmpxchg is done on the sk->sk_bpf_storage ptr. Please refer to the source code comments for the details in synchronization cases and considerations. 3. The mem is charged to the sk->sk_omem_alloc as the sk filter does. Benchmark: --------- Here is the benchmark data collected by turning on the "kernel.bpf_stats_enabled" sysctl. Two bpf progs are tested: One bpf prog with the usual bpf hashmap (max_entries = 8192) with the sk ptr as the key. (verifier is modified to support sk ptr as the key That should have shortened the key lookup time.) Another bpf prog is with the new BPF_MAP_TYPE_SK_STORAGE. Both are storing a "u32 cnt", do a lookup on "egress_skb/cgroup" for each egress skb and then bump the cnt. netperf is used to drive data with 4096 connected UDP sockets. BPF_MAP_TYPE_HASH with a modifier verifier (152ns per bpf run) 27: cgroup_skb name egress_sk_map tag 74f56e832918070b run_time_ns 58280107540 run_cnt 381347633 loaded_at 2019-04-15T13:46:39-0700 uid 0 xlated 344B jited 258B memlock 4096B map_ids 16 btf_id 5 BPF_MAP_TYPE_SK_STORAGE in this patch (66ns per bpf run) 30: cgroup_skb name egress_sk_stora tag d4aa70984cc7bbf6 run_time_ns 25617093319 run_cnt 390989739 loaded_at 2019-04-15T13:47:54-0700 uid 0 xlated 168B jited 156B memlock 4096B map_ids 17 btf_id 6 Here is a high-level picture on how are the objects organized: sk ┌──────┐ │ │ │ │ │ │ │*sk_bpf_storage─────▶ bpf_sk_storage └──────┘ ┌───────┐ ┌───────────┤ list │ │ │ │ │ │ │ │ │ │ │ └───────┘ │ │ elem │ ┌────────┐ ├─▶│ snode │ │ ├────────┤ │ │ data │ bpf_map │ ├────────┤ ┌─────────┐ │ │map_node│◀─┬─────┤ list │ │ └────────┘ │ │ │ │ │ │ │ │ elem │ │ │ │ ┌────────┐ │ └─────────┘ └─▶│ snode │ │ ├────────┤ │ bpf_map │ data │ │ ┌─────────┐ ├────────┤ │ │ list ├───────▶│map_node│ │ │ │ └────────┘ │ │ │ │ │ │ elem │ └─────────┘ ┌────────┐ │ ┌─▶│ snode │ │ │ ├────────┤ │ │ │ data │ │ │ ├────────┤ │ │ │map_node│◀─┘ │ └────────┘ │ │ │ ┌───────┐ sk └──────────│ list │ ┌──────┐ │ │ │ │ │ │ │ │ │ │ │ │ └───────┘ │*sk_bpf_storage───────▶bpf_sk_storage └──────┘ Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 + include/linux/bpf_types.h | 1 + include/net/bpf_sk_storage.h | 13 + include/net/sock.h | 5 + include/uapi/linux/bpf.h | 44 ++- kernel/bpf/syscall.c | 3 +- kernel/bpf/verifier.c | 27 +- net/bpf/test_run.c | 2 + net/core/Makefile | 1 + net/core/bpf_sk_storage.c | 804 +++++++++++++++++++++++++++++++++++++++++++ net/core/filter.c | 12 + net/core/sock.c | 5 + 12 files changed, 914 insertions(+), 5 deletions(-) create mode 100644 include/net/bpf_sk_storage.h create mode 100644 net/core/bpf_sk_storage.c (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cd6341eabd74..9a21848fdb07 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -184,6 +184,7 @@ enum bpf_arg_type { ARG_PTR_TO_MAP_KEY, /* pointer to stack used as map key */ ARG_PTR_TO_MAP_VALUE, /* pointer to stack used as map value */ ARG_PTR_TO_UNINIT_MAP_VALUE, /* pointer to valid memory used to store a map value */ + ARG_PTR_TO_MAP_VALUE_OR_NULL, /* pointer to stack used as map value or NULL */ /* the following constraints used to prototype bpf_memcmp() and other * functions that access data on eBPF program stack @@ -204,6 +205,7 @@ enum bpf_arg_type { ARG_PTR_TO_SOCK_COMMON, /* pointer to sock_common */ ARG_PTR_TO_INT, /* pointer to int */ ARG_PTR_TO_LONG, /* pointer to long */ + ARG_PTR_TO_SOCKET, /* pointer to bpf_sock (fullsock) */ }; /* type of values returned from helper functions */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index a10d37bce364..5a9975678d6f 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -61,6 +61,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops) #ifdef CONFIG_NET BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_SK_STORAGE, sk_storage_map_ops) #if defined(CONFIG_BPF_STREAM_PARSER) BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops) diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h new file mode 100644 index 000000000000..b9dcb02e756b --- /dev/null +++ b/include/net/bpf_sk_storage.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2019 Facebook */ +#ifndef _BPF_SK_STORAGE_H +#define _BPF_SK_STORAGE_H + +struct sock; + +void bpf_sk_storage_free(struct sock *sk); + +extern const struct bpf_func_proto bpf_sk_storage_get_proto; +extern const struct bpf_func_proto bpf_sk_storage_delete_proto; + +#endif /* _BPF_SK_STORAGE_H */ diff --git a/include/net/sock.h b/include/net/sock.h index 784cd19d5ff7..4d208c0f9c14 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -236,6 +236,8 @@ struct sock_common { /* public: */ }; +struct bpf_sk_storage; + /** * struct sock - network layer representation of sockets * @__sk_common: shared layout with inet_timewait_sock @@ -510,6 +512,9 @@ struct sock { #endif void (*sk_destruct)(struct sock *sk); struct sock_reuseport __rcu *sk_reuseport_cb; +#ifdef CONFIG_BPF_SYSCALL + struct bpf_sk_storage __rcu *sk_bpf_storage; +#endif struct rcu_head sk_rcu; }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f7fa7a34a62d..72336bac7573 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -133,6 +133,7 @@ enum bpf_map_type { BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, BPF_MAP_TYPE_QUEUE, BPF_MAP_TYPE_STACK, + BPF_MAP_TYPE_SK_STORAGE, }; /* Note that tracing related programs such as @@ -2630,6 +2631,42 @@ union bpf_attr { * was provided. * * **-ERANGE** if resulting value was out of range. + * + * void *bpf_sk_storage_get(struct bpf_map *map, struct bpf_sock *sk, void *value, u64 flags) + * Description + * Get a bpf-local-storage from a sk. + * + * Logically, it could be thought of getting the value from + * a *map* with *sk* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem(map, &sk)** except this + * helper enforces the key must be a **bpf_fullsock()** + * and the map must be a BPF_MAP_TYPE_SK_STORAGE also. + * + * Underneath, the value is stored locally at *sk* instead of + * the map. The *map* is used as the bpf-local-storage **type**. + * The bpf-local-storage **type** (i.e. the *map*) is searched + * against all bpf-local-storages residing at sk. + * + * An optional *flags* (BPF_SK_STORAGE_GET_F_CREATE) can be + * used such that a new bpf-local-storage will be + * created if one does not exist. *value* can be used + * together with BPF_SK_STORAGE_GET_F_CREATE to specify + * the initial value of a bpf-local-storage. If *value* is + * NULL, the new bpf-local-storage will be zero initialized. + * Return + * A bpf-local-storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf-local-storage. + * + * int bpf_sk_storage_delete(struct bpf_map *map, struct bpf_sock *sk) + * Description + * Delete a bpf-local-storage from a sk. + * Return + * 0 on success. + * + * **-ENOENT** if the bpf-local-storage cannot be found. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2738,7 +2775,9 @@ union bpf_attr { FN(sysctl_get_new_value), \ FN(sysctl_set_new_value), \ FN(strtol), \ - FN(strtoul), + FN(strtoul), \ + FN(sk_storage_get), \ + FN(sk_storage_delete), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -2814,6 +2853,9 @@ enum bpf_func_id { /* BPF_FUNC_sysctl_get_name flags. */ #define BPF_F_SYSCTL_BASE_NAME (1ULL << 0) +/* BPF_FUNC_sk_storage_get flags */ +#define BPF_SK_STORAGE_GET_F_CREATE (1ULL << 0) + /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ae141e745f92..ad3ccf82f31d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -526,7 +526,8 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, return -EACCES; if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_ARRAY && - map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE) + map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && + map->map_type != BPF_MAP_TYPE_SK_STORAGE) return -ENOTSUPP; if (map->spin_lock_off + sizeof(struct bpf_spin_lock) > map->value_size) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2ef442c62c0e..271717246af3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2543,10 +2543,15 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, if (arg_type == ARG_PTR_TO_MAP_KEY || arg_type == ARG_PTR_TO_MAP_VALUE || - arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) { + arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE || + arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) { expected_type = PTR_TO_STACK; - if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE && - type != expected_type) + if (register_is_null(reg) && + arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) + /* final test in check_stack_boundary() */; + else if (!type_is_pkt_pointer(type) && + type != PTR_TO_MAP_VALUE && + type != expected_type) goto err_type; } else if (arg_type == ARG_CONST_SIZE || arg_type == ARG_CONST_SIZE_OR_ZERO) { @@ -2578,6 +2583,10 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, } meta->ref_obj_id = reg->ref_obj_id; } + } else if (arg_type == ARG_PTR_TO_SOCKET) { + expected_type = PTR_TO_SOCKET; + if (type != expected_type) + goto err_type; } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) { if (meta->func_id == BPF_FUNC_spin_lock) { if (process_spin_lock(env, regno, true)) @@ -2635,6 +2644,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, meta->map_ptr->key_size, false, NULL); } else if (arg_type == ARG_PTR_TO_MAP_VALUE || + (arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL && + !register_is_null(reg)) || arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) { /* bpf_map_xxx(..., map_ptr, ..., value) call: * check [value, value + map->value_size) validity @@ -2784,6 +2795,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_map_push_elem) goto error; break; + case BPF_MAP_TYPE_SK_STORAGE: + if (func_id != BPF_FUNC_sk_storage_get && + func_id != BPF_FUNC_sk_storage_delete) + goto error; + break; default: break; } @@ -2847,6 +2863,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, map->map_type != BPF_MAP_TYPE_STACK) goto error; break; + case BPF_FUNC_sk_storage_get: + case BPF_FUNC_sk_storage_delete: + if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) + goto error; + break; default: break; } diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 6c4694ae4241..33e0dc168c16 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -335,6 +336,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, sizeof(struct __sk_buff)); out: kfree_skb(skb); + bpf_sk_storage_free(sk); kfree(sk); kfree(ctx); return ret; diff --git a/net/core/Makefile b/net/core/Makefile index f97d6254e564..a104dc8faafc 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -34,3 +34,4 @@ obj-$(CONFIG_HWBM) += hwbm.o obj-$(CONFIG_NET_DEVLINK) += devlink.o obj-$(CONFIG_GRO_CELLS) += gro_cells.o obj-$(CONFIG_FAILOVER) += failover.o +obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c new file mode 100644 index 000000000000..a8e9ac71b22d --- /dev/null +++ b/net/core/bpf_sk_storage.c @@ -0,0 +1,804 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2019 Facebook */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static atomic_t cache_idx; + +struct bucket { + struct hlist_head list; + raw_spinlock_t lock; +}; + +/* Thp map is not the primary owner of a bpf_sk_storage_elem. + * Instead, the sk->sk_bpf_storage is. + * + * The map (bpf_sk_storage_map) is for two purposes + * 1. Define the size of the "sk local storage". It is + * the map's value_size. + * + * 2. Maintain a list to keep track of all elems such + * that they can be cleaned up during the map destruction. + * + * When a bpf local storage is being looked up for a + * particular sk, the "bpf_map" pointer is actually used + * as the "key" to search in the list of elem in + * sk->sk_bpf_storage. + * + * Hence, consider sk->sk_bpf_storage is the mini-map + * with the "bpf_map" pointer as the searching key. + */ +struct bpf_sk_storage_map { + struct bpf_map map; + /* Lookup elem does not require accessing the map. + * + * Updating/Deleting requires a bucket lock to + * link/unlink the elem from the map. Having + * multiple buckets to improve contention. + */ + struct bucket *buckets; + u32 bucket_log; + u16 elem_size; + u16 cache_idx; +}; + +struct bpf_sk_storage_data { + /* smap is used as the searching key when looking up + * from sk->sk_bpf_storage. + * + * Put it in the same cacheline as the data to minimize + * the number of cachelines access during the cache hit case. + */ + struct bpf_sk_storage_map __rcu *smap; + u8 data[0] __aligned(8); +}; + +/* Linked to bpf_sk_storage and bpf_sk_storage_map */ +struct bpf_sk_storage_elem { + struct hlist_node map_node; /* Linked to bpf_sk_storage_map */ + struct hlist_node snode; /* Linked to bpf_sk_storage */ + struct bpf_sk_storage __rcu *sk_storage; + struct rcu_head rcu; + /* 8 bytes hole */ + /* The data is stored in aother cacheline to minimize + * the number of cachelines access during a cache hit. + */ + struct bpf_sk_storage_data sdata ____cacheline_aligned; +}; + +#define SELEM(_SDATA) container_of((_SDATA), struct bpf_sk_storage_elem, sdata) +#define SDATA(_SELEM) (&(_SELEM)->sdata) +#define BPF_SK_STORAGE_CACHE_SIZE 16 + +struct bpf_sk_storage { + struct bpf_sk_storage_data __rcu *cache[BPF_SK_STORAGE_CACHE_SIZE]; + struct hlist_head list; /* List of bpf_sk_storage_elem */ + struct sock *sk; /* The sk that owns the the above "list" of + * bpf_sk_storage_elem. + */ + struct rcu_head rcu; + raw_spinlock_t lock; /* Protect adding/removing from the "list" */ +}; + +static struct bucket *select_bucket(struct bpf_sk_storage_map *smap, + struct bpf_sk_storage_elem *selem) +{ + return &smap->buckets[hash_ptr(selem, smap->bucket_log)]; +} + +static int omem_charge(struct sock *sk, unsigned int size) +{ + /* same check as in sock_kmalloc() */ + if (size <= sysctl_optmem_max && + atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) { + atomic_add(size, &sk->sk_omem_alloc); + return 0; + } + + return -ENOMEM; +} + +static bool selem_linked_to_sk(const struct bpf_sk_storage_elem *selem) +{ + return !hlist_unhashed(&selem->snode); +} + +static bool selem_linked_to_map(const struct bpf_sk_storage_elem *selem) +{ + return !hlist_unhashed(&selem->map_node); +} + +static struct bpf_sk_storage_elem *selem_alloc(struct bpf_sk_storage_map *smap, + struct sock *sk, void *value, + bool charge_omem) +{ + struct bpf_sk_storage_elem *selem; + + if (charge_omem && omem_charge(sk, smap->elem_size)) + return NULL; + + selem = kzalloc(smap->elem_size, GFP_ATOMIC | __GFP_NOWARN); + if (selem) { + if (value) + memcpy(SDATA(selem)->data, value, smap->map.value_size); + return selem; + } + + if (charge_omem) + atomic_sub(smap->elem_size, &sk->sk_omem_alloc); + + return NULL; +} + +/* sk_storage->lock must be held and selem->sk_storage == sk_storage. + * The caller must ensure selem->smap is still valid to be + * dereferenced for its smap->elem_size and smap->cache_idx. + */ +static bool __selem_unlink_sk(struct bpf_sk_storage *sk_storage, + struct bpf_sk_storage_elem *selem, + bool uncharge_omem) +{ + struct bpf_sk_storage_map *smap; + bool free_sk_storage; + struct sock *sk; + + smap = rcu_dereference(SDATA(selem)->smap); + sk = sk_storage->sk; + + /* All uncharging on sk->sk_omem_alloc must be done first. + * sk may be freed once the last selem is unlinked from sk_storage. + */ + if (uncharge_omem) + atomic_sub(smap->elem_size, &sk->sk_omem_alloc); + + free_sk_storage = hlist_is_singular_node(&selem->snode, + &sk_storage->list); + if (free_sk_storage) { + atomic_sub(sizeof(struct bpf_sk_storage), &sk->sk_omem_alloc); + sk_storage->sk = NULL; + /* After this RCU_INIT, sk may be freed and cannot be used */ + RCU_INIT_POINTER(sk->sk_bpf_storage, NULL); + + /* sk_storage is not freed now. sk_storage->lock is + * still held and raw_spin_unlock_bh(&sk_storage->lock) + * will be done by the caller. + * + * Although the unlock will be done under + * rcu_read_lock(), it is more intutivie to + * read if kfree_rcu(sk_storage, rcu) is done + * after the raw_spin_unlock_bh(&sk_storage->lock). + * + * Hence, a "bool free_sk_storage" is returned + * to the caller which then calls the kfree_rcu() + * after unlock. + */ + } + hlist_del_init_rcu(&selem->snode); + if (rcu_access_pointer(sk_storage->cache[smap->cache_idx]) == + SDATA(selem)) + RCU_INIT_POINTER(sk_storage->cache[smap->cache_idx], NULL); + + kfree_rcu(selem, rcu); + + return free_sk_storage; +} + +static void selem_unlink_sk(struct bpf_sk_storage_elem *selem) +{ + struct bpf_sk_storage *sk_storage; + bool free_sk_storage = false; + + if (unlikely(!selem_linked_to_sk(selem))) + /* selem has already been unlinked from sk */ + return; + + sk_storage = rcu_dereference(selem->sk_storage); + raw_spin_lock_bh(&sk_storage->lock); + if (likely(selem_linked_to_sk(selem))) + free_sk_storage = __selem_unlink_sk(sk_storage, selem, true); + raw_spin_unlock_bh(&sk_storage->lock); + + if (free_sk_storage) + kfree_rcu(sk_storage, rcu); +} + +/* sk_storage->lock must be held and sk_storage->list cannot be empty */ +static void __selem_link_sk(struct bpf_sk_storage *sk_storage, + struct bpf_sk_storage_elem *selem) +{ + RCU_INIT_POINTER(selem->sk_storage, sk_storage); + hlist_add_head(&selem->snode, &sk_storage->list); +} + +static void selem_unlink_map(struct bpf_sk_storage_elem *selem) +{ + struct bpf_sk_storage_map *smap; + struct bucket *b; + + if (unlikely(!selem_linked_to_map(selem))) + /* selem has already be unlinked from smap */ + return; + + smap = rcu_dereference(SDATA(selem)->smap); + b = select_bucket(smap, selem); + raw_spin_lock_bh(&b->lock); + if (likely(selem_linked_to_map(selem))) + hlist_del_init_rcu(&selem->map_node); + raw_spin_unlock_bh(&b->lock); +} + +static void selem_link_map(struct bpf_sk_storage_map *smap, + struct bpf_sk_storage_elem *selem) +{ + struct bucket *b = select_bucket(smap, selem); + + raw_spin_lock_bh(&b->lock); + RCU_INIT_POINTER(SDATA(selem)->smap, smap); + hlist_add_head_rcu(&selem->map_node, &b->list); + raw_spin_unlock_bh(&b->lock); +} + +static void selem_unlink(struct bpf_sk_storage_elem *selem) +{ + /* Always unlink from map before unlinking from sk_storage + * because selem will be freed after successfully unlinked from + * the sk_storage. + */ + selem_unlink_map(selem); + selem_unlink_sk(selem); +} + +static struct bpf_sk_storage_data * +__sk_storage_lookup(struct bpf_sk_storage *sk_storage, + struct bpf_sk_storage_map *smap, + bool cacheit_lockit) +{ + struct bpf_sk_storage_data *sdata; + struct bpf_sk_storage_elem *selem; + + /* Fast path (cache hit) */ + sdata = rcu_dereference(sk_storage->cache[smap->cache_idx]); + if (sdata && rcu_access_pointer(sdata->smap) == smap) + return sdata; + + /* Slow path (cache miss) */ + hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) + if (rcu_access_pointer(SDATA(selem)->smap) == smap) + break; + + if (!selem) + return NULL; + + sdata = SDATA(selem); + if (cacheit_lockit) { + /* spinlock is needed to avoid racing with the + * parallel delete. Otherwise, publishing an already + * deleted sdata to the cache will become a use-after-free + * problem in the next __sk_storage_lookup(). + */ + raw_spin_lock_bh(&sk_storage->lock); + if (selem_linked_to_sk(selem)) + rcu_assign_pointer(sk_storage->cache[smap->cache_idx], + sdata); + raw_spin_unlock_bh(&sk_storage->lock); + } + + return sdata; +} + +static struct bpf_sk_storage_data * +sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit) +{ + struct bpf_sk_storage *sk_storage; + struct bpf_sk_storage_map *smap; + + sk_storage = rcu_dereference(sk->sk_bpf_storage); + if (!sk_storage) + return NULL; + + smap = (struct bpf_sk_storage_map *)map; + return __sk_storage_lookup(sk_storage, smap, cacheit_lockit); +} + +static int check_flags(const struct bpf_sk_storage_data *old_sdata, + u64 map_flags) +{ + if (old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST) + /* elem already exists */ + return -EEXIST; + + if (!old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_EXIST) + /* elem doesn't exist, cannot update it */ + return -ENOENT; + + return 0; +} + +static int sk_storage_alloc(struct sock *sk, + struct bpf_sk_storage_map *smap, + struct bpf_sk_storage_elem *first_selem) +{ + struct bpf_sk_storage *prev_sk_storage, *sk_storage; + int err; + + err = omem_charge(sk, sizeof(*sk_storage)); + if (err) + return err; + + sk_storage = kzalloc(sizeof(*sk_storage), GFP_ATOMIC | __GFP_NOWARN); + if (!sk_storage) { + err = -ENOMEM; + goto uncharge; + } + INIT_HLIST_HEAD(&sk_storage->list); + raw_spin_lock_init(&sk_storage->lock); + sk_storage->sk = sk; + + __selem_link_sk(sk_storage, first_selem); + selem_link_map(smap, first_selem); + /* Publish sk_storage to sk. sk->sk_lock cannot be acquired. + * Hence, atomic ops is used to set sk->sk_bpf_storage + * from NULL to the newly allocated sk_storage ptr. + * + * From now on, the sk->sk_bpf_storage pointer is protected + * by the sk_storage->lock. Hence, when freeing + * the sk->sk_bpf_storage, the sk_storage->lock must + * be held before setting sk->sk_bpf_storage to NULL. + */ + prev_sk_storage = cmpxchg((struct bpf_sk_storage **)&sk->sk_bpf_storage, + NULL, sk_storage); + if (unlikely(prev_sk_storage)) { + selem_unlink_map(first_selem); + err = -EAGAIN; + goto uncharge; + + /* Note that even first_selem was linked to smap's + * bucket->list, first_selem can be freed immediately + * (instead of kfree_rcu) because + * bpf_sk_storage_map_free() does a + * synchronize_rcu() before walking the bucket->list. + * Hence, no one is accessing selem from the + * bucket->list under rcu_read_lock(). + */ + } + + return 0; + +uncharge: + kfree(sk_storage); + atomic_sub(sizeof(*sk_storage), &sk->sk_omem_alloc); + return err; +} + +/* sk cannot be going away because it is linking new elem + * to sk->sk_bpf_storage. (i.e. sk->sk_refcnt cannot be 0). + * Otherwise, it will become a leak (and other memory issues + * during map destruction). + */ +static struct bpf_sk_storage_data *sk_storage_update(struct sock *sk, + struct bpf_map *map, + void *value, + u64 map_flags) +{ + struct bpf_sk_storage_data *old_sdata = NULL; + struct bpf_sk_storage_elem *selem; + struct bpf_sk_storage *sk_storage; + struct bpf_sk_storage_map *smap; + int err; + + /* BPF_EXIST and BPF_NOEXIST cannot be both set */ + if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST) || + /* BPF_F_LOCK can only be used in a value with spin_lock */ + unlikely((map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map))) + return ERR_PTR(-EINVAL); + + smap = (struct bpf_sk_storage_map *)map; + sk_storage = rcu_dereference(sk->sk_bpf_storage); + if (!sk_storage || hlist_empty(&sk_storage->list)) { + /* Very first elem for this sk */ + err = check_flags(NULL, map_flags); + if (err) + return ERR_PTR(err); + + selem = selem_alloc(smap, sk, value, true); + if (!selem) + return ERR_PTR(-ENOMEM); + + err = sk_storage_alloc(sk, smap, selem); + if (err) { + kfree(selem); + atomic_sub(smap->elem_size, &sk->sk_omem_alloc); + return ERR_PTR(err); + } + + return SDATA(selem); + } + + if ((map_flags & BPF_F_LOCK) && !(map_flags & BPF_NOEXIST)) { + /* Hoping to find an old_sdata to do inline update + * such that it can avoid taking the sk_storage->lock + * and changing the lists. + */ + old_sdata = __sk_storage_lookup(sk_storage, smap, false); + err = check_flags(old_sdata, map_flags); + if (err) + return ERR_PTR(err); + if (old_sdata && selem_linked_to_sk(SELEM(old_sdata))) { + copy_map_value_locked(map, old_sdata->data, + value, false); + return old_sdata; + } + } + + raw_spin_lock_bh(&sk_storage->lock); + + /* Recheck sk_storage->list under sk_storage->lock */ + if (unlikely(hlist_empty(&sk_storage->list))) { + /* A parallel del is happening and sk_storage is going + * away. It has just been checked before, so very + * unlikely. Return instead of retry to keep things + * simple. + */ + err = -EAGAIN; + goto unlock_err; + } + + old_sdata = __sk_storage_lookup(sk_storage, smap, false); + err = check_flags(old_sdata, map_flags); + if (err) + goto unlock_err; + + if (old_sdata && (map_flags & BPF_F_LOCK)) { + copy_map_value_locked(map, old_sdata->data, value, false); + selem = SELEM(old_sdata); + goto unlock; + } + + /* sk_storage->lock is held. Hence, we are sure + * we can unlink and uncharge the old_sdata successfully + * later. Hence, instead of charging the new selem now + * and then uncharge the old selem later (which may cause + * a potential but unnecessary charge failure), avoid taking + * a charge at all here (the "!old_sdata" check) and the + * old_sdata will not be uncharged later during __selem_unlink_sk(). + */ + selem = selem_alloc(smap, sk, value, !old_sdata); + if (!selem) { + err = -ENOMEM; + goto unlock_err; + } + + /* First, link the new selem to the map */ + selem_link_map(smap, selem); + + /* Second, link (and publish) the new selem to sk_storage */ + __selem_link_sk(sk_storage, selem); + + /* Third, remove old selem, SELEM(old_sdata) */ + if (old_sdata) { + selem_unlink_map(SELEM(old_sdata)); + __selem_unlink_sk(sk_storage, SELEM(old_sdata), false); + } + +unlock: + raw_spin_unlock_bh(&sk_storage->lock); + return SDATA(selem); + +unlock_err: + raw_spin_unlock_bh(&sk_storage->lock); + return ERR_PTR(err); +} + +static int sk_storage_delete(struct sock *sk, struct bpf_map *map) +{ + struct bpf_sk_storage_data *sdata; + + sdata = sk_storage_lookup(sk, map, false); + if (!sdata) + return -ENOENT; + + selem_unlink(SELEM(sdata)); + + return 0; +} + +/* Called by __sk_destruct() */ +void bpf_sk_storage_free(struct sock *sk) +{ + struct bpf_sk_storage_elem *selem; + struct bpf_sk_storage *sk_storage; + bool free_sk_storage = false; + struct hlist_node *n; + + rcu_read_lock(); + sk_storage = rcu_dereference(sk->sk_bpf_storage); + if (!sk_storage) { + rcu_read_unlock(); + return; + } + + /* Netiher the bpf_prog nor the bpf-map's syscall + * could be modifying the sk_storage->list now. + * Thus, no elem can be added-to or deleted-from the + * sk_storage->list by the bpf_prog or by the bpf-map's syscall. + * + * It is racing with bpf_sk_storage_map_free() alone + * when unlinking elem from the sk_storage->list and + * the map's bucket->list. + */ + raw_spin_lock_bh(&sk_storage->lock); + hlist_for_each_entry_safe(selem, n, &sk_storage->list, snode) { + /* Always unlink from map before unlinking from + * sk_storage. + */ + selem_unlink_map(selem); + free_sk_storage = __selem_unlink_sk(sk_storage, selem, true); + } + raw_spin_unlock_bh(&sk_storage->lock); + rcu_read_unlock(); + + if (free_sk_storage) + kfree_rcu(sk_storage, rcu); +} + +static void bpf_sk_storage_map_free(struct bpf_map *map) +{ + struct bpf_sk_storage_elem *selem; + struct bpf_sk_storage_map *smap; + struct bucket *b; + unsigned int i; + + smap = (struct bpf_sk_storage_map *)map; + + synchronize_rcu(); + + /* bpf prog and the userspace can no longer access this map + * now. No new selem (of this map) can be added + * to the sk->sk_bpf_storage or to the map bucket's list. + * + * The elem of this map can be cleaned up here + * or + * by bpf_sk_storage_free() during __sk_destruct(). + */ + for (i = 0; i < (1U << smap->bucket_log); i++) { + b = &smap->buckets[i]; + + rcu_read_lock(); + /* No one is adding to b->list now */ + while ((selem = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(&b->list)), + struct bpf_sk_storage_elem, + map_node))) { + selem_unlink(selem); + cond_resched_rcu(); + } + rcu_read_unlock(); + } + + /* bpf_sk_storage_free() may still need to access the map. + * e.g. bpf_sk_storage_free() has unlinked selem from the map + * which then made the above while((selem = ...)) loop + * exited immediately. + * + * However, the bpf_sk_storage_free() still needs to access + * the smap->elem_size to do the uncharging in + * __selem_unlink_sk(). + * + * Hence, wait another rcu grace period for the + * bpf_sk_storage_free() to finish. + */ + synchronize_rcu(); + + kvfree(smap->buckets); + kfree(map); +} + +static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr) +{ + if (attr->map_flags != BPF_F_NO_PREALLOC || attr->max_entries || + attr->key_size != sizeof(int) || !attr->value_size || + /* Enforce BTF for userspace sk dumping */ + !attr->btf_key_type_id || !attr->btf_value_type_id) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (attr->value_size >= KMALLOC_MAX_SIZE - + MAX_BPF_STACK - sizeof(struct bpf_sk_storage_elem) || + /* U16_MAX is much more than enough for sk local storage + * considering a tcp_sock is ~2k. + */ + attr->value_size > U16_MAX - sizeof(struct bpf_sk_storage_elem)) + return -E2BIG; + + return 0; +} + +static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) +{ + struct bpf_sk_storage_map *smap; + unsigned int i; + u32 nbuckets; + u64 cost; + + smap = kzalloc(sizeof(*smap), GFP_USER | __GFP_NOWARN); + if (!smap) + return ERR_PTR(-ENOMEM); + bpf_map_init_from_attr(&smap->map, attr); + + smap->bucket_log = ilog2(roundup_pow_of_two(num_possible_cpus())); + nbuckets = 1U << smap->bucket_log; + smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets, + GFP_USER | __GFP_NOWARN); + if (!smap->buckets) { + kfree(smap); + return ERR_PTR(-ENOMEM); + } + cost = sizeof(*smap->buckets) * nbuckets + sizeof(*smap); + + for (i = 0; i < nbuckets; i++) { + INIT_HLIST_HEAD(&smap->buckets[i].list); + raw_spin_lock_init(&smap->buckets[i].lock); + } + + smap->elem_size = sizeof(struct bpf_sk_storage_elem) + attr->value_size; + smap->cache_idx = (unsigned int)atomic_inc_return(&cache_idx) % + BPF_SK_STORAGE_CACHE_SIZE; + smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + return &smap->map; +} + +static int notsupp_get_next_key(struct bpf_map *map, void *key, + void *next_key) +{ + return -ENOTSUPP; +} + +static int bpf_sk_storage_map_check_btf(const struct bpf_map *map, + const struct btf *btf, + const struct btf_type *key_type, + const struct btf_type *value_type) +{ + u32 int_data; + + if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) + return -EINVAL; + + int_data = *(u32 *)(key_type + 1); + if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data)) + return -EINVAL; + + return 0; +} + +static void *bpf_fd_sk_storage_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_sk_storage_data *sdata; + struct socket *sock; + int fd, err; + + fd = *(int *)key; + sock = sockfd_lookup(fd, &err); + if (sock) { + sdata = sk_storage_lookup(sock->sk, map, true); + sockfd_put(sock); + return sdata ? sdata->data : NULL; + } + + return ERR_PTR(err); +} + +static int bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) +{ + struct bpf_sk_storage_data *sdata; + struct socket *sock; + int fd, err; + + fd = *(int *)key; + sock = sockfd_lookup(fd, &err); + if (sock) { + sdata = sk_storage_update(sock->sk, map, value, map_flags); + sockfd_put(sock); + return IS_ERR(sdata) ? PTR_ERR(sdata) : 0; + } + + return err; +} + +static int bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key) +{ + struct socket *sock; + int fd, err; + + fd = *(int *)key; + sock = sockfd_lookup(fd, &err); + if (sock) { + err = sk_storage_delete(sock->sk, map); + sockfd_put(sock); + return err; + } + + return err; +} + +BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk, + void *, value, u64, flags) +{ + struct bpf_sk_storage_data *sdata; + + if (flags > BPF_SK_STORAGE_GET_F_CREATE) + return (unsigned long)NULL; + + sdata = sk_storage_lookup(sk, map, true); + if (sdata) + return (unsigned long)sdata->data; + + if (flags == BPF_SK_STORAGE_GET_F_CREATE && + /* Cannot add new elem to a going away sk. + * Otherwise, the new elem may become a leak + * (and also other memory issues during map + * destruction). + */ + refcount_inc_not_zero(&sk->sk_refcnt)) { + sdata = sk_storage_update(sk, map, value, BPF_NOEXIST); + /* sk must be a fullsock (guaranteed by verifier), + * so sock_gen_put() is unnecessary. + */ + sock_put(sk); + return IS_ERR(sdata) ? + (unsigned long)NULL : (unsigned long)sdata->data; + } + + return (unsigned long)NULL; +} + +BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk) +{ + if (refcount_inc_not_zero(&sk->sk_refcnt)) { + int err; + + err = sk_storage_delete(sk, map); + sock_put(sk); + return err; + } + + return -ENOENT; +} + +const struct bpf_map_ops sk_storage_map_ops = { + .map_alloc_check = bpf_sk_storage_map_alloc_check, + .map_alloc = bpf_sk_storage_map_alloc, + .map_free = bpf_sk_storage_map_free, + .map_get_next_key = notsupp_get_next_key, + .map_lookup_elem = bpf_fd_sk_storage_lookup_elem, + .map_update_elem = bpf_fd_sk_storage_update_elem, + .map_delete_elem = bpf_fd_sk_storage_delete_elem, + .map_check_btf = bpf_sk_storage_map_check_btf, +}; + +const struct bpf_func_proto bpf_sk_storage_get_proto = { + .func = bpf_sk_storage_get, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_SOCKET, + .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, + .arg4_type = ARG_ANYTHING, +}; + +const struct bpf_func_proto bpf_sk_storage_delete_proto = { + .func = bpf_sk_storage_delete, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_SOCKET, +}; diff --git a/net/core/filter.c b/net/core/filter.c index 2f88baf39cc2..27b0dc01dc3f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -75,6 +75,7 @@ #include #include #include +#include /** * sk_filter_trim_cap - run a packet through a socket filter @@ -5903,6 +5904,9 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } +const struct bpf_func_proto bpf_sk_storage_get_proto __weak; +const struct bpf_func_proto bpf_sk_storage_delete_proto __weak; + static const struct bpf_func_proto * cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -5911,6 +5915,10 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_local_storage_proto; case BPF_FUNC_sk_fullsock: return &bpf_sk_fullsock_proto; + case BPF_FUNC_sk_storage_get: + return &bpf_sk_storage_get_proto; + case BPF_FUNC_sk_storage_delete: + return &bpf_sk_storage_delete_proto; #ifdef CONFIG_INET case BPF_FUNC_tcp_sock: return &bpf_tcp_sock_proto; @@ -5992,6 +6000,10 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skb_fib_lookup_proto; case BPF_FUNC_sk_fullsock: return &bpf_sk_fullsock_proto; + case BPF_FUNC_sk_storage_get: + return &bpf_sk_storage_get_proto; + case BPF_FUNC_sk_storage_delete: + return &bpf_sk_storage_delete_proto; #ifdef CONFIG_XFRM case BPF_FUNC_skb_get_xfrm_state: return &bpf_skb_get_xfrm_state_proto; diff --git a/net/core/sock.c b/net/core/sock.c index 443b98d05f1e..9773be724aa9 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -137,6 +137,7 @@ #include #include +#include #include @@ -1709,6 +1710,10 @@ static void __sk_destruct(struct rcu_head *head) sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP); +#ifdef CONFIG_BPF_SYSCALL + bpf_sk_storage_free(sk); +#endif + if (atomic_read(&sk->sk_omem_alloc)) pr_debug("%s: optmem leakage (%d bytes) detected\n", __func__, atomic_read(&sk->sk_omem_alloc)); -- cgit v1.2.3 From 3087c3f7c23b9c54b956ee5519e97a42413ddf22 Mon Sep 17 00:00:00 2001 From: Brett Mastbergen Date: Wed, 24 Apr 2019 10:48:44 -0400 Subject: netfilter: nft_ct: Add ct id support The 'id' key returns the unique id of the conntrack entry as returned by nf_ct_get_id(). Signed-off-by: Brett Mastbergen Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nft_ct.c | 8 ++++++++ 2 files changed, 10 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 061bb3eb20c3..f0cf7b0f4f35 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -967,6 +967,7 @@ enum nft_socket_keys { * @NFT_CT_SRC_IP6: conntrack layer 3 protocol source (IPv6 address) * @NFT_CT_DST_IP6: conntrack layer 3 protocol destination (IPv6 address) * @NFT_CT_TIMEOUT: connection tracking timeout policy assigned to conntrack + * @NFT_CT_ID: conntrack id */ enum nft_ct_keys { NFT_CT_STATE, @@ -993,6 +994,7 @@ enum nft_ct_keys { NFT_CT_SRC_IP6, NFT_CT_DST_IP6, NFT_CT_TIMEOUT, + NFT_CT_ID, __NFT_CT_MAX }; #define NFT_CT_MAX (__NFT_CT_MAX - 1) diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index b422b74bfe08..f043936763f3 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -178,6 +178,11 @@ static void nft_ct_get_eval(const struct nft_expr *expr, return; } #endif + case NFT_CT_ID: + if (!nf_ct_is_confirmed(ct)) + goto err; + *dest = nf_ct_get_id(ct); + return; default: break; } @@ -479,6 +484,9 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, len = sizeof(u16); break; #endif + case NFT_CT_ID: + len = sizeof(u32); + break; default: return -EOPNOTSUPP; } -- cgit v1.2.3 From a3d43c0d56f1b94e74963a2fbadfb70126d92213 Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Mon, 29 Apr 2019 15:48:31 -0700 Subject: taprio: Add support adding an admin schedule The IEEE 802.1Q-2018 defines two "types" of schedules, the "Oper" (from operational?) and "Admin" ones. Up until now, 'taprio' only had support for the "Oper" one, added when the qdisc is created. This adds support for the "Admin" one, which allows the .change() operation to be supported. Just for clarification, some quick (and dirty) definitions, the "Oper" schedule is the currently (as in this instant) running one, and it's read-only. The "Admin" one is the one that the system configurator has installed, it can be changed, and it will be "promoted" to "Oper" when it's 'base-time' is reached. The idea behing this patch is that calling something like the below, (after taprio is already configured with an initial schedule): $ tc qdisc change taprio dev IFACE parent root \ base-time X \ sched-entry \ ... Will cause a new admin schedule to be created and programmed to be "promoted" to "Oper" at instant X. If an "Admin" schedule already exists, it will be overwritten with the new parameters. Up until now, there was some code that was added to ease the support of changing a single entry of a schedule, but was ultimately unused. Now, that we have support for "change" with more well thought semantics, updating a single entry seems to be less useful. So we remove what is in practice dead code, and return a "not supported" error if the user tries to use it. If changing a single entry would make the user's life easier we may ressurrect this idea, but at this point, removing it simplifies the code. For now, only the schedule specific bits are allowed to be added for a new schedule, that means that 'clockid', 'num_tc', 'map' and 'queues' cannot be modified. Example: $ tc qdisc change dev IFACE parent root handle 100 taprio \ base-time $BASE_TIME \ sched-entry S 00 500000 \ sched-entry S 0f 500000 \ clockid CLOCK_TAI The only change in the netlink API introduced by this change is the introduction of an "admin" type in the response to a dump request, that type allows userspace to separate the "oper" schedule from the "admin" schedule. If userspace doesn't support the "admin" type, it will only display the "oper" schedule. Signed-off-by: Vinicius Costa Gomes Signed-off-by: David S. Miller --- include/uapi/linux/pkt_sched.h | 11 + net/sched/sch_taprio.c | 511 +++++++++++++++++++++++++---------------- 2 files changed, 329 insertions(+), 193 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 7ee74c3474bf..d59770d0eb84 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -1148,6 +1148,16 @@ enum { #define TCA_TAPRIO_SCHED_MAX (__TCA_TAPRIO_SCHED_MAX - 1) +/* The format for the admin sched (dump only): + * [TCA_TAPRIO_SCHED_ADMIN_SCHED] + * [TCA_TAPRIO_ATTR_SCHED_BASE_TIME] + * [TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST] + * [TCA_TAPRIO_ATTR_SCHED_ENTRY] + * [TCA_TAPRIO_ATTR_SCHED_ENTRY_CMD] + * [TCA_TAPRIO_ATTR_SCHED_ENTRY_GATES] + * [TCA_TAPRIO_ATTR_SCHED_ENTRY_INTERVAL] + */ + enum { TCA_TAPRIO_ATTR_UNSPEC, TCA_TAPRIO_ATTR_PRIOMAP, /* struct tc_mqprio_qopt */ @@ -1156,6 +1166,7 @@ enum { TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY, /* single entry */ TCA_TAPRIO_ATTR_SCHED_CLOCKID, /* s32 */ TCA_TAPRIO_PAD, + TCA_TAPRIO_ATTR_ADMIN_SCHED, /* The admin sched, only used in dump */ __TCA_TAPRIO_ATTR_MAX, }; diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index f827caa73862..ec8ccaee64e6 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -41,25 +42,69 @@ struct sched_entry { u8 command; }; +struct sched_gate_list { + struct rcu_head rcu; + struct list_head entries; + size_t num_entries; + s64 base_time; +}; + struct taprio_sched { struct Qdisc **qdiscs; struct Qdisc *root; - s64 base_time; int clockid; atomic64_t picos_per_byte; /* Using picoseconds because for 10Gbps+ * speeds it's sub-nanoseconds per byte */ - size_t num_entries; /* Protects the update side of the RCU protected current_entry */ spinlock_t current_entry_lock; struct sched_entry __rcu *current_entry; - struct list_head entries; + struct sched_gate_list __rcu *oper_sched; + struct sched_gate_list __rcu *admin_sched; ktime_t (*get_time)(void); struct hrtimer advance_timer; struct list_head taprio_list; }; +static ktime_t sched_base_time(const struct sched_gate_list *sched) +{ + if (!sched) + return KTIME_MAX; + + return ns_to_ktime(sched->base_time); +} + +static void taprio_free_sched_cb(struct rcu_head *head) +{ + struct sched_gate_list *sched = container_of(head, struct sched_gate_list, rcu); + struct sched_entry *entry, *n; + + if (!sched) + return; + + list_for_each_entry_safe(entry, n, &sched->entries, list) { + list_del(&entry->list); + kfree(entry); + } + + kfree(sched); +} + +static void switch_schedules(struct taprio_sched *q, + struct sched_gate_list **admin, + struct sched_gate_list **oper) +{ + rcu_assign_pointer(q->oper_sched, *admin); + rcu_assign_pointer(q->admin_sched, NULL); + + if (*oper) + call_rcu(&(*oper)->rcu, taprio_free_sched_cb); + + *oper = *admin; + *admin = NULL; +} + static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { @@ -211,10 +256,31 @@ done: return skb; } +static bool should_change_schedules(const struct sched_gate_list *admin, + const struct sched_gate_list *oper, + ktime_t close_time) +{ + ktime_t next_base_time; + + if (!admin) + return false; + + next_base_time = sched_base_time(admin); + + /* This is the simple case, the close_time would fall after + * the next schedule base_time. + */ + if (ktime_compare(next_base_time, close_time) <= 0) + return true; + + return false; +} + static enum hrtimer_restart advance_sched(struct hrtimer *timer) { struct taprio_sched *q = container_of(timer, struct taprio_sched, advance_timer); + struct sched_gate_list *oper, *admin; struct sched_entry *entry, *next; struct Qdisc *sch = q->root; ktime_t close_time; @@ -222,26 +288,43 @@ static enum hrtimer_restart advance_sched(struct hrtimer *timer) spin_lock(&q->current_entry_lock); entry = rcu_dereference_protected(q->current_entry, lockdep_is_held(&q->current_entry_lock)); + oper = rcu_dereference_protected(q->oper_sched, + lockdep_is_held(&q->current_entry_lock)); + admin = rcu_dereference_protected(q->admin_sched, + lockdep_is_held(&q->current_entry_lock)); - /* This is the case that it's the first time that the schedule - * runs, so it only happens once per schedule. The first entry - * is pre-calculated during the schedule initialization. + if (!oper) + switch_schedules(q, &admin, &oper); + + /* This can happen in two cases: 1. this is the very first run + * of this function (i.e. we weren't running any schedule + * previously); 2. The previous schedule just ended. The first + * entry of all schedules are pre-calculated during the + * schedule initialization. */ - if (unlikely(!entry)) { - next = list_first_entry(&q->entries, struct sched_entry, + if (unlikely(!entry || entry->close_time == oper->base_time)) { + next = list_first_entry(&oper->entries, struct sched_entry, list); close_time = next->close_time; goto first_run; } - if (list_is_last(&entry->list, &q->entries)) - next = list_first_entry(&q->entries, struct sched_entry, + if (list_is_last(&entry->list, &oper->entries)) + next = list_first_entry(&oper->entries, struct sched_entry, list); else next = list_next_entry(entry, list); close_time = ktime_add_ns(entry->close_time, next->interval); + if (should_change_schedules(admin, oper, close_time)) { + /* Set things so the next time this runs, the new + * schedule runs. + */ + close_time = sched_base_time(admin); + switch_schedules(q, &admin, &oper); + } + next->close_time = close_time; taprio_set_budget(q, next); @@ -324,71 +407,8 @@ static int parse_sched_entry(struct nlattr *n, struct sched_entry *entry, return fill_sched_entry(tb, entry, extack); } -/* Returns the number of entries in case of success */ -static int parse_sched_single_entry(struct nlattr *n, - struct taprio_sched *q, - struct netlink_ext_ack *extack) -{ - struct nlattr *tb_entry[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = { }; - struct nlattr *tb_list[TCA_TAPRIO_SCHED_MAX + 1] = { }; - struct sched_entry *entry; - bool found = false; - u32 index; - int err; - - err = nla_parse_nested_deprecated(tb_list, TCA_TAPRIO_SCHED_MAX, n, - entry_list_policy, NULL); - if (err < 0) { - NL_SET_ERR_MSG(extack, "Could not parse nested entry"); - return -EINVAL; - } - - if (!tb_list[TCA_TAPRIO_SCHED_ENTRY]) { - NL_SET_ERR_MSG(extack, "Single-entry must include an entry"); - return -EINVAL; - } - - err = nla_parse_nested_deprecated(tb_entry, - TCA_TAPRIO_SCHED_ENTRY_MAX, - tb_list[TCA_TAPRIO_SCHED_ENTRY], - entry_policy, NULL); - if (err < 0) { - NL_SET_ERR_MSG(extack, "Could not parse nested entry"); - return -EINVAL; - } - - if (!tb_entry[TCA_TAPRIO_SCHED_ENTRY_INDEX]) { - NL_SET_ERR_MSG(extack, "Entry must specify an index\n"); - return -EINVAL; - } - - index = nla_get_u32(tb_entry[TCA_TAPRIO_SCHED_ENTRY_INDEX]); - if (index >= q->num_entries) { - NL_SET_ERR_MSG(extack, "Index for single entry exceeds number of entries in schedule"); - return -EINVAL; - } - - list_for_each_entry(entry, &q->entries, list) { - if (entry->index == index) { - found = true; - break; - } - } - - if (!found) { - NL_SET_ERR_MSG(extack, "Could not find entry"); - return -ENOENT; - } - - err = fill_sched_entry(tb_entry, entry, extack); - if (err < 0) - return err; - - return q->num_entries; -} - static int parse_sched_list(struct nlattr *list, - struct taprio_sched *q, + struct sched_gate_list *sched, struct netlink_ext_ack *extack) { struct nlattr *n; @@ -418,64 +438,36 @@ static int parse_sched_list(struct nlattr *list, return err; } - list_add_tail(&entry->list, &q->entries); + list_add_tail(&entry->list, &sched->entries); i++; } - q->num_entries = i; + sched->num_entries = i; return i; } -/* Returns the number of entries in case of success */ -static int parse_taprio_opt(struct nlattr **tb, struct taprio_sched *q, - struct netlink_ext_ack *extack) +static int parse_taprio_schedule(struct nlattr **tb, + struct sched_gate_list *new, + struct netlink_ext_ack *extack) { int err = 0; - int clockid; - - if (tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST] && - tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY]) - return -EINVAL; - if (tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY] && q->num_entries == 0) - return -EINVAL; - - if (q->clockid == -1 && !tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) - return -EINVAL; + if (tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY]) { + NL_SET_ERR_MSG(extack, "Adding a single entry is not supported"); + return -ENOTSUPP; + } if (tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]) - q->base_time = nla_get_s64( - tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]); - - if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) { - clockid = nla_get_s32(tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]); - - /* We only support static clockids and we don't allow - * for it to be modified after the first init. - */ - if (clockid < 0 || (q->clockid != -1 && q->clockid != clockid)) - return -EINVAL; - - q->clockid = clockid; - } + new->base_time = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]); if (tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST]) err = parse_sched_list( - tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST], q, extack); - else if (tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY]) - err = parse_sched_single_entry( - tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY], q, extack); - - /* parse_sched_* return the number of entries in the schedule, - * a schedule with zero entries is an error. - */ - if (err == 0) { - NL_SET_ERR_MSG(extack, "The schedule should contain at least one entry"); - return -EINVAL; - } + tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST], new, extack); + if (err < 0) + return err; - return err; + return 0; } static int taprio_parse_mqprio_opt(struct net_device *dev, @@ -484,11 +476,17 @@ static int taprio_parse_mqprio_opt(struct net_device *dev, { int i, j; - if (!qopt) { + if (!qopt && !dev->num_tc) { NL_SET_ERR_MSG(extack, "'mqprio' configuration is necessary"); return -EINVAL; } + /* If num_tc is already set, it means that the user already + * configured the mqprio part + */ + if (dev->num_tc) + return 0; + /* Verify num_tc is not out of max range */ if (qopt->num_tc > TC_MAX_QUEUE) { NL_SET_ERR_MSG(extack, "Number of traffic classes is outside valid range"); @@ -534,14 +532,16 @@ static int taprio_parse_mqprio_opt(struct net_device *dev, return 0; } -static int taprio_get_start_time(struct Qdisc *sch, ktime_t *start) +static int taprio_get_start_time(struct Qdisc *sch, + struct sched_gate_list *sched, + ktime_t *start) { struct taprio_sched *q = qdisc_priv(sch); struct sched_entry *entry; ktime_t now, base, cycle; s64 n; - base = ns_to_ktime(q->base_time); + base = sched_base_time(sched); now = q->get_time(); if (ktime_after(base, now)) { @@ -552,7 +552,7 @@ static int taprio_get_start_time(struct Qdisc *sch, ktime_t *start) /* Calculate the cycle_time, by summing all the intervals. */ cycle = 0; - list_for_each_entry(entry, &q->entries, list) + list_for_each_entry(entry, &sched->entries, list) cycle = ktime_add_ns(cycle, entry->interval); /* The qdisc is expected to have at least one sched_entry. Moreover, @@ -571,22 +571,34 @@ static int taprio_get_start_time(struct Qdisc *sch, ktime_t *start) return 0; } -static void taprio_start_sched(struct Qdisc *sch, ktime_t start) +static void setup_first_close_time(struct taprio_sched *q, + struct sched_gate_list *sched, ktime_t base) { - struct taprio_sched *q = qdisc_priv(sch); struct sched_entry *first; - unsigned long flags; - - spin_lock_irqsave(&q->current_entry_lock, flags); - first = list_first_entry(&q->entries, struct sched_entry, - list); + first = list_first_entry(&sched->entries, + struct sched_entry, list); - first->close_time = ktime_add_ns(start, first->interval); + first->close_time = ktime_add_ns(base, first->interval); taprio_set_budget(q, first); rcu_assign_pointer(q->current_entry, NULL); +} - spin_unlock_irqrestore(&q->current_entry_lock, flags); +static void taprio_start_sched(struct Qdisc *sch, + ktime_t start, struct sched_gate_list *new) +{ + struct taprio_sched *q = qdisc_priv(sch); + ktime_t expires; + + expires = hrtimer_get_expires(&q->advance_timer); + if (expires == 0) + expires = KTIME_MAX; + + /* If the new schedule starts before the next expiration, we + * reprogram it to the earliest one, so we change the admin + * schedule to the operational one at the right time. + */ + start = min_t(ktime_t, start, expires); hrtimer_start(&q->advance_timer, start, HRTIMER_MODE_ABS); } @@ -641,10 +653,12 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_TAPRIO_ATTR_MAX + 1] = { }; + struct sched_gate_list *oper, *admin, *new_admin; struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct tc_mqprio_qopt *mqprio = NULL; - int i, err, size; + int i, err, clockid; + unsigned long flags; ktime_t start; err = nla_parse_nested_deprecated(tb, TCA_TAPRIO_ATTR_MAX, opt, @@ -659,48 +673,64 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, if (err < 0) return err; - /* A schedule with less than one entry is an error */ - size = parse_taprio_opt(tb, q, extack); - if (size < 0) - return size; + new_admin = kzalloc(sizeof(*new_admin), GFP_KERNEL); + if (!new_admin) { + NL_SET_ERR_MSG(extack, "Not enough memory for a new schedule"); + return -ENOMEM; + } + INIT_LIST_HEAD(&new_admin->entries); - hrtimer_init(&q->advance_timer, q->clockid, HRTIMER_MODE_ABS); - q->advance_timer.function = advance_sched; + rcu_read_lock(); + oper = rcu_dereference(q->oper_sched); + admin = rcu_dereference(q->admin_sched); + rcu_read_unlock(); - switch (q->clockid) { - case CLOCK_REALTIME: - q->get_time = ktime_get_real; - break; - case CLOCK_MONOTONIC: - q->get_time = ktime_get; - break; - case CLOCK_BOOTTIME: - q->get_time = ktime_get_boottime; - break; - case CLOCK_TAI: - q->get_time = ktime_get_clocktai; - break; - default: - return -ENOTSUPP; + if (mqprio && (oper || admin)) { + NL_SET_ERR_MSG(extack, "Changing the traffic mapping of a running schedule is not supported"); + err = -ENOTSUPP; + goto free_sched; } - for (i = 0; i < dev->num_tx_queues; i++) { - struct netdev_queue *dev_queue; - struct Qdisc *qdisc; + err = parse_taprio_schedule(tb, new_admin, extack); + if (err < 0) + goto free_sched; - dev_queue = netdev_get_tx_queue(dev, i); - qdisc = qdisc_create_dflt(dev_queue, - &pfifo_qdisc_ops, - TC_H_MAKE(TC_H_MAJ(sch->handle), - TC_H_MIN(i + 1)), - extack); - if (!qdisc) - return -ENOMEM; + if (new_admin->num_entries == 0) { + NL_SET_ERR_MSG(extack, "There should be at least one entry in the schedule"); + err = -EINVAL; + goto free_sched; + } - if (i < dev->real_num_tx_queues) - qdisc_hash_add(qdisc, false); + if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) { + clockid = nla_get_s32(tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]); - q->qdiscs[i] = qdisc; + /* We only support static clockids and we don't allow + * for it to be modified after the first init. + */ + if (clockid < 0 || + (q->clockid != -1 && q->clockid != clockid)) { + NL_SET_ERR_MSG(extack, "Changing the 'clockid' of a running schedule is not supported"); + err = -ENOTSUPP; + goto free_sched; + } + + q->clockid = clockid; + } + + if (q->clockid == -1 && !tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) { + NL_SET_ERR_MSG(extack, "Specifying a 'clockid' is mandatory"); + err = -EINVAL; + goto free_sched; + } + + taprio_set_picos_per_byte(dev, q); + + /* Protects against enqueue()/dequeue() */ + spin_lock_bh(qdisc_lock(sch)); + + if (!hrtimer_active(&q->advance_timer)) { + hrtimer_init(&q->advance_timer, q->clockid, HRTIMER_MODE_ABS); + q->advance_timer.function = advance_sched; } if (mqprio) { @@ -716,24 +746,60 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, mqprio->prio_tc_map[i]); } - taprio_set_picos_per_byte(dev, q); + switch (q->clockid) { + case CLOCK_REALTIME: + q->get_time = ktime_get_real; + break; + case CLOCK_MONOTONIC: + q->get_time = ktime_get; + break; + case CLOCK_BOOTTIME: + q->get_time = ktime_get_boottime; + break; + case CLOCK_TAI: + q->get_time = ktime_get_clocktai; + break; + default: + NL_SET_ERR_MSG(extack, "Invalid 'clockid'"); + err = -EINVAL; + goto unlock; + } - err = taprio_get_start_time(sch, &start); + err = taprio_get_start_time(sch, new_admin, &start); if (err < 0) { NL_SET_ERR_MSG(extack, "Internal error: failed get start time"); - return err; + goto unlock; } - taprio_start_sched(sch, start); + setup_first_close_time(q, new_admin, start); - return 0; + /* Protects against advance_sched() */ + spin_lock_irqsave(&q->current_entry_lock, flags); + + taprio_start_sched(sch, start, new_admin); + + rcu_assign_pointer(q->admin_sched, new_admin); + if (admin) + call_rcu(&admin->rcu, taprio_free_sched_cb); + new_admin = NULL; + + spin_unlock_irqrestore(&q->current_entry_lock, flags); + + err = 0; + +unlock: + spin_unlock_bh(qdisc_lock(sch)); + +free_sched: + kfree(new_admin); + + return err; } static void taprio_destroy(struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); - struct sched_entry *entry, *n; unsigned int i; spin_lock(&taprio_list_lock); @@ -752,10 +818,11 @@ static void taprio_destroy(struct Qdisc *sch) netdev_set_num_tc(dev, 0); - list_for_each_entry_safe(entry, n, &q->entries, list) { - list_del(&entry->list); - kfree(entry); - } + if (q->oper_sched) + call_rcu(&q->oper_sched->rcu, taprio_free_sched_cb); + + if (q->admin_sched) + call_rcu(&q->admin_sched->rcu, taprio_free_sched_cb); } static int taprio_init(struct Qdisc *sch, struct nlattr *opt, @@ -763,12 +830,12 @@ static int taprio_init(struct Qdisc *sch, struct nlattr *opt, { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); + int i; - INIT_LIST_HEAD(&q->entries); spin_lock_init(&q->current_entry_lock); - /* We may overwrite the configuration later */ hrtimer_init(&q->advance_timer, CLOCK_TAI, HRTIMER_MODE_ABS); + q->advance_timer.function = advance_sched; q->root = sch; @@ -798,6 +865,25 @@ static int taprio_init(struct Qdisc *sch, struct nlattr *opt, list_add(&q->taprio_list, &taprio_list); spin_unlock(&taprio_list_lock); + for (i = 0; i < dev->num_tx_queues; i++) { + struct netdev_queue *dev_queue; + struct Qdisc *qdisc; + + dev_queue = netdev_get_tx_queue(dev, i); + qdisc = qdisc_create_dflt(dev_queue, + &pfifo_qdisc_ops, + TC_H_MAKE(TC_H_MAJ(sch->handle), + TC_H_MIN(i + 1)), + extack); + if (!qdisc) + return -ENOMEM; + + if (i < dev->real_num_tx_queues) + qdisc_hash_add(qdisc, false); + + q->qdiscs[i] = qdisc; + } + return taprio_change(sch, opt, extack); } @@ -869,15 +955,47 @@ nla_put_failure: return -1; } +static int dump_schedule(struct sk_buff *msg, + const struct sched_gate_list *root) +{ + struct nlattr *entry_list; + struct sched_entry *entry; + + if (nla_put_s64(msg, TCA_TAPRIO_ATTR_SCHED_BASE_TIME, + root->base_time, TCA_TAPRIO_PAD)) + return -1; + + entry_list = nla_nest_start_noflag(msg, + TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST); + if (!entry_list) + goto error_nest; + + list_for_each_entry(entry, &root->entries, list) { + if (dump_entry(msg, entry) < 0) + goto error_nest; + } + + nla_nest_end(msg, entry_list); + return 0; + +error_nest: + nla_nest_cancel(msg, entry_list); + return -1; +} + static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); + struct sched_gate_list *oper, *admin; struct tc_mqprio_qopt opt = { 0 }; - struct nlattr *nest, *entry_list; - struct sched_entry *entry; + struct nlattr *nest, *sched_nest; unsigned int i; + rcu_read_lock(); + oper = rcu_dereference(q->oper_sched); + admin = rcu_dereference(q->admin_sched); + opt.num_tc = netdev_get_num_tc(dev); memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map)); @@ -888,35 +1006,41 @@ static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb) nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (!nest) - return -ENOSPC; + goto start_error; if (nla_put(skb, TCA_TAPRIO_ATTR_PRIOMAP, sizeof(opt), &opt)) goto options_error; - if (nla_put_s64(skb, TCA_TAPRIO_ATTR_SCHED_BASE_TIME, - q->base_time, TCA_TAPRIO_PAD)) - goto options_error; - if (nla_put_s32(skb, TCA_TAPRIO_ATTR_SCHED_CLOCKID, q->clockid)) goto options_error; - entry_list = nla_nest_start_noflag(skb, - TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST); - if (!entry_list) + if (oper && dump_schedule(skb, oper)) goto options_error; - list_for_each_entry(entry, &q->entries, list) { - if (dump_entry(skb, entry) < 0) - goto options_error; - } + if (!admin) + goto done; + + sched_nest = nla_nest_start_noflag(skb, TCA_TAPRIO_ATTR_ADMIN_SCHED); - nla_nest_end(skb, entry_list); + if (dump_schedule(skb, admin)) + goto admin_error; + + nla_nest_end(skb, sched_nest); + +done: + rcu_read_unlock(); return nla_nest_end(skb, nest); +admin_error: + nla_nest_cancel(skb, sched_nest); + options_error: nla_nest_cancel(skb, nest); - return -1; + +start_error: + rcu_read_unlock(); + return -ENOSPC; } static struct Qdisc *taprio_leaf(struct Qdisc *sch, unsigned long cl) @@ -1003,6 +1127,7 @@ static struct Qdisc_ops taprio_qdisc_ops __read_mostly = { .id = "taprio", .priv_size = sizeof(struct taprio_sched), .init = taprio_init, + .change = taprio_change, .destroy = taprio_destroy, .peek = taprio_peek, .dequeue = taprio_dequeue, -- cgit v1.2.3 From 6ca6a6654225f3cd001304d33429c817e0c0b85f Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Mon, 29 Apr 2019 15:48:32 -0700 Subject: taprio: Add support for setting the cycle-time manually IEEE 802.1Q-2018 defines that a the cycle-time of a schedule may be overridden, so the schedule is truncated to a determined "width". Signed-off-by: Vinicius Costa Gomes Signed-off-by: David S. Miller --- include/uapi/linux/pkt_sched.h | 1 + net/sched/sch_taprio.c | 59 ++++++++++++++++++++++++++++++++++++------ 2 files changed, 52 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index d59770d0eb84..7a32276838e1 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -1167,6 +1167,7 @@ enum { TCA_TAPRIO_ATTR_SCHED_CLOCKID, /* s32 */ TCA_TAPRIO_PAD, TCA_TAPRIO_ATTR_ADMIN_SCHED, /* The admin sched, only used in dump */ + TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME, /* s64 */ __TCA_TAPRIO_ATTR_MAX, }; diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index ec8ccaee64e6..6b37ffda23ec 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -46,6 +46,8 @@ struct sched_gate_list { struct rcu_head rcu; struct list_head entries; size_t num_entries; + ktime_t cycle_close_time; + s64 cycle_time; s64 base_time; }; @@ -105,6 +107,22 @@ static void switch_schedules(struct taprio_sched *q, *admin = NULL; } +static ktime_t get_cycle_time(struct sched_gate_list *sched) +{ + struct sched_entry *entry; + ktime_t cycle = 0; + + if (sched->cycle_time != 0) + return sched->cycle_time; + + list_for_each_entry(entry, &sched->entries, list) + cycle = ktime_add_ns(cycle, entry->interval); + + sched->cycle_time = cycle; + + return cycle; +} + static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { @@ -256,6 +274,18 @@ done: return skb; } +static bool should_restart_cycle(const struct sched_gate_list *oper, + const struct sched_entry *entry) +{ + if (list_is_last(&entry->list, &oper->entries)) + return true; + + if (ktime_compare(entry->close_time, oper->cycle_close_time) == 0) + return true; + + return false; +} + static bool should_change_schedules(const struct sched_gate_list *admin, const struct sched_gate_list *oper, ktime_t close_time) @@ -309,13 +339,17 @@ static enum hrtimer_restart advance_sched(struct hrtimer *timer) goto first_run; } - if (list_is_last(&entry->list, &oper->entries)) + if (should_restart_cycle(oper, entry)) { next = list_first_entry(&oper->entries, struct sched_entry, list); - else + oper->cycle_close_time = ktime_add_ns(oper->cycle_close_time, + oper->cycle_time); + } else { next = list_next_entry(entry, list); + } close_time = ktime_add_ns(entry->close_time, next->interval); + close_time = min_t(ktime_t, close_time, oper->cycle_close_time); if (should_change_schedules(admin, oper, close_time)) { /* Set things so the next time this runs, the new @@ -360,6 +394,7 @@ static const struct nla_policy taprio_policy[TCA_TAPRIO_ATTR_MAX + 1] = { [TCA_TAPRIO_ATTR_SCHED_BASE_TIME] = { .type = NLA_S64 }, [TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY] = { .type = NLA_NESTED }, [TCA_TAPRIO_ATTR_SCHED_CLOCKID] = { .type = NLA_S32 }, + [TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME] = { .type = NLA_S64 }, }; static int fill_sched_entry(struct nlattr **tb, struct sched_entry *entry, @@ -461,6 +496,9 @@ static int parse_taprio_schedule(struct nlattr **tb, if (tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]) new->base_time = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]); + if (tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME]) + new->cycle_time = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME]); + if (tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST]) err = parse_sched_list( tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST], new, extack); @@ -537,7 +575,6 @@ static int taprio_get_start_time(struct Qdisc *sch, ktime_t *start) { struct taprio_sched *q = qdisc_priv(sch); - struct sched_entry *entry; ktime_t now, base, cycle; s64 n; @@ -549,11 +586,7 @@ static int taprio_get_start_time(struct Qdisc *sch, return 0; } - /* Calculate the cycle_time, by summing all the intervals. - */ - cycle = 0; - list_for_each_entry(entry, &sched->entries, list) - cycle = ktime_add_ns(cycle, entry->interval); + cycle = get_cycle_time(sched); /* The qdisc is expected to have at least one sched_entry. Moreover, * any entry must have 'interval' > 0. Thus if the cycle time is zero, @@ -575,10 +608,16 @@ static void setup_first_close_time(struct taprio_sched *q, struct sched_gate_list *sched, ktime_t base) { struct sched_entry *first; + ktime_t cycle; first = list_first_entry(&sched->entries, struct sched_entry, list); + cycle = get_cycle_time(sched); + + /* FIXME: find a better place to do this */ + sched->cycle_close_time = ktime_add_ns(base, cycle); + first->close_time = ktime_add_ns(base, first->interval); taprio_set_budget(q, first); rcu_assign_pointer(q->current_entry, NULL); @@ -965,6 +1004,10 @@ static int dump_schedule(struct sk_buff *msg, root->base_time, TCA_TAPRIO_PAD)) return -1; + if (nla_put_s64(msg, TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME, + root->cycle_time, TCA_TAPRIO_PAD)) + return -1; + entry_list = nla_nest_start_noflag(msg, TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST); if (!entry_list) -- cgit v1.2.3 From c25031e993440debdd530278ce2171ce477df029 Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Mon, 29 Apr 2019 15:48:33 -0700 Subject: taprio: Add support for cycle-time-extension IEEE 802.1Q-2018 defines the concept of a cycle-time-extension, so the last entry of a schedule before the start of a new schedule can be extended, so "too-short" entries can be avoided. Signed-off-by: Vinicius Costa Gomes Signed-off-by: David S. Miller --- include/uapi/linux/pkt_sched.h | 1 + net/sched/sch_taprio.c | 35 +++++++++++++++++++++++++++++------ 2 files changed, 30 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 7a32276838e1..8b2f993cbb77 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -1168,6 +1168,7 @@ enum { TCA_TAPRIO_PAD, TCA_TAPRIO_ATTR_ADMIN_SCHED, /* The admin sched, only used in dump */ TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME, /* s64 */ + TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION, /* s64 */ __TCA_TAPRIO_ATTR_MAX, }; diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 6b37ffda23ec..539677120b9f 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -48,6 +48,7 @@ struct sched_gate_list { size_t num_entries; ktime_t cycle_close_time; s64 cycle_time; + s64 cycle_time_extension; s64 base_time; }; @@ -290,7 +291,7 @@ static bool should_change_schedules(const struct sched_gate_list *admin, const struct sched_gate_list *oper, ktime_t close_time) { - ktime_t next_base_time; + ktime_t next_base_time, extension_time; if (!admin) return false; @@ -303,6 +304,20 @@ static bool should_change_schedules(const struct sched_gate_list *admin, if (ktime_compare(next_base_time, close_time) <= 0) return true; + /* This is the cycle_time_extension case, if the close_time + * plus the amount that can be extended would fall after the + * next schedule base_time, we can extend the current schedule + * for that amount. + */ + extension_time = ktime_add_ns(close_time, oper->cycle_time_extension); + + /* FIXME: the IEEE 802.1Q-2018 Specification isn't clear about + * how precisely the extension should be made. So after + * conformance testing, this logic may change. + */ + if (ktime_compare(next_base_time, extension_time) <= 0) + return true; + return false; } @@ -390,11 +405,12 @@ static const struct nla_policy taprio_policy[TCA_TAPRIO_ATTR_MAX + 1] = { [TCA_TAPRIO_ATTR_PRIOMAP] = { .len = sizeof(struct tc_mqprio_qopt) }, - [TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST] = { .type = NLA_NESTED }, - [TCA_TAPRIO_ATTR_SCHED_BASE_TIME] = { .type = NLA_S64 }, - [TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY] = { .type = NLA_NESTED }, - [TCA_TAPRIO_ATTR_SCHED_CLOCKID] = { .type = NLA_S32 }, - [TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME] = { .type = NLA_S64 }, + [TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST] = { .type = NLA_NESTED }, + [TCA_TAPRIO_ATTR_SCHED_BASE_TIME] = { .type = NLA_S64 }, + [TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY] = { .type = NLA_NESTED }, + [TCA_TAPRIO_ATTR_SCHED_CLOCKID] = { .type = NLA_S32 }, + [TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME] = { .type = NLA_S64 }, + [TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION] = { .type = NLA_S64 }, }; static int fill_sched_entry(struct nlattr **tb, struct sched_entry *entry, @@ -496,6 +512,9 @@ static int parse_taprio_schedule(struct nlattr **tb, if (tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]) new->base_time = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]); + if (tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION]) + new->cycle_time_extension = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION]); + if (tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME]) new->cycle_time = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME]); @@ -1008,6 +1027,10 @@ static int dump_schedule(struct sk_buff *msg, root->cycle_time, TCA_TAPRIO_PAD)) return -1; + if (nla_put_s64(msg, TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION, + root->cycle_time_extension, TCA_TAPRIO_PAD)) + return -1; + entry_list = nla_nest_start_noflag(msg, TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST); if (!entry_list) -- cgit v1.2.3 From 0e1a2a3e6e7d37cea9f8586f6d7745b539147d9f Mon Sep 17 00:00:00 2001 From: Erez Alfasi Date: Tue, 5 Mar 2019 15:42:23 +0200 Subject: ethtool: Add SFF-8436 and SFF-8636 max EEPROM length definitions Added max EEPROM length defines for ethtool usage: #define ETH_MODULE_SFF_8636_MAX_LEN 640 #define ETH_MODULE_SFF_8436_MAX_LEN 640 These definitions are used to determine the EEPROM data length when reading high eeprom pages. For example, SFF-8636 EEPROM data from page 03h needs to be stored at data[512] - data[639]. Signed-off-by: Erez Alfasi Signed-off-by: Saeed Mahameed --- include/uapi/linux/ethtool.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 818ad368b586..3534ce157ae9 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1712,6 +1712,9 @@ static inline int ethtool_validate_duplex(__u8 duplex) #define ETH_MODULE_SFF_8436 0x4 #define ETH_MODULE_SFF_8436_LEN 256 +#define ETH_MODULE_SFF_8636_MAX_LEN 640 +#define ETH_MODULE_SFF_8436_MAX_LEN 640 + /* Reset flags */ /* The reset() operation must clear the flags for the components which * were actually reset. On successful return, the flags indicate the -- cgit v1.2.3 From bf5bc3ce8a8f32a0d45b6820ede8f9fc3e9c23df Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 2 May 2019 23:23:33 +0300 Subject: ether: Add dedicated Ethertype for pseudo-802.1Q DSA tagging There are two possible utilizations so far: - Switch devices that don't support a native insertion/extraction header on the CPU port may still enjoy the benefits of port isolation with a custom VLAN tag. For this, they need to have a customizable TPID in hardware and a new Ethertype to distinguish between real 802.1Q traffic and the private tags used for port separation. - Switches that don't support the deactivation of VLAN awareness, but still want to have a mode in which they accept all traffic, including frames that are tagged with a VLAN not configured on their ports, may use this as a fake to trick the hardware into thinking that the TPID for VLAN is something other than 0x8100. What follows after the ETH_P_DSA_8021Q EtherType is a regular VLAN header (TCI), however there is no other EtherType that can be used for this purpose and doesn't already have a well-defined meaning. ETH_P_8021AD, ETH_P_QINQ1, ETH_P_QINQ2 and ETH_P_QINQ3 expect that another follow-up VLAN tag is present, which is not the case here. Signed-off-by: Vladimir Oltean Suggested-by: Andrew Lunn Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/uapi/linux/if_ether.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h index 3a45b4ad71a3..3158ba672b72 100644 --- a/include/uapi/linux/if_ether.h +++ b/include/uapi/linux/if_ether.h @@ -109,6 +109,7 @@ #define ETH_P_QINQ2 0x9200 /* deprecated QinQ VLAN [ NOT AN OFFICIALLY REGISTERED ID ] */ #define ETH_P_QINQ3 0x9300 /* deprecated QinQ VLAN [ NOT AN OFFICIALLY REGISTERED ID ] */ #define ETH_P_EDSA 0xDADA /* Ethertype DSA [ NOT AN OFFICIALLY REGISTERED ID ] */ +#define ETH_P_DSA_8021Q 0xDADB /* Fake VLAN Header for DSA [ NOT AN OFFICIALLY REGISTERED ID ] */ #define ETH_P_IFE 0xED3E /* ForCES inter-FE LFB type */ #define ETH_P_AF_IUCV 0xFBFB /* IBM af_iucv [ NOT AN OFFICIALLY REGISTERED ID ] */ -- cgit v1.2.3