summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2021-06-17 21:54:56 +0300
committerDavid S. Miller <davem@davemloft.net>2021-06-17 21:54:56 +0300
commita52171ae7b803f4587b8172d1768313b4d093d0a (patch)
treeb7504137cddb40533c047a6effd024bb0ba2434f /kernel
parent4de772511fd13aa5e7b9bf485ce26f87e6de2bb8 (diff)
parentf20792d425d2efd2680f2855c1e3fec01c2e569e (diff)
downloadlinux-a52171ae7b803f4587b8172d1768313b4d093d0a.tar.xz
Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
Daniel Borkmann says: ==================== pull-request: bpf-next 2021-06-17 The following pull-request contains BPF updates for your *net-next* tree. We've added 50 non-merge commits during the last 25 day(s) which contain a total of 148 files changed, 4779 insertions(+), 1248 deletions(-). The main changes are: 1) BPF infrastructure to migrate TCP child sockets from a listener to another in the same reuseport group/map, from Kuniyuki Iwashima. 2) Add a provably sound, faster and more precise algorithm for tnum_mul() as noted in https://arxiv.org/abs/2105.05398, from Harishankar Vishwanathan. 3) Streamline error reporting changes in libbpf as planned out in the 'libbpf: the road to v1.0' effort, from Andrii Nakryiko. 4) Add broadcast support to xdp_redirect_map(), from Hangbin Liu. 5) Extends bpf_map_lookup_and_delete_elem() functionality to 4 more map types, that is, {LRU_,PERCPU_,LRU_PERCPU_,}HASH, from Denis Salopek. 6) Support new LLVM relocations in libbpf to make them more linker friendly, also add a doc to describe the BPF backend relocations, from Yonghong Song. 7) Silence long standing KUBSAN complaints on register-based shifts in interpreter, from Daniel Borkmann and Eric Biggers. 8) Add dummy PT_REGS macros in libbpf to fail BPF program compilation when target arch cannot be determined, from Lorenz Bauer. 9) Extend AF_XDP to support large umems with 1M+ pages, from Magnus Karlsson. 10) Fix two minor libbpf tc BPF API issues, from Kumar Kartikeya Dwivedi. 11) Move libbpf BPF_SEQ_PRINTF/BPF_SNPRINTF macros that can be used by BPF programs to bpf_helpers.h header, from Florent Revest. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/bpf_inode_storage.c2
-rw-r--r--kernel/bpf/bpf_lsm.c2
-rw-r--r--kernel/bpf/btf.c6
-rw-r--r--kernel/bpf/core.c61
-rw-r--r--kernel/bpf/cpumap.c3
-rw-r--r--kernel/bpf/devmap.c305
-rw-r--r--kernel/bpf/hashtab.c102
-rw-r--r--kernel/bpf/preload/iterators/iterators.bpf.c1
-rw-r--r--kernel/bpf/reuseport_array.c2
-rw-r--r--kernel/bpf/syscall.c47
-rw-r--r--kernel/bpf/tnum.c41
-rw-r--r--kernel/bpf/trampoline.c2
-rw-r--r--kernel/bpf/verifier.c12
13 files changed, 476 insertions, 110 deletions
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index 2921ca39a93e..96ceed0e0fb5 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -72,7 +72,7 @@ void bpf_inode_storage_free(struct inode *inode)
return;
}
- /* Netiher the bpf_prog nor the bpf-map's syscall
+ /* Neither the bpf_prog nor the bpf-map's syscall
* could be modifying the local_storage->list now.
* Thus, no elem can be added-to or deleted-from the
* local_storage->list by the bpf_prog or by the bpf-map's syscall.
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index da471bf01b97..06062370c3b8 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -127,7 +127,7 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
}
/* The set of hooks which are called without pagefaults disabled and are allowed
- * to "sleep" and thus can be used for sleeable BPF programs.
+ * to "sleep" and thus can be used for sleepable BPF programs.
*/
BTF_SET_START(sleepable_lsm_hooks)
BTF_ID(func, bpf_lsm_bpf)
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 3925592d62e3..cb4b72997d9b 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -51,7 +51,7 @@
* The BTF type section contains a list of 'struct btf_type' objects.
* Each one describes a C type. Recall from the above section
* that a 'struct btf_type' object could be immediately followed by extra
- * data in order to desribe some particular C types.
+ * data in order to describe some particular C types.
*
* type_id:
* ~~~~~~~
@@ -1143,7 +1143,7 @@ static void *btf_show_obj_safe(struct btf_show *show,
/*
* We need a new copy to our safe object, either because we haven't
- * yet copied and are intializing safe data, or because the data
+ * yet copied and are initializing safe data, or because the data
* we want falls outside the boundaries of the safe object.
*/
if (!safe) {
@@ -3417,7 +3417,7 @@ static struct btf_kind_operations func_proto_ops = {
* BTF_KIND_FUNC_PROTO cannot be directly referred by
* a struct's member.
*
- * It should be a funciton pointer instead.
+ * It should be a function pointer instead.
* (i.e. struct's member -> BTF_KIND_PTR -> BTF_KIND_FUNC_PROTO)
*
* Hence, there is no btf_func_check_member().
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 5e31ee9f7512..034ad93a1ad7 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1392,29 +1392,54 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn)
select_insn:
goto *jumptable[insn->code];
- /* ALU */
-#define ALU(OPCODE, OP) \
- ALU64_##OPCODE##_X: \
- DST = DST OP SRC; \
- CONT; \
- ALU_##OPCODE##_X: \
- DST = (u32) DST OP (u32) SRC; \
- CONT; \
- ALU64_##OPCODE##_K: \
- DST = DST OP IMM; \
- CONT; \
- ALU_##OPCODE##_K: \
- DST = (u32) DST OP (u32) IMM; \
+ /* Explicitly mask the register-based shift amounts with 63 or 31
+ * to avoid undefined behavior. Normally this won't affect the
+ * generated code, for example, in case of native 64 bit archs such
+ * as x86-64 or arm64, the compiler is optimizing the AND away for
+ * the interpreter. In case of JITs, each of the JIT backends compiles
+ * the BPF shift operations to machine instructions which produce
+ * implementation-defined results in such a case; the resulting
+ * contents of the register may be arbitrary, but program behaviour
+ * as a whole remains defined. In other words, in case of JIT backends,
+ * the AND must /not/ be added to the emitted LSH/RSH/ARSH translation.
+ */
+ /* ALU (shifts) */
+#define SHT(OPCODE, OP) \
+ ALU64_##OPCODE##_X: \
+ DST = DST OP (SRC & 63); \
+ CONT; \
+ ALU_##OPCODE##_X: \
+ DST = (u32) DST OP ((u32) SRC & 31); \
+ CONT; \
+ ALU64_##OPCODE##_K: \
+ DST = DST OP IMM; \
+ CONT; \
+ ALU_##OPCODE##_K: \
+ DST = (u32) DST OP (u32) IMM; \
+ CONT;
+ /* ALU (rest) */
+#define ALU(OPCODE, OP) \
+ ALU64_##OPCODE##_X: \
+ DST = DST OP SRC; \
+ CONT; \
+ ALU_##OPCODE##_X: \
+ DST = (u32) DST OP (u32) SRC; \
+ CONT; \
+ ALU64_##OPCODE##_K: \
+ DST = DST OP IMM; \
+ CONT; \
+ ALU_##OPCODE##_K: \
+ DST = (u32) DST OP (u32) IMM; \
CONT;
-
ALU(ADD, +)
ALU(SUB, -)
ALU(AND, &)
ALU(OR, |)
- ALU(LSH, <<)
- ALU(RSH, >>)
ALU(XOR, ^)
ALU(MUL, *)
+ SHT(LSH, <<)
+ SHT(RSH, >>)
+#undef SHT
#undef ALU
ALU_NEG:
DST = (u32) -DST;
@@ -1439,13 +1464,13 @@ select_insn:
insn++;
CONT;
ALU_ARSH_X:
- DST = (u64) (u32) (((s32) DST) >> SRC);
+ DST = (u64) (u32) (((s32) DST) >> (SRC & 31));
CONT;
ALU_ARSH_K:
DST = (u64) (u32) (((s32) DST) >> IMM);
CONT;
ALU64_ARSH_X:
- (*(s64 *) &DST) >>= SRC;
+ (*(s64 *) &DST) >>= (SRC & 63);
CONT;
ALU64_ARSH_K:
(*(s64 *) &DST) >>= IMM;
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 5dd3e866599a..a1a0c4e791c6 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -601,7 +601,8 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
static int cpu_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
{
- return __bpf_xdp_redirect_map(map, ifindex, flags, __cpu_map_lookup_elem);
+ return __bpf_xdp_redirect_map(map, ifindex, flags, 0,
+ __cpu_map_lookup_elem);
}
static int cpu_map_btf_id;
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index aa516472ce46..2a75e6c2d27d 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -57,6 +57,7 @@ struct xdp_dev_bulk_queue {
struct list_head flush_node;
struct net_device *dev;
struct net_device *dev_rx;
+ struct bpf_prog *xdp_prog;
unsigned int count;
};
@@ -197,6 +198,7 @@ static void dev_map_free(struct bpf_map *map)
list_del_rcu(&dtab->list);
spin_unlock(&dev_map_lock);
+ bpf_clear_redirect_map(map);
synchronize_rcu();
/* Make sure prior __dev_map_entry_free() have completed. */
@@ -326,22 +328,69 @@ bool dev_map_can_have_prog(struct bpf_map *map)
return false;
}
+static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
+ struct xdp_frame **frames, int n,
+ struct net_device *dev)
+{
+ struct xdp_txq_info txq = { .dev = dev };
+ struct xdp_buff xdp;
+ int i, nframes = 0;
+
+ for (i = 0; i < n; i++) {
+ struct xdp_frame *xdpf = frames[i];
+ u32 act;
+ int err;
+
+ xdp_convert_frame_to_buff(xdpf, &xdp);
+ xdp.txq = &txq;
+
+ act = bpf_prog_run_xdp(xdp_prog, &xdp);
+ switch (act) {
+ case XDP_PASS:
+ err = xdp_update_frame_from_buff(&xdp, xdpf);
+ if (unlikely(err < 0))
+ xdp_return_frame_rx_napi(xdpf);
+ else
+ frames[nframes++] = xdpf;
+ break;
+ default:
+ bpf_warn_invalid_xdp_action(act);
+ fallthrough;
+ case XDP_ABORTED:
+ trace_xdp_exception(dev, xdp_prog, act);
+ fallthrough;
+ case XDP_DROP:
+ xdp_return_frame_rx_napi(xdpf);
+ break;
+ }
+ }
+ return nframes; /* sent frames count */
+}
+
static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
{
struct net_device *dev = bq->dev;
+ unsigned int cnt = bq->count;
int sent = 0, err = 0;
+ int to_send = cnt;
int i;
- if (unlikely(!bq->count))
+ if (unlikely(!cnt))
return;
- for (i = 0; i < bq->count; i++) {
+ for (i = 0; i < cnt; i++) {
struct xdp_frame *xdpf = bq->q[i];
prefetch(xdpf);
}
- sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, flags);
+ if (bq->xdp_prog) {
+ to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev);
+ if (!to_send)
+ goto out;
+ }
+
+ sent = dev->netdev_ops->ndo_xdp_xmit(dev, to_send, bq->q, flags);
if (sent < 0) {
/* If ndo_xdp_xmit fails with an errno, no frames have
* been xmit'ed.
@@ -353,13 +402,12 @@ static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
/* If not all frames have been transmitted, it is our
* responsibility to free them
*/
- for (i = sent; unlikely(i < bq->count); i++)
+ for (i = sent; unlikely(i < to_send); i++)
xdp_return_frame_rx_napi(bq->q[i]);
- trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, bq->count - sent, err);
- bq->dev_rx = NULL;
+out:
bq->count = 0;
- __list_del_clearprev(&bq->flush_node);
+ trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, cnt - sent, err);
}
/* __dev_flush is called from xdp_do_flush() which _must_ be signaled
@@ -377,13 +425,17 @@ void __dev_flush(void)
struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
struct xdp_dev_bulk_queue *bq, *tmp;
- list_for_each_entry_safe(bq, tmp, flush_list, flush_node)
+ list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
bq_xmit_all(bq, XDP_XMIT_FLUSH);
+ bq->dev_rx = NULL;
+ bq->xdp_prog = NULL;
+ __list_del_clearprev(&bq->flush_node);
+ }
}
/* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or
- * update happens in parallel here a dev_put wont happen until after reading the
- * ifindex.
+ * update happens in parallel here a dev_put won't happen until after reading
+ * the ifindex.
*/
static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
{
@@ -401,7 +453,7 @@ static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
* Thus, safe percpu variable access.
*/
static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
- struct net_device *dev_rx)
+ struct net_device *dev_rx, struct bpf_prog *xdp_prog)
{
struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq);
@@ -412,18 +464,22 @@ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
/* Ingress dev_rx will be the same for all xdp_frame's in
* bulk_queue, because bq stored per-CPU and must be flushed
* from net_device drivers NAPI func end.
+ *
+ * Do the same with xdp_prog and flush_list since these fields
+ * are only ever modified together.
*/
- if (!bq->dev_rx)
+ if (!bq->dev_rx) {
bq->dev_rx = dev_rx;
+ bq->xdp_prog = xdp_prog;
+ list_add(&bq->flush_node, flush_list);
+ }
bq->q[bq->count++] = xdpf;
-
- if (!bq->flush_node.prev)
- list_add(&bq->flush_node, flush_list);
}
static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
- struct net_device *dev_rx)
+ struct net_device *dev_rx,
+ struct bpf_prog *xdp_prog)
{
struct xdp_frame *xdpf;
int err;
@@ -439,55 +495,115 @@ static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
if (unlikely(!xdpf))
return -EOVERFLOW;
- bq_enqueue(dev, xdpf, dev_rx);
+ bq_enqueue(dev, xdpf, dev_rx, xdp_prog);
return 0;
}
-static struct xdp_buff *dev_map_run_prog(struct net_device *dev,
- struct xdp_buff *xdp,
- struct bpf_prog *xdp_prog)
+int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
+ struct net_device *dev_rx)
{
- struct xdp_txq_info txq = { .dev = dev };
- u32 act;
+ return __xdp_enqueue(dev, xdp, dev_rx, NULL);
+}
- xdp_set_data_meta_invalid(xdp);
- xdp->txq = &txq;
+int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
+ struct net_device *dev_rx)
+{
+ struct net_device *dev = dst->dev;
- act = bpf_prog_run_xdp(xdp_prog, xdp);
- switch (act) {
- case XDP_PASS:
- return xdp;
- case XDP_DROP:
- break;
- default:
- bpf_warn_invalid_xdp_action(act);
- fallthrough;
- case XDP_ABORTED:
- trace_xdp_exception(dev, xdp_prog, act);
- break;
- }
+ return __xdp_enqueue(dev, xdp, dev_rx, dst->xdp_prog);
+}
- xdp_return_buff(xdp);
- return NULL;
+static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_buff *xdp,
+ int exclude_ifindex)
+{
+ if (!obj || obj->dev->ifindex == exclude_ifindex ||
+ !obj->dev->netdev_ops->ndo_xdp_xmit)
+ return false;
+
+ if (xdp_ok_fwd_dev(obj->dev, xdp->data_end - xdp->data))
+ return false;
+
+ return true;
}
-int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
- struct net_device *dev_rx)
+static int dev_map_enqueue_clone(struct bpf_dtab_netdev *obj,
+ struct net_device *dev_rx,
+ struct xdp_frame *xdpf)
{
- return __xdp_enqueue(dev, xdp, dev_rx);
+ struct xdp_frame *nxdpf;
+
+ nxdpf = xdpf_clone(xdpf);
+ if (!nxdpf)
+ return -ENOMEM;
+
+ bq_enqueue(obj->dev, nxdpf, dev_rx, obj->xdp_prog);
+
+ return 0;
}
-int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
- struct net_device *dev_rx)
+int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
+ struct bpf_map *map, bool exclude_ingress)
{
- struct net_device *dev = dst->dev;
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ int exclude_ifindex = exclude_ingress ? dev_rx->ifindex : 0;
+ struct bpf_dtab_netdev *dst, *last_dst = NULL;
+ struct hlist_head *head;
+ struct xdp_frame *xdpf;
+ unsigned int i;
+ int err;
- if (dst->xdp_prog) {
- xdp = dev_map_run_prog(dev, xdp, dst->xdp_prog);
- if (!xdp)
- return 0;
+ xdpf = xdp_convert_buff_to_frame(xdp);
+ if (unlikely(!xdpf))
+ return -EOVERFLOW;
+
+ if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
+ for (i = 0; i < map->max_entries; i++) {
+ dst = READ_ONCE(dtab->netdev_map[i]);
+ if (!is_valid_dst(dst, xdp, exclude_ifindex))
+ continue;
+
+ /* we only need n-1 clones; last_dst enqueued below */
+ if (!last_dst) {
+ last_dst = dst;
+ continue;
+ }
+
+ err = dev_map_enqueue_clone(last_dst, dev_rx, xdpf);
+ if (err)
+ return err;
+
+ last_dst = dst;
+ }
+ } else { /* BPF_MAP_TYPE_DEVMAP_HASH */
+ for (i = 0; i < dtab->n_buckets; i++) {
+ head = dev_map_index_hash(dtab, i);
+ hlist_for_each_entry_rcu(dst, head, index_hlist,
+ lockdep_is_held(&dtab->index_lock)) {
+ if (!is_valid_dst(dst, xdp, exclude_ifindex))
+ continue;
+
+ /* we only need n-1 clones; last_dst enqueued below */
+ if (!last_dst) {
+ last_dst = dst;
+ continue;
+ }
+
+ err = dev_map_enqueue_clone(last_dst, dev_rx, xdpf);
+ if (err)
+ return err;
+
+ last_dst = dst;
+ }
+ }
}
- return __xdp_enqueue(dev, xdp, dev_rx);
+
+ /* consume the last copy of the frame */
+ if (last_dst)
+ bq_enqueue(last_dst->dev, xdpf, dev_rx, last_dst->xdp_prog);
+ else
+ xdp_return_frame_rx_napi(xdpf); /* dtab is empty */
+
+ return 0;
}
int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
@@ -504,6 +620,87 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
return 0;
}
+static int dev_map_redirect_clone(struct bpf_dtab_netdev *dst,
+ struct sk_buff *skb,
+ struct bpf_prog *xdp_prog)
+{
+ struct sk_buff *nskb;
+ int err;
+
+ nskb = skb_clone(skb, GFP_ATOMIC);
+ if (!nskb)
+ return -ENOMEM;
+
+ err = dev_map_generic_redirect(dst, nskb, xdp_prog);
+ if (unlikely(err)) {
+ consume_skb(nskb);
+ return err;
+ }
+
+ return 0;
+}
+
+int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
+ struct bpf_prog *xdp_prog, struct bpf_map *map,
+ bool exclude_ingress)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ int exclude_ifindex = exclude_ingress ? dev->ifindex : 0;
+ struct bpf_dtab_netdev *dst, *last_dst = NULL;
+ struct hlist_head *head;
+ struct hlist_node *next;
+ unsigned int i;
+ int err;
+
+ if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
+ for (i = 0; i < map->max_entries; i++) {
+ dst = READ_ONCE(dtab->netdev_map[i]);
+ if (!dst || dst->dev->ifindex == exclude_ifindex)
+ continue;
+
+ /* we only need n-1 clones; last_dst enqueued below */
+ if (!last_dst) {
+ last_dst = dst;
+ continue;
+ }
+
+ err = dev_map_redirect_clone(last_dst, skb, xdp_prog);
+ if (err)
+ return err;
+
+ last_dst = dst;
+ }
+ } else { /* BPF_MAP_TYPE_DEVMAP_HASH */
+ for (i = 0; i < dtab->n_buckets; i++) {
+ head = dev_map_index_hash(dtab, i);
+ hlist_for_each_entry_safe(dst, next, head, index_hlist) {
+ if (!dst || dst->dev->ifindex == exclude_ifindex)
+ continue;
+
+ /* we only need n-1 clones; last_dst enqueued below */
+ if (!last_dst) {
+ last_dst = dst;
+ continue;
+ }
+
+ err = dev_map_redirect_clone(last_dst, skb, xdp_prog);
+ if (err)
+ return err;
+
+ last_dst = dst;
+ }
+ }
+ }
+
+ /* consume the first skb and return */
+ if (last_dst)
+ return dev_map_generic_redirect(last_dst, skb, xdp_prog);
+
+ /* dtab is empty */
+ consume_skb(skb);
+ return 0;
+}
+
static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
{
struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key);
@@ -730,12 +927,16 @@ static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
static int dev_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
{
- return __bpf_xdp_redirect_map(map, ifindex, flags, __dev_map_lookup_elem);
+ return __bpf_xdp_redirect_map(map, ifindex, flags,
+ BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS,
+ __dev_map_lookup_elem);
}
static int dev_hash_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
{
- return __bpf_xdp_redirect_map(map, ifindex, flags, __dev_map_hash_lookup_elem);
+ return __bpf_xdp_redirect_map(map, ifindex, flags,
+ BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS,
+ __dev_map_hash_lookup_elem);
}
static int dev_map_btf_id;
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index d7ebb12ffffc..6f6681b07364 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -46,12 +46,12 @@
* events, kprobes and tracing to be invoked before the prior invocation
* from one of these contexts completed. sys_bpf() uses the same mechanism
* by pinning the task to the current CPU and incrementing the recursion
- * protection accross the map operation.
+ * protection across the map operation.
*
* This has subtle implications on PREEMPT_RT. PREEMPT_RT forbids certain
* operations like memory allocations (even with GFP_ATOMIC) from atomic
* contexts. This is required because even with GFP_ATOMIC the memory
- * allocator calls into code pathes which acquire locks with long held lock
+ * allocator calls into code paths which acquire locks with long held lock
* sections. To ensure the deterministic behaviour these locks are regular
* spinlocks, which are converted to 'sleepable' spinlocks on RT. The only
* true atomic contexts on an RT kernel are the low level hardware
@@ -1401,6 +1401,100 @@ static void htab_map_seq_show_elem(struct bpf_map *map, void *key,
rcu_read_unlock();
}
+static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
+ void *value, bool is_lru_map,
+ bool is_percpu, u64 flags)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct hlist_nulls_head *head;
+ unsigned long bflags;
+ struct htab_elem *l;
+ u32 hash, key_size;
+ struct bucket *b;
+ int ret;
+
+ key_size = map->key_size;
+
+ hash = htab_map_hash(key, key_size, htab->hashrnd);
+ b = __select_bucket(htab, hash);
+ head = &b->head;
+
+ ret = htab_lock_bucket(htab, b, hash, &bflags);
+ if (ret)
+ return ret;
+
+ l = lookup_elem_raw(head, hash, key, key_size);
+ if (!l) {
+ ret = -ENOENT;
+ } else {
+ if (is_percpu) {
+ u32 roundup_value_size = round_up(map->value_size, 8);
+ void __percpu *pptr;
+ int off = 0, cpu;
+
+ pptr = htab_elem_get_ptr(l, key_size);
+ for_each_possible_cpu(cpu) {
+ bpf_long_memcpy(value + off,
+ per_cpu_ptr(pptr, cpu),
+ roundup_value_size);
+ off += roundup_value_size;
+ }
+ } else {
+ u32 roundup_key_size = round_up(map->key_size, 8);
+
+ if (flags & BPF_F_LOCK)
+ copy_map_value_locked(map, value, l->key +
+ roundup_key_size,
+ true);
+ else
+ copy_map_value(map, value, l->key +
+ roundup_key_size);
+ check_and_init_map_lock(map, value);
+ }
+
+ hlist_nulls_del_rcu(&l->hash_node);
+ if (!is_lru_map)
+ free_htab_elem(htab, l);
+ }
+
+ htab_unlock_bucket(htab, b, hash, bflags);
+
+ if (is_lru_map && l)
+ bpf_lru_push_free(&htab->lru, &l->lru_node);
+
+ return ret;
+}
+
+static int htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
+{
+ return __htab_map_lookup_and_delete_elem(map, key, value, false, false,
+ flags);
+}
+
+static int htab_percpu_map_lookup_and_delete_elem(struct bpf_map *map,
+ void *key, void *value,
+ u64 flags)
+{
+ return __htab_map_lookup_and_delete_elem(map, key, value, false, true,
+ flags);
+}
+
+static int htab_lru_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
+{
+ return __htab_map_lookup_and_delete_elem(map, key, value, true, false,
+ flags);
+}
+
+static int htab_lru_percpu_map_lookup_and_delete_elem(struct bpf_map *map,
+ void *key, void *value,
+ u64 flags)
+{
+ return __htab_map_lookup_and_delete_elem(map, key, value, true, true,
+ flags);
+}
+
static int
__htab_map_lookup_and_delete_batch(struct bpf_map *map,
const union bpf_attr *attr,
@@ -1934,6 +2028,7 @@ const struct bpf_map_ops htab_map_ops = {
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
.map_lookup_elem = htab_map_lookup_elem,
+ .map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem,
.map_update_elem = htab_map_update_elem,
.map_delete_elem = htab_map_delete_elem,
.map_gen_lookup = htab_map_gen_lookup,
@@ -1954,6 +2049,7 @@ const struct bpf_map_ops htab_lru_map_ops = {
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
.map_lookup_elem = htab_lru_map_lookup_elem,
+ .map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem,
.map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys,
.map_update_elem = htab_lru_map_update_elem,
.map_delete_elem = htab_lru_map_delete_elem,
@@ -2077,6 +2173,7 @@ const struct bpf_map_ops htab_percpu_map_ops = {
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
.map_lookup_elem = htab_percpu_map_lookup_elem,
+ .map_lookup_and_delete_elem = htab_percpu_map_lookup_and_delete_elem,
.map_update_elem = htab_percpu_map_update_elem,
.map_delete_elem = htab_map_delete_elem,
.map_seq_show_elem = htab_percpu_map_seq_show_elem,
@@ -2096,6 +2193,7 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = {
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
.map_lookup_elem = htab_lru_percpu_map_lookup_elem,
+ .map_lookup_and_delete_elem = htab_lru_percpu_map_lookup_and_delete_elem,
.map_update_elem = htab_lru_percpu_map_update_elem,
.map_delete_elem = htab_lru_map_delete_elem,
.map_seq_show_elem = htab_percpu_map_seq_show_elem,
diff --git a/kernel/bpf/preload/iterators/iterators.bpf.c b/kernel/bpf/preload/iterators/iterators.bpf.c
index 52aa7b38e8b8..03af863314ea 100644
--- a/kernel/bpf/preload/iterators/iterators.bpf.c
+++ b/kernel/bpf/preload/iterators/iterators.bpf.c
@@ -2,7 +2,6 @@
/* Copyright (c) 2020 Facebook */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
-#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>
#pragma clang attribute push (__attribute__((preserve_access_index)), apply_to = record)
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index 4838922f723d..93a55391791a 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -102,7 +102,7 @@ static void reuseport_array_free(struct bpf_map *map)
/*
* ops->map_*_elem() will not be able to access this
* array now. Hence, this function only races with
- * bpf_sk_reuseport_detach() which was triggerred by
+ * bpf_sk_reuseport_detach() which was triggered by
* close() or disconnect().
*
* This function and bpf_sk_reuseport_detach() are
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 73d15bc62d8c..e343f158e556 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1484,7 +1484,7 @@ free_buf:
return err;
}
-#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD value
+#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD flags
static int map_lookup_and_delete_elem(union bpf_attr *attr)
{
@@ -1500,6 +1500,9 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM))
return -EINVAL;
+ if (attr->flags & ~BPF_F_LOCK)
+ return -EINVAL;
+
f = fdget(ufd);
map = __bpf_map_get(f);
if (IS_ERR(map))
@@ -1510,24 +1513,47 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
goto err_put;
}
+ if (attr->flags &&
+ (map->map_type == BPF_MAP_TYPE_QUEUE ||
+ map->map_type == BPF_MAP_TYPE_STACK)) {
+ err = -EINVAL;
+ goto err_put;
+ }
+
+ if ((attr->flags & BPF_F_LOCK) &&
+ !map_value_has_spin_lock(map)) {
+ err = -EINVAL;
+ goto err_put;
+ }
+
key = __bpf_copy_key(ukey, map->key_size);
if (IS_ERR(key)) {
err = PTR_ERR(key);
goto err_put;
}
- value_size = map->value_size;
+ value_size = bpf_map_value_size(map);
err = -ENOMEM;
value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
if (!value)
goto free_key;
+ err = -ENOTSUPP;
if (map->map_type == BPF_MAP_TYPE_QUEUE ||
map->map_type == BPF_MAP_TYPE_STACK) {
err = map->ops->map_pop_elem(map, value);
- } else {
- err = -ENOTSUPP;
+ } else if (map->map_type == BPF_MAP_TYPE_HASH ||
+ map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_LRU_HASH ||
+ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
+ if (!bpf_map_is_dev_bound(map)) {
+ bpf_disable_instrumentation();
+ rcu_read_lock();
+ err = map->ops->map_lookup_and_delete_elem(map, key, value, attr->flags);
+ rcu_read_unlock();
+ bpf_enable_instrumentation();
+ }
}
if (err)
@@ -1947,6 +1973,11 @@ static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr)
attr->expected_attach_type =
BPF_CGROUP_INET_SOCK_CREATE;
break;
+ case BPF_PROG_TYPE_SK_REUSEPORT:
+ if (!attr->expected_attach_type)
+ attr->expected_attach_type =
+ BPF_SK_REUSEPORT_SELECT;
+ break;
}
}
@@ -2030,6 +2061,14 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
if (expected_attach_type == BPF_SK_LOOKUP)
return 0;
return -EINVAL;
+ case BPF_PROG_TYPE_SK_REUSEPORT:
+ switch (expected_attach_type) {
+ case BPF_SK_REUSEPORT_SELECT:
+ case BPF_SK_REUSEPORT_SELECT_OR_MIGRATE:
+ return 0;
+ default:
+ return -EINVAL;
+ }
case BPF_PROG_TYPE_SYSCALL:
case BPF_PROG_TYPE_EXT:
if (expected_attach_type)
diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c
index ceac5281bd31..3d7127f439a1 100644
--- a/kernel/bpf/tnum.c
+++ b/kernel/bpf/tnum.c
@@ -111,28 +111,31 @@ struct tnum tnum_xor(struct tnum a, struct tnum b)
return TNUM(v & ~mu, mu);
}
-/* half-multiply add: acc += (unknown * mask * value).
- * An intermediate step in the multiply algorithm.
+/* Generate partial products by multiplying each bit in the multiplier (tnum a)
+ * with the multiplicand (tnum b), and add the partial products after
+ * appropriately bit-shifting them. Instead of directly performing tnum addition
+ * on the generated partial products, equivalenty, decompose each partial
+ * product into two tnums, consisting of the value-sum (acc_v) and the
+ * mask-sum (acc_m) and then perform tnum addition on them. The following paper
+ * explains the algorithm in more detail: https://arxiv.org/abs/2105.05398.
*/
-static struct tnum hma(struct tnum acc, u64 value, u64 mask)
-{
- while (mask) {
- if (mask & 1)
- acc = tnum_add(acc, TNUM(0, value));
- mask >>= 1;
- value <<= 1;
- }
- return acc;
-}
-
struct tnum tnum_mul(struct tnum a, struct tnum b)
{
- struct tnum acc;
- u64 pi;
-
- pi = a.value * b.value;
- acc = hma(TNUM(pi, 0), a.mask, b.mask | b.value);
- return hma(acc, b.mask, a.value);
+ u64 acc_v = a.value * b.value;
+ struct tnum acc_m = TNUM(0, 0);
+
+ while (a.value || a.mask) {
+ /* LSB of tnum a is a certain 1 */
+ if (a.value & 1)
+ acc_m = tnum_add(acc_m, TNUM(0, b.mask));
+ /* LSB of tnum a is uncertain */
+ else if (a.mask & 1)
+ acc_m = tnum_add(acc_m, TNUM(0, b.value | b.mask));
+ /* Note: no case for LSB is certain 0 */
+ a = tnum_rshift(a, 1);
+ b = tnum_lshift(b, 1);
+ }
+ return tnum_add(TNUM(acc_v, 0), acc_m);
}
/* Note that if a and b disagree - i.e. one has a 'known 1' where the other has
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 2d44b5aa0057..28a3630c48ee 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -552,7 +552,7 @@ static void notrace inc_misses_counter(struct bpf_prog *prog)
* __bpf_prog_enter returns:
* 0 - skip execution of the bpf prog
* 1 - execute bpf prog
- * [2..MAX_U64] - excute bpf prog and record execution time.
+ * [2..MAX_U64] - execute bpf prog and record execution time.
* This is start time.
*/
u64 notrace __bpf_prog_enter(struct bpf_prog *prog)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 331b170d9fcc..b7d51fc937c7 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -47,7 +47,7 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
* - unreachable insns exist (shouldn't be a forest. program = one function)
* - out of bounds or malformed jumps
* The second pass is all possible path descent from the 1st insn.
- * Since it's analyzing all pathes through the program, the length of the
+ * Since it's analyzing all paths through the program, the length of the
* analysis is limited to 64k insn, which may be hit even if total number of
* insn is less then 4K, but there are too many branches that change stack/regs.
* Number of 'branches to be analyzed' is limited to 1k
@@ -132,7 +132,7 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
* If it's ok, then verifier allows this BPF_CALL insn and looks at
* .ret_type which is RET_PTR_TO_MAP_VALUE_OR_NULL, so it sets
* R0->type = PTR_TO_MAP_VALUE_OR_NULL which means bpf_map_lookup_elem() function
- * returns ether pointer to map value or NULL.
+ * returns either pointer to map value or NULL.
*
* When type PTR_TO_MAP_VALUE_OR_NULL passes through 'if (reg != 0) goto +off'
* insn, the register holding that pointer in the true branch changes state to
@@ -2616,7 +2616,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
if (dst_reg != BPF_REG_FP) {
/* The backtracking logic can only recognize explicit
* stack slot address like [fp - 8]. Other spill of
- * scalar via different register has to be conervative.
+ * scalar via different register has to be conservative.
* Backtrack from here and mark all registers as precise
* that contributed into 'reg' being a constant.
*/
@@ -9059,7 +9059,7 @@ static int check_return_code(struct bpf_verifier_env *env)
!prog->aux->attach_func_proto->type)
return 0;
- /* eBPF calling convetion is such that R0 is used
+ /* eBPF calling convention is such that R0 is used
* to return the value from eBPF program.
* Make sure that it's readable at this time
* of bpf_exit, which means that program wrote
@@ -9850,7 +9850,7 @@ static void clean_verifier_state(struct bpf_verifier_env *env,
* Since the verifier pushes the branch states as it sees them while exploring
* the program the condition of walking the branch instruction for the second
* time means that all states below this branch were already explored and
- * their final liveness markes are already propagated.
+ * their final liveness marks are already propagated.
* Hence when the verifier completes the search of state list in is_state_visited()
* we can call this clean_live_states() function to mark all liveness states
* as REG_LIVE_DONE to indicate that 'parent' pointers of 'struct bpf_reg_state'
@@ -12470,7 +12470,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
prog->aux->max_pkt_offset = MAX_PACKET_OFF;
/* mark bpf_tail_call as different opcode to avoid
- * conditional branch in the interpeter for every normal
+ * conditional branch in the interpreter for every normal
* call and to prevent accidental JITing by JIT compiler
* that doesn't support bpf_tail_call yet
*/