diff options
Diffstat (limited to 'kernel/bpf/memalloc.c')
-rw-r--r-- | kernel/bpf/memalloc.c | 388 |
1 files changed, 293 insertions, 95 deletions
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 0668bcd7c926..9c49ae53deaf 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -98,11 +98,23 @@ struct bpf_mem_cache { int free_cnt; int low_watermark, high_watermark, batch; int percpu_size; + bool draining; + struct bpf_mem_cache *tgt; - struct rcu_head rcu; + /* list of objects to be freed after RCU GP */ struct llist_head free_by_rcu; + struct llist_node *free_by_rcu_tail; struct llist_head waiting_for_gp; + struct llist_node *waiting_for_gp_tail; + struct rcu_head rcu; atomic_t call_rcu_in_progress; + struct llist_head free_llist_extra_rcu; + + /* list of objects to be freed after RCU tasks trace GP */ + struct llist_head free_by_rcu_ttrace; + struct llist_head waiting_for_gp_ttrace; + struct rcu_head rcu_ttrace; + atomic_t call_rcu_ttrace_in_progress; }; struct bpf_mem_caches { @@ -153,59 +165,87 @@ static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c) #endif } +static void inc_active(struct bpf_mem_cache *c, unsigned long *flags) +{ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + /* In RT irq_work runs in per-cpu kthread, so disable + * interrupts to avoid preemption and interrupts and + * reduce the chance of bpf prog executing on this cpu + * when active counter is busy. + */ + local_irq_save(*flags); + /* alloc_bulk runs from irq_work which will not preempt a bpf + * program that does unit_alloc/unit_free since IRQs are + * disabled there. There is no race to increment 'active' + * counter. It protects free_llist from corruption in case NMI + * bpf prog preempted this loop. + */ + WARN_ON_ONCE(local_inc_return(&c->active) != 1); +} + +static void dec_active(struct bpf_mem_cache *c, unsigned long *flags) +{ + local_dec(&c->active); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_restore(*flags); +} + +static void add_obj_to_free_list(struct bpf_mem_cache *c, void *obj) +{ + unsigned long flags; + + inc_active(c, &flags); + __llist_add(obj, &c->free_llist); + c->free_cnt++; + dec_active(c, &flags); +} + /* Mostly runs from irq_work except __init phase. */ -static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node) +static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node, bool atomic) { struct mem_cgroup *memcg = NULL, *old_memcg; - unsigned long flags; + gfp_t gfp; void *obj; int i; - memcg = get_memcg(c); - old_memcg = set_active_memcg(memcg); + gfp = __GFP_NOWARN | __GFP_ACCOUNT; + gfp |= atomic ? GFP_NOWAIT : GFP_KERNEL; + for (i = 0; i < cnt; i++) { /* - * free_by_rcu is only manipulated by irq work refill_work(). - * IRQ works on the same CPU are called sequentially, so it is - * safe to use __llist_del_first() here. If alloc_bulk() is - * invoked by the initial prefill, there will be no running - * refill_work(), so __llist_del_first() is fine as well. - * - * In most cases, objects on free_by_rcu are from the same CPU. - * If some objects come from other CPUs, it doesn't incur any - * harm because NUMA_NO_NODE means the preference for current - * numa node and it is not a guarantee. + * For every 'c' llist_del_first(&c->free_by_rcu_ttrace); is + * done only by one CPU == current CPU. Other CPUs might + * llist_add() and llist_del_all() in parallel. */ - obj = __llist_del_first(&c->free_by_rcu); - if (!obj) { - /* Allocate, but don't deplete atomic reserves that typical - * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc - * will allocate from the current numa node which is what we - * want here. - */ - obj = __alloc(c, node, GFP_NOWAIT | __GFP_NOWARN | __GFP_ACCOUNT); - if (!obj) - break; - } - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - /* In RT irq_work runs in per-cpu kthread, so disable - * interrupts to avoid preemption and interrupts and - * reduce the chance of bpf prog executing on this cpu - * when active counter is busy. - */ - local_irq_save(flags); - /* alloc_bulk runs from irq_work which will not preempt a bpf - * program that does unit_alloc/unit_free since IRQs are - * disabled there. There is no race to increment 'active' - * counter. It protects free_llist from corruption in case NMI - * bpf prog preempted this loop. + obj = llist_del_first(&c->free_by_rcu_ttrace); + if (!obj) + break; + add_obj_to_free_list(c, obj); + } + if (i >= cnt) + return; + + for (; i < cnt; i++) { + obj = llist_del_first(&c->waiting_for_gp_ttrace); + if (!obj) + break; + add_obj_to_free_list(c, obj); + } + if (i >= cnt) + return; + + memcg = get_memcg(c); + old_memcg = set_active_memcg(memcg); + for (; i < cnt; i++) { + /* Allocate, but don't deplete atomic reserves that typical + * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc + * will allocate from the current numa node which is what we + * want here. */ - WARN_ON_ONCE(local_inc_return(&c->active) != 1); - __llist_add(obj, &c->free_llist); - c->free_cnt++; - local_dec(&c->active); - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - local_irq_restore(flags); + obj = __alloc(c, node, gfp); + if (!obj) + break; + add_obj_to_free_list(c, obj); } set_active_memcg(old_memcg); mem_cgroup_put(memcg); @@ -222,20 +262,24 @@ static void free_one(void *obj, bool percpu) kfree(obj); } -static void free_all(struct llist_node *llnode, bool percpu) +static int free_all(struct llist_node *llnode, bool percpu) { struct llist_node *pos, *t; + int cnt = 0; - llist_for_each_safe(pos, t, llnode) + llist_for_each_safe(pos, t, llnode) { free_one(pos, percpu); + cnt++; + } + return cnt; } static void __free_rcu(struct rcu_head *head) { - struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu); + struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu_ttrace); - free_all(llist_del_all(&c->waiting_for_gp), !!c->percpu_size); - atomic_set(&c->call_rcu_in_progress, 0); + free_all(llist_del_all(&c->waiting_for_gp_ttrace), !!c->percpu_size); + atomic_set(&c->call_rcu_ttrace_in_progress, 0); } static void __free_rcu_tasks_trace(struct rcu_head *head) @@ -254,60 +298,128 @@ static void enque_to_free(struct bpf_mem_cache *c, void *obj) struct llist_node *llnode = obj; /* bpf_mem_cache is a per-cpu object. Freeing happens in irq_work. - * Nothing races to add to free_by_rcu list. + * Nothing races to add to free_by_rcu_ttrace list. */ - __llist_add(llnode, &c->free_by_rcu); + llist_add(llnode, &c->free_by_rcu_ttrace); } -static void do_call_rcu(struct bpf_mem_cache *c) +static void do_call_rcu_ttrace(struct bpf_mem_cache *c) { struct llist_node *llnode, *t; - if (atomic_xchg(&c->call_rcu_in_progress, 1)) + if (atomic_xchg(&c->call_rcu_ttrace_in_progress, 1)) { + if (unlikely(READ_ONCE(c->draining))) { + llnode = llist_del_all(&c->free_by_rcu_ttrace); + free_all(llnode, !!c->percpu_size); + } return; + } + + WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp_ttrace)); + llist_for_each_safe(llnode, t, llist_del_all(&c->free_by_rcu_ttrace)) + llist_add(llnode, &c->waiting_for_gp_ttrace); + + if (unlikely(READ_ONCE(c->draining))) { + __free_rcu(&c->rcu_ttrace); + return; + } - WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp)); - llist_for_each_safe(llnode, t, __llist_del_all(&c->free_by_rcu)) - /* There is no concurrent __llist_add(waiting_for_gp) access. - * It doesn't race with llist_del_all either. - * But there could be two concurrent llist_del_all(waiting_for_gp): - * from __free_rcu() and from drain_mem_cache(). - */ - __llist_add(llnode, &c->waiting_for_gp); /* Use call_rcu_tasks_trace() to wait for sleepable progs to finish. * If RCU Tasks Trace grace period implies RCU grace period, free * these elements directly, else use call_rcu() to wait for normal * progs to finish and finally do free_one() on each element. */ - call_rcu_tasks_trace(&c->rcu, __free_rcu_tasks_trace); + call_rcu_tasks_trace(&c->rcu_ttrace, __free_rcu_tasks_trace); } static void free_bulk(struct bpf_mem_cache *c) { + struct bpf_mem_cache *tgt = c->tgt; struct llist_node *llnode, *t; unsigned long flags; int cnt; + WARN_ON_ONCE(tgt->unit_size != c->unit_size); + do { - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - local_irq_save(flags); - WARN_ON_ONCE(local_inc_return(&c->active) != 1); + inc_active(c, &flags); llnode = __llist_del_first(&c->free_llist); if (llnode) cnt = --c->free_cnt; else cnt = 0; - local_dec(&c->active); - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - local_irq_restore(flags); + dec_active(c, &flags); if (llnode) - enque_to_free(c, llnode); + enque_to_free(tgt, llnode); } while (cnt > (c->high_watermark + c->low_watermark) / 2); /* and drain free_llist_extra */ llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra)) - enque_to_free(c, llnode); - do_call_rcu(c); + enque_to_free(tgt, llnode); + do_call_rcu_ttrace(tgt); +} + +static void __free_by_rcu(struct rcu_head *head) +{ + struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu); + struct bpf_mem_cache *tgt = c->tgt; + struct llist_node *llnode; + + llnode = llist_del_all(&c->waiting_for_gp); + if (!llnode) + goto out; + + llist_add_batch(llnode, c->waiting_for_gp_tail, &tgt->free_by_rcu_ttrace); + + /* Objects went through regular RCU GP. Send them to RCU tasks trace */ + do_call_rcu_ttrace(tgt); +out: + atomic_set(&c->call_rcu_in_progress, 0); +} + +static void check_free_by_rcu(struct bpf_mem_cache *c) +{ + struct llist_node *llnode, *t; + unsigned long flags; + + /* drain free_llist_extra_rcu */ + if (unlikely(!llist_empty(&c->free_llist_extra_rcu))) { + inc_active(c, &flags); + llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra_rcu)) + if (__llist_add(llnode, &c->free_by_rcu)) + c->free_by_rcu_tail = llnode; + dec_active(c, &flags); + } + + if (llist_empty(&c->free_by_rcu)) + return; + + if (atomic_xchg(&c->call_rcu_in_progress, 1)) { + /* + * Instead of kmalloc-ing new rcu_head and triggering 10k + * call_rcu() to hit rcutree.qhimark and force RCU to notice + * the overload just ask RCU to hurry up. There could be many + * objects in free_by_rcu list. + * This hint reduces memory consumption for an artificial + * benchmark from 2 Gbyte to 150 Mbyte. + */ + rcu_request_urgent_qs_task(current); + return; + } + + WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp)); + + inc_active(c, &flags); + WRITE_ONCE(c->waiting_for_gp.first, __llist_del_all(&c->free_by_rcu)); + c->waiting_for_gp_tail = c->free_by_rcu_tail; + dec_active(c, &flags); + + if (unlikely(READ_ONCE(c->draining))) { + free_all(llist_del_all(&c->waiting_for_gp), !!c->percpu_size); + atomic_set(&c->call_rcu_in_progress, 0); + } else { + call_rcu_hurry(&c->rcu, __free_by_rcu); + } } static void bpf_mem_refill(struct irq_work *work) @@ -321,9 +433,11 @@ static void bpf_mem_refill(struct irq_work *work) /* irq_work runs on this cpu and kmalloc will allocate * from the current numa node which is what we want here. */ - alloc_bulk(c, c->batch, NUMA_NO_NODE); + alloc_bulk(c, c->batch, NUMA_NO_NODE, true); else if (cnt > c->high_watermark) free_bulk(c); + + check_free_by_rcu(c); } static void notrace irq_work_raise(struct bpf_mem_cache *c) @@ -367,7 +481,7 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu) * prog won't be doing more than 4 map_update_elem from * irq disabled region */ - alloc_bulk(c, c->unit_size <= 256 ? 4 : 1, cpu_to_node(cpu)); + alloc_bulk(c, c->unit_size <= 256 ? 4 : 1, cpu_to_node(cpu), false); } /* When size != 0 bpf_mem_cache for each cpu. @@ -406,6 +520,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) c->unit_size = unit_size; c->objcg = objcg; c->percpu_size = percpu_size; + c->tgt = c; prefill_mem_cache(c, cpu); } ma->cache = pc; @@ -428,6 +543,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) c = &cc->cache[i]; c->unit_size = sizes[i]; c->objcg = objcg; + c->tgt = c; prefill_mem_cache(c, cpu); } } @@ -441,19 +557,57 @@ static void drain_mem_cache(struct bpf_mem_cache *c) /* No progs are using this bpf_mem_cache, but htab_map_free() called * bpf_mem_cache_free() for all remaining elements and they can be in - * free_by_rcu or in waiting_for_gp lists, so drain those lists now. + * free_by_rcu_ttrace or in waiting_for_gp_ttrace lists, so drain those lists now. * - * Except for waiting_for_gp list, there are no concurrent operations + * Except for waiting_for_gp_ttrace list, there are no concurrent operations * on these lists, so it is safe to use __llist_del_all(). */ - free_all(__llist_del_all(&c->free_by_rcu), percpu); - free_all(llist_del_all(&c->waiting_for_gp), percpu); + free_all(llist_del_all(&c->free_by_rcu_ttrace), percpu); + free_all(llist_del_all(&c->waiting_for_gp_ttrace), percpu); free_all(__llist_del_all(&c->free_llist), percpu); free_all(__llist_del_all(&c->free_llist_extra), percpu); + free_all(__llist_del_all(&c->free_by_rcu), percpu); + free_all(__llist_del_all(&c->free_llist_extra_rcu), percpu); + free_all(llist_del_all(&c->waiting_for_gp), percpu); +} + +static void check_mem_cache(struct bpf_mem_cache *c) +{ + WARN_ON_ONCE(!llist_empty(&c->free_by_rcu_ttrace)); + WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp_ttrace)); + WARN_ON_ONCE(!llist_empty(&c->free_llist)); + WARN_ON_ONCE(!llist_empty(&c->free_llist_extra)); + WARN_ON_ONCE(!llist_empty(&c->free_by_rcu)); + WARN_ON_ONCE(!llist_empty(&c->free_llist_extra_rcu)); + WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp)); +} + +static void check_leaked_objs(struct bpf_mem_alloc *ma) +{ + struct bpf_mem_caches *cc; + struct bpf_mem_cache *c; + int cpu, i; + + if (ma->cache) { + for_each_possible_cpu(cpu) { + c = per_cpu_ptr(ma->cache, cpu); + check_mem_cache(c); + } + } + if (ma->caches) { + for_each_possible_cpu(cpu) { + cc = per_cpu_ptr(ma->caches, cpu); + for (i = 0; i < NUM_CACHES; i++) { + c = &cc->cache[i]; + check_mem_cache(c); + } + } + } } static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma) { + check_leaked_objs(ma); free_percpu(ma->cache); free_percpu(ma->caches); ma->cache = NULL; @@ -462,8 +616,8 @@ static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma) static void free_mem_alloc(struct bpf_mem_alloc *ma) { - /* waiting_for_gp lists was drained, but __free_rcu might - * still execute. Wait for it now before we freeing percpu caches. + /* waiting_for_gp[_ttrace] lists were drained, but RCU callbacks + * might still execute. Wait for them. * * rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(), * but rcu_barrier_tasks_trace() and rcu_barrier() below are only used @@ -472,7 +626,8 @@ static void free_mem_alloc(struct bpf_mem_alloc *ma) * rcu_trace_implies_rcu_gp(), it will be OK to skip rcu_barrier() by * using rcu_trace_implies_rcu_gp() as well. */ - rcu_barrier_tasks_trace(); + rcu_barrier(); /* wait for __free_by_rcu */ + rcu_barrier_tasks_trace(); /* wait for __free_rcu */ if (!rcu_trace_implies_rcu_gp()) rcu_barrier(); free_mem_alloc_no_barrier(ma); @@ -498,7 +653,7 @@ static void destroy_mem_alloc(struct bpf_mem_alloc *ma, int rcu_in_progress) return; } - copy = kmalloc(sizeof(*ma), GFP_KERNEL); + copy = kmemdup(ma, sizeof(*ma), GFP_KERNEL); if (!copy) { /* Slow path with inline barrier-s */ free_mem_alloc(ma); @@ -506,10 +661,7 @@ static void destroy_mem_alloc(struct bpf_mem_alloc *ma, int rcu_in_progress) } /* Defer barriers into worker to let the rest of map memory to be freed */ - copy->cache = ma->cache; - ma->cache = NULL; - copy->caches = ma->caches; - ma->caches = NULL; + memset(ma, 0, sizeof(*ma)); INIT_WORK(©->work, free_mem_alloc_deferred); queue_work(system_unbound_wq, ©->work); } @@ -524,17 +676,10 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma) rcu_in_progress = 0; for_each_possible_cpu(cpu) { c = per_cpu_ptr(ma->cache, cpu); - /* - * refill_work may be unfinished for PREEMPT_RT kernel - * in which irq work is invoked in a per-CPU RT thread. - * It is also possible for kernel with - * arch_irq_work_has_interrupt() being false and irq - * work is invoked in timer interrupt. So waiting for - * the completion of irq work to ease the handling of - * concurrency. - */ + WRITE_ONCE(c->draining, true); irq_work_sync(&c->refill_work); drain_mem_cache(c); + rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress); rcu_in_progress += atomic_read(&c->call_rcu_in_progress); } /* objcg is the same across cpus */ @@ -548,8 +693,10 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma) cc = per_cpu_ptr(ma->caches, cpu); for (i = 0; i < NUM_CACHES; i++) { c = &cc->cache[i]; + WRITE_ONCE(c->draining, true); irq_work_sync(&c->refill_work); drain_mem_cache(c); + rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress); rcu_in_progress += atomic_read(&c->call_rcu_in_progress); } } @@ -581,8 +728,10 @@ static void notrace *unit_alloc(struct bpf_mem_cache *c) local_irq_save(flags); if (local_inc_return(&c->active) == 1) { llnode = __llist_del_first(&c->free_llist); - if (llnode) + if (llnode) { cnt = --c->free_cnt; + *(struct bpf_mem_cache **)llnode = c; + } } local_dec(&c->active); local_irq_restore(flags); @@ -606,6 +755,12 @@ static void notrace unit_free(struct bpf_mem_cache *c, void *ptr) BUILD_BUG_ON(LLIST_NODE_SZ > 8); + /* + * Remember bpf_mem_cache that allocated this object. + * The hint is not accurate. + */ + c->tgt = *(struct bpf_mem_cache **)llnode; + local_irq_save(flags); if (local_inc_return(&c->active) == 1) { __llist_add(llnode, &c->free_llist); @@ -627,6 +782,27 @@ static void notrace unit_free(struct bpf_mem_cache *c, void *ptr) irq_work_raise(c); } +static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr) +{ + struct llist_node *llnode = ptr - LLIST_NODE_SZ; + unsigned long flags; + + c->tgt = *(struct bpf_mem_cache **)llnode; + + local_irq_save(flags); + if (local_inc_return(&c->active) == 1) { + if (__llist_add(llnode, &c->free_by_rcu)) + c->free_by_rcu_tail = llnode; + } else { + llist_add(llnode, &c->free_llist_extra_rcu); + } + local_dec(&c->active); + local_irq_restore(flags); + + if (!atomic_read(&c->call_rcu_in_progress)) + irq_work_raise(c); +} + /* Called from BPF program or from sys_bpf syscall. * In both cases migration is disabled. */ @@ -660,6 +836,20 @@ void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr) unit_free(this_cpu_ptr(ma->caches)->cache + idx, ptr); } +void notrace bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr) +{ + int idx; + + if (!ptr) + return; + + idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ)); + if (idx < 0) + return; + + unit_free_rcu(this_cpu_ptr(ma->caches)->cache + idx, ptr); +} + void notrace *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma) { void *ret; @@ -676,6 +866,14 @@ void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr) unit_free(this_cpu_ptr(ma->cache), ptr); } +void notrace bpf_mem_cache_free_rcu(struct bpf_mem_alloc *ma, void *ptr) +{ + if (!ptr) + return; + + unit_free_rcu(this_cpu_ptr(ma->cache), ptr); +} + /* Directly does a kfree() without putting 'ptr' back to the free_llist * for reuse and without waiting for a rcu_tasks_trace gp. * The caller must first go through the rcu_tasks_trace gp for 'ptr' |