// SPDX-License-Identifier: GPL-2.0
#define pr_fmt(fmt) "%s() " fmt "\n", __func__

#include <linux/generic-radix-tree.h>
#include <linux/mm.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/srcu.h>
#include <linux/vmalloc.h>

#include "rcu_pending.h"
#include "darray.h"
#include "util.h"

#define static_array_for_each(_a, _i)			\
	for (typeof(&(_a)[0]) _i = _a;			\
	     _i < (_a) + ARRAY_SIZE(_a);		\
	     _i++)

enum rcu_pending_special {
	RCU_PENDING_KVFREE	= 1,
	RCU_PENDING_CALL_RCU	= 2,
};

#define RCU_PENDING_KVFREE_FN		((rcu_pending_process_fn) (ulong) RCU_PENDING_KVFREE)
#define RCU_PENDING_CALL_RCU_FN		((rcu_pending_process_fn) (ulong) RCU_PENDING_CALL_RCU)

static inline unsigned long __get_state_synchronize_rcu(struct srcu_struct *ssp)
{
	return ssp
		? get_state_synchronize_srcu(ssp)
		: get_state_synchronize_rcu();
}

static inline unsigned long __start_poll_synchronize_rcu(struct srcu_struct *ssp)
{
	return ssp
		? start_poll_synchronize_srcu(ssp)
		: start_poll_synchronize_rcu();
}

static inline bool __poll_state_synchronize_rcu(struct srcu_struct *ssp, unsigned long cookie)
{
	return ssp
		? poll_state_synchronize_srcu(ssp, cookie)
		: poll_state_synchronize_rcu(cookie);
}

static inline void __rcu_barrier(struct srcu_struct *ssp)
{
	return ssp
		? srcu_barrier(ssp)
		: rcu_barrier();
}

static inline void __call_rcu(struct srcu_struct *ssp, struct rcu_head *rhp,
			      rcu_callback_t func)
{
	if (ssp)
		call_srcu(ssp, rhp, func);
	else
		call_rcu(rhp, func);
}

struct rcu_pending_seq {
	/*
	 * We're using a radix tree like a vector - we're just pushing elements
	 * onto the end; we're using a radix tree instead of an actual vector to
	 * avoid reallocation overhead
	 */
	GENRADIX(struct rcu_head *)	objs;
	size_t				nr;
	struct rcu_head			**cursor;
	unsigned long			seq;
};

struct rcu_pending_list {
	struct rcu_head		*head;
	struct rcu_head		*tail;
	unsigned long		seq;
};

struct rcu_pending_pcpu {
	struct rcu_pending	*parent;
	spinlock_t		lock;
	int			cpu;

	/*
	 * We can't bound the number of unprocessed gp sequence numbers, and we
	 * can't efficiently merge radix trees for expired grace periods, so we
	 * need darray/vector:
	 */
	DARRAY_PREALLOCATED(struct rcu_pending_seq, 4) objs;

	/* Third entry is for expired objects: */
	struct rcu_pending_list	lists[NUM_ACTIVE_RCU_POLL_OLDSTATE + 1];

	struct rcu_head		cb;
	bool			cb_armed;
	struct work_struct	work;
};

static bool __rcu_pending_has_pending(struct rcu_pending_pcpu *p)
{
	if (p->objs.nr)
		return true;

	static_array_for_each(p->lists, i)
		if (i->head)
			return true;

	return false;
}

static void rcu_pending_list_merge(struct rcu_pending_list *l1,
				   struct rcu_pending_list *l2)
{
#ifdef __KERNEL__
	if (!l1->head)
		l1->head = l2->head;
	else
		l1->tail->next = l2->head;
#else
	if (!l1->head)
		l1->head = l2->head;
	else
		l1->tail->next.next = (void *) l2->head;
#endif

	l1->tail = l2->tail;
	l2->head = l2->tail = NULL;
}

static void rcu_pending_list_add(struct rcu_pending_list *l,
				 struct rcu_head *n)
{
#ifdef __KERNEL__
	if (!l->head)
		l->head = n;
	else
		l->tail->next = n;
	l->tail = n;
	n->next = NULL;
#else
	if (!l->head)
		l->head = n;
	else
		l->tail->next.next = (void *) n;
	l->tail = n;
	n->next.next = NULL;
#endif
}

static void merge_expired_lists(struct rcu_pending_pcpu *p)
{
	struct rcu_pending_list *expired = &p->lists[NUM_ACTIVE_RCU_POLL_OLDSTATE];

	for (struct rcu_pending_list *i = p->lists; i < expired; i++)
		if (i->head && __poll_state_synchronize_rcu(p->parent->srcu, i->seq))
			rcu_pending_list_merge(expired, i);
}

#ifndef __KERNEL__
static inline void kfree_bulk(size_t nr, void ** p)
{
	while (nr--)
		kfree(*p);
}

#define local_irq_save(flags)		\
do {					\
	flags = 0;			\
} while (0)
#endif
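/*
 * Illustrative sketch, not used by this file: the "radix tree as a vector"
 * idea from struct rcu_pending_seq above, in isolation. Elements are only
 * ever appended at index nr, so a genradix gives push-back semantics without
 * the reallocation/copy cost of growing a flat array. Names below are local
 * to the example:
 *
 *	GENRADIX(struct rcu_head *) objs;
 *	size_t nr = 0;
 *
 *	genradix_init(&objs);
 *
 *	struct rcu_head **slot = genradix_ptr_alloc(&objs, nr, GFP_KERNEL);
 *	if (slot) {
 *		*slot = obj;		// push onto the end
 *		nr++;
 *	}
 *
 *	genradix_free(&objs);
 */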
static noinline void __process_finished_items(struct rcu_pending *pending,
					      struct rcu_pending_pcpu *p,
					      unsigned long flags)
{
	struct rcu_pending_list *expired = &p->lists[NUM_ACTIVE_RCU_POLL_OLDSTATE];
	struct rcu_pending_seq objs = {};
	struct rcu_head *list = NULL;

	if (p->objs.nr &&
	    __poll_state_synchronize_rcu(pending->srcu, p->objs.data[0].seq)) {
		objs = p->objs.data[0];
		darray_remove_item(&p->objs, p->objs.data);
	}

	merge_expired_lists(p);

	list = expired->head;
	expired->head = expired->tail = NULL;

	spin_unlock_irqrestore(&p->lock, flags);

	switch ((ulong) pending->process) {
	case RCU_PENDING_KVFREE:
		for (size_t i = 0; i < objs.nr; ) {
			size_t nr_this_node = min(GENRADIX_NODE_SIZE / sizeof(void *), objs.nr - i);

			kfree_bulk(nr_this_node, (void **) genradix_ptr(&objs.objs, i));
			i += nr_this_node;
		}
		genradix_free(&objs.objs);

		while (list) {
			struct rcu_head *obj = list;
#ifdef __KERNEL__
			list = obj->next;
#else
			list = (void *) obj->next.next;
#endif

			/*
			 * low bit of pointer indicates whether rcu_head needs
			 * to be freed - kvfree_rcu_mightsleep()
			 */
			BUILD_BUG_ON(ARCH_SLAB_MINALIGN == 0);

			void *ptr = (void *)(((unsigned long) obj->func) & ~1UL);
			bool free_head = ((unsigned long) obj->func) & 1UL;

			kvfree(ptr);
			if (free_head)
				kfree(obj);
		}

		break;

	case RCU_PENDING_CALL_RCU:
		for (size_t i = 0; i < objs.nr; i++) {
			struct rcu_head *obj = *genradix_ptr(&objs.objs, i);
			obj->func(obj);
		}
		genradix_free(&objs.objs);

		while (list) {
			struct rcu_head *obj = list;
#ifdef __KERNEL__
			list = obj->next;
#else
			list = (void *) obj->next.next;
#endif
			obj->func(obj);
		}

		break;

	default:
		for (size_t i = 0; i < objs.nr; i++)
			pending->process(pending, *genradix_ptr(&objs.objs, i));
		genradix_free(&objs.objs);

		while (list) {
			struct rcu_head *obj = list;
#ifdef __KERNEL__
			list = obj->next;
#else
			list = (void *) obj->next.next;
#endif
			pending->process(pending, obj);
		}

		break;
	}
}

static bool process_finished_items(struct rcu_pending *pending,
				   struct rcu_pending_pcpu *p,
				   unsigned long flags)
{
	/*
	 * XXX: we should grab the gp seq once and avoid multiple function
	 * calls, this is called from __rcu_pending_enqueue() fastpath in
	 * may_sleep==true mode
	 */
	if ((p->objs.nr && __poll_state_synchronize_rcu(pending->srcu, p->objs.data[0].seq)) ||
	    (p->lists[0].head && __poll_state_synchronize_rcu(pending->srcu, p->lists[0].seq)) ||
	    (p->lists[1].head && __poll_state_synchronize_rcu(pending->srcu, p->lists[1].seq)) ||
	    p->lists[2].head) {
		__process_finished_items(pending, p, flags);
		return true;
	}

	return false;
}

static void rcu_pending_work(struct work_struct *work)
{
	struct rcu_pending_pcpu *p =
		container_of(work, struct rcu_pending_pcpu, work);
	struct rcu_pending *pending = p->parent;
	unsigned long flags;

	do {
		spin_lock_irqsave(&p->lock, flags);
	} while (process_finished_items(pending, p, flags));

	spin_unlock_irqrestore(&p->lock, flags);
}

static void rcu_pending_rcu_cb(struct rcu_head *rcu)
{
	struct rcu_pending_pcpu *p = container_of(rcu, struct rcu_pending_pcpu, cb);

	schedule_work_on(p->cpu, &p->work);

	unsigned long flags;
	spin_lock_irqsave(&p->lock, flags);
	if (__rcu_pending_has_pending(p)) {
		spin_unlock_irqrestore(&p->lock, flags);
		__call_rcu(p->parent->srcu, &p->cb, rcu_pending_rcu_cb);
	} else {
		p->cb_armed = false;
		spin_unlock_irqrestore(&p->lock, flags);
	}
}

static __always_inline struct rcu_pending_seq *
get_object_radix(struct rcu_pending_pcpu *p, unsigned long seq)
{
	darray_for_each_reverse(p->objs, objs)
		if (objs->seq == seq)
			return objs;

	if (darray_push_gfp(&p->objs, ((struct rcu_pending_seq) { .seq = seq }), GFP_ATOMIC))
		return NULL;

	return &darray_last(p->objs);
}
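/*
 * Illustrative sketch, not used by this file: the polled grace-period API
 * that the __poll_state_synchronize_rcu() checks above are built on. A
 * cookie captured before (or when starting) a grace period can later be
 * tested cheaply, without blocking:
 *
 *	unsigned long cookie = start_poll_synchronize_rcu();
 *
 *	...
 *
 *	if (poll_state_synchronize_rcu(cookie)) {
 *		// a full grace period has elapsed since the cookie was
 *		// taken: objects tagged with it are safe to process/free
 *	}
 *
 * get_state_synchronize_rcu() returns the same kind of cookie without
 * forcing a new grace period to start - which is why the enqueue path below
 * may still need to arm a callback or call __start_poll_synchronize_rcu().
 */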
static noinline bool
rcu_pending_enqueue_list(struct rcu_pending_pcpu *p, unsigned long seq,
			 struct rcu_head *head, void *ptr,
			 unsigned long *flags)
{
	if (ptr) {
		if (!head) {
			/*
			 * kvfree_rcu_mightsleep(): we weren't passed an
			 * rcu_head, but we need one: use the low bit of the
			 * pointer to free to flag that the head needs to be
			 * freed as well:
			 */
			ptr = (void *)(((unsigned long) ptr)|1UL);
			head = kmalloc(sizeof(*head), __GFP_NOWARN);
			if (!head) {
				spin_unlock_irqrestore(&p->lock, *flags);
				head = kmalloc(sizeof(*head), GFP_KERNEL|__GFP_NOFAIL);
				/*
				 * dropped lock, did GFP_KERNEL allocation,
				 * check for gp expiration
				 */
				if (unlikely(__poll_state_synchronize_rcu(p->parent->srcu, seq))) {
					kvfree(--ptr);
					kfree(head);
					spin_lock_irqsave(&p->lock, *flags);
					return false;
				}
			}
		}

		head->func = ptr;
	}
again:
	for (struct rcu_pending_list *i = p->lists;
	     i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) {
		if (i->seq == seq) {
			rcu_pending_list_add(i, head);
			return false;
		}
	}

	for (struct rcu_pending_list *i = p->lists;
	     i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) {
		if (!i->head) {
			i->seq = seq;
			rcu_pending_list_add(i, head);
			return true;
		}
	}

	merge_expired_lists(p);
	goto again;
}
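/*
 * Illustrative sketch, not a separate code path: the low-bit tagging used for
 * kvfree_rcu_mightsleep(). The allocation being freed is aligned to more than
 * a byte, so bit 0 of its pointer is always clear and can carry "this
 * rcu_head was allocated here and must be freed too":
 *
 *	// encode (rcu_pending_enqueue_list() above):
 *	head->func = (void *)((unsigned long) ptr | 1UL);
 *
 *	// decode (__process_finished_items()):
 *	void *ptr      = (void *)((unsigned long) head->func & ~1UL);
 *	bool free_head = ((unsigned long) head->func) & 1UL;
 *
 *	kvfree(ptr);
 *	if (free_head)
 *		kfree(head);
 */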
/*
 * __rcu_pending_enqueue: enqueue a pending RCU item, to be processed (via
 * pending->process) once a grace period elapses.
 *
 * Attempt to enqueue items onto a radix tree; if memory allocation fails, fall
 * back to a linked list.
 *
 * - If @ptr is NULL, we're enqueuing an item for a generic @pending with a
 *   process callback
 *
 * - If @ptr and @head are both not NULL, we're kvfree_rcu()
 *
 * - If @ptr is not NULL and @head is, we're kvfree_rcu_mightsleep()
 *
 * - If @may_sleep is true, will do GFP_KERNEL memory allocations and process
 *   expired items.
 */
static __always_inline void
__rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *head,
		      void *ptr, bool may_sleep)
{
	struct rcu_pending_pcpu *p;
	struct rcu_pending_seq *objs;
	struct genradix_node *new_node = NULL;
	unsigned long seq, flags;
	bool start_gp = false;

	BUG_ON((ptr != NULL) != (pending->process == RCU_PENDING_KVFREE_FN));

	local_irq_save(flags);
	p = this_cpu_ptr(pending->p);
	spin_lock(&p->lock);
	seq = __get_state_synchronize_rcu(pending->srcu);
restart:
	if (may_sleep &&
	    unlikely(process_finished_items(pending, p, flags)))
		goto check_expired;

	/*
	 * In kvfree_rcu() mode, the radix tree is only for slab pointers so
	 * that we can do kfree_bulk() - vmalloc pointers always use the linked
	 * list:
	 */
	if (ptr && unlikely(is_vmalloc_addr(ptr)))
		goto list_add;

	objs = get_object_radix(p, seq);
	if (unlikely(!objs))
		goto list_add;

	if (unlikely(!objs->cursor)) {
		/*
		 * New radix tree nodes must be added under @p->lock because the
		 * tree root is in a darray that can be resized (typically,
		 * genradix supports concurrent unlocked allocation of new
		 * nodes) - hence preallocation and the retry loop:
		 */
		objs->cursor = genradix_ptr_alloc_preallocated_inlined(&objs->objs,
						objs->nr, &new_node, GFP_ATOMIC|__GFP_NOWARN);
		if (unlikely(!objs->cursor)) {
			if (may_sleep) {
				spin_unlock_irqrestore(&p->lock, flags);

				gfp_t gfp = GFP_KERNEL;
				if (!head)
					gfp |= __GFP_NOFAIL;

				new_node = genradix_alloc_node(gfp);
				if (!new_node)
					may_sleep = false;
				goto check_expired;
			}
list_add:
			start_gp = rcu_pending_enqueue_list(p, seq, head, ptr, &flags);
			goto start_gp;
		}
	}

	*objs->cursor++ = ptr ?: head;
	/* zero cursor if we hit the end of a radix tree node: */
	if (!(((ulong) objs->cursor) & (GENRADIX_NODE_SIZE - 1)))
		objs->cursor = NULL;
	start_gp = !objs->nr;
	objs->nr++;
start_gp:
	if (unlikely(start_gp)) {
		/*
		 * We only have one callback (ideally, we would have one for
		 * every outstanding grace period) - so if our callback is
		 * already in flight, we may still have to start a grace period
		 * (since we used get_state() above, not start_poll())
		 */
		if (!p->cb_armed) {
			p->cb_armed = true;
			__call_rcu(pending->srcu, &p->cb, rcu_pending_rcu_cb);
		} else {
			__start_poll_synchronize_rcu(pending->srcu);
		}
	}
	spin_unlock_irqrestore(&p->lock, flags);
free_node:
	if (new_node)
		genradix_free_node(new_node);
	return;
check_expired:
	if (unlikely(__poll_state_synchronize_rcu(pending->srcu, seq))) {
		switch ((ulong) pending->process) {
		case RCU_PENDING_KVFREE:
			kvfree(ptr);
			break;
		case RCU_PENDING_CALL_RCU:
			head->func(head);
			break;
		default:
			pending->process(pending, head);
			break;
		}
		goto free_node;
	}

	local_irq_save(flags);
	p = this_cpu_ptr(pending->p);
	spin_lock(&p->lock);
	goto restart;
}

void rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *obj)
{
	__rcu_pending_enqueue(pending, obj, NULL, true);
}

static struct rcu_head *rcu_pending_pcpu_dequeue(struct rcu_pending_pcpu *p)
{
	struct rcu_head *ret = NULL;

	spin_lock_irq(&p->lock);
	darray_for_each(p->objs, objs)
		if (objs->nr) {
			ret = *genradix_ptr(&objs->objs, --objs->nr);
			objs->cursor = NULL;
			if (!objs->nr)
				genradix_free(&objs->objs);
			goto out;
		}

	static_array_for_each(p->lists, i)
		if (i->head) {
			ret = i->head;
#ifdef __KERNEL__
			i->head = ret->next;
#else
			i->head = (void *) ret->next.next;
#endif
			if (!i->head)
				i->tail = NULL;
			goto out;
		}
out:
	spin_unlock_irq(&p->lock);

	return ret;
}

struct rcu_head *rcu_pending_dequeue(struct rcu_pending *pending)
{
	return rcu_pending_pcpu_dequeue(raw_cpu_ptr(pending->p));
}

struct rcu_head *rcu_pending_dequeue_from_all(struct rcu_pending *pending)
{
	struct rcu_head *ret = rcu_pending_dequeue(pending);

	if (ret)
		return ret;

	int cpu;
	for_each_possible_cpu(cpu) {
		ret = rcu_pending_pcpu_dequeue(per_cpu_ptr(pending->p, cpu));
		if (ret)
			break;
	}
	return ret;
}

static bool rcu_pending_has_pending_or_armed(struct rcu_pending *pending)
{
	int cpu;
	for_each_possible_cpu(cpu) {
		struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);

		spin_lock_irq(&p->lock);
		if (__rcu_pending_has_pending(p) || p->cb_armed) {
			spin_unlock_irq(&p->lock);
			return true;
		}
		spin_unlock_irq(&p->lock);
	}

	return false;
}

void rcu_pending_exit(struct rcu_pending *pending)
{
	int cpu;

	if (!pending->p)
		return;

	while (rcu_pending_has_pending_or_armed(pending)) {
		__rcu_barrier(pending->srcu);

		for_each_possible_cpu(cpu) {
			struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
			flush_work(&p->work);
		}
	}

	for_each_possible_cpu(cpu) {
		struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
		flush_work(&p->work);
	}

	for_each_possible_cpu(cpu) {
		struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);

		static_array_for_each(p->lists, i)
			WARN_ON(i->head);
		WARN_ON(p->objs.nr);
		darray_exit(&p->objs);
	}
	free_percpu(pending->p);
}

/**
 * rcu_pending_init: - initialize a rcu_pending
 *
 * @pending:	Object to init
 * @srcu:	May optionally be used with an srcu_struct; if NULL, uses normal
 *		RCU flavor
 * @process:	Callback function invoked on objects once their RCU barriers
 *		have completed; if NULL, kvfree() is used.
 */
int rcu_pending_init(struct rcu_pending *pending,
		     struct srcu_struct *srcu,
		     rcu_pending_process_fn process)
{
	pending->p = alloc_percpu(struct rcu_pending_pcpu);
	if (!pending->p)
		return -ENOMEM;

	int cpu;
	for_each_possible_cpu(cpu) {
		struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
		p->parent	= pending;
		p->cpu		= cpu;
		spin_lock_init(&p->lock);
		darray_init(&p->objs);
		INIT_WORK(&p->work, rcu_pending_work);
	}

	pending->srcu = srcu;
	pending->process = process;

	return 0;
}
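
/*
 * Illustrative usage sketch (hypothetical caller, not part of this file):
 * a subsystem embeds a struct rcu_head in its objects, initializes a
 * struct rcu_pending with a process callback, and enqueues objects it wants
 * handled only after a grace period; rcu_pending_exit() flushes everything
 * on teardown:
 *
 *	struct my_obj {				// hypothetical type
 *		struct rcu_head	rcu;
 *		...
 *	};
 *
 *	static void my_obj_process(struct rcu_pending *pending,
 *				   struct rcu_head *rcu)
 *	{
 *		kfree(container_of(rcu, struct my_obj, rcu));
 *	}
 *
 *	struct rcu_pending my_pending;
 *
 *	ret = rcu_pending_init(&my_pending, NULL, my_obj_process);
 *	...
 *	rcu_pending_enqueue(&my_pending, &obj->rcu);	// processed after a GP
 *	...
 *	rcu_pending_exit(&my_pending);
 *
 * Passing an srcu_struct instead of NULL makes the same machinery track SRCU
 * grace periods for that domain.
 */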