-rw-r--r--   include/linux/workqueue.h |   1
-rw-r--r--   kernel/workqueue.c        | 280
2 files changed, 210 insertions(+), 71 deletions(-)
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index a0143dd24430..ac551b8ee7d9 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -335,6 +335,7 @@ enum {
 	 */
 	WQ_POWER_EFFICIENT	= 1 << 7,
 
+	__WQ_DESTROYING		= 1 << 15, /* internal: workqueue is destroying */
 	__WQ_DRAINING		= 1 << 16, /* internal: workqueue is draining */
 	__WQ_ORDERED		= 1 << 17, /* internal: workqueue is ordered */
 	__WQ_LEGACY		= 1 << 18, /* internal: create*_workqueue() */
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 07895deca271..b8b541caed48 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -169,7 +169,9 @@ struct worker_pool {
 
 	struct list_head	idle_list;	/* L: list of idle workers */
 	struct timer_list	idle_timer;	/* L: worker idle timeout */
-	struct timer_list	mayday_timer;	/* L: SOS timer for workers */
+	struct work_struct	idle_cull_work; /* L: worker idle cleanup */
+
+	struct timer_list	mayday_timer;	/* L: SOS timer for workers */
 
 	/* a workers is either on busy_hash or idle_list, or the manager */
 	DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
@@ -177,6 +179,7 @@ struct worker_pool {
 	struct worker		*manager;	/* L: purely informational */
 	struct list_head	workers;	/* A: attached workers */
+	struct list_head	dying_workers;	/* A: workers about to die */
 	struct completion	*detach_completion; /* all workers detached */
 
 	struct ida		worker_ida;	/* worker IDs for task name */
 
@@ -326,7 +329,7 @@ static struct rcuwait manager_wait = __RCUWAIT_INITIALIZER(manager_wait);
 static LIST_HEAD(workqueues);		/* PR: list of all workqueues */
 static bool workqueue_freezing;		/* PL: have wqs started freezing? */
 
-/* PL: allowable cpus for unbound wqs and work items */
+/* PL&A: allowable cpus for unbound wqs and work items */
 static cpumask_var_t wq_unbound_cpumask;
 
 /* CPU where unbound work was last round robin scheduled from this CPU */
@@ -1433,9 +1436,13 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
 
 	lockdep_assert_irqs_disabled();
 
-	/* if draining, only works from the same workqueue are allowed */
-	if (unlikely(wq->flags & __WQ_DRAINING) &&
-	    WARN_ON_ONCE(!is_chained_work(wq)))
+	/*
+	 * For a draining wq, only works from the same workqueue are
+	 * allowed. The __WQ_DESTROYING helps to spot the issue that
+	 * queues a new work item to a wq after destroy_workqueue(wq).
+	 */
+	if (unlikely(wq->flags & (__WQ_DESTROYING | __WQ_DRAINING) &&
+	    WARN_ON_ONCE(!is_chained_work(wq))))
 		return;
 	rcu_read_lock();
 retry:
@@ -1900,7 +1907,7 @@ static void worker_detach_from_pool(struct worker *worker)
 	list_del(&worker->node);
 	worker->pool = NULL;
 
-	if (list_empty(&pool->workers))
+	if (list_empty(&pool->workers) && list_empty(&pool->dying_workers))
 		detach_completion = pool->detach_completion;
 	mutex_unlock(&wq_pool_attach_mutex);
 
@@ -1972,21 +1979,55 @@ fail:
 	return NULL;
 }
 
+static void unbind_worker(struct worker *worker)
+{
+	lockdep_assert_held(&wq_pool_attach_mutex);
+
+	kthread_set_per_cpu(worker->task, -1);
+	if (cpumask_intersects(wq_unbound_cpumask, cpu_active_mask))
+		WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < 0);
+	else
+		WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0);
+}
+
+static void wake_dying_workers(struct list_head *cull_list)
+{
+	struct worker *worker, *tmp;
+
+	list_for_each_entry_safe(worker, tmp, cull_list, entry) {
+		list_del_init(&worker->entry);
+		unbind_worker(worker);
+		/*
+		 * If the worker was somehow already running, then it had to be
+		 * in pool->idle_list when set_worker_dying() happened or we
+		 * wouldn't have gotten here.
+		 *
+		 * Thus, the worker must either have observed the WORKER_DIE
+		 * flag, or have set its state to TASK_IDLE. Either way, the
+		 * below will be observed by the worker and is safe to do
+		 * outside of pool->lock.
+		 */
+		wake_up_process(worker->task);
+	}
+}
+
 /**
- * destroy_worker - destroy a workqueue worker
+ * set_worker_dying - Tag a worker for destruction
  * @worker: worker to be destroyed
+ * @list: transfer worker away from its pool->idle_list and into list
  *
- * Destroy @worker and adjust @pool stats accordingly. The worker should
- * be idle.
+ * Tag @worker for destruction and adjust @pool stats accordingly. The worker
+ * should be idle.
  *
  * CONTEXT:
  * raw_spin_lock_irq(pool->lock).
  */
-static void destroy_worker(struct worker *worker)
+static void set_worker_dying(struct worker *worker, struct list_head *list)
 {
 	struct worker_pool *pool = worker->pool;
 
 	lockdep_assert_held(&pool->lock);
+	lockdep_assert_held(&wq_pool_attach_mutex);
 
 	/* sanity check frenzy */
 	if (WARN_ON(worker->current_work) ||
@@ -1997,34 +2038,94 @@ static void destroy_worker(struct worker *worker)
 	pool->nr_workers--;
 	pool->nr_idle--;
 
-	list_del_init(&worker->entry);
 	worker->flags |= WORKER_DIE;
-	wake_up_process(worker->task);
+
+	list_move(&worker->entry, list);
+	list_move(&worker->node, &pool->dying_workers);
 }
 
+/**
+ * idle_worker_timeout - check if some idle workers can now be deleted.
+ * @t: The pool's idle_timer that just expired
+ *
+ * The timer is armed in worker_enter_idle(). Note that it isn't disarmed in
+ * worker_leave_idle(), as a worker flicking between idle and active while its
+ * pool is at the too_many_workers() tipping point would cause too much timer
+ * housekeeping overhead. Since IDLE_WORKER_TIMEOUT is long enough, we just let
+ * it expire and re-evaluate things from there.
+ */
 static void idle_worker_timeout(struct timer_list *t)
 {
 	struct worker_pool *pool = from_timer(pool, t, idle_timer);
+	bool do_cull = false;
+
+	if (work_pending(&pool->idle_cull_work))
+		return;
 
 	raw_spin_lock_irq(&pool->lock);
 
-	while (too_many_workers(pool)) {
+	if (too_many_workers(pool)) {
 		struct worker *worker;
 		unsigned long expires;
 
 		/* idle_list is kept in LIFO order, check the last one */
 		worker = list_entry(pool->idle_list.prev, struct worker, entry);
 		expires = worker->last_active + IDLE_WORKER_TIMEOUT;
+		do_cull = !time_before(jiffies, expires);
+
+		if (!do_cull)
+			mod_timer(&pool->idle_timer, expires);
+	}
+	raw_spin_unlock_irq(&pool->lock);
+
+	if (do_cull)
+		queue_work(system_unbound_wq, &pool->idle_cull_work);
+}
+
+/**
+ * idle_cull_fn - cull workers that have been idle for too long.
+ * @work: the pool's work for handling these idle workers
+ *
+ * This goes through a pool's idle workers and gets rid of those that have been
+ * idle for at least IDLE_WORKER_TIMEOUT seconds.
+ *
+ * We don't want to disturb isolated CPUs because of a pcpu kworker being
+ * culled, so this also resets worker affinity. This requires a sleepable
+ * context, hence the split between timer callback and work item.
+ */
+static void idle_cull_fn(struct work_struct *work)
+{
+	struct worker_pool *pool = container_of(work, struct worker_pool, idle_cull_work);
+	struct list_head cull_list;
 
+	INIT_LIST_HEAD(&cull_list);
+	/*
+	 * Grabbing wq_pool_attach_mutex here ensures an already-running worker
+	 * cannot proceed beyong worker_detach_from_pool() in its self-destruct
+	 * path. This is required as a previously-preempted worker could run after
+	 * set_worker_dying() has happened but before wake_dying_workers() did.
+	 */
+	mutex_lock(&wq_pool_attach_mutex);
+	raw_spin_lock_irq(&pool->lock);
+
+	while (too_many_workers(pool)) {
+		struct worker *worker;
+		unsigned long expires;
+
+		worker = list_entry(pool->idle_list.prev, struct worker, entry);
+		expires = worker->last_active + IDLE_WORKER_TIMEOUT;
 
 		if (time_before(jiffies, expires)) {
 			mod_timer(&pool->idle_timer, expires);
 			break;
 		}
 
-		destroy_worker(worker);
+		set_worker_dying(worker, &cull_list);
 	}
 
 	raw_spin_unlock_irq(&pool->lock);
+	wake_dying_workers(&cull_list);
+	mutex_unlock(&wq_pool_attach_mutex);
 }
 
 static void send_mayday(struct work_struct *work)
@@ -2388,12 +2489,12 @@ woke_up:
 	/* am I supposed to die? */
 	if (unlikely(worker->flags & WORKER_DIE)) {
 		raw_spin_unlock_irq(&pool->lock);
-		WARN_ON_ONCE(!list_empty(&worker->entry));
 		set_pf_worker(false);
 
 		set_task_comm(worker->task, "kworker/dying");
 		ida_free(&pool->worker_ida, worker->id);
 		worker_detach_from_pool(worker);
+		WARN_ON_ONCE(!list_empty(&worker->entry));
 		kfree(worker);
 		return 0;
 	}
@@ -3462,10 +3563,12 @@ static int init_worker_pool(struct worker_pool *pool)
 	hash_init(pool->busy_hash);
 
 	timer_setup(&pool->idle_timer, idle_worker_timeout, TIMER_DEFERRABLE);
+	INIT_WORK(&pool->idle_cull_work, idle_cull_fn);
 
 	timer_setup(&pool->mayday_timer, pool_mayday_timeout, 0);
 
 	INIT_LIST_HEAD(&pool->workers);
+	INIT_LIST_HEAD(&pool->dying_workers);
 
 	ida_init(&pool->worker_ida);
 	INIT_HLIST_NODE(&pool->hash_node);
@@ -3540,18 +3643,6 @@ static void rcu_free_pool(struct rcu_head *rcu)
 	kfree(pool);
 }
 
-/* This returns with the lock held on success (pool manager is inactive). */
-static bool wq_manager_inactive(struct worker_pool *pool)
-{
-	raw_spin_lock_irq(&pool->lock);
-
-	if (pool->flags & POOL_MANAGER_ACTIVE) {
-		raw_spin_unlock_irq(&pool->lock);
-		return false;
-	}
-	return true;
-}
-
 /**
  * put_unbound_pool - put a worker_pool
  * @pool: worker_pool to put
@@ -3566,8 +3657,11 @@ static bool wq_manager_inactive(struct worker_pool *pool)
 static void put_unbound_pool(struct worker_pool *pool)
 {
 	DECLARE_COMPLETION_ONSTACK(detach_completion);
+	struct list_head cull_list;
 	struct worker *worker;
 
+	INIT_LIST_HEAD(&cull_list);
+
 	lockdep_assert_held(&wq_pool_mutex);
 
 	if (--pool->refcnt)
@@ -3587,20 +3681,38 @@ static void put_unbound_pool(struct worker_pool *pool)
 	 * Become the manager and destroy all workers. This prevents
 	 * @pool's workers from blocking on attach_mutex. We're the last
	 * manager and @pool gets freed with the flag set.
-	 * Because of how wq_manager_inactive() works, we will hold the
-	 * spinlock after a successful wait.
+	 *
+	 * Having a concurrent manager is quite unlikely to happen as we can
+	 * only get here with
+	 *   pwq->refcnt == pool->refcnt == 0
+	 * which implies no work queued to the pool, which implies no worker can
+	 * become the manager. However a worker could have taken the role of
+	 * manager before the refcnts dropped to 0, since maybe_create_worker()
+	 * drops pool->lock
 	 */
-	rcuwait_wait_event(&manager_wait, wq_manager_inactive(pool),
-			   TASK_UNINTERRUPTIBLE);
-	pool->flags |= POOL_MANAGER_ACTIVE;
+	while (true) {
+		rcuwait_wait_event(&manager_wait,
+				   !(pool->flags & POOL_MANAGER_ACTIVE),
+				   TASK_UNINTERRUPTIBLE);
+
+		mutex_lock(&wq_pool_attach_mutex);
+		raw_spin_lock_irq(&pool->lock);
+		if (!(pool->flags & POOL_MANAGER_ACTIVE)) {
+			pool->flags |= POOL_MANAGER_ACTIVE;
+			break;
+		}
+		raw_spin_unlock_irq(&pool->lock);
+		mutex_unlock(&wq_pool_attach_mutex);
+	}
 
 	while ((worker = first_idle_worker(pool)))
-		destroy_worker(worker);
+		set_worker_dying(worker, &cull_list);
 
 	WARN_ON(pool->nr_workers || pool->nr_idle);
 	raw_spin_unlock_irq(&pool->lock);
 
-	mutex_lock(&wq_pool_attach_mutex);
-	if (!list_empty(&pool->workers))
+	wake_dying_workers(&cull_list);
+
+	if (!list_empty(&pool->workers) || !list_empty(&pool->dying_workers))
 		pool->detach_completion = &detach_completion;
 	mutex_unlock(&wq_pool_attach_mutex);
@@ -3609,6 +3721,7 @@ static void put_unbound_pool(struct worker_pool *pool)
 
 	/* shut down the timers */
 	del_timer_sync(&pool->idle_timer);
+	cancel_work_sync(&pool->idle_cull_work);
 	del_timer_sync(&pool->mayday_timer);
 
 	/* RCU protected to allow dereferences from get_work_pool() */
@@ -3952,7 +4065,8 @@ static void apply_wqattrs_cleanup(struct apply_wqattrs_ctx *ctx)
 /* allocate the attrs and pwqs for later installation */
 static struct apply_wqattrs_ctx *
 apply_wqattrs_prepare(struct workqueue_struct *wq,
-		      const struct workqueue_attrs *attrs)
+		      const struct workqueue_attrs *attrs,
+		      const cpumask_var_t unbound_cpumask)
 {
 	struct apply_wqattrs_ctx *ctx;
 	struct workqueue_attrs *new_attrs, *tmp_attrs;
@@ -3968,14 +4082,15 @@ apply_wqattrs_prepare(struct workqueue_struct *wq,
 		goto out_free;
 
 	/*
-	 * Calculate the attrs of the default pwq.
+	 * Calculate the attrs of the default pwq with unbound_cpumask
+	 * which is wq_unbound_cpumask or to set to wq_unbound_cpumask.
 	 * If the user configured cpumask doesn't overlap with the
 	 * wq_unbound_cpumask, we fallback to the wq_unbound_cpumask.
 	 */
 	copy_workqueue_attrs(new_attrs, attrs);
-	cpumask_and(new_attrs->cpumask, new_attrs->cpumask, wq_unbound_cpumask);
+	cpumask_and(new_attrs->cpumask, new_attrs->cpumask, unbound_cpumask);
 	if (unlikely(cpumask_empty(new_attrs->cpumask)))
-		cpumask_copy(new_attrs->cpumask, wq_unbound_cpumask);
+		cpumask_copy(new_attrs->cpumask, unbound_cpumask);
 
 	/*
 	 * We may create multiple pwqs with differing cpumasks. Make a
@@ -4072,7 +4187,7 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
 		wq->flags &= ~__WQ_ORDERED;
 	}
 
-	ctx = apply_wqattrs_prepare(wq, attrs);
+	ctx = apply_wqattrs_prepare(wq, attrs, wq_unbound_cpumask);
 	if (!ctx)
 		return -ENOMEM;
 
@@ -4414,6 +4529,11 @@ void destroy_workqueue(struct workqueue_struct *wq)
 	 */
 	workqueue_sysfs_unregister(wq);
 
+	/* mark the workqueue destruction is in progress */
+	mutex_lock(&wq->mutex);
+	wq->flags |= __WQ_DESTROYING;
+	mutex_unlock(&wq->mutex);
+
 	/* drain it before proceeding with destruction */
 	drain_workqueue(wq);
 
@@ -4709,22 +4829,53 @@ static void pr_cont_pool_info(struct worker_pool *pool)
 	pr_cont(" flags=0x%x nice=%d", pool->flags, pool->attrs->nice);
 }
 
-static void pr_cont_work(bool comma, struct work_struct *work)
+struct pr_cont_work_struct {
+	bool comma;
+	work_func_t func;
+	long ctr;
+};
+
+static void pr_cont_work_flush(bool comma, work_func_t func, struct pr_cont_work_struct *pcwsp)
+{
+	if (!pcwsp->ctr)
+		goto out_record;
+	if (func == pcwsp->func) {
+		pcwsp->ctr++;
+		return;
+	}
+	if (pcwsp->ctr == 1)
+		pr_cont("%s %ps", pcwsp->comma ? "," : "", pcwsp->func);
+	else
+		pr_cont("%s %ld*%ps", pcwsp->comma ? "," : "", pcwsp->ctr, pcwsp->func);
+	pcwsp->ctr = 0;
+out_record:
+	if ((long)func == -1L)
+		return;
+	pcwsp->comma = comma;
+	pcwsp->func = func;
+	pcwsp->ctr = 1;
+}
+
+static void pr_cont_work(bool comma, struct work_struct *work, struct pr_cont_work_struct *pcwsp)
 {
 	if (work->func == wq_barrier_func) {
 		struct wq_barrier *barr;
 
 		barr = container_of(work, struct wq_barrier, work);
 
+		pr_cont_work_flush(comma, (work_func_t)-1, pcwsp);
 		pr_cont("%s BAR(%d)", comma ? "," : "",
 			task_pid_nr(barr->task));
 	} else {
-		pr_cont("%s %ps", comma ? "," : "", work->func);
+		if (!comma)
+			pr_cont_work_flush(comma, (work_func_t)-1, pcwsp);
+		pr_cont_work_flush(comma, work->func, pcwsp);
 	}
 }
 
 static void show_pwq(struct pool_workqueue *pwq)
 {
+	struct pr_cont_work_struct pcws = { .ctr = 0, };
 	struct worker_pool *pool = pwq->pool;
 	struct work_struct *work;
 	struct worker *worker;
@@ -4757,7 +4908,8 @@ static void show_pwq(struct pool_workqueue *pwq)
 			worker->rescue_wq ? "(RESCUER)" : "",
 			worker->current_func);
 		list_for_each_entry(work, &worker->scheduled, entry)
-			pr_cont_work(false, work);
+			pr_cont_work(false, work, &pcws);
+		pr_cont_work_flush(comma, (work_func_t)-1L, &pcws);
 		comma = true;
 	}
 	pr_cont("\n");
@@ -4777,9 +4929,10 @@ static void show_pwq(struct pool_workqueue *pwq)
 			if (get_work_pwq(work) != pwq)
 				continue;
 
-			pr_cont_work(comma, work);
+			pr_cont_work(comma, work, &pcws);
 			comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
 		}
+		pr_cont_work_flush(comma, (work_func_t)-1L, &pcws);
 		pr_cont("\n");
 	}
 
@@ -4788,9 +4941,10 @@ static void show_pwq(struct pool_workqueue *pwq)
 
 		pr_info("    inactive:");
 		list_for_each_entry(work, &pwq->inactive_works, entry) {
-			pr_cont_work(comma, work);
+			pr_cont_work(comma, work, &pcws);
 			comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
 		}
+		pr_cont_work_flush(comma, (work_func_t)-1L, &pcws);
 		pr_cont("\n");
 	}
 }
@@ -5006,13 +5160,8 @@ static void unbind_workers(int cpu)
 
 		raw_spin_unlock_irq(&pool->lock);
 
-		for_each_pool_worker(worker, pool) {
-			kthread_set_per_cpu(worker->task, -1);
-			if (cpumask_intersects(wq_unbound_cpumask, cpu_active_mask))
-				WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < 0);
-			else
-				WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0);
-		}
+		for_each_pool_worker(worker, pool)
+			unbind_worker(worker);
 
 		mutex_unlock(&wq_pool_attach_mutex);
 	}
@@ -5334,7 +5483,7 @@ out_unlock:
 }
 #endif /* CONFIG_FREEZER */
 
-static int workqueue_apply_unbound_cpumask(void)
+static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask)
 {
 	LIST_HEAD(ctxs);
 	int ret = 0;
@@ -5350,7 +5499,7 @@ static int workqueue_apply_unbound_cpumask(void)
 		if (wq->flags & __WQ_ORDERED)
 			continue;
 
-		ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs);
+		ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs, unbound_cpumask);
 		if (!ctx) {
 			ret = -ENOMEM;
 			break;
 		}
@@ -5365,6 +5514,11 @@ static int workqueue_apply_unbound_cpumask(void)
 		apply_wqattrs_cleanup(ctx);
 	}
 
+	if (!ret) {
+		mutex_lock(&wq_pool_attach_mutex);
+		cpumask_copy(wq_unbound_cpumask, unbound_cpumask);
+		mutex_unlock(&wq_pool_attach_mutex);
+	}
 	return ret;
 }
 
@@ -5383,7 +5537,6 @@ static int workqueue_apply_unbound_cpumask(void)
 int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
 {
 	int ret = -EINVAL;
-	cpumask_var_t saved_cpumask;
 
 	/*
 	 * Not excluding isolated cpus on purpose.
@@ -5397,23 +5550,8 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
 			goto out_unlock;
 		}
 
-		if (!zalloc_cpumask_var(&saved_cpumask, GFP_KERNEL)) {
-			ret = -ENOMEM;
-			goto out_unlock;
-		}
-
-		/* save the old wq_unbound_cpumask. */
-		cpumask_copy(saved_cpumask, wq_unbound_cpumask);
-
-		/* update wq_unbound_cpumask at first and apply it to wqs. */
-		cpumask_copy(wq_unbound_cpumask, cpumask);
-		ret = workqueue_apply_unbound_cpumask();
-
-		/* restore the wq_unbound_cpumask when failed. */
-		if (ret < 0)
-			cpumask_copy(wq_unbound_cpumask, saved_cpumask);
+		ret = workqueue_apply_unbound_cpumask(cpumask);
 
-		free_cpumask_var(saved_cpumask);
 out_unlock:
 		apply_wqattrs_unlock();
 	}