diff options
-rw-r--r-- | Documentation/kernel-parameters.txt | 9 | ||||
-rw-r--r-- | include/linux/sched.h | 4 | ||||
-rw-r--r-- | include/linux/workqueue.h | 6 | ||||
-rw-r--r-- | kernel/sched/clock.c | 2 | ||||
-rw-r--r-- | kernel/time/tick-sched.c | 6 | ||||
-rw-r--r-- | kernel/watchdog.c | 18 | ||||
-rw-r--r-- | kernel/workqueue.c | 220 | ||||
-rw-r--r-- | lib/Kconfig.debug | 11 |
8 files changed, 261 insertions, 15 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 1a44a7a7f71d..1a8169ba29e6 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -4140,6 +4140,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted. or other driver-specific files in the Documentation/watchdog/ directory. + workqueue.watchdog_thresh= + If CONFIG_WQ_WATCHDOG is configured, workqueue can + warn stall conditions and dump internal state to + help debugging. 0 disables workqueue stall + detection; otherwise, it's the stall threshold + duration in seconds. The default value is 30 and + it can be updated at runtime by writing to the + corresponding sysfs file. + workqueue.disable_numa By default, all work items queued to unbound workqueues are affine to the NUMA nodes they're diff --git a/include/linux/sched.h b/include/linux/sched.h index 0c0e78102850..4bae8ab3b893 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -377,6 +377,7 @@ extern void scheduler_tick(void); extern void sched_show_task(struct task_struct *p); #ifdef CONFIG_LOCKUP_DETECTOR +extern void touch_softlockup_watchdog_sched(void); extern void touch_softlockup_watchdog(void); extern void touch_softlockup_watchdog_sync(void); extern void touch_all_softlockup_watchdogs(void); @@ -387,6 +388,9 @@ extern unsigned int softlockup_panic; extern unsigned int hardlockup_panic; void lockup_detector_init(void); #else +static inline void touch_softlockup_watchdog_sched(void) +{ +} static inline void touch_softlockup_watchdog(void) { } diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 0197358f1e81..0e32bc71245e 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -618,4 +618,10 @@ static inline int workqueue_sysfs_register(struct workqueue_struct *wq) { return 0; } #endif /* CONFIG_SYSFS */ +#ifdef CONFIG_WQ_WATCHDOG +void wq_watchdog_touch(int cpu); +#else /* CONFIG_WQ_WATCHDOG */ +static inline void wq_watchdog_touch(int cpu) { } +#endif /* CONFIG_WQ_WATCHDOG */ + #endif diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index caf4041f5b0a..bc54e84675da 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -354,7 +354,7 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) return; sched_clock_tick(); - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sched(); } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 99ef0df12807..9cc20af58c76 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -143,7 +143,7 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) * when we go busy again does not account too much ticks. */ if (ts->tick_stopped) { - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sched(); if (is_idle_task(current)) ts->idle_jiffies++; } @@ -430,7 +430,7 @@ static void tick_nohz_update_jiffies(ktime_t now) tick_do_update_jiffies64(now); local_irq_restore(flags); - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sched(); } /* @@ -717,7 +717,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now, int update_cpu_load_nohz(active); calc_load_exit_idle(); - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sched(); /* * Cancel the scheduled timer and restore the tick */ diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 84b5035cb6a5..b3ace6ebbba3 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -20,6 +20,7 @@ #include <linux/smpboot.h> #include <linux/sched/rt.h> #include <linux/tick.h> +#include <linux/workqueue.h> #include <asm/irq_regs.h> #include <linux/kvm_para.h> @@ -225,7 +226,15 @@ static void __touch_watchdog(void) __this_cpu_write(watchdog_touch_ts, get_timestamp()); } -void touch_softlockup_watchdog(void) +/** + * touch_softlockup_watchdog_sched - touch watchdog on scheduler stalls + * + * Call when the scheduler may have stalled for legitimate reasons + * preventing the watchdog task from executing - e.g. the scheduler + * entering idle state. This should only be used for scheduler events. + * Use touch_softlockup_watchdog() for everything else. + */ +void touch_softlockup_watchdog_sched(void) { /* * Preemption can be enabled. It doesn't matter which CPU's timestamp @@ -233,6 +242,12 @@ void touch_softlockup_watchdog(void) */ raw_cpu_write(watchdog_touch_ts, 0); } + +void touch_softlockup_watchdog(void) +{ + touch_softlockup_watchdog_sched(); + wq_watchdog_touch(raw_smp_processor_id()); +} EXPORT_SYMBOL(touch_softlockup_watchdog); void touch_all_softlockup_watchdogs(void) @@ -246,6 +261,7 @@ void touch_all_softlockup_watchdogs(void) */ for_each_watchdog_cpu(cpu) per_cpu(watchdog_touch_ts, cpu) = 0; + wq_watchdog_touch(-1); } #ifdef CONFIG_HARDLOCKUP_DETECTOR diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c579dbab2e36..61a0264e28f9 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -148,6 +148,8 @@ struct worker_pool { int id; /* I: pool ID */ unsigned int flags; /* X: flags */ + unsigned long watchdog_ts; /* L: watchdog timestamp */ + struct list_head worklist; /* L: list of pending works */ int nr_workers; /* L: total number of workers */ @@ -1083,6 +1085,8 @@ static void pwq_activate_delayed_work(struct work_struct *work) struct pool_workqueue *pwq = get_work_pwq(work); trace_workqueue_activate_work(work); + if (list_empty(&pwq->pool->worklist)) + pwq->pool->watchdog_ts = jiffies; move_linked_works(work, &pwq->pool->worklist, NULL); __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); pwq->nr_active++; @@ -1385,6 +1389,8 @@ retry: trace_workqueue_activate_work(work); pwq->nr_active++; worklist = &pwq->pool->worklist; + if (list_empty(worklist)) + pwq->pool->watchdog_ts = jiffies; } else { work_flags |= WORK_STRUCT_DELAYED; worklist = &pwq->delayed_works; @@ -2157,6 +2163,8 @@ recheck: list_first_entry(&pool->worklist, struct work_struct, entry); + pool->watchdog_ts = jiffies; + if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) { /* optimization path, not strictly necessary */ process_one_work(worker, work); @@ -2240,6 +2248,7 @@ repeat: struct pool_workqueue, mayday_node); struct worker_pool *pool = pwq->pool; struct work_struct *work, *n; + bool first = true; __set_current_state(TASK_RUNNING); list_del_init(&pwq->mayday_node); @@ -2256,9 +2265,14 @@ repeat: * process'em. */ WARN_ON_ONCE(!list_empty(scheduled)); - list_for_each_entry_safe(work, n, &pool->worklist, entry) - if (get_work_pwq(work) == pwq) + list_for_each_entry_safe(work, n, &pool->worklist, entry) { + if (get_work_pwq(work) == pwq) { + if (first) + pool->watchdog_ts = jiffies; move_linked_works(work, scheduled, &n); + } + first = false; + } if (!list_empty(scheduled)) { process_scheduled_works(rescuer); @@ -2316,6 +2330,37 @@ repeat: goto repeat; } +/** + * check_flush_dependency - check for flush dependency sanity + * @target_wq: workqueue being flushed + * @target_work: work item being flushed (NULL for workqueue flushes) + * + * %current is trying to flush the whole @target_wq or @target_work on it. + * If @target_wq doesn't have %WQ_MEM_RECLAIM, verify that %current is not + * reclaiming memory or running on a workqueue which doesn't have + * %WQ_MEM_RECLAIM as that can break forward-progress guarantee leading to + * a deadlock. + */ +static void check_flush_dependency(struct workqueue_struct *target_wq, + struct work_struct *target_work) +{ + work_func_t target_func = target_work ? target_work->func : NULL; + struct worker *worker; + + if (target_wq->flags & WQ_MEM_RECLAIM) + return; + + worker = current_wq_worker(); + + WARN_ONCE(current->flags & PF_MEMALLOC, + "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%pf", + current->pid, current->comm, target_wq->name, target_func); + WARN_ONCE(worker && (worker->current_pwq->wq->flags & WQ_MEM_RECLAIM), + "workqueue: WQ_MEM_RECLAIM %s:%pf is flushing !WQ_MEM_RECLAIM %s:%pf", + worker->current_pwq->wq->name, worker->current_func, + target_wq->name, target_func); +} + struct wq_barrier { struct work_struct work; struct completion done; @@ -2525,6 +2570,8 @@ void flush_workqueue(struct workqueue_struct *wq) list_add_tail(&this_flusher.list, &wq->flusher_overflow); } + check_flush_dependency(wq, NULL); + mutex_unlock(&wq->mutex); wait_for_completion(&this_flusher.done); @@ -2697,6 +2744,8 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) pwq = worker->current_pwq; } + check_flush_dependency(pwq->wq, work); + insert_wq_barrier(pwq, barr, work, worker); spin_unlock_irq(&pool->lock); @@ -3069,6 +3118,7 @@ static int init_worker_pool(struct worker_pool *pool) pool->cpu = -1; pool->node = NUMA_NO_NODE; pool->flags |= POOL_DISASSOCIATED; + pool->watchdog_ts = jiffies; INIT_LIST_HEAD(&pool->worklist); INIT_LIST_HEAD(&pool->idle_list); hash_init(pool->busy_hash); @@ -3601,7 +3651,6 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, const struct workqueue_attrs *attrs) { struct apply_wqattrs_ctx *ctx; - int ret = -ENOMEM; /* only unbound workqueues can change attributes */ if (WARN_ON(!(wq->flags & WQ_UNBOUND))) @@ -3612,16 +3661,14 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, return -EINVAL; ctx = apply_wqattrs_prepare(wq, attrs); + if (!ctx) + return -ENOMEM; /* the ctx has been prepared successfully, let's commit it */ - if (ctx) { - apply_wqattrs_commit(ctx); - ret = 0; - } - + apply_wqattrs_commit(ctx); apply_wqattrs_cleanup(ctx); - return ret; + return 0; } /** @@ -4308,7 +4355,9 @@ void show_workqueue_state(void) pr_info("pool %d:", pool->id); pr_cont_pool_info(pool); - pr_cont(" workers=%d", pool->nr_workers); + pr_cont(" hung=%us workers=%d", + jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000, + pool->nr_workers); if (pool->manager) pr_cont(" manager: %d", task_pid_nr(pool->manager->task)); @@ -5167,6 +5216,154 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq) static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { } #endif /* CONFIG_SYSFS */ +/* + * Workqueue watchdog. + * + * Stall may be caused by various bugs - missing WQ_MEM_RECLAIM, illegal + * flush dependency, a concurrency managed work item which stays RUNNING + * indefinitely. Workqueue stalls can be very difficult to debug as the + * usual warning mechanisms don't trigger and internal workqueue state is + * largely opaque. + * + * Workqueue watchdog monitors all worker pools periodically and dumps + * state if some pools failed to make forward progress for a while where + * forward progress is defined as the first item on ->worklist changing. + * + * This mechanism is controlled through the kernel parameter + * "workqueue.watchdog_thresh" which can be updated at runtime through the + * corresponding sysfs parameter file. + */ +#ifdef CONFIG_WQ_WATCHDOG + +static void wq_watchdog_timer_fn(unsigned long data); + +static unsigned long wq_watchdog_thresh = 30; +static struct timer_list wq_watchdog_timer = + TIMER_DEFERRED_INITIALIZER(wq_watchdog_timer_fn, 0, 0); + +static unsigned long wq_watchdog_touched = INITIAL_JIFFIES; +static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES; + +static void wq_watchdog_reset_touched(void) +{ + int cpu; + + wq_watchdog_touched = jiffies; + for_each_possible_cpu(cpu) + per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies; +} + +static void wq_watchdog_timer_fn(unsigned long data) +{ + unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ; + bool lockup_detected = false; + struct worker_pool *pool; + int pi; + + if (!thresh) + return; + + rcu_read_lock(); + + for_each_pool(pool, pi) { + unsigned long pool_ts, touched, ts; + + if (list_empty(&pool->worklist)) + continue; + + /* get the latest of pool and touched timestamps */ + pool_ts = READ_ONCE(pool->watchdog_ts); + touched = READ_ONCE(wq_watchdog_touched); + + if (time_after(pool_ts, touched)) + ts = pool_ts; + else + ts = touched; + + if (pool->cpu >= 0) { + unsigned long cpu_touched = + READ_ONCE(per_cpu(wq_watchdog_touched_cpu, + pool->cpu)); + if (time_after(cpu_touched, ts)) + ts = cpu_touched; + } + + /* did we stall? */ + if (time_after(jiffies, ts + thresh)) { + lockup_detected = true; + pr_emerg("BUG: workqueue lockup - pool"); + pr_cont_pool_info(pool); + pr_cont(" stuck for %us!\n", + jiffies_to_msecs(jiffies - pool_ts) / 1000); + } + } + + rcu_read_unlock(); + + if (lockup_detected) + show_workqueue_state(); + + wq_watchdog_reset_touched(); + mod_timer(&wq_watchdog_timer, jiffies + thresh); +} + +void wq_watchdog_touch(int cpu) +{ + if (cpu >= 0) + per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies; + else + wq_watchdog_touched = jiffies; +} + +static void wq_watchdog_set_thresh(unsigned long thresh) +{ + wq_watchdog_thresh = 0; + del_timer_sync(&wq_watchdog_timer); + + if (thresh) { + wq_watchdog_thresh = thresh; + wq_watchdog_reset_touched(); + mod_timer(&wq_watchdog_timer, jiffies + thresh * HZ); + } +} + +static int wq_watchdog_param_set_thresh(const char *val, + const struct kernel_param *kp) +{ + unsigned long thresh; + int ret; + + ret = kstrtoul(val, 0, &thresh); + if (ret) + return ret; + + if (system_wq) + wq_watchdog_set_thresh(thresh); + else + wq_watchdog_thresh = thresh; + + return 0; +} + +static const struct kernel_param_ops wq_watchdog_thresh_ops = { + .set = wq_watchdog_param_set_thresh, + .get = param_get_ulong, +}; + +module_param_cb(watchdog_thresh, &wq_watchdog_thresh_ops, &wq_watchdog_thresh, + 0644); + +static void wq_watchdog_init(void) +{ + wq_watchdog_set_thresh(wq_watchdog_thresh); +} + +#else /* CONFIG_WQ_WATCHDOG */ + +static inline void wq_watchdog_init(void) { } + +#endif /* CONFIG_WQ_WATCHDOG */ + static void __init wq_numa_init(void) { cpumask_var_t *tbl; @@ -5290,6 +5487,9 @@ static int __init init_workqueues(void) !system_unbound_wq || !system_freezable_wq || !system_power_efficient_wq || !system_freezable_power_efficient_wq); + + wq_watchdog_init(); + return 0; } early_initcall(init_workqueues); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 8c15b29d5adc..3048bf5b729a 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -812,6 +812,17 @@ config BOOTPARAM_HUNG_TASK_PANIC_VALUE default 0 if !BOOTPARAM_HUNG_TASK_PANIC default 1 if BOOTPARAM_HUNG_TASK_PANIC +config WQ_WATCHDOG + bool "Detect Workqueue Stalls" + depends on DEBUG_KERNEL + help + Say Y here to enable stall detection on workqueues. If a + worker pool doesn't make forward progress on a pending work + item for over a given amount of time, 30s by default, a + warning message is printed along with dump of workqueue + state. This can be configured through kernel parameter + "workqueue.watchdog_thresh" and its sysfs counterpart. + endmenu # "Debug lockups and hangs" config PANIC_ON_OOPS |