From 265f3ed077036f053981f5eea0b5b43e7c5b39ff Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 24 Sep 2023 17:07:02 +0200 Subject: workqueue: Provide one lock class key per work_on_cpu() callsite All callers of work_on_cpu() share the same lock class key for all the functions queued. As a result the workqueue related locking scenario for a function A may be spuriously accounted as an inversion against the locking scenario of function B such as in the following model: long A(void *arg) { mutex_lock(&mutex); mutex_unlock(&mutex); } long B(void *arg) { } void launchA(void) { work_on_cpu(0, A, NULL); } void launchB(void) { mutex_lock(&mutex); work_on_cpu(1, B, NULL); mutex_unlock(&mutex); } launchA and launchB running concurrently have no chance to deadlock. However the above can be reported by lockdep as a possible locking inversion because the works containing A() and B() are treated as belonging to the same locking class. The following shows an existing example of such a spurious lockdep splat: ====================================================== WARNING: possible circular locking dependency detected 6.6.0-rc1-00065-g934ebd6e5359 #35409 Not tainted ------------------------------------------------------ kworker/0:1/9 is trying to acquire lock: ffffffff9bc72f30 (cpu_hotplug_lock){++++}-{0:0}, at: _cpu_down+0x57/0x2b0 but task is already holding lock: ffff9e3bc0057e60 ((work_completion)(&wfc.work)){+.+.}-{0:0}, at: process_scheduled_works+0x216/0x500 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #2 ((work_completion)(&wfc.work)){+.+.}-{0:0}: __flush_work+0x83/0x4e0 work_on_cpu+0x97/0xc0 rcu_nocb_cpu_offload+0x62/0xb0 rcu_nocb_toggle+0xd0/0x1d0 kthread+0xe6/0x120 ret_from_fork+0x2f/0x40 ret_from_fork_asm+0x1b/0x30 -> #1 (rcu_state.barrier_mutex){+.+.}-{3:3}: __mutex_lock+0x81/0xc80 rcu_nocb_cpu_deoffload+0x38/0xb0 rcu_nocb_toggle+0x144/0x1d0 kthread+0xe6/0x120 ret_from_fork+0x2f/0x40 ret_from_fork_asm+0x1b/0x30 -> #0 (cpu_hotplug_lock){++++}-{0:0}: __lock_acquire+0x1538/0x2500 lock_acquire+0xbf/0x2a0 percpu_down_write+0x31/0x200 _cpu_down+0x57/0x2b0 __cpu_down_maps_locked+0x10/0x20 work_for_cpu_fn+0x15/0x20 process_scheduled_works+0x2a7/0x500 worker_thread+0x173/0x330 kthread+0xe6/0x120 ret_from_fork+0x2f/0x40 ret_from_fork_asm+0x1b/0x30 other info that might help us debug this: Chain exists of: cpu_hotplug_lock --> rcu_state.barrier_mutex --> (work_completion)(&wfc.work) Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock((work_completion)(&wfc.work)); lock(rcu_state.barrier_mutex); lock((work_completion)(&wfc.work)); lock(cpu_hotplug_lock); *** DEADLOCK *** 2 locks held by kworker/0:1/9: #0: ffff900481068b38 ((wq_completion)events){+.+.}-{0:0}, at: process_scheduled_works+0x212/0x500 #1: ffff9e3bc0057e60 ((work_completion)(&wfc.work)){+.+.}-{0:0}, at: process_scheduled_works+0x216/0x500 stack backtrace: CPU: 0 PID: 9 Comm: kworker/0:1 Not tainted 6.6.0-rc1-00065-g934ebd6e5359 #35409 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014 Workqueue: events work_for_cpu_fn Call Trace: rcu-torture: rcu_torture_read_exit: Start of episode dump_stack_lvl+0x4a/0x80 check_noncircular+0x132/0x150 __lock_acquire+0x1538/0x2500 lock_acquire+0xbf/0x2a0 ? _cpu_down+0x57/0x2b0 percpu_down_write+0x31/0x200 ? _cpu_down+0x57/0x2b0 _cpu_down+0x57/0x2b0 __cpu_down_maps_locked+0x10/0x20 work_for_cpu_fn+0x15/0x20 process_scheduled_works+0x2a7/0x500 worker_thread+0x173/0x330 ? __pfx_worker_thread+0x10/0x10 kthread+0xe6/0x120 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x2f/0x40 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1b/0x30 Signed-off-by: Frederic Weisbecker Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 46 +++++++++++++++++++++++++++++++++++++++------- 1 file changed, 39 insertions(+), 7 deletions(-) (limited to 'include/linux/workqueue.h') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 1c1d06804d45..24b1e5070f4d 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -274,18 +274,16 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; } * to generate better code. */ #ifdef CONFIG_LOCKDEP -#define __INIT_WORK(_work, _func, _onstack) \ +#define __INIT_WORK_KEY(_work, _func, _onstack, _key) \ do { \ - static struct lock_class_key __key; \ - \ __init_work((_work), _onstack); \ (_work)->data = (atomic_long_t) WORK_DATA_INIT(); \ - lockdep_init_map(&(_work)->lockdep_map, "(work_completion)"#_work, &__key, 0); \ + lockdep_init_map(&(_work)->lockdep_map, "(work_completion)"#_work, (_key), 0); \ INIT_LIST_HEAD(&(_work)->entry); \ (_work)->func = (_func); \ } while (0) #else -#define __INIT_WORK(_work, _func, _onstack) \ +#define __INIT_WORK_KEY(_work, _func, _onstack, _key) \ do { \ __init_work((_work), _onstack); \ (_work)->data = (atomic_long_t) WORK_DATA_INIT(); \ @@ -294,12 +292,22 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; } } while (0) #endif +#define __INIT_WORK(_work, _func, _onstack) \ + do { \ + static __maybe_unused struct lock_class_key __key; \ + \ + __INIT_WORK_KEY(_work, _func, _onstack, &__key); \ + } while (0) + #define INIT_WORK(_work, _func) \ __INIT_WORK((_work), (_func), 0) #define INIT_WORK_ONSTACK(_work, _func) \ __INIT_WORK((_work), (_func), 1) +#define INIT_WORK_ONSTACK_KEY(_work, _func, _key) \ + __INIT_WORK_KEY((_work), (_func), 1, _key) + #define __INIT_DELAYED_WORK(_work, _func, _tflags) \ do { \ INIT_WORK(&(_work)->work, (_func)); \ @@ -693,8 +701,32 @@ static inline long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg) return fn(arg); } #else -long work_on_cpu(int cpu, long (*fn)(void *), void *arg); -long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg); +long work_on_cpu_key(int cpu, long (*fn)(void *), + void *arg, struct lock_class_key *key); +/* + * A new key is defined for each caller to make sure the work + * associated with the function doesn't share its locking class. + */ +#define work_on_cpu(_cpu, _fn, _arg) \ +({ \ + static struct lock_class_key __key; \ + \ + work_on_cpu_key(_cpu, _fn, _arg, &__key); \ +}) + +long work_on_cpu_safe_key(int cpu, long (*fn)(void *), + void *arg, struct lock_class_key *key); + +/* + * A new key is defined for each caller to make sure the work + * associated with the function doesn't share its locking class. + */ +#define work_on_cpu_safe(_cpu, _fn, _arg) \ +({ \ + static struct lock_class_key __key; \ + \ + work_on_cpu_safe_key(_cpu, _fn, _arg, &__key); \ +}) #endif /* CONFIG_SMP */ #ifdef CONFIG_FREEZER -- cgit v1.2.3 From fe28f631fa941fba583d1c4f25895284b90af671 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 25 Oct 2023 14:25:52 -0400 Subject: workqueue: Add workqueue_unbound_exclude_cpumask() to exclude CPUs from wq_unbound_cpumask When the "isolcpus" boot command line option is used to add a set of isolated CPUs, those CPUs will be excluded automatically from wq_unbound_cpumask to avoid running work functions from unbound workqueues. Recently cpuset has been extended to allow the creation of partitions of isolated CPUs dynamically. To make it closer to the "isolcpus" in functionality, the CPUs in those isolated cpuset partitions should be excluded from wq_unbound_cpumask as well. This can be done currently by explicitly writing to the workqueue's cpumask sysfs file after creating the isolated partitions. However, this process can be error prone. Ideally, the cpuset code should be allowed to request the workqueue code to exclude those isolated CPUs from wq_unbound_cpumask so that this operation can be done automatically and the isolated CPUs will be returned back to wq_unbound_cpumask after the destructions of the isolated cpuset partitions. This patch adds a new workqueue_unbound_exclude_cpumask() function to enable that. This new function will exclude the specified isolated CPUs from wq_unbound_cpumask. To be able to restore those isolated CPUs back after the destruction of isolated cpuset partitions, a new wq_requested_unbound_cpumask is added to store the user provided unbound cpumask either from the boot command line options or from writing to the cpumask sysfs file. This new cpumask provides the basis for CPU exclusion. To enable users to understand how the wq_unbound_cpumask is being modified internally, this patch also exposes the newly introduced wq_requested_unbound_cpumask as well as a wq_isolated_cpumask to store the cpumask to be excluded from wq_unbound_cpumask as read-only sysfs files. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 2 +- kernel/workqueue.c | 91 ++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 84 insertions(+), 9 deletions(-) (limited to 'include/linux/workqueue.h') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 24b1e5070f4d..b0b9604b76b8 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -491,7 +491,7 @@ struct workqueue_attrs *alloc_workqueue_attrs(void); void free_workqueue_attrs(struct workqueue_attrs *attrs); int apply_workqueue_attrs(struct workqueue_struct *wq, const struct workqueue_attrs *attrs); -int workqueue_set_unbound_cpumask(cpumask_var_t cpumask); +extern int workqueue_unbound_exclude_cpumask(cpumask_var_t cpumask); extern bool queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6e578f576a6f..bd9d34eacd78 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -381,6 +381,12 @@ static bool workqueue_freezing; /* PL: have wqs started freezing? */ /* PL&A: allowable cpus for unbound wqs and work items */ static cpumask_var_t wq_unbound_cpumask; +/* PL: user requested unbound cpumask via sysfs */ +static cpumask_var_t wq_requested_unbound_cpumask; + +/* PL: isolated cpumask to be excluded from unbound cpumask */ +static cpumask_var_t wq_isolated_cpumask; + /* for further constrain wq_unbound_cpumask by cmdline parameter*/ static struct cpumask wq_cmdline_cpumask __initdata; @@ -5839,7 +5845,7 @@ static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask) * -EINVAL - Invalid @cpumask * -ENOMEM - Failed to allocate memory for attrs or pwqs. */ -int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) +static int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) { int ret = -EINVAL; @@ -5850,6 +5856,7 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) cpumask_and(cpumask, cpumask, cpu_possible_mask); if (!cpumask_empty(cpumask)) { apply_wqattrs_lock(); + cpumask_copy(wq_requested_unbound_cpumask, cpumask); if (cpumask_equal(cpumask, wq_unbound_cpumask)) { ret = 0; goto out_unlock; @@ -5864,6 +5871,44 @@ out_unlock: return ret; } +/** + * workqueue_unbound_exclude_cpumask - Exclude given CPUs from unbound cpumask + * @exclude_cpumask: the cpumask to be excluded from wq_unbound_cpumask + * + * This function can be called from cpuset code to provide a set of isolated + * CPUs that should be excluded from wq_unbound_cpumask. The caller must hold + * either cpus_read_lock or cpus_write_lock. + */ +int workqueue_unbound_exclude_cpumask(cpumask_var_t exclude_cpumask) +{ + cpumask_var_t cpumask; + int ret = 0; + + if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL)) + return -ENOMEM; + + lockdep_assert_cpus_held(); + mutex_lock(&wq_pool_mutex); + + /* Save the current isolated cpumask & export it via sysfs */ + cpumask_copy(wq_isolated_cpumask, exclude_cpumask); + + /* + * If the operation fails, it will fall back to + * wq_requested_unbound_cpumask which is initially set to + * (HK_TYPE_WQ ∩ HK_TYPE_DOMAIN) house keeping mask and rewritten + * by any subsequent write to workqueue/cpumask sysfs file. + */ + if (!cpumask_andnot(cpumask, wq_requested_unbound_cpumask, exclude_cpumask)) + cpumask_copy(cpumask, wq_requested_unbound_cpumask); + if (!cpumask_equal(cpumask, wq_unbound_cpumask)) + ret = workqueue_apply_unbound_cpumask(cpumask); + + mutex_unlock(&wq_pool_mutex); + free_cpumask_var(cpumask); + return ret; +} + static int parse_affn_scope(const char *val) { int i; @@ -6158,19 +6203,36 @@ static struct bus_type wq_subsys = { .dev_groups = wq_sysfs_groups, }; -static ssize_t wq_unbound_cpumask_show(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t __wq_cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf, cpumask_var_t mask) { int written; mutex_lock(&wq_pool_mutex); - written = scnprintf(buf, PAGE_SIZE, "%*pb\n", - cpumask_pr_args(wq_unbound_cpumask)); + written = scnprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(mask)); mutex_unlock(&wq_pool_mutex); return written; } +static ssize_t wq_unbound_cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return __wq_cpumask_show(dev, attr, buf, wq_unbound_cpumask); +} + +static ssize_t wq_requested_cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return __wq_cpumask_show(dev, attr, buf, wq_requested_unbound_cpumask); +} + +static ssize_t wq_isolated_cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return __wq_cpumask_show(dev, attr, buf, wq_isolated_cpumask); +} + static ssize_t wq_unbound_cpumask_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { @@ -6188,9 +6250,13 @@ static ssize_t wq_unbound_cpumask_store(struct device *dev, return ret ? ret : count; } -static struct device_attribute wq_sysfs_cpumask_attr = +static struct device_attribute wq_sysfs_cpumask_attrs[] = { __ATTR(cpumask, 0644, wq_unbound_cpumask_show, - wq_unbound_cpumask_store); + wq_unbound_cpumask_store), + __ATTR(cpumask_requested, 0444, wq_requested_cpumask_show, NULL), + __ATTR(cpumask_isolated, 0444, wq_isolated_cpumask_show, NULL), + __ATTR_NULL, +}; static int __init wq_sysfs_init(void) { @@ -6203,7 +6269,13 @@ static int __init wq_sysfs_init(void) dev_root = bus_get_dev_root(&wq_subsys); if (dev_root) { - err = device_create_file(dev_root, &wq_sysfs_cpumask_attr); + struct device_attribute *attr; + + for (attr = wq_sysfs_cpumask_attrs; attr->attr.name; attr++) { + err = device_create_file(dev_root, attr); + if (err) + break; + } put_device(dev_root); } return err; @@ -6534,11 +6606,14 @@ void __init workqueue_init_early(void) BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); + BUG_ON(!alloc_cpumask_var(&wq_requested_unbound_cpumask, GFP_KERNEL)); + BUG_ON(!zalloc_cpumask_var(&wq_isolated_cpumask, GFP_KERNEL)); cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_WQ)); cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_DOMAIN)); if (!cpumask_empty(&wq_cmdline_cpumask)) cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, &wq_cmdline_cpumask); + cpumask_copy(wq_requested_unbound_cpumask, wq_unbound_cpumask); pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); -- cgit v1.2.3 From b2fa8443db320c4873feca2588b957439e350890 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Dec 2023 13:55:01 -0500 Subject: workqueue: Split out workqueue_types.h More sched.h dependency culling - this lets us kill a rhashtable-types.h dependency on workqueue.h. Signed-off-by: Kent Overstreet --- include/linux/dma-fence.h | 1 + include/linux/rhashtable-types.h | 2 +- include/linux/timekeeping.h | 1 + include/linux/workqueue.h | 16 +--------------- include/linux/workqueue_types.h | 25 +++++++++++++++++++++++++ 5 files changed, 29 insertions(+), 16 deletions(-) create mode 100644 include/linux/workqueue_types.h (limited to 'include/linux/workqueue.h') diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h index b3772edca2e6..e06bad467f55 100644 --- a/include/linux/dma-fence.h +++ b/include/linux/dma-fence.h @@ -21,6 +21,7 @@ #include #include #include +#include struct dma_fence; struct dma_fence_ops; diff --git a/include/linux/rhashtable-types.h b/include/linux/rhashtable-types.h index 57467cbf4c5b..b6f3797277ff 100644 --- a/include/linux/rhashtable-types.h +++ b/include/linux/rhashtable-types.h @@ -12,7 +12,7 @@ #include #include #include -#include +#include struct rhash_head { struct rhash_head __rcu *next; diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index fe1e467ba046..7c43e98cf211 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -4,6 +4,7 @@ #include #include +#include /* Included from linux/ktime.h */ diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 24b1e5070f4d..f1bb2e35301f 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -14,12 +14,7 @@ #include #include #include - -struct workqueue_struct; - -struct work_struct; -typedef void (*work_func_t)(struct work_struct *work); -void delayed_work_timer_fn(struct timer_list *t); +#include /* * The first word is the work queue pointer and the flags rolled into @@ -95,15 +90,6 @@ enum { #define WORK_STRUCT_FLAG_MASK ((1ul << WORK_STRUCT_FLAG_BITS) - 1) #define WORK_STRUCT_WQ_DATA_MASK (~WORK_STRUCT_FLAG_MASK) -struct work_struct { - atomic_long_t data; - struct list_head entry; - work_func_t func; -#ifdef CONFIG_LOCKDEP - struct lockdep_map lockdep_map; -#endif -}; - #define WORK_DATA_INIT() ATOMIC_LONG_INIT((unsigned long)WORK_STRUCT_NO_POOL) #define WORK_DATA_STATIC_INIT() \ ATOMIC_LONG_INIT((unsigned long)(WORK_STRUCT_NO_POOL | WORK_STRUCT_STATIC)) diff --git a/include/linux/workqueue_types.h b/include/linux/workqueue_types.h new file mode 100644 index 000000000000..4c38824f3ab4 --- /dev/null +++ b/include/linux/workqueue_types.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_WORKQUEUE_TYPES_H +#define _LINUX_WORKQUEUE_TYPES_H + +#include +#include +#include +#include + +struct workqueue_struct; + +struct work_struct; +typedef void (*work_func_t)(struct work_struct *work); +void delayed_work_timer_fn(struct timer_list *t); + +struct work_struct { + atomic_long_t data; + struct list_head entry; + work_func_t func; +#ifdef CONFIG_LOCKDEP + struct lockdep_map lockdep_map; +#endif +}; + +#endif /* _LINUX_WORKQUEUE_TYPES_H */ -- cgit v1.2.3