diff options
Diffstat (limited to 'kernel')
86 files changed, 6738 insertions, 3672 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index 3de8fd11873b..4198f0273ecd 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -251,7 +251,7 @@ config ARCH_USE_QUEUED_RWLOCKS config QUEUED_RWLOCKS def_bool y if ARCH_USE_QUEUED_RWLOCKS - depends on SMP + depends on SMP && !PREEMPT_RT config ARCH_HAS_MMIOWB bool diff --git a/kernel/audit.h b/kernel/audit.h index b565ea16c0a5..d6a2c899a8db 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -6,6 +6,9 @@ * Copyright 2005 IBM Corporation */ +#ifndef _KERNEL_AUDIT_H_ +#define _KERNEL_AUDIT_H_ + #include <linux/fs.h> #include <linux/audit.h> #include <linux/skbuff.h> @@ -331,3 +334,5 @@ extern int audit_filter(int msgtype, unsigned int listtype); extern void audit_ctl_lock(void); extern void audit_ctl_unlock(void); + +#endif diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index b2be4e978ba3..2cd7b5694422 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -593,7 +593,6 @@ static void prune_tree_chunks(struct audit_tree *victim, bool tagged) spin_lock(&hash_lock); } spin_unlock(&hash_lock); - put_tree(victim); } /* @@ -602,6 +601,7 @@ static void prune_tree_chunks(struct audit_tree *victim, bool tagged) static void prune_one(struct audit_tree *victim) { prune_tree_chunks(victim, false); + put_tree(victim); } /* trim the uncommitted chunks from tree */ diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index de2c432dee20..35b920328344 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -50,6 +50,8 @@ bool cgroup1_ssid_disabled(int ssid) * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' * @from: attach to all cgroups of a given task * @tsk: the task to be attached + * + * Return: %0 on success or a negative errno code on failure */ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) { @@ -80,7 +82,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) 
EXPORT_SYMBOL_GPL(cgroup_attach_task_all); /** - * cgroup_trasnsfer_tasks - move tasks from one cgroup to another + * cgroup_transfer_tasks - move tasks from one cgroup to another * @to: cgroup to which the tasks will be moved * @from: cgroup in which the tasks currently reside * @@ -89,6 +91,8 @@ EXPORT_SYMBOL_GPL(cgroup_attach_task_all); * is guaranteed to be either visible in the source cgroup after the * parent's migration is complete or put into the target cgroup. No task * can slip out of migration through forking. + * + * Return: %0 on success or a negative errno code on failure */ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) { @@ -682,6 +686,8 @@ int proc_cgroupstats_show(struct seq_file *m, void *v) * * Build and fill cgroupstats so that taskstats can export it to user * space. + * + * Return: %0 on success or a negative errno code on failure */ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) { diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 3a0161c21b6b..881ce1470beb 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -68,6 +68,14 @@ #define CGROUP_FILE_NOTIFY_MIN_INTV DIV_ROUND_UP(HZ, 100) /* + * To avoid confusing the compiler (and generating warnings) with code + * that attempts to access what would be a 0-element array (i.e. sized + * to a potentially empty array when CGROUP_SUBSYS_COUNT == 0), this + * constant expression can be added. + */ +#define CGROUP_HAS_SUBSYS_CONFIG (CGROUP_SUBSYS_COUNT > 0) + +/* * cgroup_mutex is the master lock. Any modification to cgroup or its * hierarchy must be performed while holding it. 
* @@ -248,7 +256,7 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css, */ bool cgroup_ssid_enabled(int ssid) { - if (CGROUP_SUBSYS_COUNT == 0) + if (!CGROUP_HAS_SUBSYS_CONFIG) return false; return static_key_enabled(cgroup_subsys_enabled_key[ssid]); @@ -472,7 +480,7 @@ static u16 cgroup_ss_mask(struct cgroup *cgrp) static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, struct cgroup_subsys *ss) { - if (ss) + if (CGROUP_HAS_SUBSYS_CONFIG && ss) return rcu_dereference_check(cgrp->subsys[ss->id], lockdep_is_held(&cgroup_mutex)); else @@ -550,6 +558,9 @@ struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, { struct cgroup_subsys_state *css; + if (!CGROUP_HAS_SUBSYS_CONFIG) + return NULL; + do { css = cgroup_css(cgrp, ss); @@ -577,6 +588,9 @@ struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp, { struct cgroup_subsys_state *css; + if (!CGROUP_HAS_SUBSYS_CONFIG) + return NULL; + rcu_read_lock(); do { @@ -647,7 +661,7 @@ struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) * the matching css from the cgroup's subsys table is guaranteed to * be and stay valid until the enclosing operation is complete. */ - if (cft->ss) + if (CGROUP_HAS_SUBSYS_CONFIG && cft->ss) return rcu_dereference_raw(cgrp->subsys[cft->ss->id]); else return &cgrp->self; @@ -695,7 +709,7 @@ EXPORT_SYMBOL_GPL(of_css); */ #define do_each_subsys_mask(ss, ssid, ss_mask) do { \ unsigned long __ss_mask = (ss_mask); \ - if (!CGROUP_SUBSYS_COUNT) { /* to avoid spurious gcc warning */ \ + if (!CGROUP_HAS_SUBSYS_CONFIG) { \ (ssid) = 0; \ break; \ } \ @@ -2169,7 +2183,6 @@ static void cgroup_kill_sb(struct super_block *sb) /* * If @root doesn't have any children, start killing it. * This prevents new mounts by disabling percpu_ref_tryget_live(). - * cgroup_mount() may wait for @root's release. * * And don't kill the default root. 
*/ @@ -2373,7 +2386,7 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, struct css_set *cset = tset->cur_cset; struct task_struct *task = tset->cur_task; - while (&cset->mg_node != tset->csets) { + while (CGROUP_HAS_SUBSYS_CONFIG && &cset->mg_node != tset->csets) { if (!task) task = list_first_entry(&cset->mg_tasks, struct task_struct, cg_list); @@ -4644,7 +4657,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, it->ss = css->ss; it->flags = flags; - if (it->ss) + if (CGROUP_HAS_SUBSYS_CONFIG && it->ss) it->cset_pos = &css->cgroup->e_csets[css->ss->id]; else it->cset_pos = &css->cgroup->cset_links; diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index adb5190c4429..df1ccf4558f8 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -160,6 +160,9 @@ struct cpuset { */ int use_parent_ecpus; int child_ecpus_count; + + /* Handle for cpuset.cpus.partition */ + struct cgroup_file partition_file; }; /* @@ -263,6 +266,16 @@ static inline int is_partition_root(const struct cpuset *cs) return cs->partition_root_state > 0; } +/* + * Send notification event of whenever partition_root_state changes. + */ +static inline void notify_partition_change(struct cpuset *cs, + int old_prs, int new_prs) +{ + if (old_prs != new_prs) + cgroup_file_notify(&cs->partition_file); +} + static struct cpuset top_cpuset = { .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), @@ -372,18 +385,29 @@ static inline bool is_in_v2_mode(void) } /* - * Return in pmask the portion of a cpusets's cpus_allowed that - * are online. If none are online, walk up the cpuset hierarchy - * until we find one that does have some online cpus. + * Return in pmask the portion of a task's cpusets's cpus_allowed that + * are online and are capable of running the task. If none are found, + * walk up the cpuset hierarchy until we find one that does have some + * appropriate cpus. 
* * One way or another, we guarantee to return some non-empty subset * of cpu_online_mask. * * Call with callback_lock or cpuset_mutex held. */ -static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) +static void guarantee_online_cpus(struct task_struct *tsk, + struct cpumask *pmask) { - while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) { + const struct cpumask *possible_mask = task_cpu_possible_mask(tsk); + struct cpuset *cs; + + if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_online_mask))) + cpumask_copy(pmask, cpu_online_mask); + + rcu_read_lock(); + cs = task_cs(tsk); + + while (!cpumask_intersects(cs->effective_cpus, pmask)) { cs = parent_cs(cs); if (unlikely(!cs)) { /* @@ -393,11 +417,13 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) * cpuset's effective_cpus is on its way to be * identical to cpu_online_mask. */ - cpumask_copy(pmask, cpu_online_mask); - return; + goto out_unlock; } } - cpumask_and(pmask, cs->effective_cpus, cpu_online_mask); + cpumask_and(pmask, pmask, cs->effective_cpus); + +out_unlock: + rcu_read_unlock(); } /* @@ -979,7 +1005,7 @@ partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[], * 'cpus' is removed, then call this routine to rebuild the * scheduler's dynamic sched domains. * - * Call with cpuset_mutex held. Takes get_online_cpus(). + * Call with cpuset_mutex held. Takes cpus_read_lock(). */ static void rebuild_sched_domains_locked(void) { @@ -1040,11 +1066,11 @@ static void rebuild_sched_domains_locked(void) void rebuild_sched_domains(void) { - get_online_cpus(); + cpus_read_lock(); percpu_down_write(&cpuset_rwsem); rebuild_sched_domains_locked(); percpu_up_write(&cpuset_rwsem); - put_online_cpus(); + cpus_read_unlock(); } /** @@ -1114,7 +1140,7 @@ enum subparts_cmd { * cpus_allowed can be granted or an error code will be returned. 
* * For partcmd_disable, the cpuset is being transofrmed from a partition - * root back to a non-partition root. any CPUs in cpus_allowed that are in + * root back to a non-partition root. Any CPUs in cpus_allowed that are in * parent's subparts_cpus will be taken away from that cpumask and put back * into parent's effective_cpus. 0 should always be returned. * @@ -1148,6 +1174,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, struct cpuset *parent = parent_cs(cpuset); int adding; /* Moving cpus from effective_cpus to subparts_cpus */ int deleting; /* Moving cpus from subparts_cpus to effective_cpus */ + int old_prs, new_prs; bool part_error = false; /* Partition error? */ percpu_rwsem_assert_held(&cpuset_rwsem); @@ -1183,6 +1210,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, * A cpumask update cannot make parent's effective_cpus become empty. */ adding = deleting = false; + old_prs = new_prs = cpuset->partition_root_state; if (cmd == partcmd_enable) { cpumask_copy(tmp->addmask, cpuset->cpus_allowed); adding = true; @@ -1225,7 +1253,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, /* * partcmd_update w/o newmask: * - * addmask = cpus_allowed & parent->effectiveb_cpus + * addmask = cpus_allowed & parent->effective_cpus * * Note that parent's subparts_cpus may have been * pre-shrunk in case there is a change in the cpu list. 
@@ -1247,11 +1275,11 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, switch (cpuset->partition_root_state) { case PRS_ENABLED: if (part_error) - cpuset->partition_root_state = PRS_ERROR; + new_prs = PRS_ERROR; break; case PRS_ERROR: if (!part_error) - cpuset->partition_root_state = PRS_ENABLED; + new_prs = PRS_ENABLED; break; } /* @@ -1260,10 +1288,10 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, part_error = (prev_prs == PRS_ERROR); } - if (!part_error && (cpuset->partition_root_state == PRS_ERROR)) + if (!part_error && (new_prs == PRS_ERROR)) return 0; /* Nothing need to be done */ - if (cpuset->partition_root_state == PRS_ERROR) { + if (new_prs == PRS_ERROR) { /* * Remove all its cpus from parent's subparts_cpus. */ @@ -1272,7 +1300,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, parent->subparts_cpus); } - if (!adding && !deleting) + if (!adding && !deleting && (new_prs == old_prs)) return 0; /* @@ -1299,7 +1327,12 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, } parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus); + + if (old_prs != new_prs) + cpuset->partition_root_state = new_prs; + spin_unlock_irq(&callback_lock); + notify_partition_change(cpuset, old_prs, new_prs); return cmd == partcmd_update; } @@ -1321,6 +1354,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) struct cpuset *cp; struct cgroup_subsys_state *pos_css; bool need_rebuild_sched_domains = false; + int old_prs, new_prs; rcu_read_lock(); cpuset_for_each_descendant_pre(cp, pos_css, cs) { @@ -1360,17 +1394,18 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) * update_tasks_cpumask() again for tasks in the parent * cpuset if the parent's subparts_cpus changes. 
*/ - if ((cp != cs) && cp->partition_root_state) { + old_prs = new_prs = cp->partition_root_state; + if ((cp != cs) && old_prs) { switch (parent->partition_root_state) { case PRS_DISABLED: /* * If parent is not a partition root or an - * invalid partition root, clear the state - * state and the CS_CPU_EXCLUSIVE flag. + * invalid partition root, clear its state + * and its CS_CPU_EXCLUSIVE flag. */ WARN_ON_ONCE(cp->partition_root_state != PRS_ERROR); - cp->partition_root_state = 0; + new_prs = PRS_DISABLED; /* * clear_bit() is an atomic operation and @@ -1391,11 +1426,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) /* * When parent is invalid, it has to be too. */ - cp->partition_root_state = PRS_ERROR; - if (cp->nr_subparts_cpus) { - cp->nr_subparts_cpus = 0; - cpumask_clear(cp->subparts_cpus); - } + new_prs = PRS_ERROR; break; } } @@ -1407,8 +1438,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) spin_lock_irq(&callback_lock); cpumask_copy(cp->effective_cpus, tmp->new_cpus); - if (cp->nr_subparts_cpus && - (cp->partition_root_state != PRS_ENABLED)) { + if (cp->nr_subparts_cpus && (new_prs != PRS_ENABLED)) { cp->nr_subparts_cpus = 0; cpumask_clear(cp->subparts_cpus); } else if (cp->nr_subparts_cpus) { @@ -1435,7 +1465,12 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) = cpumask_weight(cp->subparts_cpus); } } + + if (new_prs != old_prs) + cp->partition_root_state = new_prs; + spin_unlock_irq(&callback_lock); + notify_partition_change(cp, old_prs, new_prs); WARN_ON(!is_in_v2_mode() && !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); @@ -1612,6 +1647,11 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, { struct cpuset_migrate_mm_work *mwork; + if (nodes_equal(*from, *to)) { + mmput(mm); + return; + } + mwork = kzalloc(sizeof(*mwork), GFP_KERNEL); if (mwork) { mwork->mm = mm; @@ -1937,34 +1977,32 @@ out: /* * update_prstate - update 
partititon_root_state - * cs: the cpuset to update - * val: 0 - disabled, 1 - enabled + * cs: the cpuset to update + * new_prs: new partition root state * * Call with cpuset_mutex held. */ -static int update_prstate(struct cpuset *cs, int val) +static int update_prstate(struct cpuset *cs, int new_prs) { - int err; + int err, old_prs = cs->partition_root_state; struct cpuset *parent = parent_cs(cs); - struct tmpmasks tmp; + struct tmpmasks tmpmask; - if ((val != 0) && (val != 1)) - return -EINVAL; - if (val == cs->partition_root_state) + if (old_prs == new_prs) return 0; /* * Cannot force a partial or invalid partition root to a full * partition root. */ - if (val && cs->partition_root_state) + if (new_prs && (old_prs == PRS_ERROR)) return -EINVAL; - if (alloc_cpumasks(NULL, &tmp)) + if (alloc_cpumasks(NULL, &tmpmask)) return -ENOMEM; err = -EINVAL; - if (!cs->partition_root_state) { + if (!old_prs) { /* * Turning on partition root requires setting the * CS_CPU_EXCLUSIVE bit implicitly as well and cpus_allowed @@ -1978,31 +2016,27 @@ static int update_prstate(struct cpuset *cs, int val) goto out; err = update_parent_subparts_cpumask(cs, partcmd_enable, - NULL, &tmp); + NULL, &tmpmask); if (err) { update_flag(CS_CPU_EXCLUSIVE, cs, 0); goto out; } - cs->partition_root_state = PRS_ENABLED; } else { /* * Turning off partition root will clear the * CS_CPU_EXCLUSIVE bit. 
*/ - if (cs->partition_root_state == PRS_ERROR) { - cs->partition_root_state = 0; + if (old_prs == PRS_ERROR) { update_flag(CS_CPU_EXCLUSIVE, cs, 0); err = 0; goto out; } err = update_parent_subparts_cpumask(cs, partcmd_disable, - NULL, &tmp); + NULL, &tmpmask); if (err) goto out; - cs->partition_root_state = 0; - /* Turning off CS_CPU_EXCLUSIVE will not return error */ update_flag(CS_CPU_EXCLUSIVE, cs, 0); } @@ -2015,11 +2049,18 @@ static int update_prstate(struct cpuset *cs, int val) update_tasks_cpumask(parent); if (parent->child_ecpus_count) - update_sibling_cpumasks(parent, cs, &tmp); + update_sibling_cpumasks(parent, cs, &tmpmask); rebuild_sched_domains_locked(); out: - free_cpumasks(NULL, &tmp); + if (!err) { + spin_lock_irq(&callback_lock); + cs->partition_root_state = new_prs; + spin_unlock_irq(&callback_lock); + notify_partition_change(cs, old_prs, new_prs); + } + + free_cpumasks(NULL, &tmpmask); return err; } @@ -2199,15 +2240,13 @@ static void cpuset_attach(struct cgroup_taskset *tset) percpu_down_write(&cpuset_rwsem); - /* prepare for attach */ - if (cs == &top_cpuset) - cpumask_copy(cpus_attach, cpu_possible_mask); - else - guarantee_online_cpus(cs, cpus_attach); - guarantee_online_mems(cs, &cpuset_attach_nodemask_to); cgroup_taskset_for_each(task, css, tset) { + if (cs != &top_cpuset) + guarantee_online_cpus(task, cpus_attach); + else + cpumask_copy(cpus_attach, task_cpu_possible_mask(task)); /* * can_attach beforehand should guarantee that this doesn't * fail. 
TODO: have a better way to handle failure here @@ -2282,7 +2321,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_filetype_t type = cft->private; int retval = 0; - get_online_cpus(); + cpus_read_lock(); percpu_down_write(&cpuset_rwsem); if (!is_cpuset_online(cs)) { retval = -ENODEV; @@ -2320,7 +2359,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, } out_unlock: percpu_up_write(&cpuset_rwsem); - put_online_cpus(); + cpus_read_unlock(); return retval; } @@ -2331,7 +2370,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_filetype_t type = cft->private; int retval = -ENODEV; - get_online_cpus(); + cpus_read_lock(); percpu_down_write(&cpuset_rwsem); if (!is_cpuset_online(cs)) goto out_unlock; @@ -2346,7 +2385,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, } out_unlock: percpu_up_write(&cpuset_rwsem); - put_online_cpus(); + cpus_read_unlock(); return retval; } @@ -2385,7 +2424,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, kernfs_break_active_protection(of->kn); flush_work(&cpuset_hotplug_work); - get_online_cpus(); + cpus_read_lock(); percpu_down_write(&cpuset_rwsem); if (!is_cpuset_online(cs)) goto out_unlock; @@ -2411,7 +2450,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, free_cpuset(trialcs); out_unlock: percpu_up_write(&cpuset_rwsem); - put_online_cpus(); + cpus_read_unlock(); kernfs_unbreak_active_protection(of->kn); css_put(&cs->css); flush_workqueue(cpuset_migrate_mm_wq); @@ -2542,7 +2581,7 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf, return -EINVAL; css_get(&cs->css); - get_online_cpus(); + cpus_read_lock(); percpu_down_write(&cpuset_rwsem); if (!is_cpuset_online(cs)) goto out_unlock; @@ -2550,7 +2589,7 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf, retval = update_prstate(cs, val); 
out_unlock: percpu_up_write(&cpuset_rwsem); - put_online_cpus(); + cpus_read_unlock(); css_put(&cs->css); return retval ?: nbytes; } @@ -2702,6 +2741,7 @@ static struct cftype dfl_files[] = { .write = sched_partition_write, .private = FILE_PARTITION_ROOT, .flags = CFTYPE_NOT_ON_ROOT, + .file_offset = offsetof(struct cpuset, partition_file), }, { @@ -2737,12 +2777,16 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css) return ERR_PTR(-ENOMEM); } - set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); + __set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); nodes_clear(cs->mems_allowed); nodes_clear(cs->effective_mems); fmeter_init(&cs->fmeter); cs->relax_domain_level = -1; + /* Set CS_MEMORY_MIGRATE for default hierarchy */ + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) + __set_bit(CS_MEMORY_MIGRATE, &cs->flags); + return &cs->css; } @@ -2756,7 +2800,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) if (!parent) return 0; - get_online_cpus(); + cpus_read_lock(); percpu_down_write(&cpuset_rwsem); set_bit(CS_ONLINE, &cs->flags); @@ -2809,7 +2853,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) spin_unlock_irq(&callback_lock); out_unlock: percpu_up_write(&cpuset_rwsem); - put_online_cpus(); + cpus_read_unlock(); return 0; } @@ -2828,7 +2872,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); - get_online_cpus(); + cpus_read_lock(); percpu_down_write(&cpuset_rwsem); if (is_partition_root(cs)) @@ -2849,7 +2893,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) clear_bit(CS_ONLINE, &cs->flags); percpu_up_write(&cpuset_rwsem); - put_online_cpus(); + cpus_read_unlock(); } static void cpuset_css_free(struct cgroup_subsys_state *css) @@ -3060,7 +3104,7 @@ retry: goto retry; } - parent = parent_cs(cs); + parent = parent_cs(cs); compute_effective_cpumask(&new_cpus, cs, parent); nodes_and(new_mems, cs->mems_allowed, parent->effective_mems); @@ -3082,8 +3126,10 @@ retry: if 
(is_partition_root(cs) && (cpumask_empty(&new_cpus) || (parent->partition_root_state == PRS_ERROR))) { if (cs->nr_subparts_cpus) { + spin_lock_irq(&callback_lock); cs->nr_subparts_cpus = 0; cpumask_clear(cs->subparts_cpus); + spin_unlock_irq(&callback_lock); compute_effective_cpumask(&new_cpus, cs, parent); } @@ -3095,9 +3141,17 @@ retry: */ if ((parent->partition_root_state == PRS_ERROR) || cpumask_empty(&new_cpus)) { + int old_prs; + update_parent_subparts_cpumask(cs, partcmd_disable, NULL, tmp); - cs->partition_root_state = PRS_ERROR; + old_prs = cs->partition_root_state; + if (old_prs != PRS_ERROR) { + spin_lock_irq(&callback_lock); + cs->partition_root_state = PRS_ERROR; + spin_unlock_irq(&callback_lock); + notify_partition_change(cs, old_prs, PRS_ERROR); + } } cpuset_force_rebuild(); } @@ -3168,6 +3222,13 @@ static void cpuset_hotplug_workfn(struct work_struct *work) cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus); mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems); + /* + * In the rare case that hotplug removes all the cpus in subparts_cpus, + * we assumed that cpus are updated. + */ + if (!cpus_updated && top_cpuset.nr_subparts_cpus) + cpus_updated = true; + /* synchronize cpus_allowed to cpu_active_mask */ if (cpus_updated) { spin_lock_irq(&callback_lock); @@ -3302,9 +3363,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) unsigned long flags; spin_lock_irqsave(&callback_lock, flags); - rcu_read_lock(); - guarantee_online_cpus(task_cs(tsk), pmask); - rcu_read_unlock(); + guarantee_online_cpus(tsk, pmask); spin_unlock_irqrestore(&callback_lock, flags); } @@ -3318,13 +3377,22 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) * which will not contain a sane cpumask during cases such as cpu hotplugging. * This is the absolute last resort for the scheduler and it is only used if * _every_ other avenue has been traveled. 
+ * + * Returns true if the affinity of @tsk was changed, false otherwise. **/ -void cpuset_cpus_allowed_fallback(struct task_struct *tsk) +bool cpuset_cpus_allowed_fallback(struct task_struct *tsk) { + const struct cpumask *possible_mask = task_cpu_possible_mask(tsk); + const struct cpumask *cs_mask; + bool changed = false; + rcu_read_lock(); - do_set_cpus_allowed(tsk, is_in_v2_mode() ? - task_cs(tsk)->cpus_allowed : cpu_possible_mask); + cs_mask = task_cs(tsk)->cpus_allowed; + if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) { + do_set_cpus_allowed(tsk, cs_mask); + changed = true; + } rcu_read_unlock(); /* @@ -3344,6 +3412,7 @@ void cpuset_cpus_allowed_fallback(struct task_struct *tsk) * select_fallback_rq() will fix things ups and set cpu_possible_mask * if required. */ + return changed; } void __init cpuset_init_current_mems_allowed(void) diff --git a/kernel/cpu.c b/kernel/cpu.c index 804b847912dc..192e43a87407 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -41,14 +41,19 @@ #include "smpboot.h" /** - * cpuhp_cpu_state - Per cpu hotplug state storage + * struct cpuhp_cpu_state - Per cpu hotplug state storage * @state: The current cpu state * @target: The target state + * @fail: Current CPU hotplug callback state * @thread: Pointer to the hotplug thread * @should_run: Thread should execute * @rollback: Perform a rollback * @single: Single callback invocation * @bringup: Single callback bringup or teardown selector + * @cpu: CPU number + * @node: Remote CPU node; for multi-instance, do a + * single entry callback for install/remove + * @last: For multi-instance rollback, remember how far we got * @cb_state: The state for a single callback (install/uninstall) * @result: Result of the operation * @done_up: Signal completion to the issuer of the task for cpu-up @@ -106,11 +111,12 @@ static inline void cpuhp_lock_release(bool bringup) { } #endif /** - * cpuhp_step - Hotplug state machine step + * struct cpuhp_step - Hotplug state machine step * @name: 
Name of the step * @startup: Startup function of the step * @teardown: Teardown function of the step * @cant_stop: Bringup/teardown can't be stopped at this step + * @multi_instance: State has multiple instances which get added afterwards */ struct cpuhp_step { const char *name; @@ -124,7 +130,9 @@ struct cpuhp_step { int (*multi)(unsigned int cpu, struct hlist_node *node); } teardown; + /* private: */ struct hlist_head list; + /* public: */ bool cant_stop; bool multi_instance; }; @@ -143,7 +151,7 @@ static bool cpuhp_step_empty(bool bringup, struct cpuhp_step *step) } /** - * cpuhp_invoke_callback _ Invoke the callbacks for a given state + * cpuhp_invoke_callback - Invoke the callbacks for a given state * @cpu: The cpu for which the callback should be invoked * @state: The state to do callbacks for * @bringup: True if the bringup callback should be invoked @@ -151,6 +159,8 @@ static bool cpuhp_step_empty(bool bringup, struct cpuhp_step *step) * @lastp: For multi-instance rollback, remember how far we got * * Called from cpu hotplug and from the state register machinery. 
+ * + * Return: %0 on success or a negative errno code */ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state, bool bringup, struct hlist_node *node, @@ -682,6 +692,10 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, ret = cpuhp_invoke_callback_range(true, cpu, st, target); if (ret) { + pr_debug("CPU UP failed (%d) CPU %u state %s (%d)\n", + ret, cpu, cpuhp_get_step(st->state)->name, + st->state); + cpuhp_reset_state(st, prev_state); if (can_rollback_cpu(st)) WARN_ON(cpuhp_invoke_callback_range(false, cpu, st, @@ -1081,6 +1095,9 @@ static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, ret = cpuhp_invoke_callback_range(false, cpu, st, target); if (ret) { + pr_debug("CPU DOWN failed (%d) CPU %u state %s (%d)\n", + ret, cpu, cpuhp_get_step(st->state)->name, + st->state); cpuhp_reset_state(st, prev_state); @@ -1183,6 +1200,8 @@ static int cpu_down(unsigned int cpu, enum cpuhp_state target) * This function is meant to be used by device core cpu subsystem only. * * Other subsystems should use remove_cpu() instead. + * + * Return: %0 on success or a negative errno code */ int cpu_device_down(struct device *dev) { @@ -1395,6 +1414,8 @@ out: * This function is meant to be used by device core cpu subsystem only. * * Other subsystems should use add_cpu() instead. + * + * Return: %0 on success or a negative errno code */ int cpu_device_up(struct device *dev) { @@ -1420,6 +1441,8 @@ EXPORT_SYMBOL_GPL(add_cpu); * On some architectures like arm64, we can hibernate on any CPU, but on * wake up the CPU we hibernated on might be offline as a side effect of * using maxcpus= for example. 
+ * + * Return: %0 on success or a negative errno code */ int bringup_hibernate_cpu(unsigned int sleep_cpu) { @@ -1976,6 +1999,7 @@ EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance); /** * __cpuhp_setup_state_cpuslocked - Setup the callbacks for an hotplug machine state * @state: The state to setup + * @name: Name of the step * @invoke: If true, the startup function is invoked for cpus where * cpu state >= @state * @startup: startup callback function @@ -1984,9 +2008,9 @@ EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance); * added afterwards. * * The caller needs to hold cpus read locked while calling this function. - * Returns: + * Return: * On success: - * Positive state number if @state is CPUHP_AP_ONLINE_DYN + * Positive state number if @state is CPUHP_AP_ONLINE_DYN; * 0 for all other states * On failure: proper (negative) error code */ @@ -2232,18 +2256,17 @@ int cpuhp_smt_enable(void) #endif #if defined(CONFIG_SYSFS) && defined(CONFIG_HOTPLUG_CPU) -static ssize_t show_cpuhp_state(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t state_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); return sprintf(buf, "%d\n", st->state); } -static DEVICE_ATTR(state, 0444, show_cpuhp_state, NULL); +static DEVICE_ATTR_RO(state); -static ssize_t write_cpuhp_target(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) +static ssize_t target_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); struct cpuhp_step *sp; @@ -2281,19 +2304,17 @@ out: return ret ? 
ret : count; } -static ssize_t show_cpuhp_target(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t target_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); return sprintf(buf, "%d\n", st->target); } -static DEVICE_ATTR(target, 0644, show_cpuhp_target, write_cpuhp_target); - +static DEVICE_ATTR_RW(target); -static ssize_t write_cpuhp_fail(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) +static ssize_t fail_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); struct cpuhp_step *sp; @@ -2342,15 +2363,15 @@ static ssize_t write_cpuhp_fail(struct device *dev, return count; } -static ssize_t show_cpuhp_fail(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t fail_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); return sprintf(buf, "%d\n", st->fail); } -static DEVICE_ATTR(fail, 0644, show_cpuhp_fail, write_cpuhp_fail); +static DEVICE_ATTR_RW(fail); static struct attribute *cpuhp_cpu_attrs[] = { &dev_attr_state.attr, @@ -2365,7 +2386,7 @@ static const struct attribute_group cpuhp_cpu_attr_group = { NULL }; -static ssize_t show_cpuhp_states(struct device *dev, +static ssize_t states_show(struct device *dev, struct device_attribute *attr, char *buf) { ssize_t cur, res = 0; @@ -2384,7 +2405,7 @@ static ssize_t show_cpuhp_states(struct device *dev, mutex_unlock(&cpuhp_state_mutex); return res; } -static DEVICE_ATTR(states, 0444, show_cpuhp_states, NULL); +static DEVICE_ATTR_RO(states); static struct attribute *cpuhp_cpu_root_attrs[] = { &dev_attr_states.attr, @@ -2457,28 +2478,27 @@ static const char *smt_states[] = { [CPU_SMT_NOT_IMPLEMENTED] = "notimplemented", }; -static ssize_t -show_smt_control(struct 
device *dev, struct device_attribute *attr, char *buf) +static ssize_t control_show(struct device *dev, + struct device_attribute *attr, char *buf) { const char *state = smt_states[cpu_smt_control]; return snprintf(buf, PAGE_SIZE - 2, "%s\n", state); } -static ssize_t -store_smt_control(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) +static ssize_t control_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) { return __store_smt_control(dev, attr, buf, count); } -static DEVICE_ATTR(control, 0644, show_smt_control, store_smt_control); +static DEVICE_ATTR_RW(control); -static ssize_t -show_smt_active(struct device *dev, struct device_attribute *attr, char *buf) +static ssize_t active_show(struct device *dev, + struct device_attribute *attr, char *buf) { return snprintf(buf, PAGE_SIZE - 2, "%d\n", sched_smt_active()); } -static DEVICE_ATTR(active, 0444, show_smt_active, NULL); +static DEVICE_ATTR_RO(active); static struct attribute *cpuhp_smt_attrs[] = { &dev_attr_control.attr, diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c index f7e1d0eccdbc..246efc74e3f3 100644 --- a/kernel/cpu_pm.c +++ b/kernel/cpu_pm.c @@ -13,19 +13,32 @@ #include <linux/spinlock.h> #include <linux/syscore_ops.h> -static ATOMIC_NOTIFIER_HEAD(cpu_pm_notifier_chain); +/* + * atomic_notifiers use a spinlock_t, which can block under PREEMPT_RT. + * Notifications for cpu_pm will be issued by the idle task itself, which can + * never block, IOW it requires using a raw_spinlock_t. + */ +static struct { + struct raw_notifier_head chain; + raw_spinlock_t lock; +} cpu_pm_notifier = { + .chain = RAW_NOTIFIER_INIT(cpu_pm_notifier.chain), + .lock = __RAW_SPIN_LOCK_UNLOCKED(cpu_pm_notifier.lock), +}; static int cpu_pm_notify(enum cpu_pm_event event) { int ret; /* - * atomic_notifier_call_chain has a RCU read critical section, which - * could be disfunctional in cpu idle. Copy RCU_NONIDLE code to let - * RCU know this. 
+ * This introduces a RCU read critical section, which could be + * disfunctional in cpu idle. Copy RCU_NONIDLE code to let RCU know + * this. */ rcu_irq_enter_irqson(); - ret = atomic_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL); + rcu_read_lock(); + ret = raw_notifier_call_chain(&cpu_pm_notifier.chain, event, NULL); + rcu_read_unlock(); rcu_irq_exit_irqson(); return notifier_to_errno(ret); @@ -33,10 +46,13 @@ static int cpu_pm_notify(enum cpu_pm_event event) static int cpu_pm_notify_robust(enum cpu_pm_event event_up, enum cpu_pm_event event_down) { + unsigned long flags; int ret; rcu_irq_enter_irqson(); - ret = atomic_notifier_call_chain_robust(&cpu_pm_notifier_chain, event_up, event_down, NULL); + raw_spin_lock_irqsave(&cpu_pm_notifier.lock, flags); + ret = raw_notifier_call_chain_robust(&cpu_pm_notifier.chain, event_up, event_down, NULL); + raw_spin_unlock_irqrestore(&cpu_pm_notifier.lock, flags); rcu_irq_exit_irqson(); return notifier_to_errno(ret); @@ -49,12 +65,17 @@ static int cpu_pm_notify_robust(enum cpu_pm_event event_up, enum cpu_pm_event ev * Add a driver to a list of drivers that are notified about * CPU and CPU cluster low power entry and exit. * - * This function may sleep, and has the same return conditions as - * raw_notifier_chain_register. + * This function has the same return conditions as raw_notifier_chain_register. */ int cpu_pm_register_notifier(struct notifier_block *nb) { - return atomic_notifier_chain_register(&cpu_pm_notifier_chain, nb); + unsigned long flags; + int ret; + + raw_spin_lock_irqsave(&cpu_pm_notifier.lock, flags); + ret = raw_notifier_chain_register(&cpu_pm_notifier.chain, nb); + raw_spin_unlock_irqrestore(&cpu_pm_notifier.lock, flags); + return ret; } EXPORT_SYMBOL_GPL(cpu_pm_register_notifier); @@ -64,12 +85,17 @@ EXPORT_SYMBOL_GPL(cpu_pm_register_notifier); * * Remove a driver from the CPU PM notifier list. * - * This function may sleep, and has the same return conditions as - * raw_notifier_chain_unregister. 
+ * This function has the same return conditions as raw_notifier_chain_unregister. */ int cpu_pm_unregister_notifier(struct notifier_block *nb) { - return atomic_notifier_chain_unregister(&cpu_pm_notifier_chain, nb); + unsigned long flags; + int ret; + + raw_spin_lock_irqsave(&cpu_pm_notifier.lock, flags); + ret = raw_notifier_chain_unregister(&cpu_pm_notifier.chain, nb); + raw_spin_unlock_irqrestore(&cpu_pm_notifier.lock, flags); + return ret; } EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier); diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 835973444a1e..f32320ac02fd 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -568,7 +568,7 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr, if (!cpu_events) return (void __percpu __force *)ERR_PTR(-ENOMEM); - get_online_cpus(); + cpus_read_lock(); for_each_online_cpu(cpu) { bp = perf_event_create_kernel_counter(attr, cpu, NULL, triggered, context); @@ -579,7 +579,7 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr, per_cpu(*cpu_events, cpu) = bp; } - put_online_cpus(); + cpus_read_unlock(); if (likely(!err)) return cpu_events; diff --git a/kernel/exit.c b/kernel/exit.c index 9a89e7f36acb..91a43e57a32e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -777,7 +777,7 @@ void __noreturn do_exit(long code) schedule(); } - io_uring_files_cancel(tsk->files); + io_uring_files_cancel(); exit_signals(tsk); /* sets PF_EXITING */ /* sync mm's RSS info before statistics gathering */ diff --git a/kernel/fork.c b/kernel/fork.c index c97e85245dfc..695d1343a254 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -446,6 +446,7 @@ void put_task_stack(struct task_struct *tsk) void free_task(struct task_struct *tsk) { + release_user_cpus_ptr(tsk); scs_release(tsk); #ifndef CONFIG_THREAD_INFO_IN_TASK @@ -924,6 +925,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) #endif if (orig->cpus_ptr == &orig->cpus_mask) tsk->cpus_ptr = 
&tsk->cpus_mask; + dup_user_cpus_ptr(tsk, orig, node); /* * One for the user space visible state that goes away when reaped. diff --git a/kernel/futex.c b/kernel/futex.c index 2ecb07575055..e7b4c6121da4 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -179,7 +179,7 @@ struct futex_pi_state { /* * The PI object: */ - struct rt_mutex pi_mutex; + struct rt_mutex_base pi_mutex; struct task_struct *owner; refcount_t refcount; @@ -197,6 +197,8 @@ struct futex_pi_state { * @rt_waiter: rt_waiter storage for use with requeue_pi * @requeue_pi_key: the requeue_pi target futex key * @bitset: bitset for the optional bitmasked wakeup + * @requeue_state: State field for futex_requeue_pi() + * @requeue_wait: RCU wait for futex_requeue_pi() (RT only) * * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so * we can wake only the relevant ones (hashed queues may be shared). @@ -219,12 +221,68 @@ struct futex_q { struct rt_mutex_waiter *rt_waiter; union futex_key *requeue_pi_key; u32 bitset; + atomic_t requeue_state; +#ifdef CONFIG_PREEMPT_RT + struct rcuwait requeue_wait; +#endif } __randomize_layout; +/* + * On PREEMPT_RT, the hash bucket lock is a 'sleeping' spinlock with an + * underlying rtmutex. The task which is about to be requeued could have + * just woken up (timeout, signal). After the wake up the task has to + * acquire hash bucket lock, which is held by the requeue code. As a task + * can only be blocked on _ONE_ rtmutex at a time, the proxy lock blocking + * and the hash bucket lock blocking would collide and corrupt state. + * + * On !PREEMPT_RT this is not a problem and everything could be serialized + * on hash bucket lock, but aside of having the benefit of common code, + * this allows to avoid doing the requeue when the task is already on the + * way out and taking the hash bucket lock of the original uaddr1 when the + * requeue has been completed. 
+ * + * The following state transitions are valid: + * + * On the waiter side: + * Q_REQUEUE_PI_NONE -> Q_REQUEUE_PI_IGNORE + * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_WAIT + * + * On the requeue side: + * Q_REQUEUE_PI_NONE -> Q_REQUEUE_PI_IN_PROGRESS + * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_DONE/LOCKED + * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_NONE (requeue failed) + * Q_REQUEUE_PI_WAIT -> Q_REQUEUE_PI_DONE/LOCKED + * Q_REQUEUE_PI_WAIT -> Q_REQUEUE_PI_IGNORE (requeue failed) + * + * The requeue side ignores a waiter with state Q_REQUEUE_PI_IGNORE as this + * signals that the waiter is already on the way out. It also means that + * the waiter is still on the 'wait' futex, i.e. uaddr1. + * + * The waiter side signals early wakeup to the requeue side either through + * setting state to Q_REQUEUE_PI_IGNORE or to Q_REQUEUE_PI_WAIT depending + * on the current state. In case of Q_REQUEUE_PI_IGNORE it can immediately + * proceed to take the hash bucket lock of uaddr1. If it set state to WAIT, + * which means the wakeup is interleaving with a requeue in progress it has + * to wait for the requeue side to change the state. Either to DONE/LOCKED + * or to IGNORE. DONE/LOCKED means the waiter q is now on the uaddr2 futex + * and either blocked (DONE) or has acquired it (LOCKED). IGNORE is set by + * the requeue side when the requeue attempt failed via deadlock detection + * and therefore the waiter q is still on the uaddr1 futex.
+ */ +enum { + Q_REQUEUE_PI_NONE = 0, + Q_REQUEUE_PI_IGNORE, + Q_REQUEUE_PI_IN_PROGRESS, + Q_REQUEUE_PI_WAIT, + Q_REQUEUE_PI_DONE, + Q_REQUEUE_PI_LOCKED, +}; + static const struct futex_q futex_q_init = { /* list gets initialized in queue_me()*/ - .key = FUTEX_KEY_INIT, - .bitset = FUTEX_BITSET_MATCH_ANY + .key = FUTEX_KEY_INIT, + .bitset = FUTEX_BITSET_MATCH_ANY, + .requeue_state = ATOMIC_INIT(Q_REQUEUE_PI_NONE), }; /* @@ -1299,27 +1357,6 @@ static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key, return 0; } -static int lookup_pi_state(u32 __user *uaddr, u32 uval, - struct futex_hash_bucket *hb, - union futex_key *key, struct futex_pi_state **ps, - struct task_struct **exiting) -{ - struct futex_q *top_waiter = futex_top_waiter(hb, key); - - /* - * If there is a waiter on that futex, validate it and - * attach to the pi_state when the validation succeeds. - */ - if (top_waiter) - return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps); - - /* - * We are the first waiter - try to look up the owner based on - * @uval and attach to it. - */ - return attach_to_pi_owner(uaddr, uval, key, ps, exiting); -} - static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) { int err; @@ -1354,7 +1391,7 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) * - 1 - acquired the lock; * - <0 - error * - * The hb->lock and futex_key refs shall be held by the caller. + * The hb->lock must be held by the caller. * * @exiting is only set when the return value is -EBUSY. 
If so, this holds * a refcount on the exiting task on return and the caller needs to drop it @@ -1493,11 +1530,11 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q) */ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state) { - u32 curval, newval; struct rt_mutex_waiter *top_waiter; struct task_struct *new_owner; bool postunlock = false; - DEFINE_WAKE_Q(wake_q); + DEFINE_RT_WAKE_Q(wqh); + u32 curval, newval; int ret = 0; top_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex); @@ -1549,14 +1586,14 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_ * not fail. */ pi_state_update_owner(pi_state, new_owner); - postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q); + postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wqh); } out_unlock: raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); if (postunlock) - rt_mutex_postunlock(&wake_q); + rt_mutex_postunlock(&wqh); return ret; } @@ -1793,6 +1830,108 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, q->key = *key2; } +static inline bool futex_requeue_pi_prepare(struct futex_q *q, + struct futex_pi_state *pi_state) +{ + int old, new; + + /* + * Set state to Q_REQUEUE_PI_IN_PROGRESS unless an early wakeup has + * already set Q_REQUEUE_PI_IGNORE to signal that requeue should + * ignore the waiter. + */ + old = atomic_read_acquire(&q->requeue_state); + do { + if (old == Q_REQUEUE_PI_IGNORE) + return false; + + /* + * futex_proxy_trylock_atomic() might have set it to + * IN_PROGRESS and a interleaved early wake to WAIT. + * + * It was considered to have an extra state for that + * trylock, but that would just add more conditionals + * all over the place for a dubious value. 
+ */ + if (old != Q_REQUEUE_PI_NONE) + break; + + new = Q_REQUEUE_PI_IN_PROGRESS; + } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new)); + + q->pi_state = pi_state; + return true; +} + +static inline void futex_requeue_pi_complete(struct futex_q *q, int locked) +{ + int old, new; + + old = atomic_read_acquire(&q->requeue_state); + do { + if (old == Q_REQUEUE_PI_IGNORE) + return; + + if (locked >= 0) { + /* Requeue succeeded. Set DONE or LOCKED */ + WARN_ON_ONCE(old != Q_REQUEUE_PI_IN_PROGRESS && + old != Q_REQUEUE_PI_WAIT); + new = Q_REQUEUE_PI_DONE + locked; + } else if (old == Q_REQUEUE_PI_IN_PROGRESS) { + /* Deadlock, no early wakeup interleave */ + new = Q_REQUEUE_PI_NONE; + } else { + /* Deadlock, early wakeup interleave. */ + WARN_ON_ONCE(old != Q_REQUEUE_PI_WAIT); + new = Q_REQUEUE_PI_IGNORE; + } + } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new)); + +#ifdef CONFIG_PREEMPT_RT + /* If the waiter interleaved with the requeue let it know */ + if (unlikely(old == Q_REQUEUE_PI_WAIT)) + rcuwait_wake_up(&q->requeue_wait); +#endif +} + +static inline int futex_requeue_pi_wakeup_sync(struct futex_q *q) +{ + int old, new; + + old = atomic_read_acquire(&q->requeue_state); + do { + /* Is requeue done already? */ + if (old >= Q_REQUEUE_PI_DONE) + return old; + + /* + * If not done, then tell the requeue code to either ignore + * the waiter or to wake it up once the requeue is done. + */ + new = Q_REQUEUE_PI_WAIT; + if (old == Q_REQUEUE_PI_NONE) + new = Q_REQUEUE_PI_IGNORE; + } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new)); + + /* If the requeue was in progress, wait for it to complete */ + if (old == Q_REQUEUE_PI_IN_PROGRESS) { +#ifdef CONFIG_PREEMPT_RT + rcuwait_wait_event(&q->requeue_wait, + atomic_read(&q->requeue_state) != Q_REQUEUE_PI_WAIT, + TASK_UNINTERRUPTIBLE); +#else + (void)atomic_cond_read_relaxed(&q->requeue_state, VAL != Q_REQUEUE_PI_WAIT); +#endif + } + + /* + * Requeue is now either prohibited or complete. 
Reread state + * because during the wait above it might have changed. Nothing + * will modify q->requeue_state after this point. + */ + return atomic_read(&q->requeue_state); +} + /** * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue * @q: the futex_q @@ -1820,6 +1959,8 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, q->lock_ptr = &hb->lock; + /* Signal locked state to the waiter */ + futex_requeue_pi_complete(q, 1); wake_up_state(q->task, TASK_NORMAL); } @@ -1879,10 +2020,21 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, if (!top_waiter) return 0; + /* + * Ensure that this is a waiter sitting in futex_wait_requeue_pi() + * and waiting on the 'waitqueue' futex which is always !PI. + */ + if (!top_waiter->rt_waiter || top_waiter->pi_state) + ret = -EINVAL; + /* Ensure we requeue to the expected futex. */ if (!match_futex(top_waiter->requeue_pi_key, key2)) return -EINVAL; + /* Ensure that this does not race against an early wakeup */ + if (!futex_requeue_pi_prepare(top_waiter, NULL)) + return -EAGAIN; + /* * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in * the contended case or if set_waiters is 1. The pi_state is returned @@ -1892,8 +2044,22 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, exiting, set_waiters); if (ret == 1) { + /* Dequeue, wake up and update top_waiter::requeue_state */ requeue_pi_wake_futex(top_waiter, key2, hb2); return vpid; + } else if (ret < 0) { + /* Rewind top_waiter::requeue_state */ + futex_requeue_pi_complete(top_waiter, ret); + } else { + /* + * futex_lock_pi_atomic() did not acquire the user space + * futex, but managed to establish the proxy lock and pi + * state. top_waiter::requeue_state cannot be fixed up here + * because the waiter is not enqueued on the rtmutex + * yet. 
This is handled at the callsite depending on the + * result of rt_mutex_start_proxy_lock() which is + * guaranteed to be reached with this function returning 0. + */ } return ret; } @@ -1948,23 +2114,35 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, return -EINVAL; /* + * futex_requeue() allows the caller to define the number + * of waiters to wake up via the @nr_wake argument. With + * REQUEUE_PI, waking up more than one waiter is creating + * more problems than it solves. Waking up a waiter makes + * only sense if the PI futex @uaddr2 is uncontended as + * this allows the requeue code to acquire the futex + * @uaddr2 before waking the waiter. The waiter can then + * return to user space without further action. A secondary + * wakeup would just make the futex_wait_requeue_pi() + * handling more complex, because that code would have to + * look up pi_state and do more or less all the handling + * which the requeue code has to do for the to be requeued + * waiters. So restrict the number of waiters to wake to + * one, and only wake it up when the PI futex is + * uncontended. Otherwise requeue it and let the unlock of + * the PI futex handle the wakeup. + * + * All REQUEUE_PI users, e.g. pthread_cond_signal() and + * pthread_cond_broadcast() must use nr_wake=1. + */ + if (nr_wake != 1) + return -EINVAL; + + /* * requeue_pi requires a pi_state, try to allocate it now * without any locks in case it fails. */ if (refill_pi_state_cache()) return -ENOMEM; - /* - * requeue_pi must wake as many tasks as it can, up to nr_wake - * + nr_requeue, since it acquires the rt_mutex prior to - * returning to userspace, so as to not leave the rt_mutex with - * waiters and no owner. However, second and third wake-ups - * cannot be predicted as they involve race conditions with the - * first wake and a fault while looking up the pi_state. Both - * pthread_cond_signal() and pthread_cond_broadcast() should - * use nr_wake=1. 
- */ - if (nr_wake != 1) - return -EINVAL; } retry: @@ -2014,7 +2192,7 @@ retry_private: } } - if (requeue_pi && (task_count - nr_wake < nr_requeue)) { + if (requeue_pi) { struct task_struct *exiting = NULL; /* @@ -2022,6 +2200,8 @@ retry_private: * intend to requeue waiters, force setting the FUTEX_WAITERS * bit. We force this here where we are able to easily handle * faults rather in the requeue loop below. + * + * Updates topwaiter::requeue_state if a top waiter exists. */ ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1, &key2, &pi_state, @@ -2031,28 +2211,52 @@ retry_private: * At this point the top_waiter has either taken uaddr2 or is * waiting on it. If the former, then the pi_state will not * exist yet, look it up one more time to ensure we have a - * reference to it. If the lock was taken, ret contains the - * vpid of the top waiter task. + * reference to it. If the lock was taken, @ret contains the + * VPID of the top waiter task. * If the lock was not taken, we have pi_state and an initial * refcount on it. In case of an error we have nothing. + * + * The top waiter's requeue_state is up to date: + * + * - If the lock was acquired atomically (ret > 0), then + * the state is Q_REQUEUE_PI_LOCKED. + * + * - If the trylock failed with an error (ret < 0) then + * the state is either Q_REQUEUE_PI_NONE, i.e. "nothing + * happened", or Q_REQUEUE_PI_IGNORE when there was an + * interleaved early wakeup. + * + * - If the trylock did not succeed (ret == 0) then the + * state is either Q_REQUEUE_PI_IN_PROGRESS or + * Q_REQUEUE_PI_WAIT if an early wakeup interleaved. + * This will be cleaned up in the loop below, which + * cannot fail because futex_proxy_trylock_atomic() did + * the same sanity checks for requeue_pi as the loop + * below does. */ if (ret > 0) { WARN_ON(pi_state); task_count++; /* - * If we acquired the lock, then the user space value - * of uaddr2 should be vpid. 
It cannot be changed by - * the top waiter as it is blocked on hb2 lock if it - * tries to do so. If something fiddled with it behind - * our back the pi state lookup might unearth it. So - * we rather use the known value than rereading and - * handing potential crap to lookup_pi_state. + * If futex_proxy_trylock_atomic() acquired the + * user space futex, then the user space value + * @uaddr2 has been set to the @hb1's top waiter + * task VPID. This task is guaranteed to be alive + * and cannot be exiting because it is either + * sleeping or blocked on @hb2 lock. + * + * The @uaddr2 futex cannot have waiters either as + * otherwise futex_proxy_trylock_atomic() would not + * have succeeded. * - * If that call succeeds then we have pi_state and an - * initial refcount on it. + * In order to requeue waiters to @hb2, pi state is + * required. Hand in the VPID value (@ret) and + * allocate PI state with an initial refcount on + * it. */ - ret = lookup_pi_state(uaddr2, ret, hb2, &key2, - &pi_state, &exiting); + ret = attach_to_pi_owner(uaddr2, ret, &key2, &pi_state, + &exiting); + WARN_ON(ret); } switch (ret) { @@ -2060,7 +2264,10 @@ retry_private: /* We hold a reference on the pi state. */ break; - /* If the above failed, then pi_state is NULL */ + /* + * If the above failed, then pi_state is NULL and + * waiter::requeue_state is correct. + */ case -EFAULT: double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); @@ -2112,18 +2319,17 @@ retry_private: break; } - /* - * Wake nr_wake waiters. For requeue_pi, if we acquired the - * lock, we already woke the top_waiter. If not, it will be - * woken by futex_unlock_pi(). - */ - if (++task_count <= nr_wake && !requeue_pi) { - mark_wake_futex(&wake_q, this); + /* Plain futexes just wake or requeue and are done */ + if (!requeue_pi) { + if (++task_count <= nr_wake) + mark_wake_futex(&wake_q, this); + else + requeue_futex(this, hb1, hb2, &key2); continue; } /* Ensure we requeue to the expected futex for requeue_pi. 
*/ - if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) { + if (!match_futex(this->requeue_pi_key, &key2)) { ret = -EINVAL; break; } @@ -2131,54 +2337,67 @@ retry_private: /* * Requeue nr_requeue waiters and possibly one more in the case * of requeue_pi if we couldn't acquire the lock atomically. + * + * Prepare the waiter to take the rt_mutex. Take a refcount + * on the pi_state and store the pointer in the futex_q + * object of the waiter. */ - if (requeue_pi) { + get_pi_state(pi_state); + + /* Don't requeue when the waiter is already on the way out. */ + if (!futex_requeue_pi_prepare(this, pi_state)) { /* - * Prepare the waiter to take the rt_mutex. Take a - * refcount on the pi_state and store the pointer in - * the futex_q object of the waiter. + * Early woken waiter signaled that it is on the + * way out. Drop the pi_state reference and try the + * next waiter. @this->pi_state is still NULL. */ - get_pi_state(pi_state); - this->pi_state = pi_state; - ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, - this->rt_waiter, - this->task); - if (ret == 1) { - /* - * We got the lock. We do neither drop the - * refcount on pi_state nor clear - * this->pi_state because the waiter needs the - * pi_state for cleaning up the user space - * value. It will drop the refcount after - * doing so. - */ - requeue_pi_wake_futex(this, &key2, hb2); - continue; - } else if (ret) { - /* - * rt_mutex_start_proxy_lock() detected a - * potential deadlock when we tried to queue - * that waiter. Drop the pi_state reference - * which we took above and remove the pointer - * to the state from the waiters futex_q - * object. - */ - this->pi_state = NULL; - put_pi_state(pi_state); - /* - * We stop queueing more waiters and let user - * space deal with the mess. - */ - break; - } + put_pi_state(pi_state); + continue; + } + + ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, + this->rt_waiter, + this->task); + + if (ret == 1) { + /* + * We got the lock. 
We do neither drop the refcount + * on pi_state nor clear this->pi_state because the + * waiter needs the pi_state for cleaning up the + * user space value. It will drop the refcount + * after doing so. this::requeue_state is updated + * in the wakeup as well. + */ + requeue_pi_wake_futex(this, &key2, hb2); + task_count++; + } else if (!ret) { + /* Waiter is queued, move it to hb2 */ + requeue_futex(this, hb1, hb2, &key2); + futex_requeue_pi_complete(this, 0); + task_count++; + } else { + /* + * rt_mutex_start_proxy_lock() detected a potential + * deadlock when we tried to queue that waiter. + * Drop the pi_state reference which we took above + * and remove the pointer to the state from the + * waiters futex_q object. + */ + this->pi_state = NULL; + put_pi_state(pi_state); + futex_requeue_pi_complete(this, ret); + /* + * We stop queueing more waiters and let user space + * deal with the mess. + */ + break; } - requeue_futex(this, hb1, hb2, &key2); } /* - * We took an extra initial reference to the pi_state either - * in futex_proxy_trylock_atomic() or in lookup_pi_state(). We - * need to drop it here again. + * We took an extra initial reference to the pi_state either in + * futex_proxy_trylock_atomic() or in attach_to_pi_owner(). We need + * to drop it here again. */ put_pi_state(pi_state); @@ -2357,7 +2576,7 @@ static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, * Modifying pi_state _before_ the user space value would leave the * pi_state in an inconsistent state when we fault here, because we * need to drop the locks to handle the fault. This might be observed - * in the PID check in lookup_pi_state. + * in the PID checks when attaching to PI state . */ retry: if (!argowner) { @@ -2614,8 +2833,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, * * Setup the futex_q and locate the hash_bucket. Get the futex value and * compare it with the expected value. Handle atomic faults internally. 
- * Return with the hb lock held and a q.key reference on success, and unlocked - * with no q.key reference on failure. + * Return with the hb lock held on success, and unlocked on failure. * * Return: * - 0 - uaddr contains val and hb has been locked; @@ -2693,8 +2911,8 @@ static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, current->timer_slack_ns); retry: /* - * Prepare to wait on uaddr. On success, holds hb lock and increments - * q.key refs. + * Prepare to wait on uaddr. On success, it holds hb->lock and q + * is initialized. */ ret = futex_wait_setup(uaddr, val, flags, &q, &hb); if (ret) @@ -2705,7 +2923,6 @@ retry: /* If we were woken (and unqueued), we succeeded, whatever. */ ret = 0; - /* unqueue_me() drops q.key ref */ if (!unqueue_me(&q)) goto out; ret = -ETIMEDOUT; @@ -3072,27 +3289,22 @@ pi_faulted: } /** - * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex + * handle_early_requeue_pi_wakeup() - Handle early wakeup on the initial futex * @hb: the hash_bucket futex_q was original enqueued on * @q: the futex_q woken while waiting to be requeued - * @key2: the futex_key of the requeue target futex * @timeout: the timeout associated with the wait (NULL if none) * - * Detect if the task was woken on the initial futex as opposed to the requeue - * target futex. If so, determine if it was a timeout or a signal that caused - * the wakeup and return the appropriate error code to the caller. Must be - * called with the hb lock held. + * Determine the cause for the early wakeup. * * Return: - * - 0 = no early wakeup detected; - * - <0 = -ETIMEDOUT or -ERESTARTNOINTR + * -EWOULDBLOCK or -ETIMEDOUT or -ERESTARTNOINTR */ static inline int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, - struct futex_q *q, union futex_key *key2, + struct futex_q *q, struct hrtimer_sleeper *timeout) { - int ret = 0; + int ret; /* * With the hb lock held, we avoid races while we process the wakeup. 
@@ -3101,22 +3313,21 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * It can't be requeued from uaddr2 to something else since we don't * support a PI aware source futex for requeue. */ - if (!match_futex(&q->key, key2)) { - WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr)); - /* - * We were woken prior to requeue by a timeout or a signal. - * Unqueue the futex_q and determine which it was. - */ - plist_del(&q->list, &hb->chain); - hb_waiters_dec(hb); + WARN_ON_ONCE(&hb->lock != q->lock_ptr); - /* Handle spurious wakeups gracefully */ - ret = -EWOULDBLOCK; - if (timeout && !timeout->task) - ret = -ETIMEDOUT; - else if (signal_pending(current)) - ret = -ERESTARTNOINTR; - } + /* + * We were woken prior to requeue by a timeout or a signal. + * Unqueue the futex_q and determine which it was. + */ + plist_del(&q->list, &hb->chain); + hb_waiters_dec(hb); + + /* Handle spurious wakeups gracefully */ + ret = -EWOULDBLOCK; + if (timeout && !timeout->task) + ret = -ETIMEDOUT; + else if (signal_pending(current)) + ret = -ERESTARTNOINTR; return ret; } @@ -3169,6 +3380,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, struct futex_hash_bucket *hb; union futex_key key2 = FUTEX_KEY_INIT; struct futex_q q = futex_q_init; + struct rt_mutex_base *pi_mutex; int res, ret; if (!IS_ENABLED(CONFIG_FUTEX_PI)) @@ -3198,8 +3410,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, q.requeue_pi_key = &key2; /* - * Prepare to wait on uaddr. On success, increments q.key (key1) ref - * count. + * Prepare to wait on uaddr. On success, it holds hb->lock and q + * is initialized. */ ret = futex_wait_setup(uaddr, val, flags, &q, &hb); if (ret) @@ -3218,32 +3430,22 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, /* Queue the futex_q, drop the hb lock, wait for wakeup. 
*/ futex_wait_queue_me(hb, &q, to); - spin_lock(&hb->lock); - ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); - spin_unlock(&hb->lock); - if (ret) - goto out; - - /* - * In order for us to be here, we know our q.key == key2, and since - * we took the hb->lock above, we also know that futex_requeue() has - * completed and we no longer have to concern ourselves with a wakeup - * race with the atomic proxy lock acquisition by the requeue code. The - * futex_requeue dropped our key1 reference and incremented our key2 - * reference count. - */ + switch (futex_requeue_pi_wakeup_sync(&q)) { + case Q_REQUEUE_PI_IGNORE: + /* The waiter is still on uaddr1 */ + spin_lock(&hb->lock); + ret = handle_early_requeue_pi_wakeup(hb, &q, to); + spin_unlock(&hb->lock); + break; - /* - * Check if the requeue code acquired the second futex for us and do - * any pertinent fixup. - */ - if (!q.rt_waiter) { + case Q_REQUEUE_PI_LOCKED: + /* The requeue acquired the lock */ if (q.pi_state && (q.pi_state->owner != current)) { spin_lock(q.lock_ptr); ret = fixup_owner(uaddr2, &q, true); /* - * Drop the reference to the pi state which - * the requeue_pi() code acquired for us. + * Drop the reference to the pi state which the + * requeue_pi() code acquired for us. */ put_pi_state(q.pi_state); spin_unlock(q.lock_ptr); @@ -3253,18 +3455,14 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, */ ret = ret < 0 ? ret : 0; } - } else { - struct rt_mutex *pi_mutex; + break; - /* - * We have been woken up by futex_unlock_pi(), a timeout, or a - * signal. futex_unlock_pi() will not destroy the lock_ptr nor - * the pi_state. - */ - WARN_ON(!q.pi_state); + case Q_REQUEUE_PI_DONE: + /* Requeue completed. 
Current is 'pi_blocked_on' the rtmutex */ pi_mutex = &q.pi_state->pi_mutex; ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter); + /* Current is no longer pi_blocked_on */ spin_lock(q.lock_ptr); if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter)) ret = 0; @@ -3284,17 +3482,21 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, unqueue_me_pi(&q); spin_unlock(q.lock_ptr); - } - if (ret == -EINTR) { - /* - * We've already been requeued, but cannot restart by calling - * futex_lock_pi() directly. We could restart this syscall, but - * it would detect that the user space "val" changed and return - * -EWOULDBLOCK. Save the overhead of the restart and return - * -EWOULDBLOCK directly. - */ - ret = -EWOULDBLOCK; + if (ret == -EINTR) { + /* + * We've already been requeued, but cannot restart + * by calling futex_lock_pi() directly. We could + * restart this syscall, but it would detect that + * the user space "val" changed and return + * -EWOULDBLOCK. Save the overhead of the restart + * and return -EWOULDBLOCK directly.
+ */ + ret = -EWOULDBLOCK; + } + break; + default: + BUG(); } out: diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 4d89ad4fae3b..f7ff8919dc9b 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -355,7 +355,7 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs, goto fail_npresmsk; /* Stabilize the cpumasks */ - get_online_cpus(); + cpus_read_lock(); build_node_to_cpumask(node_to_cpumask); /* Spread on present CPUs starting from affd->pre_vectors */ @@ -384,7 +384,7 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs, nr_others = ret; fail_build_affinity: - put_online_cpus(); + cpus_read_unlock(); if (ret >= 0) WARN_ON(nr_present + nr_others < numvecs); @@ -505,9 +505,9 @@ unsigned int irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec, if (affd->calc_sets) { set_vecs = maxvec - resv; } else { - get_online_cpus(); + cpus_read_lock(); set_vecs = cpumask_weight(cpu_possible_mask); - put_online_cpus(); + cpus_read_unlock(); } return resv + min(set_vecs, maxvec - resv); diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 02236b13b359..39a41c56ad4f 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -166,7 +166,7 @@ void irq_migrate_all_off_this_cpu(void) raw_spin_unlock(&desc->lock); if (affinity_broken) { - pr_warn_ratelimited("IRQ %u: no longer affine to CPU%u\n", + pr_debug_ratelimited("IRQ %u: no longer affine to CPU%u\n", irq, smp_processor_id()); } } diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index f8f23af6ab0d..cc7cdd26e23e 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -240,9 +240,8 @@ irq_alloc_generic_chip(const char *name, int num_ct, unsigned int irq_base, void __iomem *reg_base, irq_flow_handler_t handler) { struct irq_chip_generic *gc; - unsigned long sz = sizeof(*gc) + num_ct * sizeof(struct irq_chip_type); - gc = kzalloc(sz, GFP_KERNEL); + gc = 
kzalloc(struct_size(gc, chip_types, num_ct), GFP_KERNEL); if (gc) { irq_init_generic_chip(gc, name, num_ct, irq_base, reg_base, handler); @@ -288,8 +287,11 @@ int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, { struct irq_domain_chip_generic *dgc; struct irq_chip_generic *gc; - int numchips, sz, i; unsigned long flags; + int numchips, i; + size_t dgc_sz; + size_t gc_sz; + size_t sz; void *tmp; if (d->gc) @@ -300,8 +302,9 @@ int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, return -EINVAL; /* Allocate a pointer, generic chip and chiptypes for each chip */ - sz = sizeof(*dgc) + numchips * sizeof(gc); - sz += numchips * (sizeof(*gc) + num_ct * sizeof(struct irq_chip_type)); + gc_sz = struct_size(gc, chip_types, num_ct); + dgc_sz = struct_size(dgc, gc, numchips); + sz = dgc_sz + numchips * gc_sz; tmp = dgc = kzalloc(sz, GFP_KERNEL); if (!dgc) @@ -314,7 +317,7 @@ int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, d->gc = dgc; /* Calc pointer to the first generic chip */ - tmp += sizeof(*dgc) + numchips * sizeof(gc); + tmp += dgc_sz; for (i = 0; i < numchips; i++) { /* Store the pointer to the generic chip */ dgc->gc[i] = gc = tmp; @@ -331,7 +334,7 @@ int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, list_add_tail(&gc->list, &gc_list); raw_spin_unlock_irqrestore(&gc_lock, flags); /* Calc pointer to the next generic chip */ - tmp += sizeof(*gc) + num_ct * sizeof(struct irq_chip_type); + tmp += gc_sz; } return 0; } diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c index 52f11c791bf8..08ce7da3b57c 100644 --- a/kernel/irq/ipi.c +++ b/kernel/irq/ipi.c @@ -14,11 +14,11 @@ /** * irq_reserve_ipi() - Setup an IPI to destination cpumask * @domain: IPI domain - * @dest: cpumask of cpus which can receive the IPI + * @dest: cpumask of CPUs which can receive the IPI * * Allocate a virq that can be used to send IPI to any CPU in dest mask. 
* - * On success it'll return linux irq number and error code on failure + * Return: Linux IRQ number on success or error code on failure */ int irq_reserve_ipi(struct irq_domain *domain, const struct cpumask *dest) @@ -104,13 +104,13 @@ free_descs: /** * irq_destroy_ipi() - unreserve an IPI that was previously allocated - * @irq: linux irq number to be destroyed - * @dest: cpumask of cpus which should have the IPI removed + * @irq: Linux IRQ number to be destroyed + * @dest: cpumask of CPUs which should have the IPI removed * * The IPIs allocated with irq_reserve_ipi() are returned to the system * destroying all virqs associated with them. * - * Return 0 on success or error code on failure. + * Return: %0 on success or error code on failure. */ int irq_destroy_ipi(unsigned int irq, const struct cpumask *dest) { @@ -150,14 +150,14 @@ int irq_destroy_ipi(unsigned int irq, const struct cpumask *dest) } /** - * ipi_get_hwirq - Get the hwirq associated with an IPI to a cpu - * @irq: linux irq number - * @cpu: the target cpu + * ipi_get_hwirq - Get the hwirq associated with an IPI to a CPU + * @irq: Linux IRQ number + * @cpu: the target CPU * * When dealing with coprocessors IPI, we need to inform the coprocessor of * the hwirq it needs to use to receive and send IPIs. * - * Returns hwirq value on success and INVALID_HWIRQ on failure. + * Return: hwirq value on success or INVALID_HWIRQ on failure. */ irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu) { @@ -216,7 +216,7 @@ static int ipi_send_verify(struct irq_chip *chip, struct irq_data *data, * This function is for architecture or core code to speed up IPI sending. Not * usable from driver code. * - * Returns zero on success and negative error number on failure. + * Return: %0 on success or negative error number on failure. 
*/ int __ipi_send_single(struct irq_desc *desc, unsigned int cpu) { @@ -250,7 +250,7 @@ int __ipi_send_single(struct irq_desc *desc, unsigned int cpu) } /** - * ipi_send_mask - send an IPI to target Linux SMP CPU(s) + * __ipi_send_mask - send an IPI to target Linux SMP CPU(s) * @desc: pointer to irq_desc of the IRQ * @dest: dest CPU(s), must be a subset of the mask passed to * irq_reserve_ipi() @@ -258,7 +258,7 @@ int __ipi_send_single(struct irq_desc *desc, unsigned int cpu) * This function is for architecture or core code to speed up IPI sending. Not * usable from driver code. * - * Returns zero on success and negative error number on failure. + * Return: %0 on success or negative error number on failure. */ int __ipi_send_mask(struct irq_desc *desc, const struct cpumask *dest) { @@ -298,11 +298,11 @@ int __ipi_send_mask(struct irq_desc *desc, const struct cpumask *dest) /** * ipi_send_single - Send an IPI to a single CPU - * @virq: linux irq number from irq_reserve_ipi() + * @virq: Linux IRQ number from irq_reserve_ipi() * @cpu: destination CPU, must in the destination mask passed to * irq_reserve_ipi() * - * Returns zero on success and negative error number on failure. + * Return: %0 on success or negative error number on failure. */ int ipi_send_single(unsigned int virq, unsigned int cpu) { @@ -319,11 +319,11 @@ EXPORT_SYMBOL_GPL(ipi_send_single); /** * ipi_send_mask - Send an IPI to target CPU(s) - * @virq: linux irq number from irq_reserve_ipi() + * @virq: Linux IRQ number from irq_reserve_ipi() * @dest: dest CPU(s), must be a subset of the mask passed to * irq_reserve_ipi() * - * Returns zero on success and negative error number on failure. + * Return: %0 on success or negative error number on failure. 
*/ int ipi_send_mask(unsigned int virq, const struct cpumask *dest) { diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index fadb93766020..4e3c29bb603c 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -188,7 +188,7 @@ static ssize_t hwirq_show(struct kobject *kobj, raw_spin_lock_irq(&desc->lock); if (desc->irq_data.domain) - ret = sprintf(buf, "%d\n", (int)desc->irq_data.hwirq); + ret = sprintf(buf, "%lu\n", desc->irq_data.hwirq); raw_spin_unlock_irq(&desc->lock); return ret; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 51c483ce2447..62be16135e7c 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1215,6 +1215,7 @@ int irq_domain_disconnect_hierarchy(struct irq_domain *domain, irqd->chip = ERR_PTR(-ENOTCONN); return 0; } +EXPORT_SYMBOL_GPL(irq_domain_disconnect_hierarchy); static int irq_domain_trim_hierarchy(unsigned int virq) { diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index ef30b4762947..27667e82ecc9 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -25,12 +25,11 @@ #include "internals.h" #if defined(CONFIG_IRQ_FORCED_THREADING) && !defined(CONFIG_PREEMPT_RT) -__read_mostly bool force_irqthreads; -EXPORT_SYMBOL_GPL(force_irqthreads); +DEFINE_STATIC_KEY_FALSE(force_irqthreads_key); static int __init setup_forced_irqthreads(char *arg) { - force_irqthreads = true; + static_branch_enable(&force_irqthreads_key); return 0; } early_param("threadirqs", setup_forced_irqthreads); @@ -1260,8 +1259,8 @@ static int irq_thread(void *data) irqreturn_t (*handler_fn)(struct irq_desc *desc, struct irqaction *action); - if (force_irqthreads && test_bit(IRQTF_FORCED_THREAD, - &action->thread_flags)) + if (force_irqthreads() && test_bit(IRQTF_FORCED_THREAD, + &action->thread_flags)) handler_fn = irq_forced_thread_fn; else handler_fn = irq_thread_fn; @@ -1322,7 +1321,7 @@ EXPORT_SYMBOL_GPL(irq_wake_thread); static int irq_setup_forced_threading(struct irqaction *new) { - if (!force_irqthreads) + if 
(!force_irqthreads()) return 0; if (new->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT)) return 0; @@ -2072,9 +2071,9 @@ const void *free_nmi(unsigned int irq, void *dev_id) * request_threaded_irq - allocate an interrupt line * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs. - * Primary handler for threaded interrupts - * If NULL and thread_fn != NULL the default - * primary handler is installed + * Primary handler for threaded interrupts. + * If handler is NULL and thread_fn != NULL + * the default primary handler is installed. * @thread_fn: Function called from the irq handler thread * If NULL, no irq thread is created * @irqflags: Interrupt type flags @@ -2108,7 +2107,7 @@ const void *free_nmi(unsigned int irq, void *dev_id) * * IRQF_SHARED Interrupt is shared * IRQF_TRIGGER_* Specify active edge(s) or level - * + * IRQF_ONESHOT Run thread_fn with interrupt line masked */ int request_threaded_irq(unsigned int irq, irq_handler_t handler, irq_handler_t thread_fn, unsigned long irqflags, diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c index 578596e41cb6..bbfb26489aa1 100644 --- a/kernel/irq/matrix.c +++ b/kernel/irq/matrix.c @@ -280,7 +280,8 @@ void irq_matrix_remove_managed(struct irq_matrix *m, const struct cpumask *msk) /** * irq_matrix_alloc_managed - Allocate a managed interrupt in a CPU map * @m: Matrix pointer - * @cpu: On which CPU the interrupt should be allocated + * @msk: Which CPUs to search in + * @mapped_cpu: Pointer to store the CPU for which the irq was allocated */ int irq_matrix_alloc_managed(struct irq_matrix *m, const struct cpumask *msk, unsigned int *mapped_cpu) diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 85df3ca03efe..6a5ecee6e567 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -14,17 +14,20 @@ #include <linux/irqdomain.h> #include <linux/msi.h> #include <linux/slab.h> +#include <linux/pci.h> #include "internals.h" /** - * alloc_msi_entry - Allocate an initialize 
msi_entry + * alloc_msi_entry - Allocate an initialized msi_desc * @dev: Pointer to the device for which this is allocated * @nvec: The number of vectors used in this entry * @affinity: Optional pointer to an affinity mask array size of @nvec * - * If @affinity is not NULL then an affinity array[@nvec] is allocated + * If @affinity is not %NULL then an affinity array[@nvec] is allocated * and the affinity masks and flags from @affinity are copied. + * + * Return: pointer to allocated &msi_desc on success or %NULL on failure */ struct msi_desc *alloc_msi_entry(struct device *dev, int nvec, const struct irq_affinity_desc *affinity) @@ -69,6 +72,139 @@ void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg) } EXPORT_SYMBOL_GPL(get_cached_msi_msg); +static ssize_t msi_mode_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct msi_desc *entry; + bool is_msix = false; + unsigned long irq; + int retval; + + retval = kstrtoul(attr->attr.name, 10, &irq); + if (retval) + return retval; + + entry = irq_get_msi_desc(irq); + if (!entry) + return -ENODEV; + + if (dev_is_pci(dev)) + is_msix = entry->msi_attrib.is_msix; + + return sysfs_emit(buf, "%s\n", is_msix ? "msix" : "msi"); +} + +/** + * msi_populate_sysfs - Populate msi_irqs sysfs entries for devices + * @dev: The device(PCI, platform etc) who will get sysfs entries + * + * Return attribute_group ** so that specific bus MSI can save it to + * somewhere during initializing msi irqs. 
If the device has no MSI irq, + * return NULL; if it fails to populate sysfs, return ERR_PTR + */ +const struct attribute_group **msi_populate_sysfs(struct device *dev) +{ + const struct attribute_group **msi_irq_groups; + struct attribute **msi_attrs, *msi_attr; + struct device_attribute *msi_dev_attr; + struct attribute_group *msi_irq_group; + struct msi_desc *entry; + int ret = -ENOMEM; + int num_msi = 0; + int count = 0; + int i; + + /* Determine how many msi entries we have */ + for_each_msi_entry(entry, dev) + num_msi += entry->nvec_used; + if (!num_msi) + return NULL; + + /* Dynamically create the MSI attributes for the device */ + msi_attrs = kcalloc(num_msi + 1, sizeof(void *), GFP_KERNEL); + if (!msi_attrs) + return ERR_PTR(-ENOMEM); + + for_each_msi_entry(entry, dev) { + for (i = 0; i < entry->nvec_used; i++) { + msi_dev_attr = kzalloc(sizeof(*msi_dev_attr), GFP_KERNEL); + if (!msi_dev_attr) + goto error_attrs; + msi_attrs[count] = &msi_dev_attr->attr; + + sysfs_attr_init(&msi_dev_attr->attr); + msi_dev_attr->attr.name = kasprintf(GFP_KERNEL, "%d", + entry->irq + i); + if (!msi_dev_attr->attr.name) + goto error_attrs; + msi_dev_attr->attr.mode = 0444; + msi_dev_attr->show = msi_mode_show; + ++count; + } + } + + msi_irq_group = kzalloc(sizeof(*msi_irq_group), GFP_KERNEL); + if (!msi_irq_group) + goto error_attrs; + msi_irq_group->name = "msi_irqs"; + msi_irq_group->attrs = msi_attrs; + + msi_irq_groups = kcalloc(2, sizeof(void *), GFP_KERNEL); + if (!msi_irq_groups) + goto error_irq_group; + msi_irq_groups[0] = msi_irq_group; + + ret = sysfs_create_groups(&dev->kobj, msi_irq_groups); + if (ret) + goto error_irq_groups; + + return msi_irq_groups; + +error_irq_groups: + kfree(msi_irq_groups); +error_irq_group: + kfree(msi_irq_group); +error_attrs: + count = 0; + msi_attr = msi_attrs[count]; + while (msi_attr) { + msi_dev_attr = container_of(msi_attr, struct device_attribute, attr); + kfree(msi_attr->name); + kfree(msi_dev_attr); + ++count; + msi_attr = 
msi_attrs[count]; + } + kfree(msi_attrs); + return ERR_PTR(ret); +} + +/** + * msi_destroy_sysfs - Destroy msi_irqs sysfs entries for devices + * @dev: The device(PCI, platform etc) who will remove sysfs entries + * @msi_irq_groups: attribute_group for device msi_irqs entries + */ +void msi_destroy_sysfs(struct device *dev, const struct attribute_group **msi_irq_groups) +{ + struct device_attribute *dev_attr; + struct attribute **msi_attrs; + int count = 0; + + if (msi_irq_groups) { + sysfs_remove_groups(&dev->kobj, msi_irq_groups); + msi_attrs = msi_irq_groups[0]->attrs; + while (msi_attrs[count]) { + dev_attr = container_of(msi_attrs[count], + struct device_attribute, attr); + kfree(dev_attr->attr.name); + kfree(dev_attr); + ++count; + } + kfree(msi_attrs); + kfree(msi_irq_groups[0]); + kfree(msi_irq_groups); + } +} + #ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN static inline void irq_chip_write_msi_msg(struct irq_data *data, struct msi_msg *msg) @@ -97,6 +233,8 @@ static void msi_check_level(struct irq_domain *domain, struct msi_msg *msg) * * Intended to be used by MSI interrupt controllers which are * implemented with hierarchical domains. + * + * Return: IRQ_SET_MASK_* result code */ int msi_domain_set_affinity(struct irq_data *irq_data, const struct cpumask *mask, bool force) @@ -277,10 +415,12 @@ static void msi_domain_update_chip_ops(struct msi_domain_info *info) } /** - * msi_create_irq_domain - Create a MSI interrupt domain + * msi_create_irq_domain - Create an MSI interrupt domain * @fwnode: Optional fwnode of the interrupt controller * @info: MSI domain info * @parent: Parent irq domain + * + * Return: pointer to the created &struct irq_domain or %NULL on failure */ struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode, struct msi_domain_info *info, @@ -487,7 +627,7 @@ cleanup: * are allocated * @nvec: The number of interrupts to allocate * - * Returns 0 on success or an error code. + * Return: %0 on success or an error code. 
*/ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, int nvec) @@ -524,7 +664,7 @@ void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) } /** - * __msi_domain_free_irqs - Free interrupts from a MSI interrupt @domain associated tp @dev + * msi_domain_free_irqs - Free interrupts from a MSI interrupt @domain associated to @dev * @domain: The domain to managing the interrupts * @dev: Pointer to device struct of the device for which the interrupts * are free @@ -541,8 +681,7 @@ void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) * msi_get_domain_info - Get the MSI interrupt domain info for @domain * @domain: The interrupt domain to retrieve data from * - * Returns the pointer to the msi_domain_info stored in - * @domain->host_data. + * Return: the pointer to the msi_domain_info stored in @domain->host_data. */ struct msi_domain_info *msi_get_domain_info(struct irq_domain *domain) { diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index ce0adb22ee96..ca71123a6130 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -227,7 +227,7 @@ unlock: } /** - * irq_pm_syscore_ops - enable interrupt lines early + * irq_pm_syscore_resume - enable interrupt lines early * * Enable all interrupt lines with %IRQF_EARLY_RESUME set. 
*/ diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 7c5cd42df3b9..ee595ec09778 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -513,7 +513,7 @@ int show_interrupts(struct seq_file *p, void *v) seq_printf(p, " %8s", "None"); } if (desc->irq_data.domain) - seq_printf(p, " %*d", prec, (int) desc->irq_data.hwirq); + seq_printf(p, " %*lu", prec, desc->irq_data.hwirq); else seq_printf(p, " %*s", prec, ""); #ifdef CONFIG_GENERIC_IRQ_SHOW_LEVEL diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c index 4d2a702d7aa9..c43e2ac2f8de 100644 --- a/kernel/irq/timings.c +++ b/kernel/irq/timings.c @@ -799,12 +799,14 @@ static int __init irq_timings_test_irqs(struct timings_intervals *ti) __irq_timings_store(irq, irqs, ti->intervals[i]); if (irqs->circ_timings[i & IRQ_TIMINGS_MASK] != index) { + ret = -EBADSLT; pr_err("Failed to store in the circular buffer\n"); goto out; } } if (irqs->count != ti->count) { + ret = -ERANGE; pr_err("Count differs\n"); goto out; } diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c index e65de172ccf7..1d1d1b0e4248 100644 --- a/kernel/kcsan/debugfs.c +++ b/kernel/kcsan/debugfs.c @@ -64,7 +64,7 @@ static noinline void microbenchmark(unsigned long iters) { const struct kcsan_ctx ctx_save = current->kcsan_ctx; const bool was_enabled = READ_ONCE(kcsan_enabled); - cycles_t cycles; + u64 cycles; /* We may have been called from an atomic region; reset context. 
*/ memset(&current->kcsan_ctx, 0, sizeof(current->kcsan_ctx)); diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 3572808223e4..d51cabf28f38 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -24,7 +24,8 @@ obj-$(CONFIG_SMP) += spinlock.o obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o -obj-$(CONFIG_RT_MUTEXES) += rtmutex.o +obj-$(CONFIG_RT_MUTEXES) += rtmutex_api.o +obj-$(CONFIG_PREEMPT_RT) += spinlock_rt.o ww_rt_mutex.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index b3adb40549bf..7c5a4a087cc7 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -59,7 +59,7 @@ static struct task_struct **writer_tasks; static struct task_struct **reader_tasks; static bool lock_is_write_held; -static bool lock_is_read_held; +static atomic_t lock_is_read_held; static unsigned long last_lock_release; struct lock_stress_stats { @@ -682,7 +682,7 @@ static int lock_torture_writer(void *arg) if (WARN_ON_ONCE(lock_is_write_held)) lwsp->n_lock_fail++; lock_is_write_held = true; - if (WARN_ON_ONCE(lock_is_read_held)) + if (WARN_ON_ONCE(atomic_read(&lock_is_read_held))) lwsp->n_lock_fail++; /* rare, but... */ lwsp->n_lock_acquired++; @@ -717,13 +717,13 @@ static int lock_torture_reader(void *arg) schedule_timeout_uninterruptible(1); cxt.cur_ops->readlock(tid); - lock_is_read_held = true; + atomic_inc(&lock_is_read_held); if (WARN_ON_ONCE(lock_is_write_held)) lrsp->n_lock_fail++; /* rare, but... 
*/ lrsp->n_lock_acquired++; cxt.cur_ops->read_delay(&rand); - lock_is_read_held = false; + atomic_dec(&lock_is_read_held); cxt.cur_ops->readunlock(tid); stutter_wait("lock_torture_reader"); @@ -738,20 +738,22 @@ static int lock_torture_reader(void *arg) static void __torture_print_stats(char *page, struct lock_stress_stats *statp, bool write) { + long cur; bool fail = false; int i, n_stress; - long max = 0, min = statp ? statp[0].n_lock_acquired : 0; + long max = 0, min = statp ? data_race(statp[0].n_lock_acquired) : 0; long long sum = 0; n_stress = write ? cxt.nrealwriters_stress : cxt.nrealreaders_stress; for (i = 0; i < n_stress; i++) { - if (statp[i].n_lock_fail) + if (data_race(statp[i].n_lock_fail)) fail = true; - sum += statp[i].n_lock_acquired; - if (max < statp[i].n_lock_acquired) - max = statp[i].n_lock_acquired; - if (min > statp[i].n_lock_acquired) - min = statp[i].n_lock_acquired; + cur = data_race(statp[i].n_lock_acquired); + sum += cur; + if (max < cur) + max = cur; + if (min > cur) + min = cur; } page += sprintf(page, "%s: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n", @@ -996,7 +998,6 @@ static int __init lock_torture_init(void) } if (nreaders_stress) { - lock_is_read_held = false; cxt.lrsa = kmalloc_array(cxt.nrealreaders_stress, sizeof(*cxt.lrsa), GFP_KERNEL); diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index db9301591e3f..bc8abb8549d2 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -1,6 +1,4 @@ /* - * kernel/mutex-debug.c - * * Debugging code for mutexes * * Started by Ingo Molnar: @@ -22,7 +20,7 @@ #include <linux/interrupt.h> #include <linux/debug_locks.h> -#include "mutex-debug.h" +#include "mutex.h" /* * Must be called with lock->wait_lock held. 
@@ -32,6 +30,7 @@ void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter) memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter)); waiter->magic = waiter; INIT_LIST_HEAD(&waiter->list); + waiter->ww_ctx = MUTEX_POISON_WW_CTX; } void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter) diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h deleted file mode 100644 index 53e631e1d76d..000000000000 --- a/kernel/locking/mutex-debug.h +++ /dev/null @@ -1,29 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Mutexes: blocking mutual exclusion locks - * - * started by Ingo Molnar: - * - * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> - * - * This file contains mutex debugging related internal declarations, - * prototypes and inline functions, for the CONFIG_DEBUG_MUTEXES case. - * More details are in kernel/mutex-debug.c. - */ - -/* - * This must be called with lock->wait_lock held. - */ -extern void debug_mutex_lock_common(struct mutex *lock, - struct mutex_waiter *waiter); -extern void debug_mutex_wake_waiter(struct mutex *lock, - struct mutex_waiter *waiter); -extern void debug_mutex_free_waiter(struct mutex_waiter *waiter); -extern void debug_mutex_add_waiter(struct mutex *lock, - struct mutex_waiter *waiter, - struct task_struct *task); -extern void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct task_struct *task); -extern void debug_mutex_unlock(struct mutex *lock); -extern void debug_mutex_init(struct mutex *lock, const char *name, - struct lock_class_key *key); diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index d2df5e68b503..d456579d0952 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -30,17 +30,20 @@ #include <linux/debug_locks.h> #include <linux/osq_lock.h> +#ifndef CONFIG_PREEMPT_RT +#include "mutex.h" + #ifdef CONFIG_DEBUG_MUTEXES -# include "mutex-debug.h" +# define MUTEX_WARN_ON(cond) 
DEBUG_LOCKS_WARN_ON(cond) #else -# include "mutex.h" +# define MUTEX_WARN_ON(cond) #endif void __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) { atomic_long_set(&lock->owner, 0); - spin_lock_init(&lock->wait_lock); + raw_spin_lock_init(&lock->wait_lock); INIT_LIST_HEAD(&lock->wait_list); #ifdef CONFIG_MUTEX_SPIN_ON_OWNER osq_lock_init(&lock->osq); @@ -91,55 +94,56 @@ static inline unsigned long __owner_flags(unsigned long owner) return owner & MUTEX_FLAGS; } -/* - * Trylock variant that returns the owning task on failure. - */ -static inline struct task_struct *__mutex_trylock_or_owner(struct mutex *lock) +static inline struct task_struct *__mutex_trylock_common(struct mutex *lock, bool handoff) { unsigned long owner, curr = (unsigned long)current; owner = atomic_long_read(&lock->owner); for (;;) { /* must loop, can race against a flag */ - unsigned long old, flags = __owner_flags(owner); + unsigned long flags = __owner_flags(owner); unsigned long task = owner & ~MUTEX_FLAGS; if (task) { - if (likely(task != curr)) - break; - - if (likely(!(flags & MUTEX_FLAG_PICKUP))) + if (flags & MUTEX_FLAG_PICKUP) { + if (task != curr) + break; + flags &= ~MUTEX_FLAG_PICKUP; + } else if (handoff) { + if (flags & MUTEX_FLAG_HANDOFF) + break; + flags |= MUTEX_FLAG_HANDOFF; + } else { break; - - flags &= ~MUTEX_FLAG_PICKUP; + } } else { -#ifdef CONFIG_DEBUG_MUTEXES - DEBUG_LOCKS_WARN_ON(flags & MUTEX_FLAG_PICKUP); -#endif + MUTEX_WARN_ON(flags & (MUTEX_FLAG_HANDOFF | MUTEX_FLAG_PICKUP)); + task = curr; } - /* - * We set the HANDOFF bit, we must make sure it doesn't live - * past the point where we acquire it. This would be possible - * if we (accidentally) set the bit on an unlocked mutex. 
- */ - flags &= ~MUTEX_FLAG_HANDOFF; - - old = atomic_long_cmpxchg_acquire(&lock->owner, owner, curr | flags); - if (old == owner) - return NULL; - - owner = old; + if (atomic_long_try_cmpxchg_acquire(&lock->owner, &owner, task | flags)) { + if (task == curr) + return NULL; + break; + } } return __owner_task(owner); } /* + * Trylock or set HANDOFF + */ +static inline bool __mutex_trylock_or_handoff(struct mutex *lock, bool handoff) +{ + return !__mutex_trylock_common(lock, handoff); +} + +/* * Actual trylock that will work on any unlocked state. */ static inline bool __mutex_trylock(struct mutex *lock) { - return !__mutex_trylock_or_owner(lock); + return !__mutex_trylock_common(lock, false); } #ifndef CONFIG_DEBUG_LOCK_ALLOC @@ -168,10 +172,7 @@ static __always_inline bool __mutex_unlock_fast(struct mutex *lock) { unsigned long curr = (unsigned long)current; - if (atomic_long_cmpxchg_release(&lock->owner, curr, 0UL) == curr) - return true; - - return false; + return atomic_long_try_cmpxchg_release(&lock->owner, &curr, 0UL); } #endif @@ -226,23 +227,18 @@ static void __mutex_handoff(struct mutex *lock, struct task_struct *task) unsigned long owner = atomic_long_read(&lock->owner); for (;;) { - unsigned long old, new; + unsigned long new; -#ifdef CONFIG_DEBUG_MUTEXES - DEBUG_LOCKS_WARN_ON(__owner_task(owner) != current); - DEBUG_LOCKS_WARN_ON(owner & MUTEX_FLAG_PICKUP); -#endif + MUTEX_WARN_ON(__owner_task(owner) != current); + MUTEX_WARN_ON(owner & MUTEX_FLAG_PICKUP); new = (owner & MUTEX_FLAG_WAITERS); new |= (unsigned long)task; if (task) new |= MUTEX_FLAG_PICKUP; - old = atomic_long_cmpxchg_release(&lock->owner, owner, new); - if (old == owner) + if (atomic_long_try_cmpxchg_release(&lock->owner, &owner, new)) break; - - owner = old; } } @@ -286,218 +282,18 @@ void __sched mutex_lock(struct mutex *lock) EXPORT_SYMBOL(mutex_lock); #endif -/* - * Wait-Die: - * The newer transactions are killed when: - * It (the new transaction) makes a request for a lock being held 
- * by an older transaction. - * - * Wound-Wait: - * The newer transactions are wounded when: - * An older transaction makes a request for a lock being held by - * the newer transaction. - */ - -/* - * Associate the ww_mutex @ww with the context @ww_ctx under which we acquired - * it. - */ -static __always_inline void -ww_mutex_lock_acquired(struct ww_mutex *ww, struct ww_acquire_ctx *ww_ctx) -{ -#ifdef CONFIG_DEBUG_MUTEXES - /* - * If this WARN_ON triggers, you used ww_mutex_lock to acquire, - * but released with a normal mutex_unlock in this call. - * - * This should never happen, always use ww_mutex_unlock. - */ - DEBUG_LOCKS_WARN_ON(ww->ctx); - - /* - * Not quite done after calling ww_acquire_done() ? - */ - DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire); +#include "ww_mutex.h" - if (ww_ctx->contending_lock) { - /* - * After -EDEADLK you tried to - * acquire a different ww_mutex? Bad! - */ - DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww); - - /* - * You called ww_mutex_lock after receiving -EDEADLK, - * but 'forgot' to unlock everything else first? - */ - DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0); - ww_ctx->contending_lock = NULL; - } - - /* - * Naughty, using a different class will lead to undefined behavior! - */ - DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class); -#endif - ww_ctx->acquired++; - ww->ctx = ww_ctx; -} - -/* - * Determine if context @a is 'after' context @b. IOW, @a is a younger - * transaction than @b and depending on algorithm either needs to wait for - * @b or die. - */ -static inline bool __sched -__ww_ctx_stamp_after(struct ww_acquire_ctx *a, struct ww_acquire_ctx *b) -{ - - return (signed long)(a->stamp - b->stamp) > 0; -} - -/* - * Wait-Die; wake a younger waiter context (when locks held) such that it can - * die. - * - * Among waiters with context, only the first one can have other locks acquired - * already (ctx->acquired > 0), because __ww_mutex_add_waiter() and - * __ww_mutex_check_kill() wake any but the earliest context. 
- */ -static bool __sched -__ww_mutex_die(struct mutex *lock, struct mutex_waiter *waiter, - struct ww_acquire_ctx *ww_ctx) -{ - if (!ww_ctx->is_wait_die) - return false; - - if (waiter->ww_ctx->acquired > 0 && - __ww_ctx_stamp_after(waiter->ww_ctx, ww_ctx)) { - debug_mutex_wake_waiter(lock, waiter); - wake_up_process(waiter->task); - } - - return true; -} - -/* - * Wound-Wait; wound a younger @hold_ctx if it holds the lock. - * - * Wound the lock holder if there are waiters with older transactions than - * the lock holders. Even if multiple waiters may wound the lock holder, - * it's sufficient that only one does. - */ -static bool __ww_mutex_wound(struct mutex *lock, - struct ww_acquire_ctx *ww_ctx, - struct ww_acquire_ctx *hold_ctx) -{ - struct task_struct *owner = __mutex_owner(lock); - - lockdep_assert_held(&lock->wait_lock); - - /* - * Possible through __ww_mutex_add_waiter() when we race with - * ww_mutex_set_context_fastpath(). In that case we'll get here again - * through __ww_mutex_check_waiters(). - */ - if (!hold_ctx) - return false; - - /* - * Can have !owner because of __mutex_unlock_slowpath(), but if owner, - * it cannot go away because we'll have FLAG_WAITERS set and hold - * wait_lock. - */ - if (!owner) - return false; - - if (ww_ctx->acquired > 0 && __ww_ctx_stamp_after(hold_ctx, ww_ctx)) { - hold_ctx->wounded = 1; - - /* - * wake_up_process() paired with set_current_state() - * inserts sufficient barriers to make sure @owner either sees - * it's wounded in __ww_mutex_check_kill() or has a - * wakeup pending to re-read the wounded state. - */ - if (owner != current) - wake_up_process(owner); - - return true; - } - - return false; -} - -/* - * We just acquired @lock under @ww_ctx, if there are later contexts waiting - * behind us on the wait-list, check if they need to die, or wound us. - * - * See __ww_mutex_add_waiter() for the list-order construction; basically the - * list is ordered by stamp, smallest (oldest) first. 
- * - * This relies on never mixing wait-die/wound-wait on the same wait-list; - * which is currently ensured by that being a ww_class property. - * - * The current task must not be on the wait list. - */ -static void __sched -__ww_mutex_check_waiters(struct mutex *lock, struct ww_acquire_ctx *ww_ctx) -{ - struct mutex_waiter *cur; - - lockdep_assert_held(&lock->wait_lock); - - list_for_each_entry(cur, &lock->wait_list, list) { - if (!cur->ww_ctx) - continue; - - if (__ww_mutex_die(lock, cur, ww_ctx) || - __ww_mutex_wound(lock, cur->ww_ctx, ww_ctx)) - break; - } -} +#ifdef CONFIG_MUTEX_SPIN_ON_OWNER /* - * After acquiring lock with fastpath, where we do not hold wait_lock, set ctx - * and wake up any waiters so they can recheck. + * Trylock variant that returns the owning task on failure. */ -static __always_inline void -ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +static inline struct task_struct *__mutex_trylock_or_owner(struct mutex *lock) { - ww_mutex_lock_acquired(lock, ctx); - - /* - * The lock->ctx update should be visible on all cores before - * the WAITERS check is done, otherwise contended waiters might be - * missed. The contended waiters will either see ww_ctx == NULL - * and keep spinning, or it will acquire wait_lock, add itself - * to waiter list and sleep. - */ - smp_mb(); /* See comments above and below. */ - - /* - * [W] ww->ctx = ctx [W] MUTEX_FLAG_WAITERS - * MB MB - * [R] MUTEX_FLAG_WAITERS [R] ww->ctx - * - * The memory barrier above pairs with the memory barrier in - * __ww_mutex_add_waiter() and makes sure we either observe ww->ctx - * and/or !empty list. - */ - if (likely(!(atomic_long_read(&lock->base.owner) & MUTEX_FLAG_WAITERS))) - return; - - /* - * Uh oh, we raced in fastpath, check if any of the waiters need to - * die or wound us. 
- */ - spin_lock(&lock->base.wait_lock); - __ww_mutex_check_waiters(&lock->base, ctx); - spin_unlock(&lock->base.wait_lock); + return __mutex_trylock_common(lock, false); } -#ifdef CONFIG_MUTEX_SPIN_ON_OWNER - static inline bool ww_mutex_spin_on_owner(struct mutex *lock, struct ww_acquire_ctx *ww_ctx, struct mutex_waiter *waiter) @@ -754,171 +550,11 @@ EXPORT_SYMBOL(mutex_unlock); */ void __sched ww_mutex_unlock(struct ww_mutex *lock) { - /* - * The unlocking fastpath is the 0->1 transition from 'locked' - * into 'unlocked' state: - */ - if (lock->ctx) { -#ifdef CONFIG_DEBUG_MUTEXES - DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired); -#endif - if (lock->ctx->acquired > 0) - lock->ctx->acquired--; - lock->ctx = NULL; - } - + __ww_mutex_unlock(lock); mutex_unlock(&lock->base); } EXPORT_SYMBOL(ww_mutex_unlock); - -static __always_inline int __sched -__ww_mutex_kill(struct mutex *lock, struct ww_acquire_ctx *ww_ctx) -{ - if (ww_ctx->acquired > 0) { -#ifdef CONFIG_DEBUG_MUTEXES - struct ww_mutex *ww; - - ww = container_of(lock, struct ww_mutex, base); - DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock); - ww_ctx->contending_lock = ww; -#endif - return -EDEADLK; - } - - return 0; -} - - -/* - * Check the wound condition for the current lock acquire. - * - * Wound-Wait: If we're wounded, kill ourself. - * - * Wait-Die: If we're trying to acquire a lock already held by an older - * context, kill ourselves. - * - * Since __ww_mutex_add_waiter() orders the wait-list on stamp, we only have to - * look at waiters before us in the wait-list. 
- */ -static inline int __sched -__ww_mutex_check_kill(struct mutex *lock, struct mutex_waiter *waiter, - struct ww_acquire_ctx *ctx) -{ - struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); - struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx); - struct mutex_waiter *cur; - - if (ctx->acquired == 0) - return 0; - - if (!ctx->is_wait_die) { - if (ctx->wounded) - return __ww_mutex_kill(lock, ctx); - - return 0; - } - - if (hold_ctx && __ww_ctx_stamp_after(ctx, hold_ctx)) - return __ww_mutex_kill(lock, ctx); - - /* - * If there is a waiter in front of us that has a context, then its - * stamp is earlier than ours and we must kill ourself. - */ - cur = waiter; - list_for_each_entry_continue_reverse(cur, &lock->wait_list, list) { - if (!cur->ww_ctx) - continue; - - return __ww_mutex_kill(lock, ctx); - } - - return 0; -} - -/* - * Add @waiter to the wait-list, keep the wait-list ordered by stamp, smallest - * first. Such that older contexts are preferred to acquire the lock over - * younger contexts. - * - * Waiters without context are interspersed in FIFO order. - * - * Furthermore, for Wait-Die kill ourself immediately when possible (there are - * older contexts already waiting) to avoid unnecessary waiting and for - * Wound-Wait ensure we wound the owning context when it is younger. - */ -static inline int __sched -__ww_mutex_add_waiter(struct mutex_waiter *waiter, - struct mutex *lock, - struct ww_acquire_ctx *ww_ctx) -{ - struct mutex_waiter *cur; - struct list_head *pos; - bool is_wait_die; - - if (!ww_ctx) { - __mutex_add_waiter(lock, waiter, &lock->wait_list); - return 0; - } - - is_wait_die = ww_ctx->is_wait_die; - - /* - * Add the waiter before the first waiter with a higher stamp. - * Waiters without a context are skipped to avoid starving - * them. Wait-Die waiters may die here. Wound-Wait waiters - * never die here, but they are sorted in stamp order and - * may wound the lock holder. 
- */ - pos = &lock->wait_list; - list_for_each_entry_reverse(cur, &lock->wait_list, list) { - if (!cur->ww_ctx) - continue; - - if (__ww_ctx_stamp_after(ww_ctx, cur->ww_ctx)) { - /* - * Wait-Die: if we find an older context waiting, there - * is no point in queueing behind it, as we'd have to - * die the moment it would acquire the lock. - */ - if (is_wait_die) { - int ret = __ww_mutex_kill(lock, ww_ctx); - - if (ret) - return ret; - } - - break; - } - - pos = &cur->list; - - /* Wait-Die: ensure younger waiters die. */ - __ww_mutex_die(lock, cur, ww_ctx); - } - - __mutex_add_waiter(lock, waiter, pos); - - /* - * Wound-Wait: if we're blocking on a mutex owned by a younger context, - * wound that such that we might proceed. - */ - if (!is_wait_die) { - struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); - - /* - * See ww_mutex_set_context_fastpath(). Orders setting - * MUTEX_FLAG_WAITERS vs the ww->ctx load, - * such that either we or the fastpath will wound @ww->ctx. - */ - smp_mb(); - __ww_mutex_wound(lock, ww_ctx, ww->ctx); - } - - return 0; -} - /* * Lock a mutex (possibly interruptible), slowpath: */ @@ -928,7 +564,6 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx) { struct mutex_waiter waiter; - bool first = false; struct ww_mutex *ww; int ret; @@ -937,9 +572,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas might_sleep(); -#ifdef CONFIG_DEBUG_MUTEXES - DEBUG_LOCKS_WARN_ON(lock->magic != lock); -#endif + MUTEX_WARN_ON(lock->magic != lock); ww = container_of(lock, struct ww_mutex, base); if (ww_ctx) { @@ -953,6 +586,10 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas */ if (ww_ctx->acquired == 0) ww_ctx->wounded = 0; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + nest_lock = &ww_ctx->dep_map; +#endif } preempt_disable(); @@ -968,7 +605,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, 
unsigned int subclas return 0; } - spin_lock(&lock->wait_lock); + raw_spin_lock(&lock->wait_lock); /* * After waiting to acquire the wait_lock, try again. */ @@ -980,17 +617,15 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas } debug_mutex_lock_common(lock, &waiter); + waiter.task = current; + if (use_ww_ctx) + waiter.ww_ctx = ww_ctx; lock_contended(&lock->dep_map, ip); if (!use_ww_ctx) { /* add waiting tasks to the end of the waitqueue (FIFO): */ __mutex_add_waiter(lock, &waiter, &lock->wait_list); - - -#ifdef CONFIG_DEBUG_MUTEXES - waiter.ww_ctx = MUTEX_POISON_WW_CTX; -#endif } else { /* * Add in stamp order, waking up waiters that must kill @@ -999,14 +634,12 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas ret = __ww_mutex_add_waiter(&waiter, lock, ww_ctx); if (ret) goto err_early_kill; - - waiter.ww_ctx = ww_ctx; } - waiter.task = current; - set_current_state(state); for (;;) { + bool first; + /* * Once we hold wait_lock, we're serialized against * mutex_unlock() handing the lock off to us, do a trylock @@ -1032,18 +665,10 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas goto err; } - spin_unlock(&lock->wait_lock); + raw_spin_unlock(&lock->wait_lock); schedule_preempt_disabled(); - /* - * ww_mutex needs to always recheck its position since its waiter - * list is not FIFO ordered. - */ - if (ww_ctx || !first) { - first = __mutex_waiter_is_first(lock, &waiter); - if (first) - __mutex_set_flag(lock, MUTEX_FLAG_HANDOFF); - } + first = __mutex_waiter_is_first(lock, &waiter); set_current_state(state); /* @@ -1051,13 +676,13 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas * state back to RUNNING and fall through the next schedule(), * or we must see its unlock and acquire. 
*/ - if (__mutex_trylock(lock) || + if (__mutex_trylock_or_handoff(lock, first) || (first && mutex_optimistic_spin(lock, ww_ctx, &waiter))) break; - spin_lock(&lock->wait_lock); + raw_spin_lock(&lock->wait_lock); } - spin_lock(&lock->wait_lock); + raw_spin_lock(&lock->wait_lock); acquired: __set_current_state(TASK_RUNNING); @@ -1082,7 +707,7 @@ skip_wait: if (ww_ctx) ww_mutex_lock_acquired(ww, ww_ctx); - spin_unlock(&lock->wait_lock); + raw_spin_unlock(&lock->wait_lock); preempt_enable(); return 0; @@ -1090,7 +715,7 @@ err: __set_current_state(TASK_RUNNING); __mutex_remove_waiter(lock, &waiter); err_early_kill: - spin_unlock(&lock->wait_lock); + raw_spin_unlock(&lock->wait_lock); debug_mutex_free_waiter(&waiter); mutex_release(&lock->dep_map, ip); preempt_enable(); @@ -1106,10 +731,9 @@ __mutex_lock(struct mutex *lock, unsigned int state, unsigned int subclass, static int __sched __ww_mutex_lock(struct mutex *lock, unsigned int state, unsigned int subclass, - struct lockdep_map *nest_lock, unsigned long ip, - struct ww_acquire_ctx *ww_ctx) + unsigned long ip, struct ww_acquire_ctx *ww_ctx) { - return __mutex_lock_common(lock, state, subclass, nest_lock, ip, ww_ctx, true); + return __mutex_lock_common(lock, state, subclass, NULL, ip, ww_ctx, true); } #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -1189,8 +813,7 @@ ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) might_sleep(); ret = __ww_mutex_lock(&lock->base, TASK_UNINTERRUPTIBLE, - 0, ctx ? &ctx->dep_map : NULL, _RET_IP_, - ctx); + 0, _RET_IP_, ctx); if (!ret && ctx && ctx->acquired > 1) return ww_mutex_deadlock_injection(lock, ctx); @@ -1205,8 +828,7 @@ ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) might_sleep(); ret = __ww_mutex_lock(&lock->base, TASK_INTERRUPTIBLE, - 0, ctx ? 
&ctx->dep_map : NULL, _RET_IP_, - ctx); + 0, _RET_IP_, ctx); if (!ret && ctx && ctx->acquired > 1) return ww_mutex_deadlock_injection(lock, ctx); @@ -1237,29 +859,21 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne */ owner = atomic_long_read(&lock->owner); for (;;) { - unsigned long old; - -#ifdef CONFIG_DEBUG_MUTEXES - DEBUG_LOCKS_WARN_ON(__owner_task(owner) != current); - DEBUG_LOCKS_WARN_ON(owner & MUTEX_FLAG_PICKUP); -#endif + MUTEX_WARN_ON(__owner_task(owner) != current); + MUTEX_WARN_ON(owner & MUTEX_FLAG_PICKUP); if (owner & MUTEX_FLAG_HANDOFF) break; - old = atomic_long_cmpxchg_release(&lock->owner, owner, - __owner_flags(owner)); - if (old == owner) { + if (atomic_long_try_cmpxchg_release(&lock->owner, &owner, __owner_flags(owner))) { if (owner & MUTEX_FLAG_WAITERS) break; return; } - - owner = old; } - spin_lock(&lock->wait_lock); + raw_spin_lock(&lock->wait_lock); debug_mutex_unlock(lock); if (!list_empty(&lock->wait_list)) { /* get the first entry from the wait-list: */ @@ -1276,7 +890,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne if (owner & MUTEX_FLAG_HANDOFF) __mutex_handoff(lock, next); - spin_unlock(&lock->wait_lock); + raw_spin_unlock(&lock->wait_lock); wake_up_q(&wake_q); } @@ -1380,7 +994,7 @@ __mutex_lock_interruptible_slowpath(struct mutex *lock) static noinline int __sched __ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { - return __ww_mutex_lock(&lock->base, TASK_UNINTERRUPTIBLE, 0, NULL, + return __ww_mutex_lock(&lock->base, TASK_UNINTERRUPTIBLE, 0, _RET_IP_, ctx); } @@ -1388,7 +1002,7 @@ static noinline int __sched __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { - return __ww_mutex_lock(&lock->base, TASK_INTERRUPTIBLE, 0, NULL, + return __ww_mutex_lock(&lock->base, TASK_INTERRUPTIBLE, 0, _RET_IP_, ctx); } @@ -1412,9 +1026,7 @@ int __sched mutex_trylock(struct mutex *lock) { bool locked; -#ifdef 
CONFIG_DEBUG_MUTEXES - DEBUG_LOCKS_WARN_ON(lock->magic != lock); -#endif + MUTEX_WARN_ON(lock->magic != lock); locked = __mutex_trylock(lock); if (locked) @@ -1455,7 +1067,8 @@ ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) } EXPORT_SYMBOL(ww_mutex_lock_interruptible); -#endif +#endif /* !CONFIG_DEBUG_LOCK_ALLOC */ +#endif /* !CONFIG_PREEMPT_RT */ /** * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0 diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index f0c710b1d192..0b2a79c4013b 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h @@ -5,19 +5,41 @@ * started by Ingo Molnar: * * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> - * - * This file contains mutex debugging related internal prototypes, for the - * !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs: */ -#define debug_mutex_wake_waiter(lock, waiter) do { } while (0) -#define debug_mutex_free_waiter(waiter) do { } while (0) -#define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0) -#define debug_mutex_remove_waiter(lock, waiter, ti) do { } while (0) -#define debug_mutex_unlock(lock) do { } while (0) -#define debug_mutex_init(lock, name, key) do { } while (0) +/* + * This is the control structure for tasks blocked on mutex, which resides + * on the blocked task's kernel stack: + */ +struct mutex_waiter { + struct list_head list; + struct task_struct *task; + struct ww_acquire_ctx *ww_ctx; +#ifdef CONFIG_DEBUG_MUTEXES + void *magic; +#endif +}; -static inline void -debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter) -{ -} +#ifdef CONFIG_DEBUG_MUTEXES +extern void debug_mutex_lock_common(struct mutex *lock, + struct mutex_waiter *waiter); +extern void debug_mutex_wake_waiter(struct mutex *lock, + struct mutex_waiter *waiter); +extern void debug_mutex_free_waiter(struct mutex_waiter *waiter); +extern void debug_mutex_add_waiter(struct mutex *lock, + struct mutex_waiter *waiter, + 
struct task_struct *task); +extern void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, + struct task_struct *task); +extern void debug_mutex_unlock(struct mutex *lock); +extern void debug_mutex_init(struct mutex *lock, const char *name, + struct lock_class_key *key); +#else /* CONFIG_DEBUG_MUTEXES */ +# define debug_mutex_lock_common(lock, waiter) do { } while (0) +# define debug_mutex_wake_waiter(lock, waiter) do { } while (0) +# define debug_mutex_free_waiter(waiter) do { } while (0) +# define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0) +# define debug_mutex_remove_waiter(lock, waiter, ti) do { } while (0) +# define debug_mutex_unlock(lock) do { } while (0) +# define debug_mutex_init(lock, name, key) do { } while (0) +#endif /* !CONFIG_DEBUG_MUTEXES */ diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index ad0db322ed3b..8eabdc79602b 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -8,20 +8,58 @@ * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt * Copyright (C) 2006 Esben Nielsen + * Adaptive Spinlocks: + * Copyright (C) 2008 Novell, Inc., Gregory Haskins, Sven Dietrich, + * and Peter Morreale, + * Adaptive Spinlocks simplification: + * Copyright (C) 2008 Red Hat, Inc., Steven Rostedt <srostedt@redhat.com> * * See Documentation/locking/rt-mutex-design.rst for details. 
*/ -#include <linux/spinlock.h> -#include <linux/export.h> +#include <linux/sched.h> +#include <linux/sched/debug.h> +#include <linux/sched/deadline.h> #include <linux/sched/signal.h> #include <linux/sched/rt.h> -#include <linux/sched/deadline.h> #include <linux/sched/wake_q.h> -#include <linux/sched/debug.h> -#include <linux/timer.h> +#include <linux/ww_mutex.h> #include "rtmutex_common.h" +#ifndef WW_RT +# define build_ww_mutex() (false) +# define ww_container_of(rtm) NULL + +static inline int __ww_mutex_add_waiter(struct rt_mutex_waiter *waiter, + struct rt_mutex *lock, + struct ww_acquire_ctx *ww_ctx) +{ + return 0; +} + +static inline void __ww_mutex_check_waiters(struct rt_mutex *lock, + struct ww_acquire_ctx *ww_ctx) +{ +} + +static inline void ww_mutex_lock_acquired(struct ww_mutex *lock, + struct ww_acquire_ctx *ww_ctx) +{ +} + +static inline int __ww_mutex_check_kill(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter, + struct ww_acquire_ctx *ww_ctx) +{ + return 0; +} + +#else +# define build_ww_mutex() (true) +# define ww_container_of(rtm) container_of(rtm, struct ww_mutex, base) +# include "ww_mutex.h" +#endif + /* * lock->owner state tracking: * @@ -50,7 +88,7 @@ */ static __always_inline void -rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner) +rt_mutex_set_owner(struct rt_mutex_base *lock, struct task_struct *owner) { unsigned long val = (unsigned long)owner; @@ -60,13 +98,13 @@ rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner) WRITE_ONCE(lock->owner, (struct task_struct *)val); } -static __always_inline void clear_rt_mutex_waiters(struct rt_mutex *lock) +static __always_inline void clear_rt_mutex_waiters(struct rt_mutex_base *lock) { lock->owner = (struct task_struct *) ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); } -static __always_inline void fixup_rt_mutex_waiters(struct rt_mutex *lock) +static __always_inline void fixup_rt_mutex_waiters(struct rt_mutex_base *lock) { unsigned long owner, *p = 
(unsigned long *) &lock->owner; @@ -141,15 +179,26 @@ static __always_inline void fixup_rt_mutex_waiters(struct rt_mutex *lock) * set up. */ #ifndef CONFIG_DEBUG_RT_MUTEXES -# define rt_mutex_cmpxchg_acquire(l,c,n) (cmpxchg_acquire(&l->owner, c, n) == c) -# define rt_mutex_cmpxchg_release(l,c,n) (cmpxchg_release(&l->owner, c, n) == c) +static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex_base *lock, + struct task_struct *old, + struct task_struct *new) +{ + return try_cmpxchg_acquire(&lock->owner, &old, new); +} + +static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock, + struct task_struct *old, + struct task_struct *new) +{ + return try_cmpxchg_release(&lock->owner, &old, new); +} /* * Callers must hold the ->wait_lock -- which is the whole purpose as we force * all future threads that attempt to [Rmw] the lock to the slowpath. As such * relaxed semantics suffice. */ -static __always_inline void mark_rt_mutex_waiters(struct rt_mutex *lock) +static __always_inline void mark_rt_mutex_waiters(struct rt_mutex_base *lock) { unsigned long owner, *p = (unsigned long *) &lock->owner; @@ -165,7 +214,7 @@ static __always_inline void mark_rt_mutex_waiters(struct rt_mutex *lock) * 2) Drop lock->wait_lock * 3) Try to unlock the lock with cmpxchg */ -static __always_inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, +static __always_inline bool unlock_rt_mutex_safe(struct rt_mutex_base *lock, unsigned long flags) __releases(lock->wait_lock) { @@ -201,10 +250,22 @@ static __always_inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, } #else -# define rt_mutex_cmpxchg_acquire(l,c,n) (0) -# define rt_mutex_cmpxchg_release(l,c,n) (0) +static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex_base *lock, + struct task_struct *old, + struct task_struct *new) +{ + return false; + +} -static __always_inline void mark_rt_mutex_waiters(struct rt_mutex *lock) +static __always_inline bool rt_mutex_cmpxchg_release(struct 
rt_mutex_base *lock, + struct task_struct *old, + struct task_struct *new) +{ + return false; +} + +static __always_inline void mark_rt_mutex_waiters(struct rt_mutex_base *lock) { lock->owner = (struct task_struct *) ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS); @@ -213,7 +274,7 @@ static __always_inline void mark_rt_mutex_waiters(struct rt_mutex *lock) /* * Simple slow path only version: lock->owner is protected by lock->wait_lock. */ -static __always_inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, +static __always_inline bool unlock_rt_mutex_safe(struct rt_mutex_base *lock, unsigned long flags) __releases(lock->wait_lock) { @@ -223,11 +284,28 @@ static __always_inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, } #endif +static __always_inline int __waiter_prio(struct task_struct *task) +{ + int prio = task->prio; + + if (!rt_prio(prio)) + return DEFAULT_PRIO; + + return prio; +} + +static __always_inline void +waiter_update_prio(struct rt_mutex_waiter *waiter, struct task_struct *task) +{ + waiter->prio = __waiter_prio(task); + waiter->deadline = task->dl.deadline; +} + /* * Only use with rt_mutex_waiter_{less,equal}() */ #define task_to_waiter(p) \ - &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline } + &(struct rt_mutex_waiter){ .prio = __waiter_prio(p), .deadline = (p)->dl.deadline } static __always_inline int rt_mutex_waiter_less(struct rt_mutex_waiter *left, struct rt_mutex_waiter *right) @@ -265,22 +343,63 @@ static __always_inline int rt_mutex_waiter_equal(struct rt_mutex_waiter *left, return 1; } +static inline bool rt_mutex_steal(struct rt_mutex_waiter *waiter, + struct rt_mutex_waiter *top_waiter) +{ + if (rt_mutex_waiter_less(waiter, top_waiter)) + return true; + +#ifdef RT_MUTEX_BUILD_SPINLOCKS + /* + * Note that RT tasks are excluded from same priority (lateral) + * steals to prevent the introduction of an unbounded latency. 
+ */ + if (rt_prio(waiter->prio) || dl_prio(waiter->prio)) + return false; + + return rt_mutex_waiter_equal(waiter, top_waiter); +#else + return false; +#endif +} + #define __node_2_waiter(node) \ rb_entry((node), struct rt_mutex_waiter, tree_entry) static __always_inline bool __waiter_less(struct rb_node *a, const struct rb_node *b) { - return rt_mutex_waiter_less(__node_2_waiter(a), __node_2_waiter(b)); + struct rt_mutex_waiter *aw = __node_2_waiter(a); + struct rt_mutex_waiter *bw = __node_2_waiter(b); + + if (rt_mutex_waiter_less(aw, bw)) + return 1; + + if (!build_ww_mutex()) + return 0; + + if (rt_mutex_waiter_less(bw, aw)) + return 0; + + /* NOTE: relies on waiter->ww_ctx being set before insertion */ + if (aw->ww_ctx) { + if (!bw->ww_ctx) + return 1; + + return (signed long)(aw->ww_ctx->stamp - + bw->ww_ctx->stamp) < 0; + } + + return 0; } static __always_inline void -rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) +rt_mutex_enqueue(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter) { rb_add_cached(&waiter->tree_entry, &lock->waiters, __waiter_less); } static __always_inline void -rt_mutex_dequeue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) +rt_mutex_dequeue(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter) { if (RB_EMPTY_NODE(&waiter->tree_entry)) return; @@ -326,6 +445,35 @@ static __always_inline void rt_mutex_adjust_prio(struct task_struct *p) rt_mutex_setprio(p, pi_task); } +/* RT mutex specific wake_q wrappers */ +static __always_inline void rt_mutex_wake_q_add(struct rt_wake_q_head *wqh, + struct rt_mutex_waiter *w) +{ + if (IS_ENABLED(CONFIG_PREEMPT_RT) && w->wake_state != TASK_NORMAL) { + if (IS_ENABLED(CONFIG_PROVE_LOCKING)) + WARN_ON_ONCE(wqh->rtlock_task); + get_task_struct(w->task); + wqh->rtlock_task = w->task; + } else { + wake_q_add(&wqh->head, w->task); + } +} + +static __always_inline void rt_mutex_wake_up_q(struct rt_wake_q_head *wqh) +{ + if (IS_ENABLED(CONFIG_PREEMPT_RT) && 
wqh->rtlock_task) { + wake_up_state(wqh->rtlock_task, TASK_RTLOCK_WAIT); + put_task_struct(wqh->rtlock_task); + wqh->rtlock_task = NULL; + } + + if (!wake_q_empty(&wqh->head)) + wake_up_q(&wqh->head); + + /* Pairs with preempt_disable() in mark_wakeup_next_waiter() */ + preempt_enable(); +} + /* * Deadlock detection is conditional: * @@ -348,12 +496,7 @@ rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter, return chwalk == RT_MUTEX_FULL_CHAINWALK; } -/* - * Max number of times we'll walk the boosting chain: - */ -int max_lock_depth = 1024; - -static __always_inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p) +static __always_inline struct rt_mutex_base *task_blocked_on_lock(struct task_struct *p) { return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL; } @@ -423,15 +566,15 @@ static __always_inline struct rt_mutex *task_blocked_on_lock(struct task_struct */ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task, enum rtmutex_chainwalk chwalk, - struct rt_mutex *orig_lock, - struct rt_mutex *next_lock, + struct rt_mutex_base *orig_lock, + struct rt_mutex_base *next_lock, struct rt_mutex_waiter *orig_waiter, struct task_struct *top_task) { struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter; struct rt_mutex_waiter *prerequeue_top_waiter; int ret = 0, depth = 0; - struct rt_mutex *lock; + struct rt_mutex_base *lock; bool detect_deadlock; bool requeue = true; @@ -514,6 +657,31 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task, goto out_unlock_pi; /* + * There could be 'spurious' loops in the lock graph due to ww_mutex, + * consider: + * + * P1: A, ww_A, ww_B + * P2: ww_B, ww_A + * P3: A + * + * P3 should not return -EDEADLK because it gets trapped in the cycle + * created by P1 and P2 (which will resolve -- and runs into + * max_lock_depth above). Therefore disable detect_deadlock such that + * the below termination condition can trigger once all relevant tasks + * are boosted. 
+ * + * Even when we start with ww_mutex we can disable deadlock detection, + * since we would supress a ww_mutex induced deadlock at [6] anyway. + * Supressing it here however is not sufficient since we might still + * hit [6] due to adjustment driven iteration. + * + * NOTE: if someone were to create a deadlock between 2 ww_classes we'd + * utterly fail to report it; lockdep should. + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT) && waiter->ww_ctx && detect_deadlock) + detect_deadlock = false; + + /* * Drop out, when the task has no waiters. Note, * top_waiter can be NULL, when we are in the deboosting * mode! @@ -574,8 +742,21 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task, * walk, we detected a deadlock. */ if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { - raw_spin_unlock(&lock->wait_lock); ret = -EDEADLK; + + /* + * When the deadlock is due to ww_mutex; also see above. Don't + * report the deadlock and instead let the ww_mutex wound/die + * logic pick which of the contending threads gets -EDEADLK. + * + * NOTE: assumes the cycle only contains a single ww_class; any + * other configuration and we fail to report; also, see + * lockdep. + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT) && orig_waiter->ww_ctx) + ret = 0; + + raw_spin_unlock(&lock->wait_lock); goto out_unlock_pi; } @@ -653,8 +834,7 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task, * serializes all pi_waiters access and rb_erase() does not care about * the values of the node being removed. */ - waiter->prio = task->prio; - waiter->deadline = task->dl.deadline; + waiter_update_prio(waiter, task); rt_mutex_enqueue(lock, waiter); @@ -676,7 +856,7 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task, * to get the lock. 
*/ if (prerequeue_top_waiter != rt_mutex_top_waiter(lock)) - wake_up_process(rt_mutex_top_waiter(lock)->task); + wake_up_state(waiter->task, waiter->wake_state); raw_spin_unlock_irq(&lock->wait_lock); return 0; } @@ -779,7 +959,7 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task, * callsite called task_blocked_on_lock(), otherwise NULL */ static int __sched -try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, +try_to_take_rt_mutex(struct rt_mutex_base *lock, struct task_struct *task, struct rt_mutex_waiter *waiter) { lockdep_assert_held(&lock->wait_lock); @@ -815,19 +995,21 @@ try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, * trylock attempt. */ if (waiter) { - /* - * If waiter is not the highest priority waiter of - * @lock, give up. - */ - if (waiter != rt_mutex_top_waiter(lock)) - return 0; + struct rt_mutex_waiter *top_waiter = rt_mutex_top_waiter(lock); /* - * We can acquire the lock. Remove the waiter from the - * lock waiters tree. + * If waiter is the highest priority waiter of @lock, + * or allowed to steal it, take it over. */ - rt_mutex_dequeue(lock, waiter); - + if (waiter == top_waiter || rt_mutex_steal(waiter, top_waiter)) { + /* + * We can acquire the lock. Remove the waiter from the + * lock waiters tree. + */ + rt_mutex_dequeue(lock, waiter); + } else { + return 0; + } } else { /* * If the lock has waiters already we check whether @task is @@ -838,13 +1020,9 @@ try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, * not need to be dequeued. */ if (rt_mutex_has_waiters(lock)) { - /* - * If @task->prio is greater than or equal to - * the top waiter priority (kernel view), - * @task lost. - */ - if (!rt_mutex_waiter_less(task_to_waiter(task), - rt_mutex_top_waiter(lock))) + /* Check whether the trylock can steal it. 
*/ + if (!rt_mutex_steal(task_to_waiter(task), + rt_mutex_top_waiter(lock))) return 0; /* @@ -897,14 +1075,15 @@ takeit: * * This must be called with lock->wait_lock held and interrupts disabled */ -static int __sched task_blocks_on_rt_mutex(struct rt_mutex *lock, +static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter, struct task_struct *task, + struct ww_acquire_ctx *ww_ctx, enum rtmutex_chainwalk chwalk) { struct task_struct *owner = rt_mutex_owner(lock); struct rt_mutex_waiter *top_waiter = waiter; - struct rt_mutex *next_lock; + struct rt_mutex_base *next_lock; int chain_walk = 0, res; lockdep_assert_held(&lock->wait_lock); @@ -924,8 +1103,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex *lock, raw_spin_lock(&task->pi_lock); waiter->task = task; waiter->lock = lock; - waiter->prio = task->prio; - waiter->deadline = task->dl.deadline; + waiter_update_prio(waiter, task); /* Get the top priority waiter on the lock */ if (rt_mutex_has_waiters(lock)) @@ -936,6 +1114,21 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex *lock, raw_spin_unlock(&task->pi_lock); + if (build_ww_mutex() && ww_ctx) { + struct rt_mutex *rtm; + + /* Check whether the waiter should back out immediately */ + rtm = container_of(lock, struct rt_mutex, rtmutex); + res = __ww_mutex_add_waiter(waiter, rtm, ww_ctx); + if (res) { + raw_spin_lock(&task->pi_lock); + rt_mutex_dequeue(lock, waiter); + task->pi_blocked_on = NULL; + raw_spin_unlock(&task->pi_lock); + return res; + } + } + if (!owner) return 0; @@ -986,8 +1179,8 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex *lock, * * Called with lock->wait_lock held and interrupts disabled. 
*/ -static void __sched mark_wakeup_next_waiter(struct wake_q_head *wake_q, - struct rt_mutex *lock) +static void __sched mark_wakeup_next_waiter(struct rt_wake_q_head *wqh, + struct rt_mutex_base *lock) { struct rt_mutex_waiter *waiter; @@ -1023,235 +1216,14 @@ static void __sched mark_wakeup_next_waiter(struct wake_q_head *wake_q, * deboost but before waking our donor task, hence the preempt_disable() * before unlock. * - * Pairs with preempt_enable() in rt_mutex_postunlock(); + * Pairs with preempt_enable() in rt_mutex_wake_up_q(); */ preempt_disable(); - wake_q_add(wake_q, waiter->task); - raw_spin_unlock(&current->pi_lock); -} - -/* - * Remove a waiter from a lock and give up - * - * Must be called with lock->wait_lock held and interrupts disabled. I must - * have just failed to try_to_take_rt_mutex(). - */ -static void __sched remove_waiter(struct rt_mutex *lock, - struct rt_mutex_waiter *waiter) -{ - bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock)); - struct task_struct *owner = rt_mutex_owner(lock); - struct rt_mutex *next_lock; - - lockdep_assert_held(&lock->wait_lock); - - raw_spin_lock(&current->pi_lock); - rt_mutex_dequeue(lock, waiter); - current->pi_blocked_on = NULL; + rt_mutex_wake_q_add(wqh, waiter); raw_spin_unlock(&current->pi_lock); - - /* - * Only update priority if the waiter was the highest priority - * waiter of the lock and there is an owner to update. - */ - if (!owner || !is_top_waiter) - return; - - raw_spin_lock(&owner->pi_lock); - - rt_mutex_dequeue_pi(owner, waiter); - - if (rt_mutex_has_waiters(lock)) - rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock)); - - rt_mutex_adjust_prio(owner); - - /* Store the lock on which owner is blocked or NULL */ - next_lock = task_blocked_on_lock(owner); - - raw_spin_unlock(&owner->pi_lock); - - /* - * Don't walk the chain, if the owner task is not blocked - * itself. - */ - if (!next_lock) - return; - - /* gets dropped in rt_mutex_adjust_prio_chain()!
*/ - get_task_struct(owner); - - raw_spin_unlock_irq(&lock->wait_lock); - - rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock, - next_lock, NULL, current); - - raw_spin_lock_irq(&lock->wait_lock); -} - -/* - * Recheck the pi chain, in case we got a priority setting - * - * Called from sched_setscheduler - */ -void __sched rt_mutex_adjust_pi(struct task_struct *task) -{ - struct rt_mutex_waiter *waiter; - struct rt_mutex *next_lock; - unsigned long flags; - - raw_spin_lock_irqsave(&task->pi_lock, flags); - - waiter = task->pi_blocked_on; - if (!waiter || rt_mutex_waiter_equal(waiter, task_to_waiter(task))) { - raw_spin_unlock_irqrestore(&task->pi_lock, flags); - return; - } - next_lock = waiter->lock; - raw_spin_unlock_irqrestore(&task->pi_lock, flags); - - /* gets dropped in rt_mutex_adjust_prio_chain()! */ - get_task_struct(task); - - rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL, - next_lock, NULL, task); } -void __sched rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) -{ - debug_rt_mutex_init_waiter(waiter); - RB_CLEAR_NODE(&waiter->pi_tree_entry); - RB_CLEAR_NODE(&waiter->tree_entry); - waiter->task = NULL; -} - -/** - * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop - * @lock: the rt_mutex to take - * @state: the state the task should block in (TASK_INTERRUPTIBLE - * or TASK_UNINTERRUPTIBLE) - * @timeout: the pre-initialized and started timer, or NULL for none - * @waiter: the pre-initialized rt_mutex_waiter - * - * Must be called with lock->wait_lock held and interrupts disabled - */ -static int __sched __rt_mutex_slowlock(struct rt_mutex *lock, unsigned int state, - struct hrtimer_sleeper *timeout, - struct rt_mutex_waiter *waiter) -{ - int ret = 0; - - for (;;) { - /* Try to acquire the lock: */ - if (try_to_take_rt_mutex(lock, current, waiter)) - break; - - if (timeout && !timeout->task) { - ret = -ETIMEDOUT; - break; - } - if (signal_pending_state(state, current)) { - ret = -EINTR; - break; - } - - 
raw_spin_unlock_irq(&lock->wait_lock); - - schedule(); - - raw_spin_lock_irq(&lock->wait_lock); - set_current_state(state); - } - - __set_current_state(TASK_RUNNING); - return ret; -} - -static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock, - struct rt_mutex_waiter *w) -{ - /* - * If the result is not -EDEADLOCK or the caller requested - * deadlock detection, nothing to do here. - */ - if (res != -EDEADLOCK || detect_deadlock) - return; - - /* - * Yell loudly and stop the task right here. - */ - WARN(1, "rtmutex deadlock detected\n"); - while (1) { - set_current_state(TASK_INTERRUPTIBLE); - schedule(); - } -} - -/* - * Slow path lock function: - */ -static int __sched rt_mutex_slowlock(struct rt_mutex *lock, unsigned int state, - struct hrtimer_sleeper *timeout, - enum rtmutex_chainwalk chwalk) -{ - struct rt_mutex_waiter waiter; - unsigned long flags; - int ret = 0; - - rt_mutex_init_waiter(&waiter); - - /* - * Technically we could use raw_spin_[un]lock_irq() here, but this can - * be called in early boot if the cmpxchg() fast path is disabled - * (debug, no architecture support). In this case we will acquire the - * rtmutex with lock->wait_lock held. But we cannot unconditionally - * enable interrupts in that early boot case. So we need to use the - * irqsave/restore variants. 
- */ - raw_spin_lock_irqsave(&lock->wait_lock, flags); - - /* Try to acquire the lock again: */ - if (try_to_take_rt_mutex(lock, current, NULL)) { - raw_spin_unlock_irqrestore(&lock->wait_lock, flags); - return 0; - } - - set_current_state(state); - - /* Setup the timer, when timeout != NULL */ - if (unlikely(timeout)) - hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); - - ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk); - - if (likely(!ret)) - /* sleep on the mutex */ - ret = __rt_mutex_slowlock(lock, state, timeout, &waiter); - - if (unlikely(ret)) { - __set_current_state(TASK_RUNNING); - remove_waiter(lock, &waiter); - rt_mutex_handle_deadlock(ret, chwalk, &waiter); - } - - /* - * try_to_take_rt_mutex() sets the waiter bit - * unconditionally. We might have to fix that up. - */ - fixup_rt_mutex_waiters(lock); - - raw_spin_unlock_irqrestore(&lock->wait_lock, flags); - - /* Remove pending timer: */ - if (unlikely(timeout)) - hrtimer_cancel(&timeout->timer); - - debug_rt_mutex_free_waiter(&waiter); - - return ret; -} - -static int __sched __rt_mutex_slowtrylock(struct rt_mutex *lock) +static int __sched __rt_mutex_slowtrylock(struct rt_mutex_base *lock) { int ret = try_to_take_rt_mutex(lock, current, NULL); @@ -1267,7 +1239,7 @@ static int __sched __rt_mutex_slowtrylock(struct rt_mutex *lock) /* * Slow path try-lock function: */ -static int __sched rt_mutex_slowtrylock(struct rt_mutex *lock) +static int __sched rt_mutex_slowtrylock(struct rt_mutex_base *lock) { unsigned long flags; int ret; @@ -1293,25 +1265,20 @@ static int __sched rt_mutex_slowtrylock(struct rt_mutex *lock) return ret; } -/* - * Performs the wakeup of the top-waiter and re-enables preemption. 
- */ -void __sched rt_mutex_postunlock(struct wake_q_head *wake_q) +static __always_inline int __rt_mutex_trylock(struct rt_mutex_base *lock) { - wake_up_q(wake_q); + if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) + return 1; - /* Pairs with preempt_disable() in mark_wakeup_next_waiter() */ - preempt_enable(); + return rt_mutex_slowtrylock(lock); } /* * Slow path to release a rt-mutex. - * - * Return whether the current task needs to call rt_mutex_postunlock(). */ -static void __sched rt_mutex_slowunlock(struct rt_mutex *lock) +static void __sched rt_mutex_slowunlock(struct rt_mutex_base *lock) { - DEFINE_WAKE_Q(wake_q); + DEFINE_RT_WAKE_Q(wqh); unsigned long flags; /* irqsave required to support early boot calls */ @@ -1364,422 +1331,387 @@ static void __sched rt_mutex_slowunlock(struct rt_mutex *lock) * * Queue the next waiter for wakeup once we release the wait_lock. */ - mark_wakeup_next_waiter(&wake_q, lock); + mark_wakeup_next_waiter(&wqh, lock); raw_spin_unlock_irqrestore(&lock->wait_lock, flags); - rt_mutex_postunlock(&wake_q); + rt_mutex_wake_up_q(&wqh); } -/* - * debug aware fast / slowpath lock,trylock,unlock - * - * The atomic acquire/release ops are compiled away, when either the - * architecture does not support cmpxchg or when debugging is enabled. 
- */ -static __always_inline int __rt_mutex_lock(struct rt_mutex *lock, long state, - unsigned int subclass) +static __always_inline void __rt_mutex_unlock(struct rt_mutex_base *lock) { - int ret; - - might_sleep(); - mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); - - if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) - return 0; + if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) + return; - ret = rt_mutex_slowlock(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK); - if (ret) - mutex_release(&lock->dep_map, _RET_IP_); - return ret; + rt_mutex_slowunlock(lock); } -#ifdef CONFIG_DEBUG_LOCK_ALLOC -/** - * rt_mutex_lock_nested - lock a rt_mutex - * - * @lock: the rt_mutex to be locked - * @subclass: the lockdep subclass - */ -void __sched rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass) +#ifdef CONFIG_SMP +static bool rtmutex_spin_on_owner(struct rt_mutex_base *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *owner) { - __rt_mutex_lock(lock, TASK_UNINTERRUPTIBLE, subclass); -} -EXPORT_SYMBOL_GPL(rt_mutex_lock_nested); - -#else /* !CONFIG_DEBUG_LOCK_ALLOC */ + bool res = true; -/** - * rt_mutex_lock - lock a rt_mutex - * - * @lock: the rt_mutex to be locked - */ -void __sched rt_mutex_lock(struct rt_mutex *lock) + rcu_read_lock(); + for (;;) { + /* If owner changed, trylock again. */ + if (owner != rt_mutex_owner(lock)) + break; + /* + * Ensure that @owner is dereferenced after checking that + * the lock owner still matches @owner. If that fails, + * @owner might point to freed memory. If it still matches, + * the rcu_read_lock() ensures the memory stays valid. 
+ */ + barrier(); + /* + * Stop spinning when: + * - the lock owner has been scheduled out + * - current is no longer the top waiter + * - current is requested to reschedule (redundant + * for CONFIG_PREEMPT_RCU=y) + * - the VCPU on which owner runs is preempted + */ + if (!owner->on_cpu || need_resched() || + rt_mutex_waiter_is_top_waiter(lock, waiter) || + vcpu_is_preempted(task_cpu(owner))) { + res = false; + break; + } + cpu_relax(); + } + rcu_read_unlock(); + return res; +} +#else +static bool rtmutex_spin_on_owner(struct rt_mutex_base *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *owner) { - __rt_mutex_lock(lock, TASK_UNINTERRUPTIBLE, 0); + return false; } -EXPORT_SYMBOL_GPL(rt_mutex_lock); #endif -/** - * rt_mutex_lock_interruptible - lock a rt_mutex interruptible - * - * @lock: the rt_mutex to be locked - * - * Returns: - * 0 on success - * -EINTR when interrupted by a signal +#ifdef RT_MUTEX_BUILD_MUTEX +/* + * Functions required for: + * - rtmutex, futex on all kernels + * - mutex and rwsem substitutions on RT kernels */ -int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock) -{ - return __rt_mutex_lock(lock, TASK_INTERRUPTIBLE, 0); -} -EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); -/** - * rt_mutex_trylock - try to lock a rt_mutex - * - * @lock: the rt_mutex to be locked - * - * This function can only be called in thread context. It's safe to call it - * from atomic regions, but not from hard or soft interrupt context. +/* + * Remove a waiter from a lock and give up * - * Returns: - * 1 on success - * 0 on contention + * Must be called with lock->wait_lock held and interrupts disabled. It must + * have just failed to try_to_take_rt_mutex(). 
*/ -int __sched rt_mutex_trylock(struct rt_mutex *lock) +static void __sched remove_waiter(struct rt_mutex_base *lock, + struct rt_mutex_waiter *waiter) { - int ret; + bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock)); + struct task_struct *owner = rt_mutex_owner(lock); + struct rt_mutex_base *next_lock; - if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) && WARN_ON_ONCE(!in_task())) - return 0; + lockdep_assert_held(&lock->wait_lock); + + raw_spin_lock(¤t->pi_lock); + rt_mutex_dequeue(lock, waiter); + current->pi_blocked_on = NULL; + raw_spin_unlock(¤t->pi_lock); /* - * No lockdep annotation required because lockdep disables the fast - * path. + * Only update priority if the waiter was the highest priority + * waiter of the lock and there is an owner to update. */ - if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) - return 1; - - ret = rt_mutex_slowtrylock(lock); - if (ret) - mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); - - return ret; -} -EXPORT_SYMBOL_GPL(rt_mutex_trylock); - -/** - * rt_mutex_unlock - unlock a rt_mutex - * - * @lock: the rt_mutex to be unlocked - */ -void __sched rt_mutex_unlock(struct rt_mutex *lock) -{ - mutex_release(&lock->dep_map, _RET_IP_); - if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) + if (!owner || !is_top_waiter) return; - rt_mutex_slowunlock(lock); -} -EXPORT_SYMBOL_GPL(rt_mutex_unlock); + raw_spin_lock(&owner->pi_lock); -/* - * Futex variants, must not use fastpath. - */ -int __sched rt_mutex_futex_trylock(struct rt_mutex *lock) -{ - return rt_mutex_slowtrylock(lock); -} + rt_mutex_dequeue_pi(owner, waiter); -int __sched __rt_mutex_futex_trylock(struct rt_mutex *lock) -{ - return __rt_mutex_slowtrylock(lock); -} + if (rt_mutex_has_waiters(lock)) + rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock)); -/** - * __rt_mutex_futex_unlock - Futex variant, that since futex variants - * do not use the fast-path, can be simple and will not need to retry. 
- * - * @lock: The rt_mutex to be unlocked - * @wake_q: The wake queue head from which to get the next lock waiter - */ -bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock, - struct wake_q_head *wake_q) -{ - lockdep_assert_held(&lock->wait_lock); + rt_mutex_adjust_prio(owner); - debug_rt_mutex_unlock(lock); + /* Store the lock on which owner is blocked or NULL */ + next_lock = task_blocked_on_lock(owner); - if (!rt_mutex_has_waiters(lock)) { - lock->owner = NULL; - return false; /* done */ - } + raw_spin_unlock(&owner->pi_lock); /* - * We've already deboosted, mark_wakeup_next_waiter() will - * retain preempt_disabled when we drop the wait_lock, to - * avoid inversion prior to the wakeup. preempt_disable() - * therein pairs with rt_mutex_postunlock(). + * Don't walk the chain, if the owner task is not blocked + * itself. */ - mark_wakeup_next_waiter(wake_q, lock); + if (!next_lock) + return; - return true; /* call postunlock() */ -} + /* gets dropped in rt_mutex_adjust_prio_chain()! */ + get_task_struct(owner); -void __sched rt_mutex_futex_unlock(struct rt_mutex *lock) -{ - DEFINE_WAKE_Q(wake_q); - unsigned long flags; - bool postunlock; + raw_spin_unlock_irq(&lock->wait_lock); - raw_spin_lock_irqsave(&lock->wait_lock, flags); - postunlock = __rt_mutex_futex_unlock(lock, &wake_q); - raw_spin_unlock_irqrestore(&lock->wait_lock, flags); + rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock, + next_lock, NULL, current); - if (postunlock) - rt_mutex_postunlock(&wake_q); + raw_spin_lock_irq(&lock->wait_lock); } /** - * __rt_mutex_init - initialize the rt_mutex - * - * @lock: The rt_mutex to be initialized - * @name: The lock name used for debugging - * @key: The lock class key used for debugging - * - * Initialize the rt_mutex to unlocked state. 
+ * rt_mutex_slowlock_block() - Perform the wait-wake-try-to-take loop + * @lock: the rt_mutex to take + * @ww_ctx: WW mutex context pointer + * @state: the state the task should block in (TASK_INTERRUPTIBLE + * or TASK_UNINTERRUPTIBLE) + * @timeout: the pre-initialized and started timer, or NULL for none + * @waiter: the pre-initialized rt_mutex_waiter * - * Initializing of a locked rt_mutex is not allowed + * Must be called with lock->wait_lock held and interrupts disabled */ -void __sched __rt_mutex_init(struct rt_mutex *lock, const char *name, - struct lock_class_key *key) +static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock, + struct ww_acquire_ctx *ww_ctx, + unsigned int state, + struct hrtimer_sleeper *timeout, + struct rt_mutex_waiter *waiter) { - debug_check_no_locks_freed((void *)lock, sizeof(*lock)); - lockdep_init_map(&lock->dep_map, name, key, 0); + struct rt_mutex *rtm = container_of(lock, struct rt_mutex, rtmutex); + struct task_struct *owner; + int ret = 0; - __rt_mutex_basic_init(lock); -} -EXPORT_SYMBOL_GPL(__rt_mutex_init); + for (;;) { + /* Try to acquire the lock: */ + if (try_to_take_rt_mutex(lock, current, waiter)) + break; -/** - * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a - * proxy owner - * - * @lock: the rt_mutex to be locked - * @proxy_owner:the task to set as owner - * - * No locking. Caller has to do serializing itself - * - * Special API call for PI-futex support. This initializes the rtmutex and - * assigns it to @proxy_owner. Concurrent operations on the rtmutex are not - * possible at this point because the pi_state which contains the rtmutex - * is not yet visible to other tasks. 
- */ -void __sched rt_mutex_init_proxy_locked(struct rt_mutex *lock, - struct task_struct *proxy_owner) -{ - __rt_mutex_basic_init(lock); - rt_mutex_set_owner(lock, proxy_owner); + if (timeout && !timeout->task) { + ret = -ETIMEDOUT; + break; + } + if (signal_pending_state(state, current)) { + ret = -EINTR; + break; + } + + if (build_ww_mutex() && ww_ctx) { + ret = __ww_mutex_check_kill(rtm, waiter, ww_ctx); + if (ret) + break; + } + + if (waiter == rt_mutex_top_waiter(lock)) + owner = rt_mutex_owner(lock); + else + owner = NULL; + raw_spin_unlock_irq(&lock->wait_lock); + + if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner)) + schedule(); + + raw_spin_lock_irq(&lock->wait_lock); + set_current_state(state); + } + + __set_current_state(TASK_RUNNING); + return ret; } -/** - * rt_mutex_proxy_unlock - release a lock on behalf of owner - * - * @lock: the rt_mutex to be locked - * - * No locking. Caller has to do serializing itself - * - * Special API call for PI-futex support. This merrily cleans up the rtmutex - * (debugging) state. Concurrent operations on this rt_mutex are not - * possible because it belongs to the pi_state which is about to be freed - * and it is not longer visible to other tasks. - */ -void __sched rt_mutex_proxy_unlock(struct rt_mutex *lock) +static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock, + struct rt_mutex_waiter *w) { - debug_rt_mutex_proxy_unlock(lock); - rt_mutex_set_owner(lock, NULL); + /* + * If the result is not -EDEADLOCK or the caller requested + * deadlock detection, nothing to do here. + */ + if (res != -EDEADLOCK || detect_deadlock) + return; + + if (build_ww_mutex() && w->ww_ctx) + return; + + /* + * Yell loudly and stop the task right here. 
+ */ + WARN(1, "rtmutex deadlock detected\n"); + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + } } /** - * __rt_mutex_start_proxy_lock() - Start lock acquisition for another task - * @lock: the rt_mutex to take - * @waiter: the pre-initialized rt_mutex_waiter - * @task: the task to prepare - * - * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock - * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that. - * - * NOTE: does _NOT_ remove the @waiter on failure; must either call - * rt_mutex_wait_proxy_lock() or rt_mutex_cleanup_proxy_lock() after this. - * - * Returns: - * 0 - task blocked on lock - * 1 - acquired the lock for task, caller should wake it up - * <0 - error - * - * Special API call for PI-futex support. + * __rt_mutex_slowlock - Locking slowpath invoked with lock::wait_lock held + * @lock: The rtmutex to block lock + * @ww_ctx: WW mutex context pointer + * @state: The task state for sleeping + * @chwalk: Indicator whether full or partial chainwalk is requested + * @waiter: Initializer waiter for blocking */ -int __sched __rt_mutex_start_proxy_lock(struct rt_mutex *lock, - struct rt_mutex_waiter *waiter, - struct task_struct *task) +static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, + struct ww_acquire_ctx *ww_ctx, + unsigned int state, + enum rtmutex_chainwalk chwalk, + struct rt_mutex_waiter *waiter) { + struct rt_mutex *rtm = container_of(lock, struct rt_mutex, rtmutex); + struct ww_mutex *ww = ww_container_of(rtm); int ret; lockdep_assert_held(&lock->wait_lock); - if (try_to_take_rt_mutex(lock, task, NULL)) - return 1; + /* Try to acquire the lock again: */ + if (try_to_take_rt_mutex(lock, current, NULL)) { + if (build_ww_mutex() && ww_ctx) { + __ww_mutex_check_waiters(rtm, ww_ctx); + ww_mutex_lock_acquired(ww, ww_ctx); + } + return 0; + } - /* We enforce deadlock detection for futexes */ - ret = task_blocks_on_rt_mutex(lock, waiter, task, - RT_MUTEX_FULL_CHAINWALK); + 
set_current_state(state); - if (ret && !rt_mutex_owner(lock)) { - /* - * Reset the return value. We might have - * returned with -EDEADLK and the owner - * released the lock while we were walking the - * pi chain. Let the waiter sort it out. - */ - ret = 0; + ret = task_blocks_on_rt_mutex(lock, waiter, current, ww_ctx, chwalk); + if (likely(!ret)) + ret = rt_mutex_slowlock_block(lock, ww_ctx, state, NULL, waiter); + + if (likely(!ret)) { + /* acquired the lock */ + if (build_ww_mutex() && ww_ctx) { + if (!ww_ctx->is_wait_die) + __ww_mutex_check_waiters(rtm, ww_ctx); + ww_mutex_lock_acquired(ww, ww_ctx); + } + } else { + __set_current_state(TASK_RUNNING); + remove_waiter(lock, waiter); + rt_mutex_handle_deadlock(ret, chwalk, waiter); } + /* + * try_to_take_rt_mutex() sets the waiter bit + * unconditionally. We might have to fix that up. + */ + fixup_rt_mutex_waiters(lock); return ret; } -/** - * rt_mutex_start_proxy_lock() - Start lock acquisition for another task - * @lock: the rt_mutex to take - * @waiter: the pre-initialized rt_mutex_waiter - * @task: the task to prepare - * - * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock - * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that. - * - * NOTE: unlike __rt_mutex_start_proxy_lock this _DOES_ remove the @waiter - * on failure. - * - * Returns: - * 0 - task blocked on lock - * 1 - acquired the lock for task, caller should wake it up - * <0 - error - * - * Special API call for PI-futex support. 
- */ -int __sched rt_mutex_start_proxy_lock(struct rt_mutex *lock, - struct rt_mutex_waiter *waiter, - struct task_struct *task) +static inline int __rt_mutex_slowlock_locked(struct rt_mutex_base *lock, + struct ww_acquire_ctx *ww_ctx, + unsigned int state) { + struct rt_mutex_waiter waiter; int ret; - raw_spin_lock_irq(&lock->wait_lock); - ret = __rt_mutex_start_proxy_lock(lock, waiter, task); - if (unlikely(ret)) - remove_waiter(lock, waiter); - raw_spin_unlock_irq(&lock->wait_lock); + rt_mutex_init_waiter(&waiter); + waiter.ww_ctx = ww_ctx; + ret = __rt_mutex_slowlock(lock, ww_ctx, state, RT_MUTEX_MIN_CHAINWALK, + &waiter); + + debug_rt_mutex_free_waiter(&waiter); return ret; } -/** - * rt_mutex_wait_proxy_lock() - Wait for lock acquisition - * @lock: the rt_mutex we were woken on - * @to: the timeout, null if none. hrtimer should already have - * been started. - * @waiter: the pre-initialized rt_mutex_waiter - * - * Wait for the lock acquisition started on our behalf by - * rt_mutex_start_proxy_lock(). Upon failure, the caller must call - * rt_mutex_cleanup_proxy_lock(). - * - * Returns: - * 0 - success - * <0 - error, one of -EINTR, -ETIMEDOUT - * - * Special API call for PI-futex support +/* + * rt_mutex_slowlock - Locking slowpath invoked when fast path fails + * @lock: The rtmutex to block lock + * @ww_ctx: WW mutex context pointer + * @state: The task state for sleeping */ -int __sched rt_mutex_wait_proxy_lock(struct rt_mutex *lock, - struct hrtimer_sleeper *to, - struct rt_mutex_waiter *waiter) +static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock, + struct ww_acquire_ctx *ww_ctx, + unsigned int state) { + unsigned long flags; int ret; - raw_spin_lock_irq(&lock->wait_lock); - /* sleep on the mutex */ - set_current_state(TASK_INTERRUPTIBLE); - ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter); /* - * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might - * have to fix that up. 
+ * Technically we could use raw_spin_[un]lock_irq() here, but this can + * be called in early boot if the cmpxchg() fast path is disabled + * (debug, no architecture support). In this case we will acquire the + * rtmutex with lock->wait_lock held. But we cannot unconditionally + * enable interrupts in that early boot case. So we need to use the + * irqsave/restore variants. */ - fixup_rt_mutex_waiters(lock); - raw_spin_unlock_irq(&lock->wait_lock); + raw_spin_lock_irqsave(&lock->wait_lock, flags); + ret = __rt_mutex_slowlock_locked(lock, ww_ctx, state); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); return ret; } +static __always_inline int __rt_mutex_lock(struct rt_mutex_base *lock, + unsigned int state) +{ + if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) + return 0; + + return rt_mutex_slowlock(lock, NULL, state); +} +#endif /* RT_MUTEX_BUILD_MUTEX */ + +#ifdef RT_MUTEX_BUILD_SPINLOCKS +/* + * Functions required for spin/rw_lock substitution on RT kernels + */ + /** - * rt_mutex_cleanup_proxy_lock() - Cleanup failed lock acquisition - * @lock: the rt_mutex we were woken on - * @waiter: the pre-initialized rt_mutex_waiter - * - * Attempt to clean up after a failed __rt_mutex_start_proxy_lock() or - * rt_mutex_wait_proxy_lock(). - * - * Unless we acquired the lock; we're still enqueued on the wait-list and can - * in fact still be granted ownership until we're removed. Therefore we can - * find we are in fact the owner and must disregard the - * rt_mutex_wait_proxy_lock() failure. - * - * Returns: - * true - did the cleanup, we done. - * false - we acquired the lock after rt_mutex_wait_proxy_lock() returned, - * caller should disregards its return value. 
- * - * Special API call for PI-futex support + * rtlock_slowlock_locked - Slow path lock acquisition for RT locks + * @lock: The underlying RT mutex */ -bool __sched rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock, - struct rt_mutex_waiter *waiter) +static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock) { - bool cleanup = false; + struct rt_mutex_waiter waiter; + struct task_struct *owner; - raw_spin_lock_irq(&lock->wait_lock); - /* - * Do an unconditional try-lock, this deals with the lock stealing - * state where __rt_mutex_futex_unlock() -> mark_wakeup_next_waiter() - * sets a NULL owner. - * - * We're not interested in the return value, because the subsequent - * test on rt_mutex_owner() will infer that. If the trylock succeeded, - * we will own the lock and it will have removed the waiter. If we - * failed the trylock, we're still not owner and we need to remove - * ourselves. - */ - try_to_take_rt_mutex(lock, current, waiter); - /* - * Unless we're the owner; we're still enqueued on the wait_list. - * So check if we became owner, if not, take us off the wait_list. 
- */ - if (rt_mutex_owner(lock) != current) { - remove_waiter(lock, waiter); - cleanup = true; + lockdep_assert_held(&lock->wait_lock); + + if (try_to_take_rt_mutex(lock, current, NULL)) + return; + + rt_mutex_init_rtlock_waiter(&waiter); + + /* Save current state and set state to TASK_RTLOCK_WAIT */ + current_save_and_set_rtlock_wait_state(); + + task_blocks_on_rt_mutex(lock, &waiter, current, NULL, RT_MUTEX_MIN_CHAINWALK); + + for (;;) { + /* Try to acquire the lock again */ + if (try_to_take_rt_mutex(lock, current, &waiter)) + break; + + if (&waiter == rt_mutex_top_waiter(lock)) + owner = rt_mutex_owner(lock); + else + owner = NULL; + raw_spin_unlock_irq(&lock->wait_lock); + + if (!owner || !rtmutex_spin_on_owner(lock, &waiter, owner)) + schedule_rtlock(); + + raw_spin_lock_irq(&lock->wait_lock); + set_current_state(TASK_RTLOCK_WAIT); } + + /* Restore the task state */ + current_restore_rtlock_saved_state(); + /* - * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might - * have to fix that up. + * try_to_take_rt_mutex() sets the waiter bit unconditionally. 
+ * We might have to fix that up: */ fixup_rt_mutex_waiters(lock); - - raw_spin_unlock_irq(&lock->wait_lock); - - return cleanup; + debug_rt_mutex_free_waiter(&waiter); } -#ifdef CONFIG_DEBUG_RT_MUTEXES -void rt_mutex_debug_task_free(struct task_struct *task) +static __always_inline void __sched rtlock_slowlock(struct rt_mutex_base *lock) { - DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters.rb_root)); - DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); + unsigned long flags; + + raw_spin_lock_irqsave(&lock->wait_lock, flags); + rtlock_slowlock_locked(lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); } -#endif + +#endif /* RT_MUTEX_BUILD_SPINLOCKS */ diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c new file mode 100644 index 000000000000..5c9299aaabae --- /dev/null +++ b/kernel/locking/rtmutex_api.c @@ -0,0 +1,590 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * rtmutex API + */ +#include <linux/spinlock.h> +#include <linux/export.h> + +#define RT_MUTEX_BUILD_MUTEX +#include "rtmutex.c" + +/* + * Max number of times we'll walk the boosting chain: + */ +int max_lock_depth = 1024; + +/* + * Debug aware fast / slowpath lock,trylock,unlock + * + * The atomic acquire/release ops are compiled away, when either the + * architecture does not support cmpxchg or when debugging is enabled. 
+ */ +static __always_inline int __rt_mutex_lock_common(struct rt_mutex *lock, + unsigned int state, + unsigned int subclass) +{ + int ret; + + might_sleep(); + mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + ret = __rt_mutex_lock(&lock->rtmutex, state); + if (ret) + mutex_release(&lock->dep_map, _RET_IP_); + return ret; +} + +void rt_mutex_base_init(struct rt_mutex_base *rtb) +{ + __rt_mutex_base_init(rtb); +} +EXPORT_SYMBOL(rt_mutex_base_init); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +/** + * rt_mutex_lock_nested - lock a rt_mutex + * + * @lock: the rt_mutex to be locked + * @subclass: the lockdep subclass + */ +void __sched rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass) +{ + __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass); +} +EXPORT_SYMBOL_GPL(rt_mutex_lock_nested); + +#else /* !CONFIG_DEBUG_LOCK_ALLOC */ + +/** + * rt_mutex_lock - lock a rt_mutex + * + * @lock: the rt_mutex to be locked + */ +void __sched rt_mutex_lock(struct rt_mutex *lock) +{ + __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0); +} +EXPORT_SYMBOL_GPL(rt_mutex_lock); +#endif + +/** + * rt_mutex_lock_interruptible - lock a rt_mutex interruptible + * + * @lock: the rt_mutex to be locked + * + * Returns: + * 0 on success + * -EINTR when interrupted by a signal + */ +int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock) +{ + return __rt_mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0); +} +EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); + +/** + * rt_mutex_trylock - try to lock a rt_mutex + * + * @lock: the rt_mutex to be locked + * + * This function can only be called in thread context. It's safe to call it + * from atomic regions, but not from hard or soft interrupt context. 
+ * + * Returns: + * 1 on success + * 0 on contention + */ +int __sched rt_mutex_trylock(struct rt_mutex *lock) +{ + int ret; + + if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) && WARN_ON_ONCE(!in_task())) + return 0; + + ret = __rt_mutex_trylock(&lock->rtmutex); + if (ret) + mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); + + return ret; +} +EXPORT_SYMBOL_GPL(rt_mutex_trylock); + +/** + * rt_mutex_unlock - unlock a rt_mutex + * + * @lock: the rt_mutex to be unlocked + */ +void __sched rt_mutex_unlock(struct rt_mutex *lock) +{ + mutex_release(&lock->dep_map, _RET_IP_); + __rt_mutex_unlock(&lock->rtmutex); +} +EXPORT_SYMBOL_GPL(rt_mutex_unlock); + +/* + * Futex variants, must not use fastpath. + */ +int __sched rt_mutex_futex_trylock(struct rt_mutex_base *lock) +{ + return rt_mutex_slowtrylock(lock); +} + +int __sched __rt_mutex_futex_trylock(struct rt_mutex_base *lock) +{ + return __rt_mutex_slowtrylock(lock); +} + +/** + * __rt_mutex_futex_unlock - Futex variant, that since futex variants + * do not use the fast-path, can be simple and will not need to retry. + * + * @lock: The rt_mutex to be unlocked + * @wqh: The wake queue head from which to get the next lock waiter + */ +bool __sched __rt_mutex_futex_unlock(struct rt_mutex_base *lock, + struct rt_wake_q_head *wqh) +{ + lockdep_assert_held(&lock->wait_lock); + + debug_rt_mutex_unlock(lock); + + if (!rt_mutex_has_waiters(lock)) { + lock->owner = NULL; + return false; /* done */ + } + + /* + * We've already deboosted, mark_wakeup_next_waiter() will + * retain preempt_disabled when we drop the wait_lock, to + * avoid inversion prior to the wakeup. preempt_disable() + * therein pairs with rt_mutex_postunlock(). 
+ */ + mark_wakeup_next_waiter(wqh, lock); + + return true; /* call postunlock() */ +} + +void __sched rt_mutex_futex_unlock(struct rt_mutex_base *lock) +{ + DEFINE_RT_WAKE_Q(wqh); + unsigned long flags; + bool postunlock; + + raw_spin_lock_irqsave(&lock->wait_lock, flags); + postunlock = __rt_mutex_futex_unlock(lock, &wqh); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); + + if (postunlock) + rt_mutex_postunlock(&wqh); +} + +/** + * __rt_mutex_init - initialize the rt_mutex + * + * @lock: The rt_mutex to be initialized + * @name: The lock name used for debugging + * @key: The lock class key used for debugging + * + * Initialize the rt_mutex to unlocked state. + * + * Initializing of a locked rt_mutex is not allowed + */ +void __sched __rt_mutex_init(struct rt_mutex *lock, const char *name, + struct lock_class_key *key) +{ + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); + __rt_mutex_base_init(&lock->rtmutex); + lockdep_init_map_wait(&lock->dep_map, name, key, 0, LD_WAIT_SLEEP); +} +EXPORT_SYMBOL_GPL(__rt_mutex_init); + +/** + * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a + * proxy owner + * + * @lock: the rt_mutex to be locked + * @proxy_owner:the task to set as owner + * + * No locking. Caller has to do serializing itself + * + * Special API call for PI-futex support. This initializes the rtmutex and + * assigns it to @proxy_owner. Concurrent operations on the rtmutex are not + * possible at this point because the pi_state which contains the rtmutex + * is not yet visible to other tasks. + */ +void __sched rt_mutex_init_proxy_locked(struct rt_mutex_base *lock, + struct task_struct *proxy_owner) +{ + static struct lock_class_key pi_futex_key; + + __rt_mutex_base_init(lock); + /* + * On PREEMPT_RT the futex hashbucket spinlock becomes 'sleeping' + * and rtmutex based. 
That causes a lockdep false positive, because + * some of the futex functions invoke spin_unlock(&hb->lock) with + * the wait_lock of the rtmutex associated to the pi_futex held. + * spin_unlock() in turn takes wait_lock of the rtmutex on which + * the spinlock is based, which makes lockdep notice a lock + * recursion. Give the futex/rtmutex wait_lock a separate key. + */ + lockdep_set_class(&lock->wait_lock, &pi_futex_key); + rt_mutex_set_owner(lock, proxy_owner); +} + +/** + * rt_mutex_proxy_unlock - release a lock on behalf of owner + * + * @lock: the rt_mutex to be unlocked + * + * No locking. Caller has to do serializing itself + * + * Special API call for PI-futex support. This just cleans up the rtmutex + * (debugging) state. Concurrent operations on this rt_mutex are not + * possible because it belongs to the pi_state which is about to be freed + * and it is no longer visible to other tasks. + */ +void __sched rt_mutex_proxy_unlock(struct rt_mutex_base *lock) +{ + debug_rt_mutex_proxy_unlock(lock); + rt_mutex_set_owner(lock, NULL); +} + +/** + * __rt_mutex_start_proxy_lock() - Start lock acquisition for another task + * @lock: the rt_mutex to take + * @waiter: the pre-initialized rt_mutex_waiter + * @task: the task to prepare + * + * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock + * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that. + * + * NOTE: does _NOT_ remove the @waiter on failure; must either call + * rt_mutex_wait_proxy_lock() or rt_mutex_cleanup_proxy_lock() after this. + * + * Returns: + * 0 - task blocked on lock + * 1 - acquired the lock for task, caller should wake it up + * <0 - error + * + * Special API call for PI-futex support. 
+ */ +int __sched __rt_mutex_start_proxy_lock(struct rt_mutex_base *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *task) +{ + int ret; + + lockdep_assert_held(&lock->wait_lock); + + if (try_to_take_rt_mutex(lock, task, NULL)) + return 1; + + /* We enforce deadlock detection for futexes */ + ret = task_blocks_on_rt_mutex(lock, waiter, task, NULL, + RT_MUTEX_FULL_CHAINWALK); + + if (ret && !rt_mutex_owner(lock)) { + /* + * Reset the return value. We might have + * returned with -EDEADLK and the owner + * released the lock while we were walking the + * pi chain. Let the waiter sort it out. + */ + ret = 0; + } + + return ret; +} + +/** + * rt_mutex_start_proxy_lock() - Start lock acquisition for another task + * @lock: the rt_mutex to take + * @waiter: the pre-initialized rt_mutex_waiter + * @task: the task to prepare + * + * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock + * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that. + * + * NOTE: unlike __rt_mutex_start_proxy_lock this _DOES_ remove the @waiter + * on failure. + * + * Returns: + * 0 - task blocked on lock + * 1 - acquired the lock for task, caller should wake it up + * <0 - error + * + * Special API call for PI-futex support. + */ +int __sched rt_mutex_start_proxy_lock(struct rt_mutex_base *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *task) +{ + int ret; + + raw_spin_lock_irq(&lock->wait_lock); + ret = __rt_mutex_start_proxy_lock(lock, waiter, task); + if (unlikely(ret)) + remove_waiter(lock, waiter); + raw_spin_unlock_irq(&lock->wait_lock); + + return ret; +} + +/** + * rt_mutex_wait_proxy_lock() - Wait for lock acquisition + * @lock: the rt_mutex we were woken on + * @to: the timeout, null if none. hrtimer should already have + * been started. + * @waiter: the pre-initialized rt_mutex_waiter + * + * Wait for the lock acquisition started on our behalf by + * rt_mutex_start_proxy_lock(). 
Upon failure, the caller must call + * rt_mutex_cleanup_proxy_lock(). + * + * Returns: + * 0 - success + * <0 - error, one of -EINTR, -ETIMEDOUT + * + * Special API call for PI-futex support + */ +int __sched rt_mutex_wait_proxy_lock(struct rt_mutex_base *lock, + struct hrtimer_sleeper *to, + struct rt_mutex_waiter *waiter) +{ + int ret; + + raw_spin_lock_irq(&lock->wait_lock); + /* sleep on the mutex */ + set_current_state(TASK_INTERRUPTIBLE); + ret = rt_mutex_slowlock_block(lock, NULL, TASK_INTERRUPTIBLE, to, waiter); + /* + * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might + * have to fix that up. + */ + fixup_rt_mutex_waiters(lock); + raw_spin_unlock_irq(&lock->wait_lock); + + return ret; +} + +/** + * rt_mutex_cleanup_proxy_lock() - Cleanup failed lock acquisition + * @lock: the rt_mutex we were woken on + * @waiter: the pre-initialized rt_mutex_waiter + * + * Attempt to clean up after a failed __rt_mutex_start_proxy_lock() or + * rt_mutex_wait_proxy_lock(). + * + * Unless we acquired the lock; we're still enqueued on the wait-list and can + * in fact still be granted ownership until we're removed. Therefore we can + * find we are in fact the owner and must disregard the + * rt_mutex_wait_proxy_lock() failure. + * + * Returns: + * true - did the cleanup, we are done. + * false - we acquired the lock after rt_mutex_wait_proxy_lock() returned, + * caller should disregard its return value. + * + * Special API call for PI-futex support + */ +bool __sched rt_mutex_cleanup_proxy_lock(struct rt_mutex_base *lock, + struct rt_mutex_waiter *waiter) +{ + bool cleanup = false; + + raw_spin_lock_irq(&lock->wait_lock); + /* + * Do an unconditional try-lock, this deals with the lock stealing + * state where __rt_mutex_futex_unlock() -> mark_wakeup_next_waiter() + * sets a NULL owner. + * + * We're not interested in the return value, because the subsequent + * test on rt_mutex_owner() will infer that.
If the trylock succeeded, + * we will own the lock and it will have removed the waiter. If we + * failed the trylock, we're still not owner and we need to remove + * ourselves. + */ + try_to_take_rt_mutex(lock, current, waiter); + /* + * Unless we're the owner; we're still enqueued on the wait_list. + * So check if we became owner, if not, take us off the wait_list. + */ + if (rt_mutex_owner(lock) != current) { + remove_waiter(lock, waiter); + cleanup = true; + } + /* + * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might + * have to fix that up. + */ + fixup_rt_mutex_waiters(lock); + + raw_spin_unlock_irq(&lock->wait_lock); + + return cleanup; +} + +/* + * Recheck the pi chain, in case we got a priority setting + * + * Called from sched_setscheduler + */ +void __sched rt_mutex_adjust_pi(struct task_struct *task) +{ + struct rt_mutex_waiter *waiter; + struct rt_mutex_base *next_lock; + unsigned long flags; + + raw_spin_lock_irqsave(&task->pi_lock, flags); + + waiter = task->pi_blocked_on; + if (!waiter || rt_mutex_waiter_equal(waiter, task_to_waiter(task))) { + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + return; + } + next_lock = waiter->lock; + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + + /* gets dropped in rt_mutex_adjust_prio_chain()! */ + get_task_struct(task); + + rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL, + next_lock, NULL, task); +} + +/* + * Performs the wakeup of the top-waiter and re-enables preemption. 
+ */ +void __sched rt_mutex_postunlock(struct rt_wake_q_head *wqh) +{ + rt_mutex_wake_up_q(wqh); +} + +#ifdef CONFIG_DEBUG_RT_MUTEXES +void rt_mutex_debug_task_free(struct task_struct *task) +{ + DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters.rb_root)); + DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); +} +#endif + +#ifdef CONFIG_PREEMPT_RT +/* Mutexes */ +void __mutex_rt_init(struct mutex *mutex, const char *name, + struct lock_class_key *key) +{ + debug_check_no_locks_freed((void *)mutex, sizeof(*mutex)); + lockdep_init_map_wait(&mutex->dep_map, name, key, 0, LD_WAIT_SLEEP); +} +EXPORT_SYMBOL(__mutex_rt_init); + +static __always_inline int __mutex_lock_common(struct mutex *lock, + unsigned int state, + unsigned int subclass, + struct lockdep_map *nest_lock, + unsigned long ip) +{ + int ret; + + might_sleep(); + mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); + ret = __rt_mutex_lock(&lock->rtmutex, state); + if (ret) + mutex_release(&lock->dep_map, ip); + else + lock_acquired(&lock->dep_map, ip); + return ret; +} + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void __sched mutex_lock_nested(struct mutex *lock, unsigned int subclass) +{ + __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_); +} +EXPORT_SYMBOL_GPL(mutex_lock_nested); + +void __sched _mutex_lock_nest_lock(struct mutex *lock, + struct lockdep_map *nest_lock) +{ + __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, nest_lock, _RET_IP_); +} +EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); + +int __sched mutex_lock_interruptible_nested(struct mutex *lock, + unsigned int subclass) +{ + return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass, NULL, _RET_IP_); +} +EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); + +int __sched mutex_lock_killable_nested(struct mutex *lock, + unsigned int subclass) +{ + return __mutex_lock_common(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_); +} +EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); + +void __sched mutex_lock_io_nested(struct mutex *lock, 
unsigned int subclass) +{ + int token; + + might_sleep(); + + token = io_schedule_prepare(); + __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_); + io_schedule_finish(token); +} +EXPORT_SYMBOL_GPL(mutex_lock_io_nested); + +#else /* CONFIG_DEBUG_LOCK_ALLOC */ + +void __sched mutex_lock(struct mutex *lock) +{ + __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_); +} +EXPORT_SYMBOL(mutex_lock); + +int __sched mutex_lock_interruptible(struct mutex *lock) +{ + return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_); +} +EXPORT_SYMBOL(mutex_lock_interruptible); + +int __sched mutex_lock_killable(struct mutex *lock) +{ + return __mutex_lock_common(lock, TASK_KILLABLE, 0, NULL, _RET_IP_); +} +EXPORT_SYMBOL(mutex_lock_killable); + +void __sched mutex_lock_io(struct mutex *lock) +{ + int token = io_schedule_prepare(); + + __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_); + io_schedule_finish(token); +} +EXPORT_SYMBOL(mutex_lock_io); +#endif /* !CONFIG_DEBUG_LOCK_ALLOC */ + +int __sched mutex_trylock(struct mutex *lock) +{ + int ret; + + if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) && WARN_ON_ONCE(!in_task())) + return 0; + + ret = __rt_mutex_trylock(&lock->rtmutex); + if (ret) + mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); + + return ret; +} +EXPORT_SYMBOL(mutex_trylock); + +void __sched mutex_unlock(struct mutex *lock) +{ + mutex_release(&lock->dep_map, _RET_IP_); + __rt_mutex_unlock(&lock->rtmutex); +} +EXPORT_SYMBOL(mutex_unlock); + +#endif /* CONFIG_PREEMPT_RT */ diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index a90c22abdbca..c47e8361bfb5 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -25,29 +25,90 @@ * @pi_tree_entry: pi node to enqueue into the mutex owner waiters tree * @task: task reference to the blocked task * @lock: Pointer to the rt_mutex on which the waiter blocks + * @wake_state: Wakeup state to use (TASK_NORMAL or 
TASK_RTLOCK_WAIT) * @prio: Priority of the waiter * @deadline: Deadline of the waiter if applicable + * @ww_ctx: WW context pointer */ struct rt_mutex_waiter { struct rb_node tree_entry; struct rb_node pi_tree_entry; struct task_struct *task; - struct rt_mutex *lock; + struct rt_mutex_base *lock; + unsigned int wake_state; int prio; u64 deadline; + struct ww_acquire_ctx *ww_ctx; }; +/** + * rt_wake_q_head - Wrapper around regular wake_q_head to support + * "sleeping" spinlocks on RT + * @head: The regular wake_q_head for sleeping lock variants + * @rtlock_task: Task pointer for RT lock (spin/rwlock) wakeups + */ +struct rt_wake_q_head { + struct wake_q_head head; + struct task_struct *rtlock_task; +}; + +#define DEFINE_RT_WAKE_Q(name) \ + struct rt_wake_q_head name = { \ + .head = WAKE_Q_HEAD_INITIALIZER(name.head), \ + .rtlock_task = NULL, \ + } + +/* + * PI-futex support (proxy locking functions, etc.): + */ +extern void rt_mutex_init_proxy_locked(struct rt_mutex_base *lock, + struct task_struct *proxy_owner); +extern void rt_mutex_proxy_unlock(struct rt_mutex_base *lock); +extern int __rt_mutex_start_proxy_lock(struct rt_mutex_base *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *task); +extern int rt_mutex_start_proxy_lock(struct rt_mutex_base *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *task); +extern int rt_mutex_wait_proxy_lock(struct rt_mutex_base *lock, + struct hrtimer_sleeper *to, + struct rt_mutex_waiter *waiter); +extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex_base *lock, + struct rt_mutex_waiter *waiter); + +extern int rt_mutex_futex_trylock(struct rt_mutex_base *l); +extern int __rt_mutex_futex_trylock(struct rt_mutex_base *l); + +extern void rt_mutex_futex_unlock(struct rt_mutex_base *lock); +extern bool __rt_mutex_futex_unlock(struct rt_mutex_base *lock, + struct rt_wake_q_head *wqh); + +extern void rt_mutex_postunlock(struct rt_wake_q_head *wqh); + /* * Must be guarded because this header is included from 
rcu/tree_plugin.h * unconditionally. */ #ifdef CONFIG_RT_MUTEXES -static inline int rt_mutex_has_waiters(struct rt_mutex *lock) +static inline int rt_mutex_has_waiters(struct rt_mutex_base *lock) { return !RB_EMPTY_ROOT(&lock->waiters.rb_root); } -static inline struct rt_mutex_waiter *rt_mutex_top_waiter(struct rt_mutex *lock) +/* + * Lockless speculative check whether @waiter is still the top waiter on + * @lock. This is solely comparing pointers and not dereferencing the + * leftmost entry which might be about to vanish. + */ +static inline bool rt_mutex_waiter_is_top_waiter(struct rt_mutex_base *lock, + struct rt_mutex_waiter *waiter) +{ + struct rb_node *leftmost = rb_first_cached(&lock->waiters); + + return rb_entry(leftmost, struct rt_mutex_waiter, tree_entry) == waiter; +} + +static inline struct rt_mutex_waiter *rt_mutex_top_waiter(struct rt_mutex_base *lock) { struct rb_node *leftmost = rb_first_cached(&lock->waiters); struct rt_mutex_waiter *w = NULL; @@ -72,19 +133,12 @@ static inline struct rt_mutex_waiter *task_top_pi_waiter(struct task_struct *p) #define RT_MUTEX_HAS_WAITERS 1UL -static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) +static inline struct task_struct *rt_mutex_owner(struct rt_mutex_base *lock) { unsigned long owner = (unsigned long) READ_ONCE(lock->owner); return (struct task_struct *) (owner & ~RT_MUTEX_HAS_WAITERS); } -#else /* CONFIG_RT_MUTEXES */ -/* Used in rcu/tree_plugin.h */ -static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) -{ - return NULL; -} -#endif /* !CONFIG_RT_MUTEXES */ /* * Constants for rt mutex functions which have a selectable deadlock @@ -101,49 +155,21 @@ enum rtmutex_chainwalk { RT_MUTEX_FULL_CHAINWALK, }; -static inline void __rt_mutex_basic_init(struct rt_mutex *lock) +static inline void __rt_mutex_base_init(struct rt_mutex_base *lock) { - lock->owner = NULL; raw_spin_lock_init(&lock->wait_lock); lock->waiters = RB_ROOT_CACHED; + lock->owner = NULL; } -/* - * PI-futex
support (proxy locking functions, etc.): - */ -extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, - struct task_struct *proxy_owner); -extern void rt_mutex_proxy_unlock(struct rt_mutex *lock); -extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter); -extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, - struct rt_mutex_waiter *waiter, - struct task_struct *task); -extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock, - struct rt_mutex_waiter *waiter, - struct task_struct *task); -extern int rt_mutex_wait_proxy_lock(struct rt_mutex *lock, - struct hrtimer_sleeper *to, - struct rt_mutex_waiter *waiter); -extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock, - struct rt_mutex_waiter *waiter); - -extern int rt_mutex_futex_trylock(struct rt_mutex *l); -extern int __rt_mutex_futex_trylock(struct rt_mutex *l); - -extern void rt_mutex_futex_unlock(struct rt_mutex *lock); -extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock, - struct wake_q_head *wqh); - -extern void rt_mutex_postunlock(struct wake_q_head *wake_q); - /* Debug functions */ -static inline void debug_rt_mutex_unlock(struct rt_mutex *lock) +static inline void debug_rt_mutex_unlock(struct rt_mutex_base *lock) { if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES)) DEBUG_LOCKS_WARN_ON(rt_mutex_owner(lock) != current); } -static inline void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock) +static inline void debug_rt_mutex_proxy_unlock(struct rt_mutex_base *lock) { if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES)) DEBUG_LOCKS_WARN_ON(!rt_mutex_owner(lock)); @@ -161,4 +187,27 @@ static inline void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) memset(waiter, 0x22, sizeof(*waiter)); } +static inline void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) +{ + debug_rt_mutex_init_waiter(waiter); + RB_CLEAR_NODE(&waiter->pi_tree_entry); + RB_CLEAR_NODE(&waiter->tree_entry); + waiter->wake_state = TASK_NORMAL; + waiter->task = NULL; +} + +static inline void 
rt_mutex_init_rtlock_waiter(struct rt_mutex_waiter *waiter) +{ + rt_mutex_init_waiter(waiter); + waiter->wake_state = TASK_RTLOCK_WAIT; +} + +#else /* CONFIG_RT_MUTEXES */ +/* Used in rcu/tree_plugin.h */ +static inline struct task_struct *rt_mutex_owner(struct rt_mutex_base *lock) +{ + return NULL; +} +#endif /* !CONFIG_RT_MUTEXES */ + #endif diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c new file mode 100644 index 000000000000..4ba15088e640 --- /dev/null +++ b/kernel/locking/rwbase_rt.c @@ -0,0 +1,263 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* + * RT-specific reader/writer semaphores and reader/writer locks + * + * down_write/write_lock() + * 1) Lock rtmutex + * 2) Remove the reader BIAS to force readers into the slow path + * 3) Wait until all readers have left the critical section + * 4) Mark it write locked + * + * up_write/write_unlock() + * 1) Remove the write locked marker + * 2) Set the reader BIAS, so readers can use the fast path again + * 3) Unlock rtmutex, to release blocked readers + * + * down_read/read_lock() + * 1) Try fast path acquisition (reader BIAS is set) + * 2) Take rtmutex::wait_lock, which protects the writelocked flag + * 3) If !writelocked, acquire it for read + * 4) If writelocked, block on rtmutex + * 5) unlock rtmutex, goto 1) + * + * up_read/read_unlock() + * 1) Try fast path release (reader count != 1) + * 2) Wake the writer waiting in down_write()/write_lock() #3 + * + * down_read/read_lock()#3 has the consequence, that rw semaphores and rw + * locks on RT are not writer fair, but writers, which should be avoided in + * RT tasks (think mmap_sem), are subject to the rtmutex priority/DL + * inheritance mechanism. + * + * It's possible to make the rw primitives writer fair by keeping a list of + * active readers. A blocked writer would force all newly incoming readers + * to block on the rtmutex, but the rtmutex would have to be proxy locked + * for one reader after the other.
We can't use multi-reader inheritance + * because there is no way to support that with SCHED_DEADLINE. + * Implementing the one by one reader boosting/handover mechanism is a + * major surgery for a very dubious value. + * + * The risk of writer starvation is there, but the pathological use cases + * which trigger it are not necessarily the typical RT workloads. + * + * Common code shared between RT rw_semaphore and rwlock + */ + +static __always_inline int rwbase_read_trylock(struct rwbase_rt *rwb) +{ + int r; + + /* + * Increment reader count, if sem->readers < 0, i.e. READER_BIAS is + * set. + */ + for (r = atomic_read(&rwb->readers); r < 0;) { + if (likely(atomic_try_cmpxchg(&rwb->readers, &r, r + 1))) + return 1; + } + return 0; +} + +static int __sched __rwbase_read_lock(struct rwbase_rt *rwb, + unsigned int state) +{ + struct rt_mutex_base *rtm = &rwb->rtmutex; + int ret; + + raw_spin_lock_irq(&rtm->wait_lock); + /* + * Allow readers, as long as the writer has not completely + * acquired the semaphore for write. + */ + if (atomic_read(&rwb->readers) != WRITER_BIAS) { + atomic_inc(&rwb->readers); + raw_spin_unlock_irq(&rtm->wait_lock); + return 0; + } + + /* + * Call into the slow lock path with the rtmutex->wait_lock + * held, so this can't result in the following race: + * + * Reader1 Reader2 Writer + * down_read() + * down_write() + * rtmutex_lock(m) + * wait() + * down_read() + * unlock(m->wait_lock) + * up_read() + * wake(Writer) + * lock(m->wait_lock) + * sem->writelocked=true + * unlock(m->wait_lock) + * + * up_write() + * sem->writelocked=false + * rtmutex_unlock(m) + * down_read() + * down_write() + * rtmutex_lock(m) + * wait() + * rtmutex_lock(m) + * + * That would put Reader1 behind the writer waiting on + * Reader2 to call up_read(), which might be unbound. + */ + + /* + * For rwlocks this returns 0 unconditionally, so the below + * !ret conditionals are optimized out. 
+ */ + ret = rwbase_rtmutex_slowlock_locked(rtm, state); + + /* + * On success the rtmutex is held, so there can't be a writer + * active. Increment the reader count and immediately drop the + * rtmutex again. + * + * rtmutex->wait_lock has to be unlocked in any case of course. + */ + if (!ret) + atomic_inc(&rwb->readers); + raw_spin_unlock_irq(&rtm->wait_lock); + if (!ret) + rwbase_rtmutex_unlock(rtm); + return ret; +} + +static __always_inline int rwbase_read_lock(struct rwbase_rt *rwb, + unsigned int state) +{ + if (rwbase_read_trylock(rwb)) + return 0; + + return __rwbase_read_lock(rwb, state); +} + +static void __sched __rwbase_read_unlock(struct rwbase_rt *rwb, + unsigned int state) +{ + struct rt_mutex_base *rtm = &rwb->rtmutex; + struct task_struct *owner; + + raw_spin_lock_irq(&rtm->wait_lock); + /* + * Wake the writer, i.e. the rtmutex owner. It might release the + * rtmutex concurrently in the fast path (due to a signal), but to + * clean up rwb->readers it needs to acquire rtm->wait_lock. The + * worst case which can happen is a spurious wakeup. + */ + owner = rt_mutex_owner(rtm); + if (owner) + wake_up_state(owner, state); + + raw_spin_unlock_irq(&rtm->wait_lock); +} + +static __always_inline void rwbase_read_unlock(struct rwbase_rt *rwb, + unsigned int state) +{ + /* + * rwb->readers can only hit 0 when a writer is waiting for the + * active readers to leave the critical section. 
+ */ + if (unlikely(atomic_dec_and_test(&rwb->readers))) + __rwbase_read_unlock(rwb, state); +} + +static inline void __rwbase_write_unlock(struct rwbase_rt *rwb, int bias, + unsigned long flags) +{ + struct rt_mutex_base *rtm = &rwb->rtmutex; + + (void)atomic_add_return_release(READER_BIAS - bias, &rwb->readers); /* RELEASE: a fast-path reader in rwbase_read_trylock() takes no lock, so the writer's stores must be ordered before READER_BIAS becomes visible */ + raw_spin_unlock_irqrestore(&rtm->wait_lock, flags); + rwbase_rtmutex_unlock(rtm); +} + +static inline void rwbase_write_unlock(struct rwbase_rt *rwb) +{ + struct rt_mutex_base *rtm = &rwb->rtmutex; + unsigned long flags; + + raw_spin_lock_irqsave(&rtm->wait_lock, flags); + __rwbase_write_unlock(rwb, WRITER_BIAS, flags); +} + +static inline void rwbase_write_downgrade(struct rwbase_rt *rwb) +{ + struct rt_mutex_base *rtm = &rwb->rtmutex; + unsigned long flags; + + raw_spin_lock_irqsave(&rtm->wait_lock, flags); + /* Release it and account current as reader */ + __rwbase_write_unlock(rwb, WRITER_BIAS - 1, flags); +} + +static int __sched rwbase_write_lock(struct rwbase_rt *rwb, + unsigned int state) +{ + struct rt_mutex_base *rtm = &rwb->rtmutex; + unsigned long flags; + + /* Take the rtmutex as a first step */ + if (rwbase_rtmutex_lock_state(rtm, state)) + return -EINTR; + + /* Force readers into slow path */ + atomic_sub(READER_BIAS, &rwb->readers); + + raw_spin_lock_irqsave(&rtm->wait_lock, flags); + /* + * set_current_state() for rw_semaphore + * current_save_and_set_rtlock_wait_state() for rwlock + */ + rwbase_set_and_save_current_state(state); + + /* Block until all readers have left the critical section. */ + for (; atomic_read_acquire(&rwb->readers);) { /* ACQUIRE: pairs with atomic_dec_and_test() in rwbase_read_unlock(), which readers may execute without taking wait_lock */ + /* Optimized out for rwlocks */ + if (rwbase_signal_pending_state(state, current)) { + __set_current_state(TASK_RUNNING); + __rwbase_write_unlock(rwb, 0, flags); + return -EINTR; + } + raw_spin_unlock_irqrestore(&rtm->wait_lock, flags); + + /* + * Schedule and wait for the readers to leave the critical + * section. The last reader leaving it wakes the waiter.
+ */ + if (atomic_read(&rwb->readers) != 0) + rwbase_schedule(); + set_current_state(state); + raw_spin_lock_irqsave(&rtm->wait_lock, flags); + } + + atomic_set(&rwb->readers, WRITER_BIAS); + rwbase_restore_current_state(); + raw_spin_unlock_irqrestore(&rtm->wait_lock, flags); + return 0; +} + +static inline int rwbase_write_trylock(struct rwbase_rt *rwb) +{ + struct rt_mutex_base *rtm = &rwb->rtmutex; + unsigned long flags; + + if (!rwbase_rtmutex_trylock(rtm)) + return 0; + + atomic_sub(READER_BIAS, &rwb->readers); + + raw_spin_lock_irqsave(&rtm->wait_lock, flags); + if (!atomic_read_acquire(&rwb->readers)) { /* ACQUIRE: pairs with atomic_dec_and_test() in rwbase_read_unlock() */ + atomic_set(&rwb->readers, WRITER_BIAS); + raw_spin_unlock_irqrestore(&rtm->wait_lock, flags); + return 1; + } + __rwbase_write_unlock(rwb, 0, flags); + return 0; +} diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 16bfbb10c74d..9215b4d6a9de 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -28,6 +28,7 @@ #include <linux/rwsem.h> #include <linux/atomic.h> +#ifndef CONFIG_PREEMPT_RT #include "lock_events.h" /* @@ -1165,7 +1166,7 @@ out_nolock: * handle waking up a waiter on the semaphore * - up_read/up_write has decremented the active part of count if we come here */ -static struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem, long count) +static struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) { unsigned long flags; DEFINE_WAKE_Q(wake_q); @@ -1297,7 +1298,7 @@ static inline void __up_read(struct rw_semaphore *sem) if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) == RWSEM_FLAG_WAITERS)) { clear_nonspinnable(sem); - rwsem_wake(sem, tmp); + rwsem_wake(sem); } } @@ -1319,7 +1320,7 @@ static inline void __up_write(struct rw_semaphore *sem) rwsem_clear_owner(sem); tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count); if (unlikely(tmp & RWSEM_FLAG_WAITERS)) - rwsem_wake(sem, tmp); + rwsem_wake(sem); } /* @@ -1344,6 +1345,114 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
rwsem_downgrade_wake(sem); } +#else /* !CONFIG_PREEMPT_RT */ + +#define RT_MUTEX_BUILD_MUTEX +#include "rtmutex.c" + +#define rwbase_set_and_save_current_state(state) \ + set_current_state(state) + +#define rwbase_restore_current_state() \ + __set_current_state(TASK_RUNNING) + +#define rwbase_rtmutex_lock_state(rtm, state) \ + __rt_mutex_lock(rtm, state) + +#define rwbase_rtmutex_slowlock_locked(rtm, state) \ + __rt_mutex_slowlock_locked(rtm, NULL, state) + +#define rwbase_rtmutex_unlock(rtm) \ + __rt_mutex_unlock(rtm) + +#define rwbase_rtmutex_trylock(rtm) \ + __rt_mutex_trylock(rtm) + +#define rwbase_signal_pending_state(state, current) \ + signal_pending_state(state, current) + +#define rwbase_schedule() \ + schedule() + +#include "rwbase_rt.c" + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void __rwsem_init(struct rw_semaphore *sem, const char *name, + struct lock_class_key *key) +{ + debug_check_no_locks_freed((void *)sem, sizeof(*sem)); + lockdep_init_map_wait(&sem->dep_map, name, key, 0, LD_WAIT_SLEEP); +} +EXPORT_SYMBOL(__rwsem_init); +#endif + +static inline void __down_read(struct rw_semaphore *sem) +{ + rwbase_read_lock(&sem->rwbase, TASK_UNINTERRUPTIBLE); +} + +static inline int __down_read_interruptible(struct rw_semaphore *sem) +{ + return rwbase_read_lock(&sem->rwbase, TASK_INTERRUPTIBLE); +} + +static inline int __down_read_killable(struct rw_semaphore *sem) +{ + return rwbase_read_lock(&sem->rwbase, TASK_KILLABLE); +} + +static inline int __down_read_trylock(struct rw_semaphore *sem) +{ + return rwbase_read_trylock(&sem->rwbase); +} + +static inline void __up_read(struct rw_semaphore *sem) +{ + rwbase_read_unlock(&sem->rwbase, TASK_NORMAL); +} + +static inline void __sched __down_write(struct rw_semaphore *sem) +{ + rwbase_write_lock(&sem->rwbase, TASK_UNINTERRUPTIBLE); +} + +static inline int __sched __down_write_killable(struct rw_semaphore *sem) +{ + return rwbase_write_lock(&sem->rwbase, TASK_KILLABLE); +} + +static inline int __down_write_trylock(struct 
rw_semaphore *sem) +{ + return rwbase_write_trylock(&sem->rwbase); +} + +static inline void __up_write(struct rw_semaphore *sem) +{ + rwbase_write_unlock(&sem->rwbase); +} + +static inline void __downgrade_write(struct rw_semaphore *sem) +{ + rwbase_write_downgrade(&sem->rwbase); +} + +/* Debug stubs for the common API */ +#define DEBUG_RWSEMS_WARN_ON(c, sem) + +static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem, + struct task_struct *owner) +{ +} + +static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem) +{ + int count = atomic_read(&sem->rwbase.readers); + + return count < 0 && count != READER_BIAS; +} + +#endif /* CONFIG_PREEMPT_RT */ + /* * lock for reading */ diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c index 9aa855a96c4a..9ee381e4d2a4 100644 --- a/kernel/locking/semaphore.c +++ b/kernel/locking/semaphore.c @@ -54,6 +54,7 @@ void down(struct semaphore *sem) { unsigned long flags; + might_sleep(); raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) sem->count--; @@ -77,6 +78,7 @@ int down_interruptible(struct semaphore *sem) unsigned long flags; int result = 0; + might_sleep(); raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) sem->count--; @@ -103,6 +105,7 @@ int down_killable(struct semaphore *sem) unsigned long flags; int result = 0; + might_sleep(); raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) sem->count--; @@ -157,6 +160,7 @@ int down_timeout(struct semaphore *sem, long timeout) unsigned long flags; int result = 0; + might_sleep(); raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) sem->count--; diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index c8d7ad9fb9b2..c5830cfa379a 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -124,8 +124,11 @@ void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \ * __[spin|read|write]_lock_bh() */ BUILD_LOCK_OPS(spin, raw_spinlock); + +#ifndef 
CONFIG_PREEMPT_RT BUILD_LOCK_OPS(read, rwlock); BUILD_LOCK_OPS(write, rwlock); +#endif #endif @@ -209,6 +212,8 @@ void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock) EXPORT_SYMBOL(_raw_spin_unlock_bh); #endif +#ifndef CONFIG_PREEMPT_RT + #ifndef CONFIG_INLINE_READ_TRYLOCK int __lockfunc _raw_read_trylock(rwlock_t *lock) { @@ -353,6 +358,8 @@ void __lockfunc _raw_write_unlock_bh(rwlock_t *lock) EXPORT_SYMBOL(_raw_write_unlock_bh); #endif +#endif /* !CONFIG_PREEMPT_RT */ + #ifdef CONFIG_DEBUG_LOCK_ALLOC void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass) diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c index b9d93087ee66..14235671a1a7 100644 --- a/kernel/locking/spinlock_debug.c +++ b/kernel/locking/spinlock_debug.c @@ -31,6 +31,7 @@ void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name, EXPORT_SYMBOL(__raw_spin_lock_init); +#ifndef CONFIG_PREEMPT_RT void __rwlock_init(rwlock_t *lock, const char *name, struct lock_class_key *key) { @@ -48,6 +49,7 @@ void __rwlock_init(rwlock_t *lock, const char *name, } EXPORT_SYMBOL(__rwlock_init); +#endif static void spin_dump(raw_spinlock_t *lock, const char *msg) { @@ -139,6 +141,7 @@ void do_raw_spin_unlock(raw_spinlock_t *lock) arch_spin_unlock(&lock->raw_lock); } +#ifndef CONFIG_PREEMPT_RT static void rwlock_bug(rwlock_t *lock, const char *msg) { if (!debug_locks_off()) @@ -228,3 +231,5 @@ void do_raw_write_unlock(rwlock_t *lock) debug_write_unlock(lock); arch_write_unlock(&lock->raw_lock); } + +#endif /* !CONFIG_PREEMPT_RT */ diff --git a/kernel/locking/spinlock_rt.c b/kernel/locking/spinlock_rt.c new file mode 100644 index 000000000000..d2912e44d61f --- /dev/null +++ b/kernel/locking/spinlock_rt.c @@ -0,0 +1,263 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * PREEMPT_RT substitution for spin/rw_locks + * + * spinlocks and rwlocks on RT are based on rtmutexes, with a few twists to + * resemble the non RT semantics: + * + * - Contrary to plain rtmutexes, 
spinlocks and rwlocks are state + * preserving. The task state is saved before blocking on the underlying + * rtmutex, and restored when the lock has been acquired. Regular wakeups + * during that time are redirected to the saved state so no wake up is + * missed. + * + * - Non RT spin/rwlocks disable preemption and eventually interrupts. + * Disabling preemption has the side effect of disabling migration and + * preventing RCU grace periods. + * + * The RT substitutions explicitly disable migration and take + * rcu_read_lock() across the lock held section. + */ +#include <linux/spinlock.h> +#include <linux/export.h> + +#define RT_MUTEX_BUILD_SPINLOCKS +#include "rtmutex.c" + +static __always_inline void rtlock_lock(struct rt_mutex_base *rtm) +{ + if (unlikely(!rt_mutex_cmpxchg_acquire(rtm, NULL, current))) + rtlock_slowlock(rtm); +} + +static __always_inline void __rt_spin_lock(spinlock_t *lock) +{ + ___might_sleep(__FILE__, __LINE__, 0); + rtlock_lock(&lock->lock); + rcu_read_lock(); + migrate_disable(); +} + +void __sched rt_spin_lock(spinlock_t *lock) +{ + spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); + __rt_spin_lock(lock); +} +EXPORT_SYMBOL(rt_spin_lock); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void __sched rt_spin_lock_nested(spinlock_t *lock, int subclass) +{ + spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + __rt_spin_lock(lock); +} +EXPORT_SYMBOL(rt_spin_lock_nested); + +void __sched rt_spin_lock_nest_lock(spinlock_t *lock, + struct lockdep_map *nest_lock) +{ + spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_); + __rt_spin_lock(lock); +} +EXPORT_SYMBOL(rt_spin_lock_nest_lock); +#endif + +void __sched rt_spin_unlock(spinlock_t *lock) +{ + spin_release(&lock->dep_map, _RET_IP_); + migrate_enable(); + rcu_read_unlock(); + + if (unlikely(!rt_mutex_cmpxchg_release(&lock->lock, current, NULL))) + rt_mutex_slowunlock(&lock->lock); +} +EXPORT_SYMBOL(rt_spin_unlock); + +/* + * Wait for the lock to get unlocked: instead of polling for an unlock + * 
(like raw spinlocks do), lock and unlock, to force the kernel to + * schedule if there's contention: + */ +void __sched rt_spin_lock_unlock(spinlock_t *lock) +{ + spin_lock(lock); + spin_unlock(lock); +} +EXPORT_SYMBOL(rt_spin_lock_unlock); + +static __always_inline int __rt_spin_trylock(spinlock_t *lock) +{ + int ret = 1; + + if (unlikely(!rt_mutex_cmpxchg_acquire(&lock->lock, NULL, current))) + ret = rt_mutex_slowtrylock(&lock->lock); + + if (ret) { + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); + rcu_read_lock(); + migrate_disable(); + } + return ret; +} + +int __sched rt_spin_trylock(spinlock_t *lock) +{ + return __rt_spin_trylock(lock); +} +EXPORT_SYMBOL(rt_spin_trylock); + +int __sched rt_spin_trylock_bh(spinlock_t *lock) +{ + int ret; + + local_bh_disable(); + ret = __rt_spin_trylock(lock); + if (!ret) + local_bh_enable(); + return ret; +} +EXPORT_SYMBOL(rt_spin_trylock_bh); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void __rt_spin_lock_init(spinlock_t *lock, const char *name, + struct lock_class_key *key, bool percpu) +{ + u8 type = percpu ? 
LD_LOCK_PERCPU : LD_LOCK_NORMAL; + + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); + lockdep_init_map_type(&lock->dep_map, name, key, 0, LD_WAIT_CONFIG, + LD_WAIT_INV, type); +} +EXPORT_SYMBOL(__rt_spin_lock_init); +#endif + +/* + * RT-specific reader/writer locks + */ +#define rwbase_set_and_save_current_state(state) \ + current_save_and_set_rtlock_wait_state() + +#define rwbase_restore_current_state() \ + current_restore_rtlock_saved_state() + +static __always_inline int +rwbase_rtmutex_lock_state(struct rt_mutex_base *rtm, unsigned int state) +{ + if (unlikely(!rt_mutex_cmpxchg_acquire(rtm, NULL, current))) + rtlock_slowlock(rtm); + return 0; +} + +static __always_inline int +rwbase_rtmutex_slowlock_locked(struct rt_mutex_base *rtm, unsigned int state) +{ + rtlock_slowlock_locked(rtm); + return 0; +} + +static __always_inline void rwbase_rtmutex_unlock(struct rt_mutex_base *rtm) +{ + if (likely(rt_mutex_cmpxchg_acquire(rtm, current, NULL))) + return; + + rt_mutex_slowunlock(rtm); +} + +static __always_inline int rwbase_rtmutex_trylock(struct rt_mutex_base *rtm) +{ + if (likely(rt_mutex_cmpxchg_acquire(rtm, NULL, current))) + return 1; + + return rt_mutex_slowtrylock(rtm); +} + +#define rwbase_signal_pending_state(state, current) (0) + +#define rwbase_schedule() \ + schedule_rtlock() + +#include "rwbase_rt.c" +/* + * The common functions which get wrapped into the rwlock API. 
+ */ +int __sched rt_read_trylock(rwlock_t *rwlock) +{ + int ret; + + ret = rwbase_read_trylock(&rwlock->rwbase); + if (ret) { + rwlock_acquire_read(&rwlock->dep_map, 0, 1, _RET_IP_); + rcu_read_lock(); + migrate_disable(); + } + return ret; +} +EXPORT_SYMBOL(rt_read_trylock); + +int __sched rt_write_trylock(rwlock_t *rwlock) +{ + int ret; + + ret = rwbase_write_trylock(&rwlock->rwbase); + if (ret) { + rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_); + rcu_read_lock(); + migrate_disable(); + } + return ret; +} +EXPORT_SYMBOL(rt_write_trylock); + +void __sched rt_read_lock(rwlock_t *rwlock) +{ + ___might_sleep(__FILE__, __LINE__, 0); + rwlock_acquire_read(&rwlock->dep_map, 0, 0, _RET_IP_); + rwbase_read_lock(&rwlock->rwbase, TASK_RTLOCK_WAIT); + rcu_read_lock(); + migrate_disable(); +} +EXPORT_SYMBOL(rt_read_lock); + +void __sched rt_write_lock(rwlock_t *rwlock) +{ + ___might_sleep(__FILE__, __LINE__, 0); + rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_); + rwbase_write_lock(&rwlock->rwbase, TASK_RTLOCK_WAIT); + rcu_read_lock(); + migrate_disable(); +} +EXPORT_SYMBOL(rt_write_lock); + +void __sched rt_read_unlock(rwlock_t *rwlock) +{ + rwlock_release(&rwlock->dep_map, _RET_IP_); + migrate_enable(); + rcu_read_unlock(); + rwbase_read_unlock(&rwlock->rwbase, TASK_RTLOCK_WAIT); +} +EXPORT_SYMBOL(rt_read_unlock); + +void __sched rt_write_unlock(rwlock_t *rwlock) +{ + rwlock_release(&rwlock->dep_map, _RET_IP_); + rcu_read_unlock(); + migrate_enable(); + rwbase_write_unlock(&rwlock->rwbase); +} +EXPORT_SYMBOL(rt_write_unlock); + +int __sched rt_rwlock_is_contended(rwlock_t *rwlock) +{ + return rw_base_is_contended(&rwlock->rwbase); +} +EXPORT_SYMBOL(rt_rwlock_is_contended); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void __rt_rwlock_init(rwlock_t *rwlock, const char *name, + struct lock_class_key *key) +{ + debug_check_no_locks_freed((void *)rwlock, sizeof(*rwlock)); + lockdep_init_map_wait(&rwlock->dep_map, name, key, 0, LD_WAIT_CONFIG); +} +EXPORT_SYMBOL(__rt_rwlock_init); 
+#endif diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h new file mode 100644 index 000000000000..56f139201f24 --- /dev/null +++ b/kernel/locking/ww_mutex.h @@ -0,0 +1,569 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef WW_RT + +#define MUTEX mutex +#define MUTEX_WAITER mutex_waiter + +static inline struct mutex_waiter * +__ww_waiter_first(struct mutex *lock) +{ + struct mutex_waiter *w; + + w = list_first_entry(&lock->wait_list, struct mutex_waiter, list); + if (list_entry_is_head(w, &lock->wait_list, list)) + return NULL; + + return w; +} + +static inline struct mutex_waiter * +__ww_waiter_next(struct mutex *lock, struct mutex_waiter *w) +{ + w = list_next_entry(w, list); + if (list_entry_is_head(w, &lock->wait_list, list)) + return NULL; + + return w; +} + +static inline struct mutex_waiter * +__ww_waiter_prev(struct mutex *lock, struct mutex_waiter *w) +{ + w = list_prev_entry(w, list); + if (list_entry_is_head(w, &lock->wait_list, list)) + return NULL; + + return w; +} + +static inline struct mutex_waiter * +__ww_waiter_last(struct mutex *lock) +{ + struct mutex_waiter *w; + + w = list_last_entry(&lock->wait_list, struct mutex_waiter, list); + if (list_entry_is_head(w, &lock->wait_list, list)) + return NULL; + + return w; +} + +static inline void +__ww_waiter_add(struct mutex *lock, struct mutex_waiter *waiter, struct mutex_waiter *pos) +{ + struct list_head *p = &lock->wait_list; + if (pos) + p = &pos->list; + __mutex_add_waiter(lock, waiter, p); +} + +static inline struct task_struct * +__ww_mutex_owner(struct mutex *lock) +{ + return __mutex_owner(lock); +} + +static inline bool +__ww_mutex_has_waiters(struct mutex *lock) +{ + return atomic_long_read(&lock->owner) & MUTEX_FLAG_WAITERS; +} + +static inline void lock_wait_lock(struct mutex *lock) +{ + raw_spin_lock(&lock->wait_lock); +} + +static inline void unlock_wait_lock(struct mutex *lock) +{ + raw_spin_unlock(&lock->wait_lock); +} + +static inline void 
lockdep_assert_wait_lock_held(struct mutex *lock) +{ + lockdep_assert_held(&lock->wait_lock); +} + +#else /* WW_RT */ + +#define MUTEX rt_mutex +#define MUTEX_WAITER rt_mutex_waiter + +static inline struct rt_mutex_waiter * +__ww_waiter_first(struct rt_mutex *lock) +{ + struct rb_node *n = rb_first(&lock->rtmutex.waiters.rb_root); + if (!n) + return NULL; + return rb_entry(n, struct rt_mutex_waiter, tree_entry); +} + +static inline struct rt_mutex_waiter * +__ww_waiter_next(struct rt_mutex *lock, struct rt_mutex_waiter *w) +{ + struct rb_node *n = rb_next(&w->tree_entry); + if (!n) + return NULL; + return rb_entry(n, struct rt_mutex_waiter, tree_entry); +} + +static inline struct rt_mutex_waiter * +__ww_waiter_prev(struct rt_mutex *lock, struct rt_mutex_waiter *w) +{ + struct rb_node *n = rb_prev(&w->tree_entry); + if (!n) + return NULL; + return rb_entry(n, struct rt_mutex_waiter, tree_entry); +} + +static inline struct rt_mutex_waiter * +__ww_waiter_last(struct rt_mutex *lock) +{ + struct rb_node *n = rb_last(&lock->rtmutex.waiters.rb_root); + if (!n) + return NULL; + return rb_entry(n, struct rt_mutex_waiter, tree_entry); +} + +static inline void +__ww_waiter_add(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, struct rt_mutex_waiter *pos) +{ + /* RT unconditionally adds the waiter first and then removes it on error */ +} + +static inline struct task_struct * +__ww_mutex_owner(struct rt_mutex *lock) +{ + return rt_mutex_owner(&lock->rtmutex); +} + +static inline bool +__ww_mutex_has_waiters(struct rt_mutex *lock) +{ + return rt_mutex_has_waiters(&lock->rtmutex); +} + +static inline void lock_wait_lock(struct rt_mutex *lock) +{ + raw_spin_lock(&lock->rtmutex.wait_lock); +} + +static inline void unlock_wait_lock(struct rt_mutex *lock) +{ + raw_spin_unlock(&lock->rtmutex.wait_lock); +} + +static inline void lockdep_assert_wait_lock_held(struct rt_mutex *lock) +{ + lockdep_assert_held(&lock->rtmutex.wait_lock); +} + +#endif /* WW_RT */ + +/* + * Wait-Die: + * 
The newer transactions are killed when: + * It (the new transaction) makes a request for a lock being held + * by an older transaction. + * + * Wound-Wait: + * The newer transactions are wounded when: + * An older transaction makes a request for a lock being held by + * the newer transaction. + */ + +/* + * Associate the ww_mutex @ww with the context @ww_ctx under which we acquired + * it. + */ +static __always_inline void +ww_mutex_lock_acquired(struct ww_mutex *ww, struct ww_acquire_ctx *ww_ctx) +{ +#ifdef DEBUG_WW_MUTEXES + /* + * If this WARN_ON triggers, you used ww_mutex_lock to acquire, + * but released with a normal mutex_unlock in this call. + * + * This should never happen, always use ww_mutex_unlock. + */ + DEBUG_LOCKS_WARN_ON(ww->ctx); + + /* + * Not quite done after calling ww_acquire_done() ? + */ + DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire); + + if (ww_ctx->contending_lock) { + /* + * After -EDEADLK you tried to + * acquire a different ww_mutex? Bad! + */ + DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww); + + /* + * You called ww_mutex_lock after receiving -EDEADLK, + * but 'forgot' to unlock everything else first? + */ + DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0); + ww_ctx->contending_lock = NULL; + } + + /* + * Naughty, using a different class will lead to undefined behavior! + */ + DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class); +#endif + ww_ctx->acquired++; + ww->ctx = ww_ctx; +} + +/* + * Determine if @a is 'less' than @b. IOW, either @a is a lower priority task + * or, when of equal priority, a younger transaction than @b. + * + * Depending on the algorithm, @a will either need to wait for @b, or die. + */ +static inline bool +__ww_ctx_less(struct ww_acquire_ctx *a, struct ww_acquire_ctx *b) +{ +/* + * Can only do the RT prio for WW_RT, because task->prio isn't stable due to PI, + * so the wait_list ordering will go wobbly. rt_mutex re-queues the waiter and + * isn't affected by this. 
+ */ +#ifdef WW_RT + /* kernel prio; less is more */ + int a_prio = a->task->prio; + int b_prio = b->task->prio; + + if (rt_prio(a_prio) || rt_prio(b_prio)) { + + if (a_prio > b_prio) + return true; + + if (a_prio < b_prio) + return false; + + /* equal static prio */ + + if (dl_prio(a_prio)) { + if (dl_time_before(b->task->dl.deadline, + a->task->dl.deadline)) + return true; + + if (dl_time_before(a->task->dl.deadline, + b->task->dl.deadline)) + return false; + } + + /* equal prio */ + } +#endif + + /* FIFO order tie break -- bigger is younger */ + return (signed long)(a->stamp - b->stamp) > 0; +} + +/* + * Wait-Die; wake a lesser waiter context (when locks held) such that it can + * die. + * + * Among waiters with context, only the first one can have other locks acquired + * already (ctx->acquired > 0), because __ww_mutex_add_waiter() and + * __ww_mutex_check_kill() wake any but the earliest context. + */ +static bool +__ww_mutex_die(struct MUTEX *lock, struct MUTEX_WAITER *waiter, + struct ww_acquire_ctx *ww_ctx) +{ + if (!ww_ctx->is_wait_die) + return false; + + if (waiter->ww_ctx->acquired > 0 && __ww_ctx_less(waiter->ww_ctx, ww_ctx)) { +#ifndef WW_RT + debug_mutex_wake_waiter(lock, waiter); +#endif + wake_up_process(waiter->task); + } + + return true; +} + +/* + * Wound-Wait; wound a lesser @hold_ctx if it holds the lock. + * + * Wound the lock holder if there are waiters with more important transactions + * than the lock holders. Even if multiple waiters may wound the lock holder, + * it's sufficient that only one does. + */ +static bool __ww_mutex_wound(struct MUTEX *lock, + struct ww_acquire_ctx *ww_ctx, + struct ww_acquire_ctx *hold_ctx) +{ + struct task_struct *owner = __ww_mutex_owner(lock); + + lockdep_assert_wait_lock_held(lock); + + /* + * Possible through __ww_mutex_add_waiter() when we race with + * ww_mutex_set_context_fastpath(). In that case we'll get here again + * through __ww_mutex_check_waiters(). 
+ */ + if (!hold_ctx) + return false; + + /* + * Can have !owner because of __mutex_unlock_slowpath(), but if owner, + * it cannot go away because we'll have FLAG_WAITERS set and hold + * wait_lock. + */ + if (!owner) + return false; + + if (ww_ctx->acquired > 0 && __ww_ctx_less(hold_ctx, ww_ctx)) { + hold_ctx->wounded = 1; + + /* + * wake_up_process() paired with set_current_state() + * inserts sufficient barriers to make sure @owner either sees + * it's wounded in __ww_mutex_check_kill() or has a + * wakeup pending to re-read the wounded state. + */ + if (owner != current) + wake_up_process(owner); + + return true; + } + + return false; +} + +/* + * We just acquired @lock under @ww_ctx, if there are more important contexts + * waiting behind us on the wait-list, check if they need to die, or wound us. + * + * See __ww_mutex_add_waiter() for the list-order construction; basically the + * list is ordered by stamp, smallest (oldest) first. + * + * This relies on never mixing wait-die/wound-wait on the same wait-list; + * which is currently ensured by that being a ww_class property. + * + * The current task must not be on the wait list. + */ +static void +__ww_mutex_check_waiters(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx) +{ + struct MUTEX_WAITER *cur; + + lockdep_assert_wait_lock_held(lock); + + for (cur = __ww_waiter_first(lock); cur; + cur = __ww_waiter_next(lock, cur)) { + + if (!cur->ww_ctx) + continue; + + if (__ww_mutex_die(lock, cur, ww_ctx) || + __ww_mutex_wound(lock, cur->ww_ctx, ww_ctx)) + break; + } +} + +/* + * After acquiring lock with fastpath, where we do not hold wait_lock, set ctx + * and wake up any waiters so they can recheck. + */ +static __always_inline void +ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +{ + ww_mutex_lock_acquired(lock, ctx); + + /* + * The lock->ctx update should be visible on all cores before + * the WAITERS check is done, otherwise contended waiters might be + * missed. 
The contended waiters will either see ww_ctx == NULL + * and keep spinning, or it will acquire wait_lock, add itself + * to waiter list and sleep. + */ + smp_mb(); /* See comments above and below. */ + + /* + * [W] ww->ctx = ctx [W] MUTEX_FLAG_WAITERS + * MB MB + * [R] MUTEX_FLAG_WAITERS [R] ww->ctx + * + * The memory barrier above pairs with the memory barrier in + * __ww_mutex_add_waiter() and makes sure we either observe ww->ctx + * and/or !empty list. + */ + if (likely(!__ww_mutex_has_waiters(&lock->base))) + return; + + /* + * Uh oh, we raced in fastpath, check if any of the waiters need to + * die or wound us. + */ + lock_wait_lock(&lock->base); + __ww_mutex_check_waiters(&lock->base, ctx); + unlock_wait_lock(&lock->base); +} + +static __always_inline int +__ww_mutex_kill(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx) +{ + if (ww_ctx->acquired > 0) { +#ifdef DEBUG_WW_MUTEXES + struct ww_mutex *ww; + + ww = container_of(lock, struct ww_mutex, base); + DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock); + ww_ctx->contending_lock = ww; +#endif + return -EDEADLK; + } + + return 0; +} + +/* + * Check the wound condition for the current lock acquire. + * + * Wound-Wait: If we're wounded, kill ourself. + * + * Wait-Die: If we're trying to acquire a lock already held by an older + * context, kill ourselves. + * + * Since __ww_mutex_add_waiter() orders the wait-list on stamp, we only have to + * look at waiters before us in the wait-list. 
+ */ +static inline int +__ww_mutex_check_kill(struct MUTEX *lock, struct MUTEX_WAITER *waiter, + struct ww_acquire_ctx *ctx) +{ + struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); + struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx); + struct MUTEX_WAITER *cur; + + if (ctx->acquired == 0) + return 0; + + if (!ctx->is_wait_die) { + if (ctx->wounded) + return __ww_mutex_kill(lock, ctx); + + return 0; + } + + if (hold_ctx && __ww_ctx_less(ctx, hold_ctx)) + return __ww_mutex_kill(lock, ctx); + + /* + * If there is a waiter in front of us that has a context, then its + * stamp is earlier than ours and we must kill ourself. + */ + for (cur = __ww_waiter_prev(lock, waiter); cur; + cur = __ww_waiter_prev(lock, cur)) { + + if (!cur->ww_ctx) + continue; + + return __ww_mutex_kill(lock, ctx); + } + + return 0; +} + +/* + * Add @waiter to the wait-list, keep the wait-list ordered by stamp, smallest + * first. Such that older contexts are preferred to acquire the lock over + * younger contexts. + * + * Waiters without context are interspersed in FIFO order. + * + * Furthermore, for Wait-Die kill ourself immediately when possible (there are + * older contexts already waiting) to avoid unnecessary waiting and for + * Wound-Wait ensure we wound the owning context when it is younger. + */ +static inline int +__ww_mutex_add_waiter(struct MUTEX_WAITER *waiter, + struct MUTEX *lock, + struct ww_acquire_ctx *ww_ctx) +{ + struct MUTEX_WAITER *cur, *pos = NULL; + bool is_wait_die; + + if (!ww_ctx) { + __ww_waiter_add(lock, waiter, NULL); + return 0; + } + + is_wait_die = ww_ctx->is_wait_die; + + /* + * Add the waiter before the first waiter with a higher stamp. + * Waiters without a context are skipped to avoid starving + * them. Wait-Die waiters may die here. Wound-Wait waiters + * never die here, but they are sorted in stamp order and + * may wound the lock holder. 
+ */ + for (cur = __ww_waiter_last(lock); cur; + cur = __ww_waiter_prev(lock, cur)) { + + if (!cur->ww_ctx) + continue; + + if (__ww_ctx_less(ww_ctx, cur->ww_ctx)) { + /* + * Wait-Die: if we find an older context waiting, there + * is no point in queueing behind it, as we'd have to + * die the moment it would acquire the lock. + */ + if (is_wait_die) { + int ret = __ww_mutex_kill(lock, ww_ctx); + + if (ret) + return ret; + } + + break; + } + + pos = cur; + + /* Wait-Die: ensure younger waiters die. */ + __ww_mutex_die(lock, cur, ww_ctx); + } + + __ww_waiter_add(lock, waiter, pos); + + /* + * Wound-Wait: if we're blocking on a mutex owned by a younger context, + * wound that such that we might proceed. + */ + if (!is_wait_die) { + struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); + + /* + * See ww_mutex_set_context_fastpath(). Orders setting + * MUTEX_FLAG_WAITERS vs the ww->ctx load, + * such that either we or the fastpath will wound @ww->ctx. + */ + smp_mb(); + __ww_mutex_wound(lock, ww_ctx, ww->ctx); + } + + return 0; +} + +static inline void __ww_mutex_unlock(struct ww_mutex *lock) +{ + if (lock->ctx) { +#ifdef DEBUG_WW_MUTEXES + DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired); +#endif + if (lock->ctx->acquired > 0) + lock->ctx->acquired--; + lock->ctx = NULL; + } +} diff --git a/kernel/locking/ww_rt_mutex.c b/kernel/locking/ww_rt_mutex.c new file mode 100644 index 000000000000..3f1fff7d2780 --- /dev/null +++ b/kernel/locking/ww_rt_mutex.c @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * rtmutex API + */ +#include <linux/spinlock.h> +#include <linux/export.h> + +#define RT_MUTEX_BUILD_MUTEX +#define WW_RT +#include "rtmutex.c" + +static int __sched +__ww_rt_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx, + unsigned int state, unsigned long ip) +{ + struct lockdep_map __maybe_unused *nest_lock = NULL; + struct rt_mutex *rtm = &lock->base; + int ret; + + might_sleep(); + + if (ww_ctx) { + if (unlikely(ww_ctx == 
READ_ONCE(lock->ctx))) + return -EALREADY; + + /* + * Reset the wounded flag after a kill. No other process can + * race and wound us here, since they can't have a valid owner + * pointer if we don't have any locks held. + */ + if (ww_ctx->acquired == 0) + ww_ctx->wounded = 0; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + nest_lock = &ww_ctx->dep_map; +#endif + } + mutex_acquire_nest(&rtm->dep_map, 0, 0, nest_lock, ip); + + if (likely(rt_mutex_cmpxchg_acquire(&rtm->rtmutex, NULL, current))) { + if (ww_ctx) + ww_mutex_set_context_fastpath(lock, ww_ctx); + return 0; + } + + ret = rt_mutex_slowlock(&rtm->rtmutex, ww_ctx, state); + + if (ret) + mutex_release(&rtm->dep_map, ip); + return ret; +} + +int __sched +ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +{ + return __ww_rt_mutex_lock(lock, ctx, TASK_UNINTERRUPTIBLE, _RET_IP_); +} +EXPORT_SYMBOL(ww_mutex_lock); + +int __sched +ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +{ + return __ww_rt_mutex_lock(lock, ctx, TASK_INTERRUPTIBLE, _RET_IP_); +} +EXPORT_SYMBOL(ww_mutex_lock_interruptible); + +void __sched ww_mutex_unlock(struct ww_mutex *lock) +{ + struct rt_mutex *rtm = &lock->base; + + __ww_mutex_unlock(lock); + + mutex_release(&rtm->dep_map, _RET_IP_); + __rt_mutex_unlock(&rtm->rtmutex); +} +EXPORT_SYMBOL(ww_mutex_unlock); diff --git a/kernel/notifier.c b/kernel/notifier.c index 1b019cbca594..b8251dc0bc0f 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -172,25 +172,6 @@ int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh, } EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister); -int atomic_notifier_call_chain_robust(struct atomic_notifier_head *nh, - unsigned long val_up, unsigned long val_down, void *v) -{ - unsigned long flags; - int ret; - - /* - * Musn't use RCU; because then the notifier list can - * change between the up and down traversal. 
- */ - spin_lock_irqsave(&nh->lock, flags); - ret = notifier_call_chain_robust(&nh->head, val_up, val_down, v); - spin_unlock_irqrestore(&nh->lock, flags); - - return ret; -} -EXPORT_SYMBOL_GPL(atomic_notifier_call_chain_robust); -NOKPROBE_SYMBOL(atomic_notifier_call_chain_robust); - /** * atomic_notifier_call_chain - Call functions in an atomic notifier chain * @nh: Pointer to head of the atomic notifier chain diff --git a/kernel/padata.c b/kernel/padata.c index d4d3ba6e1728..18d3a5c699d8 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -9,19 +9,6 @@ * * Copyright (c) 2020 Oracle and/or its affiliates. * Author: Daniel Jordan <daniel.m.jordan@oracle.com> - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 
*/ #include <linux/completion.h> @@ -211,7 +198,7 @@ int padata_do_parallel(struct padata_shell *ps, if ((pinst->flags & PADATA_RESET)) goto out; - atomic_inc(&pd->refcnt); + refcount_inc(&pd->refcnt); padata->pd = pd; padata->cb_cpu = *cb_cpu; @@ -383,7 +370,7 @@ static void padata_serial_worker(struct work_struct *serial_work) } local_bh_enable(); - if (atomic_sub_and_test(cnt, &pd->refcnt)) + if (refcount_sub_and_test(cnt, &pd->refcnt)) padata_free_pd(pd); } @@ -593,7 +580,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_shell *ps) padata_init_reorder_list(pd); padata_init_squeues(pd); pd->seq_nr = -1; - atomic_set(&pd->refcnt, 1); + refcount_set(&pd->refcnt, 1); spin_lock_init(&pd->lock); pd->cpu = cpumask_first(pd->cpumask.pcpu); INIT_WORK(&pd->reorder_work, invoke_padata_reorder); @@ -667,7 +654,7 @@ static int padata_replace(struct padata_instance *pinst) synchronize_rcu(); list_for_each_entry_continue_reverse(ps, &pinst->pslist, list) - if (atomic_dec_and_test(&ps->opd->refcnt)) + if (refcount_dec_and_test(&ps->opd->refcnt)) padata_free_pd(ps->opd); pinst->flags &= ~PADATA_RESET; @@ -733,7 +720,7 @@ int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type, struct cpumask *serial_mask, *parallel_mask; int err = -EINVAL; - get_online_cpus(); + cpus_read_lock(); mutex_lock(&pinst->lock); switch (cpumask_type) { @@ -753,7 +740,7 @@ int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type, out: mutex_unlock(&pinst->lock); - put_online_cpus(); + cpus_read_unlock(); return err; } @@ -992,7 +979,7 @@ struct padata_instance *padata_alloc(const char *name) if (!pinst->parallel_wq) goto err_free_inst; - get_online_cpus(); + cpus_read_lock(); pinst->serial_wq = alloc_workqueue("%s_serial", WQ_MEM_RECLAIM | WQ_CPU_INTENSIVE, 1, name); @@ -1026,7 +1013,7 @@ struct padata_instance *padata_alloc(const char *name) &pinst->cpu_dead_node); #endif - put_online_cpus(); + cpus_read_unlock(); return pinst; @@ -1036,7 +1023,7 @@ 
err_free_masks: err_free_serial_wq: destroy_workqueue(pinst->serial_wq); err_put_cpus: - put_online_cpus(); + cpus_read_unlock(); destroy_workqueue(pinst->parallel_wq); err_free_inst: kfree(pinst); @@ -1074,9 +1061,9 @@ struct padata_shell *padata_alloc_shell(struct padata_instance *pinst) ps->pinst = pinst; - get_online_cpus(); + cpus_read_lock(); pd = padata_alloc_pd(ps); - put_online_cpus(); + cpus_read_unlock(); if (!pd) goto out_free_ps; diff --git a/kernel/params.c b/kernel/params.c index 2daa2780a92c..8299bd764e42 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -243,6 +243,24 @@ STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul); STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull); STANDARD_PARAM_DEF(hexint, unsigned int, "%#08x", kstrtouint); +int param_set_uint_minmax(const char *val, const struct kernel_param *kp, + unsigned int min, unsigned int max) +{ + unsigned int num; + int ret; + + if (!val) + return -EINVAL; + ret = kstrtouint(val, 0, &num); + if (ret) + return ret; + if (num < min || num > max) + return -EINVAL; + *((unsigned int *)kp->arg) = num; + return 0; +} +EXPORT_SYMBOL_GPL(param_set_uint_minmax); + int param_set_charp(const char *val, const struct kernel_param *kp) { if (strlen(val) > 1024) { diff --git a/kernel/pid.c b/kernel/pid.c index ebdf9c60cd0b..efe87db44683 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -550,13 +550,21 @@ struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags) * Note, that this function can only be called after the fd table has * been unshared to avoid leaking the pidfd to the new process. * + * This symbol should not be explicitly exported to loadable modules. + * * Return: On success, a cloexec pidfd is returned. * On error, a negative errno number will be returned. 
*/ -static int pidfd_create(struct pid *pid, unsigned int flags) +int pidfd_create(struct pid *pid, unsigned int flags) { int fd; + if (!pid || !pid_has_task(pid, PIDTYPE_TGID)) + return -EINVAL; + + if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC)) + return -EINVAL; + fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid), flags | O_RDWR | O_CLOEXEC); if (fd < 0) @@ -596,10 +604,7 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags) if (!p) return -ESRCH; - if (pid_has_task(p, PIDTYPE_TGID)) - fd = pidfd_create(p, flags); - else - fd = -EINVAL; + fd = pidfd_create(p, flags); put_pid(p); return fd; diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 0f4530b3a8cd..a332ccd829e2 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -170,7 +170,9 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, /* Compute the cost of each performance state. */ fmax = (u64) table[nr_states - 1].frequency; for (i = 0; i < nr_states; i++) { - table[i].cost = div64_u64(fmax * table[i].power, + unsigned long power_res = em_scale_power(table[i].power); + + table[i].cost = div64_u64(fmax * power_res, table[i].frequency); } diff --git a/kernel/power/main.c b/kernel/power/main.c index 12c7e1bb442f..44169f3081fd 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -577,7 +577,7 @@ static inline void pm_print_times_init(void) {} struct kobject *power_kobj; -/** +/* * state - control system sleep states. * * show() returns available sleep state labels, which may be "mem", "standby", diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index d8cae434f9eb..eb75f394a059 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -96,7 +96,7 @@ static void s2idle_enter(void) s2idle_state = S2IDLE_STATE_ENTER; raw_spin_unlock_irq(&s2idle_lock); - get_online_cpus(); + cpus_read_lock(); cpuidle_resume(); /* Push all the CPUs into the idle loop. 
*/ @@ -106,7 +106,7 @@ static void s2idle_enter(void) s2idle_state == S2IDLE_STATE_WAKE); cpuidle_pause(); - put_online_cpus(); + cpus_read_unlock(); raw_spin_lock_irq(&s2idle_lock); diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index e1ed58adb69e..d20526c5be15 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -129,7 +129,7 @@ static int __init has_wakealarm(struct device *dev, const void *data) { struct rtc_device *candidate = to_rtc_device(dev); - if (!candidate->ops->set_alarm) + if (!test_bit(RTC_FEATURE_ALARM, candidate->features)) return 0; if (!device_may_wakeup(candidate->dev.parent)) return 0; diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index dca51fe9c73f..2cc34a22a506 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -487,7 +487,7 @@ retry: if (gp_async) { cur_ops->gp_barrier(); } - writer_n_durations[me] = i_max; + writer_n_durations[me] = i_max + 1; torture_kthread_stopping("rcu_scale_writer"); return 0; } @@ -561,7 +561,7 @@ rcu_scale_cleanup(void) wdpp = writer_durations[i]; if (!wdpp) continue; - for (j = 0; j <= writer_n_durations[i]; j++) { + for (j = 0; j < writer_n_durations[i]; j++) { wdp = &wdpp[j]; pr_alert("%s%s %4d writer-duration: %5d %llu\n", scale_type, SCALE_FLAG, diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 40ef5417d954..ab4215266ebe 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2022,8 +2022,13 @@ static int rcu_torture_stall(void *args) __func__, raw_smp_processor_id()); while (ULONG_CMP_LT((unsigned long)ktime_get_seconds(), stop_at)) - if (stall_cpu_block) + if (stall_cpu_block) { +#ifdef CONFIG_PREEMPTION + preempt_schedule(); +#else schedule_timeout_uninterruptible(HZ); +#endif + } if (stall_cpu_irqsoff) local_irq_enable(); else if (!stall_cpu_block) diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index d998a76fb542..66dc14cf5687 100644 --- a/kernel/rcu/refscale.c +++ 
b/kernel/rcu/refscale.c @@ -467,6 +467,40 @@ static struct ref_scale_ops acqrel_ops = { .name = "acqrel" }; +static volatile u64 stopopts; + +static void ref_clock_section(const int nloops) +{ + u64 x = 0; + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) + x += ktime_get_real_fast_ns(); + preempt_enable(); + stopopts = x; +} + +static void ref_clock_delay_section(const int nloops, const int udl, const int ndl) +{ + u64 x = 0; + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + x += ktime_get_real_fast_ns(); + un_delay(udl, ndl); + } + preempt_enable(); + stopopts = x; +} + +static struct ref_scale_ops clock_ops = { + .readsection = ref_clock_section, + .delaysection = ref_clock_delay_section, + .name = "clock" +}; + static void rcu_scale_one_reader(void) { if (readdelay <= 0) @@ -759,7 +793,7 @@ ref_scale_init(void) int firsterr = 0; static struct ref_scale_ops *scale_ops[] = { &rcu_ops, &srcu_ops, &rcu_trace_ops, &rcu_tasks_ops, &refcnt_ops, &rwlock_ops, - &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, + &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &clock_ops, }; if (!torture_init_begin(scale_type, verbose)) diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 26344dc6483b..a0ba2ed49bc6 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -96,7 +96,7 @@ EXPORT_SYMBOL_GPL(cleanup_srcu_struct); */ void __srcu_read_unlock(struct srcu_struct *ssp, int idx) { - int newval = ssp->srcu_lock_nesting[idx] - 1; + int newval = READ_ONCE(ssp->srcu_lock_nesting[idx]) - 1; WRITE_ONCE(ssp->srcu_lock_nesting[idx], newval); if (!newval && READ_ONCE(ssp->srcu_gp_waiting)) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 8536c55df514..806160c44b17 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -643,8 +643,8 @@ void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); } // // "Rude" variant of Tasks RCU, inspired by Steve Rostedt's trick of // passing an empty function to 
schedule_on_each_cpu(). This approach -// provides an asynchronous call_rcu_tasks_rude() API and batching -// of concurrent calls to the synchronous synchronize_rcu_rude() API. +// provides an asynchronous call_rcu_tasks_rude() API and batching of +// concurrent calls to the synchronous synchronize_rcu_tasks_rude() API. // This invokes schedule_on_each_cpu() in order to send IPIs far and wide // and induces otherwise unnecessary context switches on all online CPUs, // whether idle or not. @@ -785,7 +785,10 @@ EXPORT_SYMBOL_GPL(show_rcu_tasks_rude_gp_kthread); // set that task's .need_qs flag so that task's next outermost // rcu_read_unlock_trace() will report the quiescent state (in which // case the count of readers is incremented). If both attempts fail, -// the task is added to a "holdout" list. +// the task is added to a "holdout" list. Note that IPIs are used +// to invoke trc_read_check_handler() in the context of running tasks +// in order to avoid ordering overhead on common-case shared-variable +// accesses. // rcu_tasks_trace_postscan(): // Initialize state and attempt to identify an immediate quiescent // state as above (but only for idle tasks), unblock CPU-hotplug @@ -847,7 +850,7 @@ static DEFINE_IRQ_WORK(rcu_tasks_trace_iw, rcu_read_unlock_iw); /* If we are the last reader, wake up the grace-period kthread. */ void rcu_read_unlock_trace_special(struct task_struct *t, int nesting) { - int nq = t->trc_reader_special.b.need_qs; + int nq = READ_ONCE(t->trc_reader_special.b.need_qs); if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && t->trc_reader_special.b.need_mb) @@ -894,7 +897,7 @@ static void trc_read_check_handler(void *t_in) // If the task is not in a read-side critical section, and // if this is the last reader, awaken the grace-period kthread. 
- if (likely(!t->trc_reader_nesting)) { + if (likely(!READ_ONCE(t->trc_reader_nesting))) { if (WARN_ON_ONCE(atomic_dec_and_test(&trc_n_readers_need_end))) wake_up(&trc_wait); // Mark as checked after decrement to avoid false @@ -903,7 +906,7 @@ static void trc_read_check_handler(void *t_in) goto reset_ipi; } // If we are racing with an rcu_read_unlock_trace(), try again later. - if (unlikely(t->trc_reader_nesting < 0)) { + if (unlikely(READ_ONCE(t->trc_reader_nesting) < 0)) { if (WARN_ON_ONCE(atomic_dec_and_test(&trc_n_readers_need_end))) wake_up(&trc_wait); goto reset_ipi; @@ -913,14 +916,14 @@ static void trc_read_check_handler(void *t_in) // Get here if the task is in a read-side critical section. Set // its state so that it will awaken the grace-period kthread upon // exit from that critical section. - WARN_ON_ONCE(t->trc_reader_special.b.need_qs); + WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs)); WRITE_ONCE(t->trc_reader_special.b.need_qs, true); reset_ipi: // Allow future IPIs to be sent on CPU and for task. // Also order this IPI handler against any later manipulations of // the intended task. - smp_store_release(&per_cpu(trc_ipi_to_cpu, smp_processor_id()), false); // ^^^ + smp_store_release(per_cpu_ptr(&trc_ipi_to_cpu, smp_processor_id()), false); // ^^^ smp_store_release(&texp->trc_ipi_to_cpu, -1); // ^^^ } @@ -950,6 +953,7 @@ static bool trc_inspect_reader(struct task_struct *t, void *arg) n_heavy_reader_ofl_updates++; in_qs = true; } else { + // The task is not running, so C-language access is safe. in_qs = likely(!t->trc_reader_nesting); } @@ -964,7 +968,7 @@ static bool trc_inspect_reader(struct task_struct *t, void *arg) // state so that it will awaken the grace-period kthread upon exit // from that critical section. atomic_inc(&trc_n_readers_need_end); // One more to wait on. 
- WARN_ON_ONCE(t->trc_reader_special.b.need_qs); + WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs)); WRITE_ONCE(t->trc_reader_special.b.need_qs, true); return true; } @@ -982,7 +986,7 @@ static void trc_wait_for_one_reader(struct task_struct *t, // The current task had better be in a quiescent state. if (t == current) { t->trc_reader_checked = true; - WARN_ON_ONCE(t->trc_reader_nesting); + WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting)); return; } @@ -994,6 +998,12 @@ static void trc_wait_for_one_reader(struct task_struct *t, } put_task_struct(t); + // If this task is not yet on the holdout list, then we are in + // an RCU read-side critical section. Otherwise, the invocation of + // trc_add_holdout() that added it to the list did the necessary + // get_task_struct(). Either way, the task cannot be freed out + // from under this code. + // If currently running, send an IPI, either way, add to list. trc_add_holdout(t, bhp); if (task_curr(t) && @@ -1092,8 +1102,8 @@ static void show_stalled_task_trace(struct task_struct *t, bool *firstreport) ".I"[READ_ONCE(t->trc_ipi_to_cpu) > 0], ".i"[is_idle_task(t)], ".N"[cpu > 0 && tick_nohz_full_cpu(cpu)], - t->trc_reader_nesting, - " N"[!!t->trc_reader_special.b.need_qs], + READ_ONCE(t->trc_reader_nesting), + " N"[!!READ_ONCE(t->trc_reader_special.b.need_qs)], cpu); sched_show_task(t); } @@ -1187,7 +1197,7 @@ static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp) static void exit_tasks_rcu_finish_trace(struct task_struct *t) { WRITE_ONCE(t->trc_reader_checked, true); - WARN_ON_ONCE(t->trc_reader_nesting); + WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting)); WRITE_ONCE(t->trc_reader_nesting, 0); if (WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs))) rcu_read_unlock_trace_special(t, 0); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 51f24ecd94b2..bce848e50512 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -74,17 +74,10 @@ /* Data structures. 
*/ -/* - * Steal a bit from the bottom of ->dynticks for idle entry/exit - * control. Initially this is for TLB flushing. - */ -#define RCU_DYNTICK_CTRL_MASK 0x1 -#define RCU_DYNTICK_CTRL_CTR (RCU_DYNTICK_CTRL_MASK + 1) - static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = { .dynticks_nesting = 1, .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE, - .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR), + .dynticks = ATOMIC_INIT(1), #ifdef CONFIG_RCU_NOCB_CPU .cblist.flags = SEGCBLIST_SOFTIRQ_ONLY, #endif @@ -259,6 +252,15 @@ void rcu_softirq_qs(void) } /* + * Increment the current CPU's rcu_data structure's ->dynticks field + * with ordering. Return the new value. + */ +static noinline noinstr unsigned long rcu_dynticks_inc(int incby) +{ + return arch_atomic_add_return(incby, this_cpu_ptr(&rcu_data.dynticks)); +} + +/* * Record entry into an extended quiescent state. This is only to be * called when not already in an extended quiescent state, that is, * RCU is watching prior to the call to this function and is no longer @@ -266,7 +268,6 @@ void rcu_softirq_qs(void) */ static noinstr void rcu_dynticks_eqs_enter(void) { - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); int seq; /* @@ -275,13 +276,9 @@ static noinstr void rcu_dynticks_eqs_enter(void) * next idle sojourn. */ rcu_dynticks_task_trace_enter(); // Before ->dynticks update! - seq = arch_atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); + seq = rcu_dynticks_inc(1); // RCU is no longer watching. Better be in extended quiescent state! - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - (seq & RCU_DYNTICK_CTRL_CTR)); - /* Better not have special action (TLB flush) pending! 
*/ - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - (seq & RCU_DYNTICK_CTRL_MASK)); + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & 0x1)); } /* @@ -291,7 +288,6 @@ static noinstr void rcu_dynticks_eqs_enter(void) */ static noinstr void rcu_dynticks_eqs_exit(void) { - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); int seq; /* @@ -299,15 +295,10 @@ static noinstr void rcu_dynticks_eqs_exit(void) * and we also must force ordering with the next RCU read-side * critical section. */ - seq = arch_atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); + seq = rcu_dynticks_inc(1); // RCU is now watching. Better not be in an extended quiescent state! rcu_dynticks_task_trace_exit(); // After ->dynticks update! - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - !(seq & RCU_DYNTICK_CTRL_CTR)); - if (seq & RCU_DYNTICK_CTRL_MASK) { - arch_atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdp->dynticks); - smp_mb__after_atomic(); /* _exit after clearing mask. */ - } + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & 0x1)); } /* @@ -324,9 +315,9 @@ static void rcu_dynticks_eqs_online(void) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - if (atomic_read(&rdp->dynticks) & RCU_DYNTICK_CTRL_CTR) + if (atomic_read(&rdp->dynticks) & 0x1) return; - atomic_add(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); + rcu_dynticks_inc(1); } /* @@ -336,9 +327,7 @@ static void rcu_dynticks_eqs_online(void) */ static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void) { - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - - return !(arch_atomic_read(&rdp->dynticks) & RCU_DYNTICK_CTRL_CTR); + return !(atomic_read(this_cpu_ptr(&rcu_data.dynticks)) & 0x1); } /* @@ -347,9 +336,8 @@ static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void) */ static int rcu_dynticks_snap(struct rcu_data *rdp) { - int snap = atomic_add_return(0, &rdp->dynticks); - - return snap & ~RCU_DYNTICK_CTRL_MASK; + smp_mb(); // Fundamental RCU ordering guarantee. 
+ return atomic_read_acquire(&rdp->dynticks); } /* @@ -358,7 +346,7 @@ static int rcu_dynticks_snap(struct rcu_data *rdp) */ static bool rcu_dynticks_in_eqs(int snap) { - return !(snap & RCU_DYNTICK_CTRL_CTR); + return !(snap & 0x1); } /* Return true if the specified CPU is currently idle from an RCU viewpoint. */ @@ -389,8 +377,7 @@ bool rcu_dynticks_zero_in_eqs(int cpu, int *vp) int snap; // If not quiescent, force back to earlier extended quiescent state. - snap = atomic_read(&rdp->dynticks) & ~(RCU_DYNTICK_CTRL_MASK | - RCU_DYNTICK_CTRL_CTR); + snap = atomic_read(&rdp->dynticks) & ~0x1; smp_rmb(); // Order ->dynticks and *vp reads. if (READ_ONCE(*vp)) @@ -398,32 +385,7 @@ bool rcu_dynticks_zero_in_eqs(int cpu, int *vp) smp_rmb(); // Order *vp read and ->dynticks re-read. // If still in the same extended quiescent state, we are good! - return snap == (atomic_read(&rdp->dynticks) & ~RCU_DYNTICK_CTRL_MASK); -} - -/* - * Set the special (bottom) bit of the specified CPU so that it - * will take special action (such as flushing its TLB) on the - * next exit from an extended quiescent state. Returns true if - * the bit was successfully set, or false if the CPU was not in - * an extended quiescent state. 
- */ -bool rcu_eqs_special_set(int cpu) -{ - int old; - int new; - int new_old; - struct rcu_data *rdp = &per_cpu(rcu_data, cpu); - - new_old = atomic_read(&rdp->dynticks); - do { - old = new_old; - if (old & RCU_DYNTICK_CTRL_CTR) - return false; - new = old | RCU_DYNTICK_CTRL_MASK; - new_old = atomic_cmpxchg(&rdp->dynticks, old, new); - } while (new_old != old); - return true; + return snap == atomic_read(&rdp->dynticks); } /* @@ -439,13 +401,12 @@ bool rcu_eqs_special_set(int cpu) */ notrace void rcu_momentary_dyntick_idle(void) { - int special; + int seq; raw_cpu_write(rcu_data.rcu_need_heavy_qs, false); - special = atomic_add_return(2 * RCU_DYNTICK_CTRL_CTR, - &this_cpu_ptr(&rcu_data)->dynticks); + seq = rcu_dynticks_inc(2); /* It is illegal to call this from idle state. */ - WARN_ON_ONCE(!(special & RCU_DYNTICK_CTRL_CTR)); + WARN_ON_ONCE(!(seq & 0x1)); rcu_preempt_deferred_qs(current); } EXPORT_SYMBOL_GPL(rcu_momentary_dyntick_idle); @@ -1325,7 +1286,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) */ jtsq = READ_ONCE(jiffies_to_sched_qs); ruqp = per_cpu_ptr(&rcu_data.rcu_urgent_qs, rdp->cpu); - rnhqp = &per_cpu(rcu_data.rcu_need_heavy_qs, rdp->cpu); + rnhqp = per_cpu_ptr(&rcu_data.rcu_need_heavy_qs, rdp->cpu); if (!READ_ONCE(*rnhqp) && (time_after(jiffies, rcu_state.gp_start + jtsq * 2) || time_after(jiffies, rcu_state.jiffies_resched) || @@ -1772,7 +1733,7 @@ static void rcu_strict_gp_boundary(void *unused) /* * Initialize a new grace period. Return false if no grace period required. */ -static bool rcu_gp_init(void) +static noinline_for_stack bool rcu_gp_init(void) { unsigned long firstseq; unsigned long flags; @@ -1966,7 +1927,7 @@ static void rcu_gp_fqs(bool first_time) /* * Loop doing repeated quiescent-state forcing until the grace period ends. 
*/ -static void rcu_gp_fqs_loop(void) +static noinline_for_stack void rcu_gp_fqs_loop(void) { bool first_gp_fqs; int gf = 0; @@ -1993,8 +1954,8 @@ static void rcu_gp_fqs_loop(void) trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("fqswait")); WRITE_ONCE(rcu_state.gp_state, RCU_GP_WAIT_FQS); - ret = swait_event_idle_timeout_exclusive( - rcu_state.gp_wq, rcu_gp_fqs_check_wake(&gf), j); + (void)swait_event_idle_timeout_exclusive(rcu_state.gp_wq, + rcu_gp_fqs_check_wake(&gf), j); rcu_gp_torture_wait(); WRITE_ONCE(rcu_state.gp_state, RCU_GP_DOING_FQS); /* Locking provides needed memory barriers. */ @@ -2471,9 +2432,6 @@ int rcutree_dead_cpu(unsigned int cpu) WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1); /* Adjust any no-longer-needed kthreads. */ rcu_boost_kthread_setaffinity(rnp, -1); - /* Do any needed no-CB deferred wakeups from this CPU. */ - do_nocb_deferred_wakeup(per_cpu_ptr(&rcu_data, cpu)); - // Stop-machine done, so allow nohz_full to disable tick. tick_dep_clear(TICK_DEP_BIT_RCU); return 0; @@ -4050,7 +4008,7 @@ void rcu_barrier(void) */ init_completion(&rcu_state.barrier_completion); atomic_set(&rcu_state.barrier_cpu_count, 2); - get_online_cpus(); + cpus_read_lock(); /* * Force each CPU with callbacks to register a new callback. 
@@ -4081,7 +4039,7 @@ void rcu_barrier(void) rcu_state.barrier_sequence); } } - put_online_cpus(); + cpus_read_unlock(); /* * Now that we have an rcu_barrier_callback() callback on each @@ -4784,4 +4742,5 @@ void __init rcu_init(void) #include "tree_stall.h" #include "tree_exp.h" +#include "tree_nocb.h" #include "tree_plugin.h" diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h new file mode 100644 index 000000000000..8fdf44f8523f --- /dev/null +++ b/kernel/rcu/tree_nocb.h @@ -0,0 +1,1496 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Read-Copy Update mechanism for mutual exclusion (tree-based version) + * Internal non-public definitions that provide either classic + * or preemptible semantics. + * + * Copyright Red Hat, 2009 + * Copyright IBM Corporation, 2009 + * Copyright SUSE, 2021 + * + * Author: Ingo Molnar <mingo@elte.hu> + * Paul E. McKenney <paulmck@linux.ibm.com> + * Frederic Weisbecker <frederic@kernel.org> + */ + +#ifdef CONFIG_RCU_NOCB_CPU +static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ +static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */ +static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp) +{ + return lockdep_is_held(&rdp->nocb_lock); +} + +static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp) +{ + /* Race on early boot between thread creation and assignment */ + if (!rdp->nocb_cb_kthread || !rdp->nocb_gp_kthread) + return true; + + if (current == rdp->nocb_cb_kthread || current == rdp->nocb_gp_kthread) + if (in_task()) + return true; + return false; +} + +/* + * Offload callback processing from the boot-time-specified set of CPUs + * specified by rcu_nocb_mask. For the CPUs in the set, there are kthreads + * created that pull the callbacks from the corresponding CPU, wait for + * a grace period to elapse, and invoke the callbacks. 
These kthreads + * are organized into GP kthreads, which manage incoming callbacks, wait for + * grace periods, and awaken CB kthreads, and the CB kthreads, which only + * invoke callbacks. Each GP kthread invokes its own CBs. The no-CBs CPUs + * do a wake_up() on their GP kthread when they insert a callback into any + * empty list, unless the rcu_nocb_poll boot parameter has been specified, + * in which case each kthread actively polls its CPU. (Which isn't so great + * for energy efficiency, but which does reduce RCU's overhead on that CPU.) + * + * This is intended to be used in conjunction with Frederic Weisbecker's + * adaptive-idle work, which would seriously reduce OS jitter on CPUs + * running CPU-bound user-mode computations. + * + * Offloading of callbacks can also be used as an energy-efficiency + * measure because CPUs with no RCU callbacks queued are more aggressive + * about entering dyntick-idle mode. + */ + + +/* + * Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. + * If the list is invalid, a warning is emitted and all CPUs are offloaded. + */ +static int __init rcu_nocb_setup(char *str) +{ + alloc_bootmem_cpumask_var(&rcu_nocb_mask); + if (cpulist_parse(str, rcu_nocb_mask)) { + pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n"); + cpumask_setall(rcu_nocb_mask); + } + return 1; +} +__setup("rcu_nocbs=", rcu_nocb_setup); + +static int __init parse_rcu_nocb_poll(char *arg) +{ + rcu_nocb_poll = true; + return 0; +} +early_param("rcu_nocb_poll", parse_rcu_nocb_poll); + +/* + * Don't bother bypassing ->cblist if the call_rcu() rate is low. + * After all, the main point of bypassing is to avoid lock contention + * on ->nocb_lock, which only can happen at high call_rcu() rates. + */ +static int nocb_nobypass_lim_per_jiffy = 16 * 1000 / HZ; +module_param(nocb_nobypass_lim_per_jiffy, int, 0); + +/* + * Acquire the specified rcu_data structure's ->nocb_bypass_lock. 
If the + * lock isn't immediately available, increment ->nocb_lock_contended to + * flag the contention. + */ +static void rcu_nocb_bypass_lock(struct rcu_data *rdp) + __acquires(&rdp->nocb_bypass_lock) +{ + lockdep_assert_irqs_disabled(); + if (raw_spin_trylock(&rdp->nocb_bypass_lock)) + return; + atomic_inc(&rdp->nocb_lock_contended); + WARN_ON_ONCE(smp_processor_id() != rdp->cpu); + smp_mb__after_atomic(); /* atomic_inc() before lock. */ + raw_spin_lock(&rdp->nocb_bypass_lock); + smp_mb__before_atomic(); /* atomic_dec() after lock. */ + atomic_dec(&rdp->nocb_lock_contended); +} + +/* + * Spinwait until the specified rcu_data structure's ->nocb_lock is + * not contended. Please note that this is extremely special-purpose, + * relying on the fact that at most two kthreads and one CPU contend for + * this lock, and also that the two kthreads are guaranteed to have frequent + * grace-period-duration time intervals between successive acquisitions + * of the lock. This allows us to use an extremely simple throttling + * mechanism, and further to apply it only to the CPU doing floods of + * call_rcu() invocations. Don't try this at home! + */ +static void rcu_nocb_wait_contended(struct rcu_data *rdp) +{ + WARN_ON_ONCE(smp_processor_id() != rdp->cpu); + while (WARN_ON_ONCE(atomic_read(&rdp->nocb_lock_contended))) + cpu_relax(); +} + +/* + * Conditionally acquire the specified rcu_data structure's + * ->nocb_bypass_lock. + */ +static bool rcu_nocb_bypass_trylock(struct rcu_data *rdp) +{ + lockdep_assert_irqs_disabled(); + return raw_spin_trylock(&rdp->nocb_bypass_lock); +} + +/* + * Release the specified rcu_data structure's ->nocb_bypass_lock. + */ +static void rcu_nocb_bypass_unlock(struct rcu_data *rdp) + __releases(&rdp->nocb_bypass_lock) +{ + lockdep_assert_irqs_disabled(); + raw_spin_unlock(&rdp->nocb_bypass_lock); +} + +/* + * Acquire the specified rcu_data structure's ->nocb_lock, but only + * if it corresponds to a no-CBs CPU. 
+ */ +static void rcu_nocb_lock(struct rcu_data *rdp) +{ + lockdep_assert_irqs_disabled(); + if (!rcu_rdp_is_offloaded(rdp)) + return; + raw_spin_lock(&rdp->nocb_lock); +} + +/* + * Release the specified rcu_data structure's ->nocb_lock, but only + * if it corresponds to a no-CBs CPU. + */ +static void rcu_nocb_unlock(struct rcu_data *rdp) +{ + if (rcu_rdp_is_offloaded(rdp)) { + lockdep_assert_irqs_disabled(); + raw_spin_unlock(&rdp->nocb_lock); + } +} + +/* + * Release the specified rcu_data structure's ->nocb_lock and restore + * interrupts, but only if it corresponds to a no-CBs CPU. + */ +static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp, + unsigned long flags) +{ + if (rcu_rdp_is_offloaded(rdp)) { + lockdep_assert_irqs_disabled(); + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); + } else { + local_irq_restore(flags); + } +} + +/* Lockdep check that ->cblist may be safely accessed. */ +static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp) +{ + lockdep_assert_irqs_disabled(); + if (rcu_rdp_is_offloaded(rdp)) + lockdep_assert_held(&rdp->nocb_lock); +} + +/* + * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended + * grace period. + */ +static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) +{ + swake_up_all(sq); +} + +static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) +{ + return &rnp->nocb_gp_wq[rcu_seq_ctr(rnp->gp_seq) & 0x1]; +} + +static void rcu_init_one_nocb(struct rcu_node *rnp) +{ + init_swait_queue_head(&rnp->nocb_gp_wq[0]); + init_swait_queue_head(&rnp->nocb_gp_wq[1]); +} + +/* Is the specified CPU a no-CBs CPU? 
*/ +bool rcu_is_nocb_cpu(int cpu) +{ + if (cpumask_available(rcu_nocb_mask)) + return cpumask_test_cpu(cpu, rcu_nocb_mask); + return false; +} + +static bool __wake_nocb_gp(struct rcu_data *rdp_gp, + struct rcu_data *rdp, + bool force, unsigned long flags) + __releases(rdp_gp->nocb_gp_lock) +{ + bool needwake = false; + + if (!READ_ONCE(rdp_gp->nocb_gp_kthread)) { + raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, + TPS("AlreadyAwake")); + return false; + } + + if (rdp_gp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) { + WRITE_ONCE(rdp_gp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); + del_timer(&rdp_gp->nocb_timer); + } + + if (force || READ_ONCE(rdp_gp->nocb_gp_sleep)) { + WRITE_ONCE(rdp_gp->nocb_gp_sleep, false); + needwake = true; + } + raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); + if (needwake) { + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake")); + wake_up_process(rdp_gp->nocb_gp_kthread); + } + + return needwake; +} + +/* + * Kick the GP kthread for this NOCB group. + */ +static bool wake_nocb_gp(struct rcu_data *rdp, bool force) +{ + unsigned long flags; + struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; + + raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); + return __wake_nocb_gp(rdp_gp, rdp, force, flags); +} + +/* + * Arrange to wake the GP kthread for this NOCB group at some future + * time when it is safe to do so. + */ +static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype, + const char *reason) +{ + unsigned long flags; + struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; + + raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); + + /* + * Bypass wakeup overrides previous deferments. In case + * of callback storm, no need to wake up too early. 
+ */ + if (waketype == RCU_NOCB_WAKE_BYPASS) { + mod_timer(&rdp_gp->nocb_timer, jiffies + 2); + WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype); + } else { + if (rdp_gp->nocb_defer_wakeup < RCU_NOCB_WAKE) + mod_timer(&rdp_gp->nocb_timer, jiffies + 1); + if (rdp_gp->nocb_defer_wakeup < waketype) + WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype); + } + + raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); + + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason); +} + +/* + * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL. + * However, if there is a callback to be enqueued and if ->nocb_bypass + * proves to be initially empty, just return false because the no-CB GP + * kthread may need to be awakened in this case. + * + * Note that this function always returns true if rhp is NULL. + */ +static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, + unsigned long j) +{ + struct rcu_cblist rcl; + + WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp)); + rcu_lockdep_assert_cblist_protected(rdp); + lockdep_assert_held(&rdp->nocb_bypass_lock); + if (rhp && !rcu_cblist_n_cbs(&rdp->nocb_bypass)) { + raw_spin_unlock(&rdp->nocb_bypass_lock); + return false; + } + /* Note: ->cblist.len already accounts for ->nocb_bypass contents. */ + if (rhp) + rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */ + rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, rhp); + rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rcl); + WRITE_ONCE(rdp->nocb_bypass_first, j); + rcu_nocb_bypass_unlock(rdp); + return true; +} + +/* + * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL. + * However, if there is a callback to be enqueued and if ->nocb_bypass + * proves to be initially empty, just return false because the no-CB GP + * kthread may need to be awakened in this case. + * + * Note that this function always returns true if rhp is NULL. 
+ */ +static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, + unsigned long j) +{ + if (!rcu_rdp_is_offloaded(rdp)) + return true; + rcu_lockdep_assert_cblist_protected(rdp); + rcu_nocb_bypass_lock(rdp); + return rcu_nocb_do_flush_bypass(rdp, rhp, j); +} + +/* + * If the ->nocb_bypass_lock is immediately available, flush the + * ->nocb_bypass queue into ->cblist. + */ +static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j) +{ + rcu_lockdep_assert_cblist_protected(rdp); + if (!rcu_rdp_is_offloaded(rdp) || + !rcu_nocb_bypass_trylock(rdp)) + return; + WARN_ON_ONCE(!rcu_nocb_do_flush_bypass(rdp, NULL, j)); +} + +/* + * See whether it is appropriate to use the ->nocb_bypass list in order + * to control contention on ->nocb_lock. A limited number of direct + * enqueues are permitted into ->cblist per jiffy. If ->nocb_bypass + * is non-empty, further callbacks must be placed into ->nocb_bypass, + * otherwise rcu_barrier() breaks. Use rcu_nocb_flush_bypass() to switch + * back to direct use of ->cblist. However, ->nocb_bypass should not be + * used if ->cblist is empty, because otherwise callbacks can be stranded + * on ->nocb_bypass because we cannot count on the current CPU ever again + * invoking call_rcu(). The general rule is that if ->nocb_bypass is + * non-empty, the corresponding no-CBs grace-period kthread must not be + * in an indefinite sleep state. + * + * Finally, it is not permitted to use the bypass during early boot, + * as doing so would confuse the auto-initialization code. Besides + * which, there is no point in worrying about lock contention while + * there is only one CPU in operation. 
+ */ +static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, + bool *was_alldone, unsigned long flags) +{ + unsigned long c; + unsigned long cur_gp_seq; + unsigned long j = jiffies; + long ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); + + lockdep_assert_irqs_disabled(); + + // Pure softirq/rcuc based processing: no bypassing, no + // locking. + if (!rcu_rdp_is_offloaded(rdp)) { + *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); + return false; + } + + // In the process of (de-)offloading: no bypassing, but + // locking. + if (!rcu_segcblist_completely_offloaded(&rdp->cblist)) { + rcu_nocb_lock(rdp); + *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); + return false; /* Not offloaded, no bypassing. */ + } + + // Don't use ->nocb_bypass during early boot. + if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING) { + rcu_nocb_lock(rdp); + WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); + *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); + return false; + } + + // If we have advanced to a new jiffy, reset counts to allow + // moving back from ->nocb_bypass to ->cblist. + if (j == rdp->nocb_nobypass_last) { + c = rdp->nocb_nobypass_count + 1; + } else { + WRITE_ONCE(rdp->nocb_nobypass_last, j); + c = rdp->nocb_nobypass_count - nocb_nobypass_lim_per_jiffy; + if (ULONG_CMP_LT(rdp->nocb_nobypass_count, + nocb_nobypass_lim_per_jiffy)) + c = 0; + else if (c > nocb_nobypass_lim_per_jiffy) + c = nocb_nobypass_lim_per_jiffy; + } + WRITE_ONCE(rdp->nocb_nobypass_count, c); + + // If there hasn't yet been all that many ->cblist enqueues + // this jiffy, tell the caller to enqueue onto ->cblist. But flush + // ->nocb_bypass first. 
+ if (rdp->nocb_nobypass_count < nocb_nobypass_lim_per_jiffy) { + rcu_nocb_lock(rdp); + *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); + if (*was_alldone) + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, + TPS("FirstQ")); + WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, j)); + WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); + return false; // Caller must enqueue the callback. + } + + // If ->nocb_bypass has been used too long or is too full, + // flush ->nocb_bypass to ->cblist. + if ((ncbs && j != READ_ONCE(rdp->nocb_bypass_first)) || + ncbs >= qhimark) { + rcu_nocb_lock(rdp); + if (!rcu_nocb_flush_bypass(rdp, rhp, j)) { + *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); + if (*was_alldone) + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, + TPS("FirstQ")); + WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); + return false; // Caller must enqueue the callback. + } + if (j != rdp->nocb_gp_adv_time && + rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) && + rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) { + rcu_advance_cbs_nowake(rdp->mynode, rdp); + rdp->nocb_gp_adv_time = j; + } + rcu_nocb_unlock_irqrestore(rdp, flags); + return true; // Callback already enqueued. + } + + // We need to use the bypass. + rcu_nocb_wait_contended(rdp); + rcu_nocb_bypass_lock(rdp); + ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); + rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */ + rcu_cblist_enqueue(&rdp->nocb_bypass, rhp); + if (!ncbs) { + WRITE_ONCE(rdp->nocb_bypass_first, j); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ")); + } + rcu_nocb_bypass_unlock(rdp); + smp_mb(); /* Order enqueue before wake. */ + if (ncbs) { + local_irq_restore(flags); + } else { + // No-CBs GP kthread might be indefinitely asleep, if so, wake. + rcu_nocb_lock(rdp); // Rare during call_rcu() flood. 
+ if (!rcu_segcblist_pend_cbs(&rdp->cblist)) { + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, + TPS("FirstBQwake")); + __call_rcu_nocb_wake(rdp, true, flags); + } else { + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, + TPS("FirstBQnoWake")); + rcu_nocb_unlock_irqrestore(rdp, flags); + } + } + return true; // Callback already enqueued. +} + +/* + * Awaken the no-CBs grace-period kthread if needed, either due to it + * legitimately being asleep or due to overload conditions. + * + * If warranted, also wake up the kthread servicing this CPU's queues. + */ +static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, + unsigned long flags) + __releases(rdp->nocb_lock) +{ + unsigned long cur_gp_seq; + unsigned long j; + long len; + struct task_struct *t; + + // If we are being polled or there is no kthread, just leave. + t = READ_ONCE(rdp->nocb_gp_kthread); + if (rcu_nocb_poll || !t) { + rcu_nocb_unlock_irqrestore(rdp, flags); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, + TPS("WakeNotPoll")); + return; + } + // Need to actually do a wakeup. + len = rcu_segcblist_n_cbs(&rdp->cblist); + if (was_alldone) { + rdp->qlen_last_fqs_check = len; + if (!irqs_disabled_flags(flags)) { + /* ... if queue was empty ... */ + rcu_nocb_unlock_irqrestore(rdp, flags); + wake_nocb_gp(rdp, false); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, + TPS("WakeEmpty")); + } else { + rcu_nocb_unlock_irqrestore(rdp, flags); + wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE, + TPS("WakeEmptyIsDeferred")); + } + } else if (len > rdp->qlen_last_fqs_check + qhimark) { + /* ... or if many callbacks queued. */ + rdp->qlen_last_fqs_check = len; + j = jiffies; + if (j != rdp->nocb_gp_adv_time && + rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) && + rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) { + rcu_advance_cbs_nowake(rdp->mynode, rdp); + rdp->nocb_gp_adv_time = j; + } + smp_mb(); /* Enqueue before timer_pending(). 
*/ + if ((rdp->nocb_cb_sleep || + !rcu_segcblist_ready_cbs(&rdp->cblist)) && + !timer_pending(&rdp->nocb_timer)) { + rcu_nocb_unlock_irqrestore(rdp, flags); + wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE, + TPS("WakeOvfIsDeferred")); + } else { + rcu_nocb_unlock_irqrestore(rdp, flags); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot")); + } + } else { + rcu_nocb_unlock_irqrestore(rdp, flags); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot")); + } + return; +} + +/* + * Check if we ignore this rdp. + * + * We check that without holding the nocb lock but + * we make sure not to miss a freshly offloaded rdp + * with the current ordering: + * + * rdp_offload_toggle() nocb_gp_enabled_cb() + * ------------------------- ---------------------------- + * WRITE flags LOCK nocb_gp_lock + * LOCK nocb_gp_lock READ/WRITE nocb_gp_sleep + * READ/WRITE nocb_gp_sleep UNLOCK nocb_gp_lock + * UNLOCK nocb_gp_lock READ flags + */ +static inline bool nocb_gp_enabled_cb(struct rcu_data *rdp) +{ + u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_GP; + + return rcu_segcblist_test_flags(&rdp->cblist, flags); +} + +static inline bool nocb_gp_update_state_deoffloading(struct rcu_data *rdp, + bool *needwake_state) +{ + struct rcu_segcblist *cblist = &rdp->cblist; + + if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) { + if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) { + rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP); + if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) + *needwake_state = true; + } + return false; + } + + /* + * De-offloading. Clear our flag and notify the de-offload worker. + * We will ignore this rdp until it ever gets re-offloaded. 
+ */ + WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)); + rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP); + if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) + *needwake_state = true; + return true; +} + + +/* + * No-CBs GP kthreads come here to wait for additional callbacks to show up + * or for grace periods to end. + */ +static void nocb_gp_wait(struct rcu_data *my_rdp) +{ + bool bypass = false; + long bypass_ncbs; + int __maybe_unused cpu = my_rdp->cpu; + unsigned long cur_gp_seq; + unsigned long flags; + bool gotcbs = false; + unsigned long j = jiffies; + bool needwait_gp = false; // This prevents actual uninitialized use. + bool needwake; + bool needwake_gp; + struct rcu_data *rdp; + struct rcu_node *rnp; + unsigned long wait_gp_seq = 0; // Suppress "use uninitialized" warning. + bool wasempty = false; + + /* + * Each pass through the following loop checks for CBs and for the + * nearest grace period (if any) to wait for next. The CB kthreads + * and the global grace-period kthread are awakened if needed. + */ + WARN_ON_ONCE(my_rdp->nocb_gp_rdp != my_rdp); + for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_cb_rdp) { + bool needwake_state = false; + + if (!nocb_gp_enabled_cb(rdp)) + continue; + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check")); + rcu_nocb_lock_irqsave(rdp, flags); + if (nocb_gp_update_state_deoffloading(rdp, &needwake_state)) { + rcu_nocb_unlock_irqrestore(rdp, flags); + if (needwake_state) + swake_up_one(&rdp->nocb_state_wq); + continue; + } + bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); + if (bypass_ncbs && + (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) || + bypass_ncbs > 2 * qhimark)) { + // Bypass full or old, so flush it. 
+ (void)rcu_nocb_try_flush_bypass(rdp, j); + bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); + } else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) { + rcu_nocb_unlock_irqrestore(rdp, flags); + if (needwake_state) + swake_up_one(&rdp->nocb_state_wq); + continue; /* No callbacks here, try next. */ + } + if (bypass_ncbs) { + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, + TPS("Bypass")); + bypass = true; + } + rnp = rdp->mynode; + + // Advance callbacks if helpful and low contention. + needwake_gp = false; + if (!rcu_segcblist_restempty(&rdp->cblist, + RCU_NEXT_READY_TAIL) || + (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) && + rcu_seq_done(&rnp->gp_seq, cur_gp_seq))) { + raw_spin_lock_rcu_node(rnp); /* irqs disabled. */ + needwake_gp = rcu_advance_cbs(rnp, rdp); + wasempty = rcu_segcblist_restempty(&rdp->cblist, + RCU_NEXT_READY_TAIL); + raw_spin_unlock_rcu_node(rnp); /* irqs disabled. */ + } + // Need to wait on some grace period? + WARN_ON_ONCE(wasempty && + !rcu_segcblist_restempty(&rdp->cblist, + RCU_NEXT_READY_TAIL)); + if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq)) { + if (!needwait_gp || + ULONG_CMP_LT(cur_gp_seq, wait_gp_seq)) + wait_gp_seq = cur_gp_seq; + needwait_gp = true; + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, + TPS("NeedWaitGP")); + } + if (rcu_segcblist_ready_cbs(&rdp->cblist)) { + needwake = rdp->nocb_cb_sleep; + WRITE_ONCE(rdp->nocb_cb_sleep, false); + smp_mb(); /* CB invocation -after- GP end. */ + } else { + needwake = false; + } + rcu_nocb_unlock_irqrestore(rdp, flags); + if (needwake) { + swake_up_one(&rdp->nocb_cb_wq); + gotcbs = true; + } + if (needwake_gp) + rcu_gp_kthread_wake(); + if (needwake_state) + swake_up_one(&rdp->nocb_state_wq); + } + + my_rdp->nocb_gp_bypass = bypass; + my_rdp->nocb_gp_gp = needwait_gp; + my_rdp->nocb_gp_seq = needwait_gp ? 
wait_gp_seq : 0; + + if (bypass && !rcu_nocb_poll) { + // At least one child with non-empty ->nocb_bypass, so set + // timer in order to avoid stranding its callbacks. + wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_BYPASS, + TPS("WakeBypassIsDeferred")); + } + if (rcu_nocb_poll) { + /* Polling, so trace if first poll in the series. */ + if (gotcbs) + trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Poll")); + schedule_timeout_idle(1); + } else if (!needwait_gp) { + /* Wait for callbacks to appear. */ + trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep")); + swait_event_interruptible_exclusive(my_rdp->nocb_gp_wq, + !READ_ONCE(my_rdp->nocb_gp_sleep)); + trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("EndSleep")); + } else { + rnp = my_rdp->mynode; + trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("StartWait")); + swait_event_interruptible_exclusive( + rnp->nocb_gp_wq[rcu_seq_ctr(wait_gp_seq) & 0x1], + rcu_seq_done(&rnp->gp_seq, wait_gp_seq) || + !READ_ONCE(my_rdp->nocb_gp_sleep)); + trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("EndWait")); + } + if (!rcu_nocb_poll) { + raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags); + if (my_rdp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) { + WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); + del_timer(&my_rdp->nocb_timer); + } + WRITE_ONCE(my_rdp->nocb_gp_sleep, true); + raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags); + } + my_rdp->nocb_gp_seq = -1; + WARN_ON(signal_pending(current)); +} + +/* + * No-CBs grace-period-wait kthread. There is one of these per group + * of CPUs, but only once at least one CPU in that group has come online + * at least once since boot. This kthread checks for newly posted + * callbacks from any of the CPUs it is responsible for, waits for a + * grace period, then awakens all of the rcu_nocb_cb_kthread() instances + * that then have callback-invocation work to do. 
+ */
+static int rcu_nocb_gp_kthread(void *arg)
+{
+	struct rcu_data *rdp = arg;
+
+	for (;;) {
+		// ->nocb_gp_loops counts passes; it is printed by
+		// show_rcu_nocb_gp_state().
+		WRITE_ONCE(rdp->nocb_gp_loops, rdp->nocb_gp_loops + 1);
+		nocb_gp_wait(rdp);
+		cond_resched_tasks_rcu_qs();
+	}
+	return 0;
+}
+
+/*
+ * Test the SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_CB flags of @rdp's
+ * callback list; the CB kthread keeps sleeping while this is false.
+ */
+static inline bool nocb_cb_can_run(struct rcu_data *rdp)
+{
+	u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_CB;
+	return rcu_segcblist_test_flags(&rdp->cblist, flags);
+}
+
+/* Wait condition of the CB kthread: allowed to run and not told to sleep. */
+static inline bool nocb_cb_wait_cond(struct rcu_data *rdp)
+{
+	return nocb_cb_can_run(rdp) && !READ_ONCE(rdp->nocb_cb_sleep);
+}
+
+/*
+ * Invoke any ready callbacks from the corresponding no-CBs CPU,
+ * then, if there are no more, wait for more to appear.
+ */
+static void nocb_cb_wait(struct rcu_data *rdp)
+{
+	struct rcu_segcblist *cblist = &rdp->cblist;
+	unsigned long cur_gp_seq;
+	unsigned long flags;
+	bool needwake_state = false;
+	bool needwake_gp = false;
+	bool can_sleep = true;
+	struct rcu_node *rnp = rdp->mynode;
+
+	local_irq_save(flags);
+	rcu_momentary_dyntick_idle();
+	local_irq_restore(flags);
+	/*
+	 * Disable BH to provide the expected environment. Also, when
+	 * transitioning to/from NOCB mode, a self-requeuing callback might
+	 * be invoked from softirq. A short grace period could cause both
+	 * instances of this callback to execute concurrently.
+	 */
+	local_bh_disable();
+	rcu_do_batch(rdp);
+	local_bh_enable();
+	lockdep_assert_irqs_enabled();
+	rcu_nocb_lock_irqsave(rdp, flags);
+	// Only advance callbacks if the rcu_node lock is free (trylock).
+	if (rcu_segcblist_nextgp(cblist, &cur_gp_seq) &&
+	    rcu_seq_done(&rnp->gp_seq, cur_gp_seq) &&
+	    raw_spin_trylock_rcu_node(rnp)) { /* irqs already disabled. */
+		needwake_gp = rcu_advance_cbs(rdp->mynode, rdp);
+		raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
+	}
+
+	if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) {
+		if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) {
+			rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_CB);
+			if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP))
+				needwake_state = true;
+		}
+		// More callbacks are already ready: skip the sleep.
+		if (rcu_segcblist_ready_cbs(cblist))
+			can_sleep = false;
+	} else {
+		/*
+		 * De-offloading. Clear our flag and notify the de-offload worker.
+		 * We won't touch the callbacks and keep sleeping until we ever
+		 * get re-offloaded.
+		 */
+		WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB));
+		rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_CB);
+		if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP))
+			needwake_state = true;
+	}
+
+	WRITE_ONCE(rdp->nocb_cb_sleep, can_sleep);
+
+	if (rdp->nocb_cb_sleep)
+		trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep"));
+
+	rcu_nocb_unlock_irqrestore(rdp, flags);
+	if (needwake_gp)
+		rcu_gp_kthread_wake();
+
+	if (needwake_state)
+		swake_up_one(&rdp->nocb_state_wq);
+
+	// Keep waiting until nocb_cb_can_run() says we may run.
+	do {
+		swait_event_interruptible_exclusive(rdp->nocb_cb_wq,
+						    nocb_cb_wait_cond(rdp));
+
+		// VVV Ensure CB invocation follows _sleep test.
+		if (smp_load_acquire(&rdp->nocb_cb_sleep)) { // ^^^
+			WARN_ON(signal_pending(current));
+			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));
+		}
+	} while (!nocb_cb_can_run(rdp));
+}
+
+/*
+ * Per-rcu_data kthread, but only for no-CBs CPUs. Repeatedly invoke
+ * nocb_cb_wait() to do the dirty work.
+ */
+static int rcu_nocb_cb_kthread(void *arg)
+{
+	struct rcu_data *rdp = arg;
+
+	// Each pass through this loop does one callback batch, and,
+	// if there are no more ready callbacks, waits for them.
+	for (;;) {
+		nocb_cb_wait(rdp);
+		cond_resched_tasks_rcu_qs();
+	}
+	return 0;
+}
+
+/* Is a deferred wakeup of rcu_nocb_kthread() required? 
*/
+static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level)
+{
+	// True when the recorded deferred-wakeup level is at least @level.
+	return READ_ONCE(rdp->nocb_defer_wakeup) >= level;
+}
+
+/*
+ * Do a deferred wakeup of rcu_nocb_kthread().
+ *
+ * Called with rdp_gp->nocb_gp_lock held and interrupts saved in @flags.
+ * When no wakeup at @level is pending, the lock is released here and
+ * false is returned. Otherwise the return value is that of
+ * __wake_nocb_gp().
+ * NOTE(review): on the wakeup path the lock is presumably released inside
+ * __wake_nocb_gp(), which is handed @flags -- confirm against its
+ * definition, which is not visible in this section.
+ */
+static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp_gp,
+					   struct rcu_data *rdp, int level,
+					   unsigned long flags)
+	__releases(rdp_gp->nocb_gp_lock)
+{
+	int ndw;
+	int ret;
+
+	if (!rcu_nocb_need_deferred_wakeup(rdp_gp, level)) {
+		raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
+		return false;
+	}
+
+	ndw = rdp_gp->nocb_defer_wakeup;
+	ret = __wake_nocb_gp(rdp_gp, rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
+	trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake"));
+
+	return ret;
+}
+
+/* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */
+static void do_nocb_deferred_wakeup_timer(struct timer_list *t)
+{
+	unsigned long flags;
+	struct rcu_data *rdp = from_timer(rdp, t, nocb_timer);
+
+	// The timer handler expects to run on a group-leader (GP) rdp.
+	WARN_ON_ONCE(rdp->nocb_gp_rdp != rdp);
+	trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Timer"));
+
+	raw_spin_lock_irqsave(&rdp->nocb_gp_lock, flags);
+	smp_mb__after_spinlock(); /* Timer expire before wakeup. */
+	do_nocb_deferred_wakeup_common(rdp, rdp, RCU_NOCB_WAKE_BYPASS, flags);
+}
+
+/*
+ * Do a deferred wakeup of rcu_nocb_kthread() from fastpath.
+ * This means we do an inexact common-case check. Note that if
+ * we miss, ->nocb_timer will eventually clean things up.
+ */
+static bool do_nocb_deferred_wakeup(struct rcu_data *rdp)
+{
+	unsigned long flags;
+	struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+
+	// Lockless pre-check; rechecked under ->nocb_gp_lock by the callee.
+	if (!rdp_gp || !rcu_nocb_need_deferred_wakeup(rdp_gp, RCU_NOCB_WAKE))
+		return false;
+
+	raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
+	return do_nocb_deferred_wakeup_common(rdp_gp, rdp, RCU_NOCB_WAKE, flags);
+}
+
+/* Run do_nocb_deferred_wakeup() on the current CPU's rcu_data. */
+void rcu_nocb_flush_deferred_wakeup(void)
+{
+	do_nocb_deferred_wakeup(this_cpu_ptr(&rcu_data));
+}
+EXPORT_SYMBOL_GPL(rcu_nocb_flush_deferred_wakeup);
+
+/*
+ * Switch @rdp's callback list to the offloaded (@offload true) or
+ * de-offloaded (@offload false) state by way of rcu_segcblist_offload(),
+ * release ->nocb_lock, then kick the CB kthread's wait queue and, if the
+ * group's GP kthread was marked asleep, wake it too. Always returns 0.
+ */
+static int rdp_offload_toggle(struct rcu_data *rdp,
+			      bool offload, unsigned long flags)
+	__releases(rdp->nocb_lock)
+{
+	struct rcu_segcblist *cblist = &rdp->cblist;
+	struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+	bool wake_gp = false;
+
+	rcu_segcblist_offload(cblist, offload);
+
+	if (rdp->nocb_cb_sleep)
+		rdp->nocb_cb_sleep = false;
+	rcu_nocb_unlock_irqrestore(rdp, flags);
+
+	/*
+	 * Ignore former value of nocb_cb_sleep and force wake up as it could
+	 * have been spuriously set to false already.
+	 */
+	swake_up_one(&rdp->nocb_cb_wq);
+
+	raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
+	if (rdp_gp->nocb_gp_sleep) {
+		rdp_gp->nocb_gp_sleep = false;
+		wake_gp = true;
+	}
+	raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
+
+	// Wake the GP kthread only after ->nocb_gp_lock has been dropped.
+	if (wake_gp)
+		wake_up_process(rdp_gp->nocb_gp_kthread);
+
+	return 0;
+}
+
+/*
+ * De-offload the rcu_data passed in @arg. Invoked through work_on_cpu()
+ * by rcu_nocb_cpu_deoffload(); warns if not running on rdp->cpu.
+ * Returns the value of rdp_offload_toggle().
+ */
+static long rcu_nocb_rdp_deoffload(void *arg)
+{
+	struct rcu_data *rdp = arg;
+	struct rcu_segcblist *cblist = &rdp->cblist;
+	unsigned long flags;
+	int ret;
+
+	WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id());
+
+	pr_info("De-offloading %d\n", rdp->cpu);
+
+	rcu_nocb_lock_irqsave(rdp, flags);
+	/*
+	 * Flush once and for all now. This suffices because we are
+	 * running on the target CPU holding ->nocb_lock (thus having
+	 * interrupts disabled), and because rdp_offload_toggle()
+	 * invokes rcu_segcblist_offload(), which clears SEGCBLIST_OFFLOADED.
+	 * Thus future calls to rcu_segcblist_completely_offloaded() will
+	 * return false, which means that future calls to rcu_nocb_try_bypass()
+	 * will refuse to put anything into the bypass.
+	 */
+	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
+	ret = rdp_offload_toggle(rdp, false, flags);
+	// Wait for both kthreads to acknowledge by clearing their flags.
+	swait_event_exclusive(rdp->nocb_state_wq,
+			      !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB |
+							SEGCBLIST_KTHREAD_GP));
+	/*
+	 * Lock one last time to acquire latest callback updates from kthreads
+	 * so we can later handle callbacks locally without locking.
+	 */
+	rcu_nocb_lock_irqsave(rdp, flags);
+	/*
+	 * Theoretically we could set SEGCBLIST_SOFTIRQ_ONLY after the nocb
+	 * lock is released but how about being paranoid for once?
+	 */
+	rcu_segcblist_set_flags(cblist, SEGCBLIST_SOFTIRQ_ONLY);
+	/*
+	 * With SEGCBLIST_SOFTIRQ_ONLY, we can't use
+	 * rcu_nocb_unlock_irqrestore() anymore.
+	 */
+	raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
+
+	/* Sanity check */
+	WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
+
+
+	return ret;
+}
+
+/*
+ * De-offload callback processing for @cpu.
+ *
+ * Returns 0 if the CPU was not offloaded to begin with or was successfully
+ * de-offloaded (in which case it is also cleared from rcu_nocb_mask),
+ * -EINVAL if the CPU is offline, or otherwise whatever work_on_cpu()
+ * returned. Runs under rcu_state.barrier_mutex and cpus_read_lock().
+ */
+int rcu_nocb_cpu_deoffload(int cpu)
+{
+	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+	int ret = 0;
+
+	mutex_lock(&rcu_state.barrier_mutex);
+	cpus_read_lock();
+	if (rcu_rdp_is_offloaded(rdp)) {
+		if (cpu_online(cpu)) {
+			ret = work_on_cpu(cpu, rcu_nocb_rdp_deoffload, rdp);
+			if (!ret)
+				cpumask_clear_cpu(cpu, rcu_nocb_mask);
+		} else {
+			pr_info("NOCB: Can't CB-deoffload an offline CPU\n");
+			ret = -EINVAL;
+		}
+	}
+	cpus_read_unlock();
+	mutex_unlock(&rcu_state.barrier_mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(rcu_nocb_cpu_deoffload);
+
+/*
+ * (Re-)offload the rcu_data passed in @arg. Invoked through work_on_cpu()
+ * by rcu_nocb_cpu_offload(); warns if not running on rdp->cpu.
+ * Returns -EINVAL when the rdp has no ->nocb_gp_rdp, otherwise the value
+ * of rdp_offload_toggle().
+ */
+static long rcu_nocb_rdp_offload(void *arg)
+{
+	struct rcu_data *rdp = arg;
+	struct rcu_segcblist *cblist = &rdp->cblist;
+	unsigned long flags;
+	int ret;
+
+	WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id());
+	/*
+	 * For now we only support re-offload, ie: the rdp must have been
+	 * offloaded on boot first.
+	 */
+	if (!rdp->nocb_gp_rdp)
+		return -EINVAL;
+
+	pr_info("Offloading %d\n", rdp->cpu);
+	/*
+	 * Can't use rcu_nocb_lock_irqsave() while we are in
+	 * SEGCBLIST_SOFTIRQ_ONLY mode.
+	 */
+	raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+
+	/*
+	 * We didn't take the nocb lock while working on the
+	 * rdp->cblist in SEGCBLIST_SOFTIRQ_ONLY mode.
+	 * Every modification that has been done previously on
+	 * rdp->cblist must be visible remotely by the nocb kthreads
+	 * upon wake up after reading the cblist flags.
+	 *
+	 * The layout against nocb_lock enforces that ordering:
+	 *
+	 *  rcu_nocb_rdp_offload()      nocb_cb_wait()/nocb_gp_wait()
+	 * -------------------------   ----------------------------
+	 *      WRITE callbacks           rcu_nocb_lock()
+	 *      rcu_nocb_lock()           READ flags
+	 *      WRITE flags               READ callbacks
+	 *      rcu_nocb_unlock()         rcu_nocb_unlock()
+	 */
+	ret = rdp_offload_toggle(rdp, true, flags);
+	// Wait for both kthreads to acknowledge by setting their flags.
+	swait_event_exclusive(rdp->nocb_state_wq,
+			      rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB) &&
+			      rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP));
+
+	return ret;
+}
+
+/*
+ * Offload callback processing for @cpu.
+ *
+ * Returns 0 if the CPU was already offloaded or was successfully
+ * offloaded (in which case it is also set in rcu_nocb_mask), -EINVAL if
+ * the CPU is offline, or otherwise whatever work_on_cpu() returned.
+ * Runs under rcu_state.barrier_mutex and cpus_read_lock().
+ */
+int rcu_nocb_cpu_offload(int cpu)
+{
+	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+	int ret = 0;
+
+	mutex_lock(&rcu_state.barrier_mutex);
+	cpus_read_lock();
+	if (!rcu_rdp_is_offloaded(rdp)) {
+		if (cpu_online(cpu)) {
+			ret = work_on_cpu(cpu, rcu_nocb_rdp_offload, rdp);
+			if (!ret)
+				cpumask_set_cpu(cpu, rcu_nocb_mask);
+		} else {
+			pr_info("NOCB: Can't CB-offload an offline CPU\n");
+			ret = -EINVAL;
+		}
+	}
+	cpus_read_unlock();
+	mutex_unlock(&rcu_state.barrier_mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(rcu_nocb_cpu_offload);
+
+/*
+ * Boot-time setup of rcu_nocb_mask: allocate it if nohz_full needs it,
+ * fold in tick_nohz_full_mask, trim it to cpu_possible_mask, report the
+ * result, mark each listed CPU's callback list as offloaded, and finally
+ * set up the GP/CB kthread relationships.
+ */
+void __init rcu_init_nohz(void)
+{
+	int cpu;
+	bool need_rcu_nocb_mask = false;
+	struct rcu_data *rdp;
+
+#if defined(CONFIG_NO_HZ_FULL)
+	if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask))
+		need_rcu_nocb_mask = true;
+#endif /* #if defined(CONFIG_NO_HZ_FULL) */
+
+	if (!cpumask_available(rcu_nocb_mask) && need_rcu_nocb_mask) {
+		if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) {
+			pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n");
+			return;
+		}
+	}
+	// No mask at all means nothing is offloaded.
+	if (!cpumask_available(rcu_nocb_mask))
+		return;
+
+#if defined(CONFIG_NO_HZ_FULL)
+	if (tick_nohz_full_running)
+		cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask);
+#endif /* #if defined(CONFIG_NO_HZ_FULL) */
+
+	if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
+		pr_info("\tNote: kernel parameter 'rcu_nocbs=', 'nohz_full', or 'isolcpus=' contains nonexistent CPUs.\n");
+		cpumask_and(rcu_nocb_mask, cpu_possible_mask,
+			    rcu_nocb_mask);
+	}
+	if (cpumask_empty(rcu_nocb_mask))
+		pr_info("\tOffload RCU callbacks from CPUs: (none).\n");
+	else
+		pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n",
+			cpumask_pr_args(rcu_nocb_mask));
+	if (rcu_nocb_poll)
+		pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
+
+	for_each_cpu(cpu, rcu_nocb_mask) {
+		rdp = per_cpu_ptr(&rcu_data, cpu);
+		if (rcu_segcblist_empty(&rdp->cblist))
+			rcu_segcblist_init(&rdp->cblist);
+		rcu_segcblist_offload(&rdp->cblist, true);
+		rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB);
+		rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP);
+	}
+	rcu_organize_nocb_kthreads();
+}
+
+/* Initialize per-rcu_data variables for no-CBs CPUs. */
+static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
+{
+	init_swait_queue_head(&rdp->nocb_cb_wq);
+	init_swait_queue_head(&rdp->nocb_gp_wq);
+	init_swait_queue_head(&rdp->nocb_state_wq);
+	raw_spin_lock_init(&rdp->nocb_lock);
+	raw_spin_lock_init(&rdp->nocb_bypass_lock);
+	raw_spin_lock_init(&rdp->nocb_gp_lock);
+	timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0);
+	rcu_cblist_init(&rdp->nocb_bypass);
+}
+
+/*
+ * If the specified CPU is a no-CBs CPU that does not already have its
+ * rcuo CB kthread, spawn it. Additionally, if the rcuo GP kthread
+ * for this CPU's group has not yet been created, spawn it as well.
+ */
+static void rcu_spawn_one_nocb_kthread(int cpu)
+{
+	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+	struct rcu_data *rdp_gp;
+	struct task_struct *t;
+
+	/*
+	 * If this isn't a no-CBs CPU or if it already has an rcuo kthread,
+	 * then nothing to do.
+	 */
+	if (!rcu_is_nocb_cpu(cpu) || rdp->nocb_cb_kthread)
+		return;
+
+	/* If we didn't spawn the GP kthread first, reorganize! */
+	rdp_gp = rdp->nocb_gp_rdp;
+	if (!rdp_gp->nocb_gp_kthread) {
+		t = kthread_run(rcu_nocb_gp_kthread, rdp_gp,
+				"rcuog/%d", rdp_gp->cpu);
+		// On failure, warn once and leave this CPU without kthreads.
+		if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__))
+			return;
+		WRITE_ONCE(rdp_gp->nocb_gp_kthread, t);
+	}
+
+	/* Spawn the kthread for this CPU. */
+	t = kthread_run(rcu_nocb_cb_kthread, rdp,
+			"rcuo%c/%d", rcu_state.abbr, cpu);
+	if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__))
+		return;
+	WRITE_ONCE(rdp->nocb_cb_kthread, t);
+	// Record the group's GP kthread in this CPU's rcu_data as well.
+	WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread);
+}
+
+/*
+ * If the specified CPU is a no-CBs CPU that does not already have its
+ * rcuo kthread, spawn it. Does nothing until rcu_scheduler_fully_active
+ * is set.
+ */
+static void rcu_spawn_cpu_nocb_kthread(int cpu)
+{
+	if (rcu_scheduler_fully_active)
+		rcu_spawn_one_nocb_kthread(cpu);
+}
+
+/*
+ * Once the scheduler is running, spawn rcuo kthreads for all online
+ * no-CBs CPUs. This assumes that the early_initcall()s happen before
+ * non-boot CPUs come online -- if this changes, we will need to add
+ * some mutual exclusion.
+ */
+static void __init rcu_spawn_nocb_kthreads(void)
+{
+	int cpu;
+
+	for_each_online_cpu(cpu)
+		rcu_spawn_cpu_nocb_kthread(cpu);
+}
+
+/* How many CB CPU IDs per GP kthread? Default of -1 for sqrt(nr_cpu_ids). */
+static int rcu_nocb_gp_stride = -1;
+module_param(rcu_nocb_gp_stride, int, 0444);
+
+/*
+ * Initialize GP-CB relationships for all no-CBs CPU.
+ */
+static void __init rcu_organize_nocb_kthreads(void)
+{
+	int cpu;
+	bool firsttime = true;
+	bool gotnocbs = false;
+	bool gotnocbscbs = true;
+	int ls = rcu_nocb_gp_stride;
+	int nl = 0; /* Next GP kthread. */
+	struct rcu_data *rdp;
+	struct rcu_data *rdp_gp = NULL; /* Suppress misguided gcc warn. */
+	struct rcu_data *rdp_prev = NULL;
+
+	if (!cpumask_available(rcu_nocb_mask))
+		return;
+	// Default stride: nr_cpu_ids / int_sqrt(nr_cpu_ids), written back
+	// to the module parameter.
+	if (ls == -1) {
+		ls = nr_cpu_ids / int_sqrt(nr_cpu_ids);
+		rcu_nocb_gp_stride = ls;
+	}
+
+	/*
+	 * Each pass through this loop sets up one rcu_data structure.
+	 * Should the corresponding CPU come online in the future, then
+	 * we will spawn the needed set of rcu_nocb_kthread() kthreads.
+	 */
+	for_each_cpu(cpu, rcu_nocb_mask) {
+		rdp = per_cpu_ptr(&rcu_data, cpu);
+		if (rdp->cpu >= nl) {
+			/* New GP kthread, set up for CBs & next GP. */
+			gotnocbs = true;
+			// Next group starts at the first multiple of ls
+			// above this CPU number.
+			nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls;
+			rdp->nocb_gp_rdp = rdp;
+			rdp_gp = rdp;
+			if (dump_tree) {
+				if (!firsttime)
+					pr_cont("%s\n", gotnocbscbs
+							? "" : " (self only)");
+				gotnocbscbs = false;
+				firsttime = false;
+				pr_alert("%s: No-CB GP kthread CPU %d:",
+					 __func__, cpu);
+			}
+		} else {
+			/* Another CB kthread, link to previous GP kthread. */
+			// Not reachable on the first pass (nl starts at 0),
+			// so rdp_gp and rdp_prev are non-NULL here.
+			gotnocbscbs = true;
+			rdp->nocb_gp_rdp = rdp_gp;
+			rdp_prev->nocb_next_cb_rdp = rdp;
+			if (dump_tree)
+				pr_cont(" %d", cpu);
+		}
+		rdp_prev = rdp;
+	}
+	if (gotnocbs && dump_tree)
+		pr_cont("%s\n", gotnocbscbs ? "" : " (self only)");
+}
+
+/*
+ * Bind the current task to the offloaded CPUs. If there are no offloaded
+ * CPUs, leave the task unbound. Splat if the bind attempt fails.
+ */
+void rcu_bind_current_to_nocb(void)
+{
+	if (cpumask_available(rcu_nocb_mask) && cpumask_weight(rcu_nocb_mask))
+		WARN_ON(sched_setaffinity(current->pid, rcu_nocb_mask));
+}
+EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb);
+
+// The ->on_cpu field is available only in CONFIG_SMP=y, so... 
+#ifdef CONFIG_SMP +static char *show_rcu_should_be_on_cpu(struct task_struct *tsp) +{ + return tsp && task_is_running(tsp) && !tsp->on_cpu ? "!" : ""; +} +#else // #ifdef CONFIG_SMP +static char *show_rcu_should_be_on_cpu(struct task_struct *tsp) +{ + return ""; +} +#endif // #else #ifdef CONFIG_SMP + +/* + * Dump out nocb grace-period kthread state for the specified rcu_data + * structure. + */ +static void show_rcu_nocb_gp_state(struct rcu_data *rdp) +{ + struct rcu_node *rnp = rdp->mynode; + + pr_info("nocb GP %d %c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu %c CPU %d%s\n", + rdp->cpu, + "kK"[!!rdp->nocb_gp_kthread], + "lL"[raw_spin_is_locked(&rdp->nocb_gp_lock)], + "dD"[!!rdp->nocb_defer_wakeup], + "tT"[timer_pending(&rdp->nocb_timer)], + "sS"[!!rdp->nocb_gp_sleep], + ".W"[swait_active(&rdp->nocb_gp_wq)], + ".W"[swait_active(&rnp->nocb_gp_wq[0])], + ".W"[swait_active(&rnp->nocb_gp_wq[1])], + ".B"[!!rdp->nocb_gp_bypass], + ".G"[!!rdp->nocb_gp_gp], + (long)rdp->nocb_gp_seq, + rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops), + rdp->nocb_gp_kthread ? task_state_to_char(rdp->nocb_gp_kthread) : '.', + rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1, + show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread)); +} + +/* Dump out nocb kthread state for the specified rcu_data structure. */ +static void show_rcu_nocb_state(struct rcu_data *rdp) +{ + char bufw[20]; + char bufr[20]; + struct rcu_segcblist *rsclp = &rdp->cblist; + bool waslocked; + bool wassleep; + + if (rdp->nocb_gp_rdp == rdp) + show_rcu_nocb_gp_state(rdp); + + sprintf(bufw, "%ld", rsclp->gp_seq[RCU_WAIT_TAIL]); + sprintf(bufr, "%ld", rsclp->gp_seq[RCU_NEXT_READY_TAIL]); + pr_info(" CB %d^%d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n", + rdp->cpu, rdp->nocb_gp_rdp->cpu, + rdp->nocb_next_cb_rdp ? 
rdp->nocb_next_cb_rdp->cpu : -1, + "kK"[!!rdp->nocb_cb_kthread], + "bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)], + "cC"[!!atomic_read(&rdp->nocb_lock_contended)], + "lL"[raw_spin_is_locked(&rdp->nocb_lock)], + "sS"[!!rdp->nocb_cb_sleep], + ".W"[swait_active(&rdp->nocb_cb_wq)], + jiffies - rdp->nocb_bypass_first, + jiffies - rdp->nocb_nobypass_last, + rdp->nocb_nobypass_count, + ".D"[rcu_segcblist_ready_cbs(rsclp)], + ".W"[!rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL)], + rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL) ? "" : bufw, + ".R"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL)], + rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL) ? "" : bufr, + ".N"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_TAIL)], + ".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)], + rcu_segcblist_n_cbs(&rdp->cblist), + rdp->nocb_cb_kthread ? task_state_to_char(rdp->nocb_cb_kthread) : '.', + rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1, + show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread)); + + /* It is OK for GP kthreads to have GP state. */ + if (rdp->nocb_gp_rdp == rdp) + return; + + waslocked = raw_spin_is_locked(&rdp->nocb_gp_lock); + wassleep = swait_active(&rdp->nocb_gp_wq); + if (!rdp->nocb_gp_sleep && !waslocked && !wassleep) + return; /* Nothing untoward. */ + + pr_info(" nocb GP activity on CB-only CPU!!! %c%c%c %c\n", + "lL"[waslocked], + "dD"[!!rdp->nocb_defer_wakeup], + "sS"[!!rdp->nocb_gp_sleep], + ".W"[wassleep]); +} + +#else /* #ifdef CONFIG_RCU_NOCB_CPU */ + +static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp) +{ + return 0; +} + +static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp) +{ + return false; +} + +/* No ->nocb_lock to acquire. */ +static void rcu_nocb_lock(struct rcu_data *rdp) +{ +} + +/* No ->nocb_lock to release. */ +static void rcu_nocb_unlock(struct rcu_data *rdp) +{ +} + +/* No ->nocb_lock to release. 
*/ +static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp, + unsigned long flags) +{ + local_irq_restore(flags); +} + +/* Lockdep check that ->cblist may be safely accessed. */ +static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp) +{ + lockdep_assert_irqs_disabled(); +} + +static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) +{ +} + +static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) +{ + return NULL; +} + +static void rcu_init_one_nocb(struct rcu_node *rnp) +{ +} + +static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, + unsigned long j) +{ + return true; +} + +static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, + bool *was_alldone, unsigned long flags) +{ + return false; +} + +static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty, + unsigned long flags) +{ + WARN_ON_ONCE(1); /* Should be dead code! */ +} + +static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) +{ +} + +static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level) +{ + return false; +} + +static bool do_nocb_deferred_wakeup(struct rcu_data *rdp) +{ + return false; +} + +static void rcu_spawn_cpu_nocb_kthread(int cpu) +{ +} + +static void __init rcu_spawn_nocb_kthreads(void) +{ +} + +static void show_rcu_nocb_state(struct rcu_data *rdp) +{ +} + +#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index de1dc3bb7f70..d070059163d7 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -13,39 +13,6 @@ #include "../locking/rtmutex_common.h" -#ifdef CONFIG_RCU_NOCB_CPU -static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ -static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. 
*/ -static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp) -{ - return lockdep_is_held(&rdp->nocb_lock); -} - -static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp) -{ - /* Race on early boot between thread creation and assignment */ - if (!rdp->nocb_cb_kthread || !rdp->nocb_gp_kthread) - return true; - - if (current == rdp->nocb_cb_kthread || current == rdp->nocb_gp_kthread) - if (in_task()) - return true; - return false; -} - -#else -static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp) -{ - return 0; -} - -static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp) -{ - return false; -} - -#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ - static bool rcu_rdp_is_offloaded(struct rcu_data *rdp) { /* @@ -346,7 +313,7 @@ void rcu_note_context_switch(bool preempt) trace_rcu_utilization(TPS("Start context switch")); lockdep_assert_irqs_disabled(); - WARN_ON_ONCE(!preempt && rcu_preempt_depth() > 0); + WARN_ONCE(!preempt && rcu_preempt_depth() > 0, "Voluntary context switch within RCU read-side critical section!"); if (rcu_preempt_depth() > 0 && !t->rcu_read_unlock_special.b.blocked) { @@ -405,17 +372,20 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) static void rcu_preempt_read_enter(void) { - current->rcu_read_lock_nesting++; + WRITE_ONCE(current->rcu_read_lock_nesting, READ_ONCE(current->rcu_read_lock_nesting) + 1); } static int rcu_preempt_read_exit(void) { - return --current->rcu_read_lock_nesting; + int ret = READ_ONCE(current->rcu_read_lock_nesting) - 1; + + WRITE_ONCE(current->rcu_read_lock_nesting, ret); + return ret; } static void rcu_preempt_depth_set(int val) { - current->rcu_read_lock_nesting = val; + WRITE_ONCE(current->rcu_read_lock_nesting, val); } /* @@ -559,7 +529,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) WRITE_ONCE(rnp->exp_tasks, np); if (IS_ENABLED(CONFIG_RCU_BOOST)) { /* Snapshot ->boost_mtx ownership w/rnp->lock held. 
*/ - drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t; + drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx.rtmutex) == t; if (&t->rcu_node_entry == rnp->boost_tasks) WRITE_ONCE(rnp->boost_tasks, np); } @@ -586,7 +556,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) /* Unboost if we were boosted. */ if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex) - rt_mutex_futex_unlock(&rnp->boost_mtx); + rt_mutex_futex_unlock(&rnp->boost_mtx.rtmutex); /* * If this was the last task on the expedited lists, @@ -1083,7 +1053,7 @@ static int rcu_boost(struct rcu_node *rnp) * section. */ t = container_of(tb, struct task_struct, rcu_node_entry); - rt_mutex_init_proxy_locked(&rnp->boost_mtx, t); + rt_mutex_init_proxy_locked(&rnp->boost_mtx.rtmutex, t); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* Lock only for side effect: boosts task t's priority. */ rt_mutex_lock(&rnp->boost_mtx); @@ -1479,1460 +1449,6 @@ static void rcu_cleanup_after_idle(void) #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ -#ifdef CONFIG_RCU_NOCB_CPU - -/* - * Offload callback processing from the boot-time-specified set of CPUs - * specified by rcu_nocb_mask. For the CPUs in the set, there are kthreads - * created that pull the callbacks from the corresponding CPU, wait for - * a grace period to elapse, and invoke the callbacks. These kthreads - * are organized into GP kthreads, which manage incoming callbacks, wait for - * grace periods, and awaken CB kthreads, and the CB kthreads, which only - * invoke callbacks. Each GP kthread invokes its own CBs. The no-CBs CPUs - * do a wake_up() on their GP kthread when they insert a callback into any - * empty list, unless the rcu_nocb_poll boot parameter has been specified, - * in which case each kthread actively polls its CPU. (Which isn't so great - * for energy efficiency, but which does reduce RCU's overhead on that CPU.) 
- * - * This is intended to be used in conjunction with Frederic Weisbecker's - * adaptive-idle work, which would seriously reduce OS jitter on CPUs - * running CPU-bound user-mode computations. - * - * Offloading of callbacks can also be used as an energy-efficiency - * measure because CPUs with no RCU callbacks queued are more aggressive - * about entering dyntick-idle mode. - */ - - -/* - * Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. - * If the list is invalid, a warning is emitted and all CPUs are offloaded. - */ -static int __init rcu_nocb_setup(char *str) -{ - alloc_bootmem_cpumask_var(&rcu_nocb_mask); - if (cpulist_parse(str, rcu_nocb_mask)) { - pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n"); - cpumask_setall(rcu_nocb_mask); - } - return 1; -} -__setup("rcu_nocbs=", rcu_nocb_setup); - -static int __init parse_rcu_nocb_poll(char *arg) -{ - rcu_nocb_poll = true; - return 0; -} -early_param("rcu_nocb_poll", parse_rcu_nocb_poll); - -/* - * Don't bother bypassing ->cblist if the call_rcu() rate is low. - * After all, the main point of bypassing is to avoid lock contention - * on ->nocb_lock, which only can happen at high call_rcu() rates. - */ -static int nocb_nobypass_lim_per_jiffy = 16 * 1000 / HZ; -module_param(nocb_nobypass_lim_per_jiffy, int, 0); - -/* - * Acquire the specified rcu_data structure's ->nocb_bypass_lock. If the - * lock isn't immediately available, increment ->nocb_lock_contended to - * flag the contention. - */ -static void rcu_nocb_bypass_lock(struct rcu_data *rdp) - __acquires(&rdp->nocb_bypass_lock) -{ - lockdep_assert_irqs_disabled(); - if (raw_spin_trylock(&rdp->nocb_bypass_lock)) - return; - atomic_inc(&rdp->nocb_lock_contended); - WARN_ON_ONCE(smp_processor_id() != rdp->cpu); - smp_mb__after_atomic(); /* atomic_inc() before lock. */ - raw_spin_lock(&rdp->nocb_bypass_lock); - smp_mb__before_atomic(); /* atomic_dec() after lock. 
*/ - atomic_dec(&rdp->nocb_lock_contended); -} - -/* - * Spinwait until the specified rcu_data structure's ->nocb_lock is - * not contended. Please note that this is extremely special-purpose, - * relying on the fact that at most two kthreads and one CPU contend for - * this lock, and also that the two kthreads are guaranteed to have frequent - * grace-period-duration time intervals between successive acquisitions - * of the lock. This allows us to use an extremely simple throttling - * mechanism, and further to apply it only to the CPU doing floods of - * call_rcu() invocations. Don't try this at home! - */ -static void rcu_nocb_wait_contended(struct rcu_data *rdp) -{ - WARN_ON_ONCE(smp_processor_id() != rdp->cpu); - while (WARN_ON_ONCE(atomic_read(&rdp->nocb_lock_contended))) - cpu_relax(); -} - -/* - * Conditionally acquire the specified rcu_data structure's - * ->nocb_bypass_lock. - */ -static bool rcu_nocb_bypass_trylock(struct rcu_data *rdp) -{ - lockdep_assert_irqs_disabled(); - return raw_spin_trylock(&rdp->nocb_bypass_lock); -} - -/* - * Release the specified rcu_data structure's ->nocb_bypass_lock. - */ -static void rcu_nocb_bypass_unlock(struct rcu_data *rdp) - __releases(&rdp->nocb_bypass_lock) -{ - lockdep_assert_irqs_disabled(); - raw_spin_unlock(&rdp->nocb_bypass_lock); -} - -/* - * Acquire the specified rcu_data structure's ->nocb_lock, but only - * if it corresponds to a no-CBs CPU. - */ -static void rcu_nocb_lock(struct rcu_data *rdp) -{ - lockdep_assert_irqs_disabled(); - if (!rcu_rdp_is_offloaded(rdp)) - return; - raw_spin_lock(&rdp->nocb_lock); -} - -/* - * Release the specified rcu_data structure's ->nocb_lock, but only - * if it corresponds to a no-CBs CPU. 
- */ -static void rcu_nocb_unlock(struct rcu_data *rdp) -{ - if (rcu_rdp_is_offloaded(rdp)) { - lockdep_assert_irqs_disabled(); - raw_spin_unlock(&rdp->nocb_lock); - } -} - -/* - * Release the specified rcu_data structure's ->nocb_lock and restore - * interrupts, but only if it corresponds to a no-CBs CPU. - */ -static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp, - unsigned long flags) -{ - if (rcu_rdp_is_offloaded(rdp)) { - lockdep_assert_irqs_disabled(); - raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); - } else { - local_irq_restore(flags); - } -} - -/* Lockdep check that ->cblist may be safely accessed. */ -static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp) -{ - lockdep_assert_irqs_disabled(); - if (rcu_rdp_is_offloaded(rdp)) - lockdep_assert_held(&rdp->nocb_lock); -} - -/* - * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended - * grace period. - */ -static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) -{ - swake_up_all(sq); -} - -static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) -{ - return &rnp->nocb_gp_wq[rcu_seq_ctr(rnp->gp_seq) & 0x1]; -} - -static void rcu_init_one_nocb(struct rcu_node *rnp) -{ - init_swait_queue_head(&rnp->nocb_gp_wq[0]); - init_swait_queue_head(&rnp->nocb_gp_wq[1]); -} - -/* Is the specified CPU a no-CBs CPU? 
*/ -bool rcu_is_nocb_cpu(int cpu) -{ - if (cpumask_available(rcu_nocb_mask)) - return cpumask_test_cpu(cpu, rcu_nocb_mask); - return false; -} - -static bool __wake_nocb_gp(struct rcu_data *rdp_gp, - struct rcu_data *rdp, - bool force, unsigned long flags) - __releases(rdp_gp->nocb_gp_lock) -{ - bool needwake = false; - - if (!READ_ONCE(rdp_gp->nocb_gp_kthread)) { - raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, - TPS("AlreadyAwake")); - return false; - } - - if (rdp_gp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) { - WRITE_ONCE(rdp_gp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); - del_timer(&rdp_gp->nocb_timer); - } - - if (force || READ_ONCE(rdp_gp->nocb_gp_sleep)) { - WRITE_ONCE(rdp_gp->nocb_gp_sleep, false); - needwake = true; - } - raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); - if (needwake) { - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake")); - wake_up_process(rdp_gp->nocb_gp_kthread); - } - - return needwake; -} - -/* - * Kick the GP kthread for this NOCB group. - */ -static bool wake_nocb_gp(struct rcu_data *rdp, bool force) -{ - unsigned long flags; - struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; - - raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); - return __wake_nocb_gp(rdp_gp, rdp, force, flags); -} - -/* - * Arrange to wake the GP kthread for this NOCB group at some future - * time when it is safe to do so. - */ -static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype, - const char *reason) -{ - unsigned long flags; - struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; - - raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); - - /* - * Bypass wakeup overrides previous deferments. In case - * of callback storm, no need to wake up too early. 
- */ - if (waketype == RCU_NOCB_WAKE_BYPASS) { - mod_timer(&rdp_gp->nocb_timer, jiffies + 2); - WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype); - } else { - if (rdp_gp->nocb_defer_wakeup < RCU_NOCB_WAKE) - mod_timer(&rdp_gp->nocb_timer, jiffies + 1); - if (rdp_gp->nocb_defer_wakeup < waketype) - WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype); - } - - raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); - - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason); -} - -/* - * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL. - * However, if there is a callback to be enqueued and if ->nocb_bypass - * proves to be initially empty, just return false because the no-CB GP - * kthread may need to be awakened in this case. - * - * Note that this function always returns true if rhp is NULL. - */ -static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, - unsigned long j) -{ - struct rcu_cblist rcl; - - WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp)); - rcu_lockdep_assert_cblist_protected(rdp); - lockdep_assert_held(&rdp->nocb_bypass_lock); - if (rhp && !rcu_cblist_n_cbs(&rdp->nocb_bypass)) { - raw_spin_unlock(&rdp->nocb_bypass_lock); - return false; - } - /* Note: ->cblist.len already accounts for ->nocb_bypass contents. */ - if (rhp) - rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */ - rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, rhp); - rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rcl); - WRITE_ONCE(rdp->nocb_bypass_first, j); - rcu_nocb_bypass_unlock(rdp); - return true; -} - -/* - * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL. - * However, if there is a callback to be enqueued and if ->nocb_bypass - * proves to be initially empty, just return false because the no-CB GP - * kthread may need to be awakened in this case. - * - * Note that this function always returns true if rhp is NULL. 
- */ -static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, - unsigned long j) -{ - if (!rcu_rdp_is_offloaded(rdp)) - return true; - rcu_lockdep_assert_cblist_protected(rdp); - rcu_nocb_bypass_lock(rdp); - return rcu_nocb_do_flush_bypass(rdp, rhp, j); -} - -/* - * If the ->nocb_bypass_lock is immediately available, flush the - * ->nocb_bypass queue into ->cblist. - */ -static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j) -{ - rcu_lockdep_assert_cblist_protected(rdp); - if (!rcu_rdp_is_offloaded(rdp) || - !rcu_nocb_bypass_trylock(rdp)) - return; - WARN_ON_ONCE(!rcu_nocb_do_flush_bypass(rdp, NULL, j)); -} - -/* - * See whether it is appropriate to use the ->nocb_bypass list in order - * to control contention on ->nocb_lock. A limited number of direct - * enqueues are permitted into ->cblist per jiffy. If ->nocb_bypass - * is non-empty, further callbacks must be placed into ->nocb_bypass, - * otherwise rcu_barrier() breaks. Use rcu_nocb_flush_bypass() to switch - * back to direct use of ->cblist. However, ->nocb_bypass should not be - * used if ->cblist is empty, because otherwise callbacks can be stranded - * on ->nocb_bypass because we cannot count on the current CPU ever again - * invoking call_rcu(). The general rule is that if ->nocb_bypass is - * non-empty, the corresponding no-CBs grace-period kthread must not be - * in an indefinite sleep state. - * - * Finally, it is not permitted to use the bypass during early boot, - * as doing so would confuse the auto-initialization code. Besides - * which, there is no point in worrying about lock contention while - * there is only one CPU in operation. 
- */ -static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, - bool *was_alldone, unsigned long flags) -{ - unsigned long c; - unsigned long cur_gp_seq; - unsigned long j = jiffies; - long ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); - - lockdep_assert_irqs_disabled(); - - // Pure softirq/rcuc based processing: no bypassing, no - // locking. - if (!rcu_rdp_is_offloaded(rdp)) { - *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); - return false; - } - - // In the process of (de-)offloading: no bypassing, but - // locking. - if (!rcu_segcblist_completely_offloaded(&rdp->cblist)) { - rcu_nocb_lock(rdp); - *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); - return false; /* Not offloaded, no bypassing. */ - } - - // Don't use ->nocb_bypass during early boot. - if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING) { - rcu_nocb_lock(rdp); - WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); - *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); - return false; - } - - // If we have advanced to a new jiffy, reset counts to allow - // moving back from ->nocb_bypass to ->cblist. - if (j == rdp->nocb_nobypass_last) { - c = rdp->nocb_nobypass_count + 1; - } else { - WRITE_ONCE(rdp->nocb_nobypass_last, j); - c = rdp->nocb_nobypass_count - nocb_nobypass_lim_per_jiffy; - if (ULONG_CMP_LT(rdp->nocb_nobypass_count, - nocb_nobypass_lim_per_jiffy)) - c = 0; - else if (c > nocb_nobypass_lim_per_jiffy) - c = nocb_nobypass_lim_per_jiffy; - } - WRITE_ONCE(rdp->nocb_nobypass_count, c); - - // If there hasn't yet been all that many ->cblist enqueues - // this jiffy, tell the caller to enqueue onto ->cblist. But flush - // ->nocb_bypass first. 
- if (rdp->nocb_nobypass_count < nocb_nobypass_lim_per_jiffy) { - rcu_nocb_lock(rdp); - *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); - if (*was_alldone) - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, - TPS("FirstQ")); - WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, j)); - WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); - return false; // Caller must enqueue the callback. - } - - // If ->nocb_bypass has been used too long or is too full, - // flush ->nocb_bypass to ->cblist. - if ((ncbs && j != READ_ONCE(rdp->nocb_bypass_first)) || - ncbs >= qhimark) { - rcu_nocb_lock(rdp); - if (!rcu_nocb_flush_bypass(rdp, rhp, j)) { - *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); - if (*was_alldone) - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, - TPS("FirstQ")); - WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); - return false; // Caller must enqueue the callback. - } - if (j != rdp->nocb_gp_adv_time && - rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) && - rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) { - rcu_advance_cbs_nowake(rdp->mynode, rdp); - rdp->nocb_gp_adv_time = j; - } - rcu_nocb_unlock_irqrestore(rdp, flags); - return true; // Callback already enqueued. - } - - // We need to use the bypass. - rcu_nocb_wait_contended(rdp); - rcu_nocb_bypass_lock(rdp); - ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); - rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */ - rcu_cblist_enqueue(&rdp->nocb_bypass, rhp); - if (!ncbs) { - WRITE_ONCE(rdp->nocb_bypass_first, j); - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ")); - } - rcu_nocb_bypass_unlock(rdp); - smp_mb(); /* Order enqueue before wake. */ - if (ncbs) { - local_irq_restore(flags); - } else { - // No-CBs GP kthread might be indefinitely asleep, if so, wake. - rcu_nocb_lock(rdp); // Rare during call_rcu() flood. 
- if (!rcu_segcblist_pend_cbs(&rdp->cblist)) { - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, - TPS("FirstBQwake")); - __call_rcu_nocb_wake(rdp, true, flags); - } else { - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, - TPS("FirstBQnoWake")); - rcu_nocb_unlock_irqrestore(rdp, flags); - } - } - return true; // Callback already enqueued. -} - -/* - * Awaken the no-CBs grace-period kthread if needed, either due to it - * legitimately being asleep or due to overload conditions. - * - * If warranted, also wake up the kthread servicing this CPUs queues. - */ -static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, - unsigned long flags) - __releases(rdp->nocb_lock) -{ - unsigned long cur_gp_seq; - unsigned long j; - long len; - struct task_struct *t; - - // If we are being polled or there is no kthread, just leave. - t = READ_ONCE(rdp->nocb_gp_kthread); - if (rcu_nocb_poll || !t) { - rcu_nocb_unlock_irqrestore(rdp, flags); - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, - TPS("WakeNotPoll")); - return; - } - // Need to actually to a wakeup. - len = rcu_segcblist_n_cbs(&rdp->cblist); - if (was_alldone) { - rdp->qlen_last_fqs_check = len; - if (!irqs_disabled_flags(flags)) { - /* ... if queue was empty ... */ - rcu_nocb_unlock_irqrestore(rdp, flags); - wake_nocb_gp(rdp, false); - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, - TPS("WakeEmpty")); - } else { - rcu_nocb_unlock_irqrestore(rdp, flags); - wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE, - TPS("WakeEmptyIsDeferred")); - } - } else if (len > rdp->qlen_last_fqs_check + qhimark) { - /* ... or if many callbacks queued. */ - rdp->qlen_last_fqs_check = len; - j = jiffies; - if (j != rdp->nocb_gp_adv_time && - rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) && - rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) { - rcu_advance_cbs_nowake(rdp->mynode, rdp); - rdp->nocb_gp_adv_time = j; - } - smp_mb(); /* Enqueue before timer_pending(). 
*/ - if ((rdp->nocb_cb_sleep || - !rcu_segcblist_ready_cbs(&rdp->cblist)) && - !timer_pending(&rdp->nocb_timer)) { - rcu_nocb_unlock_irqrestore(rdp, flags); - wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE, - TPS("WakeOvfIsDeferred")); - } else { - rcu_nocb_unlock_irqrestore(rdp, flags); - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot")); - } - } else { - rcu_nocb_unlock_irqrestore(rdp, flags); - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot")); - } - return; -} - -/* - * Check if we ignore this rdp. - * - * We check that without holding the nocb lock but - * we make sure not to miss a freshly offloaded rdp - * with the current ordering: - * - * rdp_offload_toggle() nocb_gp_enabled_cb() - * ------------------------- ---------------------------- - * WRITE flags LOCK nocb_gp_lock - * LOCK nocb_gp_lock READ/WRITE nocb_gp_sleep - * READ/WRITE nocb_gp_sleep UNLOCK nocb_gp_lock - * UNLOCK nocb_gp_lock READ flags - */ -static inline bool nocb_gp_enabled_cb(struct rcu_data *rdp) -{ - u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_GP; - - return rcu_segcblist_test_flags(&rdp->cblist, flags); -} - -static inline bool nocb_gp_update_state_deoffloading(struct rcu_data *rdp, - bool *needwake_state) -{ - struct rcu_segcblist *cblist = &rdp->cblist; - - if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) { - if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) { - rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP); - if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) - *needwake_state = true; - } - return false; - } - - /* - * De-offloading. Clear our flag and notify the de-offload worker. - * We will ignore this rdp until it ever gets re-offloaded. 
- */ - WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)); - rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP); - if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) - *needwake_state = true; - return true; -} - - -/* - * No-CBs GP kthreads come here to wait for additional callbacks to show up - * or for grace periods to end. - */ -static void nocb_gp_wait(struct rcu_data *my_rdp) -{ - bool bypass = false; - long bypass_ncbs; - int __maybe_unused cpu = my_rdp->cpu; - unsigned long cur_gp_seq; - unsigned long flags; - bool gotcbs = false; - unsigned long j = jiffies; - bool needwait_gp = false; // This prevents actual uninitialized use. - bool needwake; - bool needwake_gp; - struct rcu_data *rdp; - struct rcu_node *rnp; - unsigned long wait_gp_seq = 0; // Suppress "use uninitialized" warning. - bool wasempty = false; - - /* - * Each pass through the following loop checks for CBs and for the - * nearest grace period (if any) to wait for next. The CB kthreads - * and the global grace-period kthread are awakened if needed. - */ - WARN_ON_ONCE(my_rdp->nocb_gp_rdp != my_rdp); - for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_cb_rdp) { - bool needwake_state = false; - - if (!nocb_gp_enabled_cb(rdp)) - continue; - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check")); - rcu_nocb_lock_irqsave(rdp, flags); - if (nocb_gp_update_state_deoffloading(rdp, &needwake_state)) { - rcu_nocb_unlock_irqrestore(rdp, flags); - if (needwake_state) - swake_up_one(&rdp->nocb_state_wq); - continue; - } - bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); - if (bypass_ncbs && - (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) || - bypass_ncbs > 2 * qhimark)) { - // Bypass full or old, so flush it. 
- (void)rcu_nocb_try_flush_bypass(rdp, j); - bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); - } else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) { - rcu_nocb_unlock_irqrestore(rdp, flags); - if (needwake_state) - swake_up_one(&rdp->nocb_state_wq); - continue; /* No callbacks here, try next. */ - } - if (bypass_ncbs) { - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, - TPS("Bypass")); - bypass = true; - } - rnp = rdp->mynode; - - // Advance callbacks if helpful and low contention. - needwake_gp = false; - if (!rcu_segcblist_restempty(&rdp->cblist, - RCU_NEXT_READY_TAIL) || - (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) && - rcu_seq_done(&rnp->gp_seq, cur_gp_seq))) { - raw_spin_lock_rcu_node(rnp); /* irqs disabled. */ - needwake_gp = rcu_advance_cbs(rnp, rdp); - wasempty = rcu_segcblist_restempty(&rdp->cblist, - RCU_NEXT_READY_TAIL); - raw_spin_unlock_rcu_node(rnp); /* irqs disabled. */ - } - // Need to wait on some grace period? - WARN_ON_ONCE(wasempty && - !rcu_segcblist_restempty(&rdp->cblist, - RCU_NEXT_READY_TAIL)); - if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq)) { - if (!needwait_gp || - ULONG_CMP_LT(cur_gp_seq, wait_gp_seq)) - wait_gp_seq = cur_gp_seq; - needwait_gp = true; - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, - TPS("NeedWaitGP")); - } - if (rcu_segcblist_ready_cbs(&rdp->cblist)) { - needwake = rdp->nocb_cb_sleep; - WRITE_ONCE(rdp->nocb_cb_sleep, false); - smp_mb(); /* CB invocation -after- GP end. */ - } else { - needwake = false; - } - rcu_nocb_unlock_irqrestore(rdp, flags); - if (needwake) { - swake_up_one(&rdp->nocb_cb_wq); - gotcbs = true; - } - if (needwake_gp) - rcu_gp_kthread_wake(); - if (needwake_state) - swake_up_one(&rdp->nocb_state_wq); - } - - my_rdp->nocb_gp_bypass = bypass; - my_rdp->nocb_gp_gp = needwait_gp; - my_rdp->nocb_gp_seq = needwait_gp ? 
wait_gp_seq : 0; - - if (bypass && !rcu_nocb_poll) { - // At least one child with non-empty ->nocb_bypass, so set - // timer in order to avoid stranding its callbacks. - wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_BYPASS, - TPS("WakeBypassIsDeferred")); - } - if (rcu_nocb_poll) { - /* Polling, so trace if first poll in the series. */ - if (gotcbs) - trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Poll")); - schedule_timeout_idle(1); - } else if (!needwait_gp) { - /* Wait for callbacks to appear. */ - trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep")); - swait_event_interruptible_exclusive(my_rdp->nocb_gp_wq, - !READ_ONCE(my_rdp->nocb_gp_sleep)); - trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("EndSleep")); - } else { - rnp = my_rdp->mynode; - trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("StartWait")); - swait_event_interruptible_exclusive( - rnp->nocb_gp_wq[rcu_seq_ctr(wait_gp_seq) & 0x1], - rcu_seq_done(&rnp->gp_seq, wait_gp_seq) || - !READ_ONCE(my_rdp->nocb_gp_sleep)); - trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("EndWait")); - } - if (!rcu_nocb_poll) { - raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags); - if (my_rdp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) { - WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); - del_timer(&my_rdp->nocb_timer); - } - WRITE_ONCE(my_rdp->nocb_gp_sleep, true); - raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags); - } - my_rdp->nocb_gp_seq = -1; - WARN_ON(signal_pending(current)); -} - -/* - * No-CBs grace-period-wait kthread. There is one of these per group - * of CPUs, but only once at least one CPU in that group has come online - * at least once since boot. This kthread checks for newly posted - * callbacks from any of the CPUs it is responsible for, waits for a - * grace period, then awakens all of the rcu_nocb_cb_kthread() instances - * that then have callback-invocation work to do. 
- */ -static int rcu_nocb_gp_kthread(void *arg) -{ - struct rcu_data *rdp = arg; - - for (;;) { - WRITE_ONCE(rdp->nocb_gp_loops, rdp->nocb_gp_loops + 1); - nocb_gp_wait(rdp); - cond_resched_tasks_rcu_qs(); - } - return 0; -} - -static inline bool nocb_cb_can_run(struct rcu_data *rdp) -{ - u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_CB; - return rcu_segcblist_test_flags(&rdp->cblist, flags); -} - -static inline bool nocb_cb_wait_cond(struct rcu_data *rdp) -{ - return nocb_cb_can_run(rdp) && !READ_ONCE(rdp->nocb_cb_sleep); -} - -/* - * Invoke any ready callbacks from the corresponding no-CBs CPU, - * then, if there are no more, wait for more to appear. - */ -static void nocb_cb_wait(struct rcu_data *rdp) -{ - struct rcu_segcblist *cblist = &rdp->cblist; - unsigned long cur_gp_seq; - unsigned long flags; - bool needwake_state = false; - bool needwake_gp = false; - bool can_sleep = true; - struct rcu_node *rnp = rdp->mynode; - - local_irq_save(flags); - rcu_momentary_dyntick_idle(); - local_irq_restore(flags); - /* - * Disable BH to provide the expected environment. Also, when - * transitioning to/from NOCB mode, a self-requeuing callback might - * be invoked from softirq. A short grace period could cause both - * instances of this callback would execute concurrently. - */ - local_bh_disable(); - rcu_do_batch(rdp); - local_bh_enable(); - lockdep_assert_irqs_enabled(); - rcu_nocb_lock_irqsave(rdp, flags); - if (rcu_segcblist_nextgp(cblist, &cur_gp_seq) && - rcu_seq_done(&rnp->gp_seq, cur_gp_seq) && - raw_spin_trylock_rcu_node(rnp)) { /* irqs already disabled. */ - needwake_gp = rcu_advance_cbs(rdp->mynode, rdp); - raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. 
*/ - } - - if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) { - if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) { - rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_CB); - if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) - needwake_state = true; - } - if (rcu_segcblist_ready_cbs(cblist)) - can_sleep = false; - } else { - /* - * De-offloading. Clear our flag and notify the de-offload worker. - * We won't touch the callbacks and keep sleeping until we ever - * get re-offloaded. - */ - WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)); - rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_CB); - if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) - needwake_state = true; - } - - WRITE_ONCE(rdp->nocb_cb_sleep, can_sleep); - - if (rdp->nocb_cb_sleep) - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep")); - - rcu_nocb_unlock_irqrestore(rdp, flags); - if (needwake_gp) - rcu_gp_kthread_wake(); - - if (needwake_state) - swake_up_one(&rdp->nocb_state_wq); - - do { - swait_event_interruptible_exclusive(rdp->nocb_cb_wq, - nocb_cb_wait_cond(rdp)); - - // VVV Ensure CB invocation follows _sleep test. - if (smp_load_acquire(&rdp->nocb_cb_sleep)) { // ^^^ - WARN_ON(signal_pending(current)); - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty")); - } - } while (!nocb_cb_can_run(rdp)); -} - -/* - * Per-rcu_data kthread, but only for no-CBs CPUs. Repeatedly invoke - * nocb_cb_wait() to do the dirty work. - */ -static int rcu_nocb_cb_kthread(void *arg) -{ - struct rcu_data *rdp = arg; - - // Each pass through this loop does one callback batch, and, - // if there are no more ready callbacks, waits for them. - for (;;) { - nocb_cb_wait(rdp); - cond_resched_tasks_rcu_qs(); - } - return 0; -} - -/* Is a deferred wakeup of rcu_nocb_kthread() required? 
*/ -static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level) -{ - return READ_ONCE(rdp->nocb_defer_wakeup) >= level; -} - -/* Do a deferred wakeup of rcu_nocb_kthread(). */ -static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp_gp, - struct rcu_data *rdp, int level, - unsigned long flags) - __releases(rdp_gp->nocb_gp_lock) -{ - int ndw; - int ret; - - if (!rcu_nocb_need_deferred_wakeup(rdp_gp, level)) { - raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); - return false; - } - - ndw = rdp_gp->nocb_defer_wakeup; - ret = __wake_nocb_gp(rdp_gp, rdp, ndw == RCU_NOCB_WAKE_FORCE, flags); - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake")); - - return ret; -} - -/* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */ -static void do_nocb_deferred_wakeup_timer(struct timer_list *t) -{ - unsigned long flags; - struct rcu_data *rdp = from_timer(rdp, t, nocb_timer); - - WARN_ON_ONCE(rdp->nocb_gp_rdp != rdp); - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Timer")); - - raw_spin_lock_irqsave(&rdp->nocb_gp_lock, flags); - smp_mb__after_spinlock(); /* Timer expire before wakeup. */ - do_nocb_deferred_wakeup_common(rdp, rdp, RCU_NOCB_WAKE_BYPASS, flags); -} - -/* - * Do a deferred wakeup of rcu_nocb_kthread() from fastpath. - * This means we do an inexact common-case check. Note that if - * we miss, ->nocb_timer will eventually clean things up. 
- */ -static bool do_nocb_deferred_wakeup(struct rcu_data *rdp) -{ - unsigned long flags; - struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; - - if (!rdp_gp || !rcu_nocb_need_deferred_wakeup(rdp_gp, RCU_NOCB_WAKE)) - return false; - - raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); - return do_nocb_deferred_wakeup_common(rdp_gp, rdp, RCU_NOCB_WAKE, flags); -} - -void rcu_nocb_flush_deferred_wakeup(void) -{ - do_nocb_deferred_wakeup(this_cpu_ptr(&rcu_data)); -} -EXPORT_SYMBOL_GPL(rcu_nocb_flush_deferred_wakeup); - -static int rdp_offload_toggle(struct rcu_data *rdp, - bool offload, unsigned long flags) - __releases(rdp->nocb_lock) -{ - struct rcu_segcblist *cblist = &rdp->cblist; - struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; - bool wake_gp = false; - - rcu_segcblist_offload(cblist, offload); - - if (rdp->nocb_cb_sleep) - rdp->nocb_cb_sleep = false; - rcu_nocb_unlock_irqrestore(rdp, flags); - - /* - * Ignore former value of nocb_cb_sleep and force wake up as it could - * have been spuriously set to false already. - */ - swake_up_one(&rdp->nocb_cb_wq); - - raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); - if (rdp_gp->nocb_gp_sleep) { - rdp_gp->nocb_gp_sleep = false; - wake_gp = true; - } - raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); - - if (wake_gp) - wake_up_process(rdp_gp->nocb_gp_kthread); - - return 0; -} - -static long rcu_nocb_rdp_deoffload(void *arg) -{ - struct rcu_data *rdp = arg; - struct rcu_segcblist *cblist = &rdp->cblist; - unsigned long flags; - int ret; - - WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id()); - - pr_info("De-offloading %d\n", rdp->cpu); - - rcu_nocb_lock_irqsave(rdp, flags); - /* - * Flush once and for all now. This suffices because we are - * running on the target CPU holding ->nocb_lock (thus having - * interrupts disabled), and because rdp_offload_toggle() - * invokes rcu_segcblist_offload(), which clears SEGCBLIST_OFFLOADED. 
- * Thus future calls to rcu_segcblist_completely_offloaded() will - * return false, which means that future calls to rcu_nocb_try_bypass() - * will refuse to put anything into the bypass. - */ - WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies)); - ret = rdp_offload_toggle(rdp, false, flags); - swait_event_exclusive(rdp->nocb_state_wq, - !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB | - SEGCBLIST_KTHREAD_GP)); - /* - * Lock one last time to acquire latest callback updates from kthreads - * so we can later handle callbacks locally without locking. - */ - rcu_nocb_lock_irqsave(rdp, flags); - /* - * Theoretically we could set SEGCBLIST_SOFTIRQ_ONLY after the nocb - * lock is released but how about being paranoid for once? - */ - rcu_segcblist_set_flags(cblist, SEGCBLIST_SOFTIRQ_ONLY); - /* - * With SEGCBLIST_SOFTIRQ_ONLY, we can't use - * rcu_nocb_unlock_irqrestore() anymore. - */ - raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); - - /* Sanity check */ - WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); - - - return ret; -} - -int rcu_nocb_cpu_deoffload(int cpu) -{ - struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); - int ret = 0; - - mutex_lock(&rcu_state.barrier_mutex); - cpus_read_lock(); - if (rcu_rdp_is_offloaded(rdp)) { - if (cpu_online(cpu)) { - ret = work_on_cpu(cpu, rcu_nocb_rdp_deoffload, rdp); - if (!ret) - cpumask_clear_cpu(cpu, rcu_nocb_mask); - } else { - pr_info("NOCB: Can't CB-deoffload an offline CPU\n"); - ret = -EINVAL; - } - } - cpus_read_unlock(); - mutex_unlock(&rcu_state.barrier_mutex); - - return ret; -} -EXPORT_SYMBOL_GPL(rcu_nocb_cpu_deoffload); - -static long rcu_nocb_rdp_offload(void *arg) -{ - struct rcu_data *rdp = arg; - struct rcu_segcblist *cblist = &rdp->cblist; - unsigned long flags; - int ret; - - WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id()); - /* - * For now we only support re-offload, ie: the rdp must have been - * offloaded on boot first. 
- */ - if (!rdp->nocb_gp_rdp) - return -EINVAL; - - pr_info("Offloading %d\n", rdp->cpu); - /* - * Can't use rcu_nocb_lock_irqsave() while we are in - * SEGCBLIST_SOFTIRQ_ONLY mode. - */ - raw_spin_lock_irqsave(&rdp->nocb_lock, flags); - - /* - * We didn't take the nocb lock while working on the - * rdp->cblist in SEGCBLIST_SOFTIRQ_ONLY mode. - * Every modifications that have been done previously on - * rdp->cblist must be visible remotely by the nocb kthreads - * upon wake up after reading the cblist flags. - * - * The layout against nocb_lock enforces that ordering: - * - * __rcu_nocb_rdp_offload() nocb_cb_wait()/nocb_gp_wait() - * ------------------------- ---------------------------- - * WRITE callbacks rcu_nocb_lock() - * rcu_nocb_lock() READ flags - * WRITE flags READ callbacks - * rcu_nocb_unlock() rcu_nocb_unlock() - */ - ret = rdp_offload_toggle(rdp, true, flags); - swait_event_exclusive(rdp->nocb_state_wq, - rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB) && - rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)); - - return ret; -} - -int rcu_nocb_cpu_offload(int cpu) -{ - struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); - int ret = 0; - - mutex_lock(&rcu_state.barrier_mutex); - cpus_read_lock(); - if (!rcu_rdp_is_offloaded(rdp)) { - if (cpu_online(cpu)) { - ret = work_on_cpu(cpu, rcu_nocb_rdp_offload, rdp); - if (!ret) - cpumask_set_cpu(cpu, rcu_nocb_mask); - } else { - pr_info("NOCB: Can't CB-offload an offline CPU\n"); - ret = -EINVAL; - } - } - cpus_read_unlock(); - mutex_unlock(&rcu_state.barrier_mutex); - - return ret; -} -EXPORT_SYMBOL_GPL(rcu_nocb_cpu_offload); - -void __init rcu_init_nohz(void) -{ - int cpu; - bool need_rcu_nocb_mask = false; - struct rcu_data *rdp; - -#if defined(CONFIG_NO_HZ_FULL) - if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask)) - need_rcu_nocb_mask = true; -#endif /* #if defined(CONFIG_NO_HZ_FULL) */ - - if (!cpumask_available(rcu_nocb_mask) && need_rcu_nocb_mask) { - if 
(!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) { - pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n"); - return; - } - } - if (!cpumask_available(rcu_nocb_mask)) - return; - -#if defined(CONFIG_NO_HZ_FULL) - if (tick_nohz_full_running) - cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask); -#endif /* #if defined(CONFIG_NO_HZ_FULL) */ - - if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) { - pr_info("\tNote: kernel parameter 'rcu_nocbs=', 'nohz_full', or 'isolcpus=' contains nonexistent CPUs.\n"); - cpumask_and(rcu_nocb_mask, cpu_possible_mask, - rcu_nocb_mask); - } - if (cpumask_empty(rcu_nocb_mask)) - pr_info("\tOffload RCU callbacks from CPUs: (none).\n"); - else - pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n", - cpumask_pr_args(rcu_nocb_mask)); - if (rcu_nocb_poll) - pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); - - for_each_cpu(cpu, rcu_nocb_mask) { - rdp = per_cpu_ptr(&rcu_data, cpu); - if (rcu_segcblist_empty(&rdp->cblist)) - rcu_segcblist_init(&rdp->cblist); - rcu_segcblist_offload(&rdp->cblist, true); - rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB); - rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP); - } - rcu_organize_nocb_kthreads(); -} - -/* Initialize per-rcu_data variables for no-CBs CPUs. */ -static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) -{ - init_swait_queue_head(&rdp->nocb_cb_wq); - init_swait_queue_head(&rdp->nocb_gp_wq); - init_swait_queue_head(&rdp->nocb_state_wq); - raw_spin_lock_init(&rdp->nocb_lock); - raw_spin_lock_init(&rdp->nocb_bypass_lock); - raw_spin_lock_init(&rdp->nocb_gp_lock); - timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0); - rcu_cblist_init(&rdp->nocb_bypass); -} - -/* - * If the specified CPU is a no-CBs CPU that does not already have its - * rcuo CB kthread, spawn it. Additionally, if the rcuo GP kthread - * for this CPU's group has not yet been created, spawn it as well. 
- */ -static void rcu_spawn_one_nocb_kthread(int cpu) -{ - struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); - struct rcu_data *rdp_gp; - struct task_struct *t; - - /* - * If this isn't a no-CBs CPU or if it already has an rcuo kthread, - * then nothing to do. - */ - if (!rcu_is_nocb_cpu(cpu) || rdp->nocb_cb_kthread) - return; - - /* If we didn't spawn the GP kthread first, reorganize! */ - rdp_gp = rdp->nocb_gp_rdp; - if (!rdp_gp->nocb_gp_kthread) { - t = kthread_run(rcu_nocb_gp_kthread, rdp_gp, - "rcuog/%d", rdp_gp->cpu); - if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__)) - return; - WRITE_ONCE(rdp_gp->nocb_gp_kthread, t); - } - - /* Spawn the kthread for this CPU. */ - t = kthread_run(rcu_nocb_cb_kthread, rdp, - "rcuo%c/%d", rcu_state.abbr, cpu); - if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__)) - return; - WRITE_ONCE(rdp->nocb_cb_kthread, t); - WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread); -} - -/* - * If the specified CPU is a no-CBs CPU that does not already have its - * rcuo kthread, spawn it. - */ -static void rcu_spawn_cpu_nocb_kthread(int cpu) -{ - if (rcu_scheduler_fully_active) - rcu_spawn_one_nocb_kthread(cpu); -} - -/* - * Once the scheduler is running, spawn rcuo kthreads for all online - * no-CBs CPUs. This assumes that the early_initcall()s happen before - * non-boot CPUs come online -- if this changes, we will need to add - * some mutual exclusion. - */ -static void __init rcu_spawn_nocb_kthreads(void) -{ - int cpu; - - for_each_online_cpu(cpu) - rcu_spawn_cpu_nocb_kthread(cpu); -} - -/* How many CB CPU IDs per GP kthread? Default of -1 for sqrt(nr_cpu_ids). */ -static int rcu_nocb_gp_stride = -1; -module_param(rcu_nocb_gp_stride, int, 0444); - -/* - * Initialize GP-CB relationships for all no-CBs CPU. 
- */ -static void __init rcu_organize_nocb_kthreads(void) -{ - int cpu; - bool firsttime = true; - bool gotnocbs = false; - bool gotnocbscbs = true; - int ls = rcu_nocb_gp_stride; - int nl = 0; /* Next GP kthread. */ - struct rcu_data *rdp; - struct rcu_data *rdp_gp = NULL; /* Suppress misguided gcc warn. */ - struct rcu_data *rdp_prev = NULL; - - if (!cpumask_available(rcu_nocb_mask)) - return; - if (ls == -1) { - ls = nr_cpu_ids / int_sqrt(nr_cpu_ids); - rcu_nocb_gp_stride = ls; - } - - /* - * Each pass through this loop sets up one rcu_data structure. - * Should the corresponding CPU come online in the future, then - * we will spawn the needed set of rcu_nocb_kthread() kthreads. - */ - for_each_cpu(cpu, rcu_nocb_mask) { - rdp = per_cpu_ptr(&rcu_data, cpu); - if (rdp->cpu >= nl) { - /* New GP kthread, set up for CBs & next GP. */ - gotnocbs = true; - nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls; - rdp->nocb_gp_rdp = rdp; - rdp_gp = rdp; - if (dump_tree) { - if (!firsttime) - pr_cont("%s\n", gotnocbscbs - ? "" : " (self only)"); - gotnocbscbs = false; - firsttime = false; - pr_alert("%s: No-CB GP kthread CPU %d:", - __func__, cpu); - } - } else { - /* Another CB kthread, link to previous GP kthread. */ - gotnocbscbs = true; - rdp->nocb_gp_rdp = rdp_gp; - rdp_prev->nocb_next_cb_rdp = rdp; - if (dump_tree) - pr_cont(" %d", cpu); - } - rdp_prev = rdp; - } - if (gotnocbs && dump_tree) - pr_cont("%s\n", gotnocbscbs ? "" : " (self only)"); -} - -/* - * Bind the current task to the offloaded CPUs. If there are no offloaded - * CPUs, leave the task unbound. Splat if the bind attempt fails. - */ -void rcu_bind_current_to_nocb(void) -{ - if (cpumask_available(rcu_nocb_mask) && cpumask_weight(rcu_nocb_mask)) - WARN_ON(sched_setaffinity(current->pid, rcu_nocb_mask)); -} -EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb); - -// The ->on_cpu field is available only in CONFIG_SMP=y, so... 
-#ifdef CONFIG_SMP -static char *show_rcu_should_be_on_cpu(struct task_struct *tsp) -{ - return tsp && task_is_running(tsp) && !tsp->on_cpu ? "!" : ""; -} -#else // #ifdef CONFIG_SMP -static char *show_rcu_should_be_on_cpu(struct task_struct *tsp) -{ - return ""; -} -#endif // #else #ifdef CONFIG_SMP - -/* - * Dump out nocb grace-period kthread state for the specified rcu_data - * structure. - */ -static void show_rcu_nocb_gp_state(struct rcu_data *rdp) -{ - struct rcu_node *rnp = rdp->mynode; - - pr_info("nocb GP %d %c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu %c CPU %d%s\n", - rdp->cpu, - "kK"[!!rdp->nocb_gp_kthread], - "lL"[raw_spin_is_locked(&rdp->nocb_gp_lock)], - "dD"[!!rdp->nocb_defer_wakeup], - "tT"[timer_pending(&rdp->nocb_timer)], - "sS"[!!rdp->nocb_gp_sleep], - ".W"[swait_active(&rdp->nocb_gp_wq)], - ".W"[swait_active(&rnp->nocb_gp_wq[0])], - ".W"[swait_active(&rnp->nocb_gp_wq[1])], - ".B"[!!rdp->nocb_gp_bypass], - ".G"[!!rdp->nocb_gp_gp], - (long)rdp->nocb_gp_seq, - rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops), - rdp->nocb_gp_kthread ? task_state_to_char(rdp->nocb_gp_kthread) : '.', - rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1, - show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread)); -} - -/* Dump out nocb kthread state for the specified rcu_data structure. */ -static void show_rcu_nocb_state(struct rcu_data *rdp) -{ - char bufw[20]; - char bufr[20]; - struct rcu_segcblist *rsclp = &rdp->cblist; - bool waslocked; - bool wassleep; - - if (rdp->nocb_gp_rdp == rdp) - show_rcu_nocb_gp_state(rdp); - - sprintf(bufw, "%ld", rsclp->gp_seq[RCU_WAIT_TAIL]); - sprintf(bufr, "%ld", rsclp->gp_seq[RCU_NEXT_READY_TAIL]); - pr_info(" CB %d^%d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n", - rdp->cpu, rdp->nocb_gp_rdp->cpu, - rdp->nocb_next_cb_rdp ? 
rdp->nocb_next_cb_rdp->cpu : -1, - "kK"[!!rdp->nocb_cb_kthread], - "bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)], - "cC"[!!atomic_read(&rdp->nocb_lock_contended)], - "lL"[raw_spin_is_locked(&rdp->nocb_lock)], - "sS"[!!rdp->nocb_cb_sleep], - ".W"[swait_active(&rdp->nocb_cb_wq)], - jiffies - rdp->nocb_bypass_first, - jiffies - rdp->nocb_nobypass_last, - rdp->nocb_nobypass_count, - ".D"[rcu_segcblist_ready_cbs(rsclp)], - ".W"[!rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL)], - rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL) ? "" : bufw, - ".R"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL)], - rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL) ? "" : bufr, - ".N"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_TAIL)], - ".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)], - rcu_segcblist_n_cbs(&rdp->cblist), - rdp->nocb_cb_kthread ? task_state_to_char(rdp->nocb_cb_kthread) : '.', - rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1, - show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread)); - - /* It is OK for GP kthreads to have GP state. */ - if (rdp->nocb_gp_rdp == rdp) - return; - - waslocked = raw_spin_is_locked(&rdp->nocb_gp_lock); - wassleep = swait_active(&rdp->nocb_gp_wq); - if (!rdp->nocb_gp_sleep && !waslocked && !wassleep) - return; /* Nothing untoward. */ - - pr_info(" nocb GP activity on CB-only CPU!!! %c%c%c %c\n", - "lL"[waslocked], - "dD"[!!rdp->nocb_defer_wakeup], - "sS"[!!rdp->nocb_gp_sleep], - ".W"[wassleep]); -} - -#else /* #ifdef CONFIG_RCU_NOCB_CPU */ - -/* No ->nocb_lock to acquire. */ -static void rcu_nocb_lock(struct rcu_data *rdp) -{ -} - -/* No ->nocb_lock to release. */ -static void rcu_nocb_unlock(struct rcu_data *rdp) -{ -} - -/* No ->nocb_lock to release. */ -static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp, - unsigned long flags) -{ - local_irq_restore(flags); -} - -/* Lockdep check that ->cblist may be safely accessed. 
*/ -static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp) -{ - lockdep_assert_irqs_disabled(); -} - -static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) -{ -} - -static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) -{ - return NULL; -} - -static void rcu_init_one_nocb(struct rcu_node *rnp) -{ -} - -static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, - unsigned long j) -{ - return true; -} - -static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, - bool *was_alldone, unsigned long flags) -{ - return false; -} - -static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty, - unsigned long flags) -{ - WARN_ON_ONCE(1); /* Should be dead code! */ -} - -static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) -{ -} - -static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level) -{ - return false; -} - -static bool do_nocb_deferred_wakeup(struct rcu_data *rdp) -{ - return false; -} - -static void rcu_spawn_cpu_nocb_kthread(int cpu) -{ -} - -static void __init rcu_spawn_nocb_kthreads(void) -{ -} - -static void show_rcu_nocb_state(struct rcu_data *rdp) -{ -} - -#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ - /* * Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the * grace-period kthread will do force_quiescent_state() processing? @@ -2982,17 +1498,17 @@ static void noinstr rcu_dynticks_task_exit(void) /* Turn on heavyweight RCU tasks trace readers on idle/user entry. */ static void rcu_dynticks_task_trace_enter(void) { -#ifdef CONFIG_TASKS_RCU_TRACE +#ifdef CONFIG_TASKS_TRACE_RCU if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) current->trc_reader_special.b.need_mb = true; -#endif /* #ifdef CONFIG_TASKS_RCU_TRACE */ +#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ } /* Turn off heavyweight RCU tasks trace readers on idle/user exit. 
*/ static void rcu_dynticks_task_trace_exit(void) { -#ifdef CONFIG_TASKS_RCU_TRACE +#ifdef CONFIG_TASKS_TRACE_RCU if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) current->trc_reader_special.b.need_mb = false; -#endif /* #ifdef CONFIG_TASKS_RCU_TRACE */ +#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ } diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 6c76988cc019..677ee3d8671b 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -7,6 +7,8 @@ * Author: Paul E. McKenney <paulmck@linux.ibm.com> */ +#include <linux/kvm_para.h> + ////////////////////////////////////////////////////////////////////////////// // // Controlling CPU stall warnings, including delay calculation. @@ -117,17 +119,14 @@ static void panic_on_rcu_stall(void) } /** - * rcu_cpu_stall_reset - prevent further stall warnings in current grace period - * - * Set the stall-warning timeout way off into the future, thus preventing - * any RCU CPU stall-warning messages from appearing in the current set of - * RCU grace periods. + * rcu_cpu_stall_reset - restart stall-warning timeout for current grace period * * The caller must disable hard irqs. 
*/ void rcu_cpu_stall_reset(void) { - WRITE_ONCE(rcu_state.jiffies_stall, jiffies + ULONG_MAX / 2); + WRITE_ONCE(rcu_state.jiffies_stall, + jiffies + rcu_jiffies_till_stall_check()); } ////////////////////////////////////////////////////////////////////////////// @@ -267,8 +266,10 @@ static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags) struct task_struct *ts[8]; lockdep_assert_irqs_disabled(); - if (!rcu_preempt_blocked_readers_cgp(rnp)) + if (!rcu_preempt_blocked_readers_cgp(rnp)) { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return 0; + } pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", rnp->level, rnp->grplo, rnp->grphi); t = list_entry(rnp->gp_tasks->prev, @@ -280,8 +281,8 @@ static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags) break; } raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - for (i--; i; i--) { - t = ts[i]; + while (i) { + t = ts[--i]; if (!try_invoke_on_locked_down_task(t, check_slow_task, &rscr)) pr_cont(" P%d", t->pid); else @@ -350,7 +351,7 @@ static void rcu_dump_cpu_stacks(void) static void print_cpu_stall_fast_no_hz(char *cp, int cpu) { - struct rcu_data *rdp = &per_cpu(rcu_data, cpu); + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); sprintf(cp, "last_accelerate: %04lx/%04lx dyntick_enabled: %d", rdp->last_accelerate & 0xffff, jiffies & 0xffff, @@ -464,9 +465,10 @@ static void rcu_check_gp_kthread_starvation(void) pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#x ->cpu=%d\n", rcu_state.name, j, (long)rcu_seq_current(&rcu_state.gp_seq), - data_race(rcu_state.gp_flags), - gp_state_getname(rcu_state.gp_state), rcu_state.gp_state, - gpk ? gpk->__state : ~0, cpu); + data_race(READ_ONCE(rcu_state.gp_flags)), + gp_state_getname(rcu_state.gp_state), + data_race(READ_ONCE(rcu_state.gp_state)), + gpk ? 
data_race(READ_ONCE(gpk->__state)) : ~0, cpu); if (gpk) { pr_err("\tUnless %s kthread gets sufficient CPU time, OOM is now expected behavior.\n", rcu_state.name); pr_err("RCU grace-period kthread stack dump:\n"); @@ -509,7 +511,7 @@ static void rcu_check_gp_kthread_expired_fqs_timer(void) (long)rcu_seq_current(&rcu_state.gp_seq), data_race(rcu_state.gp_flags), gp_state_getname(RCU_GP_WAIT_FQS), RCU_GP_WAIT_FQS, - gpk->__state); + data_race(READ_ONCE(gpk->__state))); pr_err("\tPossible timer handling issue on cpu=%d timer-softirq=%u\n", cpu, kstat_softirqs_cpu(TIMER_SOFTIRQ, cpu)); } @@ -568,11 +570,11 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) pr_err("INFO: Stall ended before state dump start\n"); } else { j = jiffies; - gpa = data_race(rcu_state.gp_activity); + gpa = data_race(READ_ONCE(rcu_state.gp_activity)); pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n", rcu_state.name, j - gpa, j, gpa, - data_race(jiffies_till_next_fqs), - rcu_get_root()->qsmask); + data_race(READ_ONCE(jiffies_till_next_fqs)), + data_race(READ_ONCE(rcu_get_root()->qsmask))); } } /* Rewrite if needed in case of slow consoles. */ @@ -646,6 +648,7 @@ static void print_cpu_stall(unsigned long gps) static void check_cpu_stall(struct rcu_data *rdp) { + bool didstall = false; unsigned long gs1; unsigned long gs2; unsigned long gps; @@ -691,24 +694,46 @@ static void check_cpu_stall(struct rcu_data *rdp) ULONG_CMP_GE(gps, js)) return; /* No stall or GP completed since entering function. */ rnp = rdp->mynode; - jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; + jn = jiffies + ULONG_MAX / 2; if (rcu_gp_in_progress() && (READ_ONCE(rnp->qsmask) & rdp->grpmask) && cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { + /* + * If a virtual machine is stopped by the host it can look to + * the watchdog like an RCU stall. Check to see if the host + * stopped the vm. 
+ */ + if (kvm_check_and_clear_guest_paused()) + return; + /* We haven't checked in, so go dump stack. */ print_cpu_stall(gps); if (READ_ONCE(rcu_cpu_stall_ftrace_dump)) rcu_ftrace_dump(DUMP_ALL); + didstall = true; } else if (rcu_gp_in_progress() && ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) && cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { + /* + * If a virtual machine is stopped by the host it can look to + * the watchdog like an RCU stall. Check to see if the host + * stopped the vm. + */ + if (kvm_check_and_clear_guest_paused()) + return; + /* They had a few time units to dump stack, so complain. */ print_other_cpu_stall(gs2, gps); if (READ_ONCE(rcu_cpu_stall_ftrace_dump)) rcu_ftrace_dump(DUMP_ALL); + didstall = true; + } + if (didstall && READ_ONCE(rcu_state.jiffies_stall) == jn) { + jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; + WRITE_ONCE(rcu_state.jiffies_stall, jn); } } @@ -742,7 +767,7 @@ bool rcu_check_boost_fail(unsigned long gp_state, int *cpup) rcu_for_each_leaf_node(rnp) { if (!cpup) { - if (READ_ONCE(rnp->qsmask)) { + if (data_race(READ_ONCE(rnp->qsmask))) { return false; } else { if (READ_ONCE(rnp->gp_tasks)) @@ -791,32 +816,34 @@ void show_rcu_gp_kthreads(void) struct task_struct *t = READ_ONCE(rcu_state.gp_kthread); j = jiffies; - ja = j - data_race(rcu_state.gp_activity); - jr = j - data_race(rcu_state.gp_req_activity); - js = j - data_race(rcu_state.gp_start); - jw = j - data_race(rcu_state.gp_wake_time); + ja = j - data_race(READ_ONCE(rcu_state.gp_activity)); + jr = j - data_race(READ_ONCE(rcu_state.gp_req_activity)); + js = j - data_race(READ_ONCE(rcu_state.gp_start)); + jw = j - data_race(READ_ONCE(rcu_state.gp_wake_time)); pr_info("%s: wait state: %s(%d) ->state: %#x ->rt_priority %u delta ->gp_start %lu ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_max %lu ->gp_flags %#x\n", rcu_state.name, gp_state_getname(rcu_state.gp_state), - rcu_state.gp_state, t ? 
t->__state : 0x1ffff, t ? t->rt_priority : 0xffU, - js, ja, jr, jw, (long)data_race(rcu_state.gp_wake_seq), - (long)data_race(rcu_state.gp_seq), - (long)data_race(rcu_get_root()->gp_seq_needed), - data_race(rcu_state.gp_max), - data_race(rcu_state.gp_flags)); + data_race(READ_ONCE(rcu_state.gp_state)), + t ? data_race(READ_ONCE(t->__state)) : 0x1ffff, t ? t->rt_priority : 0xffU, + js, ja, jr, jw, (long)data_race(READ_ONCE(rcu_state.gp_wake_seq)), + (long)data_race(READ_ONCE(rcu_state.gp_seq)), + (long)data_race(READ_ONCE(rcu_get_root()->gp_seq_needed)), + data_race(READ_ONCE(rcu_state.gp_max)), + data_race(READ_ONCE(rcu_state.gp_flags))); rcu_for_each_node_breadth_first(rnp) { if (ULONG_CMP_GE(READ_ONCE(rcu_state.gp_seq), READ_ONCE(rnp->gp_seq_needed)) && - !data_race(rnp->qsmask) && !data_race(rnp->boost_tasks) && - !data_race(rnp->exp_tasks) && !data_race(rnp->gp_tasks)) + !data_race(READ_ONCE(rnp->qsmask)) && !data_race(READ_ONCE(rnp->boost_tasks)) && + !data_race(READ_ONCE(rnp->exp_tasks)) && !data_race(READ_ONCE(rnp->gp_tasks))) continue; pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld ->qsmask %#lx %c%c%c%c ->n_boosts %ld\n", rnp->grplo, rnp->grphi, - (long)data_race(rnp->gp_seq), (long)data_race(rnp->gp_seq_needed), - data_race(rnp->qsmask), - ".b"[!!data_race(rnp->boost_kthread_task)], - ".B"[!!data_race(rnp->boost_tasks)], - ".E"[!!data_race(rnp->exp_tasks)], - ".G"[!!data_race(rnp->gp_tasks)], - data_race(rnp->n_boosts)); + (long)data_race(READ_ONCE(rnp->gp_seq)), + (long)data_race(READ_ONCE(rnp->gp_seq_needed)), + data_race(READ_ONCE(rnp->qsmask)), + ".b"[!!data_race(READ_ONCE(rnp->boost_kthread_task))], + ".B"[!!data_race(READ_ONCE(rnp->boost_tasks))], + ".E"[!!data_race(READ_ONCE(rnp->exp_tasks))], + ".G"[!!data_race(READ_ONCE(rnp->gp_tasks))], + data_race(READ_ONCE(rnp->n_boosts))); if (!rcu_is_leaf_node(rnp)) continue; for_each_leaf_node_possible_cpu(rnp, cpu) { @@ -826,12 +853,12 @@ void show_rcu_gp_kthreads(void) 
READ_ONCE(rdp->gp_seq_needed))) continue; pr_info("\tcpu %d ->gp_seq_needed %ld\n", - cpu, (long)data_race(rdp->gp_seq_needed)); + cpu, (long)data_race(READ_ONCE(rdp->gp_seq_needed))); } } for_each_possible_cpu(cpu) { rdp = per_cpu_ptr(&rcu_data, cpu); - cbs += data_race(rdp->n_cbs_invoked); + cbs += data_race(READ_ONCE(rdp->n_cbs_invoked)); if (rcu_segcblist_is_offloaded(&rdp->cblist)) show_rcu_nocb_state(rdp); } @@ -913,11 +940,11 @@ void rcu_fwd_progress_check(unsigned long j) if (rcu_gp_in_progress()) { pr_info("%s: GP age %lu jiffies\n", - __func__, jiffies - rcu_state.gp_start); + __func__, jiffies - data_race(READ_ONCE(rcu_state.gp_start))); show_rcu_gp_kthreads(); } else { pr_info("%s: Last GP end %lu jiffies ago\n", - __func__, jiffies - rcu_state.gp_end); + __func__, jiffies - data_race(READ_ONCE(rcu_state.gp_end))); preempt_disable(); rdp = this_cpu_ptr(&rcu_data); rcu_check_gp_start_stall(rdp->mynode, rdp, j); diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 29e8fc5d91a7..64a08288b1a6 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -64,6 +64,7 @@ torture_param(bool, use_cpus_read_lock, 0, "Use cpus_read_lock() to exclude CPU torture_param(int, verbose, 0, "Enable verbose debugging printk()s"); torture_param(int, weight_resched, -1, "Testing weight for resched_cpu() operations."); torture_param(int, weight_single, -1, "Testing weight for single-CPU no-wait operations."); +torture_param(int, weight_single_rpc, -1, "Testing weight for single-CPU RPC operations."); torture_param(int, weight_single_wait, -1, "Testing weight for single-CPU operations."); torture_param(int, weight_many, -1, "Testing weight for multi-CPU no-wait operations."); torture_param(int, weight_many_wait, -1, "Testing weight for multi-CPU operations."); @@ -86,6 +87,8 @@ struct scf_statistics { long long n_resched; long long n_single; long long n_single_ofl; + long long n_single_rpc; + long long n_single_rpc_ofl; long long n_single_wait; long long 
n_single_wait_ofl; long long n_many; @@ -101,14 +104,17 @@ static DEFINE_PER_CPU(long long, scf_invoked_count); // Data for random primitive selection #define SCF_PRIM_RESCHED 0 #define SCF_PRIM_SINGLE 1 -#define SCF_PRIM_MANY 2 -#define SCF_PRIM_ALL 3 -#define SCF_NPRIMS 7 // Need wait and no-wait versions of each, - // except for SCF_PRIM_RESCHED. +#define SCF_PRIM_SINGLE_RPC 2 +#define SCF_PRIM_MANY 3 +#define SCF_PRIM_ALL 4 +#define SCF_NPRIMS 8 // Need wait and no-wait versions of each, + // except for SCF_PRIM_RESCHED and + // SCF_PRIM_SINGLE_RPC. static char *scf_prim_name[] = { "resched_cpu", "smp_call_function_single", + "smp_call_function_single_rpc", "smp_call_function_many", "smp_call_function", }; @@ -128,6 +134,8 @@ struct scf_check { bool scfc_out; int scfc_cpu; // -1 for not _single(). bool scfc_wait; + bool scfc_rpc; + struct completion scfc_completion; }; // Use to wait for all threads to start. @@ -158,6 +166,7 @@ static void scf_torture_stats_print(void) scfs.n_resched += scf_stats_p[i].n_resched; scfs.n_single += scf_stats_p[i].n_single; scfs.n_single_ofl += scf_stats_p[i].n_single_ofl; + scfs.n_single_rpc += scf_stats_p[i].n_single_rpc; scfs.n_single_wait += scf_stats_p[i].n_single_wait; scfs.n_single_wait_ofl += scf_stats_p[i].n_single_wait_ofl; scfs.n_many += scf_stats_p[i].n_many; @@ -168,9 +177,10 @@ static void scf_torture_stats_print(void) if (atomic_read(&n_errs) || atomic_read(&n_mb_in_errs) || atomic_read(&n_mb_out_errs) || atomic_read(&n_alloc_errs)) bangstr = "!!! "; - pr_alert("%s %sscf_invoked_count %s: %lld resched: %lld single: %lld/%lld single_ofl: %lld/%lld many: %lld/%lld all: %lld/%lld ", + pr_alert("%s %sscf_invoked_count %s: %lld resched: %lld single: %lld/%lld single_ofl: %lld/%lld single_rpc: %lld single_rpc_ofl: %lld many: %lld/%lld all: %lld/%lld ", SCFTORT_FLAG, bangstr, isdone ? 
"VER" : "ver", invoked_count, scfs.n_resched, scfs.n_single, scfs.n_single_wait, scfs.n_single_ofl, scfs.n_single_wait_ofl, + scfs.n_single_rpc, scfs.n_single_rpc_ofl, scfs.n_many, scfs.n_many_wait, scfs.n_all, scfs.n_all_wait); torture_onoff_stats(); pr_cont("ste: %d stnmie: %d stnmoe: %d staf: %d\n", atomic_read(&n_errs), @@ -282,10 +292,13 @@ static void scf_handler(void *scfc_in) out: if (unlikely(!scfcp)) return; - if (scfcp->scfc_wait) + if (scfcp->scfc_wait) { WRITE_ONCE(scfcp->scfc_out, true); - else + if (scfcp->scfc_rpc) + complete(&scfcp->scfc_completion); + } else { kfree(scfcp); + } } // As above, but check for correct CPU. @@ -319,6 +332,7 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra scfcp->scfc_cpu = -1; scfcp->scfc_wait = scfsp->scfs_wait; scfcp->scfc_out = false; + scfcp->scfc_rpc = false; } } switch (scfsp->scfs_prim) { @@ -350,6 +364,34 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra scfcp = NULL; } break; + case SCF_PRIM_SINGLE_RPC: + if (!scfcp) + break; + cpu = torture_random(trsp) % nr_cpu_ids; + scfp->n_single_rpc++; + scfcp->scfc_cpu = cpu; + scfcp->scfc_wait = true; + init_completion(&scfcp->scfc_completion); + scfcp->scfc_rpc = true; + barrier(); // Prevent race-reduction compiler optimizations. 
+ scfcp->scfc_in = true; + ret = smp_call_function_single(cpu, scf_handler_1, (void *)scfcp, 0); + if (!ret) { + if (use_cpus_read_lock) + cpus_read_unlock(); + else + preempt_enable(); + wait_for_completion(&scfcp->scfc_completion); + if (use_cpus_read_lock) + cpus_read_lock(); + else + preempt_disable(); + } else { + scfp->n_single_rpc_ofl++; + kfree(scfcp); + scfcp = NULL; + } + break; case SCF_PRIM_MANY: if (scfsp->scfs_wait) scfp->n_many_wait++; @@ -379,10 +421,12 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra } if (scfcp && scfsp->scfs_wait) { if (WARN_ON_ONCE((num_online_cpus() > 1 || scfsp->scfs_prim == SCF_PRIM_SINGLE) && - !scfcp->scfc_out)) + !scfcp->scfc_out)) { + pr_warn("%s: Memory-ordering failure, scfs_prim: %d.\n", __func__, scfsp->scfs_prim); atomic_inc(&n_mb_out_errs); // Leak rather than trash! - else + } else { kfree(scfcp); + } barrier(); // Prevent race-reduction compiler optimizations. } if (use_cpus_read_lock) @@ -453,8 +497,8 @@ static void scftorture_print_module_parms(const char *tag) { pr_alert(SCFTORT_FLAG - "--- %s: verbose=%d holdoff=%d longwait=%d nthreads=%d onoff_holdoff=%d onoff_interval=%d shutdown_secs=%d stat_interval=%d stutter=%d use_cpus_read_lock=%d, weight_resched=%d, weight_single=%d, weight_single_wait=%d, weight_many=%d, weight_many_wait=%d, weight_all=%d, weight_all_wait=%d\n", tag, - verbose, holdoff, longwait, nthreads, onoff_holdoff, onoff_interval, shutdown, stat_interval, stutter, use_cpus_read_lock, weight_resched, weight_single, weight_single_wait, weight_many, weight_many_wait, weight_all, weight_all_wait); + "--- %s: verbose=%d holdoff=%d longwait=%d nthreads=%d onoff_holdoff=%d onoff_interval=%d shutdown_secs=%d stat_interval=%d stutter=%d use_cpus_read_lock=%d, weight_resched=%d, weight_single=%d, weight_single_rpc=%d, weight_single_wait=%d, weight_many=%d, weight_many_wait=%d, weight_all=%d, weight_all_wait=%d\n", tag, + verbose, holdoff, longwait, nthreads, 
onoff_holdoff, onoff_interval, shutdown, stat_interval, stutter, use_cpus_read_lock, weight_resched, weight_single, weight_single_rpc, weight_single_wait, weight_many, weight_many_wait, weight_all, weight_all_wait); } static void scf_cleanup_handler(void *unused) @@ -469,7 +513,7 @@ static void scf_torture_cleanup(void) return; WRITE_ONCE(scfdone, true); - if (nthreads) + if (nthreads && scf_stats_p) for (i = 0; i < nthreads; i++) torture_stop_kthread("scftorture_invoker", scf_stats_p[i].task); else @@ -497,6 +541,7 @@ static int __init scf_torture_init(void) int firsterr = 0; unsigned long weight_resched1 = weight_resched; unsigned long weight_single1 = weight_single; + unsigned long weight_single_rpc1 = weight_single_rpc; unsigned long weight_single_wait1 = weight_single_wait; unsigned long weight_many1 = weight_many; unsigned long weight_many_wait1 = weight_many_wait; @@ -508,11 +553,13 @@ static int __init scf_torture_init(void) scftorture_print_module_parms("Start of test"); - if (weight_resched == -1 && weight_single == -1 && weight_single_wait == -1 && + if (weight_resched == -1 && + weight_single == -1 && weight_single_rpc == -1 && weight_single_wait == -1 && weight_many == -1 && weight_many_wait == -1 && weight_all == -1 && weight_all_wait == -1) { weight_resched1 = 2 * nr_cpu_ids; weight_single1 = 2 * nr_cpu_ids; + weight_single_rpc1 = 2 * nr_cpu_ids; weight_single_wait1 = 2 * nr_cpu_ids; weight_many1 = 2; weight_many_wait1 = 2; @@ -523,6 +570,8 @@ static int __init scf_torture_init(void) weight_resched1 = 0; if (weight_single == -1) weight_single1 = 0; + if (weight_single_rpc == -1) + weight_single_rpc1 = 0; if (weight_single_wait == -1) weight_single_wait1 = 0; if (weight_many == -1) @@ -534,7 +583,7 @@ static int __init scf_torture_init(void) if (weight_all_wait == -1) weight_all_wait1 = 0; } - if (weight_single1 == 0 && weight_single_wait1 == 0 && + if (weight_single1 == 0 && weight_single_rpc1 == 0 && weight_single_wait1 == 0 && weight_many1 == 0 && 
weight_many_wait1 == 0 && weight_all1 == 0 && weight_all_wait1 == 0) { VERBOSE_SCFTORTOUT_ERRSTRING("all zero weights makes no sense"); @@ -546,6 +595,7 @@ static int __init scf_torture_init(void) else if (weight_resched1) VERBOSE_SCFTORTOUT_ERRSTRING("built as module, weight_resched ignored"); scf_sel_add(weight_single1, SCF_PRIM_SINGLE, false); + scf_sel_add(weight_single_rpc1, SCF_PRIM_SINGLE_RPC, true); scf_sel_add(weight_single_wait1, SCF_PRIM_SINGLE, true); scf_sel_add(weight_many1, SCF_PRIM_MANY, false); scf_sel_add(weight_many_wait1, SCF_PRIM_MANY, true); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 20ffcc044134..c4462c454ab9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -237,9 +237,30 @@ static DEFINE_MUTEX(sched_core_mutex); static atomic_t sched_core_count; static struct cpumask sched_core_mask; +static void sched_core_lock(int cpu, unsigned long *flags) +{ + const struct cpumask *smt_mask = cpu_smt_mask(cpu); + int t, i = 0; + + local_irq_save(*flags); + for_each_cpu(t, smt_mask) + raw_spin_lock_nested(&cpu_rq(t)->__lock, i++); +} + +static void sched_core_unlock(int cpu, unsigned long *flags) +{ + const struct cpumask *smt_mask = cpu_smt_mask(cpu); + int t; + + for_each_cpu(t, smt_mask) + raw_spin_unlock(&cpu_rq(t)->__lock); + local_irq_restore(*flags); +} + static void __sched_core_flip(bool enabled) { - int cpu, t, i; + unsigned long flags; + int cpu, t; cpus_read_lock(); @@ -250,19 +271,12 @@ static void __sched_core_flip(bool enabled) for_each_cpu(cpu, &sched_core_mask) { const struct cpumask *smt_mask = cpu_smt_mask(cpu); - i = 0; - local_irq_disable(); - for_each_cpu(t, smt_mask) { - /* supports up to SMT8 */ - raw_spin_lock_nested(&cpu_rq(t)->__lock, i++); - } + sched_core_lock(cpu, &flags); for_each_cpu(t, smt_mask) cpu_rq(t)->core_enabled = enabled; - for_each_cpu(t, smt_mask) - raw_spin_unlock(&cpu_rq(t)->__lock); - local_irq_enable(); + sched_core_unlock(cpu, &flags); cpumask_andnot(&sched_core_mask, 
&sched_core_mask, smt_mask); } @@ -993,6 +1007,7 @@ int get_nohz_timer_target(void) { int i, cpu = smp_processor_id(), default_cpu = -1; struct sched_domain *sd; + const struct cpumask *hk_mask; if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) { if (!idle_cpu(cpu)) @@ -1000,10 +1015,11 @@ int get_nohz_timer_target(void) default_cpu = cpu; } + hk_mask = housekeeping_cpumask(HK_FLAG_TIMER); + rcu_read_lock(); for_each_domain(cpu, sd) { - for_each_cpu_and(i, sched_domain_span(sd), - housekeeping_cpumask(HK_FLAG_TIMER)) { + for_each_cpu_and(i, sched_domain_span(sd), hk_mask) { if (cpu == i) continue; @@ -1619,6 +1635,23 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) uclamp_rq_dec_id(rq, p, clamp_id); } +static inline void uclamp_rq_reinc_id(struct rq *rq, struct task_struct *p, + enum uclamp_id clamp_id) +{ + if (!p->uclamp[clamp_id].active) + return; + + uclamp_rq_dec_id(rq, p, clamp_id); + uclamp_rq_inc_id(rq, p, clamp_id); + + /* + * Make sure to clear the idle flag if we've transiently reached 0 + * active tasks on rq. + */ + if (clamp_id == UCLAMP_MAX && (rq->uclamp_flags & UCLAMP_FLAG_IDLE)) + rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE; +} + static inline void uclamp_update_active(struct task_struct *p) { @@ -1642,12 +1675,8 @@ uclamp_update_active(struct task_struct *p) * affecting a valid clamp bucket, the next time it's enqueued, * it will already see the updated clamp bucket value. */ - for_each_clamp_id(clamp_id) { - if (p->uclamp[clamp_id].active) { - uclamp_rq_dec_id(rq, p, clamp_id); - uclamp_rq_inc_id(rq, p, clamp_id); - } - } + for_each_clamp_id(clamp_id) + uclamp_rq_reinc_id(rq, p, clamp_id); task_rq_unlock(rq, p, &rf); } @@ -2161,7 +2190,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu) /* Non kernel threads are not allowed during either online or offline. */ if (!(p->flags & PF_KTHREAD)) - return cpu_active(cpu); + return cpu_active(cpu) && task_cpu_possible(cpu, p); /* KTHREAD_IS_PER_CPU is always allowed. 
*/ if (kthread_is_per_cpu(p)) @@ -2468,6 +2497,34 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) __do_set_cpus_allowed(p, new_mask, 0); } +int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, + int node) +{ + if (!src->user_cpus_ptr) + return 0; + + dst->user_cpus_ptr = kmalloc_node(cpumask_size(), GFP_KERNEL, node); + if (!dst->user_cpus_ptr) + return -ENOMEM; + + cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr); + return 0; +} + +static inline struct cpumask *clear_user_cpus_ptr(struct task_struct *p) +{ + struct cpumask *user_mask = NULL; + + swap(p->user_cpus_ptr, user_mask); + + return user_mask; +} + +void release_user_cpus_ptr(struct task_struct *p) +{ + kfree(clear_user_cpus_ptr(p)); +} + /* * This function is wildly self concurrent; here be dragons. * @@ -2685,28 +2742,26 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag } /* - * Change a given task's CPU affinity. Migrate the thread to a - * proper CPU and schedule it away if the CPU it's executing on - * is removed from the allowed bitmask. - * - * NOTE: the caller must have a valid reference to the task, the - * task must not exit() & deallocate itself prematurely. The - * call is not atomic; no spinlocks may be held. + * Called with both p->pi_lock and rq->lock held; drops both before returning. 
*/ -static int __set_cpus_allowed_ptr(struct task_struct *p, - const struct cpumask *new_mask, - u32 flags) +static int __set_cpus_allowed_ptr_locked(struct task_struct *p, + const struct cpumask *new_mask, + u32 flags, + struct rq *rq, + struct rq_flags *rf) + __releases(rq->lock) + __releases(p->pi_lock) { + const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p); const struct cpumask *cpu_valid_mask = cpu_active_mask; + bool kthread = p->flags & PF_KTHREAD; + struct cpumask *user_mask = NULL; unsigned int dest_cpu; - struct rq_flags rf; - struct rq *rq; int ret = 0; - rq = task_rq_lock(p, &rf); update_rq_clock(rq); - if (p->flags & PF_KTHREAD || is_migration_disabled(p)) { + if (kthread || is_migration_disabled(p)) { /* * Kernel threads are allowed on online && !active CPUs, * however, during cpu-hot-unplug, even these might get pushed @@ -2720,6 +2775,11 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, cpu_valid_mask = cpu_online_mask; } + if (!kthread && !cpumask_subset(new_mask, cpu_allowed_mask)) { + ret = -EINVAL; + goto out; + } + /* * Must re-check here, to close a race against __kthread_bind(), * sched_setaffinity() is not guaranteed to observe the flag. @@ -2754,20 +2814,178 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, __do_set_cpus_allowed(p, new_mask, flags); - return affine_move_task(rq, p, &rf, dest_cpu, flags); + if (flags & SCA_USER) + user_mask = clear_user_cpus_ptr(p); + + ret = affine_move_task(rq, p, rf, dest_cpu, flags); + + kfree(user_mask); + + return ret; out: - task_rq_unlock(rq, p, &rf); + task_rq_unlock(rq, p, rf); return ret; } +/* + * Change a given task's CPU affinity. Migrate the thread to a + * proper CPU and schedule it away if the CPU it's executing on + * is removed from the allowed bitmask. + * + * NOTE: the caller must have a valid reference to the task, the + * task must not exit() & deallocate itself prematurely. The + * call is not atomic; no spinlocks may be held. 
+ */ +static int __set_cpus_allowed_ptr(struct task_struct *p, + const struct cpumask *new_mask, u32 flags) +{ + struct rq_flags rf; + struct rq *rq; + + rq = task_rq_lock(p, &rf); + return __set_cpus_allowed_ptr_locked(p, new_mask, flags, rq, &rf); +} + int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) { return __set_cpus_allowed_ptr(p, new_mask, 0); } EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); +/* + * Change a given task's CPU affinity to the intersection of its current + * affinity mask and @subset_mask, writing the resulting mask to @new_mask + * and pointing @p->user_cpus_ptr to a copy of the old mask. + * If the resulting mask is empty, leave the affinity unchanged and return + * -EINVAL. + */ +static int restrict_cpus_allowed_ptr(struct task_struct *p, + struct cpumask *new_mask, + const struct cpumask *subset_mask) +{ + struct cpumask *user_mask = NULL; + struct rq_flags rf; + struct rq *rq; + int err; + + if (!p->user_cpus_ptr) { + user_mask = kmalloc(cpumask_size(), GFP_KERNEL); + if (!user_mask) + return -ENOMEM; + } + + rq = task_rq_lock(p, &rf); + + /* + * Forcefully restricting the affinity of a deadline task is + * likely to cause problems, so fail and noisily override the + * mask entirely. + */ + if (task_has_dl_policy(p) && dl_bandwidth_enabled()) { + err = -EPERM; + goto err_unlock; + } + + if (!cpumask_and(new_mask, &p->cpus_mask, subset_mask)) { + err = -EINVAL; + goto err_unlock; + } + + /* + * We're about to butcher the task affinity, so keep track of what + * the user asked for in case we're able to restore it later on. 
+ */ + if (user_mask) { + cpumask_copy(user_mask, p->cpus_ptr); + p->user_cpus_ptr = user_mask; + } + + return __set_cpus_allowed_ptr_locked(p, new_mask, 0, rq, &rf); + +err_unlock: + task_rq_unlock(rq, p, &rf); + kfree(user_mask); + return err; +} + +/* + * Restrict the CPU affinity of task @p so that it is a subset of + * task_cpu_possible_mask() and point @p->user_cpu_ptr to a copy of the + * old affinity mask. If the resulting mask is empty, we warn and walk + * up the cpuset hierarchy until we find a suitable mask. + */ +void force_compatible_cpus_allowed_ptr(struct task_struct *p) +{ + cpumask_var_t new_mask; + const struct cpumask *override_mask = task_cpu_possible_mask(p); + + alloc_cpumask_var(&new_mask, GFP_KERNEL); + + /* + * __migrate_task() can fail silently in the face of concurrent + * offlining of the chosen destination CPU, so take the hotplug + * lock to ensure that the migration succeeds. + */ + cpus_read_lock(); + if (!cpumask_available(new_mask)) + goto out_set_mask; + + if (!restrict_cpus_allowed_ptr(p, new_mask, override_mask)) + goto out_free_mask; + + /* + * We failed to find a valid subset of the affinity mask for the + * task, so override it based on its cpuset hierarchy. + */ + cpuset_cpus_allowed(p, new_mask); + override_mask = new_mask; + +out_set_mask: + if (printk_ratelimit()) { + printk_deferred("Overriding affinity for process %d (%s) to CPUs %*pbl\n", + task_pid_nr(p), p->comm, + cpumask_pr_args(override_mask)); + } + + WARN_ON(set_cpus_allowed_ptr(p, override_mask)); +out_free_mask: + cpus_read_unlock(); + free_cpumask_var(new_mask); +} + +static int +__sched_setaffinity(struct task_struct *p, const struct cpumask *mask); + +/* + * Restore the affinity of a task @p which was previously restricted by a + * call to force_compatible_cpus_allowed_ptr(). This will clear (and free) + * @p->user_cpus_ptr. + * + * It is the caller's responsibility to serialise this with any calls to + * force_compatible_cpus_allowed_ptr(@p). 
+ */ +void relax_compatible_cpus_allowed_ptr(struct task_struct *p) +{ + struct cpumask *user_mask = p->user_cpus_ptr; + unsigned long flags; + + /* + * Try to restore the old affinity mask. If this fails, then + * we free the mask explicitly to avoid it being inherited across + * a subsequent fork(). + */ + if (!user_mask || !__sched_setaffinity(p, user_mask)) + return; + + raw_spin_lock_irqsave(&p->pi_lock, flags); + user_mask = clear_user_cpus_ptr(p); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + + kfree(user_mask); +} + void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { #ifdef CONFIG_SCHED_DEBUG @@ -3112,9 +3330,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p) /* Look for allowed, online CPU in same node. */ for_each_cpu(dest_cpu, nodemask) { - if (!cpu_active(dest_cpu)) - continue; - if (cpumask_test_cpu(dest_cpu, p->cpus_ptr)) + if (is_cpu_allowed(p, dest_cpu)) return dest_cpu; } } @@ -3131,8 +3347,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p) /* No more Mr. Nice Guy. */ switch (state) { case cpuset: - if (IS_ENABLED(CONFIG_CPUSETS)) { - cpuset_cpus_allowed_fallback(p); + if (cpuset_cpus_allowed_fallback(p)) { state = possible; break; } @@ -3144,10 +3359,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p) * * More yuck to audit. */ - do_set_cpus_allowed(p, cpu_possible_mask); + do_set_cpus_allowed(p, task_cpu_possible_mask(p)); state = fail; break; - case fail: BUG(); break; @@ -3562,6 +3776,55 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) } /* + * Invoked from try_to_wake_up() to check whether the task can be woken up. + * + * The caller holds p::pi_lock if p != current or has preemption + * disabled when p == current. + * + * The rules of PREEMPT_RT saved_state: + * + * The related locking code always holds p::pi_lock when updating + * p::saved_state, which means the code is fully serialized in both cases. 
+ * + * The lock wait and lock wakeups happen via TASK_RTLOCK_WAIT. No other + * bits set. This allows to distinguish all wakeup scenarios. + */ +static __always_inline +bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success) +{ + if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) { + WARN_ON_ONCE((state & TASK_RTLOCK_WAIT) && + state != TASK_RTLOCK_WAIT); + } + + if (READ_ONCE(p->__state) & state) { + *success = 1; + return true; + } + +#ifdef CONFIG_PREEMPT_RT + /* + * Saved state preserves the task state across blocking on + * an RT lock. If the state matches, set p::saved_state to + * TASK_RUNNING, but do not wake the task because it waits + * for a lock wakeup. Also indicate success because from + * the regular waker's point of view this has succeeded. + * + * After acquiring the lock the task will restore p::__state + * from p::saved_state which ensures that the regular + * wakeup is not lost. The restore will also set + * p::saved_state to TASK_RUNNING so any further tests will + * not result in false positives vs. @success + */ + if (p->saved_state & state) { + p->saved_state = TASK_RUNNING; + *success = 1; + } +#endif + return false; +} + +/* * Notes on Program-Order guarantees on SMP systems. * * MIGRATION @@ -3700,10 +3963,9 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * - we're serialized against set_special_state() by virtue of * it disabling IRQs (this allows not taking ->pi_lock). 
*/ - if (!(READ_ONCE(p->__state) & state)) + if (!ttwu_state_match(p, state, &success)) goto out; - success = 1; trace_sched_waking(p); WRITE_ONCE(p->__state, TASK_RUNNING); trace_sched_wakeup(p); @@ -3718,14 +3980,11 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) */ raw_spin_lock_irqsave(&p->pi_lock, flags); smp_mb__after_spinlock(); - if (!(READ_ONCE(p->__state) & state)) + if (!ttwu_state_match(p, state, &success)) goto unlock; trace_sched_waking(p); - /* We're going to change ->state: */ - success = 1; - /* * Ensure we load p->on_rq _after_ p->state, otherwise it would * be possible to, falsely, observe p->on_rq == 0 and get stuck @@ -5660,11 +5919,9 @@ static bool try_steal_cookie(int this, int that) if (p->core_occupation > dst->idle->core_occupation) goto next; - p->on_rq = TASK_ON_RQ_MIGRATING; deactivate_task(src, p, 0); set_task_cpu(p, this); activate_task(dst, p, 0); - p->on_rq = TASK_ON_RQ_QUEUED; resched_curr(dst); @@ -5736,35 +5993,109 @@ void queue_core_balance(struct rq *rq) queue_balance_callback(rq, &per_cpu(core_balance_head, rq->cpu), sched_core_balance); } -static inline void sched_core_cpu_starting(unsigned int cpu) +static void sched_core_cpu_starting(unsigned int cpu) { const struct cpumask *smt_mask = cpu_smt_mask(cpu); - struct rq *rq, *core_rq = NULL; - int i; + struct rq *rq = cpu_rq(cpu), *core_rq = NULL; + unsigned long flags; + int t; - core_rq = cpu_rq(cpu)->core; + sched_core_lock(cpu, &flags); - if (!core_rq) { - for_each_cpu(i, smt_mask) { - rq = cpu_rq(i); - if (rq->core && rq->core == rq) - core_rq = rq; + WARN_ON_ONCE(rq->core != rq); + + /* if we're the first, we'll be our own leader */ + if (cpumask_weight(smt_mask) == 1) + goto unlock; + + /* find the leader */ + for_each_cpu(t, smt_mask) { + if (t == cpu) + continue; + rq = cpu_rq(t); + if (rq->core == rq) { + core_rq = rq; + break; } + } - if (!core_rq) - core_rq = cpu_rq(cpu); + if (WARN_ON_ONCE(!core_rq)) /* whoopsie */ + goto unlock; - 
for_each_cpu(i, smt_mask) { - rq = cpu_rq(i); + /* install and validate core_rq */ + for_each_cpu(t, smt_mask) { + rq = cpu_rq(t); - WARN_ON_ONCE(rq->core && rq->core != core_rq); + if (t == cpu) rq->core = core_rq; - } + + WARN_ON_ONCE(rq->core != core_rq); + } + +unlock: + sched_core_unlock(cpu, &flags); +} + +static void sched_core_cpu_deactivate(unsigned int cpu) +{ + const struct cpumask *smt_mask = cpu_smt_mask(cpu); + struct rq *rq = cpu_rq(cpu), *core_rq = NULL; + unsigned long flags; + int t; + + sched_core_lock(cpu, &flags); + + /* if we're the last man standing, nothing to do */ + if (cpumask_weight(smt_mask) == 1) { + WARN_ON_ONCE(rq->core != rq); + goto unlock; + } + + /* if we're not the leader, nothing to do */ + if (rq->core != rq) + goto unlock; + + /* find a new leader */ + for_each_cpu(t, smt_mask) { + if (t == cpu) + continue; + core_rq = cpu_rq(t); + break; } + + if (WARN_ON_ONCE(!core_rq)) /* impossible */ + goto unlock; + + /* copy the shared state to the new leader */ + core_rq->core_task_seq = rq->core_task_seq; + core_rq->core_pick_seq = rq->core_pick_seq; + core_rq->core_cookie = rq->core_cookie; + core_rq->core_forceidle = rq->core_forceidle; + core_rq->core_forceidle_seq = rq->core_forceidle_seq; + + /* install new leader */ + for_each_cpu(t, smt_mask) { + rq = cpu_rq(t); + rq->core = core_rq; + } + +unlock: + sched_core_unlock(cpu, &flags); } + +static inline void sched_core_cpu_dying(unsigned int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + if (rq->core != rq) + rq->core = rq; +} + #else /* !CONFIG_SCHED_CORE */ static inline void sched_core_cpu_starting(unsigned int cpu) {} +static inline void sched_core_cpu_deactivate(unsigned int cpu) {} +static inline void sched_core_cpu_dying(unsigned int cpu) {} static struct task_struct * pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) @@ -5775,6 +6106,24 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) #endif /* CONFIG_SCHED_CORE */ /* 
+ * Constants for the sched_mode argument of __schedule(). + * + * The mode argument allows RT enabled kernels to differentiate a + * preemption from blocking on an 'sleeping' spin/rwlock. Note that + * SM_MASK_PREEMPT for !RT has all bits set, which allows the compiler to + * optimize the AND operation out and just check for zero. + */ +#define SM_NONE 0x0 +#define SM_PREEMPT 0x1 +#define SM_RTLOCK_WAIT 0x2 + +#ifndef CONFIG_PREEMPT_RT +# define SM_MASK_PREEMPT (~0U) +#else +# define SM_MASK_PREEMPT SM_PREEMPT +#endif + +/* * __schedule() is the main scheduler function. * * The main means of driving the scheduler and thus entering this function are: @@ -5813,7 +6162,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * * WARNING: must be called with preemption disabled! */ -static void __sched notrace __schedule(bool preempt) +static void __sched notrace __schedule(unsigned int sched_mode) { struct task_struct *prev, *next; unsigned long *switch_count; @@ -5826,13 +6175,13 @@ static void __sched notrace __schedule(bool preempt) rq = cpu_rq(cpu); prev = rq->curr; - schedule_debug(prev, preempt); + schedule_debug(prev, !!sched_mode); if (sched_feat(HRTICK) || sched_feat(HRTICK_DL)) hrtick_clear(rq); local_irq_disable(); - rcu_note_context_switch(preempt); + rcu_note_context_switch(!!sched_mode); /* * Make sure that signal_pending_state()->signal_pending() below @@ -5866,7 +6215,7 @@ static void __sched notrace __schedule(bool preempt) * - ptrace_{,un}freeze_traced() can change ->state underneath us. 
*/ prev_state = READ_ONCE(prev->__state); - if (!preempt && prev_state) { + if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) { if (signal_pending_state(prev_state, prev)) { WRITE_ONCE(prev->__state, TASK_RUNNING); } else { @@ -5932,7 +6281,7 @@ static void __sched notrace __schedule(bool preempt) migrate_disable_switch(rq, prev); psi_sched_switch(prev, next, !task_on_rq_queued(prev)); - trace_sched_switch(preempt, prev, next); + trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next); /* Also unlocks the rq: */ rq = context_switch(rq, prev, next, &rf); @@ -5953,7 +6302,7 @@ void __noreturn do_task_dead(void) /* Tell freezer to ignore us: */ current->flags |= PF_NOFREEZE; - __schedule(false); + __schedule(SM_NONE); BUG(); /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ @@ -6014,7 +6363,7 @@ asmlinkage __visible void __sched schedule(void) sched_submit_work(tsk); do { preempt_disable(); - __schedule(false); + __schedule(SM_NONE); sched_preempt_enable_no_resched(); } while (need_resched()); sched_update_worker(tsk); @@ -6042,7 +6391,7 @@ void __sched schedule_idle(void) */ WARN_ON_ONCE(current->__state); do { - __schedule(false); + __schedule(SM_NONE); } while (need_resched()); } @@ -6077,6 +6426,18 @@ void __sched schedule_preempt_disabled(void) preempt_disable(); } +#ifdef CONFIG_PREEMPT_RT +void __sched notrace schedule_rtlock(void) +{ + do { + preempt_disable(); + __schedule(SM_RTLOCK_WAIT); + sched_preempt_enable_no_resched(); + } while (need_resched()); +} +NOKPROBE_SYMBOL(schedule_rtlock); +#endif + static void __sched notrace preempt_schedule_common(void) { do { @@ -6095,7 +6456,7 @@ static void __sched notrace preempt_schedule_common(void) */ preempt_disable_notrace(); preempt_latency_start(1); - __schedule(true); + __schedule(SM_PREEMPT); preempt_latency_stop(1); preempt_enable_no_resched_notrace(); @@ -6174,7 +6535,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) * an infinite 
recursion. */ prev_ctx = exception_enter(); - __schedule(true); + __schedule(SM_PREEMPT); exception_exit(prev_ctx); preempt_latency_stop(1); @@ -6323,7 +6684,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) do { preempt_disable(); local_irq_enable(); - __schedule(true); + __schedule(SM_PREEMPT); local_irq_disable(); sched_preempt_enable_no_resched(); } while (need_resched()); @@ -7300,6 +7661,16 @@ err_size: return -E2BIG; } +static void get_params(struct task_struct *p, struct sched_attr *attr) +{ + if (task_has_dl_policy(p)) + __getparam_dl(p, attr); + else if (task_has_rt_policy(p)) + attr->sched_priority = p->rt_priority; + else + attr->sched_nice = task_nice(p); +} + /** * sys_sched_setscheduler - set/change the scheduler policy and RT priority * @pid: the pid in question. @@ -7361,6 +7732,8 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, rcu_read_unlock(); if (likely(p)) { + if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS) + get_params(p, &attr); retval = sched_setattr(p, &attr); put_task_struct(p); } @@ -7509,12 +7882,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, kattr.sched_policy = p->policy; if (p->sched_reset_on_fork) kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; - if (task_has_dl_policy(p)) - __getparam_dl(p, &kattr); - else if (task_has_rt_policy(p)) - kattr.sched_priority = p->rt_priority; - else - kattr.sched_nice = task_nice(p); + get_params(p, &kattr); + kattr.sched_flags &= SCHED_FLAG_ALL; #ifdef CONFIG_UCLAMP_TASK /* @@ -7535,9 +7904,76 @@ out_unlock: return retval; } -long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) +#ifdef CONFIG_SMP +int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask) { + int ret = 0; + + /* + * If the task isn't a deadline task or admission control is + * disabled then we don't care about affinity changes. 
+ */ + if (!task_has_dl_policy(p) || !dl_bandwidth_enabled()) + return 0; + + /* + * Since bandwidth control happens on root_domain basis, + * if admission test is enabled, we only admit -deadline + * tasks allowed to run on all the CPUs in the task's + * root_domain. + */ + rcu_read_lock(); + if (!cpumask_subset(task_rq(p)->rd->span, mask)) + ret = -EBUSY; + rcu_read_unlock(); + return ret; +} +#endif + +static int +__sched_setaffinity(struct task_struct *p, const struct cpumask *mask) +{ + int retval; cpumask_var_t cpus_allowed, new_mask; + + if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) + return -ENOMEM; + + if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { + retval = -ENOMEM; + goto out_free_cpus_allowed; + } + + cpuset_cpus_allowed(p, cpus_allowed); + cpumask_and(new_mask, mask, cpus_allowed); + + retval = dl_task_check_affinity(p, new_mask); + if (retval) + goto out_free_new_mask; +again: + retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK | SCA_USER); + if (retval) + goto out_free_new_mask; + + cpuset_cpus_allowed(p, cpus_allowed); + if (!cpumask_subset(new_mask, cpus_allowed)) { + /* + * We must have raced with a concurrent cpuset update. + * Just reset the cpumask to the cpuset's cpus_allowed. 
+ */ + cpumask_copy(new_mask, cpus_allowed); + goto again; + } + +out_free_new_mask: + free_cpumask_var(new_mask); +out_free_cpus_allowed: + free_cpumask_var(cpus_allowed); + return retval; +} + +long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) +{ struct task_struct *p; int retval; @@ -7557,68 +7993,22 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) retval = -EINVAL; goto out_put_task; } - if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { - retval = -ENOMEM; - goto out_put_task; - } - if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { - retval = -ENOMEM; - goto out_free_cpus_allowed; - } - retval = -EPERM; + if (!check_same_owner(p)) { rcu_read_lock(); if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { rcu_read_unlock(); - goto out_free_new_mask; + retval = -EPERM; + goto out_put_task; } rcu_read_unlock(); } retval = security_task_setscheduler(p); if (retval) - goto out_free_new_mask; - - - cpuset_cpus_allowed(p, cpus_allowed); - cpumask_and(new_mask, in_mask, cpus_allowed); - - /* - * Since bandwidth control happens on root_domain basis, - * if admission test is enabled, we only admit -deadline - * tasks allowed to run on all the CPUs in the task's - * root_domain. - */ -#ifdef CONFIG_SMP - if (task_has_dl_policy(p) && dl_bandwidth_enabled()) { - rcu_read_lock(); - if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) { - retval = -EBUSY; - rcu_read_unlock(); - goto out_free_new_mask; - } - rcu_read_unlock(); - } -#endif -again: - retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK); + goto out_put_task; - if (!retval) { - cpuset_cpus_allowed(p, cpus_allowed); - if (!cpumask_subset(new_mask, cpus_allowed)) { - /* - * We must have raced with a concurrent cpuset - * update. 
Just reset the cpus_allowed to the - * cpuset's cpus_allowed - */ - cpumask_copy(new_mask, cpus_allowed); - goto again; - } - } -out_free_new_mask: - free_cpumask_var(new_mask); -out_free_cpus_allowed: - free_cpumask_var(cpus_allowed); + retval = __sched_setaffinity(p, in_mask); out_put_task: put_task_struct(p); return retval; @@ -7761,6 +8151,17 @@ int __sched __cond_resched(void) preempt_schedule_common(); return 1; } + /* + * In preemptible kernels, ->rcu_read_lock_nesting tells the tick + * whether the current CPU is in an RCU read-side critical section, + * so the tick can report quiescent states even for CPUs looping + * in kernel context. In contrast, in non-preemptible kernels, + * RCU readers leave no in-memory hints, which means that CPU-bound + * processes executing in kernel context might never report an + * RCU quiescent state. Therefore, the following code causes + * cond_resched() to report a quiescent state, but only when RCU + * is in urgent need of one. + */ #ifndef CONFIG_PREEMPT_RCU rcu_all_qs(); #endif @@ -8707,6 +9108,8 @@ int sched_cpu_deactivate(unsigned int cpu) */ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) static_branch_dec_cpuslocked(&sched_smt_present); + + sched_core_cpu_deactivate(cpu); #endif if (!sched_smp_initialized) @@ -8811,6 +9214,7 @@ int sched_cpu_dying(unsigned int cpu) calc_load_migrate(rq); update_max_interval(); hrtick_clear(rq); + sched_core_cpu_dying(cpu); return 0; } #endif @@ -9022,7 +9426,7 @@ void __init sched_init(void) atomic_set(&rq->nr_iowait, 0); #ifdef CONFIG_SCHED_CORE - rq->core = NULL; + rq->core = rq; rq->core_pick = NULL; rq->core_enabled = 0; rq->core_tree = RB_ROOT; @@ -9804,7 +10208,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota, * Prevent race between setting of cfs_rq->runtime_enabled and * unthrottle_offline_cfs_rqs(). 
*/ - get_online_cpus(); + cpus_read_lock(); mutex_lock(&cfs_constraints_mutex); ret = __cfs_schedulable(tg, period, quota); if (ret) @@ -9848,7 +10252,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota, cfs_bandwidth_usage_dec(); out_unlock: mutex_unlock(&cfs_constraints_mutex); - put_online_cpus(); + cpus_read_unlock(); return ret; } @@ -10099,6 +10503,20 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css, } #endif /* CONFIG_RT_GROUP_SCHED */ +#ifdef CONFIG_FAIR_GROUP_SCHED +static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return css_tg(css)->idle; +} + +static int cpu_idle_write_s64(struct cgroup_subsys_state *css, + struct cftype *cft, s64 idle) +{ + return sched_group_set_idle(css_tg(css), idle); +} +#endif + static struct cftype cpu_legacy_files[] = { #ifdef CONFIG_FAIR_GROUP_SCHED { @@ -10106,6 +10524,11 @@ static struct cftype cpu_legacy_files[] = { .read_u64 = cpu_shares_read_u64, .write_u64 = cpu_shares_write_u64, }, + { + .name = "idle", + .read_s64 = cpu_idle_read_s64, + .write_s64 = cpu_idle_write_s64, + }, #endif #ifdef CONFIG_CFS_BANDWIDTH { @@ -10313,6 +10736,12 @@ static struct cftype cpu_files[] = { .read_s64 = cpu_weight_nice_read_s64, .write_s64 = cpu_weight_nice_write_s64, }, + { + .name = "idle", + .flags = CFTYPE_NOT_ON_ROOT, + .read_s64 = cpu_idle_read_s64, + .write_s64 = cpu_idle_write_s64, + }, #endif #ifdef CONFIG_CFS_BANDWIDTH { diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 57124614363d..e7af18857371 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -537,9 +537,17 @@ static struct attribute *sugov_attrs[] = { }; ATTRIBUTE_GROUPS(sugov); +static void sugov_tunables_free(struct kobject *kobj) +{ + struct gov_attr_set *attr_set = container_of(kobj, struct gov_attr_set, kobj); + + kfree(to_sugov_tunables(attr_set)); +} + static struct kobj_type sugov_tunables_ktype = { 
.default_groups = sugov_groups, .sysfs_ops = &governor_sysfs_ops, + .release = &sugov_tunables_free, }; /********************** cpufreq governor interface *********************/ @@ -639,12 +647,10 @@ static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_polic return tunables; } -static void sugov_tunables_free(struct sugov_tunables *tunables) +static void sugov_clear_global_tunables(void) { if (!have_governor_per_policy()) global_tunables = NULL; - - kfree(tunables); } static int sugov_init(struct cpufreq_policy *policy) @@ -707,7 +713,7 @@ out: fail: kobject_put(&tunables->attr_set.kobj); policy->governor_data = NULL; - sugov_tunables_free(tunables); + sugov_clear_global_tunables(); stop_kthread: sugov_kthread_stop(sg_policy); @@ -734,7 +740,7 @@ static void sugov_exit(struct cpufreq_policy *policy) count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook); policy->governor_data = NULL; if (!count) - sugov_tunables_free(tunables); + sugov_clear_global_tunables(); mutex_unlock(&global_tunables_lock); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index aaacd6cfd42f..e94314633b39 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1733,6 +1733,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused */ raw_spin_rq_lock(rq); if (p->dl.dl_non_contending) { + update_rq_clock(rq); sub_running_bw(&p->dl, &rq->dl); p->dl.dl_non_contending = 0; /* @@ -2741,7 +2742,7 @@ void __setparam_dl(struct task_struct *p, const struct sched_attr *attr) dl_se->dl_runtime = attr->sched_runtime; dl_se->dl_deadline = attr->sched_deadline; dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; - dl_se->flags = attr->sched_flags; + dl_se->flags = attr->sched_flags & SCHED_DL_FLAGS; dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime); } @@ -2754,7 +2755,8 @@ void __getparam_dl(struct task_struct *p, struct 
sched_attr *attr) attr->sched_runtime = dl_se->dl_runtime; attr->sched_deadline = dl_se->dl_deadline; attr->sched_period = dl_se->dl_period; - attr->sched_flags = dl_se->flags; + attr->sched_flags &= ~SCHED_DL_FLAGS; + attr->sched_flags |= dl_se->flags; } /* @@ -2851,7 +2853,7 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) if (dl_se->dl_runtime != attr->sched_runtime || dl_se->dl_deadline != attr->sched_deadline || dl_se->dl_period != attr->sched_period || - dl_se->flags != attr->sched_flags) + dl_se->flags != (attr->sched_flags & SCHED_DL_FLAGS)) return true; return false; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 0c5ec2776ddf..49716228efb4 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -388,6 +388,13 @@ void update_sched_domain_debugfs(void) { int cpu, i; + /* + * This can unfortunately be invoked before sched_debug_init() creates + * the debug directory. Don't touch sd_sysctl_cpus until then. + */ + if (!debugfs_sched) + return; + if (!cpumask_available(sd_sysctl_cpus)) { if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL)) return; @@ -600,6 +607,9 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", cfs_rq->nr_spread_over); SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); + SEQ_printf(m, " .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running); + SEQ_printf(m, " .%-30s: %d\n", "idle_h_nr_running", + cfs_rq->idle_h_nr_running); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); #ifdef CONFIG_SMP SEQ_printf(m, " .%-30s: %lu\n", "load_avg", diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 44c452072a1b..ff69f245b939 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -431,6 +431,23 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) } } +static int tg_is_idle(struct task_group *tg) +{ + return tg->idle > 0; +} + +static int cfs_rq_is_idle(struct cfs_rq *cfs_rq) 
+{ + return cfs_rq->idle > 0; +} + +static int se_is_idle(struct sched_entity *se) +{ + if (entity_is_task(se)) + return task_has_idle_policy(task_of(se)); + return cfs_rq_is_idle(group_cfs_rq(se)); +} + #else /* !CONFIG_FAIR_GROUP_SCHED */ #define for_each_sched_entity(se) \ @@ -468,6 +485,21 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) { } +static inline int tg_is_idle(struct task_group *tg) +{ + return 0; +} + +static int cfs_rq_is_idle(struct cfs_rq *cfs_rq) +{ + return 0; +} + +static int se_is_idle(struct sched_entity *se) +{ + return 0; +} + #endif /* CONFIG_FAIR_GROUP_SCHED */ static __always_inline @@ -1486,7 +1518,7 @@ static inline bool is_core_idle(int cpu) if (cpu == sibling) continue; - if (!idle_cpu(cpu)) + if (!idle_cpu(sibling)) return false; } #endif @@ -4841,6 +4873,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); + if (cfs_rq_is_idle(group_cfs_rq(se))) + idle_task_delta = cfs_rq->h_nr_running; + qcfs_rq->h_nr_running -= task_delta; qcfs_rq->idle_h_nr_running -= idle_task_delta; @@ -4860,6 +4895,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) update_load_avg(qcfs_rq, se, 0); se_update_runnable(se); + if (cfs_rq_is_idle(group_cfs_rq(se))) + idle_task_delta = cfs_rq->h_nr_running; + qcfs_rq->h_nr_running -= task_delta; qcfs_rq->idle_h_nr_running -= idle_task_delta; } @@ -4904,39 +4942,45 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) task_delta = cfs_rq->h_nr_running; idle_task_delta = cfs_rq->idle_h_nr_running; for_each_sched_entity(se) { + struct cfs_rq *qcfs_rq = cfs_rq_of(se); + if (se->on_rq) break; - cfs_rq = cfs_rq_of(se); - enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); + enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP); + + if (cfs_rq_is_idle(group_cfs_rq(se))) + idle_task_delta = cfs_rq->h_nr_running; - cfs_rq->h_nr_running += task_delta; - cfs_rq->idle_h_nr_running += idle_task_delta; + qcfs_rq->h_nr_running += task_delta; + qcfs_rq->idle_h_nr_running += 
idle_task_delta; /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(cfs_rq)) + if (cfs_rq_throttled(qcfs_rq)) goto unthrottle_throttle; } for_each_sched_entity(se) { - cfs_rq = cfs_rq_of(se); + struct cfs_rq *qcfs_rq = cfs_rq_of(se); - update_load_avg(cfs_rq, se, UPDATE_TG); + update_load_avg(qcfs_rq, se, UPDATE_TG); se_update_runnable(se); - cfs_rq->h_nr_running += task_delta; - cfs_rq->idle_h_nr_running += idle_task_delta; + if (cfs_rq_is_idle(group_cfs_rq(se))) + idle_task_delta = cfs_rq->h_nr_running; + qcfs_rq->h_nr_running += task_delta; + qcfs_rq->idle_h_nr_running += idle_task_delta; /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(cfs_rq)) + if (cfs_rq_throttled(qcfs_rq)) goto unthrottle_throttle; /* * One parent has been throttled and cfs_rq removed from the * list. Add it back to not break the leaf list. */ - if (throttled_hierarchy(cfs_rq)) - list_add_leaf_cfs_rq(cfs_rq); + if (throttled_hierarchy(qcfs_rq)) + list_add_leaf_cfs_rq(qcfs_rq); } /* At this point se is NULL and we are at root level*/ @@ -4949,9 +4993,9 @@ unthrottle_throttle: * assertion below. 
*/ for_each_sched_entity(se) { - cfs_rq = cfs_rq_of(se); + struct cfs_rq *qcfs_rq = cfs_rq_of(se); - if (list_add_leaf_cfs_rq(cfs_rq)) + if (list_add_leaf_cfs_rq(qcfs_rq)) break; } @@ -5574,6 +5618,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq->h_nr_running++; cfs_rq->idle_h_nr_running += idle_h_nr_running; + if (cfs_rq_is_idle(cfs_rq)) + idle_h_nr_running = 1; + /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) goto enqueue_throttle; @@ -5591,6 +5638,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq->h_nr_running++; cfs_rq->idle_h_nr_running += idle_h_nr_running; + if (cfs_rq_is_idle(cfs_rq)) + idle_h_nr_running = 1; + /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) goto enqueue_throttle; @@ -5668,6 +5718,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq->h_nr_running--; cfs_rq->idle_h_nr_running -= idle_h_nr_running; + if (cfs_rq_is_idle(cfs_rq)) + idle_h_nr_running = 1; + /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) goto dequeue_throttle; @@ -5697,6 +5750,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq->h_nr_running--; cfs_rq->idle_h_nr_running -= idle_h_nr_running; + if (cfs_rq_is_idle(cfs_rq)) + idle_h_nr_running = 1; + /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) goto dequeue_throttle; @@ -6249,7 +6305,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool time = cpu_clock(this); } - for_each_cpu_wrap(cpu, cpus, target) { + for_each_cpu_wrap(cpu, cpus, target + 1) { if (has_idle_core) { i = select_idle_core(p, cpu, cpus, &idle_cpu); if ((unsigned int)i < nr_cpumask_bits) @@ -6376,6 +6432,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) /* Check a recently used CPU as a potential idle candidate: */ 
recent_used_cpu = p->recent_used_cpu; + p->recent_used_cpu = prev; if (recent_used_cpu != prev && recent_used_cpu != target && cpus_share_cache(recent_used_cpu, target) && @@ -6902,9 +6959,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) } else if (wake_flags & WF_TTWU) { /* XXX always ? */ /* Fast path */ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); - - if (want_affine) - current->recent_used_cpu = cpu; } rcu_read_unlock(); @@ -7041,24 +7095,22 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) static void set_last_buddy(struct sched_entity *se) { - if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se)))) - return; - for_each_sched_entity(se) { if (SCHED_WARN_ON(!se->on_rq)) return; + if (se_is_idle(se)) + return; cfs_rq_of(se)->last = se; } } static void set_next_buddy(struct sched_entity *se) { - if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se)))) - return; - for_each_sched_entity(se) { if (SCHED_WARN_ON(!se->on_rq)) return; + if (se_is_idle(se)) + return; cfs_rq_of(se)->next = se; } } @@ -7079,6 +7131,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ struct cfs_rq *cfs_rq = task_cfs_rq(curr); int scale = cfs_rq->nr_running >= sched_nr_latency; int next_buddy_marked = 0; + int cse_is_idle, pse_is_idle; if (unlikely(se == pse)) return; @@ -7123,8 +7176,21 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ return; find_matching_se(&se, &pse); - update_curr(cfs_rq_of(se)); BUG_ON(!pse); + + cse_is_idle = se_is_idle(se); + pse_is_idle = se_is_idle(pse); + + /* + * Preempt an idle group in favor of a non-idle group (and don't preempt + * in the inverse case). 
+ */ + if (cse_is_idle && !pse_is_idle) + goto preempt; + if (cse_is_idle != pse_is_idle) + return; + + update_curr(cfs_rq_of(se)); if (wakeup_preempt_entity(se, pse) == 1) { /* * Bias pick_next to pick the sched entity that is @@ -10217,9 +10283,11 @@ static inline int on_null_domain(struct rq *rq) static inline int find_new_ilb(void) { int ilb; + const struct cpumask *hk_mask; + + hk_mask = housekeeping_cpumask(HK_FLAG_MISC); - for_each_cpu_and(ilb, nohz.idle_cpus_mask, - housekeeping_cpumask(HK_FLAG_MISC)) { + for_each_cpu_and(ilb, nohz.idle_cpus_mask, hk_mask) { if (ilb == smp_processor_id()) continue; @@ -11416,10 +11484,12 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, static DEFINE_MUTEX(shares_mutex); -int sched_group_set_shares(struct task_group *tg, unsigned long shares) +static int __sched_group_set_shares(struct task_group *tg, unsigned long shares) { int i; + lockdep_assert_held(&shares_mutex); + /* * We can't change the weight of the root cgroup. */ @@ -11428,9 +11498,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES)); - mutex_lock(&shares_mutex); if (tg->shares == shares) - goto done; + return 0; tg->shares = shares; for_each_possible_cpu(i) { @@ -11448,10 +11517,88 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) rq_unlock_irqrestore(rq, &rf); } -done: + return 0; +} + +int sched_group_set_shares(struct task_group *tg, unsigned long shares) +{ + int ret; + + mutex_lock(&shares_mutex); + if (tg_is_idle(tg)) + ret = -EINVAL; + else + ret = __sched_group_set_shares(tg, shares); + mutex_unlock(&shares_mutex); + + return ret; +} + +int sched_group_set_idle(struct task_group *tg, long idle) +{ + int i; + + if (tg == &root_task_group) + return -EINVAL; + + if (idle < 0 || idle > 1) + return -EINVAL; + + mutex_lock(&shares_mutex); + + if (tg->idle == idle) { + mutex_unlock(&shares_mutex); + return 0; + } + + 
tg->idle = idle; + + for_each_possible_cpu(i) { + struct rq *rq = cpu_rq(i); + struct sched_entity *se = tg->se[i]; + struct cfs_rq *grp_cfs_rq = tg->cfs_rq[i]; + bool was_idle = cfs_rq_is_idle(grp_cfs_rq); + long idle_task_delta; + struct rq_flags rf; + + rq_lock_irqsave(rq, &rf); + + grp_cfs_rq->idle = idle; + if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq))) + goto next_cpu; + + idle_task_delta = grp_cfs_rq->h_nr_running - + grp_cfs_rq->idle_h_nr_running; + if (!cfs_rq_is_idle(grp_cfs_rq)) + idle_task_delta *= -1; + + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + if (!se->on_rq) + break; + + cfs_rq->idle_h_nr_running += idle_task_delta; + + /* Already accounted at parent level and above. */ + if (cfs_rq_is_idle(cfs_rq)) + break; + } + +next_cpu: + rq_unlock_irqrestore(rq, &rf); + } + + /* Idle groups have minimum weight. */ + if (tg_is_idle(tg)) + __sched_group_set_shares(tg, scale_load(WEIGHT_IDLEPRIO)); + else + __sched_group_set_shares(tg, NICE_0_LOAD); + mutex_unlock(&shares_mutex); return 0; } + #else /* CONFIG_FAIR_GROUP_SCHED */ void free_fair_sched_group(struct task_group *tg) { } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 14a41a243f7b..3d3e5793e117 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -227,6 +227,8 @@ static inline void update_avg(u64 *avg, u64 sample) */ #define SCHED_FLAG_SUGOV 0x10000000 +#define SCHED_DL_FLAGS (SCHED_FLAG_RECLAIM | SCHED_FLAG_DL_OVERRUN | SCHED_FLAG_SUGOV) + static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se) { #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL @@ -394,6 +396,9 @@ struct task_group { struct cfs_rq **cfs_rq; unsigned long shares; + /* A positive value indicates that this is a SCHED_IDLE group. 
*/ + int idle; + #ifdef CONFIG_SMP /* * load_avg can be heavily contended at clock tick time, so put @@ -503,6 +508,8 @@ extern void sched_move_task(struct task_struct *tsk); #ifdef CONFIG_FAIR_GROUP_SCHED extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); +extern int sched_group_set_idle(struct task_group *tg, long idle); + #ifdef CONFIG_SMP extern void set_task_rq_fair(struct sched_entity *se, struct cfs_rq *prev, struct cfs_rq *next); @@ -599,6 +606,9 @@ struct cfs_rq { struct list_head leaf_cfs_rq_list; struct task_group *tg; /* group that "owns" this runqueue */ + /* Locally cached copy of our task_group's idle value */ + int idle; + #ifdef CONFIG_CFS_BANDWIDTH int runtime_enabled; s64 runtime_remaining; @@ -1093,7 +1103,7 @@ struct rq { unsigned int core_sched_seq; struct rb_root core_tree; - /* shared state */ + /* shared state -- careful with sched_core_cpu_deactivate() */ unsigned int core_task_seq; unsigned int core_pick_seq; unsigned long core_cookie; @@ -2234,6 +2244,7 @@ extern struct task_struct *pick_next_task_idle(struct rq *rq); #define SCA_CHECK 0x01 #define SCA_MIGRATE_DISABLE 0x02 #define SCA_MIGRATE_ENABLE 0x04 +#define SCA_USER 0x08 #ifdef CONFIG_SMP @@ -2255,6 +2266,9 @@ static inline struct task_struct *get_push_task(struct rq *rq) if (p->nr_cpus_allowed == 1) return NULL; + if (p->migration_disabled) + return NULL; + rq->push_busy = true; return get_task_struct(p); } @@ -2385,6 +2399,21 @@ extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); extern const_debug unsigned int sysctl_sched_nr_migrate; extern const_debug unsigned int sysctl_sched_migration_cost; +#ifdef CONFIG_SCHED_DEBUG +extern unsigned int sysctl_sched_latency; +extern unsigned int sysctl_sched_min_granularity; +extern unsigned int sysctl_sched_wakeup_granularity; +extern int sysctl_resched_latency_warn_ms; +extern int sysctl_resched_latency_warn_once; + +extern unsigned int sysctl_sched_tunable_scaling; + +extern 
unsigned int sysctl_numa_balancing_scan_delay; +extern unsigned int sysctl_numa_balancing_scan_period_min; +extern unsigned int sysctl_numa_balancing_scan_period_max; +extern unsigned int sysctl_numa_balancing_scan_size; +#endif + #ifdef CONFIG_SCHED_HRTICK /* diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index b77ad49dc14f..4e8698e62f07 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1482,6 +1482,8 @@ int sched_max_numa_distance; static int *sched_domains_numa_distance; static struct cpumask ***sched_domains_numa_masks; int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; + +static unsigned long __read_mostly *sched_numa_onlined_nodes; #endif /* @@ -1833,6 +1835,16 @@ void sched_init_numa(void) sched_domains_numa_masks[i][j] = mask; for_each_node(k) { + /* + * Distance information can be unreliable for + * offline nodes, defer building the node + * masks to its bringup. + * This relies on all unique distance values + * still being visible at init time. + */ + if (!node_online(j)) + continue; + if (sched_debug() && (node_distance(j, k) != node_distance(k, j))) sched_numa_warn("Node-distance not symmetric"); @@ -1886,6 +1898,53 @@ void sched_init_numa(void) sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1]; init_numa_topology_type(); + + sched_numa_onlined_nodes = bitmap_alloc(nr_node_ids, GFP_KERNEL); + if (!sched_numa_onlined_nodes) + return; + + bitmap_zero(sched_numa_onlined_nodes, nr_node_ids); + for_each_online_node(i) + bitmap_set(sched_numa_onlined_nodes, i, 1); +} + +static void __sched_domains_numa_masks_set(unsigned int node) +{ + int i, j; + + /* + * NUMA masks are not built for offline nodes in sched_init_numa(). + * Thus, when a CPU of a never-onlined-before node gets plugged in, + * adding that new CPU to the right NUMA masks is not sufficient: the + * masks of that CPU's node must also be updated. 
+ */ + if (test_bit(node, sched_numa_onlined_nodes)) + return; + + bitmap_set(sched_numa_onlined_nodes, node, 1); + + for (i = 0; i < sched_domains_numa_levels; i++) { + for (j = 0; j < nr_node_ids; j++) { + if (!node_online(j) || node == j) + continue; + + if (node_distance(j, node) > sched_domains_numa_distance[i]) + continue; + + /* Add remote nodes in our masks */ + cpumask_or(sched_domains_numa_masks[i][node], + sched_domains_numa_masks[i][node], + sched_domains_numa_masks[0][j]); + } + } + + /* + * A new node has been brought up, potentially changing the topology + * classification. + * + * Note that this is racy vs any use of sched_numa_topology_type :/ + */ + init_numa_topology_type(); } void sched_domains_numa_masks_set(unsigned int cpu) @@ -1893,8 +1952,14 @@ void sched_domains_numa_masks_set(unsigned int cpu) int node = cpu_to_node(cpu); int i, j; + __sched_domains_numa_masks_set(node); + for (i = 0; i < sched_domains_numa_levels; i++) { for (j = 0; j < nr_node_ids; j++) { + if (!node_online(j)) + continue; + + /* Set ourselves in the remote node's masks */ if (node_distance(j, node) <= sched_domains_numa_distance[i]) cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]); } diff --git a/kernel/signal.c b/kernel/signal.c index a3229add4455..52b6abec0ff8 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1413,6 +1413,21 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, return sighand; } +#ifdef CONFIG_LOCKDEP +void lockdep_assert_task_sighand_held(struct task_struct *task) +{ + struct sighand_struct *sighand; + + rcu_read_lock(); + sighand = rcu_dereference(task->sighand); + if (sighand) + lockdep_assert_held(&sighand->siglock); + else + WARN_ON_ONCE(1); + rcu_read_unlock(); +} +#endif + /* * send signal info to all the members of a group */ diff --git a/kernel/smp.c b/kernel/smp.c index 52bf159ec400..f43ede0ab183 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -764,7 +764,7 @@ int smp_call_function_single(int cpu, smp_call_func_t 
func, void *info, EXPORT_SYMBOL(smp_call_function_single); /** - * smp_call_function_single_async(): Run an asynchronous function on a + * smp_call_function_single_async() - Run an asynchronous function on a * specific CPU. * @cpu: The CPU to run on. * @csd: Pre-allocated and setup data structure @@ -783,6 +783,8 @@ EXPORT_SYMBOL(smp_call_function_single); * * NOTE: Be careful, there is unfortunately no current debugging facility to * validate the correctness of this serialization. + * + * Return: %0 on success or negative errno value on error */ int smp_call_function_single_async(int cpu, struct __call_single_data *csd) { @@ -974,7 +976,7 @@ static void smp_call_function_many_cond(const struct cpumask *mask, * @mask: The set of cpus to run on (only runs on online subset). * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. - * @flags: Bitmask that controls the operation. If %SCF_WAIT is set, wait + * @wait: Bitmask that controls the operation. If %SCF_WAIT is set, wait * (atomically) until function has completed on other CPUs. If * %SCF_RUN_LOCAL is set, the function will also be run locally * if the local CPU is set in the @cpumask. @@ -1180,7 +1182,13 @@ void wake_up_all_idle_cpus(void) EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus); /** - * smp_call_on_cpu - Call a function on a specific cpu + * struct smp_call_on_cpu_struct - Call a function on a specific CPU + * @work: &work_struct + * @done: &completion to signal + * @func: function to call + * @data: function's data argument + * @ret: return value from @func + * @cpu: target CPU (%-1 for any CPU) * * Used to call a function on a specific cpu and wait for it to return. 
* Optionally make sure the call is done on a specified physical cpu via vcpu diff --git a/kernel/smpboot.c b/kernel/smpboot.c index cf6acab78538..f6bc0bc8a2aa 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -291,7 +291,7 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) unsigned int cpu; int ret = 0; - get_online_cpus(); + cpus_read_lock(); mutex_lock(&smpboot_threads_lock); for_each_online_cpu(cpu) { ret = __smpboot_create_thread(plug_thread, cpu); @@ -304,7 +304,7 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) list_add(&plug_thread->list, &hotplug_threads); out: mutex_unlock(&smpboot_threads_lock); - put_online_cpus(); + cpus_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread); @@ -317,12 +317,12 @@ EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread); */ void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread) { - get_online_cpus(); + cpus_read_lock(); mutex_lock(&smpboot_threads_lock); list_del(&plug_thread->list); smpboot_destroy_threads(plug_thread); mutex_unlock(&smpboot_threads_lock); - put_online_cpus(); + cpus_read_unlock(); } EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread); diff --git a/kernel/softirq.c b/kernel/softirq.c index f3a012179f47..322b65d45676 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -422,7 +422,7 @@ static inline void invoke_softirq(void) if (ksoftirqd_running(local_softirq_pending())) return; - if (!force_irqthreads || !__this_cpu_read(ksoftirqd)) { + if (!force_irqthreads() || !__this_cpu_read(ksoftirqd)) { #ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK /* * We can safely execute softirq on the current stack if diff --git a/kernel/sys.c b/kernel/sys.c index ef1a78f5d71c..72c7639e3c98 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -480,7 +480,8 @@ static int set_user(struct cred *new) * failure to the execve() stage. 
*/ if (is_ucounts_overlimit(new->ucounts, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)) && - new_user != INIT_USER) + new_user != INIT_USER && + !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) current->flags |= PF_NPROC_EXCEEDED; else current->flags &= ~PF_NPROC_EXCEEDED; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 272f4a272f8c..25e49b4d8049 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -536,6 +536,21 @@ static void proc_put_char(void **buf, size_t *size, char c) } } +static int do_proc_dobool_conv(bool *negp, unsigned long *lvalp, + int *valp, + int write, void *data) +{ + if (write) { + *(bool *)valp = *lvalp; + } else { + int val = *(bool *)valp; + + *lvalp = (unsigned long)val; + *negp = false; + } + return 0; +} + static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, int *valp, int write, void *data) @@ -799,6 +814,26 @@ static int do_proc_douintvec(struct ctl_table *table, int write, } /** + * proc_dobool - read/write a bool + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * + * Returns 0 on success. 
+ */ +int proc_dobool(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) +{ + return do_proc_dointvec(table, write, buffer, lenp, ppos, + do_proc_dobool_conv, NULL); +} + +/** * proc_dointvec - read a vector of integers * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file @@ -1630,6 +1665,12 @@ int proc_dostring(struct ctl_table *table, int write, return -ENOSYS; } +int proc_dobool(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + int proc_dointvec(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -3425,6 +3466,7 @@ int __init sysctl_init(void) * No sense putting this after each symbol definition, twice, * exception granted :-) */ +EXPORT_SYMBOL(proc_dobool); EXPORT_SYMBOL(proc_dointvec); EXPORT_SYMBOL(proc_douintvec); EXPORT_SYMBOL(proc_dointvec_jiffies); diff --git a/kernel/time/clocksource-wdtest.c b/kernel/time/clocksource-wdtest.c index 01df12395c0e..df922f49d171 100644 --- a/kernel/time/clocksource-wdtest.c +++ b/kernel/time/clocksource-wdtest.c @@ -19,6 +19,8 @@ #include <linux/prandom.h> #include <linux/cpu.h> +#include "tick-internal.h" + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@kernel.org>"); @@ -34,9 +36,6 @@ static u64 wdtest_jiffies_read(struct clocksource *cs) return (u64)jiffies; } -/* Assume HZ > 100. 
*/ -#define JIFFIES_SHIFT 8 - static struct clocksource clocksource_wdtest_jiffies = { .name = "wdtest-jiffies", .rating = 1, /* lowest valid rating*/ diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index b89c76e1c02c..b8a14d2fb5ba 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -306,12 +306,12 @@ void clocksource_verify_percpu(struct clocksource *cs) return; cpumask_clear(&cpus_ahead); cpumask_clear(&cpus_behind); - get_online_cpus(); + cpus_read_lock(); preempt_disable(); clocksource_verify_choose_cpus(); if (cpumask_weight(&cpus_chosen) == 0) { preempt_enable(); - put_online_cpus(); + cpus_read_unlock(); pr_warn("Not enough CPUs to check clocksource '%s'.\n", cs->name); return; } @@ -337,7 +337,7 @@ void clocksource_verify_percpu(struct clocksource *cs) cs_nsec_min = cs_nsec; } preempt_enable(); - put_online_cpus(); + cpus_read_unlock(); if (!cpumask_empty(&cpus_ahead)) pr_warn(" CPUs %*pbl ahead of CPU %d for clocksource %s.\n", cpumask_pr_args(&cpus_ahead), testcpu, cs->name); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 4a66725b1d4a..0ea8702eb516 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -652,21 +652,10 @@ static inline int hrtimer_hres_active(void) return __hrtimer_hres_active(this_cpu_ptr(&hrtimer_bases)); } -/* - * Reprogram the event source with checking both queues for the - * next event - * Called with interrupts disabled and base->lock held - */ -static void -hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) +static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base, + struct hrtimer *next_timer, + ktime_t expires_next) { - ktime_t expires_next; - - expires_next = hrtimer_update_next_event(cpu_base); - - if (skip_equal && expires_next == cpu_base->expires_next) - return; - cpu_base->expires_next = expires_next; /* @@ -689,7 +678,25 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) if 
(!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected) return; - tick_program_event(cpu_base->expires_next, 1); + tick_program_event(expires_next, 1); +} + +/* + * Reprogram the event source with checking both queues for the + * next event + * Called with interrupts disabled and base->lock held + */ +static void +hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) +{ + ktime_t expires_next; + + expires_next = hrtimer_update_next_event(cpu_base); + + if (skip_equal && expires_next == cpu_base->expires_next) + return; + + __hrtimer_reprogram(cpu_base, cpu_base->next_timer, expires_next); } /* High resolution timer related functions */ @@ -720,23 +727,7 @@ static inline int hrtimer_is_hres_enabled(void) return hrtimer_hres_enabled; } -/* - * Retrigger next event is called after clock was set - * - * Called with interrupts disabled via on_each_cpu() - */ -static void retrigger_next_event(void *arg) -{ - struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); - - if (!__hrtimer_hres_active(base)) - return; - - raw_spin_lock(&base->lock); - hrtimer_update_base(base); - hrtimer_force_reprogram(base, 0); - raw_spin_unlock(&base->lock); -} +static void retrigger_next_event(void *arg); /* * Switch to high resolution mode @@ -758,29 +749,54 @@ static void hrtimer_switch_to_hres(void) retrigger_next_event(NULL); } -static void clock_was_set_work(struct work_struct *work) -{ - clock_was_set(); -} +#else -static DECLARE_WORK(hrtimer_work, clock_was_set_work); +static inline int hrtimer_is_hres_enabled(void) { return 0; } +static inline void hrtimer_switch_to_hres(void) { } +#endif /* CONFIG_HIGH_RES_TIMERS */ /* - * Called from timekeeping and resume code to reprogram the hrtimer - * interrupt device on all cpus. + * Retrigger next event is called after clock was set with interrupts + * disabled through an SMP function call or directly from low level + * resume code. + * + * This is only invoked when: + * - CONFIG_HIGH_RES_TIMERS is enabled. 
+ * - CONFIG_NOHZ_COMMON is enabled + * + * For the other cases this function is empty and because the call sites + * are optimized out it vanishes as well, i.e. no need for lots of + * #ifdeffery. */ -void clock_was_set_delayed(void) +static void retrigger_next_event(void *arg) { - schedule_work(&hrtimer_work); -} - -#else + struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); -static inline int hrtimer_is_hres_enabled(void) { return 0; } -static inline void hrtimer_switch_to_hres(void) { } -static inline void retrigger_next_event(void *arg) { } + /* + * When high resolution mode or nohz is active, then the offsets of + * CLOCK_REALTIME/TAI/BOOTTIME have to be updated. Otherwise the + * next tick will take care of that. + * + * If high resolution mode is active then the next expiring timer + * must be reevaluated and the clock event device reprogrammed if + * necessary. + * + * In the NOHZ case the update of the offset and the reevaluation + * of the next expiring timer is enough. The return from the SMP + * function call will take care of the reprogramming in case the + * CPU was in a NOHZ idle sleep. + */ + if (!__hrtimer_hres_active(base) && !tick_nohz_active) + return; -#endif /* CONFIG_HIGH_RES_TIMERS */ + raw_spin_lock(&base->lock); + hrtimer_update_base(base); + if (__hrtimer_hres_active(base)) + hrtimer_force_reprogram(base, 0); + else + hrtimer_update_next_event(base); + raw_spin_unlock(&base->lock); +} /* * When a timer is enqueued and expires earlier than the already enqueued @@ -835,75 +851,161 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram) if (base->cpu_base != cpu_base) return; + if (expires >= cpu_base->expires_next) + return; + /* - * If the hrtimer interrupt is running, then it will - * reevaluate the clock bases and reprogram the clock event - * device. The callbacks are always executed in hard interrupt - * context so we don't need an extra check for a running - * callback. 
+ * If the hrtimer interrupt is running, then it will reevaluate the + * clock bases and reprogram the clock event device. */ if (cpu_base->in_hrtirq) return; - if (expires >= cpu_base->expires_next) - return; - - /* Update the pointer to the next expiring timer */ cpu_base->next_timer = timer; - cpu_base->expires_next = expires; + + __hrtimer_reprogram(cpu_base, timer, expires); +} + +static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base, + unsigned int active) +{ + struct hrtimer_clock_base *base; + unsigned int seq; + ktime_t expires; /* - * If hres is not active, hardware does not have to be - * programmed yet. + * Update the base offsets unconditionally so the following + * checks whether the SMP function call is required works. * - * If a hang was detected in the last timer interrupt then we - * do not schedule a timer which is earlier than the expiry - * which we enforced in the hang detection. We want the system - * to make progress. + * The update is safe even when the remote CPU is in the hrtimer + * interrupt or the hrtimer soft interrupt and expiring affected + * bases. Either it will see the update before handling a base or + * it will see it when it finishes the processing and reevaluates + * the next expiring timer. */ - if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected) - return; + seq = cpu_base->clock_was_set_seq; + hrtimer_update_base(cpu_base); + + /* + * If the sequence did not change over the update then the + * remote CPU already handled it. + */ + if (seq == cpu_base->clock_was_set_seq) + return false; + + /* + * If the remote CPU is currently handling an hrtimer interrupt, it + * will reevaluate the first expiring timer of all clock bases + * before reprogramming. Nothing to do here. + */ + if (cpu_base->in_hrtirq) + return false; /* - * Program the timer hardware. We enforce the expiry for - * events which are already in the past. 
+ * Walk the affected clock bases and check whether the first expiring + * timer in a clock base is moving ahead of the first expiring timer of + * @cpu_base. If so, the IPI must be invoked because per CPU clock + * event devices cannot be remotely reprogrammed. */ - tick_program_event(expires, 1); + active &= cpu_base->active_bases; + + for_each_active_base(base, cpu_base, active) { + struct timerqueue_node *next; + + next = timerqueue_getnext(&base->active); + expires = ktime_sub(next->expires, base->offset); + if (expires < cpu_base->expires_next) + return true; + + /* Extra check for softirq clock bases */ + if (base->clockid < HRTIMER_BASE_MONOTONIC_SOFT) + continue; + if (cpu_base->softirq_activated) + continue; + if (expires < cpu_base->softirq_expires_next) + return true; + } + return false; } /* - * Clock realtime was set - * - * Change the offset of the realtime clock vs. the monotonic - * clock. + * Clock was set. This might affect CLOCK_REALTIME, CLOCK_TAI and + * CLOCK_BOOTTIME (for late sleep time injection). * - * We might have to reprogram the high resolution timer interrupt. On - * SMP we call the architecture specific code to retrigger _all_ high - * resolution timer interrupts. On UP we just disable interrupts and - * call the high resolution interrupt code. + * This requires to update the offsets for these clocks + * vs. CLOCK_MONOTONIC. When high resolution timers are enabled, then this + * also requires to eventually reprogram the per CPU clock event devices + * when the change moves an affected timer ahead of the first expiring + * timer on that CPU. Obviously remote per CPU clock event devices cannot + * be reprogrammed. The other reason why an IPI has to be sent is when the + * system is in !HIGH_RES and NOHZ mode. The NOHZ mode updates the offsets + * in the tick, which obviously might be stopped, so this has to bring out + * the remote CPU which might sleep in idle to get this sorted. 
*/ -void clock_was_set(void) +void clock_was_set(unsigned int bases) { -#ifdef CONFIG_HIGH_RES_TIMERS - /* Retrigger the CPU local events everywhere */ - on_each_cpu(retrigger_next_event, NULL, 1); -#endif + struct hrtimer_cpu_base *cpu_base = raw_cpu_ptr(&hrtimer_bases); + cpumask_var_t mask; + int cpu; + + if (!__hrtimer_hres_active(cpu_base) && !tick_nohz_active) + goto out_timerfd; + + if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) { + on_each_cpu(retrigger_next_event, NULL, 1); + goto out_timerfd; + } + + /* Avoid interrupting CPUs if possible */ + cpus_read_lock(); + for_each_online_cpu(cpu) { + unsigned long flags; + + cpu_base = &per_cpu(hrtimer_bases, cpu); + raw_spin_lock_irqsave(&cpu_base->lock, flags); + + if (update_needs_ipi(cpu_base, bases)) + cpumask_set_cpu(cpu, mask); + + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); + } + + preempt_disable(); + smp_call_function_many(mask, retrigger_next_event, NULL, 1); + preempt_enable(); + cpus_read_unlock(); + free_cpumask_var(mask); + +out_timerfd: timerfd_clock_was_set(); } +static void clock_was_set_work(struct work_struct *work) +{ + clock_was_set(CLOCK_SET_WALL); +} + +static DECLARE_WORK(hrtimer_work, clock_was_set_work); + +/* + * Called from timekeeping code to reprogram the hrtimer interrupt device + * on all cpus and to notify timerfd. + */ +void clock_was_set_delayed(void) +{ + schedule_work(&hrtimer_work); +} + /* - * During resume we might have to reprogram the high resolution timer - * interrupt on all online CPUs. However, all other CPUs will be - * stopped with IRQs interrupts disabled so the clock_was_set() call - * must be deferred. + * Called during resume either directly from via timekeeping_resume() + * or in the case of s2idle from tick_unfreeze() to ensure that the + * hrtimers are up to date. 
*/ -void hrtimers_resume(void) +void hrtimers_resume_local(void) { lockdep_assert_irqs_disabled(); /* Retrigger on the local CPU */ retrigger_next_event(NULL); - /* And schedule a retrigger for all others */ - clock_was_set_delayed(); } /* @@ -1030,12 +1132,13 @@ static void __remove_hrtimer(struct hrtimer *timer, * remove hrtimer, called with base lock held */ static inline int -remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart) +remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, + bool restart, bool keep_local) { u8 state = timer->state; if (state & HRTIMER_STATE_ENQUEUED) { - int reprogram; + bool reprogram; /* * Remove the timer and force reprogramming when high @@ -1048,8 +1151,16 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool rest debug_deactivate(timer); reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases); + /* + * If the timer is not restarted then reprogramming is + * required if the timer is local. If it is local and about + * to be restarted, avoid programming it twice (on removal + * and a moment later when it's requeued). + */ if (!restart) state = HRTIMER_STATE_INACTIVE; + else + reprogram &= !keep_local; __remove_hrtimer(timer, base, state, reprogram); return 1; @@ -1103,9 +1214,31 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, struct hrtimer_clock_base *base) { struct hrtimer_clock_base *new_base; + bool force_local, first; - /* Remove an active timer from the queue: */ - remove_hrtimer(timer, base, true); + /* + * If the timer is on the local cpu base and is the first expiring + * timer then this might end up reprogramming the hardware twice + * (on removal and on enqueue). To avoid that by prevent the + * reprogram on removal, keep the timer local to the current CPU + * and enforce reprogramming after it is queued no matter whether + * it is the new first expiring timer again or not. 
+ */ + force_local = base->cpu_base == this_cpu_ptr(&hrtimer_bases); + force_local &= base->cpu_base->next_timer == timer; + + /* + * Remove an active timer from the queue. In case it is not queued + * on the current CPU, make sure that remove_hrtimer() updates the + * remote data correctly. + * + * If it's on the current CPU and the first expiring timer, then + * skip reprogramming, keep the timer local and enforce + * reprogramming later if it was the first expiring timer. This + * avoids programming the underlying clock event twice (once at + * removal and once after enqueue). + */ + remove_hrtimer(timer, base, true, force_local); if (mode & HRTIMER_MODE_REL) tim = ktime_add_safe(tim, base->get_time()); @@ -1115,9 +1248,24 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, hrtimer_set_expires_range_ns(timer, tim, delta_ns); /* Switch the timer base, if necessary: */ - new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED); + if (!force_local) { + new_base = switch_hrtimer_base(timer, base, + mode & HRTIMER_MODE_PINNED); + } else { + new_base = base; + } - return enqueue_hrtimer(timer, new_base, mode); + first = enqueue_hrtimer(timer, new_base, mode); + if (!force_local) + return first; + + /* + * Timer was forced to stay on the current CPU to avoid + * reprogramming on removal and enqueue. Force reprogram the + * hardware by evaluating the new first expiring timer. 
+ */ + hrtimer_force_reprogram(new_base->cpu_base, 1); + return 0; } /** @@ -1183,7 +1331,7 @@ int hrtimer_try_to_cancel(struct hrtimer *timer) base = lock_hrtimer_base(timer, &flags); if (!hrtimer_callback_running(timer)) - ret = remove_hrtimer(timer, base, false); + ret = remove_hrtimer(timer, base, false, false); unlock_hrtimer_base(timer, &flags); diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 01935aafdb46..bc4db9e5ab70 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -10,28 +10,9 @@ #include <linux/init.h> #include "timekeeping.h" +#include "tick-internal.h" -/* Since jiffies uses a simple TICK_NSEC multiplier - * conversion, the .shift value could be zero. However - * this would make NTP adjustments impossible as they are - * in units of 1/2^.shift. Thus we use JIFFIES_SHIFT to - * shift both the nominator and denominator the same - * amount, and give ntp adjustments in units of 1/2^8 - * - * The value 8 is somewhat carefully chosen, as anything - * larger can result in overflows. TICK_NSEC grows as HZ - * shrinks, so values greater than 8 overflow 32bits when - * HZ=100. - */ -#if HZ < 34 -#define JIFFIES_SHIFT 6 -#elif HZ < 67 -#define JIFFIES_SHIFT 7 -#else -#define JIFFIES_SHIFT 8 -#endif - static u64 jiffies_read(struct clocksource *cs) { return (u64) jiffies; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 517be7fd175e..ee736861b18f 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -291,6 +291,8 @@ static void thread_group_start_cputime(struct task_struct *tsk, u64 *samples) struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; struct posix_cputimers *pct = &tsk->signal->posix_cputimers; + lockdep_assert_task_sighand_held(tsk); + /* Check if cputimer isn't running. This is accessed without locking. 
*/ if (!READ_ONCE(pct->timers_active)) { struct task_cputime sum; @@ -405,6 +407,55 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer) return 0; } +static struct posix_cputimer_base *timer_base(struct k_itimer *timer, + struct task_struct *tsk) +{ + int clkidx = CPUCLOCK_WHICH(timer->it_clock); + + if (CPUCLOCK_PERTHREAD(timer->it_clock)) + return tsk->posix_cputimers.bases + clkidx; + else + return tsk->signal->posix_cputimers.bases + clkidx; +} + +/* + * Force recalculating the base earliest expiration on the next tick. + * This will also re-evaluate the need to keep around the process wide + * cputime counter and tick dependency and eventually shut these down + * if necessary. + */ +static void trigger_base_recalc_expires(struct k_itimer *timer, + struct task_struct *tsk) +{ + struct posix_cputimer_base *base = timer_base(timer, tsk); + + base->nextevt = 0; +} + +/* + * Dequeue the timer and reset the base if it was its earliest expiration. + * It makes sure the next tick recalculates the base next expiration so we + * don't keep the costly process wide cputime counter around for a random + * amount of time, along with the tick dependency. + * + * If another timer gets queued between this and the next tick, its + * expiration will update the base next event if necessary on the next + * tick. + */ +static void disarm_timer(struct k_itimer *timer, struct task_struct *p) +{ + struct cpu_timer *ctmr = &timer->it.cpu; + struct posix_cputimer_base *base; + + if (!cpu_timer_dequeue(ctmr)) + return; + + base = timer_base(timer, p); + if (cpu_timer_getexpires(ctmr) == base->nextevt) + trigger_base_recalc_expires(timer, p); +} + + /* * Clean up a CPU-clock timer that is about to be destroyed. * This is called from timer deletion with the timer already locked. 
@@ -439,7 +490,7 @@ static int posix_cpu_timer_del(struct k_itimer *timer) if (timer->it.cpu.firing) ret = TIMER_RETRY; else - cpu_timer_dequeue(ctmr); + disarm_timer(timer, p); unlock_task_sighand(p, &flags); } @@ -498,15 +549,9 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk) */ static void arm_timer(struct k_itimer *timer, struct task_struct *p) { - int clkidx = CPUCLOCK_WHICH(timer->it_clock); + struct posix_cputimer_base *base = timer_base(timer, p); struct cpu_timer *ctmr = &timer->it.cpu; u64 newexp = cpu_timer_getexpires(ctmr); - struct posix_cputimer_base *base; - - if (CPUCLOCK_PERTHREAD(timer->it_clock)) - base = p->posix_cputimers.bases + clkidx; - else - base = p->signal->posix_cputimers.bases + clkidx; if (!cpu_timer_enqueue(&base->tqhead, ctmr)) return; @@ -703,16 +748,29 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, timer->it_overrun_last = 0; timer->it_overrun = -1; - if (new_expires != 0 && !(val < new_expires)) { + if (val >= new_expires) { + if (new_expires != 0) { + /* + * The designated time already passed, so we notify + * immediately, even if the thread never runs to + * accumulate more time on this clock. + */ + cpu_timer_fire(timer); + } + /* - * The designated time already passed, so we notify - * immediately, even if the thread never runs to - * accumulate more time on this clock. + * Make sure we don't keep around the process wide cputime + * counter or the tick dependency if they are not necessary. 
*/ - cpu_timer_fire(timer); - } + sighand = lock_task_sighand(p, &flags); + if (!sighand) + goto out; + + if (!cpu_timer_queued(ctmr)) + trigger_base_recalc_expires(timer, p); - ret = 0; + unlock_task_sighand(p, &flags); + } out: rcu_read_unlock(); if (old) @@ -1346,8 +1404,6 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clkid, } } - if (!*newval) - return; *newval += now; } diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index dd5697d7347b..3913222e7bcf 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -336,7 +336,7 @@ void posixtimer_rearm(struct kernel_siginfo *info) int posix_timer_event(struct k_itimer *timr, int si_private) { enum pid_type type; - int ret = -1; + int ret; /* * FIXME: if ->sigq is queued we can race with * dequeue_signal()->posixtimer_rearm(). diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index d663249652ef..46789356f856 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -470,6 +470,13 @@ void tick_resume_local(void) else tick_resume_oneshot(); } + + /* + * Ensure that hrtimers are up to date and the clockevents device + * is reprogrammed correctly when high resolution timers are + * enabled. 
+ */ + hrtimers_resume_local(); } /** diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 6a742a29e545..649f2b48e8f0 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -165,3 +165,35 @@ DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem); void timer_clear_idle(void); + +#define CLOCK_SET_WALL \ + (BIT(HRTIMER_BASE_REALTIME) | BIT(HRTIMER_BASE_REALTIME_SOFT) | \ + BIT(HRTIMER_BASE_TAI) | BIT(HRTIMER_BASE_TAI_SOFT)) + +#define CLOCK_SET_BOOT \ + (BIT(HRTIMER_BASE_BOOTTIME) | BIT(HRTIMER_BASE_BOOTTIME_SOFT)) + +void clock_was_set(unsigned int bases); +void clock_was_set_delayed(void); + +void hrtimers_resume_local(void); + +/* Since jiffies uses a simple TICK_NSEC multiplier + * conversion, the .shift value could be zero. However + * this would make NTP adjustments impossible as they are + * in units of 1/2^.shift. Thus we use JIFFIES_SHIFT to + * shift both the numerator and denominator the same + * amount, and give ntp adjustments in units of 1/2^8 + * + * The value 8 is somewhat carefully chosen, as anything + * larger can result in overflows. TICK_NSEC grows as HZ + * shrinks, so values greater than 8 overflow 32bits when + * HZ=100. 
+ */ +#if HZ < 34 +#define JIFFIES_SHIFT 6 +#elif HZ < 67 +#define JIFFIES_SHIFT 7 +#else +#define JIFFIES_SHIFT 8 +#endif diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 8a364aa9881a..b348749a9fc6 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1323,8 +1323,8 @@ out: write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); - /* signal hrtimers about time change */ - clock_was_set(); + /* Signal hrtimers about time change */ + clock_was_set(CLOCK_SET_WALL); if (!ret) audit_tk_injoffset(ts_delta); @@ -1371,8 +1371,8 @@ error: /* even if we error out, we forwarded the time, so call update */ write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); - /* signal hrtimers about time change */ - clock_was_set(); + /* Signal hrtimers about time change */ + clock_was_set(CLOCK_SET_WALL); return ret; } @@ -1746,8 +1746,8 @@ void timekeeping_inject_sleeptime64(const struct timespec64 *delta) write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); - /* signal hrtimers about time change */ - clock_was_set(); + /* Signal hrtimers about time change */ + clock_was_set(CLOCK_SET_WALL | CLOCK_SET_BOOT); } #endif @@ -1810,8 +1810,10 @@ void timekeeping_resume(void) touch_softlockup_watchdog(); + /* Resume the clockevent device(s) and hrtimers */ tick_resume(); - hrtimers_resume(); + /* Notify timerfd as resume is equivalent to clock_was_set() */ + timerfd_resume(); } int timekeeping_suspend(void) @@ -2125,7 +2127,7 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset, * timekeeping_advance - Updates the timekeeper to the current time and * current NTP tick length */ -static void timekeeping_advance(enum timekeeping_adv_mode mode) +static bool timekeeping_advance(enum timekeeping_adv_mode mode) { struct timekeeper *real_tk = &tk_core.timekeeper; struct timekeeper *tk = &shadow_timekeeper; @@ -2196,9 +2198,8 @@ static void 
timekeeping_advance(enum timekeeping_adv_mode mode) write_seqcount_end(&tk_core.seq); out: raw_spin_unlock_irqrestore(&timekeeper_lock, flags); - if (clock_set) - /* Have to call _delayed version, since in irq context*/ - clock_was_set_delayed(); + + return !!clock_set; } /** @@ -2207,7 +2208,8 @@ out: */ void update_wall_time(void) { - timekeeping_advance(TK_ADV_TICK); + if (timekeeping_advance(TK_ADV_TICK)) + clock_was_set_delayed(); } /** @@ -2387,8 +2389,9 @@ int do_adjtimex(struct __kernel_timex *txc) { struct timekeeper *tk = &tk_core.timekeeper; struct audit_ntp_data ad; - unsigned long flags; + bool clock_set = false; struct timespec64 ts; + unsigned long flags; s32 orig_tai, tai; int ret; @@ -2423,6 +2426,7 @@ int do_adjtimex(struct __kernel_timex *txc) if (tai != orig_tai) { __timekeeping_set_tai_offset(tk, tai); timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); + clock_set = true; } tk_update_leap_state(tk); @@ -2433,10 +2437,10 @@ int do_adjtimex(struct __kernel_timex *txc) /* Update the multiplier immediately if frequency was set directly */ if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK)) - timekeeping_advance(TK_ADV_FREQ); + clock_set |= timekeeping_advance(TK_ADV_FREQ); - if (tai != orig_tai) - clock_was_set(); + if (clock_set) + clock_was_set(CLOCK_REALTIME); ntp_notify_cmos_timer(); diff --git a/kernel/torture.c b/kernel/torture.c index 0a315c387bed..bb8f411c974b 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -521,11 +521,11 @@ static void torture_shuffle_tasks(void) struct shuffle_task *stp; cpumask_setall(shuffle_tmp_mask); - get_online_cpus(); + cpus_read_lock(); /* No point in shuffling if there is only one online CPU (ex: UP) */ if (num_online_cpus() == 1) { - put_online_cpus(); + cpus_read_unlock(); return; } @@ -541,7 +541,7 @@ static void torture_shuffle_tasks(void) set_cpus_allowed_ptr(stp->st_t, shuffle_tmp_mask); mutex_unlock(&shuffle_task_mutex); - put_online_cpus(); + cpus_read_unlock(); } /* Shuffle tasks across CPUs, with 
the intent of allowing each CPU in the diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 7b180f61e6d3..7efbc8aaf7f6 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3100,6 +3100,7 @@ ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec) static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) { + bool init_nop = ftrace_need_init_nop(); struct ftrace_page *pg; struct dyn_ftrace *p; u64 start, stop; @@ -3138,8 +3139,7 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) * Do the initial record conversion from mcount jump * to the NOP instructions. */ - if (!__is_defined(CC_USING_NOP_MCOUNT) && - !ftrace_nop_initialize(mod, p)) + if (init_nop && !ftrace_nop_initialize(mod, p)) break; update_cnt++; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f148eacda55a..33a6b4a2443d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -205,9 +205,26 @@ struct pool_workqueue { int refcnt; /* L: reference count */ int nr_in_flight[WORK_NR_COLORS]; /* L: nr of in_flight works */ + + /* + * nr_active management and WORK_STRUCT_INACTIVE: + * + * When pwq->nr_active >= max_active, new work item is queued to + * pwq->inactive_works instead of pool->worklist and marked with + * WORK_STRUCT_INACTIVE. + * + * All work items marked with WORK_STRUCT_INACTIVE do not participate + * in pwq->nr_active and all work items in pwq->inactive_works are + * marked with WORK_STRUCT_INACTIVE. But not all WORK_STRUCT_INACTIVE + * work items are in pwq->inactive_works. Some of them are ready to + * run in pool->worklist or worker->scheduled. Those work items are + * only struct wq_barrier which is used for flush_work() and should + * not participate in pwq->nr_active. For non-barrier work item, it + * is marked with WORK_STRUCT_INACTIVE iff it is in pwq->inactive_works. 
+ */ int nr_active; /* L: nr of active works */ int max_active; /* L: max active works */ - struct list_head delayed_works; /* L: delayed works */ + struct list_head inactive_works; /* L: inactive works */ struct list_head pwqs_node; /* WR: node on wq->pwqs */ struct list_head mayday_node; /* MD: node on wq->maydays */ @@ -524,7 +541,7 @@ static inline void debug_work_deactivate(struct work_struct *work) { } #endif /** - * worker_pool_assign_id - allocate ID and assing it to @pool + * worker_pool_assign_id - allocate ID and assign it to @pool * @pool: the pool pointer of interest * * Returns 0 if ID in [0, WORK_OFFQ_POOL_NONE) is allocated and assigned @@ -579,9 +596,9 @@ static unsigned int work_color_to_flags(int color) return color << WORK_STRUCT_COLOR_SHIFT; } -static int get_work_color(struct work_struct *work) +static int get_work_color(unsigned long work_data) { - return (*work_data_bits(work) >> WORK_STRUCT_COLOR_SHIFT) & + return (work_data >> WORK_STRUCT_COLOR_SHIFT) & ((1 << WORK_STRUCT_COLOR_BITS) - 1); } @@ -1136,7 +1153,7 @@ static void put_pwq_unlocked(struct pool_workqueue *pwq) } } -static void pwq_activate_delayed_work(struct work_struct *work) +static void pwq_activate_inactive_work(struct work_struct *work) { struct pool_workqueue *pwq = get_work_pwq(work); @@ -1144,22 +1161,22 @@ static void pwq_activate_delayed_work(struct work_struct *work) if (list_empty(&pwq->pool->worklist)) pwq->pool->watchdog_ts = jiffies; move_linked_works(work, &pwq->pool->worklist, NULL); - __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); + __clear_bit(WORK_STRUCT_INACTIVE_BIT, work_data_bits(work)); pwq->nr_active++; } -static void pwq_activate_first_delayed(struct pool_workqueue *pwq) +static void pwq_activate_first_inactive(struct pool_workqueue *pwq) { - struct work_struct *work = list_first_entry(&pwq->delayed_works, + struct work_struct *work = list_first_entry(&pwq->inactive_works, struct work_struct, entry); - pwq_activate_delayed_work(work); + 
pwq_activate_inactive_work(work); } /** * pwq_dec_nr_in_flight - decrement pwq's nr_in_flight * @pwq: pwq of interest - * @color: color of work which left the queue + * @work_data: work_data of work which left the queue * * A work either has completed or is removed from pending queue, * decrement nr_in_flight of its pwq and handle workqueue flushing. @@ -1167,21 +1184,21 @@ static void pwq_activate_first_delayed(struct pool_workqueue *pwq) * CONTEXT: * raw_spin_lock_irq(pool->lock). */ -static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color) +static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, unsigned long work_data) { - /* uncolored work items don't participate in flushing or nr_active */ - if (color == WORK_NO_COLOR) - goto out_put; - - pwq->nr_in_flight[color]--; + int color = get_work_color(work_data); - pwq->nr_active--; - if (!list_empty(&pwq->delayed_works)) { - /* one down, submit a delayed one */ - if (pwq->nr_active < pwq->max_active) - pwq_activate_first_delayed(pwq); + if (!(work_data & WORK_STRUCT_INACTIVE)) { + pwq->nr_active--; + if (!list_empty(&pwq->inactive_works)) { + /* one down, submit an inactive one */ + if (pwq->nr_active < pwq->max_active) + pwq_activate_first_inactive(pwq); + } } + pwq->nr_in_flight[color]--; + /* is flush in progress and are we at the flushing tip? */ if (likely(pwq->flush_color != color)) goto out_put; @@ -1281,17 +1298,21 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, debug_work_deactivate(work); /* - * A delayed work item cannot be grabbed directly because - * it might have linked NO_COLOR work items which, if left - * on the delayed_list, will confuse pwq->nr_active + * A cancelable inactive work item must be in the + * pwq->inactive_works since a queued barrier can't be + * canceled (see the comments in insert_wq_barrier()). 
+ * + * An inactive work item cannot be grabbed directly because + * it might have linked barrier work items which, if left + * on the inactive_works list, will confuse pwq->nr_active * management later on and cause stall. Make sure the work * item is activated before grabbing. */ - if (*work_data_bits(work) & WORK_STRUCT_DELAYED) - pwq_activate_delayed_work(work); + if (*work_data_bits(work) & WORK_STRUCT_INACTIVE) + pwq_activate_inactive_work(work); list_del_init(&work->entry); - pwq_dec_nr_in_flight(pwq, get_work_color(work)); + pwq_dec_nr_in_flight(pwq, *work_data_bits(work)); /* work->data points to pwq iff queued, point to pool */ set_work_pool_and_keep_pending(work, pool->id); @@ -1490,8 +1511,8 @@ retry: if (list_empty(worklist)) pwq->pool->watchdog_ts = jiffies; } else { - work_flags |= WORK_STRUCT_DELAYED; - worklist = &pwq->delayed_works; + work_flags |= WORK_STRUCT_INACTIVE; + worklist = &pwq->inactive_works; } debug_work_activate(work); @@ -1912,14 +1933,14 @@ static void worker_detach_from_pool(struct worker *worker) */ static struct worker *create_worker(struct worker_pool *pool) { - struct worker *worker = NULL; - int id = -1; + struct worker *worker; + int id; char id_buf[16]; /* ID is needed to determine kthread name */ - id = ida_simple_get(&pool->worker_ida, 0, 0, GFP_KERNEL); + id = ida_alloc(&pool->worker_ida, GFP_KERNEL); if (id < 0) - goto fail; + return NULL; worker = alloc_worker(pool->node); if (!worker) @@ -1954,8 +1975,7 @@ static struct worker *create_worker(struct worker_pool *pool) return worker; fail: - if (id >= 0) - ida_simple_remove(&pool->worker_ida, id); + ida_free(&pool->worker_ida, id); kfree(worker); return NULL; } @@ -2173,7 +2193,7 @@ __acquires(&pool->lock) struct pool_workqueue *pwq = get_work_pwq(work); struct worker_pool *pool = worker->pool; bool cpu_intensive = pwq->wq->flags & WQ_CPU_INTENSIVE; - int work_color; + unsigned long work_data; struct worker *collision; #ifdef CONFIG_LOCKDEP /* @@ -2209,7 +2229,8 @@ 
__acquires(&pool->lock) worker->current_work = work; worker->current_func = work->func; worker->current_pwq = pwq; - work_color = get_work_color(work); + work_data = *work_data_bits(work); + worker->current_color = get_work_color(work_data); /* * Record wq name for cmdline and debug reporting, may get @@ -2315,7 +2336,8 @@ __acquires(&pool->lock) worker->current_work = NULL; worker->current_func = NULL; worker->current_pwq = NULL; - pwq_dec_nr_in_flight(pwq, work_color); + worker->current_color = INT_MAX; + pwq_dec_nr_in_flight(pwq, work_data); } /** @@ -2378,7 +2400,7 @@ woke_up: set_pf_worker(false); set_task_comm(worker->task, "kworker/dying"); - ida_simple_remove(&pool->worker_ida, worker->id); + ida_free(&pool->worker_ida, worker->id); worker_detach_from_pool(worker); kfree(worker); return 0; @@ -2531,7 +2553,7 @@ repeat: /* * The above execution of rescued work items could * have created more to rescue through - * pwq_activate_first_delayed() or chained + * pwq_activate_first_inactive() or chained * queueing. Let's put @pwq back on mayday list so * that such back-to-back work items, which may be * being used to relieve memory pressure, don't @@ -2658,8 +2680,9 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, struct wq_barrier *barr, struct work_struct *target, struct worker *worker) { + unsigned int work_flags = 0; + unsigned int work_color; struct list_head *head; - unsigned int linked = 0; /* * debugobject calls are safe here even with pool->lock locked @@ -2674,24 +2697,31 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, barr->task = current; + /* The barrier work item does not participate in pwq->nr_active. */ + work_flags |= WORK_STRUCT_INACTIVE; + /* * If @target is currently being executed, schedule the * barrier to the worker; otherwise, put it after @target. 
*/ - if (worker) + if (worker) { head = worker->scheduled.next; - else { + work_color = worker->current_color; + } else { unsigned long *bits = work_data_bits(target); head = target->entry.next; /* there can already be other linked works, inherit and set */ - linked = *bits & WORK_STRUCT_LINKED; + work_flags |= *bits & WORK_STRUCT_LINKED; + work_color = get_work_color(*bits); __set_bit(WORK_STRUCT_LINKED_BIT, bits); } + pwq->nr_in_flight[work_color]++; + work_flags |= work_color_to_flags(work_color); + debug_work_activate(&barr->work); - insert_work(pwq, &barr->work, head, - work_color_to_flags(WORK_NO_COLOR) | linked); + insert_work(pwq, &barr->work, head, work_flags); } /** @@ -2957,7 +2987,7 @@ reflush: bool drained; raw_spin_lock_irq(&pwq->pool->lock); - drained = !pwq->nr_active && list_empty(&pwq->delayed_works); + drained = !pwq->nr_active && list_empty(&pwq->inactive_works); raw_spin_unlock_irq(&pwq->pool->lock); if (drained) @@ -3293,7 +3323,7 @@ int schedule_on_each_cpu(work_func_t func) if (!works) return -ENOMEM; - get_online_cpus(); + cpus_read_lock(); for_each_online_cpu(cpu) { struct work_struct *work = per_cpu_ptr(works, cpu); @@ -3305,7 +3335,7 @@ int schedule_on_each_cpu(work_func_t func) for_each_online_cpu(cpu) flush_work(per_cpu_ptr(works, cpu)); - put_online_cpus(); + cpus_read_unlock(); free_percpu(works); return 0; } @@ -3713,7 +3743,7 @@ static void pwq_unbound_release_workfn(struct work_struct *work) * @pwq: target pool_workqueue * * If @pwq isn't freezing, set @pwq->max_active to the associated - * workqueue's saved_max_active and activate delayed work items + * workqueue's saved_max_active and activate inactive work items * accordingly. If @pwq is freezing, clear @pwq->max_active to zero. 
*/ static void pwq_adjust_max_active(struct pool_workqueue *pwq) @@ -3742,9 +3772,9 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq) pwq->max_active = wq->saved_max_active; - while (!list_empty(&pwq->delayed_works) && + while (!list_empty(&pwq->inactive_works) && pwq->nr_active < pwq->max_active) { - pwq_activate_first_delayed(pwq); + pwq_activate_first_inactive(pwq); kick = true; } @@ -3763,7 +3793,7 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq) raw_spin_unlock_irqrestore(&pwq->pool->lock, flags); } -/* initialize newly alloced @pwq which is associated with @wq and @pool */ +/* initialize newly allocated @pwq which is associated with @wq and @pool */ static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq, struct worker_pool *pool) { @@ -3775,7 +3805,7 @@ static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq, pwq->wq = wq; pwq->flush_color = -1; pwq->refcnt = 1; - INIT_LIST_HEAD(&pwq->delayed_works); + INIT_LIST_HEAD(&pwq->inactive_works); INIT_LIST_HEAD(&pwq->pwqs_node); INIT_LIST_HEAD(&pwq->mayday_node); INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn); @@ -4016,14 +4046,14 @@ static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx) static void apply_wqattrs_lock(void) { /* CPUs should stay stable across pwq creations and installations */ - get_online_cpus(); + cpus_read_lock(); mutex_lock(&wq_pool_mutex); } static void apply_wqattrs_unlock(void) { mutex_unlock(&wq_pool_mutex); - put_online_cpus(); + cpus_read_unlock(); } static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, @@ -4068,7 +4098,7 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, * * Performs GFP_KERNEL allocations. * - * Assumes caller has CPU hotplug read exclusion, i.e. get_online_cpus(). + * Assumes caller has CPU hotplug read exclusion, i.e. cpus_read_lock(). * * Return: 0 on success and -errno on failure. 
*/ @@ -4196,7 +4226,7 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) return 0; } - get_online_cpus(); + cpus_read_lock(); if (wq->flags & __WQ_ORDERED) { ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]); /* there should only be single pwq for ordering guarantee */ @@ -4206,7 +4236,7 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) } else { ret = apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]); } - put_online_cpus(); + cpus_read_unlock(); return ret; } @@ -4362,7 +4392,7 @@ static bool pwq_busy(struct pool_workqueue *pwq) if ((pwq != pwq->wq->dfl_pwq) && (pwq->refcnt > 1)) return true; - if (pwq->nr_active || !list_empty(&pwq->delayed_works)) + if (pwq->nr_active || !list_empty(&pwq->inactive_works)) return true; return false; @@ -4558,7 +4588,7 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq) else pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu)); - ret = !list_empty(&pwq->delayed_works); + ret = !list_empty(&pwq->inactive_works); preempt_enable(); rcu_read_unlock(); @@ -4754,11 +4784,11 @@ static void show_pwq(struct pool_workqueue *pwq) pr_cont("\n"); } - if (!list_empty(&pwq->delayed_works)) { + if (!list_empty(&pwq->inactive_works)) { bool comma = false; - pr_info(" delayed:"); - list_for_each_entry(work, &pwq->delayed_works, entry) { + pr_info(" inactive:"); + list_for_each_entry(work, &pwq->inactive_works, entry) { pr_cont_work(comma, work); comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED); } @@ -4788,7 +4818,7 @@ void show_workqueue_state(void) bool idle = true; for_each_pwq(pwq, wq) { - if (pwq->nr_active || !list_empty(&pwq->delayed_works)) { + if (pwq->nr_active || !list_empty(&pwq->inactive_works)) { idle = false; break; } @@ -4800,7 +4830,7 @@ void show_workqueue_state(void) for_each_pwq(pwq, wq) { raw_spin_lock_irqsave(&pwq->pool->lock, flags); - if (pwq->nr_active || !list_empty(&pwq->delayed_works)) + if (pwq->nr_active || !list_empty(&pwq->inactive_works)) show_pwq(pwq); 
raw_spin_unlock_irqrestore(&pwq->pool->lock, flags); /* @@ -5168,10 +5198,10 @@ long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg) { long ret = -ENODEV; - get_online_cpus(); + cpus_read_lock(); if (cpu_online(cpu)) ret = work_on_cpu(cpu, fn, arg); - put_online_cpus(); + cpus_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(work_on_cpu_safe); @@ -5183,7 +5213,7 @@ EXPORT_SYMBOL_GPL(work_on_cpu_safe); * freeze_workqueues_begin - begin freezing workqueues * * Start freezing workqueues. After this function returns, all freezable - * workqueues will queue new works to their delayed_works list instead of + * workqueues will queue new works to their inactive_works list instead of * pool->worklist. * * CONTEXT: @@ -5331,7 +5361,7 @@ static int workqueue_apply_unbound_cpumask(void) * the affinity of all unbound workqueues. This function check the @cpumask * and apply it to all unbound workqueues and updates all pwqs of them. * - * Retun: 0 - Success + * Return: 0 - Success * -EINVAL - Invalid @cpumask * -ENOMEM - Failed to allocate memory for attrs or pwqs. 
*/ @@ -5443,7 +5473,7 @@ static ssize_t wq_pool_ids_show(struct device *dev, const char *delim = ""; int node, written = 0; - get_online_cpus(); + cpus_read_lock(); rcu_read_lock(); for_each_node(node) { written += scnprintf(buf + written, PAGE_SIZE - written, @@ -5453,7 +5483,7 @@ static ssize_t wq_pool_ids_show(struct device *dev, } written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); rcu_read_unlock(); - put_online_cpus(); + cpus_read_unlock(); return written; } @@ -5902,6 +5932,13 @@ static void __init wq_numa_init(void) return; } + for_each_possible_cpu(cpu) { + if (WARN_ON(cpu_to_node(cpu) == NUMA_NO_NODE)) { + pr_warn("workqueue: NUMA node mapping not available for cpu%d, disabling NUMA support\n", cpu); + return; + } + } + wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(); BUG_ON(!wq_update_unbound_numa_attrs_buf); @@ -5919,11 +5956,6 @@ static void __init wq_numa_init(void) for_each_possible_cpu(cpu) { node = cpu_to_node(cpu); - if (WARN_ON(node == NUMA_NO_NODE)) { - pr_warn("workqueue: NUMA node mapping not available for cpu%d, disabling NUMA support\n", cpu); - /* happens iff arch is bonkers, let's just proceed */ - return; - } cpumask_set_cpu(cpu, tbl[node]); } diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index 498de0e909a4..e00b1204a8e9 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -30,7 +30,8 @@ struct worker { struct work_struct *current_work; /* L: work being processed */ work_func_t current_func; /* L: current_work's fn */ - struct pool_workqueue *current_pwq; /* L: current_work's pwq */ + struct pool_workqueue *current_pwq; /* L: current_work's pwq */ + unsigned int current_color; /* L: current_work's color */ struct list_head scheduled; /* L: scheduled works */ /* 64 bytes boundary on 64bit, 32 on 32bit */ |