From dfcb245e28481256a10a9133441baf2a93d26642 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Dec 2018 10:05:56 +0100 Subject: sched: Fix various typos in comments Go over the scheduler source code and fix common typos in comments - and a typo in an actual variable name. No change in functionality intended. Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched/isolation.h | 4 ++-- include/linux/sched/mm.h | 2 +- include/linux/sched/stat.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux/sched') diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h index 4a6582c27dea..b0fb1446fe04 100644 --- a/include/linux/sched/isolation.h +++ b/include/linux/sched/isolation.h @@ -16,7 +16,7 @@ enum hk_flags { }; #ifdef CONFIG_CPU_ISOLATION -DECLARE_STATIC_KEY_FALSE(housekeeping_overriden); +DECLARE_STATIC_KEY_FALSE(housekeeping_overridden); extern int housekeeping_any_cpu(enum hk_flags flags); extern const struct cpumask *housekeeping_cpumask(enum hk_flags flags); extern void housekeeping_affine(struct task_struct *t, enum hk_flags flags); @@ -43,7 +43,7 @@ static inline void housekeeping_init(void) { } static inline bool housekeeping_cpu(int cpu, enum hk_flags flags) { #ifdef CONFIG_CPU_ISOLATION - if (static_branch_unlikely(&housekeeping_overriden)) + if (static_branch_unlikely(&housekeeping_overridden)) return housekeeping_test_cpu(cpu, flags); #endif return true; diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index aebb370a0006..3bfa6a0cbba4 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -153,7 +153,7 @@ static inline gfp_t current_gfp_context(gfp_t flags) { /* * NOIO implies both NOIO and NOFS and it is a weaker context - * so always make sure it makes precendence + * so always make sure it makes precedence */ if (unlikely(current->flags & PF_MEMALLOC_NOIO)) flags &= ~(__GFP_IO | __GFP_FS); diff --git a/include/linux/sched/stat.h b/include/linux/sched/stat.h index f30954cc059d..568286411b43 100644 --- a/include/linux/sched/stat.h +++ b/include/linux/sched/stat.h @@ -8,7 +8,7 @@ * Various counters maintained by the scheduler and fork(), * exposed via /proc, sys.c or used by drivers via these APIs. * - * ( Note that all these values are aquired without locking, + * ( Note that all these values are acquired without locking, * so they can only be relied on in narrow circumstances. ) */ -- cgit v1.2.3 From 765d0af19f5f388a34bf4533378f8398b72ded46 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 29 Aug 2018 15:19:11 +0200 Subject: sched/topology: Remove the ::smt_gain field from 'struct sched_domain' ::smt_gain is used to compute the capacity of CPUs of a SMT core with the constraint 1 < ::smt_gain < 2 in order to be able to compute number of CPUs per core. The field has_free_capacity of struct numa_stat, which was the last user of this computation of number of CPUs per core, has been removed by: 2d4056fafa19 ("sched/numa: Remove numa_has_capacity()") We can now remove this constraint on core capacity and use the defautl value SCHED_CAPACITY_SCALE for SMT CPUs. With this remove, SCHED_CAPACITY_SCALE becomes the maximum compute capacity of CPUs on every systems. This should help to simplify some code and remove fields like rd->max_cpu_capacity Furthermore, arch_scale_cpu_capacity() is used with a NULL sd in several other places in the code when it wants the capacity of a CPUs to scale some metrics like in pelt, deadline or schedutil. In case on SMT, the value returned is not the capacity of SMT CPUs but default SCHED_CAPACITY_SCALE. So remove it. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1535548752-4434-4-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- include/linux/sched/topology.h | 1 - kernel/sched/sched.h | 3 --- kernel/sched/topology.c | 2 -- 3 files changed, 6 deletions(-) (limited to 'include/linux/sched') diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 6b9976180c1e..7fa0bc17cd8c 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -89,7 +89,6 @@ struct sched_domain { unsigned int newidle_idx; unsigned int wake_idx; unsigned int forkexec_idx; - unsigned int smt_gain; int nohz_idle; /* NOHZ IDLE status */ int flags; /* See SD_* */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9bde60a11805..ceb896404869 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1864,9 +1864,6 @@ unsigned long arch_scale_freq_capacity(int cpu) static __always_inline unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) { - if (sd && (sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) - return sd->smt_gain / sd->span_weight; - return SCHED_CAPACITY_SCALE; } #endif diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 8d7f15ba5916..7364e0b427b7 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1133,7 +1133,6 @@ sd_init(struct sched_domain_topology_level *tl, .last_balance = jiffies, .balance_interval = sd_weight, - .smt_gain = 0, .max_newidle_lb_cost = 0, .next_decay_max_lb_cost = jiffies, .child = child, @@ -1164,7 +1163,6 @@ sd_init(struct sched_domain_topology_level *tl, if (sd->flags & SD_SHARE_CPUCAPACITY) { sd->imbalance_pct = 110; - sd->smt_gain = 1178; /* ~15% */ } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { sd->imbalance_pct = 117; -- cgit v1.2.3 From 5bd0988be12733a42a1a3d50e3e2ddfd79e57518 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Mon, 3 Dec 2018 09:56:14 +0000 Subject: sched/topology: Relocate arch_scale_cpu_capacity() to the internal header By default, arch_scale_cpu_capacity() is only visible from within the kernel/sched folder. Relocate it to include/linux/sched/topology.h to make it visible to other clients needing to know about the capacity of CPUs, such as the Energy Model framework. This also shrinks the public header. Signed-off-by: Quentin Perret Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: adharmap@codeaurora.org Cc: chris.redpath@arm.com Cc: currojerez@riseup.net Cc: dietmar.eggemann@arm.com Cc: edubezval@gmail.com Cc: gregkh@linuxfoundation.org Cc: javi.merino@kernel.org Cc: joel@joelfernandes.org Cc: juri.lelli@redhat.com Cc: morten.rasmussen@arm.com Cc: patrick.bellasi@arm.com Cc: pkondeti@codeaurora.org Cc: rjw@rjwysocki.net Cc: skannan@codeaurora.org Cc: smuckle@google.com Cc: srinivas.pandruvada@linux.intel.com Cc: thara.gopinath@linaro.org Cc: tkjos@google.com Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Cc: viresh.kumar@linaro.org Link: https://lkml.kernel.org/r/20181203095628.11858-2-quentin.perret@arm.com Signed-off-by: Ingo Molnar --- include/linux/sched/topology.h | 16 ++++++++++++++++ kernel/sched/sched.h | 18 ------------------ 2 files changed, 16 insertions(+), 18 deletions(-) (limited to 'include/linux/sched') diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 7fa0bc17cd8c..c31d3a47a47c 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -201,6 +201,14 @@ extern void set_sched_topology(struct sched_domain_topology_level *tl); # define SD_INIT_NAME(type) #endif +#ifndef arch_scale_cpu_capacity +static __always_inline +unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) +{ + return SCHED_CAPACITY_SCALE; +} +#endif + #else /* CONFIG_SMP */ struct sched_domain_attr; @@ -216,6 +224,14 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu) return true; } +#ifndef arch_scale_cpu_capacity +static __always_inline +unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu) +{ + return SCHED_CAPACITY_SCALE; +} +#endif + #endif /* !CONFIG_SMP */ static inline int task_node(const struct task_struct *p) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ceb896404869..66067152a831 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1859,24 +1859,6 @@ unsigned long arch_scale_freq_capacity(int cpu) } #endif -#ifdef CONFIG_SMP -#ifndef arch_scale_cpu_capacity -static __always_inline -unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) -{ - return SCHED_CAPACITY_SCALE; -} -#endif -#else -#ifndef arch_scale_cpu_capacity -static __always_inline -unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu) -{ - return SCHED_CAPACITY_SCALE; -} -#endif -#endif - #ifdef CONFIG_SMP #ifdef CONFIG_PREEMPT -- cgit v1.2.3 From 938e5e4b0d1502a93e787985cb95b136b40717b7 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Mon, 3 Dec 2018 09:56:15 +0000 Subject: sched/cpufreq: Prepare schedutil for Energy Aware Scheduling Schedutil requests frequency by aggregating utilization signals from the scheduler (CFS, RT, DL, IRQ) and applying a 25% margin on top of them. Since Energy Aware Scheduling (EAS) needs to be able to predict the frequency requests, it needs to forecast the decisions made by the governor. In order to prepare the introduction of EAS, introduce schedutil_freq_util() to centralize the aforementioned signal aggregation and make it available to both schedutil and EAS. Since frequency selection and energy estimation still need to deal with RT and DL signals slightly differently, schedutil_freq_util() is called with a different 'type' parameter in those two contexts, and returns an aggregated utilization signal accordingly. While at it, introduce the map_util_freq() function which is designed to make schedutil's 25% margin usable easily for both sugov and EAS. As EAS will be able to predict schedutil's frequency requests more accurately than any other governor by design, it'd be sensible to make sure EAS cannot be used without schedutil. This will be done later, once EAS has actually been introduced. Suggested-by: Peter Zijlstra Signed-off-by: Quentin Perret Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Thomas Gleixner Cc: adharmap@codeaurora.org Cc: chris.redpath@arm.com Cc: currojerez@riseup.net Cc: dietmar.eggemann@arm.com Cc: edubezval@gmail.com Cc: gregkh@linuxfoundation.org Cc: javi.merino@kernel.org Cc: joel@joelfernandes.org Cc: juri.lelli@redhat.com Cc: morten.rasmussen@arm.com Cc: patrick.bellasi@arm.com Cc: pkondeti@codeaurora.org Cc: rjw@rjwysocki.net Cc: skannan@codeaurora.org Cc: smuckle@google.com Cc: srinivas.pandruvada@linux.intel.com Cc: thara.gopinath@linaro.org Cc: tkjos@google.com Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Cc: viresh.kumar@linaro.org Link: https://lkml.kernel.org/r/20181203095628.11858-3-quentin.perret@arm.com Signed-off-by: Ingo Molnar --- include/linux/sched/cpufreq.h | 6 +++++ kernel/sched/cpufreq_schedutil.c | 53 ++++++++++++++++++++++++++++------------ kernel/sched/sched.h | 30 +++++++++++++++++++++++ 3 files changed, 74 insertions(+), 15 deletions(-) (limited to 'include/linux/sched') diff --git a/include/linux/sched/cpufreq.h b/include/linux/sched/cpufreq.h index 59667444669f..afa940cd50dc 100644 --- a/include/linux/sched/cpufreq.h +++ b/include/linux/sched/cpufreq.h @@ -20,6 +20,12 @@ void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data, void (*func)(struct update_util_data *data, u64 time, unsigned int flags)); void cpufreq_remove_update_util_hook(int cpu); + +static inline unsigned long map_util_freq(unsigned long util, + unsigned long freq, unsigned long cap) +{ + return (freq + (freq >> 2)) * util / cap; +} #endif /* CONFIG_CPU_FREQ */ #endif /* _LINUX_SCHED_CPUFREQ_H */ diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 3fffad3bc8a8..90128be27712 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -13,6 +13,7 @@ #include "sched.h" +#include #include struct sugov_tunables { @@ -167,7 +168,7 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, unsigned int freq = arch_scale_freq_invariant() ? policy->cpuinfo.max_freq : policy->cur; - freq = (freq + (freq >> 2)) * util / max; + freq = map_util_freq(util, freq, max); if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update) return sg_policy->next_freq; @@ -197,15 +198,13 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, * based on the task model parameters and gives the minimal utilization * required to meet deadlines. */ -static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) +unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs, + unsigned long max, enum schedutil_type type) { - struct rq *rq = cpu_rq(sg_cpu->cpu); - unsigned long util, irq, max; + unsigned long dl_util, util, irq; + struct rq *rq = cpu_rq(cpu); - sg_cpu->max = max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu); - sg_cpu->bw_dl = cpu_bw_dl(rq); - - if (rt_rq_is_runnable(&rq->rt)) + if (type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) return max; /* @@ -223,21 +222,30 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) * utilization (PELT windows are synchronized) we can directly add them * to obtain the CPU's actual utilization. */ - util = cpu_util_cfs(rq); + util = util_cfs; util += cpu_util_rt(rq); + dl_util = cpu_util_dl(rq); + /* - * We do not make cpu_util_dl() a permanent part of this sum because we - * want to use cpu_bw_dl() later on, but we need to check if the - * CFS+RT+DL sum is saturated (ie. no idle time) such that we select - * f_max when there is no idle time. + * For frequency selection we do not make cpu_util_dl() a permanent part + * of this sum because we want to use cpu_bw_dl() later on, but we need + * to check if the CFS+RT+DL sum is saturated (ie. no idle time) such + * that we select f_max when there is no idle time. * * NOTE: numerical errors or stop class might cause us to not quite hit * saturation when we should -- something for later. */ - if ((util + cpu_util_dl(rq)) >= max) + if (util + dl_util >= max) return max; + /* + * OTOH, for energy computation we need the estimated running time, so + * include util_dl and ignore dl_bw. + */ + if (type == ENERGY_UTIL) + util += dl_util; + /* * There is still idle time; further improve the number by using the * irq metric. Because IRQ/steal time is hidden from the task clock we @@ -260,7 +268,22 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) * bw_dl as requested freq. However, cpufreq is not yet ready for such * an interface. So, we only do the latter for now. */ - return min(max, util + sg_cpu->bw_dl); + if (type == FREQUENCY_UTIL) + util += cpu_bw_dl(rq); + + return min(max, util); +} + +static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) +{ + struct rq *rq = cpu_rq(sg_cpu->cpu); + unsigned long util = cpu_util_cfs(rq); + unsigned long max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu); + + sg_cpu->max = max; + sg_cpu->bw_dl = cpu_bw_dl(rq); + + return schedutil_freq_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL); } /** diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 66067152a831..2eafa228aebf 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2191,6 +2191,31 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} #endif #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL +/** + * enum schedutil_type - CPU utilization type + * @FREQUENCY_UTIL: Utilization used to select frequency + * @ENERGY_UTIL: Utilization used during energy calculation + * + * The utilization signals of all scheduling classes (CFS/RT/DL) and IRQ time + * need to be aggregated differently depending on the usage made of them. This + * enum is used within schedutil_freq_util() to differentiate the types of + * utilization expected by the callers, and adjust the aggregation accordingly. + */ +enum schedutil_type { + FREQUENCY_UTIL, + ENERGY_UTIL, +}; + +unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs, + unsigned long max, enum schedutil_type type); + +static inline unsigned long schedutil_energy_util(int cpu, unsigned long cfs) +{ + unsigned long max = arch_scale_cpu_capacity(NULL, cpu); + + return schedutil_freq_util(cpu, cfs, max, ENERGY_UTIL); +} + static inline unsigned long cpu_bw_dl(struct rq *rq) { return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT; @@ -2217,6 +2242,11 @@ static inline unsigned long cpu_util_rt(struct rq *rq) { return READ_ONCE(rq->avg_rt.util_avg); } +#else /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ +static inline unsigned long schedutil_energy_util(int cpu, unsigned long cfs) +{ + return cfs; +} #endif #ifdef CONFIG_HAVE_SCHED_AVG_IRQ -- cgit v1.2.3 From fb5bf31722d0805a3f394f7d59f2e8cd07acccb7 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Thu, 3 Jan 2019 15:28:03 -0800 Subject: fork: fix some -Wmissing-prototypes warnings We get a warning when building kernel with W=1: kernel/fork.c:167:13: warning: no previous prototype for `arch_release_thread_stack' [-Wmissing-prototypes] kernel/fork.c:779:13: warning: no previous prototype for `fork_init' [-Wmissing-prototypes] Add the missing declaration in head file to fix this. Also, remove arch_release_thread_stack() completely because no arch seems to implement it since bb9d81264 (arch: remove tile port). Link: http://lkml.kernel.org/r/1542170087-23645-1-git-send-email-wang.yi59@zte.com.cn Signed-off-by: Yi Wang Acked-by: Michal Hocko Acked-by: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched/task.h | 2 ++ init/main.c | 1 - kernel/fork.c | 5 ----- 3 files changed, 2 insertions(+), 6 deletions(-) (limited to 'include/linux/sched') diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 108ede99e533..44c6f15800ff 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -39,6 +39,8 @@ void __noreturn do_task_dead(void); extern void proc_caches_init(void); +extern void fork_init(void); + extern void release_task(struct task_struct * p); #ifdef CONFIG_HAVE_COPY_THREAD_TLS diff --git a/init/main.c b/init/main.c index 6a74ba0892d2..e2e80ca3165a 100644 --- a/init/main.c +++ b/init/main.c @@ -105,7 +105,6 @@ static int kernel_init(void *); extern void init_IRQ(void); -extern void fork_init(void); extern void radix_tree_init(void); /* diff --git a/kernel/fork.c b/kernel/fork.c index d439c48ecf18..a60459947f18 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -164,10 +164,6 @@ static inline void free_task_struct(struct task_struct *tsk) } #endif -void __weak arch_release_thread_stack(unsigned long *stack) -{ -} - #ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR /* @@ -422,7 +418,6 @@ static void release_task_stack(struct task_struct *tsk) return; /* Better to leak the stack than to free prematurely */ account_kernel_stack(tsk, -1); - arch_release_thread_stack(tsk->stack); free_thread_stack(tsk); tsk->stack = NULL; #ifdef CONFIG_VMAP_STACK -- cgit v1.2.3 From e6018c0f5c996e61639adce6a0697391a2861916 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 17 Dec 2018 10:14:53 +0100 Subject: sched/wake_q: Document wake_q_add() The only guarantee provided by wake_q_add() is that a wakeup will happen after it, it does _NOT_ guarantee the wakeup will be delayed until the matching wake_up_q(). If wake_q_add() fails the cmpxchg() a concurrent wakeup is pending and that can happen at any time after the cmpxchg(). This means we should not rely on the wakeup happening at wake_q_up(), but should be ready for wake_q_add() to issue the wakeup. The delay; if provided (most likely); should only result in more efficient behaviour. Reported-by: Yongji Xie Signed-off-by: Peter Zijlstra (Intel) Cc: Davidlohr Bueso Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman Long Cc: Will Deacon Signed-off-by: Ingo Molnar --- include/linux/sched/wake_q.h | 6 +++++- kernel/sched/core.c | 12 ++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) (limited to 'include/linux/sched') diff --git a/include/linux/sched/wake_q.h b/include/linux/sched/wake_q.h index 10b19a192b2d..545f37138057 100644 --- a/include/linux/sched/wake_q.h +++ b/include/linux/sched/wake_q.h @@ -24,9 +24,13 @@ * called near the end of a function. Otherwise, the list can be * re-initialized for later re-use by wake_q_init(). * - * Note that this can cause spurious wakeups. schedule() callers + * NOTE that this can cause spurious wakeups. schedule() callers * must ensure the call is done inside a loop, confirming that the * wakeup condition has in fact occurred. + * + * NOTE that there is no guarantee the wakeup will happen any later than the + * wake_q_add() location. Therefore task must be ready to be woken at the + * location of the wake_q_add(). */ #include diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a674c7db2f29..cc814933f7d6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -396,6 +396,18 @@ static bool set_nr_if_polling(struct task_struct *p) #endif #endif +/** + * wake_q_add() - queue a wakeup for 'later' waking. + * @head: the wake_q_head to add @task to + * @task: the task to queue for 'later' wakeup + * + * Queue a task for later wakeup, most likely by the wake_up_q() call in the + * same context, _HOWEVER_ this is not guaranteed, the wakeup can come + * instantly. + * + * This function must be used as-if it were wake_up_process(); IOW the task + * must be ready to be woken at this location. + */ void wake_q_add(struct wake_q_head *head, struct task_struct *task) { struct wake_q_node *node = &task->wake_q; -- cgit v1.2.3 From 8d5d0cfb63cbcb4005e19a332b31d687b1d01e58 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Mon, 3 Dec 2018 09:56:23 +0000 Subject: sched/topology: Introduce a sysctl for Energy Aware Scheduling In its current state, Energy Aware Scheduling (EAS) starts automatically on asymmetric platforms having an Energy Model (EM). However, there are users who want to have an EM (for thermal management for example), but don't want EAS with it. In order to let users disable EAS explicitly, introduce a new sysctl called 'sched_energy_aware'. It is enabled by default so that EAS can start automatically on platforms where it makes sense. Flipping it to 0 rebuilds the scheduling domains and disables EAS. Signed-off-by: Quentin Perret Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: adharmap@codeaurora.org Cc: chris.redpath@arm.com Cc: currojerez@riseup.net Cc: dietmar.eggemann@arm.com Cc: edubezval@gmail.com Cc: gregkh@linuxfoundation.org Cc: javi.merino@kernel.org Cc: joel@joelfernandes.org Cc: juri.lelli@redhat.com Cc: morten.rasmussen@arm.com Cc: patrick.bellasi@arm.com Cc: pkondeti@codeaurora.org Cc: rjw@rjwysocki.net Cc: skannan@codeaurora.org Cc: smuckle@google.com Cc: srinivas.pandruvada@linux.intel.com Cc: thara.gopinath@linaro.org Cc: tkjos@google.com Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Cc: viresh.kumar@linaro.org Link: https://lkml.kernel.org/r/20181203095628.11858-11-quentin.perret@arm.com Signed-off-by: Ingo Molnar --- Documentation/sysctl/kernel.txt | 12 ++++++++++++ include/linux/sched/sysctl.h | 7 +++++++ kernel/sched/topology.c | 29 +++++++++++++++++++++++++++++ kernel/sysctl.c | 11 +++++++++++ 4 files changed, 59 insertions(+) (limited to 'include/linux/sched') diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index c0527d8a468a..379063e58326 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -79,6 +79,7 @@ show up in /proc/sys/kernel: - reboot-cmd [ SPARC only ] - rtsig-max - rtsig-nr +- sched_energy_aware - seccomp/ ==> Documentation/userspace-api/seccomp_filter.rst - sem - sem_next_id [ sysv ipc ] @@ -890,6 +891,17 @@ rtsig-nr shows the number of RT signals currently queued. ============================================================== +sched_energy_aware: + +Enables/disables Energy Aware Scheduling (EAS). EAS starts +automatically on platforms where it can run (that is, +platforms with asymmetric CPU topologies and having an Energy +Model available). If your platform happens to meet the +requirements for EAS but you do not want to use it, change +this value to 0. + +============================================================== + sched_schedstats: Enables/disables scheduler statistics. Enabling this feature diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index a9c32daeb9d8..99ce6d728df7 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -83,4 +83,11 @@ extern int sysctl_schedstats(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) +extern unsigned int sysctl_sched_energy_aware; +extern int sched_energy_aware_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); +#endif + #endif /* _LINUX_SCHED_SYSCTL_H */ diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 3f35ba1d8fde..50c3fc316c54 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -203,9 +203,35 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) DEFINE_STATIC_KEY_FALSE(sched_energy_present); #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) +unsigned int sysctl_sched_energy_aware = 1; DEFINE_MUTEX(sched_energy_mutex); bool sched_energy_update; +#ifdef CONFIG_PROC_SYSCTL +int sched_energy_aware_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret, state; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (!ret && write) { + state = static_branch_unlikely(&sched_energy_present); + if (state != sysctl_sched_energy_aware) { + mutex_lock(&sched_energy_mutex); + sched_energy_update = 1; + rebuild_sched_domains(); + sched_energy_update = 0; + mutex_unlock(&sched_energy_mutex); + } + } + + return ret; +} +#endif + static void free_pd(struct perf_domain *pd) { struct perf_domain *tmp; @@ -322,6 +348,9 @@ static bool build_perf_domains(const struct cpumask *cpu_map) struct cpufreq_policy *policy; struct cpufreq_governor *gov; + if (!sysctl_sched_energy_aware) + goto free; + /* EAS is enabled for asymmetric CPU capacity topologies. */ if (!per_cpu(sd_asym_cpucapacity, cpu)) { if (sched_debug()) { diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ba4d9e85feb8..987ae08147bf 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -467,6 +467,17 @@ static struct ctl_table kern_table[] = { .extra1 = &one, }, #endif +#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) + { + .procname = "sched_energy_aware", + .data = &sysctl_sched_energy_aware, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_energy_aware_handler, + .extra1 = &zero, + .extra2 = &one, + }, +#endif #ifdef CONFIG_PROVE_LOCKING { .procname = "prove_locking", -- cgit v1.2.3 From 9bcdeb51bd7d2ae9fe65ea4d60643d2aeef5bfe3 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Fri, 1 Feb 2019 14:20:31 -0800 Subject: oom, oom_reaper: do not enqueue same task twice Arkadiusz reported that enabling memcg's group oom killing causes strange memcg statistics where there is no task in a memcg despite the number of tasks in that memcg is not 0. It turned out that there is a bug in wake_oom_reaper() which allows enqueuing same task twice which makes impossible to decrease the number of tasks in that memcg due to a refcount leak. This bug existed since the OOM reaper became invokable from task_will_free_mem(current) path in out_of_memory() in Linux 4.7, T1@P1 |T2@P1 |T3@P1 |OOM reaper ----------+----------+----------+------------ # Processing an OOM victim in a different memcg domain. try_charge() mem_cgroup_out_of_memory() mutex_lock(&oom_lock) try_charge() mem_cgroup_out_of_memory() mutex_lock(&oom_lock) try_charge() mem_cgroup_out_of_memory() mutex_lock(&oom_lock) out_of_memory() oom_kill_process(P1) do_send_sig_info(SIGKILL, @P1) mark_oom_victim(T1@P1) wake_oom_reaper(T1@P1) # T1@P1 is enqueued. mutex_unlock(&oom_lock) out_of_memory() mark_oom_victim(T2@P1) wake_oom_reaper(T2@P1) # T2@P1 is enqueued. mutex_unlock(&oom_lock) out_of_memory() mark_oom_victim(T1@P1) wake_oom_reaper(T1@P1) # T1@P1 is enqueued again due to oom_reaper_list == T2@P1 && T1@P1->oom_reaper_list == NULL. mutex_unlock(&oom_lock) # Completed processing an OOM victim in a different memcg domain. spin_lock(&oom_reaper_lock) # T1P1 is dequeued. spin_unlock(&oom_reaper_lock) but memcg's group oom killing made it easier to trigger this bug by calling wake_oom_reaper() on the same task from one out_of_memory() request. Fix this bug using an approach used by commit 855b018325737f76 ("oom, oom_reaper: disable oom_reaper for oom_kill_allocating_task"). As a side effect of this patch, this patch also avoids enqueuing multiple threads sharing memory via task_will_free_mem(current) path. Link: http://lkml.kernel.org/r/e865a044-2c10-9858-f4ef-254bc71d6cc2@i-love.sakura.ne.jp Link: http://lkml.kernel.org/r/5ee34fc6-1485-34f8-8790-903ddabaa809@i-love.sakura.ne.jp Fixes: af8e15cc85a25315 ("oom, oom_reaper: do not enqueue task if it is on the oom_reaper_list head") Signed-off-by: Tetsuo Handa Reported-by: Arkadiusz Miskiewicz Tested-by: Arkadiusz Miskiewicz Acked-by: Michal Hocko Acked-by: Roman Gushchin Cc: Tejun Heo Cc: Aleksa Sarai Cc: Jay Kamat Cc: Johannes Weiner Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched/coredump.h | 1 + mm/oom_kill.c | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux/sched') diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index ec912d01126f..ecdc6542070f 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -71,6 +71,7 @@ static inline int get_dumpable(struct mm_struct *mm) #define MMF_HUGE_ZERO_PAGE 23 /* mm has ever used the global huge zero page */ #define MMF_DISABLE_THP 24 /* disable THP for all VMAs */ #define MMF_OOM_VICTIM 25 /* mm is the oom victim */ +#define MMF_OOM_REAP_QUEUED 26 /* mm was queued for oom_reaper */ #define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP) #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ diff --git a/mm/oom_kill.c b/mm/oom_kill.c index f0e8cd9edb1a..059e617a1847 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -647,8 +647,8 @@ static int oom_reaper(void *unused) static void wake_oom_reaper(struct task_struct *tsk) { - /* tsk is already queued? */ - if (tsk == oom_reaper_list || tsk->oom_reaper_list) + /* mm is already queued? */ + if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags)) return; get_task_struct(tsk); -- cgit v1.2.3 From d036bda7d0e7269c2982eb979acfef855f5d7977 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 18 Jan 2019 14:27:26 +0200 Subject: sched/core: Convert sighand_struct.count to refcount_t atomic_t variables are currently used to implement reference counters with the following properties: - counter is initialized to 1 using atomic_set() - a resource is freed upon counter reaching zero - once counter reaches zero, its further increments aren't allowed - counter schema uses basic atomic operations (set, inc, inc_not_zero, dec_and_test, etc.) Such atomic variables should be converted to a newly provided refcount_t type and API that prevents accidental counter overflows and underflows. This is important since overflows and underflows can lead to use-after-free situation and be exploitable. The variable sighand_struct.count is used as pure reference counter. Convert it to refcount_t and fix up the operations. ** Important note for maintainers: Some functions from refcount_t API defined in lib/refcount.c have different memory ordering guarantees than their atomic counterparts. The full comparison can be seen in https://lkml.org/lkml/2017/11/15/57 and it is hopefully soon in state to be merged to the documentation tree. Normally the differences should not matter since refcount_t provides enough guarantees to satisfy the refcounting use cases, but in some rare cases it might matter. Please double check that you don't have some undocumented memory guarantees for this variable usage. For the sighand_struct.count it might make a difference in following places: - __cleanup_sighand: decrement in refcount_dec_and_test() only provides RELEASE ordering and control dependency on success vs. fully ordered atomic counterpart Suggested-by: Kees Cook Signed-off-by: Elena Reshetova Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: David Windsor Reviewed-by: Hans Liljestrand Reviewed-by: Andrea Parri Reviewed-by: Oleg Nesterov Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akpm@linux-foundation.org Cc: viro@zeniv.linux.org.uk Link: https://lkml.kernel.org/r/1547814450-18902-2-git-send-email-elena.reshetova@intel.com Signed-off-by: Ingo Molnar --- fs/exec.c | 4 ++-- fs/proc/task_nommu.c | 2 +- include/linux/sched/signal.h | 3 ++- kernel/fork.c | 8 ++++---- 4 files changed, 9 insertions(+), 8 deletions(-) (limited to 'include/linux/sched') diff --git a/fs/exec.c b/fs/exec.c index fb72d36f7823..966cd98a2ce2 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1189,7 +1189,7 @@ no_thread_group: flush_itimer_signals(); #endif - if (atomic_read(&oldsighand->count) != 1) { + if (refcount_read(&oldsighand->count) != 1) { struct sighand_struct *newsighand; /* * This ->sighand is shared with the CLONE_SIGHAND @@ -1199,7 +1199,7 @@ no_thread_group: if (!newsighand) return -ENOMEM; - atomic_set(&newsighand->count, 1); + refcount_set(&newsighand->count, 1); memcpy(newsighand->action, oldsighand->action, sizeof(newsighand->action)); diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 0b63d68dedb2..f912872fbf91 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -64,7 +64,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) else bytes += kobjsize(current->files); - if (current->sighand && atomic_read(¤t->sighand->count) > 1) + if (current->sighand && refcount_read(¤t->sighand->count) > 1) sbytes += kobjsize(current->sighand); else bytes += kobjsize(current->sighand); diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 13789d10a50e..37eeb1a28eba 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -8,13 +8,14 @@ #include #include #include +#include /* * Types defining task->signal and task->sighand and APIs using them: */ struct sighand_struct { - atomic_t count; + refcount_t count; struct k_sigaction action[_NSIG]; spinlock_t siglock; wait_queue_head_t signalfd_wqh; diff --git a/kernel/fork.c b/kernel/fork.c index b69248e6f0e0..370856d4c0b3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1463,7 +1463,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) struct sighand_struct *sig; if (clone_flags & CLONE_SIGHAND) { - atomic_inc(¤t->sighand->count); + refcount_inc(¤t->sighand->count); return 0; } sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); @@ -1471,7 +1471,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) if (!sig) return -ENOMEM; - atomic_set(&sig->count, 1); + refcount_set(&sig->count, 1); spin_lock_irq(¤t->sighand->siglock); memcpy(sig->action, current->sighand->action, sizeof(sig->action)); spin_unlock_irq(¤t->sighand->siglock); @@ -1480,7 +1480,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) void __cleanup_sighand(struct sighand_struct *sighand) { - if (atomic_dec_and_test(&sighand->count)) { + if (refcount_dec_and_test(&sighand->count)) { signalfd_cleanup(sighand); /* * sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it @@ -2439,7 +2439,7 @@ static int check_unshare_flags(unsigned long unshare_flags) return -EINVAL; } if (unshare_flags & (CLONE_SIGHAND | CLONE_VM)) { - if (atomic_read(¤t->sighand->count) > 1) + if (refcount_read(¤t->sighand->count) > 1) return -EINVAL; } if (unshare_flags & CLONE_VM) { -- cgit v1.2.3 From 60d4de3ff7f775509deba94b3db3c1abe55bf7a5 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 18 Jan 2019 14:27:27 +0200 Subject: sched/core: Convert signal_struct.sigcnt to refcount_t atomic_t variables are currently used to implement reference counters with the following properties: - counter is initialized to 1 using atomic_set() - a resource is freed upon counter reaching zero - once counter reaches zero, its further increments aren't allowed - counter schema uses basic atomic operations (set, inc, inc_not_zero, dec_and_test, etc.) Such atomic variables should be converted to a newly provided refcount_t type and API that prevents accidental counter overflows and underflows. This is important since overflows and underflows can lead to use-after-free situation and be exploitable. The variable signal_struct.sigcnt is used as pure reference counter. Convert it to refcount_t and fix up the operations. ** Important note for maintainers: Some functions from refcount_t API defined in lib/refcount.c have different memory ordering guarantees than their atomic counterparts. The full comparison can be seen in https://lkml.org/lkml/2017/11/15/57 and it is hopefully soon in state to be merged to the documentation tree. Normally the differences should not matter since refcount_t provides enough guarantees to satisfy the refcounting use cases, but in some rare cases it might matter. Please double check that you don't have some undocumented memory guarantees for this variable usage. For the signal_struct.sigcnt it might make a difference in following places: - put_signal_struct(): decrement in refcount_dec_and_test() only provides RELEASE ordering and control dependency on success vs. fully ordered atomic counterpart Suggested-by: Kees Cook Signed-off-by: Elena Reshetova Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: David Windsor Reviewed-by: Hans Liljestrand Reviewed-by: Andrea Parri Reviewed-by: Oleg Nesterov Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akpm@linux-foundation.org Cc: viro@zeniv.linux.org.uk Link: https://lkml.kernel.org/r/1547814450-18902-3-git-send-email-elena.reshetova@intel.com Signed-off-by: Ingo Molnar --- include/linux/sched/signal.h | 2 +- init/init_task.c | 2 +- kernel/fork.c | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux/sched') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 37eeb1a28eba..ae5655197698 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -83,7 +83,7 @@ struct multiprocess_signals { * the locking of signal_struct. */ struct signal_struct { - atomic_t sigcnt; + refcount_t sigcnt; atomic_t live; int nr_threads; struct list_head thread_head; diff --git a/init/init_task.c b/init/init_task.c index 5aebe3be4d7c..9aa3ebc74970 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -44,7 +44,7 @@ static struct signal_struct init_signals = { }; static struct sighand_struct init_sighand = { - .count = ATOMIC_INIT(1), + .count = REFCOUNT_INIT(1), .action = { { { .sa_handler = SIG_DFL, } }, }, .siglock = __SPIN_LOCK_UNLOCKED(init_sighand.siglock), .signalfd_wqh = __WAIT_QUEUE_HEAD_INITIALIZER(init_sighand.signalfd_wqh), diff --git a/kernel/fork.c b/kernel/fork.c index 370856d4c0b3..935a42d5f8ff 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -710,7 +710,7 @@ static inline void free_signal_struct(struct signal_struct *sig) static inline void put_signal_struct(struct signal_struct *sig) { - if (atomic_dec_and_test(&sig->sigcnt)) + if (refcount_dec_and_test(&sig->sigcnt)) free_signal_struct(sig); } @@ -1527,7 +1527,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->nr_threads = 1; atomic_set(&sig->live, 1); - atomic_set(&sig->sigcnt, 1); + refcount_set(&sig->sigcnt, 1); /* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */ sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node); @@ -2082,7 +2082,7 @@ static __latent_entropy struct task_struct *copy_process( } else { current->signal->nr_threads++; atomic_inc(¤t->signal->live); - atomic_inc(¤t->signal->sigcnt); + refcount_inc(¤t->signal->sigcnt); task_join_group_stop(p); list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); -- cgit v1.2.3 From ec1d281923cf81cc660343d0cb8ffc837ffb991d Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 18 Jan 2019 14:27:29 +0200 Subject: sched/core: Convert task_struct.usage to refcount_t atomic_t variables are currently used to implement reference counters with the following properties: - counter is initialized to 1 using atomic_set() - a resource is freed upon counter reaching zero - once counter reaches zero, its further increments aren't allowed - counter schema uses basic atomic operations (set, inc, inc_not_zero, dec_and_test, etc.) Such atomic variables should be converted to a newly provided refcount_t type and API that prevents accidental counter overflows and underflows. This is important since overflows and underflows can lead to use-after-free situation and be exploitable. The variable task_struct.usage is used as pure reference counter. Convert it to refcount_t and fix up the operations. ** Important note for maintainers: Some functions from refcount_t API defined in lib/refcount.c have different memory ordering guarantees than their atomic counterparts. The full comparison can be seen in https://lkml.org/lkml/2017/11/15/57 and it is hopefully soon in state to be merged to the documentation tree. Normally the differences should not matter since refcount_t provides enough guarantees to satisfy the refcounting use cases, but in some rare cases it might matter. Please double check that you don't have some undocumented memory guarantees for this variable usage. For the task_struct.usage it might make a difference in following places: - put_task_struct(): decrement in refcount_dec_and_test() only provides RELEASE ordering and control dependency on success vs. fully ordered atomic counterpart Suggested-by: Kees Cook Signed-off-by: Elena Reshetova Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: David Windsor Reviewed-by: Hans Liljestrand Reviewed-by: Andrea Parri Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akpm@linux-foundation.org Cc: viro@zeniv.linux.org.uk Link: https://lkml.kernel.org/r/1547814450-18902-5-git-send-email-elena.reshetova@intel.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 ++- include/linux/sched/task.h | 4 ++-- init/init_task.c | 2 +- kernel/fork.c | 4 ++-- 4 files changed, 7 insertions(+), 6 deletions(-) (limited to 'include/linux/sched') diff --git a/include/linux/sched.h b/include/linux/sched.h index e2bba022827d..9d14d6864ca6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -607,7 +608,7 @@ struct task_struct { randomized_struct_fields_start void *stack; - atomic_t usage; + refcount_t usage; /* Per task flags (PF_*), defined further below: */ unsigned int flags; unsigned int ptrace; diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 44c6f15800ff..2e97a2227045 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -88,13 +88,13 @@ extern void sched_exec(void); #define sched_exec() {} #endif -#define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0) +#define get_task_struct(tsk) do { refcount_inc(&(tsk)->usage); } while(0) extern void __put_task_struct(struct task_struct *t); static inline void put_task_struct(struct task_struct *t) { - if (atomic_dec_and_test(&t->usage)) + if (refcount_dec_and_test(&t->usage)) __put_task_struct(t); } diff --git a/init/init_task.c b/init/init_task.c index 9aa3ebc74970..aca34c89529f 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -65,7 +65,7 @@ struct task_struct init_task #endif .state = 0, .stack = init_stack, - .usage = ATOMIC_INIT(2), + .usage = REFCOUNT_INIT(2), .flags = PF_KTHREAD, .prio = MAX_PRIO - 20, .static_prio = MAX_PRIO - 20, diff --git a/kernel/fork.c b/kernel/fork.c index 935a42d5f8ff..3f7e192e29f2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -717,7 +717,7 @@ static inline void put_signal_struct(struct signal_struct *sig) void __put_task_struct(struct task_struct *tsk) { WARN_ON(!tsk->exit_state); - WARN_ON(atomic_read(&tsk->usage)); + WARN_ON(refcount_read(&tsk->usage)); WARN_ON(tsk == current); cgroup_free(tsk); @@ -896,7 +896,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) * One for us, one for whoever does the "release_task()" (usually * parent) */ - atomic_set(&tsk->usage, 2); + refcount_set(&tsk->usage, 2); #ifdef CONFIG_BLK_DEV_IO_TRACE tsk->btrace_seq = 0; #endif -- cgit v1.2.3 From f0b89d3958d73cd0785ec381f0ddf8efb6f183d8 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 18 Jan 2019 14:27:30 +0200 Subject: sched/core: Convert task_struct.stack_refcount to refcount_t atomic_t variables are currently used to implement reference counters with the following properties: - counter is initialized to 1 using atomic_set() - a resource is freed upon counter reaching zero - once counter reaches zero, its further increments aren't allowed - counter schema uses basic atomic operations (set, inc, inc_not_zero, dec_and_test, etc.) Such atomic variables should be converted to a newly provided refcount_t type and API that prevents accidental counter overflows and underflows. This is important since overflows and underflows can lead to use-after-free situation and be exploitable. The variable task_struct.stack_refcount is used as pure reference counter. Convert it to refcount_t and fix up the operations. ** Important note for maintainers: Some functions from refcount_t API defined in lib/refcount.c have different memory ordering guarantees than their atomic counterparts. The full comparison can be seen in https://lkml.org/lkml/2017/11/15/57 and it is hopefully soon in state to be merged to the documentation tree. Normally the differences should not matter since refcount_t provides enough guarantees to satisfy the refcounting use cases, but in some rare cases it might matter. Please double check that you don't have some undocumented memory guarantees for this variable usage. For the task_struct.stack_refcount it might make a difference in following places: - try_get_task_stack(): increment in refcount_inc_not_zero() only guarantees control dependency on success vs. fully ordered atomic counterpart - put_task_stack(): decrement in refcount_dec_and_test() only provides RELEASE ordering and control dependency on success vs. fully ordered atomic counterpart Suggested-by: Kees Cook Signed-off-by: Elena Reshetova Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: David Windsor Reviewed-by: Hans Liljestrand Reviewed-by: Andrea Parri Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akpm@linux-foundation.org Cc: viro@zeniv.linux.org.uk Link: https://lkml.kernel.org/r/1547814450-18902-6-git-send-email-elena.reshetova@intel.com Signed-off-by: Ingo Molnar --- include/linux/init_task.h | 1 + include/linux/sched.h | 2 +- include/linux/sched/task_stack.h | 2 +- init/init_task.c | 2 +- kernel/fork.c | 6 +++--- 5 files changed, 7 insertions(+), 6 deletions(-) (limited to 'include/linux/sched') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index a7083a45a26c..6049baa5b8bc 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/sched.h b/include/linux/sched.h index 9d14d6864ca6..628bf13cb5a5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1194,7 +1194,7 @@ struct task_struct { #endif #ifdef CONFIG_THREAD_INFO_IN_TASK /* A live task holds one reference: */ - atomic_t stack_refcount; + refcount_t stack_refcount; #endif #ifdef CONFIG_LIVEPATCH int patch_state; diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h index 6a841929073f..2413427e439c 100644 --- a/include/linux/sched/task_stack.h +++ b/include/linux/sched/task_stack.h @@ -61,7 +61,7 @@ static inline unsigned long *end_of_stack(struct task_struct *p) #ifdef CONFIG_THREAD_INFO_IN_TASK static inline void *try_get_task_stack(struct task_struct *tsk) { - return atomic_inc_not_zero(&tsk->stack_refcount) ? + return refcount_inc_not_zero(&tsk->stack_refcount) ? task_stack_page(tsk) : NULL; } diff --git a/init/init_task.c b/init/init_task.c index aca34c89529f..46dbf546264d 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -61,7 +61,7 @@ struct task_struct init_task = { #ifdef CONFIG_THREAD_INFO_IN_TASK .thread_info = INIT_THREAD_INFO(init_task), - .stack_refcount = ATOMIC_INIT(1), + .stack_refcount = REFCOUNT_INIT(1), #endif .state = 0, .stack = init_stack, diff --git a/kernel/fork.c b/kernel/fork.c index 3f7e192e29f2..77059b211608 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -429,7 +429,7 @@ static void release_task_stack(struct task_struct *tsk) #ifdef CONFIG_THREAD_INFO_IN_TASK void put_task_stack(struct task_struct *tsk) { - if (atomic_dec_and_test(&tsk->stack_refcount)) + if (refcount_dec_and_test(&tsk->stack_refcount)) release_task_stack(tsk); } #endif @@ -447,7 +447,7 @@ void free_task(struct task_struct *tsk) * If the task had a separate stack allocation, it should be gone * by now. */ - WARN_ON_ONCE(atomic_read(&tsk->stack_refcount) != 0); + WARN_ON_ONCE(refcount_read(&tsk->stack_refcount) != 0); #endif rt_mutex_debug_task_free(tsk); ftrace_graph_exit_task(tsk); @@ -867,7 +867,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->stack_vm_area = stack_vm_area; #endif #ifdef CONFIG_THREAD_INFO_IN_TASK - atomic_set(&tsk->stack_refcount, 1); + refcount_set(&tsk->stack_refcount, 1); #endif if (err) -- cgit v1.2.3 From 07879c6a3740fbbf3c8891a0ab484c20a12794d8 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 18 Dec 2018 11:53:52 -0800 Subject: sched/wake_q: Reduce reference counting for special users Some users, specifically futexes and rwsems, required fixes that allowed the callers to be safe when wakeups occur before they are expected by wake_up_q(). Such scenarios also play games and rely on reference counting, and until now were pivoting on wake_q doing it. With the wake_q_add() call being moved down, this can no longer be the case. As such we end up with a a double task refcounting overhead; and these callers care enough about this (being rather core-ish). This patch introduces a wake_q_add_safe() call that serves for callers that have already done refcounting and therefore the task is 'safe' from wake_q point of view (int that it requires reference throughout the entire queue/>wakeup cycle). In the one case it has internal reference counting, in the other case it consumes the reference counting. Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman Long Cc: Will Deacon Cc: Xie Yongji Cc: Yongji Xie Cc: andrea.parri@amarulasolutions.com Cc: lilin24@baidu.com Cc: liuqi16@baidu.com Cc: nixun@baidu.com Cc: yuanlinsi01@baidu.com Cc: zhangyu31@baidu.com Link: https://lkml.kernel.org/r/20181218195352.7orq3upiwfdbrdne@linux-r8p5 Signed-off-by: Ingo Molnar --- include/linux/sched/wake_q.h | 4 +-- kernel/futex.c | 3 +-- kernel/locking/rwsem-xadd.c | 4 +-- kernel/sched/core.c | 60 ++++++++++++++++++++++++++++++++------------ 4 files changed, 48 insertions(+), 23 deletions(-) (limited to 'include/linux/sched') diff --git a/include/linux/sched/wake_q.h b/include/linux/sched/wake_q.h index 545f37138057..ad826d2a4557 100644 --- a/include/linux/sched/wake_q.h +++ b/include/linux/sched/wake_q.h @@ -51,8 +51,8 @@ static inline void wake_q_init(struct wake_q_head *head) head->lastp = &head->first; } -extern void wake_q_add(struct wake_q_head *head, - struct task_struct *task); +extern void wake_q_add(struct wake_q_head *head, struct task_struct *task); +extern void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task); extern void wake_up_q(struct wake_q_head *head); #endif /* _LINUX_SCHED_WAKE_Q_H */ diff --git a/kernel/futex.c b/kernel/futex.c index 69e619baf709..2abe1a0b3062 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1463,8 +1463,7 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q) * Queue the task for later wakeup for after we've released * the hb->lock. wake_q_add() grabs reference to p. */ - wake_q_add(wake_q, p); - put_task_struct(p); + wake_q_add_safe(wake_q, p); } /* diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 50d9af615dc4..fbe96341beee 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -211,9 +211,7 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, * Ensure issuing the wakeup (either by us or someone else) * after setting the reader waiter to nil. */ - wake_q_add(wake_q, tsk); - /* wake_q_add() already take the task ref */ - put_task_struct(tsk); + wake_q_add_safe(wake_q, tsk); } adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3c8b4dba3d2d..64ceaa5158c5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -396,19 +396,7 @@ static bool set_nr_if_polling(struct task_struct *p) #endif #endif -/** - * wake_q_add() - queue a wakeup for 'later' waking. - * @head: the wake_q_head to add @task to - * @task: the task to queue for 'later' wakeup - * - * Queue a task for later wakeup, most likely by the wake_up_q() call in the - * same context, _HOWEVER_ this is not guaranteed, the wakeup can come - * instantly. - * - * This function must be used as-if it were wake_up_process(); IOW the task - * must be ready to be woken at this location. - */ -void wake_q_add(struct wake_q_head *head, struct task_struct *task) +static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) { struct wake_q_node *node = &task->wake_q; @@ -422,15 +410,55 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task) */ smp_mb__before_atomic(); if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))) - return; - - get_task_struct(task); + return false; /* * The head is context local, there can be no concurrency. */ *head->lastp = node; head->lastp = &node->next; + return true; +} + +/** + * wake_q_add() - queue a wakeup for 'later' waking. + * @head: the wake_q_head to add @task to + * @task: the task to queue for 'later' wakeup + * + * Queue a task for later wakeup, most likely by the wake_up_q() call in the + * same context, _HOWEVER_ this is not guaranteed, the wakeup can come + * instantly. + * + * This function must be used as-if it were wake_up_process(); IOW the task + * must be ready to be woken at this location. + */ +void wake_q_add(struct wake_q_head *head, struct task_struct *task) +{ + if (__wake_q_add(head, task)) + get_task_struct(task); +} + +/** + * wake_q_add_safe() - safely queue a wakeup for 'later' waking. + * @head: the wake_q_head to add @task to + * @task: the task to queue for 'later' wakeup + * + * Queue a task for later wakeup, most likely by the wake_up_q() call in the + * same context, _HOWEVER_ this is not guaranteed, the wakeup can come + * instantly. + * + * This function must be used as-if it were wake_up_process(); IOW the task + * must be ready to be woken at this location. + * + * This function is essentially a task-safe equivalent to wake_q_add(). Callers + * that already hold reference to @task can call the 'safe' version and trust + * wake_q to do the right thing depending whether or not the @task is already + * queued for wakeup. + */ +void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task) +{ + if (!__wake_q_add(head, task)) + put_task_struct(task); } void wake_up_q(struct wake_q_head *head) -- cgit v1.2.3 From 99687cdbb3f6c8e32bcc7f37496e811f30460e48 Mon Sep 17 00:00:00 2001 From: Luc Van Oostenryck Date: Fri, 18 Jan 2019 15:49:36 +0100 Subject: sched/topology: Fix percpu data types in struct sd_data & struct s_data The percpu members of struct sd_data and s_data are declared as: struct ... ** __percpu member; So their type is: __percpu pointer to pointer to struct ... But looking at how they're used, their type should be: pointer to __percpu pointer to struct ... and they should thus be declared as: struct ... * __percpu *member; So fix the placement of '__percpu' in the definition of these structures. This addresses a bunch of Sparse's warnings like: warning: incorrect type in initializer (different address spaces) expected void const [noderef] *__vpp_verify got struct sched_domain ** Signed-off-by: Luc Van Oostenryck Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190118144936.79158-1-luc.vanoostenryck@gmail.com Signed-off-by: Ingo Molnar --- include/linux/sched/topology.h | 8 ++++---- kernel/sched/topology.c | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux/sched') diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index c31d3a47a47c..57c7ed3fe465 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -176,10 +176,10 @@ typedef int (*sched_domain_flags_f)(void); #define SDTL_OVERLAP 0x01 struct sd_data { - struct sched_domain **__percpu sd; - struct sched_domain_shared **__percpu sds; - struct sched_group **__percpu sg; - struct sched_group_capacity **__percpu sgc; + struct sched_domain *__percpu *sd; + struct sched_domain_shared *__percpu *sds; + struct sched_group *__percpu *sg; + struct sched_group_capacity *__percpu *sgc; }; struct sched_domain_topology_level { diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 4ae9403420ed..93ff526e77b0 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -705,7 +705,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) } struct s_data { - struct sched_domain ** __percpu sd; + struct sched_domain * __percpu *sd; struct root_domain *rd; }; -- cgit v1.2.3 From 2b188cc1bb857a9d4701ae59aa7768b5124e262e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 7 Jan 2019 10:46:33 -0700 Subject: Add io_uring IO interface The submission queue (SQ) and completion queue (CQ) rings are shared between the application and the kernel. This eliminates the need to copy data back and forth to submit and complete IO. IO submissions use the io_uring_sqe data structure, and completions are generated in the form of io_uring_cqe data structures. The SQ ring is an index into the io_uring_sqe array, which makes it possible to submit a batch of IOs without them being contiguous in the ring. The CQ ring is always contiguous, as completion events are inherently unordered, and hence any io_uring_cqe entry can point back to an arbitrary submission. Two new system calls are added for this: io_uring_setup(entries, params) Sets up an io_uring instance for doing async IO. On success, returns a file descriptor that the application can mmap to gain access to the SQ ring, CQ ring, and io_uring_sqes. io_uring_enter(fd, to_submit, min_complete, flags, sigset, sigsetsize) Initiates IO against the rings mapped to this fd, or waits for them to complete, or both. The behavior is controlled by the parameters passed in. If 'to_submit' is non-zero, then we'll try and submit new IO. If IORING_ENTER_GETEVENTS is set, the kernel will wait for 'min_complete' events, if they aren't already available. It's valid to set IORING_ENTER_GETEVENTS and 'min_complete' == 0 at the same time, this allows the kernel to return already completed events without waiting for them. This is useful only for polling, as for IRQ driven IO, the application can just check the CQ ring without entering the kernel. With this setup, it's possible to do async IO with a single system call. Future developments will enable polled IO with this interface, and polled submission as well. The latter will enable an application to do IO without doing ANY system calls at all. For IRQ driven IO, an application only needs to enter the kernel for completions if it wants to wait for them to occur. Each io_uring is backed by a workqueue, to support buffered async IO as well. We will only punt to an async context if the command would need to wait for IO on the device side. Any data that can be accessed directly in the page cache is done inline. This avoids the slowness issue of usual threadpools, since cached data is accessed as quickly as a sync interface. Sample application: http://git.kernel.dk/cgit/fio/plain/t/io_uring.c Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- arch/x86/entry/syscalls/syscall_32.tbl | 2 + arch/x86/entry/syscalls/syscall_64.tbl | 2 + fs/Makefile | 1 + fs/io_uring.c | 1255 ++++++++++++++++++++++++++++++++ include/linux/fs.h | 9 + include/linux/sched/user.h | 2 +- include/linux/syscalls.h | 6 + include/uapi/asm-generic/unistd.h | 6 +- include/uapi/linux/io_uring.h | 95 +++ init/Kconfig | 9 + kernel/sys_ni.c | 2 + net/unix/garbage.c | 3 + 12 files changed, 1390 insertions(+), 2 deletions(-) create mode 100644 fs/io_uring.c create mode 100644 include/uapi/linux/io_uring.h (limited to 'include/linux/sched') diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 3cf7b533b3d1..481c126259e9 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -398,3 +398,5 @@ 384 i386 arch_prctl sys_arch_prctl __ia32_compat_sys_arch_prctl 385 i386 io_pgetevents sys_io_pgetevents __ia32_compat_sys_io_pgetevents 386 i386 rseq sys_rseq __ia32_sys_rseq +425 i386 io_uring_setup sys_io_uring_setup __ia32_sys_io_uring_setup +426 i386 io_uring_enter sys_io_uring_enter __ia32_sys_io_uring_enter diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index f0b1709a5ffb..6a32a430c8e0 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -343,6 +343,8 @@ 332 common statx __x64_sys_statx 333 common io_pgetevents __x64_sys_io_pgetevents 334 common rseq __x64_sys_rseq +425 common io_uring_setup __x64_sys_io_uring_setup +426 common io_uring_enter __x64_sys_io_uring_enter # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/fs/Makefile b/fs/Makefile index 293733f61594..8e15d6fc4340 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -30,6 +30,7 @@ obj-$(CONFIG_TIMERFD) += timerfd.o obj-$(CONFIG_EVENTFD) += eventfd.o obj-$(CONFIG_USERFAULTFD) += userfaultfd.o obj-$(CONFIG_AIO) += aio.o +obj-$(CONFIG_IO_URING) += io_uring.o obj-$(CONFIG_FS_DAX) += dax.o obj-$(CONFIG_FS_ENCRYPTION) += crypto/ obj-$(CONFIG_FILE_LOCKING) += locks.o diff --git a/fs/io_uring.c b/fs/io_uring.c new file mode 100644 index 000000000000..f68052290426 --- /dev/null +++ b/fs/io_uring.c @@ -0,0 +1,1255 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Shared application/kernel submission and completion ring pairs, for + * supporting fast/efficient IO. + * + * A note on the read/write ordering memory barriers that are matched between + * the application and kernel side. When the application reads the CQ ring + * tail, it must use an appropriate smp_rmb() to order with the smp_wmb() + * the kernel uses after writing the tail. Failure to do so could cause a + * delay in when the application notices that completion events available. + * This isn't a fatal condition. Likewise, the application must use an + * appropriate smp_wmb() both before writing the SQ tail, and after writing + * the SQ tail. The first one orders the sqe writes with the tail write, and + * the latter is paired with the smp_rmb() the kernel will issue before + * reading the SQ tail on submission. + * + * Also see the examples in the liburing library: + * + * git://git.kernel.dk/liburing + * + * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens + * from data shared between the kernel and application. This is done both + * for ordering purposes, but also to ensure that once a value is loaded from + * data that the application could potentially modify, it remains stable. + * + * Copyright (C) 2018-2019 Jens Axboe + */ +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "internal.h" + +#define IORING_MAX_ENTRIES 4096 + +struct io_uring { + u32 head ____cacheline_aligned_in_smp; + u32 tail ____cacheline_aligned_in_smp; +}; + +struct io_sq_ring { + struct io_uring r; + u32 ring_mask; + u32 ring_entries; + u32 dropped; + u32 flags; + u32 array[]; +}; + +struct io_cq_ring { + struct io_uring r; + u32 ring_mask; + u32 ring_entries; + u32 overflow; + struct io_uring_cqe cqes[]; +}; + +struct io_ring_ctx { + struct { + struct percpu_ref refs; + } ____cacheline_aligned_in_smp; + + struct { + unsigned int flags; + bool compat; + bool account_mem; + + /* SQ ring */ + struct io_sq_ring *sq_ring; + unsigned cached_sq_head; + unsigned sq_entries; + unsigned sq_mask; + struct io_uring_sqe *sq_sqes; + } ____cacheline_aligned_in_smp; + + /* IO offload */ + struct workqueue_struct *sqo_wq; + struct mm_struct *sqo_mm; + + struct { + /* CQ ring */ + struct io_cq_ring *cq_ring; + unsigned cached_cq_tail; + unsigned cq_entries; + unsigned cq_mask; + struct wait_queue_head cq_wait; + struct fasync_struct *cq_fasync; + } ____cacheline_aligned_in_smp; + + struct user_struct *user; + + struct completion ctx_done; + + struct { + struct mutex uring_lock; + wait_queue_head_t wait; + } ____cacheline_aligned_in_smp; + + struct { + spinlock_t completion_lock; + } ____cacheline_aligned_in_smp; + +#if defined(CONFIG_UNIX) + struct socket *ring_sock; +#endif +}; + +struct sqe_submit { + const struct io_uring_sqe *sqe; + unsigned short index; + bool has_user; +}; + +struct io_kiocb { + struct kiocb rw; + + struct sqe_submit submit; + + struct io_ring_ctx *ctx; + struct list_head list; + unsigned int flags; +#define REQ_F_FORCE_NONBLOCK 1 /* inline submission attempt */ + u64 user_data; + + struct work_struct work; +}; + +#define IO_PLUG_THRESHOLD 2 + +static struct kmem_cache *req_cachep; + +static const struct file_operations io_uring_fops; + +struct sock *io_uring_get_socket(struct file *file) +{ +#if defined(CONFIG_UNIX) + if (file->f_op == &io_uring_fops) { + struct io_ring_ctx *ctx = file->private_data; + + return ctx->ring_sock->sk; + } +#endif + return NULL; +} +EXPORT_SYMBOL(io_uring_get_socket); + +static void io_ring_ctx_ref_free(struct percpu_ref *ref) +{ + struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs); + + complete(&ctx->ctx_done); +} + +static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) +{ + struct io_ring_ctx *ctx; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return NULL; + + if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, 0, GFP_KERNEL)) { + kfree(ctx); + return NULL; + } + + ctx->flags = p->flags; + init_waitqueue_head(&ctx->cq_wait); + init_completion(&ctx->ctx_done); + mutex_init(&ctx->uring_lock); + init_waitqueue_head(&ctx->wait); + spin_lock_init(&ctx->completion_lock); + return ctx; +} + +static void io_commit_cqring(struct io_ring_ctx *ctx) +{ + struct io_cq_ring *ring = ctx->cq_ring; + + if (ctx->cached_cq_tail != READ_ONCE(ring->r.tail)) { + /* order cqe stores with ring update */ + smp_store_release(&ring->r.tail, ctx->cached_cq_tail); + + /* + * Write sider barrier of tail update, app has read side. See + * comment at the top of this file. + */ + smp_wmb(); + + if (wq_has_sleeper(&ctx->cq_wait)) { + wake_up_interruptible(&ctx->cq_wait); + kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN); + } + } +} + +static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) +{ + struct io_cq_ring *ring = ctx->cq_ring; + unsigned tail; + + tail = ctx->cached_cq_tail; + /* See comment at the top of the file */ + smp_rmb(); + if (tail + 1 == READ_ONCE(ring->r.head)) + return NULL; + + ctx->cached_cq_tail++; + return &ring->cqes[tail & ctx->cq_mask]; +} + +static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data, + long res, unsigned ev_flags) +{ + struct io_uring_cqe *cqe; + + /* + * If we can't get a cq entry, userspace overflowed the + * submission (by quite a lot). Increment the overflow count in + * the ring. + */ + cqe = io_get_cqring(ctx); + if (cqe) { + WRITE_ONCE(cqe->user_data, ki_user_data); + WRITE_ONCE(cqe->res, res); + WRITE_ONCE(cqe->flags, ev_flags); + } else { + unsigned overflow = READ_ONCE(ctx->cq_ring->overflow); + + WRITE_ONCE(ctx->cq_ring->overflow, overflow + 1); + } +} + +static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 ki_user_data, + long res, unsigned ev_flags) +{ + unsigned long flags; + + spin_lock_irqsave(&ctx->completion_lock, flags); + io_cqring_fill_event(ctx, ki_user_data, res, ev_flags); + io_commit_cqring(ctx); + spin_unlock_irqrestore(&ctx->completion_lock, flags); + + if (waitqueue_active(&ctx->wait)) + wake_up(&ctx->wait); +} + +static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs) +{ + percpu_ref_put_many(&ctx->refs, refs); + + if (waitqueue_active(&ctx->wait)) + wake_up(&ctx->wait); +} + +static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx) +{ + struct io_kiocb *req; + + if (!percpu_ref_tryget(&ctx->refs)) + return NULL; + + req = kmem_cache_alloc(req_cachep, __GFP_NOWARN); + if (req) { + req->ctx = ctx; + req->flags = 0; + return req; + } + + io_ring_drop_ctx_refs(ctx, 1); + return NULL; +} + +static void io_free_req(struct io_kiocb *req) +{ + io_ring_drop_ctx_refs(req->ctx, 1); + kmem_cache_free(req_cachep, req); +} + +static void kiocb_end_write(struct kiocb *kiocb) +{ + if (kiocb->ki_flags & IOCB_WRITE) { + struct inode *inode = file_inode(kiocb->ki_filp); + + /* + * Tell lockdep we inherited freeze protection from submission + * thread. + */ + if (S_ISREG(inode->i_mode)) + __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE); + file_end_write(kiocb->ki_filp); + } +} + +static void io_complete_rw(struct kiocb *kiocb, long res, long res2) +{ + struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); + + kiocb_end_write(kiocb); + + fput(kiocb->ki_filp); + io_cqring_add_event(req->ctx, req->user_data, res, 0); + io_free_req(req); +} + +/* + * If we tracked the file through the SCM inflight mechanism, we could support + * any file. For now, just ensure that anything potentially problematic is done + * inline. + */ +static bool io_file_supports_async(struct file *file) +{ + umode_t mode = file_inode(file)->i_mode; + + if (S_ISBLK(mode) || S_ISCHR(mode)) + return true; + if (S_ISREG(mode) && file->f_op != &io_uring_fops) + return true; + + return false; +} + +static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, + bool force_nonblock) +{ + struct kiocb *kiocb = &req->rw; + unsigned ioprio; + int fd, ret; + + /* For -EAGAIN retry, everything is already prepped */ + if (kiocb->ki_filp) + return 0; + + fd = READ_ONCE(sqe->fd); + kiocb->ki_filp = fget(fd); + if (unlikely(!kiocb->ki_filp)) + return -EBADF; + if (force_nonblock && !io_file_supports_async(kiocb->ki_filp)) + force_nonblock = false; + kiocb->ki_pos = READ_ONCE(sqe->off); + kiocb->ki_flags = iocb_flags(kiocb->ki_filp); + kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp)); + + ioprio = READ_ONCE(sqe->ioprio); + if (ioprio) { + ret = ioprio_check_cap(ioprio); + if (ret) + goto out_fput; + + kiocb->ki_ioprio = ioprio; + } else + kiocb->ki_ioprio = get_current_ioprio(); + + ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags)); + if (unlikely(ret)) + goto out_fput; + if (force_nonblock) { + kiocb->ki_flags |= IOCB_NOWAIT; + req->flags |= REQ_F_FORCE_NONBLOCK; + } + if (kiocb->ki_flags & IOCB_HIPRI) { + ret = -EINVAL; + goto out_fput; + } + + kiocb->ki_complete = io_complete_rw; + return 0; +out_fput: + fput(kiocb->ki_filp); + return ret; +} + +static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) +{ + switch (ret) { + case -EIOCBQUEUED: + break; + case -ERESTARTSYS: + case -ERESTARTNOINTR: + case -ERESTARTNOHAND: + case -ERESTART_RESTARTBLOCK: + /* + * We can't just restart the syscall, since previously + * submitted sqes may already be in progress. Just fail this + * IO with EINTR. + */ + ret = -EINTR; + /* fall through */ + default: + kiocb->ki_complete(kiocb, ret, 0); + } +} + +static int io_import_iovec(struct io_ring_ctx *ctx, int rw, + const struct sqe_submit *s, struct iovec **iovec, + struct iov_iter *iter) +{ + const struct io_uring_sqe *sqe = s->sqe; + void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr)); + size_t sqe_len = READ_ONCE(sqe->len); + + if (!s->has_user) + return -EFAULT; + +#ifdef CONFIG_COMPAT + if (ctx->compat) + return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV, + iovec, iter); +#endif + + return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter); +} + +static ssize_t io_read(struct io_kiocb *req, const struct sqe_submit *s, + bool force_nonblock) +{ + struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; + struct kiocb *kiocb = &req->rw; + struct iov_iter iter; + struct file *file; + ssize_t ret; + + ret = io_prep_rw(req, s->sqe, force_nonblock); + if (ret) + return ret; + file = kiocb->ki_filp; + + ret = -EBADF; + if (unlikely(!(file->f_mode & FMODE_READ))) + goto out_fput; + ret = -EINVAL; + if (unlikely(!file->f_op->read_iter)) + goto out_fput; + + ret = io_import_iovec(req->ctx, READ, s, &iovec, &iter); + if (ret) + goto out_fput; + + ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_iter_count(&iter)); + if (!ret) { + ssize_t ret2; + + /* Catch -EAGAIN return for forced non-blocking submission */ + ret2 = call_read_iter(file, kiocb, &iter); + if (!force_nonblock || ret2 != -EAGAIN) + io_rw_done(kiocb, ret2); + else + ret = -EAGAIN; + } + kfree(iovec); +out_fput: + /* Hold on to the file for -EAGAIN */ + if (unlikely(ret && ret != -EAGAIN)) + fput(file); + return ret; +} + +static ssize_t io_write(struct io_kiocb *req, const struct sqe_submit *s, + bool force_nonblock) +{ + struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; + struct kiocb *kiocb = &req->rw; + struct iov_iter iter; + struct file *file; + ssize_t ret; + + ret = io_prep_rw(req, s->sqe, force_nonblock); + if (ret) + return ret; + /* Hold on to the file for -EAGAIN */ + if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT)) + return -EAGAIN; + + ret = -EBADF; + file = kiocb->ki_filp; + if (unlikely(!(file->f_mode & FMODE_WRITE))) + goto out_fput; + ret = -EINVAL; + if (unlikely(!file->f_op->write_iter)) + goto out_fput; + + ret = io_import_iovec(req->ctx, WRITE, s, &iovec, &iter); + if (ret) + goto out_fput; + + ret = rw_verify_area(WRITE, file, &kiocb->ki_pos, + iov_iter_count(&iter)); + if (!ret) { + /* + * Open-code file_start_write here to grab freeze protection, + * which will be released by another thread in + * io_complete_rw(). Fool lockdep by telling it the lock got + * released so that it doesn't complain about the held lock when + * we return to userspace. + */ + if (S_ISREG(file_inode(file)->i_mode)) { + __sb_start_write(file_inode(file)->i_sb, + SB_FREEZE_WRITE, true); + __sb_writers_release(file_inode(file)->i_sb, + SB_FREEZE_WRITE); + } + kiocb->ki_flags |= IOCB_WRITE; + io_rw_done(kiocb, call_write_iter(file, kiocb, &iter)); + } + kfree(iovec); +out_fput: + if (unlikely(ret)) + fput(file); + return ret; +} + +/* + * IORING_OP_NOP just posts a completion event, nothing else. + */ +static int io_nop(struct io_kiocb *req, u64 user_data) +{ + struct io_ring_ctx *ctx = req->ctx; + long err = 0; + + /* + * Twilight zone - it's possible that someone issued an opcode that + * has a file attached, then got -EAGAIN on submission, and changed + * the sqe before we retried it from async context. Avoid dropping + * a file reference for this malicious case, and flag the error. + */ + if (req->rw.ki_filp) { + err = -EBADF; + fput(req->rw.ki_filp); + } + io_cqring_add_event(ctx, user_data, err, 0); + io_free_req(req); + return 0; +} + +static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, + const struct sqe_submit *s, bool force_nonblock) +{ + ssize_t ret; + int opcode; + + if (unlikely(s->index >= ctx->sq_entries)) + return -EINVAL; + req->user_data = READ_ONCE(s->sqe->user_data); + + opcode = READ_ONCE(s->sqe->opcode); + switch (opcode) { + case IORING_OP_NOP: + ret = io_nop(req, req->user_data); + break; + case IORING_OP_READV: + ret = io_read(req, s, force_nonblock); + break; + case IORING_OP_WRITEV: + ret = io_write(req, s, force_nonblock); + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +static void io_sq_wq_submit_work(struct work_struct *work) +{ + struct io_kiocb *req = container_of(work, struct io_kiocb, work); + struct sqe_submit *s = &req->submit; + const struct io_uring_sqe *sqe = s->sqe; + struct io_ring_ctx *ctx = req->ctx; + mm_segment_t old_fs = get_fs(); + int ret; + + /* Ensure we clear previously set forced non-block flag */ + req->flags &= ~REQ_F_FORCE_NONBLOCK; + req->rw.ki_flags &= ~IOCB_NOWAIT; + + if (!mmget_not_zero(ctx->sqo_mm)) { + ret = -EFAULT; + goto err; + } + + use_mm(ctx->sqo_mm); + set_fs(USER_DS); + s->has_user = true; + + ret = __io_submit_sqe(ctx, req, s, false); + + set_fs(old_fs); + unuse_mm(ctx->sqo_mm); + mmput(ctx->sqo_mm); +err: + if (ret) { + io_cqring_add_event(ctx, sqe->user_data, ret, 0); + io_free_req(req); + } + + /* async context always use a copy of the sqe */ + kfree(sqe); +} + +static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s) +{ + struct io_kiocb *req; + ssize_t ret; + + /* enforce forwards compatibility on users */ + if (unlikely(s->sqe->flags)) + return -EINVAL; + + req = io_get_req(ctx); + if (unlikely(!req)) + return -EAGAIN; + + req->rw.ki_filp = NULL; + + ret = __io_submit_sqe(ctx, req, s, true); + if (ret == -EAGAIN) { + struct io_uring_sqe *sqe_copy; + + sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL); + if (sqe_copy) { + memcpy(sqe_copy, s->sqe, sizeof(*sqe_copy)); + s->sqe = sqe_copy; + + memcpy(&req->submit, s, sizeof(*s)); + INIT_WORK(&req->work, io_sq_wq_submit_work); + queue_work(ctx->sqo_wq, &req->work); + ret = 0; + } + } + if (ret) + io_free_req(req); + + return ret; +} + +static void io_commit_sqring(struct io_ring_ctx *ctx) +{ + struct io_sq_ring *ring = ctx->sq_ring; + + if (ctx->cached_sq_head != READ_ONCE(ring->r.head)) { + /* + * Ensure any loads from the SQEs are done at this point, + * since once we write the new head, the application could + * write new data to them. + */ + smp_store_release(&ring->r.head, ctx->cached_sq_head); + + /* + * write side barrier of head update, app has read side. See + * comment at the top of this file + */ + smp_wmb(); + } +} + +/* + * Undo last io_get_sqring() + */ +static void io_drop_sqring(struct io_ring_ctx *ctx) +{ + ctx->cached_sq_head--; +} + +/* + * Fetch an sqe, if one is available. Note that s->sqe will point to memory + * that is mapped by userspace. This means that care needs to be taken to + * ensure that reads are stable, as we cannot rely on userspace always + * being a good citizen. If members of the sqe are validated and then later + * used, it's important that those reads are done through READ_ONCE() to + * prevent a re-load down the line. + */ +static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s) +{ + struct io_sq_ring *ring = ctx->sq_ring; + unsigned head; + + /* + * The cached sq head (or cq tail) serves two purposes: + * + * 1) allows us to batch the cost of updating the user visible + * head updates. + * 2) allows the kernel side to track the head on its own, even + * though the application is the one updating it. + */ + head = ctx->cached_sq_head; + /* See comment at the top of this file */ + smp_rmb(); + if (head == READ_ONCE(ring->r.tail)) + return false; + + head = READ_ONCE(ring->array[head & ctx->sq_mask]); + if (head < ctx->sq_entries) { + s->index = head; + s->sqe = &ctx->sq_sqes[head]; + ctx->cached_sq_head++; + return true; + } + + /* drop invalid entries */ + ctx->cached_sq_head++; + ring->dropped++; + /* See comment at the top of this file */ + smp_wmb(); + return false; +} + +static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) +{ + int i, ret = 0, submit = 0; + struct blk_plug plug; + + if (to_submit > IO_PLUG_THRESHOLD) + blk_start_plug(&plug); + + for (i = 0; i < to_submit; i++) { + struct sqe_submit s; + + if (!io_get_sqring(ctx, &s)) + break; + + s.has_user = true; + ret = io_submit_sqe(ctx, &s); + if (ret) { + io_drop_sqring(ctx); + break; + } + + submit++; + } + io_commit_sqring(ctx); + + if (to_submit > IO_PLUG_THRESHOLD) + blk_finish_plug(&plug); + + return submit ? submit : ret; +} + +static unsigned io_cqring_events(struct io_cq_ring *ring) +{ + return READ_ONCE(ring->r.tail) - READ_ONCE(ring->r.head); +} + +/* + * Wait until events become available, if we don't already have some. The + * application must reap them itself, as they reside on the shared cq ring. + */ +static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, + const sigset_t __user *sig, size_t sigsz) +{ + struct io_cq_ring *ring = ctx->cq_ring; + sigset_t ksigmask, sigsaved; + DEFINE_WAIT(wait); + int ret; + + /* See comment at the top of this file */ + smp_rmb(); + if (io_cqring_events(ring) >= min_events) + return 0; + + if (sig) { + ret = set_user_sigmask(sig, &ksigmask, &sigsaved, sigsz); + if (ret) + return ret; + } + + do { + prepare_to_wait(&ctx->wait, &wait, TASK_INTERRUPTIBLE); + + ret = 0; + /* See comment at the top of this file */ + smp_rmb(); + if (io_cqring_events(ring) >= min_events) + break; + + schedule(); + + ret = -EINTR; + if (signal_pending(current)) + break; + } while (1); + + finish_wait(&ctx->wait, &wait); + + if (sig) + restore_user_sigmask(sig, &sigsaved); + + return READ_ONCE(ring->r.head) == READ_ONCE(ring->r.tail) ? ret : 0; +} + +static int io_sq_offload_start(struct io_ring_ctx *ctx) +{ + int ret; + + mmgrab(current->mm); + ctx->sqo_mm = current->mm; + + /* Do QD, or 2 * CPUS, whatever is smallest */ + ctx->sqo_wq = alloc_workqueue("io_ring-wq", WQ_UNBOUND | WQ_FREEZABLE, + min(ctx->sq_entries - 1, 2 * num_online_cpus())); + if (!ctx->sqo_wq) { + ret = -ENOMEM; + goto err; + } + + return 0; +err: + mmdrop(ctx->sqo_mm); + ctx->sqo_mm = NULL; + return ret; +} + +static void io_unaccount_mem(struct user_struct *user, unsigned long nr_pages) +{ + atomic_long_sub(nr_pages, &user->locked_vm); +} + +static int io_account_mem(struct user_struct *user, unsigned long nr_pages) +{ + unsigned long page_limit, cur_pages, new_pages; + + /* Don't allow more pages than we can safely lock */ + page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + + do { + cur_pages = atomic_long_read(&user->locked_vm); + new_pages = cur_pages + nr_pages; + if (new_pages > page_limit) + return -ENOMEM; + } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages, + new_pages) != cur_pages); + + return 0; +} + +static void io_mem_free(void *ptr) +{ + struct page *page = virt_to_head_page(ptr); + + if (put_page_testzero(page)) + free_compound_page(page); +} + +static void *io_mem_alloc(size_t size) +{ + gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP | + __GFP_NORETRY; + + return (void *) __get_free_pages(gfp_flags, get_order(size)); +} + +static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries) +{ + struct io_sq_ring *sq_ring; + struct io_cq_ring *cq_ring; + size_t bytes; + + bytes = struct_size(sq_ring, array, sq_entries); + bytes += array_size(sizeof(struct io_uring_sqe), sq_entries); + bytes += struct_size(cq_ring, cqes, cq_entries); + + return (bytes + PAGE_SIZE - 1) / PAGE_SIZE; +} + +static void io_ring_ctx_free(struct io_ring_ctx *ctx) +{ + if (ctx->sqo_wq) + destroy_workqueue(ctx->sqo_wq); + if (ctx->sqo_mm) + mmdrop(ctx->sqo_mm); +#if defined(CONFIG_UNIX) + if (ctx->ring_sock) + sock_release(ctx->ring_sock); +#endif + + io_mem_free(ctx->sq_ring); + io_mem_free(ctx->sq_sqes); + io_mem_free(ctx->cq_ring); + + percpu_ref_exit(&ctx->refs); + if (ctx->account_mem) + io_unaccount_mem(ctx->user, + ring_pages(ctx->sq_entries, ctx->cq_entries)); + free_uid(ctx->user); + kfree(ctx); +} + +static __poll_t io_uring_poll(struct file *file, poll_table *wait) +{ + struct io_ring_ctx *ctx = file->private_data; + __poll_t mask = 0; + + poll_wait(file, &ctx->cq_wait, wait); + /* See comment at the top of this file */ + smp_rmb(); + if (READ_ONCE(ctx->sq_ring->r.tail) + 1 != ctx->cached_sq_head) + mask |= EPOLLOUT | EPOLLWRNORM; + if (READ_ONCE(ctx->cq_ring->r.head) != ctx->cached_cq_tail) + mask |= EPOLLIN | EPOLLRDNORM; + + return mask; +} + +static int io_uring_fasync(int fd, struct file *file, int on) +{ + struct io_ring_ctx *ctx = file->private_data; + + return fasync_helper(fd, file, on, &ctx->cq_fasync); +} + +static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) +{ + mutex_lock(&ctx->uring_lock); + percpu_ref_kill(&ctx->refs); + mutex_unlock(&ctx->uring_lock); + + wait_for_completion(&ctx->ctx_done); + io_ring_ctx_free(ctx); +} + +static int io_uring_release(struct inode *inode, struct file *file) +{ + struct io_ring_ctx *ctx = file->private_data; + + file->private_data = NULL; + io_ring_ctx_wait_and_kill(ctx); + return 0; +} + +static int io_uring_mmap(struct file *file, struct vm_area_struct *vma) +{ + loff_t offset = (loff_t) vma->vm_pgoff << PAGE_SHIFT; + unsigned long sz = vma->vm_end - vma->vm_start; + struct io_ring_ctx *ctx = file->private_data; + unsigned long pfn; + struct page *page; + void *ptr; + + switch (offset) { + case IORING_OFF_SQ_RING: + ptr = ctx->sq_ring; + break; + case IORING_OFF_SQES: + ptr = ctx->sq_sqes; + break; + case IORING_OFF_CQ_RING: + ptr = ctx->cq_ring; + break; + default: + return -EINVAL; + } + + page = virt_to_head_page(ptr); + if (sz > (PAGE_SIZE << compound_order(page))) + return -EINVAL; + + pfn = virt_to_phys(ptr) >> PAGE_SHIFT; + return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot); +} + +SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, + u32, min_complete, u32, flags, const sigset_t __user *, sig, + size_t, sigsz) +{ + struct io_ring_ctx *ctx; + long ret = -EBADF; + int submitted = 0; + struct fd f; + + if (flags & ~IORING_ENTER_GETEVENTS) + return -EINVAL; + + f = fdget(fd); + if (!f.file) + return -EBADF; + + ret = -EOPNOTSUPP; + if (f.file->f_op != &io_uring_fops) + goto out_fput; + + ret = -ENXIO; + ctx = f.file->private_data; + if (!percpu_ref_tryget(&ctx->refs)) + goto out_fput; + + ret = 0; + if (to_submit) { + to_submit = min(to_submit, ctx->sq_entries); + + mutex_lock(&ctx->uring_lock); + submitted = io_ring_submit(ctx, to_submit); + mutex_unlock(&ctx->uring_lock); + + if (submitted < 0) + goto out_ctx; + } + if (flags & IORING_ENTER_GETEVENTS) { + min_complete = min(min_complete, ctx->cq_entries); + + /* + * The application could have included the 'to_submit' count + * in how many events it wanted to wait for. If we failed to + * submit the desired count, we may need to adjust the number + * of events to poll/wait for. + */ + if (submitted < to_submit) + min_complete = min_t(unsigned, submitted, min_complete); + + ret = io_cqring_wait(ctx, min_complete, sig, sigsz); + } + +out_ctx: + io_ring_drop_ctx_refs(ctx, 1); +out_fput: + fdput(f); + return submitted ? submitted : ret; +} + +static const struct file_operations io_uring_fops = { + .release = io_uring_release, + .mmap = io_uring_mmap, + .poll = io_uring_poll, + .fasync = io_uring_fasync, +}; + +static int io_allocate_scq_urings(struct io_ring_ctx *ctx, + struct io_uring_params *p) +{ + struct io_sq_ring *sq_ring; + struct io_cq_ring *cq_ring; + size_t size; + + sq_ring = io_mem_alloc(struct_size(sq_ring, array, p->sq_entries)); + if (!sq_ring) + return -ENOMEM; + + ctx->sq_ring = sq_ring; + sq_ring->ring_mask = p->sq_entries - 1; + sq_ring->ring_entries = p->sq_entries; + ctx->sq_mask = sq_ring->ring_mask; + ctx->sq_entries = sq_ring->ring_entries; + + size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); + if (size == SIZE_MAX) + return -EOVERFLOW; + + ctx->sq_sqes = io_mem_alloc(size); + if (!ctx->sq_sqes) { + io_mem_free(ctx->sq_ring); + return -ENOMEM; + } + + cq_ring = io_mem_alloc(struct_size(cq_ring, cqes, p->cq_entries)); + if (!cq_ring) { + io_mem_free(ctx->sq_ring); + io_mem_free(ctx->sq_sqes); + return -ENOMEM; + } + + ctx->cq_ring = cq_ring; + cq_ring->ring_mask = p->cq_entries - 1; + cq_ring->ring_entries = p->cq_entries; + ctx->cq_mask = cq_ring->ring_mask; + ctx->cq_entries = cq_ring->ring_entries; + return 0; +} + +/* + * Allocate an anonymous fd, this is what constitutes the application + * visible backing of an io_uring instance. The application mmaps this + * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled, + * we have to tie this fd to a socket for file garbage collection purposes. + */ +static int io_uring_get_fd(struct io_ring_ctx *ctx) +{ + struct file *file; + int ret; + +#if defined(CONFIG_UNIX) + ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP, + &ctx->ring_sock); + if (ret) + return ret; +#endif + + ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC); + if (ret < 0) + goto err; + + file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx, + O_RDWR | O_CLOEXEC); + if (IS_ERR(file)) { + put_unused_fd(ret); + ret = PTR_ERR(file); + goto err; + } + +#if defined(CONFIG_UNIX) + ctx->ring_sock->file = file; +#endif + fd_install(ret, file); + return ret; +err: +#if defined(CONFIG_UNIX) + sock_release(ctx->ring_sock); + ctx->ring_sock = NULL; +#endif + return ret; +} + +static int io_uring_create(unsigned entries, struct io_uring_params *p) +{ + struct user_struct *user = NULL; + struct io_ring_ctx *ctx; + bool account_mem; + int ret; + + if (!entries || entries > IORING_MAX_ENTRIES) + return -EINVAL; + + /* + * Use twice as many entries for the CQ ring. It's possible for the + * application to drive a higher depth than the size of the SQ ring, + * since the sqes are only used at submission time. This allows for + * some flexibility in overcommitting a bit. + */ + p->sq_entries = roundup_pow_of_two(entries); + p->cq_entries = 2 * p->sq_entries; + + user = get_uid(current_user()); + account_mem = !capable(CAP_IPC_LOCK); + + if (account_mem) { + ret = io_account_mem(user, + ring_pages(p->sq_entries, p->cq_entries)); + if (ret) { + free_uid(user); + return ret; + } + } + + ctx = io_ring_ctx_alloc(p); + if (!ctx) { + if (account_mem) + io_unaccount_mem(user, ring_pages(p->sq_entries, + p->cq_entries)); + free_uid(user); + return -ENOMEM; + } + ctx->compat = in_compat_syscall(); + ctx->account_mem = account_mem; + ctx->user = user; + + ret = io_allocate_scq_urings(ctx, p); + if (ret) + goto err; + + ret = io_sq_offload_start(ctx); + if (ret) + goto err; + + ret = io_uring_get_fd(ctx); + if (ret < 0) + goto err; + + memset(&p->sq_off, 0, sizeof(p->sq_off)); + p->sq_off.head = offsetof(struct io_sq_ring, r.head); + p->sq_off.tail = offsetof(struct io_sq_ring, r.tail); + p->sq_off.ring_mask = offsetof(struct io_sq_ring, ring_mask); + p->sq_off.ring_entries = offsetof(struct io_sq_ring, ring_entries); + p->sq_off.flags = offsetof(struct io_sq_ring, flags); + p->sq_off.dropped = offsetof(struct io_sq_ring, dropped); + p->sq_off.array = offsetof(struct io_sq_ring, array); + + memset(&p->cq_off, 0, sizeof(p->cq_off)); + p->cq_off.head = offsetof(struct io_cq_ring, r.head); + p->cq_off.tail = offsetof(struct io_cq_ring, r.tail); + p->cq_off.ring_mask = offsetof(struct io_cq_ring, ring_mask); + p->cq_off.ring_entries = offsetof(struct io_cq_ring, ring_entries); + p->cq_off.overflow = offsetof(struct io_cq_ring, overflow); + p->cq_off.cqes = offsetof(struct io_cq_ring, cqes); + return ret; +err: + io_ring_ctx_wait_and_kill(ctx); + return ret; +} + +/* + * Sets up an aio uring context, and returns the fd. Applications asks for a + * ring size, we return the actual sq/cq ring sizes (among other things) in the + * params structure passed in. + */ +static long io_uring_setup(u32 entries, struct io_uring_params __user *params) +{ + struct io_uring_params p; + long ret; + int i; + + if (copy_from_user(&p, params, sizeof(p))) + return -EFAULT; + for (i = 0; i < ARRAY_SIZE(p.resv); i++) { + if (p.resv[i]) + return -EINVAL; + } + + if (p.flags) + return -EINVAL; + + ret = io_uring_create(entries, &p); + if (ret < 0) + return ret; + + if (copy_to_user(params, &p, sizeof(p))) + return -EFAULT; + + return ret; +} + +SYSCALL_DEFINE2(io_uring_setup, u32, entries, + struct io_uring_params __user *, params) +{ + return io_uring_setup(entries, params); +} + +static int __init io_uring_init(void) +{ + req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC); + return 0; +}; +__initcall(io_uring_init); diff --git a/include/linux/fs.h b/include/linux/fs.h index dedcc2e9265c..61aa210f0c2b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3517,4 +3517,13 @@ extern void inode_nohighmem(struct inode *inode); extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len, int advice); +#if defined(CONFIG_IO_URING) +extern struct sock *io_uring_get_socket(struct file *file); +#else +static inline struct sock *io_uring_get_socket(struct file *file) +{ + return NULL; +} +#endif + #endif /* _LINUX_FS_H */ diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h index 39ad98c09c58..c7b5f86b91a1 100644 --- a/include/linux/sched/user.h +++ b/include/linux/sched/user.h @@ -40,7 +40,7 @@ struct user_struct { kuid_t uid; #if defined(CONFIG_PERF_EVENTS) || defined(CONFIG_BPF_SYSCALL) || \ - defined(CONFIG_NET) + defined(CONFIG_NET) || defined(CONFIG_IO_URING) atomic_long_t locked_vm; #endif diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 257cccba3062..3072dbaa7869 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -69,6 +69,7 @@ struct file_handle; struct sigaltstack; struct rseq; union bpf_attr; +struct io_uring_params; #include #include @@ -309,6 +310,11 @@ asmlinkage long sys_io_pgetevents_time32(aio_context_t ctx_id, struct io_event __user *events, struct old_timespec32 __user *timeout, const struct __aio_sigset *sig); +asmlinkage long sys_io_uring_setup(u32 entries, + struct io_uring_params __user *p); +asmlinkage long sys_io_uring_enter(unsigned int fd, u32 to_submit, + u32 min_complete, u32 flags, + const sigset_t __user *sig, size_t sigsz); /* fs/xattr.c */ asmlinkage long sys_setxattr(const char __user *path, const char __user *name, diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index d90127298f12..87871e7b7ea7 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -740,9 +740,13 @@ __SC_COMP(__NR_io_pgetevents, sys_io_pgetevents, compat_sys_io_pgetevents) __SYSCALL(__NR_rseq, sys_rseq) #define __NR_kexec_file_load 294 __SYSCALL(__NR_kexec_file_load, sys_kexec_file_load) +#define __NR_io_uring_setup 425 +__SYSCALL(__NR_io_uring_setup, sys_io_uring_setup) +#define __NR_io_uring_enter 426 +__SYSCALL(__NR_io_uring_enter, sys_io_uring_enter) #undef __NR_syscalls -#define __NR_syscalls 295 +#define __NR_syscalls 427 /* * 32 bit systems traditionally used different diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h new file mode 100644 index 000000000000..ac692823d6f4 --- /dev/null +++ b/include/uapi/linux/io_uring.h @@ -0,0 +1,95 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Header file for the io_uring interface. + * + * Copyright (C) 2019 Jens Axboe + * Copyright (C) 2019 Christoph Hellwig + */ +#ifndef LINUX_IO_URING_H +#define LINUX_IO_URING_H + +#include +#include + +/* + * IO submission data structure (Submission Queue Entry) + */ +struct io_uring_sqe { + __u8 opcode; /* type of operation for this sqe */ + __u8 flags; /* as of now unused */ + __u16 ioprio; /* ioprio for the request */ + __s32 fd; /* file descriptor to do IO on */ + __u64 off; /* offset into file */ + __u64 addr; /* pointer to buffer or iovecs */ + __u32 len; /* buffer size or number of iovecs */ + union { + __kernel_rwf_t rw_flags; + __u32 __resv; + }; + __u64 user_data; /* data to be passed back at completion time */ + __u64 __pad2[3]; +}; + +#define IORING_OP_NOP 0 +#define IORING_OP_READV 1 +#define IORING_OP_WRITEV 2 + +/* + * IO completion data structure (Completion Queue Entry) + */ +struct io_uring_cqe { + __u64 user_data; /* sqe->data submission passed back */ + __s32 res; /* result code for this event */ + __u32 flags; +}; + +/* + * Magic offsets for the application to mmap the data it needs + */ +#define IORING_OFF_SQ_RING 0ULL +#define IORING_OFF_CQ_RING 0x8000000ULL +#define IORING_OFF_SQES 0x10000000ULL + +/* + * Filled with the offset for mmap(2) + */ +struct io_sqring_offsets { + __u32 head; + __u32 tail; + __u32 ring_mask; + __u32 ring_entries; + __u32 flags; + __u32 dropped; + __u32 array; + __u32 resv1; + __u64 resv2; +}; + +struct io_cqring_offsets { + __u32 head; + __u32 tail; + __u32 ring_mask; + __u32 ring_entries; + __u32 overflow; + __u32 cqes; + __u64 resv[2]; +}; + +/* + * io_uring_enter(2) flags + */ +#define IORING_ENTER_GETEVENTS (1U << 0) + +/* + * Passed in for io_uring_setup(2). Copied back with updated info on success + */ +struct io_uring_params { + __u32 sq_entries; + __u32 cq_entries; + __u32 flags; + __u32 resv[7]; + struct io_sqring_offsets sq_off; + struct io_cqring_offsets cq_off; +}; + +#endif diff --git a/init/Kconfig b/init/Kconfig index c9386a365eea..53b54214a36e 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1414,6 +1414,15 @@ config AIO by some high performance threaded applications. Disabling this option saves about 7k. +config IO_URING + bool "Enable IO uring support" if EXPERT + select ANON_INODES + default y + help + This option enables support for the io_uring interface, enabling + applications to submit and complete IO through submission and + completion rings that are shared between the kernel and application. + config ADVISE_SYSCALLS bool "Enable madvise/fadvise syscalls" if EXPERT default y diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index ab9d0e3c6d50..ee5e523564bb 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -46,6 +46,8 @@ COND_SYSCALL(io_getevents); COND_SYSCALL(io_pgetevents); COND_SYSCALL_COMPAT(io_getevents); COND_SYSCALL_COMPAT(io_pgetevents); +COND_SYSCALL(io_uring_setup); +COND_SYSCALL(io_uring_enter); /* fs/xattr.c */ diff --git a/net/unix/garbage.c b/net/unix/garbage.c index c36757e72844..f81854d74c7d 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -108,6 +108,9 @@ struct sock *unix_get_socket(struct file *filp) /* PF_UNIX ? */ if (s && sock->ops && sock->ops->family == PF_UNIX) u_sock = s; + } else { + /* Could be an io_uring instance */ + u_sock = io_uring_get_socket(filp); } return u_sock; } -- cgit v1.2.3 From d7fefcc8de9147cc37d0c00df12e7ea4f77999b5 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 5 Mar 2019 15:47:40 -0800 Subject: mm/cma: add PF flag to force non cma alloc Patch series "mm/kvm/vfio/ppc64: Migrate compound pages out of CMA region", v8. ppc64 uses the CMA area for the allocation of guest page table (hash page table). We won't be able to start guest if we fail to allocate hash page table. We have observed hash table allocation failure because we failed to migrate pages out of CMA region because they were pinned. This happen when we are using VFIO. VFIO on ppc64 pins the entire guest RAM. If the guest RAM pages get allocated out of CMA region, we won't be able to migrate those pages. The pages are also pinned for the lifetime of the guest. Currently we support migration of non-compound pages. With THP and with the addition of hugetlb migration we can end up allocating compound pages from CMA region. This patch series add support for migrating compound pages. This patch (of 4): Add PF_MEMALLOC_NOCMA which make sure any allocation in that context is marked non-movable and hence cannot be satisfied by CMA region. This is useful with get_user_pages_longterm where we want to take a page pin by migrating pages from CMA region. Marking the section PF_MEMALLOC_NOCMA ensures that we avoid unnecessary page migration later. Link: http://lkml.kernel.org/r/20190114095438.32470-2-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Suggested-by: Andrea Arcangeli Reviewed-by: Andrea Arcangeli Cc: Michal Hocko Cc: Alexey Kardashevskiy Cc: David Gibson Cc: Michael Ellerman Cc: Mel Gorman Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 + include/linux/sched/mm.h | 48 ++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 41 insertions(+), 8 deletions(-) (limited to 'include/linux/sched') diff --git a/include/linux/sched.h b/include/linux/sched.h index ebfb34fb9b30..36ec6e7e8291 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1407,6 +1407,7 @@ extern struct pid *cad_pid; #define PF_UMH 0x02000000 /* I'm an Usermodehelper process */ #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ +#define PF_MEMALLOC_NOCMA 0x10000000 /* All allocation request will have _GFP_MOVABLE cleared */ #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 3bfa6a0cbba4..0cd9f10423fb 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -148,17 +148,25 @@ static inline bool in_vfork(struct task_struct *tsk) * Applies per-task gfp context to the given allocation flags. * PF_MEMALLOC_NOIO implies GFP_NOIO * PF_MEMALLOC_NOFS implies GFP_NOFS + * PF_MEMALLOC_NOCMA implies no allocation from CMA region. */ static inline gfp_t current_gfp_context(gfp_t flags) { - /* - * NOIO implies both NOIO and NOFS and it is a weaker context - * so always make sure it makes precedence - */ - if (unlikely(current->flags & PF_MEMALLOC_NOIO)) - flags &= ~(__GFP_IO | __GFP_FS); - else if (unlikely(current->flags & PF_MEMALLOC_NOFS)) - flags &= ~__GFP_FS; + if (unlikely(current->flags & + (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS | PF_MEMALLOC_NOCMA))) { + /* + * NOIO implies both NOIO and NOFS and it is a weaker context + * so always make sure it makes precedence + */ + if (current->flags & PF_MEMALLOC_NOIO) + flags &= ~(__GFP_IO | __GFP_FS); + else if (current->flags & PF_MEMALLOC_NOFS) + flags &= ~__GFP_FS; +#ifdef CONFIG_CMA + if (current->flags & PF_MEMALLOC_NOCMA) + flags &= ~__GFP_MOVABLE; +#endif + } return flags; } @@ -248,6 +256,30 @@ static inline void memalloc_noreclaim_restore(unsigned int flags) current->flags = (current->flags & ~PF_MEMALLOC) | flags; } +#ifdef CONFIG_CMA +static inline unsigned int memalloc_nocma_save(void) +{ + unsigned int flags = current->flags & PF_MEMALLOC_NOCMA; + + current->flags |= PF_MEMALLOC_NOCMA; + return flags; +} + +static inline void memalloc_nocma_restore(unsigned int flags) +{ + current->flags = (current->flags & ~PF_MEMALLOC_NOCMA) | flags; +} +#else +static inline unsigned int memalloc_nocma_save(void) +{ + return 0; +} + +static inline void memalloc_nocma_restore(unsigned int flags) +{ +} +#endif + #ifdef CONFIG_MEMCG /** * memalloc_use_memcg - Starts the remote memcg charging scope. -- cgit v1.2.3 From fcfc2aa0185f4a731d05a21e9f359968fdfd02e7 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 28 Mar 2019 20:44:13 -0700 Subject: ptrace: take into account saved_sigmask in PTRACE{GET,SET}SIGMASK There are a few system calls (pselect, ppoll, etc) which replace a task sigmask while they are running in a kernel-space When a task calls one of these syscalls, the kernel saves a current sigmask in task->saved_sigmask and sets a syscall sigmask. On syscall-exit-stop, ptrace traps a task before restoring the saved_sigmask, so PTRACE_GETSIGMASK returns the syscall sigmask and PTRACE_SETSIGMASK does nothing, because its sigmask is replaced by saved_sigmask, when the task returns to user-space. This patch fixes this problem. PTRACE_GETSIGMASK returns saved_sigmask if it's set. PTRACE_SETSIGMASK drops the TIF_RESTORE_SIGMASK flag. Link: http://lkml.kernel.org/r/20181120060616.6043-1-avagin@gmail.com Fixes: 29000caecbe8 ("ptrace: add ability to get/set signal-blocked mask") Signed-off-by: Andrei Vagin Acked-by: Oleg Nesterov Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched/signal.h | 18 ++++++++++++++++++ kernel/ptrace.c | 15 +++++++++++++-- 2 files changed, 31 insertions(+), 2 deletions(-) (limited to 'include/linux/sched') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index ae5655197698..e412c092c1e8 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -418,10 +418,20 @@ static inline void set_restore_sigmask(void) set_thread_flag(TIF_RESTORE_SIGMASK); WARN_ON(!test_thread_flag(TIF_SIGPENDING)); } + +static inline void clear_tsk_restore_sigmask(struct task_struct *tsk) +{ + clear_tsk_thread_flag(tsk, TIF_RESTORE_SIGMASK); +} + static inline void clear_restore_sigmask(void) { clear_thread_flag(TIF_RESTORE_SIGMASK); } +static inline bool test_tsk_restore_sigmask(struct task_struct *tsk) +{ + return test_tsk_thread_flag(tsk, TIF_RESTORE_SIGMASK); +} static inline bool test_restore_sigmask(void) { return test_thread_flag(TIF_RESTORE_SIGMASK); @@ -439,6 +449,10 @@ static inline void set_restore_sigmask(void) current->restore_sigmask = true; WARN_ON(!test_thread_flag(TIF_SIGPENDING)); } +static inline void clear_tsk_restore_sigmask(struct task_struct *tsk) +{ + tsk->restore_sigmask = false; +} static inline void clear_restore_sigmask(void) { current->restore_sigmask = false; @@ -447,6 +461,10 @@ static inline bool test_restore_sigmask(void) { return current->restore_sigmask; } +static inline bool test_tsk_restore_sigmask(struct task_struct *tsk) +{ + return tsk->restore_sigmask; +} static inline bool test_and_clear_restore_sigmask(void) { if (!current->restore_sigmask) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 771e93f9c43f..6f357f4fc859 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -29,6 +29,7 @@ #include #include #include +#include /* * Access another process' address space via ptrace. @@ -924,18 +925,26 @@ int ptrace_request(struct task_struct *child, long request, ret = ptrace_setsiginfo(child, &siginfo); break; - case PTRACE_GETSIGMASK: + case PTRACE_GETSIGMASK: { + sigset_t *mask; + if (addr != sizeof(sigset_t)) { ret = -EINVAL; break; } - if (copy_to_user(datavp, &child->blocked, sizeof(sigset_t))) + if (test_tsk_restore_sigmask(child)) + mask = &child->saved_sigmask; + else + mask = &child->blocked; + + if (copy_to_user(datavp, mask, sizeof(sigset_t))) ret = -EFAULT; else ret = 0; break; + } case PTRACE_SETSIGMASK: { sigset_t new_set; @@ -961,6 +970,8 @@ int ptrace_request(struct task_struct *child, long request, child->blocked = new_set; spin_unlock_irq(&child->sighand->siglock); + clear_tsk_restore_sigmask(child); + ret = 0; break; } -- cgit v1.2.3 From 04f5866e41fb70690e28397487d8bd8eea7d712a Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 18 Apr 2019 17:50:52 -0700 Subject: coredump: fix race condition between mmget_not_zero()/get_task_mm() and core dumping The core dumping code has always run without holding the mmap_sem for writing, despite that is the only way to ensure that the entire vma layout will not change from under it. Only using some signal serialization on the processes belonging to the mm is not nearly enough. This was pointed out earlier. For example in Hugh's post from Jul 2017: https://lkml.kernel.org/r/alpine.LSU.2.11.1707191716030.2055@eggly.anvils "Not strictly relevant here, but a related note: I was very surprised to discover, only quite recently, how handle_mm_fault() may be called without down_read(mmap_sem) - when core dumping. That seems a misguided optimization to me, which would also be nice to correct" In particular because the growsdown and growsup can move the vm_start/vm_end the various loops the core dump does around the vma will not be consistent if page faults can happen concurrently. Pretty much all users calling mmget_not_zero()/get_task_mm() and then taking the mmap_sem had the potential to introduce unexpected side effects in the core dumping code. Adding mmap_sem for writing around the ->core_dump invocation is a viable long term fix, but it requires removing all copy user and page faults and to replace them with get_dump_page() for all binary formats which is not suitable as a short term fix. For the time being this solution manually covers the places that can confuse the core dump either by altering the vma layout or the vma flags while it runs. Once ->core_dump runs under mmap_sem for writing the function mmget_still_valid() can be dropped. Allowing mmap_sem protected sections to run in parallel with the coredump provides some minor parallelism advantage to the swapoff code (which seems to be safe enough by never mangling any vma field and can keep doing swapins in parallel to the core dumping) and to some other corner case. In order to facilitate the backporting I added "Fixes: 86039bd3b4e6" however the side effect of this same race condition in /proc/pid/mem should be reproducible since before 2.6.12-rc2 so I couldn't add any other "Fixes:" because there's no hash beyond the git genesis commit. Because find_extend_vma() is the only location outside of the process context that could modify the "mm" structures under mmap_sem for reading, by adding the mmget_still_valid() check to it, all other cases that take the mmap_sem for reading don't need the new check after mmget_not_zero()/get_task_mm(). The expand_stack() in page fault context also doesn't need the new check, because all tasks under core dumping are frozen. Link: http://lkml.kernel.org/r/20190325224949.11068-1-aarcange@redhat.com Fixes: 86039bd3b4e6 ("userfaultfd: add new syscall to provide memory externalization") Signed-off-by: Andrea Arcangeli Reported-by: Jann Horn Suggested-by: Oleg Nesterov Acked-by: Peter Xu Reviewed-by: Mike Rapoport Reviewed-by: Oleg Nesterov Reviewed-by: Jann Horn Acked-by: Jason Gunthorpe Acked-by: Michal Hocko Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/infiniband/core/uverbs_main.c | 3 +++ fs/proc/task_mmu.c | 18 ++++++++++++++++++ fs/userfaultfd.c | 9 +++++++++ include/linux/sched/mm.h | 21 +++++++++++++++++++++ mm/mmap.c | 7 ++++++- 5 files changed, 57 insertions(+), 1 deletion(-) (limited to 'include/linux/sched') diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 70b7d80431a9..f2e7ffe6fc54 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -993,6 +993,8 @@ void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile) * will only be one mm, so no big deal. */ down_write(&mm->mmap_sem); + if (!mmget_still_valid(mm)) + goto skip_mm; mutex_lock(&ufile->umap_lock); list_for_each_entry_safe (priv, next_priv, &ufile->umaps, list) { @@ -1007,6 +1009,7 @@ void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile) vma->vm_flags &= ~(VM_SHARED | VM_MAYSHARE); } mutex_unlock(&ufile->umap_lock); + skip_mm: up_write(&mm->mmap_sem); mmput(mm); } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 92a91e7816d8..95ca1fe7283c 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1143,6 +1143,24 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, count = -EINTR; goto out_mm; } + /* + * Avoid to modify vma->vm_flags + * without locked ops while the + * coredump reads the vm_flags. + */ + if (!mmget_still_valid(mm)) { + /* + * Silently return "count" + * like if get_task_mm() + * failed. FIXME: should this + * function have returned + * -ESRCH if get_task_mm() + * failed like if + * get_proc_task() fails? + */ + up_write(&mm->mmap_sem); + goto out_mm; + } for (vma = mm->mmap; vma; vma = vma->vm_next) { vma->vm_flags &= ~VM_SOFTDIRTY; vma_set_page_prot(vma); diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 89800fc7dc9d..f5de1e726356 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -629,6 +629,8 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, /* the various vma->vm_userfaultfd_ctx still points to it */ down_write(&mm->mmap_sem); + /* no task can run (and in turn coredump) yet */ + VM_WARN_ON(!mmget_still_valid(mm)); for (vma = mm->mmap; vma; vma = vma->vm_next) if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) { vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; @@ -883,6 +885,8 @@ static int userfaultfd_release(struct inode *inode, struct file *file) * taking the mmap_sem for writing. */ down_write(&mm->mmap_sem); + if (!mmget_still_valid(mm)) + goto skip_mm; prev = NULL; for (vma = mm->mmap; vma; vma = vma->vm_next) { cond_resched(); @@ -905,6 +909,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) vma->vm_flags = new_flags; vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; } +skip_mm: up_write(&mm->mmap_sem); mmput(mm); wakeup: @@ -1333,6 +1338,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, goto out; down_write(&mm->mmap_sem); + if (!mmget_still_valid(mm)) + goto out_unlock; vma = find_vma_prev(mm, start, &prev); if (!vma) goto out_unlock; @@ -1520,6 +1527,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, goto out; down_write(&mm->mmap_sem); + if (!mmget_still_valid(mm)) + goto out_unlock; vma = find_vma_prev(mm, start, &prev); if (!vma) goto out_unlock; diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 0cd9f10423fb..a3fda9f024c3 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -49,6 +49,27 @@ static inline void mmdrop(struct mm_struct *mm) __mmdrop(mm); } +/* + * This has to be called after a get_task_mm()/mmget_not_zero() + * followed by taking the mmap_sem for writing before modifying the + * vmas or anything the coredump pretends not to change from under it. + * + * NOTE: find_extend_vma() called from GUP context is the only place + * that can modify the "mm" (notably the vm_start/end) under mmap_sem + * for reading and outside the context of the process, so it is also + * the only case that holds the mmap_sem for reading that must call + * this function. Generally if the mmap_sem is hold for reading + * there's no need of this check after get_task_mm()/mmget_not_zero(). + * + * This function can be obsoleted and the check can be removed, after + * the coredump code will hold the mmap_sem for writing before + * invoking the ->core_dump methods. + */ +static inline bool mmget_still_valid(struct mm_struct *mm) +{ + return likely(!mm->core_state); +} + /** * mmget() - Pin the address space associated with a &struct mm_struct. * @mm: The address space to pin. diff --git a/mm/mmap.c b/mm/mmap.c index 41eb48d9b527..bd7b9f293b39 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include @@ -2525,7 +2526,8 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) vma = find_vma_prev(mm, addr, &prev); if (vma && (vma->vm_start <= addr)) return vma; - if (!prev || expand_stack(prev, addr)) + /* don't alter vm_end if the coredump is running */ + if (!prev || !mmget_still_valid(mm) || expand_stack(prev, addr)) return NULL; if (prev->vm_flags & VM_LOCKED) populate_vma_page_range(prev, addr, prev->vm_end, NULL); @@ -2551,6 +2553,9 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) return vma; if (!(vma->vm_flags & VM_GROWSDOWN)) return NULL; + /* don't alter vm_start if the coredump is running */ + if (!mmget_still_valid(mm)) + return NULL; start = vma->vm_start; if (expand_stack(vma, addr)) return NULL; -- cgit v1.2.3