From a364298359e74a414857bbbf3b725564feb22d09 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 21 Feb 2018 05:17:24 +0100 Subject: nohz: Convert tick_nohz_tick_stopped() to bool It makes this function more self-explanatory about what it does and how to use it. Reported-by: Thomas Gleixner Signed-off-by: Frederic Weisbecker Reviewed-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Chris Metcalf Cc: Christoph Lameter Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Paul E. McKenney Cc: Rik van Riel Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1519186649-3242-3-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- include/linux/tick.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/tick.h b/include/linux/tick.h index 7cc35921218e..86576d9d2311 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -113,7 +113,7 @@ enum tick_dep_bits { #ifdef CONFIG_NO_HZ_COMMON extern bool tick_nohz_enabled; -extern int tick_nohz_tick_stopped(void); +extern bool tick_nohz_tick_stopped(void); extern void tick_nohz_idle_enter(void); extern void tick_nohz_idle_exit(void); extern void tick_nohz_irq_exit(void); -- cgit v1.2.3 From 22ab8bc02a5f6e8ffc418759894f7a6b0b632331 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 21 Feb 2018 05:17:25 +0100 Subject: nohz: Allow to check if remote CPU tick is stopped This check is racy but provides a good heuristic to determine whether a CPU may need a remote tick or not. Signed-off-by: Frederic Weisbecker Reviewed-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Chris Metcalf Cc: Christoph Lameter Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Paul E. McKenney Cc: Rik van Riel Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1519186649-3242-4-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- include/linux/tick.h | 2 ++ kernel/time/tick-sched.c | 7 +++++++ 2 files changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/tick.h b/include/linux/tick.h index 86576d9d2311..7f8c9a127f5a 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -114,6 +114,7 @@ enum tick_dep_bits { #ifdef CONFIG_NO_HZ_COMMON extern bool tick_nohz_enabled; extern bool tick_nohz_tick_stopped(void); +extern bool tick_nohz_tick_stopped_cpu(int cpu); extern void tick_nohz_idle_enter(void); extern void tick_nohz_idle_exit(void); extern void tick_nohz_irq_exit(void); @@ -125,6 +126,7 @@ extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time); #else /* !CONFIG_NO_HZ_COMMON */ #define tick_nohz_enabled (0) static inline int tick_nohz_tick_stopped(void) { return 0; } +static inline int tick_nohz_tick_stopped_cpu(int cpu) { return 0; } static inline void tick_nohz_idle_enter(void) { } static inline void tick_nohz_idle_exit(void) { } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 0aba0412ede5..d479b21a848b 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -486,6 +486,13 @@ bool tick_nohz_tick_stopped(void) return __this_cpu_read(tick_cpu_sched.tick_stopped); } +bool tick_nohz_tick_stopped_cpu(int cpu) +{ + struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu); + + return ts->tick_stopped; +} + /** * tick_nohz_update_jiffies - update jiffies when idle was interrupted * -- cgit v1.2.3 From 1bda3f8087fce9063da0b8aef87f17a3fe541aca Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 21 Feb 2018 05:17:26 +0100 Subject: sched/isolation: Isolate workqueues when "nohz_full=" is set As we prepare for offloading the residual 1hz scheduler ticks to workqueue, let's affine those to housekeepers so that they don't interrupt the CPUs that don't want to be disturbed. Signed-off-by: Frederic Weisbecker Reviewed-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Chris Metcalf Cc: Christoph Lameter Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Paul E. McKenney Cc: Rik van Riel Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1519186649-3242-5-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched/isolation.h | 1 + kernel/sched/isolation.c | 3 ++- kernel/workqueue.c | 3 ++- 3 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h index d849431c8060..4a6582c27dea 100644 --- a/include/linux/sched/isolation.h +++ b/include/linux/sched/isolation.h @@ -12,6 +12,7 @@ enum hk_flags { HK_FLAG_SCHED = (1 << 3), HK_FLAG_TICK = (1 << 4), HK_FLAG_DOMAIN = (1 << 5), + HK_FLAG_WQ = (1 << 6), }; #ifdef CONFIG_CPU_ISOLATION diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index b71b436f59f2..a2500c459617 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -3,6 +3,7 @@ * any CPU: unbound workqueues, timers, kthreads and any offloadable work. * * Copyright (C) 2017 Red Hat, Inc., Frederic Weisbecker + * Copyright (C) 2017-2018 SUSE, Frederic Weisbecker * */ @@ -119,7 +120,7 @@ static int __init housekeeping_nohz_full_setup(char *str) { unsigned int flags; - flags = HK_FLAG_TICK | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC; + flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC; return housekeeping_setup(str, flags); } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 017044c26233..593dbe749174 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -5565,12 +5565,13 @@ static void __init wq_numa_init(void) int __init workqueue_init_early(void) { int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; + int hk_flags = HK_FLAG_DOMAIN | HK_FLAG_WQ; int i, cpu; WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); - cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_FLAG_DOMAIN)); + cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(hk_flags)); pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); -- cgit v1.2.3 From dcdedb24159be3487e3dbbe1faa79ae7d00c92ac Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 21 Feb 2018 05:17:28 +0100 Subject: sched/nohz: Remove the 1 Hz tick code Now that the 1Hz tick is offloaded to workqueues, we can safely remove the residual code that used to handle it locally. Signed-off-by: Frederic Weisbecker Reviewed-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Chris Metcalf Cc: Christoph Lameter Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Paul E. McKenney Cc: Rik van Riel Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1519186649-3242-7-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched/nohz.h | 4 ---- kernel/sched/core.c | 29 ----------------------------- kernel/sched/idle_task.c | 1 - kernel/sched/sched.h | 11 +---------- kernel/time/tick-sched.c | 6 ------ 5 files changed, 1 insertion(+), 50 deletions(-) (limited to 'include') diff --git a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h index 3d3a97d9399d..094217273ff9 100644 --- a/include/linux/sched/nohz.h +++ b/include/linux/sched/nohz.h @@ -37,8 +37,4 @@ extern void wake_up_nohz_cpu(int cpu); static inline void wake_up_nohz_cpu(int cpu) { } #endif -#ifdef CONFIG_NO_HZ_FULL -extern u64 scheduler_tick_max_deferment(void); -#endif - #endif /* _LINUX_SCHED_NOHZ_H */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5dfef458ab52..8fff4f16c510 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3096,35 +3096,9 @@ void scheduler_tick(void) rq->idle_balance = idle_cpu(cpu); trigger_load_balance(rq); #endif - rq_last_tick_reset(rq); } #ifdef CONFIG_NO_HZ_FULL -/** - * scheduler_tick_max_deferment - * - * Keep at least one tick per second when a single - * active task is running because the scheduler doesn't - * yet completely support full dynticks environment. - * - * This makes sure that uptime, CFS vruntime, load - * balancing, etc... continue to move forward, even - * with a very low granularity. - * - * Return: Maximum deferment in nanoseconds. - */ -u64 scheduler_tick_max_deferment(void) -{ - struct rq *rq = this_rq(); - unsigned long next, now = READ_ONCE(jiffies); - - next = rq->last_sched_tick + HZ; - - if (time_before_eq(next, now)) - return 0; - - return jiffies_to_nsecs(next - now); -} struct tick_work { int cpu; @@ -6116,9 +6090,6 @@ void __init sched_init(void) rq->last_load_update_tick = jiffies; rq->nohz_flags = 0; #endif -#ifdef CONFIG_NO_HZ_FULL - rq->last_sched_tick = 0; -#endif #endif /* CONFIG_SMP */ hrtick_rq_init(rq); atomic_set(&rq->nr_iowait, 0); diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index e1b46e08c8e1..48b8a83f5185 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -48,7 +48,6 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) { - rq_last_tick_reset(rq); } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c1c7c788da1c..dc6c8b5a24ad 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -727,9 +727,7 @@ struct rq { #endif /* CONFIG_SMP */ unsigned long nohz_flags; #endif /* CONFIG_NO_HZ_COMMON */ -#ifdef CONFIG_NO_HZ_FULL - unsigned long last_sched_tick; -#endif + /* capture load from *all* tasks on this cpu: */ struct load_weight load; unsigned long nr_load_updates; @@ -1626,13 +1624,6 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) sched_update_tick_dependency(rq); } -static inline void rq_last_tick_reset(struct rq *rq) -{ -#ifdef CONFIG_NO_HZ_FULL - rq->last_sched_tick = jiffies; -#endif -} - extern void update_rq_clock(struct rq *rq); extern void activate_task(struct rq *rq, struct task_struct *p, int flags); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index d479b21a848b..f2fa2e940fe5 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -748,12 +748,6 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, delta = KTIME_MAX; } -#ifdef CONFIG_NO_HZ_FULL - /* Limit the tick delta to the maximum scheduler deferment */ - if (!ts->inidle) - delta = min(delta, scheduler_tick_max_deferment()); -#endif - /* Calculate the next expiry time */ if (delta < (KTIME_MAX - basemono)) expires = basemono + delta; -- cgit v1.2.3 From 325ea10c0809406ce23f038602abbc454f3f761d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 3 Mar 2018 12:20:47 +0100 Subject: sched/headers: Simplify and clean up header usage in the scheduler Do the following cleanups and simplifications: - sched/sched.h already includes , so no need to include it in sched/core.c again. - order the headers alphabetically - add all headers to kernel/sched/sched.h - remove all unnecessary includes from the .c files that are already included in kernel/sched/sched.h. Finally, make all scheduler .c files use a single common header: #include "sched.h" ... which now contains a union of the relied upon headers. This makes the various .c files easier to read and easier to handle. Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched/deadline.h | 6 --- kernel/sched/autogroup.c | 9 ++--- kernel/sched/autogroup.h | 4 -- kernel/sched/clock.c | 14 +------ kernel/sched/completion.c | 5 +-- kernel/sched/core.c | 40 +++++++------------- kernel/sched/cpuacct.c | 13 +------ kernel/sched/cpudeadline.c | 5 +-- kernel/sched/cpudeadline.h | 2 - kernel/sched/cpufreq.c | 1 - kernel/sched/cpufreq_schedutil.c | 8 +--- kernel/sched/cpupri.c | 6 +-- kernel/sched/cpupri.h | 1 - kernel/sched/cputime.c | 10 ++--- kernel/sched/deadline.c | 3 -- kernel/sched/debug.c | 11 +----- kernel/sched/fair.c | 16 +------- kernel/sched/idle.c | 15 +------- kernel/sched/idle_task.c | 5 +-- kernel/sched/isolation.c | 7 ---- kernel/sched/loadavg.c | 4 -- kernel/sched/membarrier.c | 9 +---- kernel/sched/rt.c | 4 -- kernel/sched/sched.h | 81 ++++++++++++++++++++++++++-------------- kernel/sched/stats.c | 13 +++---- kernel/sched/swait.c | 3 +- kernel/sched/topology.c | 4 -- kernel/sched/wait.c | 9 +---- kernel/sched/wait_bit.c | 5 +-- 29 files changed, 94 insertions(+), 219 deletions(-) (limited to 'include') diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h index a5bc8728ead7..0cb034331cbb 100644 --- a/include/linux/sched/deadline.h +++ b/include/linux/sched/deadline.h @@ -1,8 +1,4 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_SCHED_DEADLINE_H -#define _LINUX_SCHED_DEADLINE_H - -#include /* * SCHED_DEADLINE tasks has negative priorities, reflecting @@ -28,5 +24,3 @@ static inline bool dl_time_before(u64 a, u64 b) { return (s64)(a - b) < 0; } - -#endif /* _LINUX_SCHED_DEADLINE_H */ diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index ff1b7b647b86..6be6c575b6cd 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -1,10 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include - +/* + * Auto-group scheduling implementation: + */ #include "sched.h" unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; diff --git a/kernel/sched/autogroup.h b/kernel/sched/autogroup.h index 49e6ec9559cf..b96419974a1f 100644 --- a/kernel/sched/autogroup.h +++ b/kernel/sched/autogroup.h @@ -1,10 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifdef CONFIG_SCHED_AUTOGROUP -#include -#include -#include - struct autogroup { /* * Reference doesn't mean how many threads attach to this diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index 7da6bec8a2ff..10c83e73837a 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -52,19 +52,7 @@ * that is otherwise invisible (TSC gets stopped). * */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include "sched.h" /* * Scheduler clock - returns current time in nanosec units. diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 0926aef10dad..5d2d56b0817a 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -11,10 +11,7 @@ * typically be used for exclusion which gives rise to priority inversion. * Waiting for completion is a typically sync point, but not an exclusion point. */ - -#include -#include -#include +#include "sched.h" /** * complete: - signals a single thread waiting on this completion diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9427b59551c1..e1e334ba8ff9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5,37 +5,11 @@ * * Copyright (C) 1991-2002 Linus Torvalds */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include "sched.h" #include #include -#ifdef CONFIG_PARAVIRT -#include -#endif -#include "sched.h" #include "../workqueue_internal.h" #include "../smpboot.h" @@ -2629,6 +2603,18 @@ static inline void finish_lock_switch(struct rq *rq) raw_spin_unlock_irq(&rq->lock); } +/* + * NOP if the arch has not defined these: + */ + +#ifndef prepare_arch_switch +# define prepare_arch_switch(next) do { } while (0) +#endif + +#ifndef finish_arch_post_lock_switch +# define finish_arch_post_lock_switch() do { } while (0) +#endif + /** * prepare_task_switch - prepare to switch tasks * @rq: the runqueue preparing to switch diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 1abd325e733a..9fbb10383434 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -1,22 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "sched.h" - /* * CPU accounting code for task groups. * * Based on the work by Paul Menage (menage@google.com) and Balbir Singh * (balbir@in.ibm.com). */ +#include "sched.h" /* Time spent by the tasks of the CPU accounting group executing in ... */ enum cpuacct_stat_index { diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index cb172b61d191..50316455ea66 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -10,10 +10,7 @@ * as published by the Free Software Foundation; version 2 * of the License. */ -#include -#include -#include -#include "cpudeadline.h" +#include "sched.h" static inline int parent(int i) { diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index c26e7a0e5a66..0adeda93b5fb 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h @@ -1,6 +1,4 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#include -#include #define IDX_INVALID -1 diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c index dbc51442ecbc..5e54cbcae673 100644 --- a/kernel/sched/cpufreq.c +++ b/kernel/sched/cpufreq.c @@ -8,7 +8,6 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ - #include "sched.h" DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 0dad8160e00f..feb5f89020f2 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -11,14 +11,10 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include -#include -#include -#include -#include - #include "sched.h" +#include + struct sugov_tunables { struct gov_attr_set attr_set; unsigned int rate_limit_us; diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index f43e14ccb67d..daaadf939ccb 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -26,11 +26,7 @@ * as published by the Free Software Foundation; version 2 * of the License. */ -#include -#include -#include -#include -#include "cpupri.h" +#include "sched.h" /* Convert between a 140 based task->prio, and our 102 based cpupri */ static int convert_prio(int prio) diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index 141a06c914c6..7dc20a3232e7 100644 --- a/kernel/sched/cpupri.h +++ b/kernel/sched/cpupri.h @@ -1,5 +1,4 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#include #define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index d3b450b57ade..0796f938c4f0 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -1,10 +1,6 @@ -#include -#include -#include -#include -#include -#include -#include +/* + * Simple CPU accounting cgroup controller + */ #include "sched.h" #ifdef CONFIG_IRQ_TIME_ACCOUNTING diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 58f8b7b37983..af491f537636 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -17,9 +17,6 @@ */ #include "sched.h" -#include -#include - struct dl_bandwidth def_dl_bandwidth; static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 7c82a9b88510..644d9a464380 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -1,7 +1,7 @@ /* * kernel/sched/debug.c * - * Print the CFS rbtree + * Print the CFS rbtree and other debugging details * * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar * @@ -9,15 +9,6 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ -#include -#include -#include -#include -#include -#include -#include -#include - #include "sched.h" static DEFINE_SPINLOCK(sched_debug_lock); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1f877de96c9b..f5591071ae98 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -20,24 +20,10 @@ * Adaptive scheduling granularity, math enhancements by Peter Zijlstra * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include "sched.h" #include -#include "sched.h" - /* * Targeted preemption latency for CPU-bound tasks: * diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 343d25f85477..2760e0357271 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -1,23 +1,10 @@ /* * Generic entry points for the idle threads */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include +#include "sched.h" #include -#include "sched.h" - /* Linker adds these: start and end of __cpuidle functions */ extern char __cpuidle_text_start[], __cpuidle_text_end[]; diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index ec73680922f8..488222ac4651 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -1,12 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 -#include "sched.h" - /* * idle-task scheduling class. * - * (NOTE: these are not related to SCHED_IDLE tasks which are + * (NOTE: these are not related to SCHED_IDLE batch scheduling tasks which are * handled in sched/fair.c) */ +#include "sched.h" #ifdef CONFIG_SMP static int diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index aad5f48a07c6..e6802181900f 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -6,13 +6,6 @@ * Copyright (C) 2017-2018 SUSE, Frederic Weisbecker * */ -#include -#include -#include -#include -#include -#include - #include "sched.h" DEFINE_STATIC_KEY_FALSE(housekeeping_overriden); diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index a398e7e28a8a..a171c1258109 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -6,10 +6,6 @@ * figure. Its a silly number but people think its important. We go through * great pains to make it work on big machines and tickless kernels. */ - -#include -#include - #include "sched.h" /* diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 2c6ae2413fa2..76e0eaf4654e 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -13,14 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ - -#include -#include -#include -#include -#include - -#include "sched.h" /* for cpu_rq(). */ +#include "sched.h" /* * Bitmask made from a "or" of all commands within enum membarrier_cmd, diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index e40498872111..a3d438fec46c 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -3,12 +3,8 @@ * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR * policies) */ - #include "sched.h" -#include -#include - int sched_rr_timeslice = RR_TIMESLICE; int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index bd1461ae06e4..23ba4dd76ac4 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3,39 +3,71 @@ * Scheduler internal types and methods: */ #include + #include -#include -#include -#include -#include #include -#include -#include -#include -#include +#include #include -#include -#include +#include +#include #include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include #include -#include -#include +#include +#include +#include +#include + +#include -#include -#include #include -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include -#include -#include -#include -#include +#include +#include +#include +#include +#include + +#include #ifdef CONFIG_PARAVIRT -#include +# include #endif #include "cpupri.h" @@ -1357,13 +1389,6 @@ static inline int task_on_rq_migrating(struct task_struct *p) return p->on_rq == TASK_ON_RQ_MIGRATING; } -#ifndef prepare_arch_switch -# define prepare_arch_switch(next) do { } while (0) -#endif -#ifndef finish_arch_post_lock_switch -# define finish_arch_post_lock_switch() do { } while (0) -#endif - /* * wake flags */ diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 968c1fe3099a..ab112cbfd7c8 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -1,14 +1,13 @@ // SPDX-License-Identifier: GPL-2.0 - -#include -#include -#include -#include - +/* + * /proc/schedstat implementation + */ #include "sched.h" /* - * bump this up when changing the output format or the meaning of an existing + * Current schedstat API version. + * + * Bump this up when changing the output format or the meaning of an existing * format, so that tools can adapt (or abort) */ #define SCHEDSTAT_VERSION 15 diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c index b88ab4e0207f..b6fb2c3b3ff7 100644 --- a/kernel/sched/swait.c +++ b/kernel/sched/swait.c @@ -2,8 +2,7 @@ /* * (simple wait queues ) implementation: */ -#include -#include +#include "sched.h" void __init_swait_queue_head(struct swait_queue_head *q, const char *name, struct lock_class_key *key) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 219eee70e457..64cc564f5255 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -2,10 +2,6 @@ /* * Scheduler topology setup/handling methods */ -#include -#include -#include - #include "sched.h" DEFINE_MUTEX(sched_domains_mutex); diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 7b2a142ae629..928be527477e 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -3,14 +3,7 @@ * * (C) 2004 Nadia Yvette Chambers, Oracle */ -#include -#include -#include -#include -#include -#include -#include -#include +#include "sched.h" void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key) { diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c index 5293c59163a6..4239c78f5cd3 100644 --- a/kernel/sched/wait_bit.c +++ b/kernel/sched/wait_bit.c @@ -1,10 +1,7 @@ /* * The implementation of the wait_bit*() and related waiting APIs: */ -#include -#include -#include -#include +#include "sched.h" #define WAIT_TABLE_BITS 8 #define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS) -- cgit v1.2.3 From 6b2bb7265f0b62605e8caee3613449ed0db270b9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Mar 2018 11:40:33 +0100 Subject: sched/wait: Introduce wait_var_event() As a replacement for the wait_on_atomic_t() API provide the wait_var_event() API. The wait_var_event() API is based on the very same hashed-waitqueue idea, but doesn't care about the type (atomic_t) or the specific condition (atomic_read() == 0). IOW. it's much more widely applicable/flexible. It shares all the benefits/disadvantages of a hashed-waitqueue approach with the existing wait_on_atomic_t/wait_on_bit() APIs. The API is modeled after the existing wait_event() API, but instead of taking a wait_queue_head, it takes an address. This addresses is hashed to obtain a wait_queue_head from the bit_wait_table. Similar to the wait_event() API, it takes a condition expression as second argument and will wait until this expression becomes true. The following are (mostly) identical replacements: wait_on_atomic_t(&my_atomic, atomic_t_wait, TASK_UNINTERRUPTIBLE); wake_up_atomic_t(&my_atomic); wait_var_event(&my_atomic, !atomic_read(&my_atomic)); wake_up_var(&my_atomic); The only difference is that wake_up_var() is an unconditional wakeup and doesn't check the previously hard-coded (atomic_read() == 0) condition here. This is of little concequence, since most callers are already conditional on atomic_dec_and_test() and the ones that are not, are trivial to make so. Tested-by: Dan Williams Signed-off-by: Peter Zijlstra (Intel) Cc: David Howells Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/wait_bit.h | 70 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/wait_bit.c | 48 +++++++++++++++++++++++++++++++++ 2 files changed, 118 insertions(+) (limited to 'include') diff --git a/include/linux/wait_bit.h b/include/linux/wait_bit.h index 61b39eaf7cad..3fcdb75d69cf 100644 --- a/include/linux/wait_bit.h +++ b/include/linux/wait_bit.h @@ -262,4 +262,74 @@ int wait_on_atomic_t(atomic_t *val, wait_atomic_t_action_f action, unsigned mode return out_of_line_wait_on_atomic_t(val, action, mode); } +extern void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int flags); +extern void wake_up_var(void *var); +extern wait_queue_head_t *__var_waitqueue(void *p); + +#define ___wait_var_event(var, condition, state, exclusive, ret, cmd) \ +({ \ + __label__ __out; \ + struct wait_queue_head *__wq_head = __var_waitqueue(var); \ + struct wait_bit_queue_entry __wbq_entry; \ + long __ret = ret; /* explicit shadow */ \ + \ + init_wait_var_entry(&__wbq_entry, var, \ + exclusive ? WQ_FLAG_EXCLUSIVE : 0); \ + for (;;) { \ + long __int = prepare_to_wait_event(__wq_head, \ + &__wbq_entry.wq_entry, \ + state); \ + if (condition) \ + break; \ + \ + if (___wait_is_interruptible(state) && __int) { \ + __ret = __int; \ + goto __out; \ + } \ + \ + cmd; \ + } \ + finish_wait(__wq_head, &__wbq_entry.wq_entry); \ +__out: __ret; \ +}) + +#define __wait_var_event(var, condition) \ + ___wait_var_event(var, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ + schedule()) + +#define wait_var_event(var, condition) \ +do { \ + might_sleep(); \ + if (condition) \ + break; \ + __wait_var_event(var, condition); \ +} while (0) + +#define __wait_var_event_killable(var, condition) \ + ___wait_var_event(var, condition, TASK_KILLABLE, 0, 0, \ + schedule()) + +#define wait_var_event_killable(var, condition) \ +({ \ + int __ret = 0; \ + might_sleep(); \ + if (!(condition)) \ + __ret = __wait_var_event_killable(var, condition); \ + __ret; \ +}) + +#define __wait_var_event_timeout(var, condition, timeout) \ + ___wait_var_event(var, ___wait_cond_timeout(condition), \ + TASK_UNINTERRUPTIBLE, 0, timeout, \ + __ret = schedule_timeout(__ret)) + +#define wait_var_event_timeout(var, condition, timeout) \ +({ \ + long __ret = timeout; \ + might_sleep(); \ + if (!___wait_cond_timeout(condition)) \ + __ret = __wait_var_event_timeout(var, condition, timeout); \ + __ret; \ +}) + #endif /* _LINUX_WAIT_BIT_H */ diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c index 4239c78f5cd3..ed84ab245a05 100644 --- a/kernel/sched/wait_bit.c +++ b/kernel/sched/wait_bit.c @@ -149,6 +149,54 @@ void wake_up_bit(void *word, int bit) } EXPORT_SYMBOL(wake_up_bit); +wait_queue_head_t *__var_waitqueue(void *p) +{ + if (BITS_PER_LONG == 64) { + unsigned long q = (unsigned long)p; + + return bit_waitqueue((void *)(q & ~1), q & 1); + } + return bit_waitqueue(p, 0); +} +EXPORT_SYMBOL(__var_waitqueue); + +static int +var_wake_function(struct wait_queue_entry *wq_entry, unsigned int mode, + int sync, void *arg) +{ + struct wait_bit_key *key = arg; + struct wait_bit_queue_entry *wbq_entry = + container_of(wq_entry, struct wait_bit_queue_entry, wq_entry); + + if (wbq_entry->key.flags != key->flags || + wbq_entry->key.bit_nr != key->bit_nr) + return 0; + + return autoremove_wake_function(wq_entry, mode, sync, key); +} + +void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int flags) +{ + *wbq_entry = (struct wait_bit_queue_entry){ + .key = { + .flags = (var), + .bit_nr = -1, + }, + .wq_entry = { + .private = current, + .func = var_wake_function, + .entry = LIST_HEAD_INIT(wbq_entry->wq_entry.entry), + }, + }; +} +EXPORT_SYMBOL(init_wait_var_entry); + +void wake_up_var(void *var) +{ + __wake_up_bit(__var_waitqueue(var), var, -1); +} +EXPORT_SYMBOL(wake_up_var); + /* * Manipulate the atomic_t address to produce a better bit waitqueue table hash * index (we're keying off bit -1, but that would produce a horrible hash -- cgit v1.2.3 From f44c77630d26ca2c2a60b20c47dd9ce07c4361b3 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 7 Mar 2018 15:26:44 -0800 Subject: fs, dax: prepare for dax-specific address_space_operations In preparation for the dax implementation to start associating dax pages to inodes via page->mapping, we need to provide a 'struct address_space_operations' instance for dax. Define some generic VFS aops helpers for dax. These noop implementations are there in the dax case to prevent the VFS from falling back to operations with page-cache assumptions, dax_writeback_mapping_range() may not be referenced in the FS_DAX=n case. Cc: Jeff Moyer Cc: Ross Zwisler Suggested-by: Matthew Wilcox Suggested-by: Jan Kara Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Suggested-by: Dave Chinner Signed-off-by: Dan Williams --- fs/libfs.c | 39 +++++++++++++++++++++++++++++++++++++++ include/linux/dax.h | 12 +++++++++--- include/linux/fs.h | 4 ++++ 3 files changed, 52 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/fs/libfs.c b/fs/libfs.c index 7ff3cb904acd..0fb590d79f30 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1060,6 +1060,45 @@ int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync) } EXPORT_SYMBOL(noop_fsync); +int noop_set_page_dirty(struct page *page) +{ + /* + * Unlike __set_page_dirty_no_writeback that handles dirty page + * tracking in the page object, dax does all dirty tracking in + * the inode address_space in response to mkwrite faults. In the + * dax case we only need to worry about potentially dirty CPU + * caches, not dirty page cache pages to write back. + * + * This callback is defined to prevent fallback to + * __set_page_dirty_buffers() in set_page_dirty(). + */ + return 0; +} +EXPORT_SYMBOL_GPL(noop_set_page_dirty); + +void noop_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) +{ + /* + * There is no page cache to invalidate in the dax case, however + * we need this callback defined to prevent falling back to + * block_invalidatepage() in do_invalidatepage(). + */ +} +EXPORT_SYMBOL_GPL(noop_invalidatepage); + +ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter) +{ + /* + * iomap based filesystems support direct I/O without need for + * this callback. However, it still needs to be set in + * inode->a_ops so that open/fcntl know that direct I/O is + * generally supported. + */ + return -EINVAL; +} +EXPORT_SYMBOL_GPL(noop_direct_IO); + /* Because kfree isn't assignment-compatible with void(void*) ;-/ */ void kfree_link(void *p) { diff --git a/include/linux/dax.h b/include/linux/dax.h index 0185ecdae135..ae27a7efe7ab 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -38,6 +38,7 @@ static inline void put_dax(struct dax_device *dax_dev) } #endif +struct writeback_control; int bdev_dax_pgoff(struct block_device *, sector_t, size_t, pgoff_t *pgoff); #if IS_ENABLED(CONFIG_FS_DAX) int __bdev_dax_supported(struct super_block *sb, int blocksize); @@ -57,6 +58,8 @@ static inline void fs_put_dax(struct dax_device *dax_dev) } struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev); +int dax_writeback_mapping_range(struct address_space *mapping, + struct block_device *bdev, struct writeback_control *wbc); #else static inline int bdev_dax_supported(struct super_block *sb, int blocksize) { @@ -76,6 +79,12 @@ static inline struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev) { return NULL; } + +static inline int dax_writeback_mapping_range(struct address_space *mapping, + struct block_device *bdev, struct writeback_control *wbc) +{ + return -EOPNOTSUPP; +} #endif int dax_read_lock(void); @@ -121,7 +130,4 @@ static inline bool dax_mapping(struct address_space *mapping) return mapping->host && IS_DAX(mapping->host); } -struct writeback_control; -int dax_writeback_mapping_range(struct address_space *mapping, - struct block_device *bdev, struct writeback_control *wbc); #endif diff --git a/include/linux/fs.h b/include/linux/fs.h index 79c413985305..44f7f7080faa 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3129,6 +3129,10 @@ extern int simple_rmdir(struct inode *, struct dentry *); extern int simple_rename(struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); extern int noop_fsync(struct file *, loff_t, loff_t, int); +extern int noop_set_page_dirty(struct page *page); +extern void noop_invalidatepage(struct page *page, unsigned int offset, + unsigned int length); +extern ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter); extern int simple_empty(struct dentry *); extern int simple_readpage(struct file *file, struct page *page); extern int simple_write_begin(struct file *file, struct address_space *mapping, -- cgit v1.2.3 From 976431b02c2ef92ae3f8b6a7d699fc554025e118 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 29 Mar 2018 17:22:13 -0700 Subject: dax, dm: allow device-mapper to operate without dax support Change device-mapper's DAX dependency to require the presence of at least one DAX_DRIVER. This allows device-mapper to be built without bringing the DAX core along which is especially wasteful when there are no DAX drivers, like BLK_DEV_PMEM, configured. Cc: Alasdair Kergon Reported-by: Bart Van Assche Reported-by: kbuild test robot Reported-by: Arnd Bergmann Reviewed-by: Mike Snitzer Signed-off-by: Dan Williams --- drivers/md/Kconfig | 2 +- drivers/md/dm-linear.c | 6 +++ drivers/md/dm-log-writes.c | 95 ++++++++++++++++++++++++---------------------- drivers/md/dm-stripe.c | 6 +++ drivers/md/dm.c | 10 +++-- include/linux/dax.h | 30 ++++++++++++--- 6 files changed, 93 insertions(+), 56 deletions(-) (limited to 'include') diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 2c8ac3688815..edff083f7c4e 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -201,7 +201,7 @@ config BLK_DEV_DM_BUILTIN config BLK_DEV_DM tristate "Device mapper support" select BLK_DEV_DM_BUILTIN - select DAX + depends on DAX || DAX=n ---help--- Device-mapper is a low level volume manager. It works by allowing people to specify mappings for ranges of logical sectors. Various diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index d5f8eff7c11d..89443e0ededa 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -154,6 +154,7 @@ static int linear_iterate_devices(struct dm_target *ti, return fn(ti, lc->dev, lc->start, ti->len, data); } +#if IS_ENABLED(CONFIG_DAX_DRIVER) static long linear_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn) { @@ -184,6 +185,11 @@ static size_t linear_dax_copy_from_iter(struct dm_target *ti, pgoff_t pgoff, return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i); } +#else +#define linear_dax_direct_access NULL +#define linear_dax_copy_from_iter NULL +#endif + static struct target_type linear_target = { .name = "linear", .version = {1, 4, 0}, diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index 3362d866793b..7fcb4216973f 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -610,51 +610,6 @@ static int log_mark(struct log_writes_c *lc, char *data) return 0; } -static int log_dax(struct log_writes_c *lc, sector_t sector, size_t bytes, - struct iov_iter *i) -{ - struct pending_block *block; - - if (!bytes) - return 0; - - block = kzalloc(sizeof(struct pending_block), GFP_KERNEL); - if (!block) { - DMERR("Error allocating dax pending block"); - return -ENOMEM; - } - - block->data = kzalloc(bytes, GFP_KERNEL); - if (!block->data) { - DMERR("Error allocating dax data space"); - kfree(block); - return -ENOMEM; - } - - /* write data provided via the iterator */ - if (!copy_from_iter(block->data, bytes, i)) { - DMERR("Error copying dax data"); - kfree(block->data); - kfree(block); - return -EIO; - } - - /* rewind the iterator so that the block driver can use it */ - iov_iter_revert(i, bytes); - - block->datalen = bytes; - block->sector = bio_to_dev_sectors(lc, sector); - block->nr_sectors = ALIGN(bytes, lc->sectorsize) >> lc->sectorshift; - - atomic_inc(&lc->pending_blocks); - spin_lock_irq(&lc->blocks_lock); - list_add_tail(&block->list, &lc->unflushed_blocks); - spin_unlock_irq(&lc->blocks_lock); - wake_up_process(lc->log_kthread); - - return 0; -} - static void log_writes_dtr(struct dm_target *ti) { struct log_writes_c *lc = ti->private; @@ -920,6 +875,52 @@ static void log_writes_io_hints(struct dm_target *ti, struct queue_limits *limit limits->io_min = limits->physical_block_size; } +#if IS_ENABLED(CONFIG_DAX_DRIVER) +static int log_dax(struct log_writes_c *lc, sector_t sector, size_t bytes, + struct iov_iter *i) +{ + struct pending_block *block; + + if (!bytes) + return 0; + + block = kzalloc(sizeof(struct pending_block), GFP_KERNEL); + if (!block) { + DMERR("Error allocating dax pending block"); + return -ENOMEM; + } + + block->data = kzalloc(bytes, GFP_KERNEL); + if (!block->data) { + DMERR("Error allocating dax data space"); + kfree(block); + return -ENOMEM; + } + + /* write data provided via the iterator */ + if (!copy_from_iter(block->data, bytes, i)) { + DMERR("Error copying dax data"); + kfree(block->data); + kfree(block); + return -EIO; + } + + /* rewind the iterator so that the block driver can use it */ + iov_iter_revert(i, bytes); + + block->datalen = bytes; + block->sector = bio_to_dev_sectors(lc, sector); + block->nr_sectors = ALIGN(bytes, lc->sectorsize) >> lc->sectorshift; + + atomic_inc(&lc->pending_blocks); + spin_lock_irq(&lc->blocks_lock); + list_add_tail(&block->list, &lc->unflushed_blocks); + spin_unlock_irq(&lc->blocks_lock); + wake_up_process(lc->log_kthread); + + return 0; +} + static long log_writes_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn) { @@ -956,6 +957,10 @@ static size_t log_writes_dax_copy_from_iter(struct dm_target *ti, dax_copy: return dax_copy_from_iter(lc->dev->dax_dev, pgoff, addr, bytes, i); } +#else +#define log_writes_dax_direct_access NULL +#define log_writes_dax_copy_from_iter NULL +#endif static struct target_type log_writes_target = { .name = "log-writes", diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index b5e892149c54..ac2e8ee9d586 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -311,6 +311,7 @@ static int stripe_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_REMAPPED; } +#if IS_ENABLED(CONFIG_DAX_DRIVER) static long stripe_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn) { @@ -351,6 +352,11 @@ static size_t stripe_dax_copy_from_iter(struct dm_target *ti, pgoff_t pgoff, return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i); } +#else +#define stripe_dax_direct_access NULL +#define stripe_dax_copy_from_iter NULL +#endif + /* * Stripe status: * diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 68136806d365..ffc93aecc02a 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1800,7 +1800,7 @@ static void cleanup_mapped_device(struct mapped_device *md) static struct mapped_device *alloc_dev(int minor) { int r, numa_node_id = dm_get_numa_node(); - struct dax_device *dax_dev; + struct dax_device *dax_dev = NULL; struct mapped_device *md; void *old_md; @@ -1866,9 +1866,11 @@ static struct mapped_device *alloc_dev(int minor) md->disk->private_data = md; sprintf(md->disk->disk_name, "dm-%d", minor); - dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops); - if (!dax_dev) - goto bad; + if (IS_ENABLED(CONFIG_DAX_DRIVER)) { + dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops); + if (!dax_dev) + goto bad; + } md->dax_dev = dax_dev; add_disk_no_queue_reg(md->disk); diff --git a/include/linux/dax.h b/include/linux/dax.h index ae27a7efe7ab..f9eb22ad341e 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -26,16 +26,39 @@ extern struct attribute_group dax_attribute_group; #if IS_ENABLED(CONFIG_DAX) struct dax_device *dax_get_by_host(const char *host); +struct dax_device *alloc_dax(void *private, const char *host, + const struct dax_operations *ops); void put_dax(struct dax_device *dax_dev); +void kill_dax(struct dax_device *dax_dev); +void dax_write_cache(struct dax_device *dax_dev, bool wc); +bool dax_write_cache_enabled(struct dax_device *dax_dev); #else static inline struct dax_device *dax_get_by_host(const char *host) { return NULL; } - +static inline struct dax_device *alloc_dax(void *private, const char *host, + const struct dax_operations *ops) +{ + /* + * Callers should check IS_ENABLED(CONFIG_DAX) to know if this + * NULL is an error or expected. + */ + return NULL; +} static inline void put_dax(struct dax_device *dax_dev) { } +static inline void kill_dax(struct dax_device *dax_dev) +{ +} +static inline void dax_write_cache(struct dax_device *dax_dev, bool wc) +{ +} +static inline bool dax_write_cache_enabled(struct dax_device *dax_dev) +{ + return false; +} #endif struct writeback_control; @@ -89,18 +112,13 @@ static inline int dax_writeback_mapping_range(struct address_space *mapping, int dax_read_lock(void); void dax_read_unlock(int id); -struct dax_device *alloc_dax(void *private, const char *host, - const struct dax_operations *ops); bool dax_alive(struct dax_device *dax_dev); -void kill_dax(struct dax_device *dax_dev); void *dax_get_private(struct dax_device *dax_dev); long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn); size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i); void dax_flush(struct dax_device *dax_dev, void *addr, size_t size); -void dax_write_cache(struct dax_device *dax_dev, bool wc); -bool dax_write_cache_enabled(struct dax_device *dax_dev); ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops); -- cgit v1.2.3