From 3851f1cdb2b8d507b10395fc110d4c37d6121285 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 4 Aug 2016 00:08:45 -0400 Subject: SUNRPC: Limit the reconnect backoff timer to the max RPC message timeout ...and ensure that we propagate it to new transports on the same client. Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 5e3e1b63dbb3..a16070dd03ee 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -218,7 +218,8 @@ struct rpc_xprt { struct work_struct task_cleanup; struct timer_list timer; unsigned long last_used, - idle_timeout; + idle_timeout, + max_reconnect_timeout; /* * Send stuff -- cgit v1.2.3 From 8d480326c3d6921ff5f1cc988c993bd572248deb Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 5 Aug 2016 19:03:31 -0400 Subject: NFSv4: Cap the transport reconnection timer at 1/2 lease period We don't want to miss a lease period renewal due to the TCP connection failing to reconnect in a timely fashion. To ensure this doesn't happen, cap the reconnection timer so that we retry the connection attempt at least every 1/2 lease period. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4renewd.c | 3 +++ include/linux/sunrpc/clnt.h | 2 ++ net/sunrpc/clnt.c | 21 +++++++++++++++++++++ 3 files changed, 26 insertions(+) (limited to 'include/linux') diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c index 062029ab344a..82e77198d17e 100644 --- a/fs/nfs/nfs4renewd.c +++ b/fs/nfs/nfs4renewd.c @@ -151,6 +151,9 @@ void nfs4_set_lease_period(struct nfs_client *clp, clp->cl_lease_time = lease; clp->cl_last_renewal = lastrenewed; spin_unlock(&clp->cl_lock); + + /* Cap maximum reconnect timeout at 1/2 lease period */ + rpc_cap_max_reconnect_timeout(clp->cl_rpcclient, lease >> 1); } /* diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index b6810c92b8bb..5c02b0691587 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -195,6 +195,8 @@ int rpc_clnt_add_xprt(struct rpc_clnt *, struct xprt_create *, struct rpc_xprt *, void *), void *data); +void rpc_cap_max_reconnect_timeout(struct rpc_clnt *clnt, + unsigned long timeo); const char *rpc_proc_name(const struct rpc_task *task); #endif /* __KERNEL__ */ diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index faac5472d14d..7f79fb7dc6a0 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -2676,6 +2676,27 @@ out_put_switch: } EXPORT_SYMBOL_GPL(rpc_clnt_add_xprt); +static int +rpc_xprt_cap_max_reconnect_timeout(struct rpc_clnt *clnt, + struct rpc_xprt *xprt, + void *data) +{ + unsigned long timeout = *((unsigned long *)data); + + if (timeout < xprt->max_reconnect_timeout) + xprt->max_reconnect_timeout = timeout; + return 0; +} + +void +rpc_cap_max_reconnect_timeout(struct rpc_clnt *clnt, unsigned long timeo) +{ + rpc_clnt_iterate_for_each_xprt(clnt, + rpc_xprt_cap_max_reconnect_timeout, + &timeo); +} +EXPORT_SYMBOL_GPL(rpc_cap_max_reconnect_timeout); + #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) static void rpc_show_header(void) { -- cgit v1.2.3 From a0cba2179ea4c1820fce2ee046b6ed90ecc56196 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 9 Aug 2016 10:48:18 -0700 Subject: Revert "printk: create pr_ functions" This reverts commit 874f9c7da9a4acbc1b9e12ca722579fb50e4d142. Geert Uytterhoeven reports: "This change seems to have an (unintendent?) side-effect. Before, pr_*() calls without a trailing newline characters would be printed with a newline character appended, both on the console and in the output of the dmesg command. After this commit, no new line character is appended, and the output of the next pr_*() call of the same type may be appended, like in: - Truncating RAM at 0x0000000040000000-0x00000000c0000000 to -0x0000000070000000 - Ignoring RAM at 0x0000000200000000-0x0000000240000000 (!CONFIG_HIGHMEM) + Truncating RAM at 0x0000000040000000-0x00000000c0000000 to -0x0000000070000000Ignoring RAM at 0x0000000200000000-0x0000000240000000 (!CONFIG_HIGHMEM)" Joe Perches says: "No, that is not intentional. The newline handling code inside vprintk_emit is a bit involved and for now I suggest a revert until this has all the same behavior as earlier" Reported-by: Geert Uytterhoeven Requested-by: Joe Perches Cc: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/printk.h | 48 +++++++++++++++--------------------------------- kernel/printk/internal.h | 16 ++++++---------- kernel/printk/nmi.c | 13 ++----------- kernel/printk/printk.c | 25 +++---------------------- 4 files changed, 26 insertions(+), 76 deletions(-) (limited to 'include/linux') diff --git a/include/linux/printk.h b/include/linux/printk.h index 8dc155dab3ed..696a56be7d3e 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -266,39 +266,21 @@ extern asmlinkage void dump_stack(void) __cold; * and other debug macros are compiled out unless either DEBUG is defined * or CONFIG_DYNAMIC_DEBUG is set. */ - -#ifdef CONFIG_PRINTK - -asmlinkage __printf(1, 2) __cold void __pr_emerg(const char *fmt, ...); -asmlinkage __printf(1, 2) __cold void __pr_alert(const char *fmt, ...); -asmlinkage __printf(1, 2) __cold void __pr_crit(const char *fmt, ...); -asmlinkage __printf(1, 2) __cold void __pr_err(const char *fmt, ...); -asmlinkage __printf(1, 2) __cold void __pr_warn(const char *fmt, ...); -asmlinkage __printf(1, 2) __cold void __pr_notice(const char *fmt, ...); -asmlinkage __printf(1, 2) __cold void __pr_info(const char *fmt, ...); - -#define pr_emerg(fmt, ...) __pr_emerg(pr_fmt(fmt), ##__VA_ARGS__) -#define pr_alert(fmt, ...) __pr_alert(pr_fmt(fmt), ##__VA_ARGS__) -#define pr_crit(fmt, ...) __pr_crit(pr_fmt(fmt), ##__VA_ARGS__) -#define pr_err(fmt, ...) __pr_err(pr_fmt(fmt), ##__VA_ARGS__) -#define pr_warn(fmt, ...) __pr_warn(pr_fmt(fmt), ##__VA_ARGS__) -#define pr_notice(fmt, ...) __pr_notice(pr_fmt(fmt), ##__VA_ARGS__) -#define pr_info(fmt, ...) __pr_info(pr_fmt(fmt), ##__VA_ARGS__) - -#else - -#define pr_emerg(fmt, ...) printk(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__) -#define pr_alert(fmt, ...) printk(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__) -#define pr_crit(fmt, ...) printk(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__) -#define pr_err(fmt, ...) printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__) -#define pr_warn(fmt, ...) printk(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__) -#define pr_notice(fmt, ...) printk(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__) -#define pr_info(fmt, ...) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) - -#endif - -#define pr_warning pr_warn - +#define pr_emerg(fmt, ...) \ + printk(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__) +#define pr_alert(fmt, ...) \ + printk(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__) +#define pr_crit(fmt, ...) \ + printk(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__) +#define pr_err(fmt, ...) \ + printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__) +#define pr_warning(fmt, ...) \ + printk(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__) +#define pr_warn pr_warning +#define pr_notice(fmt, ...) \ + printk(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__) +#define pr_info(fmt, ...) \ + printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) /* * Like KERN_CONT, pr_cont() should only be used when continuing * a line with no newline ('\n') enclosed. Otherwise it defaults diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 5d4505f30083..7fd2838fa417 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -16,11 +16,9 @@ */ #include -typedef __printf(2, 0) int (*printk_func_t)(int level, const char *fmt, - va_list args); +typedef __printf(1, 0) int (*printk_func_t)(const char *fmt, va_list args); -__printf(2, 0) -int vprintk_default(int level, const char *fmt, va_list args); +int __printf(1, 0) vprintk_default(const char *fmt, va_list args); #ifdef CONFIG_PRINTK_NMI @@ -33,10 +31,9 @@ extern raw_spinlock_t logbuf_lock; * via per-CPU variable. */ DECLARE_PER_CPU(printk_func_t, printk_func); -__printf(2, 0) -static inline int vprintk_func(int level, const char *fmt, va_list args) +static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args) { - return this_cpu_read(printk_func)(level, fmt, args); + return this_cpu_read(printk_func)(fmt, args); } extern atomic_t nmi_message_lost; @@ -47,10 +44,9 @@ static inline int get_nmi_message_lost(void) #else /* CONFIG_PRINTK_NMI */ -__printf(2, 0) -static inline int vprintk_func(int level, const char *fmt, va_list args) +static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args) { - return vprintk_default(level, fmt, args); + return vprintk_default(fmt, args); } static inline int get_nmi_message_lost(void) diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c index bc3eeb1ae6da..b69eb8a2876f 100644 --- a/kernel/printk/nmi.c +++ b/kernel/printk/nmi.c @@ -58,7 +58,7 @@ static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq); * one writer running. But the buffer might get flushed from another * CPU, so we need to be careful. */ -static int vprintk_nmi(int level, const char *fmt, va_list args) +static int vprintk_nmi(const char *fmt, va_list args) { struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq); int add = 0; @@ -79,16 +79,7 @@ again: if (!len) smp_rmb(); - if (level != LOGLEVEL_DEFAULT) { - add = snprintf(s->buffer + len, sizeof(s->buffer) - len, - KERN_SOH "%c", '0' + level); - add += vsnprintf(s->buffer + len + add, - sizeof(s->buffer) - len - add, - fmt, args); - } else { - add = vsnprintf(s->buffer + len, sizeof(s->buffer) - len, - fmt, args); - } + add = vsnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args); /* * Do it once again if the buffer has been flushed in the meantime. diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index a37fc8cf8e84..eea6dbc2d8cf 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1930,26 +1930,7 @@ asmlinkage int printk_emit(int facility, int level, } EXPORT_SYMBOL(printk_emit); -#define define_pr_level(func, loglevel) \ -asmlinkage __visible void func(const char *fmt, ...) \ -{ \ - va_list args; \ - \ - va_start(args, fmt); \ - vprintk_default(loglevel, fmt, args); \ - va_end(args); \ -} \ -EXPORT_SYMBOL(func) - -define_pr_level(__pr_emerg, LOGLEVEL_EMERG); -define_pr_level(__pr_alert, LOGLEVEL_ALERT); -define_pr_level(__pr_crit, LOGLEVEL_CRIT); -define_pr_level(__pr_err, LOGLEVEL_ERR); -define_pr_level(__pr_warn, LOGLEVEL_WARNING); -define_pr_level(__pr_notice, LOGLEVEL_NOTICE); -define_pr_level(__pr_info, LOGLEVEL_INFO); - -int vprintk_default(int level, const char *fmt, va_list args) +int vprintk_default(const char *fmt, va_list args) { int r; @@ -1959,7 +1940,7 @@ int vprintk_default(int level, const char *fmt, va_list args) return r; } #endif - r = vprintk_emit(0, level, NULL, 0, fmt, args); + r = vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args); return r; } @@ -1992,7 +1973,7 @@ asmlinkage __visible int printk(const char *fmt, ...) int r; va_start(args, fmt); - r = vprintk_func(LOGLEVEL_DEFAULT, fmt, args); + r = vprintk_func(fmt, args); va_end(args); return r; -- cgit v1.2.3 From db4a835601b73cf8d6cd8986381d966b8e13d2d9 Mon Sep 17 00:00:00 2001 From: David Carrillo-Cisneros Date: Tue, 2 Aug 2016 00:48:12 -0700 Subject: perf/core: Set cgroup in CPU contexts for new cgroup events There's a perf stat bug easy to observer on a machine with only one cgroup: $ perf stat -e cycles -I 1000 -C 0 -G / # time counts unit events 1.000161699 cycles / 2.000355591 cycles / 3.000565154 cycles / 4.000951350 cycles / We'd expect some output there. The underlying problem is that there is an optimization in perf_cgroup_sched_{in,out}() that skips the switch of cgroup events if the old and new cgroups in a task switch are the same. This optimization interacts with the current code in two ways that cause a CPU context's cgroup (cpuctx->cgrp) to be NULL even if a cgroup event matches the current task. These are: 1. On creation of the first cgroup event in a CPU: In current code, cpuctx->cpu is only set in perf_cgroup_sched_in, but due to the aforesaid optimization, perf_cgroup_sched_in will run until the next cgroup switches in that CPU. This may happen late or never happen, depending on system's number of cgroups, CPU load, etc. 2. On deletion of the last cgroup event in a cpuctx: In list_del_event, cpuctx->cgrp is set NULL. Any new cgroup event will not be sched in because cpuctx->cgrp == NULL until a cgroup switch occurs and perf_cgroup_sched_in is executed (updating cpuctx->cgrp). This patch fixes both problems by setting cpuctx->cgrp in list_add_event, mirroring what list_del_event does when removing a cgroup event from CPU context, as introduced in: commit 68cacd29167b ("perf_events: Fix stale ->cgrp pointer in update_cgrp_time_from_cpuctx()") With this patch, cpuctx->cgrp is always set/clear when installing/removing the first/last cgroup event in/from the CPU context. With cpuctx->cgrp correctly set, event_filter_match works as intended when events are sched in/out. After the fix, the output is as expected: $ perf stat -e cycles -I 1000 -a -G / # time counts unit events 1.004699159 627342882 cycles / 2.007397156 615272690 cycles / 3.010019057 616726074 cycles / Signed-off-by: David Carrillo-Cisneros Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Paul Turner Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vegard Nossum Cc: Vince Weaver Link: http://lkml.kernel.org/r/1470124092-113192-1-git-send-email-davidcc@google.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 4 ++++ kernel/events/core.c | 54 ++++++++++++++++++++++++++++++---------------- 2 files changed, 40 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 8ed4326164cc..2b6b43cc0dd5 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -743,7 +743,9 @@ struct perf_event_context { u64 parent_gen; u64 generation; int pin_count; +#ifdef CONFIG_CGROUP_PERF int nr_cgroups; /* cgroup evts */ +#endif void *task_ctx_data; /* pmu specific data */ struct rcu_head rcu_head; }; @@ -769,7 +771,9 @@ struct perf_cpu_context { unsigned int hrtimer_active; struct pmu *unique_pmu; +#ifdef CONFIG_CGROUP_PERF struct perf_cgroup *cgrp; +#endif }; struct perf_output_handle { diff --git a/kernel/events/core.c b/kernel/events/core.c index 87d02b8cb87e..1903b8f3a705 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -843,6 +843,32 @@ perf_cgroup_mark_enabled(struct perf_event *event, } } } + +/* + * Update cpuctx->cgrp so that it is set when first cgroup event is added and + * cleared when last cgroup event is removed. + */ +static inline void +list_update_cgroup_event(struct perf_event *event, + struct perf_event_context *ctx, bool add) +{ + struct perf_cpu_context *cpuctx; + + if (!is_cgroup_event(event)) + return; + + if (add && ctx->nr_cgroups++) + return; + else if (!add && --ctx->nr_cgroups) + return; + /* + * Because cgroup events are always per-cpu events, + * this will always be called from the right CPU. + */ + cpuctx = __get_cpu_context(ctx); + cpuctx->cgrp = add ? event->cgrp : NULL; +} + #else /* !CONFIG_CGROUP_PERF */ static inline bool @@ -920,6 +946,13 @@ perf_cgroup_mark_enabled(struct perf_event *event, struct perf_event_context *ctx) { } + +static inline void +list_update_cgroup_event(struct perf_event *event, + struct perf_event_context *ctx, bool add) +{ +} + #endif /* @@ -1392,6 +1425,7 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) static void list_add_event(struct perf_event *event, struct perf_event_context *ctx) { + lockdep_assert_held(&ctx->lock); WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); @@ -1412,8 +1446,7 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) list_add_tail(&event->group_entry, list); } - if (is_cgroup_event(event)) - ctx->nr_cgroups++; + list_update_cgroup_event(event, ctx, true); list_add_rcu(&event->event_entry, &ctx->event_list); ctx->nr_events++; @@ -1581,8 +1614,6 @@ static void perf_group_attach(struct perf_event *event) static void list_del_event(struct perf_event *event, struct perf_event_context *ctx) { - struct perf_cpu_context *cpuctx; - WARN_ON_ONCE(event->ctx != ctx); lockdep_assert_held(&ctx->lock); @@ -1594,20 +1625,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) event->attach_state &= ~PERF_ATTACH_CONTEXT; - if (is_cgroup_event(event)) { - ctx->nr_cgroups--; - /* - * Because cgroup events are always per-cpu events, this will - * always be called from the right CPU. - */ - cpuctx = __get_cpu_context(ctx); - /* - * If there are no more cgroup events then clear cgrp to avoid - * stale pointer in update_cgrp_time_from_cpuctx(). - */ - if (!ctx->nr_cgroups) - cpuctx->cgrp = NULL; - } + list_update_cgroup_event(event, ctx, false); ctx->nr_events--; if (event->attr.inherit_stat) -- cgit v1.2.3 From 1ea049b2de5d803374fdbf43add23c8d1c518e7b Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 11 Aug 2016 10:15:56 +0200 Subject: bvec: avoid variable shadowing warning Due to the (indirect) nesting of min(..., min(...)), sparse will show a variable shadowing warning whenever bvec.h is included. Avoid that by assigning the inner min() to a temporary variable first. Signed-off-by: Johannes Berg Signed-off-by: Jens Axboe --- include/linux/bvec.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bvec.h b/include/linux/bvec.h index 701b64a3b7c5..89b65b82d98f 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -74,7 +74,8 @@ static inline void bvec_iter_advance(const struct bio_vec *bv, "Attempted to advance past end of bvec iter\n"); while (bytes) { - unsigned len = min(bytes, bvec_iter_len(bv, *iter)); + unsigned iter_len = bvec_iter_len(bv, *iter); + unsigned len = min(bytes, iter_len); bytes -= len; iter->bi_size -= len; -- cgit v1.2.3 From 023e9fddc3616b005c3753fc1bb6526388cd7a30 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Tue, 9 Aug 2016 19:13:00 +0200 Subject: KVM: PPC: Move xics_debugfs_init out of create MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As we are about to hold the kvm->lock during the create operation on KVM devices, we should move the call to xics_debugfs_init into its own function, since holding a mutex over extended amounts of time might not be a good idea. Introduce an init operation on the kvm_device_ops struct which cannot fail and call this, if configured, after the device has been created. Signed-off-by: Christoffer Dall Reviewed-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/powerpc/kvm/book3s_xics.c | 10 ++++++++-- include/linux/kvm_host.h | 6 ++++++ virt/kvm/kvm_main.c | 3 +++ 3 files changed, 17 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index a75ba38a2d81..f2def8e45fef 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -1341,8 +1341,6 @@ static int kvmppc_xics_create(struct kvm_device *dev, u32 type) return ret; } - xics_debugfs_init(xics); - #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE if (cpu_has_feature(CPU_FTR_ARCH_206)) { /* Enable real mode support */ @@ -1354,9 +1352,17 @@ static int kvmppc_xics_create(struct kvm_device *dev, u32 type) return 0; } +static void kvmppc_xics_init(struct kvm_device *dev) +{ + struct kvmppc_xics *xics = (struct kvmppc_xics *)dev->private; + + xics_debugfs_init(xics); +} + struct kvm_device_ops kvm_xics_ops = { .name = "kvm-xics", .create = kvmppc_xics_create, + .init = kvmppc_xics_init, .destroy = kvmppc_xics_free, .set_attr = xics_set_attr, .get_attr = xics_get_attr, diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 01e908ac4a39..d3c9b82812c3 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1115,6 +1115,12 @@ struct kvm_device_ops { const char *name; int (*create)(struct kvm_device *dev, u32 type); + /* + * init is called after create if create is successful and is called + * outside of holding kvm->lock. + */ + void (*init)(struct kvm_device *dev); + /* * Destroy is responsible for freeing dev. * diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index cc081ccfcaa3..ae642452e91a 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2838,6 +2838,9 @@ static int kvm_ioctl_create_device(struct kvm *kvm, return ret; } + if (ops->init) + ops->init(dev); + ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC); if (ret < 0) { ops->destroy(dev); -- cgit v1.2.3 From a28ebea2adc4a2bef5989a5a181ec238f59fbcad Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Tue, 9 Aug 2016 19:13:01 +0200 Subject: KVM: Protect device ops->create and list_add with kvm->lock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit KVM devices were manipulating list data structures without any form of synchronization, and some implementations of the create operations also suffered from a lack of synchronization. Now when we've split the xics create operation into create and init, we can hold the kvm->lock mutex while calling the create operation and when manipulating the devices list. The error path in the generic code gets slightly ugly because we have to take the mutex again and delete the device from the list, but holding the mutex during anon_inode_getfd or releasing/locking the mutex in the common non-error path seemed wrong. Signed-off-by: Christoffer Dall Reviewed-by: Paolo Bonzini Acked-by: Christian Borntraeger Signed-off-by: Radim Krčmář --- arch/arm/kvm/arm.c | 6 +++++- arch/powerpc/kvm/book3s_xics.c | 2 -- include/linux/kvm_host.h | 6 ++++++ virt/kvm/arm/vgic/vgic-init.c | 17 ++++------------- virt/kvm/kvm_main.c | 13 ++++++++++++- 5 files changed, 27 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index d94bb9093ead..75f130ef6504 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -1009,9 +1009,13 @@ long kvm_arch_vm_ioctl(struct file *filp, switch (ioctl) { case KVM_CREATE_IRQCHIP: { + int ret; if (!vgic_present) return -ENXIO; - return kvm_vgic_create(kvm, KVM_DEV_TYPE_ARM_VGIC_V2); + mutex_lock(&kvm->lock); + ret = kvm_vgic_create(kvm, KVM_DEV_TYPE_ARM_VGIC_V2); + mutex_unlock(&kvm->lock); + return ret; } case KVM_ARM_SET_DEVICE_ADDR: { struct kvm_arm_device_addr dev_addr; diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index f2def8e45fef..05aa11399a78 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -1329,12 +1329,10 @@ static int kvmppc_xics_create(struct kvm_device *dev, u32 type) xics->kvm = kvm; /* Already there ? */ - mutex_lock(&kvm->lock); if (kvm->arch.xics) ret = -EEXIST; else kvm->arch.xics = xics; - mutex_unlock(&kvm->lock); if (ret) { kfree(xics); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index d3c9b82812c3..9c28b4d4c90b 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1113,6 +1113,12 @@ struct kvm_device { /* create, destroy, and name are mandatory */ struct kvm_device_ops { const char *name; + + /* + * create is called holding kvm->lock and any operations not suitable + * to do while holding the lock should be deferred to init (see + * below). + */ int (*create)(struct kvm_device *dev, u32 type); /* diff --git a/virt/kvm/arm/vgic/vgic-init.c b/virt/kvm/arm/vgic/vgic-init.c index fb4b0a79a950..83777c1cbae0 100644 --- a/virt/kvm/arm/vgic/vgic-init.c +++ b/virt/kvm/arm/vgic/vgic-init.c @@ -73,12 +73,8 @@ int kvm_vgic_create(struct kvm *kvm, u32 type) int i, vcpu_lock_idx = -1, ret; struct kvm_vcpu *vcpu; - mutex_lock(&kvm->lock); - - if (irqchip_in_kernel(kvm)) { - ret = -EEXIST; - goto out; - } + if (irqchip_in_kernel(kvm)) + return -EEXIST; /* * This function is also called by the KVM_CREATE_IRQCHIP handler, @@ -87,10 +83,8 @@ int kvm_vgic_create(struct kvm *kvm, u32 type) * the proper checks already. */ if (type == KVM_DEV_TYPE_ARM_VGIC_V2 && - !kvm_vgic_global_state.can_emulate_gicv2) { - ret = -ENODEV; - goto out; - } + !kvm_vgic_global_state.can_emulate_gicv2) + return -ENODEV; /* * Any time a vcpu is run, vcpu_load is called which tries to grab the @@ -138,9 +132,6 @@ out_unlock: vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx); mutex_unlock(&vcpu->mutex); } - -out: - mutex_unlock(&kvm->lock); return ret; } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ae642452e91a..195078225aa5 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -696,6 +696,11 @@ static void kvm_destroy_devices(struct kvm *kvm) { struct kvm_device *dev, *tmp; + /* + * We do not need to take the kvm->lock here, because nobody else + * has a reference to the struct kvm at this point and therefore + * cannot access the devices list anyhow. + */ list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) { list_del(&dev->vm_node); dev->ops->destroy(dev); @@ -2832,11 +2837,15 @@ static int kvm_ioctl_create_device(struct kvm *kvm, dev->ops = ops; dev->kvm = kvm; + mutex_lock(&kvm->lock); ret = ops->create(dev, cd->type); if (ret < 0) { + mutex_unlock(&kvm->lock); kfree(dev); return ret; } + list_add(&dev->vm_node, &kvm->devices); + mutex_unlock(&kvm->lock); if (ops->init) ops->init(dev); @@ -2844,10 +2853,12 @@ static int kvm_ioctl_create_device(struct kvm *kvm, ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC); if (ret < 0) { ops->destroy(dev); + mutex_lock(&kvm->lock); + list_del(&dev->vm_node); + mutex_unlock(&kvm->lock); return ret; } - list_add(&dev->vm_node, &kvm->devices); kvm_get_kvm(kvm); cd->fd = ret; return 0; -- cgit v1.2.3