From 3851f1cdb2b8d507b10395fc110d4c37d6121285 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Thu, 4 Aug 2016 00:08:45 -0400
Subject: SUNRPC: Limit the reconnect backoff timer to the max RPC message
 timeout

...and ensure that we propagate it to new transports on the same
client.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 include/linux/sunrpc/xprt.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 5e3e1b63dbb3..a16070dd03ee 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -218,7 +218,8 @@ struct rpc_xprt {
 	struct work_struct	task_cleanup;
 	struct timer_list	timer;
 	unsigned long		last_used,
-				idle_timeout;
+				idle_timeout,
+				max_reconnect_timeout;
 
 	/*
 	 * Send stuff
-- 
cgit v1.2.3


From 8d480326c3d6921ff5f1cc988c993bd572248deb Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Fri, 5 Aug 2016 19:03:31 -0400
Subject: NFSv4: Cap the transport reconnection timer at 1/2 lease period

We don't want to miss a lease period renewal due to the TCP connection
failing to reconnect in a timely fashion. To ensure this doesn't happen,
cap the reconnection timer so that we retry the connection attempt
at least every 1/2 lease period.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4renewd.c         |  3 +++
 include/linux/sunrpc/clnt.h |  2 ++
 net/sunrpc/clnt.c           | 21 +++++++++++++++++++++
 3 files changed, 26 insertions(+)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 062029ab344a..82e77198d17e 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -151,6 +151,9 @@ void nfs4_set_lease_period(struct nfs_client *clp,
 	clp->cl_lease_time = lease;
 	clp->cl_last_renewal = lastrenewed;
 	spin_unlock(&clp->cl_lock);
+
+	/* Cap maximum reconnect timeout at 1/2 lease period */
+	rpc_cap_max_reconnect_timeout(clp->cl_rpcclient, lease >> 1);
 }
 
 /*
diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index b6810c92b8bb..5c02b0691587 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -195,6 +195,8 @@ int		rpc_clnt_add_xprt(struct rpc_clnt *, struct xprt_create *,
 				struct rpc_xprt *,
 				void *),
 			void *data);
+void		rpc_cap_max_reconnect_timeout(struct rpc_clnt *clnt,
+			unsigned long timeo);
 
 const char *rpc_proc_name(const struct rpc_task *task);
 #endif /* __KERNEL__ */
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index faac5472d14d..7f79fb7dc6a0 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -2676,6 +2676,27 @@ out_put_switch:
 }
 EXPORT_SYMBOL_GPL(rpc_clnt_add_xprt);
 
+static int
+rpc_xprt_cap_max_reconnect_timeout(struct rpc_clnt *clnt,
+		struct rpc_xprt *xprt,
+		void *data)
+{
+	unsigned long timeout = *((unsigned long *)data);
+
+	if (timeout < xprt->max_reconnect_timeout)
+		xprt->max_reconnect_timeout = timeout;
+	return 0;
+}
+
+void
+rpc_cap_max_reconnect_timeout(struct rpc_clnt *clnt, unsigned long timeo)
+{
+	rpc_clnt_iterate_for_each_xprt(clnt,
+			rpc_xprt_cap_max_reconnect_timeout,
+			&timeo);
+}
+EXPORT_SYMBOL_GPL(rpc_cap_max_reconnect_timeout);
+
 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 static void rpc_show_header(void)
 {
-- 
cgit v1.2.3


From a0cba2179ea4c1820fce2ee046b6ed90ecc56196 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 9 Aug 2016 10:48:18 -0700
Subject: Revert "printk: create pr_<level> functions"

This reverts commit 874f9c7da9a4acbc1b9e12ca722579fb50e4d142.

Geert Uytterhoeven reports:
 "This change seems to have an (unintendent?) side-effect.

  Before, pr_*() calls without a trailing newline characters would be
  printed with a newline character appended, both on the console and in
  the output of the dmesg command.

  After this commit, no new line character is appended, and the output
  of the next pr_*() call of the same type may be appended, like in:

    - Truncating RAM at 0x0000000040000000-0x00000000c0000000 to -0x0000000070000000
    - Ignoring RAM at 0x0000000200000000-0x0000000240000000 (!CONFIG_HIGHMEM)
    + Truncating RAM at 0x0000000040000000-0x00000000c0000000 to -0x0000000070000000Ignoring RAM at 0x0000000200000000-0x0000000240000000 (!CONFIG_HIGHMEM)"

Joe Perches says:
 "No, that is not intentional.

  The newline handling code inside vprintk_emit is a bit involved and
  for now I suggest a revert until this has all the same behavior as
  earlier"

Reported-by: Geert Uytterhoeven <geert@linux-m68k.org>
Requested-by: Joe Perches <joe@perches.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/printk.h   | 48 +++++++++++++++---------------------------------
 kernel/printk/internal.h | 16 ++++++----------
 kernel/printk/nmi.c      | 13 ++-----------
 kernel/printk/printk.c   | 25 +++----------------------
 4 files changed, 26 insertions(+), 76 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/printk.h b/include/linux/printk.h
index 8dc155dab3ed..696a56be7d3e 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -266,39 +266,21 @@ extern asmlinkage void dump_stack(void) __cold;
  * and other debug macros are compiled out unless either DEBUG is defined
  * or CONFIG_DYNAMIC_DEBUG is set.
  */
-
-#ifdef CONFIG_PRINTK
-
-asmlinkage __printf(1, 2) __cold void __pr_emerg(const char *fmt, ...);
-asmlinkage __printf(1, 2) __cold void __pr_alert(const char *fmt, ...);
-asmlinkage __printf(1, 2) __cold void __pr_crit(const char *fmt, ...);
-asmlinkage __printf(1, 2) __cold void __pr_err(const char *fmt, ...);
-asmlinkage __printf(1, 2) __cold void __pr_warn(const char *fmt, ...);
-asmlinkage __printf(1, 2) __cold void __pr_notice(const char *fmt, ...);
-asmlinkage __printf(1, 2) __cold void __pr_info(const char *fmt, ...);
-
-#define pr_emerg(fmt, ...)	__pr_emerg(pr_fmt(fmt), ##__VA_ARGS__)
-#define pr_alert(fmt, ...)	__pr_alert(pr_fmt(fmt), ##__VA_ARGS__)
-#define pr_crit(fmt, ...)	__pr_crit(pr_fmt(fmt), ##__VA_ARGS__)
-#define pr_err(fmt, ...)	__pr_err(pr_fmt(fmt), ##__VA_ARGS__)
-#define pr_warn(fmt, ...)	__pr_warn(pr_fmt(fmt), ##__VA_ARGS__)
-#define pr_notice(fmt, ...)	__pr_notice(pr_fmt(fmt), ##__VA_ARGS__)
-#define pr_info(fmt, ...)	__pr_info(pr_fmt(fmt), ##__VA_ARGS__)
-
-#else
-
-#define pr_emerg(fmt, ...)	printk(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__)
-#define pr_alert(fmt, ...)	printk(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__)
-#define pr_crit(fmt, ...)	printk(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__)
-#define pr_err(fmt, ...)	printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
-#define pr_warn(fmt, ...)	printk(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__)
-#define pr_notice(fmt, ...)	printk(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__)
-#define pr_info(fmt, ...)	printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
-
-#endif
-
-#define pr_warning pr_warn
-
+#define pr_emerg(fmt, ...) \
+	printk(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_alert(fmt, ...) \
+	printk(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_crit(fmt, ...) \
+	printk(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_err(fmt, ...) \
+	printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_warning(fmt, ...) \
+	printk(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_warn pr_warning
+#define pr_notice(fmt, ...) \
+	printk(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_info(fmt, ...) \
+	printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
 /*
  * Like KERN_CONT, pr_cont() should only be used when continuing
  * a line with no newline ('\n') enclosed. Otherwise it defaults
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 5d4505f30083..7fd2838fa417 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -16,11 +16,9 @@
  */
 #include <linux/percpu.h>
 
-typedef __printf(2, 0) int (*printk_func_t)(int level, const char *fmt,
-					    va_list args);
+typedef __printf(1, 0) int (*printk_func_t)(const char *fmt, va_list args);
 
-__printf(2, 0)
-int vprintk_default(int level, const char *fmt, va_list args);
+int __printf(1, 0) vprintk_default(const char *fmt, va_list args);
 
 #ifdef CONFIG_PRINTK_NMI
 
@@ -33,10 +31,9 @@ extern raw_spinlock_t logbuf_lock;
  * via per-CPU variable.
  */
 DECLARE_PER_CPU(printk_func_t, printk_func);
-__printf(2, 0)
-static inline int vprintk_func(int level, const char *fmt, va_list args)
+static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args)
 {
-	return this_cpu_read(printk_func)(level, fmt, args);
+	return this_cpu_read(printk_func)(fmt, args);
 }
 
 extern atomic_t nmi_message_lost;
@@ -47,10 +44,9 @@ static inline int get_nmi_message_lost(void)
 
 #else /* CONFIG_PRINTK_NMI */
 
-__printf(2, 0)
-static inline int vprintk_func(int level, const char *fmt, va_list args)
+static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args)
 {
-	return vprintk_default(level, fmt, args);
+	return vprintk_default(fmt, args);
 }
 
 static inline int get_nmi_message_lost(void)
diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c
index bc3eeb1ae6da..b69eb8a2876f 100644
--- a/kernel/printk/nmi.c
+++ b/kernel/printk/nmi.c
@@ -58,7 +58,7 @@ static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq);
  * one writer running. But the buffer might get flushed from another
  * CPU, so we need to be careful.
  */
-static int vprintk_nmi(int level, const char *fmt, va_list args)
+static int vprintk_nmi(const char *fmt, va_list args)
 {
 	struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
 	int add = 0;
@@ -79,16 +79,7 @@ again:
 	if (!len)
 		smp_rmb();
 
-	if (level != LOGLEVEL_DEFAULT) {
-		add = snprintf(s->buffer + len, sizeof(s->buffer) - len,
-				KERN_SOH "%c", '0' + level);
-		add += vsnprintf(s->buffer + len + add,
-				 sizeof(s->buffer) - len - add,
-				 fmt, args);
-	} else {
-		add = vsnprintf(s->buffer + len, sizeof(s->buffer) - len,
-				fmt, args);
-	}
+	add = vsnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args);
 
 	/*
 	 * Do it once again if the buffer has been flushed in the meantime.
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index a37fc8cf8e84..eea6dbc2d8cf 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1930,26 +1930,7 @@ asmlinkage int printk_emit(int facility, int level,
 }
 EXPORT_SYMBOL(printk_emit);
 
-#define define_pr_level(func, loglevel)				\
-asmlinkage __visible void func(const char *fmt, ...)		\
-{								\
-	va_list args;						\
-								\
-	va_start(args, fmt);					\
-	vprintk_default(loglevel, fmt, args);			\
-	va_end(args);						\
-}								\
-EXPORT_SYMBOL(func)
-
-define_pr_level(__pr_emerg, LOGLEVEL_EMERG);
-define_pr_level(__pr_alert, LOGLEVEL_ALERT);
-define_pr_level(__pr_crit, LOGLEVEL_CRIT);
-define_pr_level(__pr_err, LOGLEVEL_ERR);
-define_pr_level(__pr_warn, LOGLEVEL_WARNING);
-define_pr_level(__pr_notice, LOGLEVEL_NOTICE);
-define_pr_level(__pr_info, LOGLEVEL_INFO);
-
-int vprintk_default(int level, const char *fmt, va_list args)
+int vprintk_default(const char *fmt, va_list args)
 {
 	int r;
 
@@ -1959,7 +1940,7 @@ int vprintk_default(int level, const char *fmt, va_list args)
 		return r;
 	}
 #endif
-	r = vprintk_emit(0, level, NULL, 0, fmt, args);
+	r = vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
 
 	return r;
 }
@@ -1992,7 +1973,7 @@ asmlinkage __visible int printk(const char *fmt, ...)
 	int r;
 
 	va_start(args, fmt);
-	r = vprintk_func(LOGLEVEL_DEFAULT, fmt, args);
+	r = vprintk_func(fmt, args);
 	va_end(args);
 
 	return r;
-- 
cgit v1.2.3


From db4a835601b73cf8d6cd8986381d966b8e13d2d9 Mon Sep 17 00:00:00 2001
From: David Carrillo-Cisneros <davidcc@google.com>
Date: Tue, 2 Aug 2016 00:48:12 -0700
Subject: perf/core: Set cgroup in CPU contexts for new cgroup events

There's a perf stat bug easy to observer on a machine with only one cgroup:

  $ perf stat -e cycles -I 1000 -C 0 -G /
  #          time             counts unit events
      1.000161699      <not counted>      cycles                    /
      2.000355591      <not counted>      cycles                    /
      3.000565154      <not counted>      cycles                    /
      4.000951350      <not counted>      cycles                    /

We'd expect some output there.

The underlying problem is that there is an optimization in
perf_cgroup_sched_{in,out}() that skips the switch of cgroup events
if the old and new cgroups in a task switch are the same.

This optimization interacts with the current code in two ways
that cause a CPU context's cgroup (cpuctx->cgrp) to be NULL even if a
cgroup event matches the current task. These are:

  1. On creation of the first cgroup event in a CPU: In current code,
  cpuctx->cpu is only set in perf_cgroup_sched_in, but due to the
  aforesaid optimization, perf_cgroup_sched_in will run until the next
  cgroup switches in that CPU. This may happen late or never happen,
  depending on system's number of cgroups, CPU load, etc.

  2. On deletion of the last cgroup event in a cpuctx: In list_del_event,
  cpuctx->cgrp is set NULL. Any new cgroup event will not be sched in
  because cpuctx->cgrp == NULL until a cgroup switch occurs and
  perf_cgroup_sched_in is executed (updating cpuctx->cgrp).

This patch fixes both problems by setting cpuctx->cgrp in list_add_event,
mirroring what list_del_event does when removing a cgroup event from CPU
context, as introduced in:

  commit 68cacd29167b ("perf_events: Fix stale ->cgrp pointer in update_cgrp_time_from_cpuctx()")

With this patch, cpuctx->cgrp is always set/clear when installing/removing
the first/last cgroup event in/from the CPU context. With cpuctx->cgrp
correctly set, event_filter_match works as intended when events are
sched in/out.

After the fix, the output is as expected:

  $ perf stat -e cycles -I 1000 -a -G /
  #         time             counts unit events
     1.004699159          627342882      cycles                    /
     2.007397156          615272690      cycles                    /
     3.010019057          616726074      cycles                    /

Signed-off-by: David Carrillo-Cisneros <davidcc@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kan Liang <kan.liang@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Turner <pjt@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vegard Nossum <vegard.nossum@gmail.com>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Link: http://lkml.kernel.org/r/1470124092-113192-1-git-send-email-davidcc@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h |  4 ++++
 kernel/events/core.c       | 54 ++++++++++++++++++++++++++++++----------------
 2 files changed, 40 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 8ed4326164cc..2b6b43cc0dd5 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -743,7 +743,9 @@ struct perf_event_context {
 	u64				parent_gen;
 	u64				generation;
 	int				pin_count;
+#ifdef CONFIG_CGROUP_PERF
 	int				nr_cgroups;	 /* cgroup evts */
+#endif
 	void				*task_ctx_data; /* pmu specific data */
 	struct rcu_head			rcu_head;
 };
@@ -769,7 +771,9 @@ struct perf_cpu_context {
 	unsigned int			hrtimer_active;
 
 	struct pmu			*unique_pmu;
+#ifdef CONFIG_CGROUP_PERF
 	struct perf_cgroup		*cgrp;
+#endif
 };
 
 struct perf_output_handle {
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 87d02b8cb87e..1903b8f3a705 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -843,6 +843,32 @@ perf_cgroup_mark_enabled(struct perf_event *event,
 		}
 	}
 }
+
+/*
+ * Update cpuctx->cgrp so that it is set when first cgroup event is added and
+ * cleared when last cgroup event is removed.
+ */
+static inline void
+list_update_cgroup_event(struct perf_event *event,
+			 struct perf_event_context *ctx, bool add)
+{
+	struct perf_cpu_context *cpuctx;
+
+	if (!is_cgroup_event(event))
+		return;
+
+	if (add && ctx->nr_cgroups++)
+		return;
+	else if (!add && --ctx->nr_cgroups)
+		return;
+	/*
+	 * Because cgroup events are always per-cpu events,
+	 * this will always be called from the right CPU.
+	 */
+	cpuctx = __get_cpu_context(ctx);
+	cpuctx->cgrp = add ? event->cgrp : NULL;
+}
+
 #else /* !CONFIG_CGROUP_PERF */
 
 static inline bool
@@ -920,6 +946,13 @@ perf_cgroup_mark_enabled(struct perf_event *event,
 			 struct perf_event_context *ctx)
 {
 }
+
+static inline void
+list_update_cgroup_event(struct perf_event *event,
+			 struct perf_event_context *ctx, bool add)
+{
+}
+
 #endif
 
 /*
@@ -1392,6 +1425,7 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
 static void
 list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 {
+
 	lockdep_assert_held(&ctx->lock);
 
 	WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
@@ -1412,8 +1446,7 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 		list_add_tail(&event->group_entry, list);
 	}
 
-	if (is_cgroup_event(event))
-		ctx->nr_cgroups++;
+	list_update_cgroup_event(event, ctx, true);
 
 	list_add_rcu(&event->event_entry, &ctx->event_list);
 	ctx->nr_events++;
@@ -1581,8 +1614,6 @@ static void perf_group_attach(struct perf_event *event)
 static void
 list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 {
-	struct perf_cpu_context *cpuctx;
-
 	WARN_ON_ONCE(event->ctx != ctx);
 	lockdep_assert_held(&ctx->lock);
 
@@ -1594,20 +1625,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 
 	event->attach_state &= ~PERF_ATTACH_CONTEXT;
 
-	if (is_cgroup_event(event)) {
-		ctx->nr_cgroups--;
-		/*
-		 * Because cgroup events are always per-cpu events, this will
-		 * always be called from the right CPU.
-		 */
-		cpuctx = __get_cpu_context(ctx);
-		/*
-		 * If there are no more cgroup events then clear cgrp to avoid
-		 * stale pointer in update_cgrp_time_from_cpuctx().
-		 */
-		if (!ctx->nr_cgroups)
-			cpuctx->cgrp = NULL;
-	}
+	list_update_cgroup_event(event, ctx, false);
 
 	ctx->nr_events--;
 	if (event->attr.inherit_stat)
-- 
cgit v1.2.3


From 1ea049b2de5d803374fdbf43add23c8d1c518e7b Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Thu, 11 Aug 2016 10:15:56 +0200
Subject: bvec: avoid variable shadowing warning

Due to the (indirect) nesting of min(..., min(...)), sparse will
show a variable shadowing warning whenever bvec.h is included.

Avoid that by assigning the inner min() to a temporary variable first.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/bvec.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bvec.h b/include/linux/bvec.h
index 701b64a3b7c5..89b65b82d98f 100644
--- a/include/linux/bvec.h
+++ b/include/linux/bvec.h
@@ -74,7 +74,8 @@ static inline void bvec_iter_advance(const struct bio_vec *bv,
 		  "Attempted to advance past end of bvec iter\n");
 
 	while (bytes) {
-		unsigned len = min(bytes, bvec_iter_len(bv, *iter));
+		unsigned iter_len = bvec_iter_len(bv, *iter);
+		unsigned len = min(bytes, iter_len);
 
 		bytes -= len;
 		iter->bi_size -= len;
-- 
cgit v1.2.3


From 023e9fddc3616b005c3753fc1bb6526388cd7a30 Mon Sep 17 00:00:00 2001
From: Christoffer Dall <christoffer.dall@linaro.org>
Date: Tue, 9 Aug 2016 19:13:00 +0200
Subject: KVM: PPC: Move xics_debugfs_init out of create
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

As we are about to hold the kvm->lock during the create operation on KVM
devices, we should move the call to xics_debugfs_init into its own
function, since holding a mutex over extended amounts of time might not
be a good idea.

Introduce an init operation on the kvm_device_ops struct which cannot
fail and call this, if configured, after the device has been created.

Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/powerpc/kvm/book3s_xics.c | 10 ++++++++--
 include/linux/kvm_host.h       |  6 ++++++
 virt/kvm/kvm_main.c            |  3 +++
 3 files changed, 17 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
index a75ba38a2d81..f2def8e45fef 100644
--- a/arch/powerpc/kvm/book3s_xics.c
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -1341,8 +1341,6 @@ static int kvmppc_xics_create(struct kvm_device *dev, u32 type)
 		return ret;
 	}
 
-	xics_debugfs_init(xics);
-
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 	if (cpu_has_feature(CPU_FTR_ARCH_206)) {
 		/* Enable real mode support */
@@ -1354,9 +1352,17 @@ static int kvmppc_xics_create(struct kvm_device *dev, u32 type)
 	return 0;
 }
 
+static void kvmppc_xics_init(struct kvm_device *dev)
+{
+	struct kvmppc_xics *xics = (struct kvmppc_xics *)dev->private;
+
+	xics_debugfs_init(xics);
+}
+
 struct kvm_device_ops kvm_xics_ops = {
 	.name = "kvm-xics",
 	.create = kvmppc_xics_create,
+	.init = kvmppc_xics_init,
 	.destroy = kvmppc_xics_free,
 	.set_attr = xics_set_attr,
 	.get_attr = xics_get_attr,
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 01e908ac4a39..d3c9b82812c3 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1115,6 +1115,12 @@ struct kvm_device_ops {
 	const char *name;
 	int (*create)(struct kvm_device *dev, u32 type);
 
+	/*
+	 * init is called after create if create is successful and is called
+	 * outside of holding kvm->lock.
+	 */
+	void (*init)(struct kvm_device *dev);
+
 	/*
 	 * Destroy is responsible for freeing dev.
 	 *
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index cc081ccfcaa3..ae642452e91a 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2838,6 +2838,9 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
 		return ret;
 	}
 
+	if (ops->init)
+		ops->init(dev);
+
 	ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC);
 	if (ret < 0) {
 		ops->destroy(dev);
-- 
cgit v1.2.3


From a28ebea2adc4a2bef5989a5a181ec238f59fbcad Mon Sep 17 00:00:00 2001
From: Christoffer Dall <christoffer.dall@linaro.org>
Date: Tue, 9 Aug 2016 19:13:01 +0200
Subject: KVM: Protect device ops->create and list_add with kvm->lock
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

KVM devices were manipulating list data structures without any form of
synchronization, and some implementations of the create operations also
suffered from a lack of synchronization.

Now when we've split the xics create operation into create and init, we
can hold the kvm->lock mutex while calling the create operation and when
manipulating the devices list.

The error path in the generic code gets slightly ugly because we have to
take the mutex again and delete the device from the list, but holding
the mutex during anon_inode_getfd or releasing/locking the mutex in the
common non-error path seemed wrong.

Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/arm/kvm/arm.c             |  6 +++++-
 arch/powerpc/kvm/book3s_xics.c |  2 --
 include/linux/kvm_host.h       |  6 ++++++
 virt/kvm/arm/vgic/vgic-init.c  | 17 ++++-------------
 virt/kvm/kvm_main.c            | 13 ++++++++++++-
 5 files changed, 27 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index d94bb9093ead..75f130ef6504 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -1009,9 +1009,13 @@ long kvm_arch_vm_ioctl(struct file *filp,
 
 	switch (ioctl) {
 	case KVM_CREATE_IRQCHIP: {
+		int ret;
 		if (!vgic_present)
 			return -ENXIO;
-		return kvm_vgic_create(kvm, KVM_DEV_TYPE_ARM_VGIC_V2);
+		mutex_lock(&kvm->lock);
+		ret = kvm_vgic_create(kvm, KVM_DEV_TYPE_ARM_VGIC_V2);
+		mutex_unlock(&kvm->lock);
+		return ret;
 	}
 	case KVM_ARM_SET_DEVICE_ADDR: {
 		struct kvm_arm_device_addr dev_addr;
diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
index f2def8e45fef..05aa11399a78 100644
--- a/arch/powerpc/kvm/book3s_xics.c
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -1329,12 +1329,10 @@ static int kvmppc_xics_create(struct kvm_device *dev, u32 type)
 	xics->kvm = kvm;
 
 	/* Already there ? */
-	mutex_lock(&kvm->lock);
 	if (kvm->arch.xics)
 		ret = -EEXIST;
 	else
 		kvm->arch.xics = xics;
-	mutex_unlock(&kvm->lock);
 
 	if (ret) {
 		kfree(xics);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index d3c9b82812c3..9c28b4d4c90b 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1113,6 +1113,12 @@ struct kvm_device {
 /* create, destroy, and name are mandatory */
 struct kvm_device_ops {
 	const char *name;
+
+	/*
+	 * create is called holding kvm->lock and any operations not suitable
+	 * to do while holding the lock should be deferred to init (see
+	 * below).
+	 */
 	int (*create)(struct kvm_device *dev, u32 type);
 
 	/*
diff --git a/virt/kvm/arm/vgic/vgic-init.c b/virt/kvm/arm/vgic/vgic-init.c
index fb4b0a79a950..83777c1cbae0 100644
--- a/virt/kvm/arm/vgic/vgic-init.c
+++ b/virt/kvm/arm/vgic/vgic-init.c
@@ -73,12 +73,8 @@ int kvm_vgic_create(struct kvm *kvm, u32 type)
 	int i, vcpu_lock_idx = -1, ret;
 	struct kvm_vcpu *vcpu;
 
-	mutex_lock(&kvm->lock);
-
-	if (irqchip_in_kernel(kvm)) {
-		ret = -EEXIST;
-		goto out;
-	}
+	if (irqchip_in_kernel(kvm))
+		return -EEXIST;
 
 	/*
 	 * This function is also called by the KVM_CREATE_IRQCHIP handler,
@@ -87,10 +83,8 @@ int kvm_vgic_create(struct kvm *kvm, u32 type)
 	 * the proper checks already.
 	 */
 	if (type == KVM_DEV_TYPE_ARM_VGIC_V2 &&
-		!kvm_vgic_global_state.can_emulate_gicv2) {
-		ret = -ENODEV;
-		goto out;
-	}
+		!kvm_vgic_global_state.can_emulate_gicv2)
+		return -ENODEV;
 
 	/*
 	 * Any time a vcpu is run, vcpu_load is called which tries to grab the
@@ -138,9 +132,6 @@ out_unlock:
 		vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx);
 		mutex_unlock(&vcpu->mutex);
 	}
-
-out:
-	mutex_unlock(&kvm->lock);
 	return ret;
 }
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index ae642452e91a..195078225aa5 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -696,6 +696,11 @@ static void kvm_destroy_devices(struct kvm *kvm)
 {
 	struct kvm_device *dev, *tmp;
 
+	/*
+	 * We do not need to take the kvm->lock here, because nobody else
+	 * has a reference to the struct kvm at this point and therefore
+	 * cannot access the devices list anyhow.
+	 */
 	list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) {
 		list_del(&dev->vm_node);
 		dev->ops->destroy(dev);
@@ -2832,11 +2837,15 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
 	dev->ops = ops;
 	dev->kvm = kvm;
 
+	mutex_lock(&kvm->lock);
 	ret = ops->create(dev, cd->type);
 	if (ret < 0) {
+		mutex_unlock(&kvm->lock);
 		kfree(dev);
 		return ret;
 	}
+	list_add(&dev->vm_node, &kvm->devices);
+	mutex_unlock(&kvm->lock);
 
 	if (ops->init)
 		ops->init(dev);
@@ -2844,10 +2853,12 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
 	ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC);
 	if (ret < 0) {
 		ops->destroy(dev);
+		mutex_lock(&kvm->lock);
+		list_del(&dev->vm_node);
+		mutex_unlock(&kvm->lock);
 		return ret;
 	}
 
-	list_add(&dev->vm_node, &kvm->devices);
 	kvm_get_kvm(kvm);
 	cd->fd = ret;
 	return 0;
-- 
cgit v1.2.3