From 77f88796cee819b9c4562b0b6b44691b3b7755b1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 16 Mar 2017 16:54:24 -0400 Subject: cgroup, kthread: close race window where new kthreads can be migrated to non-root cgroups Creation of a kthread goes through a couple of interlocked stages between the kthread itself and its creator. Once the new kthread starts running, it initializes itself and wakes up the creator. The creator then can further configure the kthread and then let it start doing its job by waking it up. In this configuration-by-creator stage, the creator is the only one that can wake it up but the kthread is visible to userland. When altering the kthread's attributes from userland is allowed, this is fine; however, for cases where CPU affinity is critical, kthread_bind() is used to first disable affinity changes from userland and then set the affinity. This also prevents the kthread from being migrated into non-root cgroups as that can affect the CPU affinity and many other things. Unfortunately, the cgroup side of protection is racy. While the PF_NO_SETAFFINITY flag prevents further migrations, userland can win the race before the creator sets the flag with kthread_bind() and put the kthread in a non-root cgroup, which can lead to all sorts of problems including incorrect CPU affinity and starvation. This bug got triggered by userland which periodically tries to migrate all processes in the root cpuset cgroup to a non-root one. Per-cpu workqueue workers got caught while being created and ended up with incorrect CPU affinity, breaking concurrency management and sometimes stalling workqueue execution. This patch adds task->no_cgroup_migration which disallows the task to be migrated by userland. kthreadd starts with the flag set, making every child kthread start in the root cgroup with migration disallowed. The flag is cleared after the kthread finishes initialization, by which time PF_NO_SETAFFINITY is set if the kthread should stay in the root cgroup. It'd be better to wait for the initialization instead of failing, but I couldn't think of a way of implementing that without adding either a new PF flag or sleeping and retrying from the waiting side. Even if userland depends on changing cgroup membership of a kthread, it either has to be synchronized with kthread_create() or periodically repeat, so it's unlikely that this would break anything. v2: Switch to a simpler implementation using a new task_struct bit field suggested by Oleg. Signed-off-by: Tejun Heo Suggested-by: Oleg Nesterov Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra (Intel) Cc: Thomas Gleixner Reported-and-debugged-by: Chris Mason Cc: stable@vger.kernel.org # v4.3+ (we can't close the race on < v4.3) Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 21 +++++++++++++++++++++ include/linux/sched.h | 4 ++++ 2 files changed, 25 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index f6b43fbb141c..af9c86e958bd 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -570,6 +570,25 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp) pr_cont_kernfs_path(cgrp->kn); } +static inline void cgroup_init_kthreadd(void) +{ + /* + * kthreadd is inherited by all kthreads, keep it in the root so + * that the new kthreads are guaranteed to stay in the root until + * initialization is finished. + */ + current->no_cgroup_migration = 1; +} + +static inline void cgroup_kthread_ready(void) +{ + /* + * This kthread finished initialization.
The creator should have + * set PF_NO_SETAFFINITY if this kthread should stay in the root. + */ + current->no_cgroup_migration = 0; +} + #else /* !CONFIG_CGROUPS */ struct cgroup_subsys_state; @@ -590,6 +609,8 @@ static inline void cgroup_free(struct task_struct *p) {} static inline int cgroup_init_early(void) { return 0; } static inline int cgroup_init(void) { return 0; } +static inline void cgroup_init_kthreadd(void) {} +static inline void cgroup_kthread_ready(void) {} static inline bool task_under_cgroup_hierarchy(struct task_struct *task, struct cgroup *ancestor) diff --git a/include/linux/sched.h b/include/linux/sched.h index d67eee84fd43..4cf9a59a4d08 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -604,6 +604,10 @@ struct task_struct { #ifdef CONFIG_COMPAT_BRK unsigned brk_randomized:1; #endif +#ifdef CONFIG_CGROUPS + /* disallow userland-initiated cgroup migration */ + unsigned no_cgroup_migration:1; +#endif unsigned long atomic_flags; /* Flags requiring atomic access. */ -- cgit v1.2.3 From 698eff6355f735d46d1b7113df8b422874cd7988 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 17 Mar 2017 12:48:18 +0100 Subject: sched/clock, x86/perf: Fix "perf test tsc" People reported that commit: 5680d8094ffa ("sched/clock: Provide better clock continuity") broke "perf test tsc". That commit added another offset to the reported clock value; so take that into account when computing the provided offset values. Reported-by: Adrian Hunter Reported-by: Arnaldo Carvalho de Melo Tested-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 5680d8094ffa ("sched/clock: Provide better clock continuity") Signed-off-by: Ingo Molnar --- arch/x86/events/core.c | 9 ++++++--- arch/x86/include/asm/timer.h | 2 ++ arch/x86/kernel/tsc.c | 4 ++-- include/linux/sched/clock.h | 13 +++++++------ kernel/sched/clock.c | 22 +++++++++++----------- 5 files changed, 28 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 2aa1ad194db2..580b60f5ac83 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2256,6 +2256,7 @@ void arch_perf_update_userpage(struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now) { struct cyc2ns_data *data; + u64 offset; userpg->cap_user_time = 0; userpg->cap_user_time_zero = 0; @@ -2263,11 +2264,13 @@ void arch_perf_update_userpage(struct perf_event *event, !!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED); userpg->pmc_width = x86_pmu.cntval_bits; - if (!sched_clock_stable()) + if (!using_native_sched_clock() || !sched_clock_stable()) return; data = cyc2ns_read_begin(); + offset = data->cyc2ns_offset + __sched_clock_offset; + /* * Internal timekeeping for enabled/running/stopped times * is always in the local_clock domain. 
@@ -2275,7 +2278,7 @@ void arch_perf_update_userpage(struct perf_event *event, userpg->cap_user_time = 1; userpg->time_mult = data->cyc2ns_mul; userpg->time_shift = data->cyc2ns_shift; - userpg->time_offset = data->cyc2ns_offset - now; + userpg->time_offset = offset - now; /* * cap_user_time_zero doesn't make sense when we're using a different @@ -2283,7 +2286,7 @@ void arch_perf_update_userpage(struct perf_event *event, */ if (!event->attr.use_clockid) { userpg->cap_user_time_zero = 1; - userpg->time_zero = data->cyc2ns_offset; + userpg->time_zero = offset; } cyc2ns_read_end(data); diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h index a04eabd43d06..27e9f9d769b8 100644 --- a/arch/x86/include/asm/timer.h +++ b/arch/x86/include/asm/timer.h @@ -12,6 +12,8 @@ extern int recalibrate_cpu_khz(void); extern int no_timer_check; +extern bool using_native_sched_clock(void); + /* * We use the full linear equation: f(x) = a + b*x, in order to allow * a continuous function in the face of dynamic freq changes. diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index c73a7f9e881a..714dfba6a1e7 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -328,7 +328,7 @@ unsigned long long sched_clock(void) return paravirt_sched_clock(); } -static inline bool using_native_sched_clock(void) +bool using_native_sched_clock(void) { return pv_time_ops.sched_clock == native_sched_clock; } @@ -336,7 +336,7 @@ static inline bool using_native_sched_clock(void) unsigned long long sched_clock(void) __attribute__((alias("native_sched_clock"))); -static inline bool using_native_sched_clock(void) { return true; } +bool using_native_sched_clock(void) { return true; } #endif int check_tsc_unstable(void) diff --git a/include/linux/sched/clock.h b/include/linux/sched/clock.h index 4a68c6791207..34fe92ce1ebd 100644 --- a/include/linux/sched/clock.h +++ b/include/linux/sched/clock.h @@ -54,15 +54,16 @@ static inline u64 local_clock(void) } #else extern void sched_clock_init_late(void); -/* - * Architectures can set this to 1 if they have specified - * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig, - * but then during bootup it turns out that sched_clock() - * is reliable after all: - */ extern int sched_clock_stable(void); extern void clear_sched_clock_stable(void); +/* + * When sched_clock_stable(), __sched_clock_offset provides the offset + * between local_clock() and sched_clock(). + */ +extern u64 __sched_clock_offset; + + extern void sched_clock_tick(void); extern void sched_clock_idle_sleep_event(void); extern void sched_clock_idle_wakeup_event(u64 delta_ns); diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index fec0f58c8dee..24a3e01bf8cb 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -96,10 +96,10 @@ static DEFINE_STATIC_KEY_FALSE(__sched_clock_stable); static int __sched_clock_stable_early = 1; /* - * We want: ktime_get_ns() + gtod_offset == sched_clock() + raw_offset + * We want: ktime_get_ns() + __gtod_offset == sched_clock() + __sched_clock_offset */ -static __read_mostly u64 raw_offset; -static __read_mostly u64 gtod_offset; +__read_mostly u64 __sched_clock_offset; +static __read_mostly u64 __gtod_offset; struct sched_clock_data { u64 tick_raw; @@ -131,11 +131,11 @@ static void __set_sched_clock_stable(void) /* * Attempt to make the (initial) unstable->stable transition continuous. 
*/ - raw_offset = (scd->tick_gtod + gtod_offset) - (scd->tick_raw); + __sched_clock_offset = (scd->tick_gtod + __gtod_offset) - (scd->tick_raw); printk(KERN_INFO "sched_clock: Marking stable (%lld, %lld)->(%lld, %lld)\n", - scd->tick_gtod, gtod_offset, - scd->tick_raw, raw_offset); + scd->tick_gtod, __gtod_offset, + scd->tick_raw, __sched_clock_offset); static_branch_enable(&__sched_clock_stable); tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE); @@ -161,11 +161,11 @@ static void __clear_sched_clock_stable(void) * * Still do what we can. */ - gtod_offset = (scd->tick_raw + raw_offset) - (scd->tick_gtod); + __gtod_offset = (scd->tick_raw + __sched_clock_offset) - (scd->tick_gtod); printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n", - scd->tick_gtod, gtod_offset, - scd->tick_raw, raw_offset); + scd->tick_gtod, __gtod_offset, + scd->tick_raw, __sched_clock_offset); tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE); @@ -238,7 +238,7 @@ again: * scd->tick_gtod + TICK_NSEC); */ - clock = scd->tick_gtod + gtod_offset + delta; + clock = scd->tick_gtod + __gtod_offset + delta; min_clock = wrap_max(scd->tick_gtod, old_clock); max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC); @@ -324,7 +324,7 @@ u64 sched_clock_cpu(int cpu) u64 clock; if (sched_clock_stable()) - return sched_clock() + raw_offset; + return sched_clock() + __sched_clock_offset; if (unlikely(!sched_clock_running)) return 0ull; -- cgit v1.2.3 From 90db10434b163e46da413d34db8d0e77404cc645 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 23 Mar 2017 18:24:19 +0100 Subject: KVM: kvm_io_bus_unregister_dev() should never fail No caller currently checks the return value of kvm_io_bus_unregister_dev(). This is evil, as all callers silently go on freeing their device. A stale reference will remain in the io_bus, and it will be used again at the latest when the io_bus is torn down on kvm_destroy_vm() - leading to use-after-free errors. There is nothing the callers could do, except retrying over and over again. So let's simply remove the bus altogether, print an error and make sure no one can access this broken bus again (returning -ENOMEM on any attempt to access it).
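A minimal sketch of the access pattern this imposes on every io_bus reader (distilled from the hunks below, not additional patch content): once a failed shrink has removed a bus wholesale, the bus pointer may be NULL and each access site must treat that as an error:

	/* Every reader of kvm->buses[] must now tolerate a NULL bus, because
	 * a failed kmalloc() in kvm_io_bus_unregister_dev() removes the whole
	 * bus instead of leaving a stale device reference behind. */
	bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
	if (!bus)
		return -ENOMEM;	/* bus was torn down; nothing to access */
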
Fixes: e93f8a0f821e ("KVM: convert io_bus to SRCU") Cc: stable@vger.kernel.org # 3.4+ Reported-by: Dmitry Vyukov Reviewed-by: Cornelia Huck Signed-off-by: David Hildenbrand Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 4 ++-- virt/kvm/eventfd.c | 3 ++- virt/kvm/kvm_main.c | 42 +++++++++++++++++++++++++----------------- 3 files changed, 29 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 2c14ad9809da..d0250744507a 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -162,8 +162,8 @@ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, int len, void *val); int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len, struct kvm_io_device *dev); -int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, - struct kvm_io_device *dev); +void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, + struct kvm_io_device *dev); struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr); diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index a29786dd9522..4d28a9ddbee0 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -870,7 +870,8 @@ kvm_deassign_ioeventfd_idx(struct kvm *kvm, enum kvm_bus bus_idx, continue; kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev); - kvm->buses[bus_idx]->ioeventfd_count--; + if (kvm->buses[bus_idx]) + kvm->buses[bus_idx]->ioeventfd_count--; ioeventfd_release(p); ret = 0; break; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 7445566fadc1..ef1aa7f1ed7a 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -728,7 +728,8 @@ static void kvm_destroy_vm(struct kvm *kvm) spin_unlock(&kvm_lock); kvm_free_irq_routing(kvm); for (i = 0; i < KVM_NR_BUSES; i++) { - kvm_io_bus_destroy(kvm->buses[i]); + if (kvm->buses[i]) + kvm_io_bus_destroy(kvm->buses[i]); kvm->buses[i] = NULL; } kvm_coalesced_mmio_free(kvm); @@ -3476,6 +3477,8 @@ int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, }; bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); + if (!bus) + return -ENOMEM; r = __kvm_io_bus_write(vcpu, bus, &range, val); return r < 0 ? r : 0; } @@ -3493,6 +3496,8 @@ int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, }; bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); + if (!bus) + return -ENOMEM; /* First try the device referenced by cookie. */ if ((cookie >= 0) && (cookie < bus->dev_count) && @@ -3543,6 +3548,8 @@ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, }; bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); + if (!bus) + return -ENOMEM; r = __kvm_io_bus_read(vcpu, bus, &range, val); return r < 0 ? r : 0; } @@ -3555,6 +3562,9 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, struct kvm_io_bus *new_bus, *bus; bus = kvm->buses[bus_idx]; + if (!bus) + return -ENOMEM; + /* exclude ioeventfd which is limited by maximum fd */ if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1) return -ENOSPC; @@ -3574,45 +3584,41 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, } /* Caller must hold slots_lock. 
*/ -int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, - struct kvm_io_device *dev) +void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, + struct kvm_io_device *dev) { - int i, r; + int i; struct kvm_io_bus *new_bus, *bus; bus = kvm->buses[bus_idx]; - - /* - * It's possible the bus being released before hand. If so, - * we're done here. - */ if (!bus) - return 0; + return; - r = -ENOENT; for (i = 0; i < bus->dev_count; i++) if (bus->range[i].dev == dev) { - r = 0; break; } - if (r) - return r; + if (i == bus->dev_count) + return; new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count - 1) * sizeof(struct kvm_io_range)), GFP_KERNEL); - if (!new_bus) - return -ENOMEM; + if (!new_bus) { + pr_err("kvm: failed to shrink bus, removing it completely\n"); + goto broken; + } memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range)); new_bus->dev_count--; memcpy(new_bus->range + i, bus->range + i + 1, (new_bus->dev_count - i) * sizeof(struct kvm_io_range)); +broken: rcu_assign_pointer(kvm->buses[bus_idx], new_bus); synchronize_srcu_expedited(&kvm->srcu); kfree(bus); - return r; + return; } struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx, @@ -3625,6 +3631,8 @@ struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx, srcu_idx = srcu_read_lock(&kvm->srcu); bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); + if (!bus) + goto out_unlock; dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1); if (dev_idx < 0) -- cgit v1.2.3 From 07de36b378a58f1d1426829acf0ab7cf86f651f3 Mon Sep 17 00:00:00 2001 From: Alexander Kochetkov Date: Wed, 22 Mar 2017 17:32:49 +0300 Subject: clockevents: Fix syntax error in clkevt-of macro The patch fixes syntax errors introduced by commit 0c8893c9095d ("clockevents: Add a clkevt-of mechanism like clksrc-of").
Fixes: 0c8893c9095d ("clockevents: Add a clkevt-of mechanism like clksrc-of") Signed-off-by: Alexander Kochetkov Signed-off-by: Daniel Lezcano --- drivers/clocksource/clkevt-probe.c | 2 +- include/linux/clockchips.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/clocksource/clkevt-probe.c b/drivers/clocksource/clkevt-probe.c index 8c30fec86094..eb89b502acbd 100644 --- a/drivers/clocksource/clkevt-probe.c +++ b/drivers/clocksource/clkevt-probe.c @@ -17,7 +17,7 @@ #include #include -#include +#include extern struct of_device_id __clkevt_of_table[]; diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 5d3053c34fb3..6d7edc3082f9 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -229,7 +229,7 @@ static inline void tick_setup_hrtimer_broadcast(void) { } #ifdef CONFIG_CLKEVT_PROBE extern int clockevent_probe(void); -#els +#else static inline int clockevent_probe(void) { return 0; } #endif -- cgit v1.2.3 From 597b7305dd8bafdb3aef4957d97128bc90af8e9f Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 31 Mar 2017 15:11:47 -0700 Subject: mm: move mm_percpu_wq initialization earlier Yang Li has reported that drain_all_pages triggers a WARN_ON, which means that this function is called before mm_percpu_wq is initialized, on arm64 with CMA configured: WARNING: CPU: 2 PID: 1 at mm/page_alloc.c:2423 drain_all_pages+0x244/0x25c Modules linked in: CPU: 2 PID: 1 Comm: swapper/0 Not tainted 4.11.0-rc1-next-20170310-00027-g64dfbc5 #127 Hardware name: Freescale Layerscape 2088A RDB Board (DT) task: ffffffc07c4a6d00 task.stack: ffffffc07c4a8000 PC is at drain_all_pages+0x244/0x25c LR is at start_isolate_page_range+0x14c/0x1f0 [...] drain_all_pages+0x244/0x25c start_isolate_page_range+0x14c/0x1f0 alloc_contig_range+0xec/0x354 cma_alloc+0x100/0x1fc dma_alloc_from_contiguous+0x3c/0x44 atomic_pool_init+0x7c/0x208 arm64_dma_init+0x44/0x4c do_one_initcall+0x38/0x128 kernel_init_freeable+0x1a0/0x240 kernel_init+0x10/0xfc ret_from_fork+0x10/0x20 Fix this by moving the whole of setup_vmstat, which is an initcall right now, to init_mm_internals, which will be called right after the WQ subsystem is initialized.
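In outline, the resulting boot ordering in kernel_init_freeable() looks like this (a sketch mirroring the init/main.c hunk below):

	workqueue_init();	/* workqueue subsystem is up */
	init_mm_internals();	/* was setup_vmstat(); can now create mm_percpu_wq */
	do_pre_smp_initcalls();	/* early initcalls such as CMA's atomic_pool_init()
				 * may reach drain_all_pages() safely from here on */
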
Link: http://lkml.kernel.org/r/20170315164021.28532-1-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Yang Li Tested-by: Yang Li Tested-by: Xiaolong Ye Cc: Mel Gorman Cc: Vlastimil Babka Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 ++ init/main.c | 2 ++ mm/vmstat.c | 4 +--- 3 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 5f01c88f0800..00a8fa7e366a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -32,6 +32,8 @@ struct user_struct; struct writeback_control; struct bdi_writeback; +void init_mm_internals(void); + #ifndef CONFIG_NEED_MULTIPLE_NODES /* Don't use mapnrs, do it properly */ extern unsigned long max_mapnr; diff --git a/init/main.c b/init/main.c index f9c9d9948203..b0c11cbf5ddf 100644 --- a/init/main.c +++ b/init/main.c @@ -1022,6 +1022,8 @@ static noinline void __init kernel_init_freeable(void) workqueue_init(); + init_mm_internals(); + do_pre_smp_initcalls(); lockup_detector_init(); diff --git a/mm/vmstat.c b/mm/vmstat.c index b1947f0cbee2..89f95396ec46 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1764,7 +1764,7 @@ static int vmstat_cpu_dead(unsigned int cpu) #endif -static int __init setup_vmstat(void) +void __init init_mm_internals(void) { #ifdef CONFIG_SMP int ret; @@ -1792,9 +1792,7 @@ static int __init setup_vmstat(void) proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations); proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations); #endif - return 0; } -module_init(setup_vmstat) #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION) -- cgit v1.2.3 From 553af430e7c981e6e8fa5007c5b7b5773acc63dd Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 31 Mar 2017 15:11:50 -0700 Subject: mm: rmap: fix huge file mmap accounting in the memcg stats Huge pages are accounted as single units in the memcg's "file_mapped" counter. Account the correct number of base pages, like we do in the corresponding node counter. Link: http://lkml.kernel.org/r/20170322005111.3156-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Kirill A. Shutemov Acked-by: Michal Hocko Cc: Vladimir Davydov Cc: [4.8+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 6 ++++++ mm/rmap.c | 4 ++-- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 5af377303880..bb7250c45cb8 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -740,6 +740,12 @@ static inline bool mem_cgroup_oom_synchronize(bool wait) return false; } +static inline void mem_cgroup_update_page_stat(struct page *page, + enum mem_cgroup_stat_index idx, + int nr) +{ +} + static inline void mem_cgroup_inc_page_stat(struct page *page, enum mem_cgroup_stat_index idx) { diff --git a/mm/rmap.c b/mm/rmap.c index 49ed681ccc7b..f6838015810f 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1159,7 +1159,7 @@ void page_add_file_rmap(struct page *page, bool compound) goto out; } __mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, nr); - mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); + mem_cgroup_update_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, nr); out: unlock_page_memcg(page); } @@ -1199,7 +1199,7 @@ static void page_remove_file_rmap(struct page *page, bool compound) * pte lock(a spinlock) is held, which implies preemption disabled. 
*/ __mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, -nr); - mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); + mem_cgroup_update_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, -nr); if (unlikely(PageMlocked(page))) clear_page_mlock(page); -- cgit v1.2.3 From b0845ce58379d11dcad4cdb6824a6410de260216 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 31 Mar 2017 15:12:04 -0700 Subject: kasan: report only the first error by default Disable kasan after the first report. There are several reasons for this: - A single bug quite often causes multiple invalid memory accesses, creating a storm in the dmesg. - A write OOB access might corrupt metadata so the next report will print bogus alloc/free stacktraces. - Reports after the first could easily be not bugs in themselves but just side effects of the first one. Given that multiple reports usually only do harm, it makes sense to disable kasan after the first one. If the user wants to see all the reports, the boot-time parameter kasan_multi_shot must be used. [aryabinin@virtuozzo.com: wrote changelog and doc, added missing include] Link: http://lkml.kernel.org/r/20170323154416.30257-1-aryabinin@virtuozzo.com Signed-off-by: Mark Rutland Signed-off-by: Andrey Ryabinin Cc: Andrey Konovalov Cc: Alexander Potapenko Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/kernel-parameters.txt | 6 +++++ include/linux/kasan.h | 3 +++ lib/test_kasan.c | 10 +++++++ mm/kasan/kasan.h | 5 ---- mm/kasan/report.c | 36 +++++++++++++++++++++++++ 5 files changed, 55 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 2ba45caabada..facc20a3f962 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1725,6 +1725,12 @@ kernel and module base offset ASLR (Address Space Layout Randomization). + kasan_multi_shot + [KNL] Enforce KASAN (Kernel Address Sanitizer) to print + report on every invalid memory access. Without this + parameter KASAN will print report only for the first + invalid access. + keepinitrd [HW,ARM] kernelcore= [KNL,X86,IA-64,PPC] diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 5734480c9590..a5c7046f26b4 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -76,6 +76,9 @@ size_t ksize(const void *); static inline void kasan_unpoison_slab(const void *ptr) { ksize(ptr); } size_t kasan_metadata_size(struct kmem_cache *cache); +bool kasan_save_enable_multi_shot(void); +void kasan_restore_multi_shot(bool enabled); + #else /* CONFIG_KASAN */ static inline void kasan_unpoison_shadow(const void *address, size_t size) {} diff --git a/lib/test_kasan.c b/lib/test_kasan.c index 0b1d3140fbb8..a25c9763fce1 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -20,6 +20,7 @@ #include #include #include +#include /* * Note: test functions are marked noinline so that their names appear in @@ -474,6 +475,12 @@ static noinline void __init use_after_scope_test(void) static int __init kmalloc_tests_init(void) { + /* + * Temporarily enable multi-shot mode. Otherwise, we'd only get a + * report for the first case.
+ */ + bool multishot = kasan_save_enable_multi_shot(); + kmalloc_oob_right(); kmalloc_oob_left(); kmalloc_node_oob_right(); @@ -499,6 +506,9 @@ static int __init kmalloc_tests_init(void) ksize_unpoisons_memory(); copy_user_test(); use_after_scope_test(); + + kasan_restore_multi_shot(multishot); + return -EAGAIN; } diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 1c260e6b3b3c..dd2dea8eb077 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -96,11 +96,6 @@ static inline const void *kasan_shadow_to_mem(const void *shadow_addr) << KASAN_SHADOW_SCALE_SHIFT); } -static inline bool kasan_report_enabled(void) -{ - return !current->kasan_depth; -} - void kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip); void kasan_report_double_free(struct kmem_cache *cache, void *object, diff --git a/mm/kasan/report.c b/mm/kasan/report.c index f479365530b6..ab42a0803f16 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -13,7 +13,9 @@ * */ +#include #include +#include #include #include #include @@ -293,6 +295,40 @@ static void kasan_report_error(struct kasan_access_info *info) kasan_end_report(&flags); } +static unsigned long kasan_flags; + +#define KASAN_BIT_REPORTED 0 +#define KASAN_BIT_MULTI_SHOT 1 + +bool kasan_save_enable_multi_shot(void) +{ + return test_and_set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags); +} +EXPORT_SYMBOL_GPL(kasan_save_enable_multi_shot); + +void kasan_restore_multi_shot(bool enabled) +{ + if (!enabled) + clear_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags); +} +EXPORT_SYMBOL_GPL(kasan_restore_multi_shot); + +static int __init kasan_set_multi_shot(char *str) +{ + set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags); + return 1; +} +__setup("kasan_multi_shot", kasan_set_multi_shot); + +static inline bool kasan_report_enabled(void) +{ + if (current->kasan_depth) + return false; + if (test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags)) + return true; + return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags); +} + void kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip) { -- cgit v1.2.3 From bf17aa36c0f199f5b254262e77eaefda7da0f50b Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Wed, 1 Mar 2017 18:22:01 -0800 Subject: nvme: Correct NVMF enum values to match NVMe-oF rev 1.0 The enum values for QPTYPE, PRTYPE and CMS are off by 1 from the values defined in figure 42 of the NVM Express over Fabrics 1.0: http://www.nvmexpress.org/wp-content/uploads/NVMe_over_Fabrics_1_0_Gold_20160605-1.pdf Fix our enums to match the final spec. 
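A hedged compile-time guard (a sketch, not part of the patch) makes the corrected numbering explicit:

	#include <linux/bug.h>
	#include <linux/nvme.h>

	/* The TSAS codes in NVMe-oF 1.0 figure 42 all start at 1, not 0. */
	static inline void nvmf_tsas_spec_check(void)
	{
		BUILD_BUG_ON(NVMF_RDMA_QPTYPE_CONNECTED != 1);
		BUILD_BUG_ON(NVMF_RDMA_PRTYPE_NOT_SPECIFIED != 1);
		BUILD_BUG_ON(NVMF_RDMA_PRTYPE_IWARP != 5);
		BUILD_BUG_ON(NVMF_RDMA_CMS_RDMA_CM != 1);
	}
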
Signed-off-by: Roland Dreier Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg --- include/linux/nvme.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index c43d435d4225..9061780b141f 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -64,26 +64,26 @@ enum { * RDMA_QPTYPE field */ enum { - NVMF_RDMA_QPTYPE_CONNECTED = 0, /* Reliable Connected */ - NVMF_RDMA_QPTYPE_DATAGRAM = 1, /* Reliable Datagram */ + NVMF_RDMA_QPTYPE_CONNECTED = 1, /* Reliable Connected */ + NVMF_RDMA_QPTYPE_DATAGRAM = 2, /* Reliable Datagram */ }; /* RDMA QP Service Type codes for Discovery Log Page entry TSAS * RDMA_QPTYPE field */ enum { - NVMF_RDMA_PRTYPE_NOT_SPECIFIED = 0, /* No Provider Specified */ - NVMF_RDMA_PRTYPE_IB = 1, /* InfiniBand */ - NVMF_RDMA_PRTYPE_ROCE = 2, /* InfiniBand RoCE */ - NVMF_RDMA_PRTYPE_ROCEV2 = 3, /* InfiniBand RoCEV2 */ - NVMF_RDMA_PRTYPE_IWARP = 4, /* IWARP */ + NVMF_RDMA_PRTYPE_NOT_SPECIFIED = 1, /* No Provider Specified */ + NVMF_RDMA_PRTYPE_IB = 2, /* InfiniBand */ + NVMF_RDMA_PRTYPE_ROCE = 3, /* InfiniBand RoCE */ + NVMF_RDMA_PRTYPE_ROCEV2 = 4, /* InfiniBand RoCEV2 */ + NVMF_RDMA_PRTYPE_IWARP = 5, /* IWARP */ }; /* RDMA Connection Management Service Type codes for Discovery Log Page * entry TSAS RDMA_CMS field */ enum { - NVMF_RDMA_CMS_RDMA_CM = 0, /* Sockets based enpoint addressing */ + NVMF_RDMA_CMS_RDMA_CM = 1, /* Sockets based endpoint addressing */ }; #define NVMF_AQ_DEPTH 32 -- cgit v1.2.3 From 27c0e3748e41ca79171ffa3e97415a20af6facd0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 17 Feb 2017 18:42:24 -0500 Subject: [iov_iter] new primitive: iov_iter_revert() Opposite to iov_iter_advance(); the caller is responsible for never using it to move back past the initial position.
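A typical use, as a hedged sketch (names like write_failed are hypothetical): record how much a copy consumed and unwind exactly that amount on failure, instead of rebuilding the iterator:

	size_t copied = copy_from_iter(buf, len, iter);	/* advances iter */
	if (write_failed) {
		iov_iter_revert(iter, copied);	/* back to the pre-copy position,
						 * never past the initial one */
		return -EIO;
	}
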
Cc: stable@vger.kernel.org Signed-off-by: Al Viro --- include/linux/uio.h | 6 ++++- lib/iov_iter.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/uio.h b/include/linux/uio.h index 804e34c6f981..f2d36a3d3005 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -39,7 +39,10 @@ struct iov_iter { }; union { unsigned long nr_segs; - int idx; + struct { + int idx; + int start_idx; + }; }; }; @@ -81,6 +84,7 @@ unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to); size_t iov_iter_copy_from_user_atomic(struct page *page, struct iov_iter *i, unsigned long offset, size_t bytes); void iov_iter_advance(struct iov_iter *i, size_t bytes); +void iov_iter_revert(struct iov_iter *i, size_t bytes); int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes); size_t iov_iter_single_seg_count(const struct iov_iter *i); size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, diff --git a/lib/iov_iter.c b/lib/iov_iter.c index e68604ae3ced..60abc44385b7 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -786,6 +786,68 @@ void iov_iter_advance(struct iov_iter *i, size_t size) } EXPORT_SYMBOL(iov_iter_advance); +void iov_iter_revert(struct iov_iter *i, size_t unroll) +{ + if (!unroll) + return; + i->count += unroll; + if (unlikely(i->type & ITER_PIPE)) { + struct pipe_inode_info *pipe = i->pipe; + int idx = i->idx; + size_t off = i->iov_offset; + while (1) { + size_t n = off - pipe->bufs[idx].offset; + if (unroll < n) { + off -= (n - unroll); + break; + } + unroll -= n; + if (!unroll && idx == i->start_idx) { + off = 0; + break; + } + if (!idx--) + idx = pipe->buffers - 1; + off = pipe->bufs[idx].offset + pipe->bufs[idx].len; + } + i->iov_offset = off; + i->idx = idx; + pipe_truncate(i); + return; + } + if (unroll <= i->iov_offset) { + i->iov_offset -= unroll; + return; + } + unroll -= i->iov_offset; + if (i->type & ITER_BVEC) { + const struct bio_vec *bvec = i->bvec; + while (1) { + size_t n = (--bvec)->bv_len; + i->nr_segs++; + if (unroll <= n) { + i->bvec = bvec; + i->iov_offset = n - unroll; + return; + } + unroll -= n; + } + } else { /* same logics for iovec and kvec */ + const struct iovec *iov = i->iov; + while (1) { + size_t n = (--iov)->iov_len; + i->nr_segs++; + if (unroll <= n) { + i->iov = iov; + i->iov_offset = n - unroll; + return; + } + unroll -= n; + } + } +} +EXPORT_SYMBOL(iov_iter_revert); + /* * Return the count of just the current iov_iter segment. */ @@ -839,6 +901,7 @@ void iov_iter_pipe(struct iov_iter *i, int direction, i->idx = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1); i->iov_offset = 0; i->count = count; + i->start_idx = i->idx; } EXPORT_SYMBOL(iov_iter_pipe); -- cgit v1.2.3 From 3209f68b3ca4667069923a325c88b21131bfdf9f Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 31 Mar 2017 18:32:17 +0100 Subject: statx: Include a mask for stx_attributes in struct statx Include a mask in struct stat to indicate which bits of stx_attributes the filesystem actually supports. This would also be useful if we add another system call that allows you to do a 'bulk attribute set' and pass in a statx struct with the masks appropriately set to say what you want to set. 
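For userspace the rule becomes: consult stx_attributes_mask before interpreting a bit in stx_attributes. A hedged sketch (assuming a statx(2) wrapper is available; glibc of this era may need a raw syscall instead):

	#include <stdio.h>
	#include <fcntl.h>		/* AT_FDCWD */
	#include <linux/stat.h>		/* struct statx, STATX_*, STATX_ATTR_* */

	void show_immutable(const char *path)
	{
		struct statx stx;

		if (statx(AT_FDCWD, path, 0, STATX_ALL, &stx) != 0)
			return;
		if (!(stx.stx_attributes_mask & STATX_ATTR_IMMUTABLE))
			puts("IMMUTABLE not reported by this filesystem");
		else if (stx.stx_attributes & STATX_ATTR_IMMUTABLE)
			puts("file is immutable");
		else
			puts("file is not immutable");
	}
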
Signed-off-by: David Howells Signed-off-by: Al Viro --- fs/ext4/inode.c | 6 ++++++ fs/stat.c | 1 + include/linux/stat.h | 1 + include/uapi/linux/stat.h | 4 ++-- samples/statx/test-statx.c | 12 ++++++++---- 5 files changed, 18 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5d02b922afa3..b9ffa9f4191f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5413,6 +5413,12 @@ int ext4_getattr(const struct path *path, struct kstat *stat, if (flags & EXT4_NODUMP_FL) stat->attributes |= STATX_ATTR_NODUMP; + stat->attributes_mask |= (STATX_ATTR_APPEND | + STATX_ATTR_COMPRESSED | + STATX_ATTR_ENCRYPTED | + STATX_ATTR_IMMUTABLE | + STATX_ATTR_NODUMP); + generic_fillattr(inode, stat); return 0; } diff --git a/fs/stat.c b/fs/stat.c index 0c7e6cdc435c..c6c963b2546b 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -527,6 +527,7 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer) tmp.stx_ino = stat->ino; tmp.stx_size = stat->size; tmp.stx_blocks = stat->blocks; + tmp.stx_attributes_mask = stat->attributes_mask; tmp.stx_atime.tv_sec = stat->atime.tv_sec; tmp.stx_atime.tv_nsec = stat->atime.tv_nsec; tmp.stx_btime.tv_sec = stat->btime.tv_sec; diff --git a/include/linux/stat.h b/include/linux/stat.h index c76e524fb34b..64b6b3aece21 100644 --- a/include/linux/stat.h +++ b/include/linux/stat.h @@ -26,6 +26,7 @@ struct kstat { unsigned int nlink; uint32_t blksize; /* Preferred I/O size */ u64 attributes; + u64 attributes_mask; #define KSTAT_ATTR_FS_IOC_FLAGS \ (STATX_ATTR_COMPRESSED | \ STATX_ATTR_IMMUTABLE | \ diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h index 0869b9eaa8ce..d538897b8e08 100644 --- a/include/uapi/linux/stat.h +++ b/include/uapi/linux/stat.h @@ -114,7 +114,7 @@ struct statx { __u64 stx_ino; /* Inode number */ __u64 stx_size; /* File size */ __u64 stx_blocks; /* Number of 512-byte blocks allocated */ - __u64 __spare1[1]; + __u64 stx_attributes_mask; /* Mask to show what's supported in stx_attributes */ /* 0x40 */ struct statx_timestamp stx_atime; /* Last access time */ struct statx_timestamp stx_btime; /* File creation time */ @@ -155,7 +155,7 @@ struct statx { #define STATX__RESERVED 0x80000000U /* Reserved for future struct statx expansion */ /* - * Attributes to be found in stx_attributes + * Attributes to be found in stx_attributes and masked in stx_attributes_mask. 
* * These give information about the features or the state of a file that might * be of use to ordinary userspace programs such as GUIs or ls rather than diff --git a/samples/statx/test-statx.c b/samples/statx/test-statx.c index 8571d766331d..d4d77b09412c 100644 --- a/samples/statx/test-statx.c +++ b/samples/statx/test-statx.c @@ -141,8 +141,8 @@ static void dump_statx(struct statx *stx) if (stx->stx_mask & STATX_BTIME) print_time(" Birth: ", &stx->stx_btime); - if (stx->stx_attributes) { - unsigned char bits; + if (stx->stx_attributes_mask) { + unsigned char bits, mbits; int loop, byte; static char attr_representation[64 + 1] = @@ -160,14 +160,18 @@ static void dump_statx(struct statx *stx) printf("Attributes: %016llx (", stx->stx_attributes); for (byte = 64 - 8; byte >= 0; byte -= 8) { bits = stx->stx_attributes >> byte; + mbits = stx->stx_attributes_mask >> byte; for (loop = 7; loop >= 0; loop--) { int bit = byte + loop; - if (bits & 0x80) + if (!(mbits & 0x80)) + putchar('.'); /* Not supported */ + else if (bits & 0x80) putchar(attr_representation[63 - bit]); else - putchar('-'); + putchar('-'); /* Not set */ bits <<= 1; + mbits <<= 1; } if (byte) putchar(' '); -- cgit v1.2.3 From 6d56111c92d247bb64301029fe88365aa4caf16e Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Tue, 21 Mar 2017 22:05:22 +0100 Subject: KVM: arm/arm64: vgic: Fix GICC_PMR uaccess on GICv3 and clarify ABI As an oversight, for GICv2, we accidentally export the GICC_PMR register in the format of the GICH_VMCR.VMPriMask field in the lower 5 bits of a word, meaning that userspace must always use the lower 5 bits to communicate with the KVM device and must shift the value left by 3 places to obtain the actual priority mask level. Since GICv3 supports the full 8 bits of priority masking in the ICH_VMCR, we have to fix the value we export when emulating a GICv2 on top of a hardware GICv3 and exporting the emulated GICv2 state to userspace. Take the chance to clarify this aspect of the ABI. Reviewed-by: Marc Zyngier Signed-off-by: Christoffer Dall --- Documentation/virtual/kvm/devices/arm-vgic.txt | 6 ++++++ include/linux/irqchip/arm-gic.h | 3 +++ virt/kvm/arm/vgic/vgic-mmio-v2.c | 20 ++++++++++++++++++-- virt/kvm/arm/vgic/vgic-v2.c | 8 ++++---- virt/kvm/arm/vgic/vgic.h | 9 ++++++++- 5 files changed, 39 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/Documentation/virtual/kvm/devices/arm-vgic.txt b/Documentation/virtual/kvm/devices/arm-vgic.txt index 76e61c883347..b2f60ca8b60c 100644 --- a/Documentation/virtual/kvm/devices/arm-vgic.txt +++ b/Documentation/virtual/kvm/devices/arm-vgic.txt @@ -83,6 +83,12 @@ Groups: Bits for undefined preemption levels are RAZ/WI. + For historical reasons and to provide ABI compatibility with userspace we + export the GICC_PMR register in the format of the GICH_VMCR.VMPriMask + field in the lower 5 bits of a word, meaning that userspace must always + use the lower 5 bits to communicate with the KVM device and must shift the + value left by 3 places to obtain the actual priority mask level. + Limitations: - Priorities are not implemented, and registers are RAZ/WI - Currently only implemented for KVM_DEV_TYPE_ARM_VGIC_V2. 
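The conversion the ABI text above describes is a plain shift by three; as a sketch of both directions (using the GICV_PMR_PRIORITY_SHIFT constant the patch introduces below):

	/* The GICv2 KVM device ABI carries GICC_PMR in GICH_VMCR.VMPriMask
	 * format: the upper 5 of 8 priority bits, kept in the low bits of
	 * the word. */
	u32 abi_field = real_pmr >> GICV_PMR_PRIORITY_SHIFT;	/* kernel -> userspace */
	u32 real_pmr2 = abi_field << GICV_PMR_PRIORITY_SHIFT;	/* what userspace computes */
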
diff --git a/include/linux/irqchip/arm-gic.h b/include/linux/irqchip/arm-gic.h index eafc965b3eb8..dc30f3d057eb 100644 --- a/include/linux/irqchip/arm-gic.h +++ b/include/linux/irqchip/arm-gic.h @@ -96,6 +96,9 @@ #define GICH_MISR_EOI (1 << 0) #define GICH_MISR_U (1 << 1) +#define GICV_PMR_PRIORITY_SHIFT 3 +#define GICV_PMR_PRIORITY_MASK (0x1f << GICV_PMR_PRIORITY_SHIFT) + #ifndef __ASSEMBLY__ #include diff --git a/virt/kvm/arm/vgic/vgic-mmio-v2.c b/virt/kvm/arm/vgic/vgic-mmio-v2.c index a3ad7ff95c9b..0a4283ed9aa7 100644 --- a/virt/kvm/arm/vgic/vgic-mmio-v2.c +++ b/virt/kvm/arm/vgic/vgic-mmio-v2.c @@ -229,7 +229,15 @@ static unsigned long vgic_mmio_read_vcpuif(struct kvm_vcpu *vcpu, val = vmcr.ctlr; break; case GIC_CPU_PRIMASK: - val = vmcr.pmr; + /* + * Our KVM_DEV_TYPE_ARM_VGIC_V2 device ABI exports the + * the PMR field as GICH_VMCR.VMPriMask rather than + * GICC_PMR.Priority, so we expose the upper five bits of + * priority mask to userspace using the lower bits in the + * unsigned long. + */ + val = (vmcr.pmr & GICV_PMR_PRIORITY_MASK) >> + GICV_PMR_PRIORITY_SHIFT; break; case GIC_CPU_BINPOINT: val = vmcr.bpr; @@ -262,7 +270,15 @@ static void vgic_mmio_write_vcpuif(struct kvm_vcpu *vcpu, vmcr.ctlr = val; break; case GIC_CPU_PRIMASK: - vmcr.pmr = val; + /* + * Our KVM_DEV_TYPE_ARM_VGIC_V2 device ABI exports the + * the PMR field as GICH_VMCR.VMPriMask rather than + * GICC_PMR.Priority, so we expose the upper five bits of + * priority mask to userspace using the lower bits in the + * unsigned long. + */ + vmcr.pmr = (val << GICV_PMR_PRIORITY_SHIFT) & + GICV_PMR_PRIORITY_MASK; break; case GIC_CPU_BINPOINT: vmcr.bpr = val; diff --git a/virt/kvm/arm/vgic/vgic-v2.c b/virt/kvm/arm/vgic/vgic-v2.c index 94cf4b9b6471..b637d9c7afe3 100644 --- a/virt/kvm/arm/vgic/vgic-v2.c +++ b/virt/kvm/arm/vgic/vgic-v2.c @@ -206,8 +206,8 @@ void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) GICH_VMCR_ALIAS_BINPOINT_MASK; vmcr |= (vmcrp->bpr << GICH_VMCR_BINPOINT_SHIFT) & GICH_VMCR_BINPOINT_MASK; - vmcr |= (vmcrp->pmr << GICH_VMCR_PRIMASK_SHIFT) & - GICH_VMCR_PRIMASK_MASK; + vmcr |= ((vmcrp->pmr >> GICV_PMR_PRIORITY_SHIFT) << + GICH_VMCR_PRIMASK_SHIFT) & GICH_VMCR_PRIMASK_MASK; vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = vmcr; } @@ -222,8 +222,8 @@ void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) GICH_VMCR_ALIAS_BINPOINT_SHIFT; vmcrp->bpr = (vmcr & GICH_VMCR_BINPOINT_MASK) >> GICH_VMCR_BINPOINT_SHIFT; - vmcrp->pmr = (vmcr & GICH_VMCR_PRIMASK_MASK) >> - GICH_VMCR_PRIMASK_SHIFT; + vmcrp->pmr = ((vmcr & GICH_VMCR_PRIMASK_MASK) >> + GICH_VMCR_PRIMASK_SHIFT) << GICV_PMR_PRIORITY_SHIFT; } void vgic_v2_enable(struct kvm_vcpu *vcpu) diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h index 91566f5aac9b..6cf557e9f718 100644 --- a/virt/kvm/arm/vgic/vgic.h +++ b/virt/kvm/arm/vgic/vgic.h @@ -81,11 +81,18 @@ static inline bool irq_is_pending(struct vgic_irq *irq) return irq->pending_latch || irq->line_level; } +/* + * This struct provides an intermediate representation of the fields contained + * in the GICH_VMCR and ICH_VMCR registers, such that code exporting the GIC + * state to userspace can generate either GICv2 or GICv3 CPU interface + * registers regardless of the hardware backed GIC used. 
+ */ struct vgic_vmcr { u32 ctlr; u32 abpr; u32 bpr; - u32 pmr; + u32 pmr; /* Priority mask field in the GICC_PMR and + * ICC_PMR_EL1 priority field format */ /* Below member variable are valid only for GICv3 */ u32 grpen0; u32 grpen1; -- cgit v1.2.3 From 62e24c5775ecb387a3eb33701378ccfa6dbc98ee Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Fri, 5 Feb 2016 13:41:39 +0100 Subject: reset: add exported __reset_control_get, return NULL if optional Rename the internal __reset_control_get/put functions to __reset_control_get/put_internal and add an exported __reset_control_get equivalent to __of_reset_control_get that takes a struct device parameter. This avoids the confusing call to __of_reset_control_get in the non-DT case and fixes the devm_reset_control_get_optional function to return NULL if RESET_CONTROLLER is enabled but dev->of_node == NULL. Fixes: bb475230b8e5 ("reset: make optional functions really optional") Reported-by: Andy Shevchenko Tested-by: Andy Shevchenko Cc: Ramiro Oliveira Signed-off-by: Philipp Zabel --- drivers/reset/core.c | 22 ++++++++++++++++------ include/linux/reset.h | 22 ++++++++++++++-------- 2 files changed, 30 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/reset/core.c b/drivers/reset/core.c index f1e5e65388bb..cd739d2fa160 100644 --- a/drivers/reset/core.c +++ b/drivers/reset/core.c @@ -275,7 +275,7 @@ int reset_control_status(struct reset_control *rstc) } EXPORT_SYMBOL_GPL(reset_control_status); -static struct reset_control *__reset_control_get( +static struct reset_control *__reset_control_get_internal( struct reset_controller_dev *rcdev, unsigned int index, bool shared) { @@ -308,7 +308,7 @@ static struct reset_control *__reset_control_get( return rstc; } -static void __reset_control_put(struct reset_control *rstc) +static void __reset_control_put_internal(struct reset_control *rstc) { lockdep_assert_held(&reset_list_mutex); @@ -377,7 +377,7 @@ struct reset_control *__of_reset_control_get(struct device_node *node, } /* reset_list_mutex also protects the rcdev's reset_control list */ - rstc = __reset_control_get(rcdev, rstc_id, shared); + rstc = __reset_control_get_internal(rcdev, rstc_id, shared); mutex_unlock(&reset_list_mutex); @@ -385,6 +385,17 @@ struct reset_control *__of_reset_control_get(struct device_node *node, } EXPORT_SYMBOL_GPL(__of_reset_control_get); +struct reset_control *__reset_control_get(struct device *dev, const char *id, + int index, bool shared, bool optional) +{ + if (dev->of_node) + return __of_reset_control_get(dev->of_node, id, index, shared, + optional); + + return optional ? NULL : ERR_PTR(-EINVAL); +} +EXPORT_SYMBOL_GPL(__reset_control_get); + /** * reset_control_put - free the reset controller * @rstc: reset controller @@ -396,7 +407,7 @@ void reset_control_put(struct reset_control *rstc) return; mutex_lock(&reset_list_mutex); - __reset_control_put(rstc); + __reset_control_put_internal(rstc); mutex_unlock(&reset_list_mutex); } EXPORT_SYMBOL_GPL(reset_control_put); @@ -417,8 +428,7 @@ struct reset_control *__devm_reset_control_get(struct device *dev, if (!ptr) return ERR_PTR(-ENOMEM); - rstc = __of_reset_control_get(dev ? 
dev->of_node : NULL, - id, index, shared, optional); + rstc = __reset_control_get(dev, id, index, shared, optional); if (!IS_ERR(rstc)) { *ptr = rstc; devres_add(dev, ptr); diff --git a/include/linux/reset.h b/include/linux/reset.h index 96fb139bdd08..13d8681210d5 100644 --- a/include/linux/reset.h +++ b/include/linux/reset.h @@ -15,6 +15,9 @@ int reset_control_status(struct reset_control *rstc); struct reset_control *__of_reset_control_get(struct device_node *node, const char *id, int index, bool shared, bool optional); +struct reset_control *__reset_control_get(struct device *dev, const char *id, + int index, bool shared, + bool optional); void reset_control_put(struct reset_control *rstc); struct reset_control *__devm_reset_control_get(struct device *dev, const char *id, int index, bool shared, @@ -72,6 +75,13 @@ static inline struct reset_control *__of_reset_control_get( return optional ? NULL : ERR_PTR(-ENOTSUPP); } +static inline struct reset_control *__reset_control_get( + struct device *dev, const char *id, + int index, bool shared, bool optional) +{ + return optional ? NULL : ERR_PTR(-ENOTSUPP); +} + static inline struct reset_control *__devm_reset_control_get( struct device *dev, const char *id, int index, bool shared, bool optional) @@ -102,8 +112,7 @@ __must_check reset_control_get_exclusive(struct device *dev, const char *id) #ifndef CONFIG_RESET_CONTROLLER WARN_ON(1); #endif - return __of_reset_control_get(dev ? dev->of_node : NULL, id, 0, false, - false); + return __reset_control_get(dev, id, 0, false, false); } /** @@ -131,22 +140,19 @@ __must_check reset_control_get_exclusive(struct device *dev, const char *id) static inline struct reset_control *reset_control_get_shared( struct device *dev, const char *id) { - return __of_reset_control_get(dev ? dev->of_node : NULL, id, 0, true, - false); + return __reset_control_get(dev, id, 0, true, false); } static inline struct reset_control *reset_control_get_optional_exclusive( struct device *dev, const char *id) { - return __of_reset_control_get(dev ? dev->of_node : NULL, id, 0, false, - true); + return __reset_control_get(dev, id, 0, false, true); } static inline struct reset_control *reset_control_get_optional_shared( struct device *dev, const char *id) { - return __of_reset_control_get(dev ? dev->of_node : NULL, id, 0, true, - true); + return __reset_control_get(dev, id, 0, true, true); } /** -- cgit v1.2.3 From b2376407f98920c9b0c411948675f58a9640be35 Mon Sep 17 00:00:00 2001 From: Vic Yang Date: Fri, 24 Mar 2017 18:44:01 +0100 Subject: mfd: cros-ec: Fix host command buffer size For SPI, we can get up to 32 additional bytes for response preamble. The current overhead (2 bytes) may cause problems when we try to receive a big response. Update it to 32 bytes. Without this fix we could see a kernel BUG when we receive a big response from the Chrome EC when it is connected via SPI. Signed-off-by: Vic Yang Tested-by: Enric Balletbo i Serra Signed-off-by: Lee Jones --- include/linux/mfd/cros_ec.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mfd/cros_ec.h b/include/linux/mfd/cros_ec.h index 7a01c94496f1..3eef9fb9968a 100644 --- a/include/linux/mfd/cros_ec.h +++ b/include/linux/mfd/cros_ec.h @@ -35,10 +35,11 @@ * Max bus-specific overhead incurred by request/responses. * I2C requires 1 additional byte for requests. * I2C requires 2 additional bytes for responses. + * SPI requires up to 32 additional bytes for responses.
* */ #define EC_PROTO_VERSION_UNKNOWN 0 #define EC_MAX_REQUEST_OVERHEAD 1 -#define EC_MAX_RESPONSE_OVERHEAD 2 +#define EC_MAX_RESPONSE_OVERHEAD 32 /* * Command interface between EC and AP, for LPC, I2C and SPI interfaces. -- cgit v1.2.3 From 6118714275f0a313ecc296a87ed1af32d9691bed Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Thu, 30 Mar 2017 09:16:39 -0700 Subject: pinctrl: core: Fix pinctrl_register_and_init() with pinctrl_enable() Recent pinctrl changes to allow dynamic allocation of pins exposed one more issue with the pinctrl pins claimed early by the controller itself. This caused a regression for IMX6 pinctrl hogs. Before enabling the pin controller driver we need to wait until it has been properly initialized, then claim the hogs, and only then enable it. To fix the regression, split the code into pinctrl_claim_hogs() and pinctrl_enable(), and require that pinctrl_enable() is always called by the pin controller driver when it is ready, after calling pinctrl_register_and_init(). Depends-on: 950b0d91dc10 ("pinctrl: core: Fix regression caused by delayed work for hogs") Fixes: df61b366af26 ("pinctrl: core: Use delayed work for hogs") Fixes: e566fc11ea76 ("pinctrl: imx: use generic pinctrl helpers for managing groups") Cc: Haojian Zhuang Cc: Masahiro Yamada Cc: Mika Penttilä Cc: Mika Westerberg Cc: Nishanth Menon Cc: Shawn Guo Cc: Stefan Agner Tested-by: Geert Uytterhoeven Tested-by: Gary Bisson Tested-by: Fabio Estevam Signed-off-by: Tony Lindgren Signed-off-by: Linus Walleij --- Documentation/pinctrl.txt | 8 ++- drivers/pinctrl/core.c | 97 +++++++++++++++++++++------------ drivers/pinctrl/freescale/pinctrl-imx.c | 2 +- drivers/pinctrl/pinctrl-single.c | 2 +- drivers/pinctrl/sh-pfc/pinctrl.c | 11 +++- drivers/pinctrl/ti/pinctrl-ti-iodelay.c | 2 + include/linux/pinctrl/pinctrl.h | 3 +- 7 files changed, 83 insertions(+), 42 deletions(-) (limited to 'include/linux') diff --git a/Documentation/pinctrl.txt b/Documentation/pinctrl.txt index 54bd5faa8782..f2af35f6d6b2 100644 --- a/Documentation/pinctrl.txt +++ b/Documentation/pinctrl.txt @@ -77,9 +77,15 @@ static struct pinctrl_desc foo_desc = { int __init foo_probe(void) { + int error; + struct pinctrl_dev *pctl; - return pinctrl_register_and_init(&foo_desc, , NULL, &pctl); + error = pinctrl_register_and_init(&foo_desc, , NULL, &pctl); + if (error) + return error; + + return pinctrl_enable(pctl); } To enable the pinctrl subsystem and the subgroups for PINMUX and PINCONF and diff --git a/drivers/pinctrl/core.c b/drivers/pinctrl/core.c index d69046537b75..32822b0d9cd0 100644 --- a/drivers/pinctrl/core.c +++ b/drivers/pinctrl/core.c @@ -2010,29 +2010,57 @@ out_err: return ERR_PTR(ret); } -static int pinctrl_create_and_start(struct pinctrl_dev *pctldev) +static int pinctrl_claim_hogs(struct pinctrl_dev *pctldev) { pctldev->p = create_pinctrl(pctldev->dev, pctldev); - if (!IS_ERR(pctldev->p)) { - kref_get(&pctldev->p->users); - pctldev->hog_default = - pinctrl_lookup_state(pctldev->p, PINCTRL_STATE_DEFAULT); - if (IS_ERR(pctldev->hog_default)) { - dev_dbg(pctldev->dev, - "failed to lookup the default state\n"); - } else { - if (pinctrl_select_state(pctldev->p, - pctldev->hog_default)) - dev_err(pctldev->dev, - "failed to select default state\n"); - } + if (PTR_ERR(pctldev->p) == -ENODEV) { + dev_dbg(pctldev->dev, "no hogs found\n"); - pctldev->hog_sleep = - pinctrl_lookup_state(pctldev->p, - PINCTRL_STATE_SLEEP); - if (IS_ERR(pctldev->hog_sleep))
- dev_dbg(pctldev->dev, - "failed to lookup the sleep state\n"); + return 0; + } + + if (IS_ERR(pctldev->p)) { + dev_err(pctldev->dev, "error claiming hogs: %li\n", + PTR_ERR(pctldev->p)); + + return PTR_ERR(pctldev->p); + } + + kref_get(&pctldev->p->users); + pctldev->hog_default = + pinctrl_lookup_state(pctldev->p, PINCTRL_STATE_DEFAULT); + if (IS_ERR(pctldev->hog_default)) { + dev_dbg(pctldev->dev, + "failed to lookup the default state\n"); + } else { + if (pinctrl_select_state(pctldev->p, + pctldev->hog_default)) + dev_err(pctldev->dev, + "failed to select default state\n"); + } + + pctldev->hog_sleep = + pinctrl_lookup_state(pctldev->p, + PINCTRL_STATE_SLEEP); + if (IS_ERR(pctldev->hog_sleep)) + dev_dbg(pctldev->dev, + "failed to lookup the sleep state\n"); + + return 0; +} + +int pinctrl_enable(struct pinctrl_dev *pctldev) +{ + int error; + + error = pinctrl_claim_hogs(pctldev); + if (error) { + dev_err(pctldev->dev, "could not claim hogs: %i\n", + error); + mutex_destroy(&pctldev->mutex); + kfree(pctldev); + + return error; } mutex_lock(&pinctrldev_list_mutex); @@ -2043,6 +2071,7 @@ static int pinctrl_create_and_start(struct pinctrl_dev *pctldev) return 0; } +EXPORT_SYMBOL_GPL(pinctrl_enable); /** * pinctrl_register() - register a pin controller device @@ -2065,25 +2094,30 @@ struct pinctrl_dev *pinctrl_register(struct pinctrl_desc *pctldesc, if (IS_ERR(pctldev)) return pctldev; - error = pinctrl_create_and_start(pctldev); - if (error) { - mutex_destroy(&pctldev->mutex); - kfree(pctldev); - + error = pinctrl_enable(pctldev); + if (error) return ERR_PTR(error); - } return pctldev; } EXPORT_SYMBOL_GPL(pinctrl_register); +/** + * pinctrl_register_and_init() - register and init pin controller device + * @pctldesc: descriptor for this pin controller + * @dev: parent device for this pin controller + * @driver_data: private pin controller data for this pin controller + * @pctldev: pin controller device + * + * Note that pinctrl_enable() still needs to be manually called after + * this once the driver is ready. 
+ */ int pinctrl_register_and_init(struct pinctrl_desc *pctldesc, struct device *dev, void *driver_data, struct pinctrl_dev **pctldev) { struct pinctrl_dev *p; - int error; p = pinctrl_init_controller(pctldesc, dev, driver_data); if (IS_ERR(p)) @@ -2097,15 +2131,6 @@ int pinctrl_register_and_init(struct pinctrl_desc *pctldesc, */ *pctldev = p; - error = pinctrl_create_and_start(p); - if (error) { - mutex_destroy(&p->mutex); - kfree(p); - *pctldev = NULL; - - return error; - } - return 0; } EXPORT_SYMBOL_GPL(pinctrl_register_and_init); diff --git a/drivers/pinctrl/freescale/pinctrl-imx.c b/drivers/pinctrl/freescale/pinctrl-imx.c index a7ace9e1ad81..74bd90dfd7b1 100644 --- a/drivers/pinctrl/freescale/pinctrl-imx.c +++ b/drivers/pinctrl/freescale/pinctrl-imx.c @@ -790,7 +790,7 @@ int imx_pinctrl_probe(struct platform_device *pdev, dev_info(&pdev->dev, "initialized IMX pinctrl driver\n"); - return 0; + return pinctrl_enable(ipctl->pctl); free: imx_free_resources(ipctl); diff --git a/drivers/pinctrl/pinctrl-single.c b/drivers/pinctrl/pinctrl-single.c index 8b2d45e85bae..9c267dcda094 100644 --- a/drivers/pinctrl/pinctrl-single.c +++ b/drivers/pinctrl/pinctrl-single.c @@ -1781,7 +1781,7 @@ static int pcs_probe(struct platform_device *pdev) dev_info(pcs->dev, "%i pins at pa %p size %u\n", pcs->desc.npins, pcs->base, pcs->size); - return 0; + return pinctrl_enable(pcs->pctl); free: pcs_free_resources(pcs); diff --git a/drivers/pinctrl/sh-pfc/pinctrl.c b/drivers/pinctrl/sh-pfc/pinctrl.c index 08150a321be6..a70157f0acf4 100644 --- a/drivers/pinctrl/sh-pfc/pinctrl.c +++ b/drivers/pinctrl/sh-pfc/pinctrl.c @@ -816,6 +816,13 @@ int sh_pfc_register_pinctrl(struct sh_pfc *pfc) pmx->pctl_desc.pins = pmx->pins; pmx->pctl_desc.npins = pfc->info->nr_pins; - return devm_pinctrl_register_and_init(pfc->dev, &pmx->pctl_desc, pmx, - &pmx->pctl); + ret = devm_pinctrl_register_and_init(pfc->dev, &pmx->pctl_desc, pmx, + &pmx->pctl); + if (ret) { + dev_err(pfc->dev, "could not register: %i\n", ret); + + return ret; + } + + return pinctrl_enable(pmx->pctl); } diff --git a/drivers/pinctrl/ti/pinctrl-ti-iodelay.c b/drivers/pinctrl/ti/pinctrl-ti-iodelay.c index 717e3404900c..362c50918c13 100644 --- a/drivers/pinctrl/ti/pinctrl-ti-iodelay.c +++ b/drivers/pinctrl/ti/pinctrl-ti-iodelay.c @@ -893,6 +893,8 @@ static int ti_iodelay_probe(struct platform_device *pdev) platform_set_drvdata(pdev, iod); + return pinctrl_enable(iod->pctl); + exit_out: of_node_put(np); return ret; diff --git a/include/linux/pinctrl/pinctrl.h b/include/linux/pinctrl/pinctrl.h index 8ce2d87a238b..5e45385c5bdc 100644 --- a/include/linux/pinctrl/pinctrl.h +++ b/include/linux/pinctrl/pinctrl.h @@ -145,8 +145,9 @@ struct pinctrl_desc { extern int pinctrl_register_and_init(struct pinctrl_desc *pctldesc, struct device *dev, void *driver_data, struct pinctrl_dev **pctldev); +extern int pinctrl_enable(struct pinctrl_dev *pctldev); -/* Please use pinctrl_register_and_init() instead */ +/* Please use pinctrl_register_and_init() and pinctrl_enable() instead */ extern struct pinctrl_dev *pinctrl_register(struct pinctrl_desc *pctldesc, struct device *dev, void *driver_data); -- cgit v1.2.3 From 404123c2db798027e852480ed9c4accef9f1d9e6 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 29 Mar 2017 19:06:20 +0300 Subject: virtio: allow drivers to validate features Some drivers can't support all features in all configurations. At the moment we blindly set FEATURES_OK and later FAILED. 
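A hedged sketch of the kind of early check a driver needs to be able to make (driver name and feature bit are hypothetical; the hook itself is introduced by the diff below):

	/* Runs before the core finalizes features; a driver can veto the
	 * device or drop a feature this configuration cannot support. */
	static int foo_validate(struct virtio_device *vdev)
	{
		if (virtio_has_feature(vdev, VIRTIO_FOO_F_BAR))
			__virtio_clear_bit(vdev, VIRTIO_FOO_F_BAR);
		return 0;	/* non-zero would fail the probe early */
	}

	static struct virtio_driver foo_driver = {
		.validate = foo_validate,
		/* .probe, .remove, .id_table, ... */
	};
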
From 404123c2db798027e852480ed9c4accef9f1d9e6 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 29 Mar 2017 19:06:20 +0300 Subject: virtio: allow drivers to validate features Some drivers can't support all features in all configurations. At the moment we blindly set FEATURES_OK and later FAILED. Support this better by adding a callback drivers can use to do some early checks. Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio.c | 6 ++++++ include/linux/virtio.h | 1 + 2 files changed, 7 insertions(+) (limited to 'include/linux') diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c index 400d70b69379..48230a5e12f2 100644 --- a/drivers/virtio/virtio.c +++ b/drivers/virtio/virtio.c @@ -232,6 +232,12 @@ static int virtio_dev_probe(struct device *_d) if (device_features & (1ULL << i)) __virtio_set_bit(dev, i); + if (drv->validate) { + err = drv->validate(dev); + if (err) + goto err; + } + err = virtio_finalize_features(dev); if (err) goto err; diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 04b0d3f95043..7edfbdb55a99 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -167,6 +167,7 @@ struct virtio_driver { unsigned int feature_table_size; const unsigned int *feature_table_legacy; unsigned int feature_table_size_legacy; + int (*validate)(struct virtio_device *dev); int (*probe)(struct virtio_device *dev); void (*scan)(struct virtio_device *dev); void (*remove)(struct virtio_device *dev); -- cgit v1.2.3
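For a driver, the hook is a single new callback in struct virtio_driver. A sketch with a hypothetical foo driver and an invented feature constraint (only the .validate member and its call site before FEATURES_OK are from this patch):

#include <linux/module.h>
#include <linux/virtio.h>
#include <linux/virtio_config.h>
#include <linux/virtio_ring.h>

static int foo_validate(struct virtio_device *vdev)
{
	/*
	 * Invented constraint, for illustration only: pretend this
	 * driver cannot use the modern interface without event
	 * indexes, so reject the configuration before the core
	 * writes FEATURES_OK.
	 */
	if (virtio_has_feature(vdev, VIRTIO_F_VERSION_1) &&
	    !virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX))
		return -EINVAL;

	return 0;
}

static const struct virtio_device_id foo_id_table[] = {
	{ 42, VIRTIO_DEV_ANY_ID },	/* made-up device ID */
	{ 0 },
};

static struct virtio_driver foo_driver = {
	.driver.name	= "foo",
	.driver.owner	= THIS_MODULE,
	.id_table	= foo_id_table,
	.validate	= foo_validate,	/* new callback from this patch */
	.probe		= foo_probe,	/* hypothetical */
	.remove		= foo_remove,	/* hypothetical */
};

Because validate() runs after the feature bits have been copied into the device but before virtio_finalize_features(), a callback may also clear a bit it cannot support with __virtio_clear_bit() rather than failing the probe outright.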
From 54d5329d425650fafaf90660a139c771d2d49cae Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Fri, 7 Apr 2017 08:52:27 -0600 Subject: blk-mq-sched: fix crash in switch error path In elevator_switch(), if blk_mq_init_sched() fails, we attempt to fall back to the original scheduler. However, at this point, we've already torn down the original scheduler's tags, so this causes a crash. Doing the fallback like the legacy elevator path is much harder for mq, so fix it by just falling back to none, instead. Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 13 +++++-- block/blk-mq-sched.h | 2 +- block/blk-mq.c | 2 -- block/blk-sysfs.c | 2 +- block/elevator.c | 94 +++++++++++++++++++++++++++--------------------- include/linux/elevator.h | 2 +- 6 files changed, 67 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 0bb13bb51daa..e8c2ed654ef0 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -451,7 +451,7 @@ static int blk_mq_sched_alloc_tags(struct request_queue *q, return ret; } -void blk_mq_sched_teardown(struct request_queue *q) +static void blk_mq_sched_tags_teardown(struct request_queue *q) { struct blk_mq_tag_set *set = q->tag_set; struct blk_mq_hw_ctx *hctx; @@ -513,10 +513,19 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) return 0; err: - blk_mq_sched_teardown(q); + blk_mq_sched_tags_teardown(q); + q->elevator = NULL; return ret; } +void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e) +{ + if (e->type->ops.mq.exit_sched) + e->type->ops.mq.exit_sched(e); + blk_mq_sched_tags_teardown(q); + q->elevator = NULL; +} + int blk_mq_sched_init(struct request_queue *q) { int ret; diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 19db25e0c95a..e704956e0862 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -33,7 +33,7 @@ void blk_mq_sched_move_to_dispatch(struct blk_mq_hw_ctx *hctx, struct request *(*get_rq)(struct blk_mq_hw_ctx *)); int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e); -void blk_mq_sched_teardown(struct request_queue *q); +void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e); int blk_mq_sched_init_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx); diff --git a/block/blk-mq.c b/block/blk-mq.c index 72e744cd638c..cfb7c97b14ec 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2240,8 +2240,6 @@ void blk_mq_release(struct request_queue *q) struct blk_mq_hw_ctx *hctx; unsigned int i; - blk_mq_sched_teardown(q); - /* hctx kobj stays in hctx */ queue_for_each_hw_ctx(q, hctx, i) { if (!hctx) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index c44b321335f3..37f0b3ad635e 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -816,7 +816,7 @@ static void blk_release_queue(struct kobject *kobj) if (q->elevator) { ioc_clear_queue(q); - elevator_exit(q->elevator); + elevator_exit(q, q->elevator); } blk_exit_rl(&q->root_rl); diff --git a/block/elevator.c b/block/elevator.c index f236ef1d2be9..dbeecf7be719 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -252,11 +252,11 @@ int elevator_init(struct request_queue *q, char *name) } EXPORT_SYMBOL(elevator_init); -void elevator_exit(struct elevator_queue *e) +void elevator_exit(struct request_queue *q, struct elevator_queue *e) { mutex_lock(&e->sysfs_lock); if (e->uses_mq && e->type->ops.mq.exit_sched) - e->type->ops.mq.exit_sched(e); + blk_mq_exit_sched(q, e); else if (!e->uses_mq && e->type->ops.sq.elevator_exit_fn) e->type->ops.sq.elevator_exit_fn(e); mutex_unlock(&e->sysfs_lock); @@ -941,6 +941,45 @@ void elv_unregister(struct elevator_type *e) } EXPORT_SYMBOL_GPL(elv_unregister); +static int elevator_switch_mq(struct request_queue *q, + struct elevator_type *new_e) +{ + int ret; + + blk_mq_freeze_queue(q); + blk_mq_quiesce_queue(q); + + if (q->elevator) { + if (q->elevator->registered) + elv_unregister_queue(q); + ioc_clear_queue(q); + elevator_exit(q, q->elevator); + } + + ret = blk_mq_init_sched(q, new_e); + if (ret) + goto out; + + if (new_e) { + ret = elv_register_queue(q); + if (ret) { + elevator_exit(q, q->elevator); + goto out; + } + } + + if (new_e) + blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name); + else + blk_add_trace_msg(q, "elv switch: none"); + +out: + blk_mq_unfreeze_queue(q); + blk_mq_start_stopped_hw_queues(q, true); + return ret; + +} + /* * switch to new_e io scheduler. be careful not to introduce deadlocks - * we don't free the old io scheduler, before we have allocated what we @@ -953,10 +992,8 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) bool old_registered = false; int err; - if (q->mq_ops) { - blk_mq_freeze_queue(q); - blk_mq_quiesce_queue(q); - } + if (q->mq_ops) + return elevator_switch_mq(q, new_e); /* * Turn on BYPASS and drain all requests w/ elevator private data.
@@ -968,11 +1005,7 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) if (old) { old_registered = old->registered; - if (old->uses_mq) - blk_mq_sched_teardown(q); - - if (!q->mq_ops) - blk_queue_bypass_start(q); + blk_queue_bypass_start(q); /* unregister and clear all auxiliary data of the old elevator */ if (old_registered) @@ -982,53 +1015,32 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) } /* allocate, init and register new elevator */ - if (q->mq_ops) - err = blk_mq_init_sched(q, new_e); - else - err = new_e->ops.sq.elevator_init_fn(q, new_e); + err = new_e->ops.sq.elevator_init_fn(q, new_e); if (err) goto fail_init; - if (new_e) { - err = elv_register_queue(q); - if (err) - goto fail_register; - } + err = elv_register_queue(q); + if (err) + goto fail_register; /* done, kill the old one and finish */ if (old) { - elevator_exit(old); - if (!q->mq_ops) - blk_queue_bypass_end(q); + elevator_exit(q, old); + blk_queue_bypass_end(q); } - if (q->mq_ops) { - blk_mq_unfreeze_queue(q); - blk_mq_start_stopped_hw_queues(q, true); - } - - if (new_e) - blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name); - else - blk_add_trace_msg(q, "elv switch: none"); + blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name); return 0; fail_register: - if (q->mq_ops) - blk_mq_sched_teardown(q); - elevator_exit(q->elevator); + elevator_exit(q, q->elevator); fail_init: /* switch failed, restore and re-register old elevator */ if (old) { q->elevator = old; elv_register_queue(q); - if (!q->mq_ops) - blk_queue_bypass_end(q); - } - if (q->mq_ops) { - blk_mq_unfreeze_queue(q); - blk_mq_start_stopped_hw_queues(q, true); + blk_queue_bypass_end(q); } return err; diff --git a/include/linux/elevator.h b/include/linux/elevator.h index aebecc4ed088..22d39e8d4de1 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -211,7 +211,7 @@ extern ssize_t elv_iosched_show(struct request_queue *, char *); extern ssize_t elv_iosched_store(struct request_queue *, const char *, size_t); extern int elevator_init(struct request_queue *, char *); -extern void elevator_exit(struct elevator_queue *); +extern void elevator_exit(struct request_queue *, struct elevator_queue *); extern int elevator_change(struct request_queue *, const char *); extern bool elv_bio_merge_ok(struct request *, struct bio *); extern struct elevator_queue *elevator_alloc(struct request_queue *, -- cgit v1.2.3 From 7587a5ae7eef0439f7be31f1b5959af062bbc5ec Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 7 Apr 2017 11:16:52 -0700 Subject: blk-mq: Introduce blk_mq_delay_run_hw_queue() Introduce a function that runs a hardware queue unconditionally after a delay. Note: there is already a function that stops and restarts a hardware queue after a delay, namely blk_mq_delay_queue(). This function will be used in the next patch in this series. Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Long Li Cc: K. Y. 
Srinivasan Signed-off-by: Jens Axboe --- block/blk-mq.c | 32 ++++++++++++++++++++++++++++++-- include/linux/blk-mq.h | 2 ++ 2 files changed, 32 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq.c b/block/blk-mq.c index f1be5c3eb600..ad45ae743186 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1135,7 +1135,8 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) return hctx->next_cpu; } -void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) +static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async, + unsigned long msecs) { if (unlikely(blk_mq_hctx_stopped(hctx) || !blk_mq_hw_queue_mapped(hctx))) @@ -1152,7 +1153,24 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) put_cpu(); } - kblockd_schedule_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work); + if (msecs == 0) + kblockd_schedule_work_on(blk_mq_hctx_next_cpu(hctx), + &hctx->run_work); + else + kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx), + &hctx->delayed_run_work, + msecs_to_jiffies(msecs)); +} + +void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) +{ + __blk_mq_delay_run_hw_queue(hctx, true, msecs); +} +EXPORT_SYMBOL(blk_mq_delay_run_hw_queue); + +void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) +{ + __blk_mq_delay_run_hw_queue(hctx, async, 0); } void blk_mq_run_hw_queues(struct request_queue *q, bool async) @@ -1255,6 +1273,15 @@ static void blk_mq_run_work_fn(struct work_struct *work) __blk_mq_run_hw_queue(hctx); } +static void blk_mq_delayed_run_work_fn(struct work_struct *work) +{ + struct blk_mq_hw_ctx *hctx; + + hctx = container_of(work, struct blk_mq_hw_ctx, delayed_run_work.work); + + __blk_mq_run_hw_queue(hctx); +} + static void blk_mq_delay_work_fn(struct work_struct *work) { struct blk_mq_hw_ctx *hctx; @@ -1962,6 +1989,7 @@ static int blk_mq_init_hctx(struct request_queue *q, node = hctx->numa_node = set->numa_node; INIT_WORK(&hctx->run_work, blk_mq_run_work_fn); + INIT_DELAYED_WORK(&hctx->delayed_run_work, blk_mq_delayed_run_work_fn); INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn); spin_lock_init(&hctx->lock); INIT_LIST_HEAD(&hctx->dispatch); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index b296a9006117..9382c5da7a2e 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -51,6 +51,7 @@ struct blk_mq_hw_ctx { atomic_t nr_active; + struct delayed_work delayed_run_work; struct delayed_work delay_work; struct hlist_node cpuhp_dead; @@ -238,6 +239,7 @@ void blk_mq_stop_hw_queues(struct request_queue *q); void blk_mq_start_hw_queues(struct request_queue *q); void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); +void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); void blk_mq_run_hw_queues(struct request_queue *q, bool async); void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, -- cgit v1.2.3
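A plausible caller, sketched with invented foo_* helpers (the real consumer arrives in the next patch of this series): a .queue_rq implementation that asks for the queue to be re-run later when the device is temporarily out of resources, instead of polling or stopping the queue.

#include <linux/blk-mq.h>

static int foo_queue_rq(struct blk_mq_hw_ctx *hctx,
			const struct blk_mq_queue_data *bd)
{
	struct foo_dev *fdev = hctx->queue->queuedata;	/* hypothetical */

	if (!foo_dev_has_resources(fdev)) {		/* hypothetical */
		/* Re-run this hardware queue in 100 ms. */
		blk_mq_delay_run_hw_queue(hctx, 100);
		return BLK_MQ_RQ_QUEUE_BUSY;
	}

	return foo_dispatch(fdev, bd->rq);		/* hypothetical */
}

Unlike blk_mq_delay_queue(), mentioned in the note above, this does not stop the queue for the duration of the delay, so a completion arriving in the meantime can still run it earlier.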
From 6d8c6c0f97ad8a3517c42b179c1dc8e77397d0e2 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 7 Apr 2017 12:40:09 -0600 Subject: blk-mq: Restart a single queue if tag sets are shared To improve scalability, if hardware queues are shared, restart a single hardware queue in round-robin fashion. Rename blk_mq_sched_restart_queues() to reflect the new semantics. Remove blk_mq_sched_mark_restart_queue() because this function has no callers. Remove flag QUEUE_FLAG_RESTART because this patch removes the code that uses this flag. Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 63 ++++++++++++++++++++++++++++++++++++++++++-------- block/blk-mq-sched.h | 16 +------------ block/blk-mq.c | 2 +- include/linux/blkdev.h | 1 - 4 files changed, 55 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index e8c2ed654ef0..c974a1bbf4cb 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -318,25 +318,68 @@ static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, return true; } -static void blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) +static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) { if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) { clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); - if (blk_mq_hctx_has_pending(hctx)) + if (blk_mq_hctx_has_pending(hctx)) { blk_mq_run_hw_queue(hctx, true); + return true; + } } + return false; } -void blk_mq_sched_restart_queues(struct blk_mq_hw_ctx *hctx) -{ - struct request_queue *q = hctx->queue; - unsigned int i; +/** + * list_for_each_entry_rcu_rr - iterate in a round-robin fashion over rcu list + * @pos: loop cursor. + * @skip: the list element that will not be examined. Iteration starts at + * @skip->next. + * @head: head of the list to examine. This list must have at least one + * element, namely @skip. + * @member: name of the list_head structure within typeof(*pos). + */ +#define list_for_each_entry_rcu_rr(pos, skip, head, member) \ + for ((pos) = (skip); \ + (pos = (pos)->member.next != (head) ? list_entry_rcu( \ + (pos)->member.next, typeof(*pos), member) : \ + list_entry_rcu((pos)->member.next->next, typeof(*pos), member)), \ + (pos) != (skip); ) - if (test_bit(QUEUE_FLAG_RESTART, &q->queue_flags)) { - if (test_and_clear_bit(QUEUE_FLAG_RESTART, &q->queue_flags)) { - queue_for_each_hw_ctx(q, hctx, i) - blk_mq_sched_restart_hctx(hctx); +/* + * Called after a driver tag has been freed to check whether a hctx needs to + * be restarted. Restarts @hctx if its tag set is not shared. Restarts hardware + * queues in a round-robin fashion if the tag set of @hctx is shared with other + * hardware queues.
+ */ +void blk_mq_sched_restart(struct blk_mq_hw_ctx *const hctx) +{ + struct blk_mq_tags *const tags = hctx->tags; + struct blk_mq_tag_set *const set = hctx->queue->tag_set; + struct request_queue *const queue = hctx->queue, *q; + struct blk_mq_hw_ctx *hctx2; + unsigned int i, j; + + if (set->flags & BLK_MQ_F_TAG_SHARED) { + rcu_read_lock(); + list_for_each_entry_rcu_rr(q, queue, &set->tag_list, + tag_set_list) { + queue_for_each_hw_ctx(q, hctx2, i) + if (hctx2->tags == tags && + blk_mq_sched_restart_hctx(hctx2)) + goto done; + } + j = hctx->queue_num + 1; + for (i = 0; i < queue->nr_hw_queues; i++, j++) { + if (j == queue->nr_hw_queues) + j = 0; + hctx2 = queue->queue_hw_ctx[j]; + if (hctx2->tags == tags && + blk_mq_sched_restart_hctx(hctx2)) + break; } +done: + rcu_read_unlock(); } else { blk_mq_sched_restart_hctx(hctx); } diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index e704956e0862..3a9e6e40558b 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -19,7 +19,7 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, struct request **merged_request); bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio); bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq); -void blk_mq_sched_restart_queues(struct blk_mq_hw_ctx *hctx); +void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx); void blk_mq_sched_insert_request(struct request *rq, bool at_head, bool run_queue, bool async, bool can_block); @@ -136,20 +136,6 @@ static inline void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx) set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); } -/* - * Mark a hardware queue and the request queue it belongs to as needing a - * restart. - */ -static inline void blk_mq_sched_mark_restart_queue(struct blk_mq_hw_ctx *hctx) -{ - struct request_queue *q = hctx->queue; - - if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) - set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); - if (!test_bit(QUEUE_FLAG_RESTART, &q->queue_flags)) - set_bit(QUEUE_FLAG_RESTART, &q->queue_flags); -} - static inline bool blk_mq_sched_needs_restart(struct blk_mq_hw_ctx *hctx) { return test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); } diff --git a/block/blk-mq.c b/block/blk-mq.c index ad45ae743186..572966f49596 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -348,7 +348,7 @@ void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag); if (sched_tag != -1) blk_mq_sched_completed_request(hctx, rq); - blk_mq_sched_restart_queues(hctx); + blk_mq_sched_restart(hctx); blk_queue_exit(q); } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5a7da607ca04..7548f332121a 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -610,7 +610,6 @@ struct request_queue { #define QUEUE_FLAG_FLUSH_NQ 25 /* flush not queueuable */ #define QUEUE_FLAG_DAX 26 /* device supports DAX */ #define QUEUE_FLAG_STATS 27 /* track rq completion times */ -#define QUEUE_FLAG_RESTART 28 /* queue needs restart at completion */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_STACKABLE) | \ -- cgit v1.2.3
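The subtle part above is the traversal order of list_for_each_entry_rcu_rr(): iteration starts at @skip->next, steps over the list head, and terminates before revisiting @skip, so for a tag_list of A -> B -> C with @skip == B the walk sees C, then A. A hedged sketch of a consumer (foo_* is invented, and the macro is local to blk-mq-sched.c, so this would only compile in that file):

static unsigned int foo_count_peer_queues(struct blk_mq_hw_ctx *hctx)
{
	struct blk_mq_tag_set *set = hctx->queue->tag_set;
	struct request_queue *q;
	unsigned int n = 0;

	rcu_read_lock();
	list_for_each_entry_rcu_rr(q, hctx->queue, &set->tag_list,
				   tag_set_list)
		n++;	/* visits every queue sharing @set except our own */
	rcu_read_unlock();

	return n;
}

Starting the inner j loop at queue_num + 1 applies the same rotation to the hardware queues of the hctx's own request queue, so consecutive completions hand the restart to different queues instead of always favouring the head of tag_list.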
From c0c379e2931b05facef538e53bf3b21f283d9a0b Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 13 Apr 2017 14:56:23 -0700 Subject: mm: drop unused pmdp_huge_get_and_clear_notify() Dave noticed that after fixing the MADV_DONTNEED vs numa balancing race, the last pmdp_huge_get_and_clear_notify() user is gone. Let's drop the helper. Link: http://lkml.kernel.org/r/20170306112047.24809-1-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Cc: Dave Hansen Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmu_notifier.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 51891fb0d3ce..c91b3bcd158f 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -394,18 +394,6 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) ___pud; \ }) -#define pmdp_huge_get_and_clear_notify(__mm, __haddr, __pmd) \ -({ \ - unsigned long ___haddr = __haddr & HPAGE_PMD_MASK; \ - pmd_t ___pmd; \ - \ - ___pmd = pmdp_huge_get_and_clear(__mm, __haddr, __pmd); \ - mmu_notifier_invalidate_range(__mm, ___haddr, \ - ___haddr + HPAGE_PMD_SIZE); \ - \ - ___pmd; \ -}) - /* * set_pte_at_notify() sets the pte _after_ running the notifier. * This is safe to start by updating the secondary MMUs, because the primary MMU @@ -489,7 +477,6 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) #define ptep_clear_flush_notify ptep_clear_flush #define pmdp_huge_clear_flush_notify pmdp_huge_clear_flush #define pudp_huge_clear_flush_notify pudp_huge_clear_flush -#define pmdp_huge_get_and_clear_notify pmdp_huge_get_and_clear #define set_pte_at_notify set_pte_at #endif /* CONFIG_MMU_NOTIFIER */ -- cgit v1.2.3 From 5a8d75a1b8c99bdc926ba69b7b7dbe4fae81a5af Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 14 Apr 2017 13:58:29 -0600 Subject: block: fix bio_will_gap() for first bvec with offset Commit 729204ef49ec ("block: relax check on sg gap") allows us to merge bios if both are physically contiguous. This change can merge a huge number of small bios; with mkfs, for example, mkfs.ntfs running time can be decreased to ~1/10. But if one rq starts with a non-aligned buffer (the 1st bvec's bv_offset is non-zero) and we allow the merge, it is quite difficult to respect the sg gap limit, especially the max segment size, or we risk having an unaligned virtual boundary. This patch tries to avoid the issue by disallowing a merge if the req starts with an unaligned buffer. Also add comments to explain why the merged segment can't end in an unaligned virt boundary. Fixes: 729204ef49ec ("block: relax check on sg gap") Tested-by: Johannes Thumshirn Reviewed-by: Johannes Thumshirn Signed-off-by: Ming Lei Rewrote parts of the commit message and comments. Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 7548f332121a..01a696b0a4d3 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1672,12 +1672,36 @@ static inline bool bios_segs_mergeable(struct request_queue *q, return true; } -static inline bool bio_will_gap(struct request_queue *q, struct bio *prev, - struct bio *next) +static inline bool bio_will_gap(struct request_queue *q, + struct request *prev_rq, + struct bio *prev, + struct bio *next) { if (bio_has_data(prev) && queue_virt_boundary(q)) { struct bio_vec pb, nb; + /* + * don't merge if the 1st bio starts with non-zero + * offset, otherwise it is quite difficult to respect + * sg gap limit. We work hard to merge a huge number of small + * single bios in case of mkfs.
+ */ + if (prev_rq) + bio_get_first_bvec(prev_rq->bio, &pb); + else + bio_get_first_bvec(prev, &pb); + if (pb.bv_offset) + return true; + + /* + * We don't need to worry about the situation that the + * merged segment ends in unaligned virt boundary: + * + * - if 'pb' ends aligned, the merged segment ends aligned + * - if 'pb' ends unaligned, the next bio must include + * one single bvec of 'nb', otherwise the 'nb' can't + * merge with 'pb' + */ bio_get_last_bvec(prev, &pb); bio_get_first_bvec(next, &nb); @@ -1690,12 +1714,12 @@ static inline bool bio_will_gap(struct request_queue *q, struct bio *prev, static inline bool req_gap_back_merge(struct request *req, struct bio *bio) { - return bio_will_gap(req->q, req->biotail, bio); + return bio_will_gap(req->q, req, req->biotail, bio); } static inline bool req_gap_front_merge(struct request *req, struct bio *bio) { - return bio_will_gap(req->q, bio, req->bio); + return bio_will_gap(req->q, NULL, bio, req->bio); } int kblockd_schedule_work(struct work_struct *work); -- cgit v1.2.3
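The virt-boundary rule that the new comments describe can be modelled outside the kernel. A stand-alone sketch (not kernel code; the mask plays the role of queue_virt_boundary(q), e.g. 0xfff for a 4 KiB boundary):

#include <stdbool.h>
#include <stdint.h>

/* Two adjacent bio_vecs leave a gap when the end of the previous one
 * or the start of the next one is not aligned to the boundary mask. */
static bool gap_between(uint32_t prev_off, uint32_t prev_len,
			uint32_t next_off, uint32_t mask)
{
	return (((prev_off + prev_len) | next_off) & mask) != 0;
}

/*
 * gap_between(0, 4096, 0, 0xfff)   -> false: both edges aligned.
 * gap_between(0, 2048, 0, 0xfff)   -> true:  prev ends mid-page.
 * gap_between(512, 3584, 0, 0xfff) -> false: the pair itself is fine,
 *	but the merged request now *starts* at offset 512, which is
 *	exactly the case the new pb.bv_offset check rejects up front.
 */

The last case is why bio_will_gap() now looks at the first bvec of the whole request (or bio) rather than only at the two bvecs adjacent to the merge point.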