Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile            |   1
-rw-r--r--  kernel/cgroup.c            | 110
-rw-r--r--  kernel/cpu_pm.c            |   2
-rw-r--r--  kernel/fork.c              |   4
-rw-r--r--  kernel/irq/chip.c          |  33
-rw-r--r--  kernel/irq/handle.c        |   4
-rw-r--r--  kernel/irq/internals.h     |   2
-rw-r--r--  kernel/irq/irqdesc.c       |  20
-rw-r--r--  kernel/irq/irqdomain.c     |   1
-rw-r--r--  kernel/irq/manage.c        |  12
-rw-r--r--  kernel/irq/proc.c          |   2
-rw-r--r--  kernel/irq/resend.c        |   2
-rw-r--r--  kernel/locking/qspinlock.c |   2
-rw-r--r--  kernel/membarrier.c        |  66
-rw-r--r--  kernel/sched/core.c        |  51
-rw-r--r--  kernel/sched/wait.c        |   7
-rw-r--r--  kernel/sys_ni.c            |   3
-rw-r--r--  kernel/time/clockevents.c  |  42
-rw-r--r--  kernel/time/tick-common.c  |   1
-rw-r--r--  kernel/time/tick-sched.c   |  15
-rw-r--r--  kernel/time/timekeeping.c  |   2
-rw-r--r--  kernel/time/timer_list.c   |  54
22 files changed, 277 insertions(+), 159 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile index d4988410b410..53abf008ecb3 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -100,6 +100,7 @@ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o obj-$(CONFIG_TORTURE_TEST) += torture.o +obj-$(CONFIG_MEMBARRIER) += membarrier.o obj-$(CONFIG_HAS_IOMEM) += memremap.o diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2cf0f79f1fc9..2c9eae6ad970 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -46,7 +46,6 @@ #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/rwsem.h> -#include <linux/percpu-rwsem.h> #include <linux/string.h> #include <linux/sort.h> #include <linux/kmod.h> @@ -104,8 +103,6 @@ static DEFINE_SPINLOCK(cgroup_idr_lock); */ static DEFINE_SPINLOCK(release_agent_path_lock); -struct percpu_rw_semaphore cgroup_threadgroup_rwsem; - #define cgroup_assert_mutex_or_rcu_locked() \ RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ !lockdep_is_held(&cgroup_mutex), \ @@ -874,6 +871,48 @@ static struct css_set *find_css_set(struct css_set *old_cset, return cset; } +void cgroup_threadgroup_change_begin(struct task_struct *tsk) +{ + down_read(&tsk->signal->group_rwsem); +} + +void cgroup_threadgroup_change_end(struct task_struct *tsk) +{ + up_read(&tsk->signal->group_rwsem); +} + +/** + * threadgroup_lock - lock threadgroup + * @tsk: member task of the threadgroup to lock + * + * Lock the threadgroup @tsk belongs to. No new task is allowed to enter + * and member tasks aren't allowed to exit (as indicated by PF_EXITING) or + * change ->group_leader/pid. This is useful for cases where the threadgroup + * needs to stay stable across blockable operations. + * + * fork and exit explicitly call threadgroup_change_{begin|end}() for + * synchronization. While held, no new task will be added to threadgroup + * and no existing live task will have its PF_EXITING set. + * + * de_thread() does threadgroup_change_{begin|end}() when a non-leader + * sub-thread becomes a new leader. + */ +static void threadgroup_lock(struct task_struct *tsk) +{ + down_write(&tsk->signal->group_rwsem); +} + +/** + * threadgroup_unlock - unlock threadgroup + * @tsk: member task of the threadgroup to unlock + * + * Reverse threadgroup_lock(). + */ +static inline void threadgroup_unlock(struct task_struct *tsk) +{ + up_write(&tsk->signal->group_rwsem); +} + static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root) { struct cgroup *root_cgrp = kf_root->kn->priv; @@ -2074,9 +2113,9 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, lockdep_assert_held(&css_set_rwsem); /* - * We are synchronized through cgroup_threadgroup_rwsem against - * PF_EXITING setting such that we can't race against cgroup_exit() - * changing the css_set to init_css_set and dropping the old one. + * We are synchronized through threadgroup_lock() against PF_EXITING + * setting such that we can't race against cgroup_exit() changing the + * css_set to init_css_set and dropping the old one. */ WARN_ON_ONCE(tsk->flags & PF_EXITING); old_cset = task_css_set(tsk); @@ -2133,11 +2172,10 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets) * @src_cset and add it to @preloaded_csets, which should later be cleaned * up by cgroup_migrate_finish(). * - * This function may be called without holding cgroup_threadgroup_rwsem - * even if the target is a process. 
Threads may be created and destroyed - * but as long as cgroup_mutex is not dropped, no new css_set can be put - * into play and the preloaded css_sets are guaranteed to cover all - * migrations. + * This function may be called without holding threadgroup_lock even if the + * target is a process. Threads may be created and destroyed but as long + * as cgroup_mutex is not dropped, no new css_set can be put into play and + * the preloaded css_sets are guaranteed to cover all migrations. */ static void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp, @@ -2240,7 +2278,7 @@ err: * @threadgroup: whether @leader points to the whole process or a single task * * Migrate a process or task denoted by @leader to @cgrp. If migrating a - * process, the caller must be holding cgroup_threadgroup_rwsem. The + * process, the caller must be holding threadgroup_lock of @leader. The * caller is also responsible for invoking cgroup_migrate_add_src() and * cgroup_migrate_prepare_dst() on the targets before invoking this * function and following up with cgroup_migrate_finish(). @@ -2368,7 +2406,7 @@ out_release_tset: * @leader: the task or the leader of the threadgroup to be attached * @threadgroup: attach the whole threadgroup? * - * Call holding cgroup_mutex and cgroup_threadgroup_rwsem. + * Call holding cgroup_mutex and threadgroup_lock of @leader. */ static int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, bool threadgroup) @@ -2460,13 +2498,14 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, if (!cgrp) return -ENODEV; - percpu_down_write(&cgroup_threadgroup_rwsem); +retry_find_task: rcu_read_lock(); if (pid) { tsk = find_task_by_vpid(pid); if (!tsk) { + rcu_read_unlock(); ret = -ESRCH; - goto out_unlock_rcu; + goto out_unlock_cgroup; } } else { tsk = current; @@ -2482,23 +2521,37 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, */ if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) { ret = -EINVAL; - goto out_unlock_rcu; + rcu_read_unlock(); + goto out_unlock_cgroup; } get_task_struct(tsk); rcu_read_unlock(); + threadgroup_lock(tsk); + if (threadgroup) { + if (!thread_group_leader(tsk)) { + /* + * a race with de_thread from another thread's exec() + * may strip us of our leadership, if this happens, + * there is no choice but to throw this task away and + * try again; this is + * "double-double-toil-and-trouble-check locking". + */ + threadgroup_unlock(tsk); + put_task_struct(tsk); + goto retry_find_task; + } + } + ret = cgroup_procs_write_permission(tsk, cgrp, of); if (!ret) ret = cgroup_attach_task(cgrp, tsk, threadgroup); - put_task_struct(tsk); - goto out_unlock_threadgroup; + threadgroup_unlock(tsk); -out_unlock_rcu: - rcu_read_unlock(); -out_unlock_threadgroup: - percpu_up_write(&cgroup_threadgroup_rwsem); + put_task_struct(tsk); +out_unlock_cgroup: cgroup_kn_unlock(of->kn); return ret ?: nbytes; } @@ -2643,8 +2696,6 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) lockdep_assert_held(&cgroup_mutex); - percpu_down_write(&cgroup_threadgroup_rwsem); - /* look up all csses currently attached to @cgrp's subtree */ down_read(&css_set_rwsem); css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) { @@ -2700,8 +2751,17 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) goto out_finish; last_task = task; + threadgroup_lock(task); + /* raced against de_thread() from another thread? 
*/ + if (!thread_group_leader(task)) { + threadgroup_unlock(task); + put_task_struct(task); + continue; + } + ret = cgroup_migrate(src_cset->dfl_cgrp, task, true); + threadgroup_unlock(task); put_task_struct(task); if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret)) @@ -2711,7 +2771,6 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) out_finish: cgroup_migrate_finish(&preloaded_csets); - percpu_up_write(&cgroup_threadgroup_rwsem); return ret; } @@ -5024,7 +5083,6 @@ int __init cgroup_init(void) unsigned long key; int ssid, err; - BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c index 9656a3c36503..009cc9a17d95 100644 --- a/kernel/cpu_pm.c +++ b/kernel/cpu_pm.c @@ -180,7 +180,7 @@ EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter); * low power state that may have caused some blocks in the same power domain * to reset. * - * Must be called after cpu_pm_exit has been called on all cpus in the power + * Must be called after cpu_cluster_pm_enter has been called for the power * domain, and before cpu_pm_exit has been called on any cpu in the power * domain. Notified drivers can include VFP co-processor, interrupt controller * and its PM extensions, local CPU timers context save/restore which diff --git a/kernel/fork.c b/kernel/fork.c index 7d5f0f118a63..2845623fb582 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1149,6 +1149,10 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) tty_audit_fork(sig); sched_autogroup_fork(sig); +#ifdef CONFIG_CGROUPS + init_rwsem(&sig->group_rwsem); +#endif + sig->oom_score_adj = current->signal->oom_score_adj; sig->oom_score_adj_min = current->signal->oom_score_adj_min; diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 6e40a9539763..e28169dd1c36 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -83,7 +83,7 @@ int irq_set_handler_data(unsigned int irq, void *data) if (!desc) return -EINVAL; - desc->irq_data.handler_data = data; + desc->irq_common_data.handler_data = data; irq_put_desc_unlock(desc, flags); return 0; } @@ -105,7 +105,7 @@ int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset, if (!desc) return -EINVAL; - desc->irq_data.msi_desc = entry; + desc->irq_common_data.msi_desc = entry; if (entry && !irq_offset) entry->irq = irq_base; irq_put_desc_unlock(desc, flags); @@ -372,7 +372,6 @@ static bool irq_may_run(struct irq_desc *desc) /** * handle_simple_irq - Simple and software-decoded IRQs. - * @irq: the interrupt number * @desc: the interrupt description structure for this irq * * Simple interrupts are either sent from a demultiplexing interrupt @@ -382,8 +381,7 @@ static bool irq_may_run(struct irq_desc *desc) * Note: The caller is expected to handle the ack, clear, mask and * unmask issues if necessary. 
*/ -void -handle_simple_irq(unsigned int irq, struct irq_desc *desc) +void handle_simple_irq(struct irq_desc *desc) { raw_spin_lock(&desc->lock); @@ -425,7 +423,6 @@ static void cond_unmask_irq(struct irq_desc *desc) /** * handle_level_irq - Level type irq handler - * @irq: the interrupt number * @desc: the interrupt description structure for this irq * * Level type interrupts are active as long as the hardware line has @@ -433,8 +430,7 @@ static void cond_unmask_irq(struct irq_desc *desc) * it after the associated handler has acknowledged the device, so the * interrupt line is back to inactive. */ -void -handle_level_irq(unsigned int irq, struct irq_desc *desc) +void handle_level_irq(struct irq_desc *desc) { raw_spin_lock(&desc->lock); mask_ack_irq(desc); @@ -496,7 +492,6 @@ static void cond_unmask_eoi_irq(struct irq_desc *desc, struct irq_chip *chip) /** * handle_fasteoi_irq - irq handler for transparent controllers - * @irq: the interrupt number * @desc: the interrupt description structure for this irq * * Only a single callback will be issued to the chip: an ->eoi() @@ -504,8 +499,7 @@ static void cond_unmask_eoi_irq(struct irq_desc *desc, struct irq_chip *chip) * for modern forms of interrupt handlers, which handle the flow * details in hardware, transparently. */ -void -handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) +void handle_fasteoi_irq(struct irq_desc *desc) { struct irq_chip *chip = desc->irq_data.chip; @@ -546,7 +540,6 @@ EXPORT_SYMBOL_GPL(handle_fasteoi_irq); /** * handle_edge_irq - edge type IRQ handler - * @irq: the interrupt number * @desc: the interrupt description structure for this irq * * Interrupt occures on the falling and/or rising edge of a hardware @@ -560,8 +553,7 @@ EXPORT_SYMBOL_GPL(handle_fasteoi_irq); * the handler was running. If all pending interrupts are handled, the * loop is left. */ -void -handle_edge_irq(unsigned int irq, struct irq_desc *desc) +void handle_edge_irq(struct irq_desc *desc) { raw_spin_lock(&desc->lock); @@ -618,13 +610,12 @@ EXPORT_SYMBOL(handle_edge_irq); #ifdef CONFIG_IRQ_EDGE_EOI_HANDLER /** * handle_edge_eoi_irq - edge eoi type IRQ handler - * @irq: the interrupt number * @desc: the interrupt description structure for this irq * * Similar as the above handle_edge_irq, but using eoi and w/o the * mask/unmask logic. */ -void handle_edge_eoi_irq(unsigned int irq, struct irq_desc *desc) +void handle_edge_eoi_irq(struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); @@ -665,13 +656,11 @@ out_eoi: /** * handle_percpu_irq - Per CPU local irq handler - * @irq: the interrupt number * @desc: the interrupt description structure for this irq * * Per CPU interrupts on SMP machines without locking requirements */ -void -handle_percpu_irq(unsigned int irq, struct irq_desc *desc) +void handle_percpu_irq(struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); @@ -688,7 +677,6 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc) /** * handle_percpu_devid_irq - Per CPU local irq handler with per cpu dev ids - * @irq: the interrupt number * @desc: the interrupt description structure for this irq * * Per CPU interrupts on SMP machines without locking requirements. 
Same as @@ -698,11 +686,12 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc) * contain the real device id for the cpu on which this handler is * called */ -void handle_percpu_devid_irq(unsigned int irq, struct irq_desc *desc) +void handle_percpu_devid_irq(struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); struct irqaction *action = desc->action; void *dev_id = raw_cpu_ptr(action->percpu_dev_id); + unsigned int irq = irq_desc_get_irq(desc); irqreturn_t res; kstat_incr_irqs_this_cpu(desc); @@ -796,7 +785,7 @@ irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle, return; __irq_do_set_handler(desc, handle, 1, NULL); - desc->irq_data.handler_data = data; + desc->irq_common_data.handler_data = data; irq_put_desc_busunlock(desc, flags); } diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index b6eeea8a80c5..de41a68fc038 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -27,8 +27,10 @@ * * Handles spurious and unhandled IRQ's. It also prints a debugmessage. */ -void handle_bad_irq(unsigned int irq, struct irq_desc *desc) +void handle_bad_irq(struct irq_desc *desc) { + unsigned int irq = irq_desc_get_irq(desc); + print_irq_desc(irq, desc); kstat_incr_irqs_this_cpu(desc); ack_bad_irq(irq); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index eee4b385cffb..5ef0c2dbe930 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -194,7 +194,7 @@ static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc) static inline int irq_desc_get_node(struct irq_desc *desc) { - return irq_data_get_node(&desc->irq_data); + return irq_common_data_get_node(&desc->irq_common_data); } #ifdef CONFIG_PM_SLEEP diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 0a2a4b697bcb..239e2ae2c947 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -38,12 +38,13 @@ static void __init init_irq_default_affinity(void) #ifdef CONFIG_SMP static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { - if (!zalloc_cpumask_var_node(&desc->irq_data.affinity, gfp, node)) + if (!zalloc_cpumask_var_node(&desc->irq_common_data.affinity, + gfp, node)) return -ENOMEM; #ifdef CONFIG_GENERIC_PENDING_IRQ if (!zalloc_cpumask_var_node(&desc->pending_mask, gfp, node)) { - free_cpumask_var(desc->irq_data.affinity); + free_cpumask_var(desc->irq_common_data.affinity); return -ENOMEM; } #endif @@ -52,11 +53,13 @@ static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) static void desc_smp_init(struct irq_desc *desc, int node) { - desc->irq_data.node = node; - cpumask_copy(desc->irq_data.affinity, irq_default_affinity); + cpumask_copy(desc->irq_common_data.affinity, irq_default_affinity); #ifdef CONFIG_GENERIC_PENDING_IRQ cpumask_clear(desc->pending_mask); #endif +#ifdef CONFIG_NUMA + desc->irq_common_data.node = node; +#endif } #else @@ -70,12 +73,13 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node, { int cpu; + desc->irq_common_data.handler_data = NULL; + desc->irq_common_data.msi_desc = NULL; + desc->irq_data.common = &desc->irq_common_data; desc->irq_data.irq = irq; desc->irq_data.chip = &no_irq_chip; desc->irq_data.chip_data = NULL; - desc->irq_data.handler_data = NULL; - desc->irq_data.msi_desc = NULL; irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS); irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); desc->handle_irq = handle_bad_irq; @@ -121,7 +125,7 @@ static void free_masks(struct irq_desc *desc) #ifdef CONFIG_GENERIC_PENDING_IRQ 
free_cpumask_var(desc->pending_mask); #endif - free_cpumask_var(desc->irq_data.affinity); + free_cpumask_var(desc->irq_common_data.affinity); } #else static inline void free_masks(struct irq_desc *desc) { } @@ -343,7 +347,7 @@ int generic_handle_irq(unsigned int irq) if (!desc) return -EINVAL; - generic_handle_irq_desc(irq, desc); + generic_handle_irq_desc(desc); return 0; } EXPORT_SYMBOL_GPL(generic_handle_irq); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 79baaf8a7813..dc9d27c0c158 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -844,7 +844,6 @@ static struct irq_data *irq_domain_insert_irq_data(struct irq_domain *domain, child->parent_data = irq_data; irq_data->irq = child->irq; irq_data->common = child->common; - irq_data->node = child->node; irq_data->domain = domain; } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index ad1b064f94fe..f9a59f6cabd2 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -192,7 +192,7 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, switch (ret) { case IRQ_SET_MASK_OK: case IRQ_SET_MASK_OK_DONE: - cpumask_copy(data->affinity, mask); + cpumask_copy(desc->irq_common_data.affinity, mask); case IRQ_SET_MASK_OK_NOCOPY: irq_set_thread_affinity(desc); ret = 0; @@ -304,7 +304,7 @@ static void irq_affinity_notify(struct work_struct *work) if (irq_move_pending(&desc->irq_data)) irq_get_pending(cpumask, desc); else - cpumask_copy(cpumask, desc->irq_data.affinity); + cpumask_copy(cpumask, desc->irq_common_data.affinity); raw_spin_unlock_irqrestore(&desc->lock, flags); notify->notify(notify, cpumask); @@ -375,9 +375,9 @@ static int setup_affinity(struct irq_desc *desc, struct cpumask *mask) * one of the targets is online. */ if (irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) { - if (cpumask_intersects(desc->irq_data.affinity, + if (cpumask_intersects(desc->irq_common_data.affinity, cpu_online_mask)) - set = desc->irq_data.affinity; + set = desc->irq_common_data.affinity; else irqd_clear(&desc->irq_data, IRQD_AFFINITY_SET); } @@ -829,8 +829,8 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) * This code is triggered unconditionally. Check the affinity * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out. 
*/ - if (desc->irq_data.affinity) - cpumask_copy(mask, desc->irq_data.affinity); + if (desc->irq_common_data.affinity) + cpumask_copy(mask, desc->irq_common_data.affinity); else valid = false; raw_spin_unlock_irq(&desc->lock); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 0e97c142ce40..e3a8c9577ba6 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -39,7 +39,7 @@ static struct proc_dir_entry *root_irq_dir; static int show_irq_affinity(int type, struct seq_file *m, void *v) { struct irq_desc *desc = irq_to_desc((long)m->private); - const struct cpumask *mask = desc->irq_data.affinity; + const struct cpumask *mask = desc->irq_common_data.affinity; #ifdef CONFIG_GENERIC_PENDING_IRQ if (irqd_is_setaffinity_pending(&desc->irq_data)) diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index dd95f44f99b2..b86886beee4f 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -38,7 +38,7 @@ static void resend_irqs(unsigned long arg) clear_bit(irq, irqs_resend); desc = irq_to_desc(irq); local_irq_disable(); - desc->handle_irq(irq, desc); + desc->handle_irq(desc); local_irq_enable(); } } diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 337c8818541d..87e9ce6a63c5 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -289,7 +289,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) if (pv_enabled()) goto queue; - if (virt_queued_spin_lock(lock)) + if (virt_spin_lock(lock)) return; /* diff --git a/kernel/membarrier.c b/kernel/membarrier.c new file mode 100644 index 000000000000..536c727a56e9 --- /dev/null +++ b/kernel/membarrier.c @@ -0,0 +1,66 @@ +/* + * Copyright (C) 2010, 2015 Mathieu Desnoyers <mathieu.desnoyers@efficios.com> + * + * membarrier system call + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/syscalls.h> +#include <linux/membarrier.h> + +/* + * Bitmask made from a "or" of all commands within enum membarrier_cmd, + * except MEMBARRIER_CMD_QUERY. + */ +#define MEMBARRIER_CMD_BITMASK (MEMBARRIER_CMD_SHARED) + +/** + * sys_membarrier - issue memory barriers on a set of threads + * @cmd: Takes command values defined in enum membarrier_cmd. + * @flags: Currently needs to be 0. For future extensions. + * + * If this system call is not implemented, -ENOSYS is returned. If the + * command specified does not exist, or if the command argument is invalid, + * this system call returns -EINVAL. For a given command, with flags argument + * set to 0, this system call is guaranteed to always return the same value + * until reboot. + * + * All memory accesses performed in program order from each targeted thread + * is guaranteed to be ordered with respect to sys_membarrier(). 
If we use + * the semantic "barrier()" to represent a compiler barrier forcing memory + * accesses to be performed in program order across the barrier, and + * smp_mb() to represent explicit memory barriers forcing full memory + * ordering across the barrier, we have the following ordering table for + * each pair of barrier(), sys_membarrier() and smp_mb(): + * + * The pair ordering is detailed as (O: ordered, X: not ordered): + * + * barrier() smp_mb() sys_membarrier() + * barrier() X X O + * smp_mb() X O O + * sys_membarrier() O O O + */ +SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) +{ + if (unlikely(flags)) + return -EINVAL; + switch (cmd) { + case MEMBARRIER_CMD_QUERY: + return MEMBARRIER_CMD_BITMASK; + case MEMBARRIER_CMD_SHARED: + if (num_online_cpus() > 1) + synchronize_sched(); + return 0; + default: + return -EINVAL; + } +} diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3595403921bd..2f9c92884817 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -621,18 +621,21 @@ int get_nohz_timer_target(void) int i, cpu = smp_processor_id(); struct sched_domain *sd; - if (!idle_cpu(cpu)) + if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu)) return cpu; rcu_read_lock(); for_each_domain(cpu, sd) { for_each_cpu(i, sched_domain_span(sd)) { - if (!idle_cpu(i)) { + if (!idle_cpu(i) && is_housekeeping_cpu(cpu)) { cpu = i; goto unlock; } } } + + if (!is_housekeeping_cpu(cpu)) + cpu = housekeeping_any_cpu(); unlock: rcu_read_unlock(); return cpu; @@ -2666,13 +2669,20 @@ unsigned long nr_running(void) /* * Check if only the current task is running on the cpu. + * + * Caution: this function does not check that the caller has disabled + * preemption, thus the result might have a time-of-check-to-time-of-use + * race. The caller is responsible to use it correctly, for example: + * + * - from a non-preemptable section (of course) + * + * - from a thread that is bound to a single CPU + * + * - in a loop with very short iterations (e.g. a polling loop) */ bool single_task_running(void) { - if (cpu_rq(smp_processor_id())->nr_running == 1) - return true; - else - return false; + return raw_rq()->nr_running == 1; } EXPORT_SYMBOL(single_task_running); @@ -5178,24 +5188,47 @@ static void migrate_tasks(struct rq *dead_rq) break; /* - * Ensure rq->lock covers the entire task selection - * until the migration. + * pick_next_task assumes pinned rq->lock. */ lockdep_pin_lock(&rq->lock); next = pick_next_task(rq, &fake_task); BUG_ON(!next); next->sched_class->put_prev_task(rq, next); + /* + * Rules for changing task_struct::cpus_allowed are holding + * both pi_lock and rq->lock, such that holding either + * stabilizes the mask. + * + * Drop rq->lock is not quite as disastrous as it usually is + * because !cpu_active at this point, which means load-balance + * will not interfere. Also, stop-machine. + */ + lockdep_unpin_lock(&rq->lock); + raw_spin_unlock(&rq->lock); + raw_spin_lock(&next->pi_lock); + raw_spin_lock(&rq->lock); + + /* + * Since we're inside stop-machine, _nothing_ should have + * changed the task, WARN if weird stuff happened, because in + * that case the above rq->lock drop is a fail too. + */ + if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) { + raw_spin_unlock(&next->pi_lock); + continue; + } + /* Find suitable destination for @next, with force if needed. 
*/ dest_cpu = select_fallback_rq(dead_rq->cpu, next); - lockdep_unpin_lock(&rq->lock); rq = __migrate_task(rq, next, dest_cpu); if (rq != dead_rq) { raw_spin_unlock(&rq->lock); rq = dead_rq; raw_spin_lock(&rq->lock); } + raw_spin_unlock(&next->pi_lock); } rq->stop = stop; diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 272d9322bc5d..052e02672d12 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -106,10 +106,9 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr) } EXPORT_SYMBOL_GPL(__wake_up_locked); -void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, int nr, - void *key) +void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) { - __wake_up_common(q, mode, nr, 0, key); + __wake_up_common(q, mode, 1, 0, key); } EXPORT_SYMBOL_GPL(__wake_up_locked_key); @@ -284,7 +283,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, if (!list_empty(&wait->task_list)) list_del_init(&wait->task_list); else if (waitqueue_active(q)) - __wake_up_locked_key(q, mode, 1, key); + __wake_up_locked_key(q, mode, key); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(abort_exclusive_wait); diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 03c3875d9958..a02decf15583 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -245,3 +245,6 @@ cond_syscall(sys_bpf); /* execveat */ cond_syscall(sys_execveat); + +/* membarrier */ +cond_syscall(sys_membarrier); diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 50eb107f1198..a9b76a40319e 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -97,20 +97,6 @@ EXPORT_SYMBOL_GPL(clockevent_delta2ns); static int __clockevents_switch_state(struct clock_event_device *dev, enum clock_event_state state) { - /* Transition with legacy set_mode() callback */ - if (dev->set_mode) { - /* Legacy callback doesn't support new modes */ - if (state > CLOCK_EVT_STATE_ONESHOT) - return -ENOSYS; - /* - * 'clock_event_state' and 'clock_event_mode' have 1-to-1 - * mapping until *_ONESHOT, and so a simple cast will work. 
- */ - dev->set_mode((enum clock_event_mode)state, dev); - dev->mode = (enum clock_event_mode)state; - return 0; - } - if (dev->features & CLOCK_EVT_FEAT_DUMMY) return 0; @@ -204,12 +190,8 @@ int clockevents_tick_resume(struct clock_event_device *dev) { int ret = 0; - if (dev->set_mode) { - dev->set_mode(CLOCK_EVT_MODE_RESUME, dev); - dev->mode = CLOCK_EVT_MODE_RESUME; - } else if (dev->tick_resume) { + if (dev->tick_resume) ret = dev->tick_resume(dev); - } return ret; } @@ -460,26 +442,6 @@ int clockevents_unbind_device(struct clock_event_device *ced, int cpu) } EXPORT_SYMBOL_GPL(clockevents_unbind_device); -/* Sanity check of state transition callbacks */ -static int clockevents_sanity_check(struct clock_event_device *dev) -{ - /* Legacy set_mode() callback */ - if (dev->set_mode) { - /* We shouldn't be supporting new modes now */ - WARN_ON(dev->set_state_periodic || dev->set_state_oneshot || - dev->set_state_shutdown || dev->tick_resume || - dev->set_state_oneshot_stopped); - - BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); - return 0; - } - - if (dev->features & CLOCK_EVT_FEAT_DUMMY) - return 0; - - return 0; -} - /** * clockevents_register_device - register a clock event device * @dev: device to register @@ -488,8 +450,6 @@ void clockevents_register_device(struct clock_event_device *dev) { unsigned long flags; - BUG_ON(clockevents_sanity_check(dev)); - /* Initialize state to DETACHED */ clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED); diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index d11c55b6ab7d..4fcd99e12aa0 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -398,7 +398,6 @@ void tick_shutdown(unsigned int cpu) * the set mode function! */ clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED); - dev->mode = CLOCK_EVT_MODE_UNUSED; clockevents_exchange_device(dev, NULL); dev->event_handler = clockevents_handle_noop; td->evtdev = NULL; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 3319e16f31e5..7c7ec4515983 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -290,16 +290,17 @@ static int __init tick_nohz_full_setup(char *str) __setup("nohz_full=", tick_nohz_full_setup); static int tick_nohz_cpu_down_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) + unsigned long action, + void *hcpu) { unsigned int cpu = (unsigned long)hcpu; switch (action & ~CPU_TASKS_FROZEN) { case CPU_DOWN_PREPARE: /* - * If we handle the timekeeping duty for full dynticks CPUs, - * we can't safely shutdown that CPU. + * The boot CPU handles housekeeping duty (unbound timers, + * workqueues, timekeeping, ...) on behalf of full dynticks + * CPUs. It must remain online when nohz full is enabled. */ if (tick_nohz_full_running && tick_do_timer_cpu == cpu) return NOTIFY_BAD; @@ -370,6 +371,12 @@ void __init tick_nohz_init(void) cpu_notifier(tick_nohz_cpu_down_callback, 0); pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n", cpumask_pr_args(tick_nohz_full_mask)); + + /* + * We need at least one CPU to handle housekeeping work such + * as timekeeping, unbound timers, workqueues, ... 
+ */ + WARN_ON_ONCE(cpumask_empty(housekeeping_mask)); } #endif diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index f6ee2e6b6f5d..3739ac6aa473 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1614,7 +1614,7 @@ static __always_inline void timekeeping_freqadjust(struct timekeeper *tk, negative = (tick_error < 0); /* Sort out the magnitude of the correction */ - tick_error = abs(tick_error); + tick_error = abs64(tick_error); for (adj = 0; tick_error > interval; adj++) tick_error >>= 1; diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 129c96033e46..f75e35b60149 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -225,7 +225,7 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) (unsigned long long) dev->min_delta_ns); SEQ_printf(m, " mult: %u\n", dev->mult); SEQ_printf(m, " shift: %u\n", dev->shift); - SEQ_printf(m, " mode: %d\n", dev->mode); + SEQ_printf(m, " mode: %d\n", clockevent_get_state(dev)); SEQ_printf(m, " next_event: %Ld nsecs\n", (unsigned long long) ktime_to_ns(dev->next_event)); @@ -233,40 +233,34 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) print_name_offset(m, dev->set_next_event); SEQ_printf(m, "\n"); - if (dev->set_mode) { - SEQ_printf(m, " set_mode: "); - print_name_offset(m, dev->set_mode); + if (dev->set_state_shutdown) { + SEQ_printf(m, " shutdown: "); + print_name_offset(m, dev->set_state_shutdown); SEQ_printf(m, "\n"); - } else { - if (dev->set_state_shutdown) { - SEQ_printf(m, " shutdown: "); - print_name_offset(m, dev->set_state_shutdown); - SEQ_printf(m, "\n"); - } + } - if (dev->set_state_periodic) { - SEQ_printf(m, " periodic: "); - print_name_offset(m, dev->set_state_periodic); - SEQ_printf(m, "\n"); - } + if (dev->set_state_periodic) { + SEQ_printf(m, " periodic: "); + print_name_offset(m, dev->set_state_periodic); + SEQ_printf(m, "\n"); + } - if (dev->set_state_oneshot) { - SEQ_printf(m, " oneshot: "); - print_name_offset(m, dev->set_state_oneshot); - SEQ_printf(m, "\n"); - } + if (dev->set_state_oneshot) { + SEQ_printf(m, " oneshot: "); + print_name_offset(m, dev->set_state_oneshot); + SEQ_printf(m, "\n"); + } - if (dev->set_state_oneshot_stopped) { - SEQ_printf(m, " oneshot stopped: "); - print_name_offset(m, dev->set_state_oneshot_stopped); - SEQ_printf(m, "\n"); - } + if (dev->set_state_oneshot_stopped) { + SEQ_printf(m, " oneshot stopped: "); + print_name_offset(m, dev->set_state_oneshot_stopped); + SEQ_printf(m, "\n"); + } - if (dev->tick_resume) { - SEQ_printf(m, " resume: "); - print_name_offset(m, dev->tick_resume); - SEQ_printf(m, "\n"); - } + if (dev->tick_resume) { + SEQ_printf(m, " resume: "); + print_name_offset(m, dev->tick_resume); + SEQ_printf(m, "\n"); } SEQ_printf(m, " event_handler: "); |
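
Usage note: kernel/membarrier.c above wires up the new membarrier(2) system call and kernel/sys_ni.c adds the matching cond_syscall. Below is a minimal userspace sketch of how it might be exercised; it assumes kernel headers that export __NR_membarrier and <linux/membarrier.h>, and wraps syscall(2) directly since no libc wrapper is assumed to exist.

/* Minimal sketch: query the supported commands, then issue a system-wide
 * memory barrier with MEMBARRIER_CMD_SHARED (implemented with
 * synchronize_sched() in the new kernel/membarrier.c). __NR_membarrier and
 * <linux/membarrier.h> are assumed to come from matching kernel headers. */
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/membarrier.h>

static int membarrier(int cmd, int flags)
{
	return syscall(__NR_membarrier, cmd, flags);
}

int main(void)
{
	int mask = membarrier(MEMBARRIER_CMD_QUERY, 0);

	if (mask < 0) {
		perror("membarrier");	/* e.g. ENOSYS if CONFIG_MEMBARRIER is off */
		return 1;
	}

	if (mask & MEMBARRIER_CMD_SHARED)
		membarrier(MEMBARRIER_CMD_SHARED, 0);	/* orders against all running threads */

	return 0;
}

The QUERY return value is the bitmask built from MEMBARRIER_CMD_BITMASK in the diff, so callers can probe for support before relying on a given command.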
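
The kernel/irq changes above drop the "unsigned int irq" argument from the flow handlers (handle_simple_irq(), handle_level_irq(), ...) and make generic_handle_irq_desc() take only the descriptor. A hedged sketch of a chained demultiplexing handler under the new prototype follows; foo_demux_handler(), foo_read_pending() and foo_irq_base are hypothetical, while irq_desc_get_irq(), irq_desc_get_chip(), chained_irq_enter()/chained_irq_exit() and generic_handle_irq() are existing genirq helpers.

#include <linux/irq.h>
#include <linux/irqdesc.h>
#include <linux/bitops.h>
#include <linux/irqchip/chained_irq.h>

static unsigned int foo_irq_base;				/* hypothetical: first mapped child irq */
static unsigned long foo_read_pending(unsigned int parent_irq);	/* hypothetical status-register read */

/* Sketch of a chained demux handler using the new single-argument prototype. */
static void foo_demux_handler(struct irq_desc *desc)
{
	struct irq_chip *chip = irq_desc_get_chip(desc);
	unsigned int parent_irq = irq_desc_get_irq(desc);	/* irq number now comes from the desc */
	unsigned long pending;

	chained_irq_enter(chip, desc);

	pending = foo_read_pending(parent_irq);
	while (pending) {
		unsigned int bit = __ffs(pending);

		pending &= ~BIT(bit);
		generic_handle_irq(foo_irq_base + bit);
	}

	chained_irq_exit(chip, desc);
}

This mirrors the tree-wide conversion in the diff: callers such as resend_irqs() now invoke desc->handle_irq(desc), and anything that still needs the Linux irq number recovers it with irq_desc_get_irq().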