From d705ae6b133f9f6a8beee617b1224b6a5c99c5da Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Wed, 15 Feb 2012 09:45:49 +0100 Subject: block: replace icq->changed with icq->flags icq->changed was used for ICQ_*_CHANGED bits. Rename it to flags and access it under ioc->lock instead of using atomic bitops. icq_get_changed() is added so that the changed part can be fetched and cleared as before. icq->flags will be used to carry other flags. Signed-off-by: Tejun Heo <tj@kernel.org> Tested-by: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Jens Axboe <axboe@kernel.dk> --- include/linux/iocontext.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 119773eebe31..17839c7b9614 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -6,8 +6,10 @@ #include <linux/workqueue.h> enum { - ICQ_IOPRIO_CHANGED, - ICQ_CGROUP_CHANGED, + ICQ_IOPRIO_CHANGED = 1 << 0, + ICQ_CGROUP_CHANGED = 1 << 1, + + ICQ_CHANGED_MASK = ICQ_IOPRIO_CHANGED | ICQ_CGROUP_CHANGED, }; /* @@ -88,7 +90,7 @@ struct io_cq { struct rcu_head __rcu_head; }; - unsigned long changed; + unsigned int flags; }; /* @@ -139,6 +141,7 @@ struct io_context *get_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node); void ioc_ioprio_changed(struct io_context *ioc, int ioprio); void ioc_cgroup_changed(struct io_context *ioc); +unsigned int icq_get_changed(struct io_cq *icq); #else struct io_context; static inline void put_io_context(struct io_context *ioc) { } -- cgit v1.2.3 From 621032ad6eaabf2fe771c4fa0d8f58e1fcfcdba6 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Wed, 15 Feb 2012 09:45:53 +0100 Subject: block: exit_io_context() should call elevator_exit_icq_fn() While updating locking, b2efa05265 "block, cfq: unlink cfq_io_context's immediately" moved the elevator_exit_icq_fn() invocation from exit_io_context() to the final ioc put. While this doesn't cause catastrophic failure, it effectively removes task exit notification to the elevator and causes noticeable IO performance degradation with CFQ. On task exit, CFQ used to immediately expire the slice if it was being used by the exiting task, as no more IO would be issued by the task; however, after b2efa05265, the notification is lost and the disk can sit idle needlessly, leading to noticeable degradation for certain workloads. This patch renames ioc_exit_icq() to ioc_destroy_icq(), separates the elevator_exit_icq_fn() invocation out into ioc_exit_icq() and invokes it from exit_io_context(). An ICQ_EXITED flag is added to avoid invoking the callback more than once for the same icq. Walking icq_list from the ioc side and invoking the elevator callback requires reverse double locking. This might be better implemented using RCU; unfortunately, that isn't trivial: RCU protection would need to cover the request_queue, and the queue_lock switch on cleanup makes grabbing queue_lock from inside an RCU read section unsafe. Reverse double locking should do, at least for now.
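For readers unfamiliar with the idiom, the reverse double locking referred to above reduces to the following shape (a simplified sketch; "outer", "inner" and do_work() are illustrative names, not kernel API):

retry:
	spin_lock_irqsave(&outer->lock, flags);
	list_for_each_entry(pos, &outer->list, node) {
		if (spin_trylock(pos->inner_lock)) {
			/* both locks held, although acquired in reverse order */
			do_work(pos);
			spin_unlock(pos->inner_lock);
		} else {
			/*
			 * The inner lock is contended. Blocking on it while
			 * holding the outer lock could deadlock against the
			 * normal inner-then-outer order, so drop everything
			 * and start over.
			 */
			spin_unlock_irqrestore(&outer->lock, flags);
			cpu_relax();
			goto retry;
		}
	}
	spin_unlock_irqrestore(&outer->lock, flags);

This is the loop that exit_io_context() grows in the patch below, with ioc->lock as the outer lock and each icq->q->queue_lock as the inner one.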
Signed-off-by: Tejun Heo <tj@kernel.org> Reported-and-bisected-by: Shaohua Li <shli@kernel.org> LKML-Reference: <CANejiEVzs=pUhQSTvUppkDcc2TNZyfohBRLygW5zFmXyk5A-xQ@mail.gmail.com> Tested-by: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-ioc.c | 55 ++++++++++++++++++++++++++++++++++++++++------- include/linux/iocontext.h | 1 + 2 files changed, 48 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/block/blk-ioc.c b/block/blk-ioc.c index f53c80ecaf07..92bf55540d87 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -36,10 +36,22 @@ static void icq_free_icq_rcu(struct rcu_head *head) kmem_cache_free(icq->__rcu_icq_cache, icq); } -/* - * Exit and free an icq. Called with both ioc and q locked. - */ +/* Exit an icq. Called with both ioc and q locked. */ static void ioc_exit_icq(struct io_cq *icq) +{ + struct elevator_type *et = icq->q->elevator->type; + + if (icq->flags & ICQ_EXITED) + return; + + if (et->ops.elevator_exit_icq_fn) + et->ops.elevator_exit_icq_fn(icq); + + icq->flags |= ICQ_EXITED; +} + +/* Release an icq. Called with both ioc and q locked. */ +static void ioc_destroy_icq(struct io_cq *icq) { struct io_context *ioc = icq->ioc; struct request_queue *q = icq->q; @@ -60,8 +72,7 @@ static void ioc_exit_icq(struct io_cq *icq) if (rcu_dereference_raw(ioc->icq_hint) == icq) rcu_assign_pointer(ioc->icq_hint, NULL); - if (et->ops.elevator_exit_icq_fn) - et->ops.elevator_exit_icq_fn(icq); + ioc_exit_icq(icq); /* * @icq->q might have gone away by the time RCU callback runs @@ -95,7 +106,7 @@ static void ioc_release_fn(struct work_struct *work) struct request_queue *q = icq->q; if (spin_trylock(q->queue_lock)) { - ioc_exit_icq(icq); + ioc_destroy_icq(icq); spin_unlock(q->queue_lock); } else { spin_unlock_irqrestore(&ioc->lock, flags); @@ -142,13 +153,41 @@ EXPORT_SYMBOL(put_io_context); void exit_io_context(struct task_struct *task) { struct io_context *ioc; + struct io_cq *icq; + struct hlist_node *n; + unsigned long flags; task_lock(task); ioc = task->io_context; task->io_context = NULL; task_unlock(task); - atomic_dec(&ioc->nr_tasks); + if (!atomic_dec_and_test(&ioc->nr_tasks)) { + put_io_context(ioc); + return; + } + + /* + * Need ioc lock to walk icq_list and q lock to exit icq. Perform + * reverse double locking. Read comment in ioc_release_fn() for + * explanation on the nested locking annotation. 
+ */ retry: spin_lock_irqsave_nested(&ioc->lock, flags, 1); hlist_for_each_entry(icq, n, &ioc->icq_list, ioc_node) { if (icq->flags & ICQ_EXITED) continue; if (spin_trylock(icq->q->queue_lock)) { ioc_exit_icq(icq); spin_unlock(icq->q->queue_lock); } else { spin_unlock_irqrestore(&ioc->lock, flags); cpu_relax(); goto retry; } } spin_unlock_irqrestore(&ioc->lock, flags); put_io_context(ioc); } @@ -168,7 +207,7 @@ void ioc_clear_queue(struct request_queue *q) struct io_context *ioc = icq->ioc; spin_lock(&ioc->lock); - ioc_exit_icq(icq); + ioc_destroy_icq(icq); spin_unlock(&ioc->lock); } } diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 17839c7b9614..1a3018063034 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -8,6 +8,7 @@ enum { ICQ_IOPRIO_CHANGED = 1 << 0, ICQ_CGROUP_CHANGED = 1 << 1, + ICQ_EXITED = 1 << 2, ICQ_CHANGED_MASK = ICQ_IOPRIO_CHANGED | ICQ_CGROUP_CHANGED, }; -- cgit v1.2.3 From 2261cc627f5453004042b4f694612edae27e492e Mon Sep 17 00:00:00 2001 From: Shawn Guo <shawn.guo@linaro.org> Date: Wed, 15 Feb 2012 10:47:42 -0800 Subject: dt: add empty of_find_compatible_node function Add an empty of_find_compatible_node() function for !CONFIG_OF builds. Signed-off-by: Shawn Guo <shawn.guo@linaro.org> Signed-off-by: Grant Likely <grant.likely@secretlab.ca> --- include/linux/of.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/of.h b/include/linux/of.h index a75a831e2057..92cf6ad35e0e 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -281,6 +281,14 @@ static inline struct property *of_find_property(const struct device_node *np, return NULL; } +static inline struct device_node *of_find_compatible_node( + struct device_node *from, + const char *type, + const char *compat) +{ + return NULL; +} + static inline int of_property_read_u32_array(const struct device_node *np, const char *propname, u32 *out_values, size_t sz) -- cgit v1.2.3 From 7d96b3e55ad45ebe4ff1a1daad27ac1fff8682ec Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov <khlebnikov@openvz.org> Date: Sun, 19 Feb 2012 18:29:11 +0400 Subject: percpu: fix generic definition of __this_cpu_add_return() This patch adds the missing "__" to the function prefix. Otherwise, on all architectures except x86 it expands to the irq/preemption-safe variant _this_cpu_generic_add_return(), which does an extra irq-save/irq-restore. The optimal generic implementation is __this_cpu_generic_add_return().
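For reference, the caller-visible difference between the __ and non-__ flavours (a usage sketch; the per-CPU counter is hypothetical):

DEFINE_PER_CPU(int, hits);

static int bump_hits(void)
{
	int n;

	/* this_cpu_add_return(): usable from any context; the generic
	 * fallback saves and restores irqs around the update */
	n = this_cpu_add_return(hits, 1);

	/* __this_cpu_add_return(): the caller must already exclude
	 * preemption/irqs; before this fix the generic definition
	 * wrongly expanded to the irq-safe variant and paid the
	 * irq-save/irq-restore cost anyway */
	preempt_disable();
	n = __this_cpu_add_return(hits, 1);
	preempt_enable();

	return n;
}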
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org> Acked-by: Christoph Lameter <cl@linux.com> Signed-off-by: Tejun Heo <tj@kernel.org> --- include/linux/percpu.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 32cd1f67462e..3b609eb9cd7d 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -718,7 +718,8 @@ do { \ # ifndef __this_cpu_add_return_8 # define __this_cpu_add_return_8(pcp, val) __this_cpu_generic_add_return(pcp, val) # endif -# define __this_cpu_add_return(pcp, val) __pcpu_size_call_return2(this_cpu_add_return_, pcp, val) +# define __this_cpu_add_return(pcp, val) \ + __pcpu_size_call_return2(__this_cpu_add_return_, pcp, val) #endif #define __this_cpu_sub_return(pcp, val) this_cpu_add_return(pcp, -(val)) -- cgit v1.2.3 From e920d5971d706290c5a6281f719e16c25021f964 Mon Sep 17 00:00:00 2001 From: Ming Lei <tom.leiming@gmail.com> Date: Wed, 15 Feb 2012 16:54:38 +0800 Subject: percpu: use raw_local_irq_* in _this_cpu op It doesn't make sense to do irqs-off tracing or irq-flags lock proving inside 'this_cpu' operations, so replace local_irq_* with raw_local_irq_* in the 'this_cpu' ops. The replacement also fixes one lockdep warning[1], see below: In commit 933393f58fef9963eac61db8093689544e29a600 ("percpu: Remove irqsafe_cpu_xxx variants"), local_irq_save/restore(flags) were added inside the this_cpu_inc operation, so trace_hardirqs_off_caller can be called directly from trace_hardirqs_on_caller (because __debug_atomic_inc is implemented as this_cpu_inc), which may trigger the lockdep warning[1], for example in the ARM scenario below: kernel_thread_helper /*irq disabled*/ ->trace_hardirqs_on_caller /*hardirqs_enabled was set*/ ->trace_hardirqs_off_caller /*hardirqs_enabled cleared*/ __this_cpu_add(redundant_hardirqs_on) ->trace_hardirqs_off_caller /*irq disabled, so call here*/ The 'unannotated irqs-on' warning is then triggered later, because irqs are only actually enabled after the irq trace in kernel_thread_helper. [1], [ 0.162841] ------------[ cut here ]------------ [ 0.167694] WARNING: at kernel/lockdep.c:3493 check_flags+0xc0/0x1d0() [ 0.174468] Modules linked in: [ 0.177703] Backtrace: [ 0.180328] [<c00171f0>] (dump_backtrace+0x0/0x110) from [<c0412320>] (dump_stack+0x18/0x1c) [ 0.189086] r6:c051f778 r5:00000da5 r4:00000000 r3:60000093 [ 0.195007] [<c0412308>] (dump_stack+0x0/0x1c) from [<c00410e8>] (warn_slowpath_common+0x54/0x6c) [ 0.204223] [<c0041094>] (warn_slowpath_common+0x0/0x6c) from [<c0041124>] (warn_slowpath_null+0x24/0x2c) [ 0.214111] r8:00000000 r7:00000000 r6:ee069598 r5:60000013 r4:ee082000 [ 0.220825] r3:00000009 [ 0.223693] [<c0041100>] (warn_slowpath_null+0x0/0x2c) from [<c0088f38>] (check_flags+0xc0/0x1d0) [ 0.232910] [<c0088e78>] (check_flags+0x0/0x1d0) from [<c008d348>] (lock_acquire+0x4c/0x11c) [ 0.241668] [<c008d2fc>] (lock_acquire+0x0/0x11c) from [<c0415aa4>] (_raw_spin_lock+0x3c/0x74) [ 0.250610] [<c0415a68>] (_raw_spin_lock+0x0/0x74) from [<c010a844>] (set_task_comm+0x20/0xc0) [ 0.259521] r6:ee069588 r5:ee0691c0 r4:ee082000 [ 0.264404] [<c010a824>] (set_task_comm+0x0/0xc0) from [<c0060780>] (kthreadd+0x28/0x108) [ 0.272857] r8:00000000 r7:00000013 r6:c0044a08 r5:ee0691c0 r4:ee082000 [ 0.279571] r3:ee083fe0 [ 0.282470] [<c0060758>] (kthreadd+0x0/0x108) from [<c0044a08>] (do_exit+0x0/0x6dc) [ 0.290405] r5:c0060758 r4:00000000 [ 0.294189] ---[ end trace 1b75b31a2719ed1c ]--- [ 0.299041] possible reason: unannotated irqs-on.
[ 0.303955] irq event stamp: 5 [ 0.307159] hardirqs last enabled at (4): [<c001331c>] no_work_pending+0x8/0x2c [ 0.314880] hardirqs last disabled at (5): [<c0089b08>] trace_hardirqs_on_caller+0x60/0x26c [ 0.323547] softirqs last enabled at (0): [<c003f754>] copy_process+0x33c/0xef4 [ 0.331207] softirqs last disabled at (0): [< (null)>] (null) [ 0.337585] CPU0: thread -1, cpu 0, socket 0, mpidr 80000000 Acked-by: Christoph Lameter <cl@linux.com> Signed-off-by: Ming Lei <tom.leiming@gmail.com> Signed-off-by: Tejun Heo <tj@kernel.org> --- include/linux/percpu.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 3b609eb9cd7d..594c0040fdd8 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -348,9 +348,9 @@ do { \ #define _this_cpu_generic_to_op(pcp, val, op) \ do { \ unsigned long flags; \ - local_irq_save(flags); \ + raw_local_irq_save(flags); \ *__this_cpu_ptr(&(pcp)) op val; \ - local_irq_restore(flags); \ + raw_local_irq_restore(flags); \ } while (0) #ifndef this_cpu_write @@ -449,10 +449,10 @@ do { \ ({ \ typeof(pcp) ret__; \ unsigned long flags; \ - local_irq_save(flags); \ + raw_local_irq_save(flags); \ __this_cpu_add(pcp, val); \ ret__ = __this_cpu_read(pcp); \ - local_irq_restore(flags); \ + raw_local_irq_restore(flags); \ ret__; \ }) @@ -479,10 +479,10 @@ do { \ #define _this_cpu_generic_xchg(pcp, nval) \ ({ typeof(pcp) ret__; \ unsigned long flags; \ - local_irq_save(flags); \ + raw_local_irq_save(flags); \ ret__ = __this_cpu_read(pcp); \ __this_cpu_write(pcp, nval); \ - local_irq_restore(flags); \ + raw_local_irq_restore(flags); \ ret__; \ }) @@ -507,11 +507,11 @@ do { \ ({ \ typeof(pcp) ret__; \ unsigned long flags; \ - local_irq_save(flags); \ + raw_local_irq_save(flags); \ ret__ = __this_cpu_read(pcp); \ if (ret__ == (oval)) \ __this_cpu_write(pcp, nval); \ - local_irq_restore(flags); \ + raw_local_irq_restore(flags); \ ret__; \ }) @@ -544,10 +544,10 @@ do { \ ({ \ int ret__; \ unsigned long flags; \ - local_irq_save(flags); \ + raw_local_irq_save(flags); \ ret__ = __this_cpu_generic_cmpxchg_double(pcp1, pcp2, \ oval1, oval2, nval1, nval2); \ - local_irq_restore(flags); \ + raw_local_irq_restore(flags); \ ret__; \ }) -- cgit v1.2.3 From 7e55d0527e4925a49464a5b26fdabae1f7a91a77 Mon Sep 17 00:00:00 2001 From: viresh kumar <viresh.kumar@st.com> Date: Thu, 23 Feb 2012 04:41:05 +0100 Subject: ARM: 7339/1: amba/serial.h: Include types.h for resolving dependency of type bool serial.h uses bool, but its definition is missing, as it doesn't include types.h. 
Fix this by including <linux/types.h>. Signed-off-by: Viresh Kumar <viresh.kumar@st.com> Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk> --- include/linux/amba/serial.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/amba/serial.h b/include/linux/amba/serial.h index 514ed45c462e..d117b29d1062 100644 --- a/include/linux/amba/serial.h +++ b/include/linux/amba/serial.h @@ -23,6 +23,8 @@ #ifndef ASM_ARM_HARDWARE_SERIAL_AMBA_H #define ASM_ARM_HARDWARE_SERIAL_AMBA_H +#include <linux/types.h> + /* ------------------------------------------------------------------------------- * From AMBA UART (PL010) Block Specification * ------------------------------------------------------------------------------- -- cgit v1.2.3 From ecb971923614775a118bc05ad16b2bde450cac7d Mon Sep 17 00:00:00 2001 From: Neal Cardwell <ncardwell@google.com> Date: Mon, 27 Feb 2012 17:52:52 -0500 Subject: tcp: fix comment for tp->highest_sack MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There was an off-by-one error in the comments describing the highest_sack field in struct tcp_sock. The comments previously claimed that it was the "start sequence of the highest skb with SACKed bit". This commit fixes the comments to note that it is the "start sequence of the skb just *after* the highest skb with SACKed bit". Signed-off-by: Neal Cardwell <ncardwell@google.com> Acked-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi> Signed-off-by: David S. Miller <davem@davemloft.net> --- include/linux/tcp.h | 3 ++- include/net/tcp.h | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 46a85c9e1f25..3c7ffdb40dc6 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -412,7 +412,8 @@ struct tcp_sock { struct tcp_sack_block recv_sack_cache[4]; - struct sk_buff *highest_sack; /* highest skb with SACK received + struct sk_buff *highest_sack; /* skb just after the highest + * skb with SACKed bit set * (validity guaranteed only if * sacked_out > 0) */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 42c29bfbcee3..2d80c291fffb 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1364,8 +1364,9 @@ static inline void tcp_push_pending_frames(struct sock *sk) } } -/* Start sequence of the highest skb with SACKed bit, valid only if - * sacked > 0 or when the caller has ensured validity by itself. +/* Start sequence of the skb just after the highest skb with SACKed + * bit, valid only if sacked_out > 0 or when the caller has ensured + * validity by itself. */ static inline u32 tcp_highest_sack_seq(struct tcp_sock *tp) { -- cgit v1.2.3 From fe316bf2d5847bc5dd975668671a7b1067603bc7 Mon Sep 17 00:00:00 2001 From: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Date: Fri, 2 Mar 2012 10:38:33 +0100 Subject: block: Fix NULL pointer dereference in sd_revalidate_disk Since 2.6.39 (1196f8b), when a driver returns -ENOMEDIUM for open(), __blkdev_get() calls rescan_partitions() to remove in-kernel partition structures and raise a KOBJ_CHANGE uevent. However, it ends up calling the driver's revalidate_disk without a successful open, which can cause an oops.
In the case of SCSI: process A process B ---------------------------------------------- sys_open __blkdev_get sd_open returns -ENOMEDIUM scsi_remove_device <scsi_device torn down> rescan_partitions sd_revalidate_disk <oops> Oopses are reported here: http://marc.info/?l=linux-scsi&m=132388619710052 This patch separates the partition invalidation from rescan_partitions() and use it for -ENOMEDIUM case. Reported-by: Huajun Li <huajun.li.lee@gmail.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Acked-by: Tejun Heo <tj@kernel.org> Cc: stable@kernel.org Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/partition-generic.c | 48 +++++++++++++++++++++++++++++++++++++++-------- fs/block_dev.c | 16 ++++++++++++---- include/linux/genhd.h | 1 + 3 files changed, 53 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/block/partition-generic.c b/block/partition-generic.c index d06ec1c829c2..6df5d6928a44 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -389,17 +389,11 @@ static bool disk_unlock_native_capacity(struct gendisk *disk) } } -int rescan_partitions(struct gendisk *disk, struct block_device *bdev) +static int drop_partitions(struct gendisk *disk, struct block_device *bdev) { - struct parsed_partitions *state = NULL; struct disk_part_iter piter; struct hd_struct *part; - int p, highest, res; -rescan: - if (state && !IS_ERR(state)) { - kfree(state); - state = NULL; - } + int res; if (bdev->bd_part_count) return -EBUSY; @@ -412,6 +406,24 @@ rescan: delete_partition(disk, part->partno); disk_part_iter_exit(&piter); + return 0; +} + +int rescan_partitions(struct gendisk *disk, struct block_device *bdev) +{ + struct parsed_partitions *state = NULL; + struct hd_struct *part; + int p, highest, res; +rescan: + if (state && !IS_ERR(state)) { + kfree(state); + state = NULL; + } + + res = drop_partitions(disk, bdev); + if (res) + return res; + if (disk->fops->revalidate_disk) disk->fops->revalidate_disk(disk); check_disk_size_change(disk, bdev); @@ -515,6 +527,26 @@ rescan: return 0; } +int invalidate_partitions(struct gendisk *disk, struct block_device *bdev) +{ + int res; + + if (!bdev->bd_invalidated) + return 0; + + res = drop_partitions(disk, bdev); + if (res) + return res; + + set_capacity(disk, 0); + check_disk_size_change(disk, bdev); + bdev->bd_invalidated = 0; + /* tell userspace that the media / partition table may have changed */ + kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); + + return 0; +} + unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p) { struct address_space *mapping = bdev->bd_inode->i_mapping; diff --git a/fs/block_dev.c b/fs/block_dev.c index 0e575d1304b4..5e9f198f7712 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1183,8 +1183,12 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) * The latter is necessary to prevent ghost * partitions on a removed medium. 
*/ - if (bdev->bd_invalidated && (!ret || ret == -ENOMEDIUM)) - rescan_partitions(disk, bdev); + if (bdev->bd_invalidated) { + if (!ret) + rescan_partitions(disk, bdev); + else if (ret == -ENOMEDIUM) + invalidate_partitions(disk, bdev); + } if (ret) goto out_clear; } else { @@ -1214,8 +1218,12 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) if (bdev->bd_disk->fops->open) ret = bdev->bd_disk->fops->open(bdev, mode); /* the same as first opener case, read comment there */ - if (bdev->bd_invalidated && (!ret || ret == -ENOMEDIUM)) - rescan_partitions(bdev->bd_disk, bdev); + if (bdev->bd_invalidated) { + if (!ret) + rescan_partitions(bdev->bd_disk, bdev); + else if (ret == -ENOMEDIUM) + invalidate_partitions(bdev->bd_disk, bdev); + } if (ret) goto out_unlock_bdev; } diff --git a/include/linux/genhd.h b/include/linux/genhd.h index fe23ee768589..e61d3192448e 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -596,6 +596,7 @@ extern char *disk_name (struct gendisk *hd, int partno, char *buf); extern int disk_expand_part_tbl(struct gendisk *disk, int target); extern int rescan_partitions(struct gendisk *disk, struct block_device *bdev); +extern int invalidate_partitions(struct gendisk *disk, struct block_device *bdev); extern struct hd_struct * __must_check add_partition(struct gendisk *disk, int partno, sector_t start, sector_t len, int flags, -- cgit v1.2.3 From 62d3c5439c534b0e6c653fc63e6d8c67be3a57b1 Mon Sep 17 00:00:00 2001 From: Alan Stern <stern@rowland.harvard.edu> Date: Fri, 2 Mar 2012 10:51:00 +0100 Subject: Block: use a freezable workqueue for disk-event polling This patch (as1519) fixes a bug in the block layer's disk-events polling. The polling is done by a work routine queued on the system_nrt_wq workqueue. Since that workqueue isn't freezable, the polling continues even in the middle of a system sleep transition. Obviously, polling a suspended drive for media changes and such isn't a good thing to do; in the case of USB mass-storage devices it can lead to real problems requiring device resets and even re-enumeration. The patch fixes things by creating a new system-wide, non-reentrant, freezable workqueue and using it for disk-events polling. Signed-off-by: Alan Stern <stern@rowland.harvard.edu> CC: <stable@kernel.org> Acked-by: Tejun Heo <tj@kernel.org> Acked-by: Rafael J. 
Wysocki <rjw@sisk.pl> Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/genhd.c | 10 +++++----- include/linux/workqueue.h | 4 ++++ kernel/workqueue.c | 7 ++++++- 3 files changed, 15 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/block/genhd.c b/block/genhd.c index b26c4085590d..df9816ede75b 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1478,9 +1478,9 @@ static void __disk_unblock_events(struct gendisk *disk, bool check_now) intv = disk_events_poll_jiffies(disk); set_timer_slack(&ev->dwork.timer, intv / 4); if (check_now) - queue_delayed_work(system_nrt_wq, &ev->dwork, 0); + queue_delayed_work(system_nrt_freezable_wq, &ev->dwork, 0); else if (intv) - queue_delayed_work(system_nrt_wq, &ev->dwork, intv); + queue_delayed_work(system_nrt_freezable_wq, &ev->dwork, intv); out_unlock: spin_unlock_irqrestore(&ev->lock, flags); } @@ -1524,7 +1524,7 @@ void disk_flush_events(struct gendisk *disk, unsigned int mask) ev->clearing |= mask; if (!ev->block) { cancel_delayed_work(&ev->dwork); - queue_delayed_work(system_nrt_wq, &ev->dwork, 0); + queue_delayed_work(system_nrt_freezable_wq, &ev->dwork, 0); } spin_unlock_irq(&ev->lock); } @@ -1561,7 +1561,7 @@ unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask) /* uncondtionally schedule event check and wait for it to finish */ disk_block_events(disk); - queue_delayed_work(system_nrt_wq, &ev->dwork, 0); + queue_delayed_work(system_nrt_freezable_wq, &ev->dwork, 0); flush_delayed_work(&ev->dwork); __disk_unblock_events(disk, false); @@ -1598,7 +1598,7 @@ static void disk_events_workfn(struct work_struct *work) intv = disk_events_poll_jiffies(disk); if (!ev->block && intv) - queue_delayed_work(system_nrt_wq, &ev->dwork, intv); + queue_delayed_work(system_nrt_freezable_wq, &ev->dwork, intv); spin_unlock_irq(&ev->lock); diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index eb8b9f15f2e0..af155450cabb 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -289,12 +289,16 @@ enum { * * system_freezable_wq is equivalent to system_wq except that it's * freezable. + * + * system_nrt_freezable_wq is equivalent to system_nrt_wq except that + * it's freezable. 
*/ extern struct workqueue_struct *system_wq; extern struct workqueue_struct *system_long_wq; extern struct workqueue_struct *system_nrt_wq; extern struct workqueue_struct *system_unbound_wq; extern struct workqueue_struct *system_freezable_wq; +extern struct workqueue_struct *system_nrt_freezable_wq; extern struct workqueue_struct * __alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active, diff --git a/kernel/workqueue.c b/kernel/workqueue.c index bec7b5b53e03..f2c5638bb5ab 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -253,11 +253,13 @@ struct workqueue_struct *system_long_wq __read_mostly; struct workqueue_struct *system_nrt_wq __read_mostly; struct workqueue_struct *system_unbound_wq __read_mostly; struct workqueue_struct *system_freezable_wq __read_mostly; +struct workqueue_struct *system_nrt_freezable_wq __read_mostly; EXPORT_SYMBOL_GPL(system_wq); EXPORT_SYMBOL_GPL(system_long_wq); EXPORT_SYMBOL_GPL(system_nrt_wq); EXPORT_SYMBOL_GPL(system_unbound_wq); EXPORT_SYMBOL_GPL(system_freezable_wq); +EXPORT_SYMBOL_GPL(system_nrt_freezable_wq); #define CREATE_TRACE_POINTS #include <trace/events/workqueue.h> @@ -3833,8 +3835,11 @@ static int __init init_workqueues(void) WQ_UNBOUND_MAX_ACTIVE); system_freezable_wq = alloc_workqueue("events_freezable", WQ_FREEZABLE, 0); + system_nrt_freezable_wq = alloc_workqueue("events_nrt_freezable", + WQ_NON_REENTRANT | WQ_FREEZABLE, 0); BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq || - !system_unbound_wq || !system_freezable_wq); + !system_unbound_wq || !system_freezable_wq || + !system_nrt_freezable_wq); return 0; } early_initcall(init_workqueues); -- cgit v1.2.3 From adb795062f89b8d67d295ee25e04034bccce6779 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov <khlebnikov@openvz.org> Date: Wed, 29 Feb 2012 00:41:12 +0400 Subject: percpu: fix __this_cpu_{sub,inc,dec}_return() definition This patch adds the missing "__" prefixes; otherwise these functions work as the irq/preemption-safe variants. Reported-by: Torsten Kaiser <just.for.lkml@googlemail.com> Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org> Signed-off-by: Tejun Heo <tj@kernel.org> --- include/linux/percpu.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 594c0040fdd8..21638ae14e07 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -722,9 +722,9 @@ do { \ __pcpu_size_call_return2(__this_cpu_add_return_, pcp, val) #endif -#define __this_cpu_sub_return(pcp, val) this_cpu_add_return(pcp, -(val)) -#define __this_cpu_inc_return(pcp) this_cpu_add_return(pcp, 1) -#define __this_cpu_dec_return(pcp) this_cpu_add_return(pcp, -1) +#define __this_cpu_sub_return(pcp, val) __this_cpu_add_return(pcp, -(val)) +#define __this_cpu_inc_return(pcp) __this_cpu_add_return(pcp, 1) +#define __this_cpu_dec_return(pcp) __this_cpu_add_return(pcp, -1) #define __this_cpu_generic_xchg(pcp, nval) \ ({ typeof(pcp) ret__; \ -- cgit v1.2.3 From 5483f18e986ed5267b923bec12b407845181350b Mon Sep 17 00:00:00 2001 From: Linus Torvalds <torvalds@linux-foundation.org> Date: Sun, 4 Mar 2012 15:51:42 -0800 Subject: vfs: move dentry_cmp from <linux/dcache.h> to fs/dcache.c It's only used inside fs/dcache.c, and we're going to play games with it for the word-at-a-time patches. This time we really don't even want to export it, because it really is an internal function to fs/dcache.c, and has been since it was introduced.
Having it in that extremely hot header file (it's included in pretty much everything, thanks to <linux/fs.h>) is a disaster for testing different versions, and is utterly pointless. We really should have some kind of header file diet thing, where we figure out which parts of header files are really better off private and only result in more expensive compiles. Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- fs/dcache.c | 20 ++++++++++++++++++++ include/linux/dcache.h | 20 -------------------- 2 files changed, 20 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/fs/dcache.c b/fs/dcache.c index 138be96e25b6..bcbdb33fcc20 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -137,6 +137,26 @@ int proc_nr_dentry(ctl_table *table, int write, void __user *buffer, } #endif +/* + * Compare 2 name strings, return 0 if they match, otherwise non-zero. + * The strings are both count bytes long, and count is non-zero. + */ +static inline int dentry_cmp(const unsigned char *cs, size_t scount, + const unsigned char *ct, size_t tcount) +{ + if (scount != tcount) + return 1; + + do { + if (*cs != *ct) + return 1; + cs++; + ct++; + tcount--; + } while (tcount); + return 0; +} + static void __d_free(struct rcu_head *head) { struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 4270bedd2308..ff5f5256d175 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -47,26 +47,6 @@ struct dentry_stat_t { }; extern struct dentry_stat_t dentry_stat; -/* - * Compare 2 name strings, return 0 if they match, otherwise non-zero. - * The strings are both count bytes long, and count is non-zero. - */ -static inline int dentry_cmp(const unsigned char *cs, size_t scount, - const unsigned char *ct, size_t tcount) -{ - if (scount != tcount) - return 1; - - do { - if (*cs != *ct) - return 1; - cs++; - ct++; - tcount--; - } while (tcount); - return 0; -} - /* Name hashing routines. Initial hash value */ /* Hash courtesy of the R5 hash in reiserfs modulo sign bits */ #define init_name_hash() 0 -- cgit v1.2.3 From c22ab332902333f83766017478c1ef6607ace681 Mon Sep 17 00:00:00 2001 From: Matthew Garrett <mjg@redhat.com> Date: Mon, 5 Mar 2012 14:59:10 -0800 Subject: kmsg_dump: don't run on non-error paths by default Since commit 04c6862c055f ("kmsg_dump: add kmsg_dump() calls to the reboot, halt, poweroff and emergency_restart paths"), kmsg_dump() gets run on normal paths including poweroff and reboot. This is less than ideal given pstore implementations that can only represent single backtraces, since a reboot may overwrite a stored oops before it's been picked up by userspace. In addition, some pstore backends may have low performance and provide a significant delay in reboot as a result. This patch adds a printk.always_kmsg_dump kernel parameter (which can also be changed from userspace). Without it, the code will only be run on failure paths rather than on normal paths. The option can be enabled in environments where there's a desire to attempt to audit whether or not a reboot was cleanly requested or not. 
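With the reordered enum, a dump callback that only wants crash output can filter on the reason directly (a sketch against this kernel's kmsg_dump callback signature; the function name is hypothetical):

static void example_dump(struct kmsg_dumper *dumper,
			 enum kmsg_dump_reason reason,
			 const char *s1, unsigned long l1,
			 const char *s2, unsigned long l2)
{
	/* reasons listed after KMSG_DUMP_OOPS are now delivered only
	 * when printk.always_kmsg_dump is set */
	if (reason > KMSG_DUMP_OOPS)
		return;
	/* persist the two log segments (s1/l1 and s2/l2) here */
}

At runtime the parameter can also be flipped via /sys/module/printk/parameters/always_kmsg_dump, since it is declared writable (S_IWUSR) below.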
Signed-off-by: Matthew Garrett <mjg@redhat.com> Acked-by: Seiji Aguchi <seiji.aguchi@hds.com> Cc: Seiji Aguchi <seiji.aguchi@hds.com> Cc: David Woodhouse <dwmw2@infradead.org> Cc: Marco Stornelli <marco.stornelli@gmail.com> Cc: Artem Bityutskiy <Artem.Bityutskiy@nokia.com> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: Vivek Goyal <vgoyal@redhat.com> Cc: Don Zickus <dzickus@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- Documentation/kernel-parameters.txt | 6 ++++++ include/linux/kmsg_dump.h | 9 +++++++-- kernel/printk.c | 6 ++++++ 3 files changed, 19 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 033d4e69b43b..d99fd9c0ec0e 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2211,6 +2211,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted. default: off. + printk.always_kmsg_dump= + Trigger kmsg_dump for cases other than kernel oops or + panics + Format: <bool> (1/Y/y=enable, 0/N/n=disable) + default: disabled + printk.time= Show timing data prefixed to each printk message line Format: <bool> (1/Y/y=enable, 0/N/n=disable) diff --git a/include/linux/kmsg_dump.h b/include/linux/kmsg_dump.h index fee66317e071..35f7237ec972 100644 --- a/include/linux/kmsg_dump.h +++ b/include/linux/kmsg_dump.h @@ -15,13 +15,18 @@ #include <linux/errno.h> #include <linux/list.h> +/* + * Keep this list arranged in rough order of priority. Anything listed after + * KMSG_DUMP_OOPS will not be logged by default unless printk.always_kmsg_dump + * is passed to the kernel. + */ enum kmsg_dump_reason { - KMSG_DUMP_OOPS, KMSG_DUMP_PANIC, + KMSG_DUMP_OOPS, + KMSG_DUMP_EMERG, KMSG_DUMP_RESTART, KMSG_DUMP_HALT, KMSG_DUMP_POWEROFF, - KMSG_DUMP_EMERG, }; /** diff --git a/kernel/printk.c b/kernel/printk.c index 13c0a1143f49..32690a0b7a18 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -702,6 +702,9 @@ static bool printk_time = 0; #endif module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); +static bool always_kmsg_dump; +module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR); + /* Check if we have any console registered that can be called early in boot. */ static int have_callable_console(void) { @@ -1732,6 +1735,9 @@ void kmsg_dump(enum kmsg_dump_reason reason) unsigned long l1, l2; unsigned long flags; + if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump) + return; + /* Theoretically, the log could move on after we do this, but there's not a lot we can do about that. The new messages will overwrite the start of what we dump. */ -- cgit v1.2.3 From c415c3b47ea2754659d915cca387a20999044163 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov <oleg@redhat.com> Date: Mon, 5 Mar 2012 14:59:13 -0800 Subject: vfork: introduce complete_vfork_done() No functional changes. Move the clear-and-complete-vfork_done code into the new trivial helper, complete_vfork_done(). 
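Concretely, the open-coded sequence that appeared at both call sites, coredump_wait() and mm_release():

	struct completion *vfork_done = tsk->vfork_done;

	if (vfork_done) {
		tsk->vfork_done = NULL;
		complete(vfork_done);
	}

collapses to:

	if (tsk->vfork_done)
		complete_vfork_done(tsk);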
Signed-off-by: Oleg Nesterov <oleg@redhat.com> Acked-by: Tejun Heo <tj@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- fs/exec.c | 8 ++------ include/linux/sched.h | 1 + kernel/fork.c | 17 ++++++++++------- 3 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/fs/exec.c b/fs/exec.c index 92ce83a11e90..dccdcec913e9 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1915,7 +1915,6 @@ static int coredump_wait(int exit_code, struct core_state *core_state) { struct task_struct *tsk = current; struct mm_struct *mm = tsk->mm; - struct completion *vfork_done; int core_waiters = -EBUSY; init_completion(&core_state->startup); @@ -1934,11 +1933,8 @@ static int coredump_wait(int exit_code, struct core_state *core_state) * Make sure nobody is waiting for us to release the VM, * otherwise we can deadlock when we wait on each other */ - vfork_done = tsk->vfork_done; - if (vfork_done) { - tsk->vfork_done = NULL; - complete(vfork_done); - } + if (tsk->vfork_done) + complete_vfork_done(tsk); if (core_waiters) wait_for_completion(&core_state->startup); diff --git a/include/linux/sched.h b/include/linux/sched.h index 7d379a6bfd88..1b25a37f2aee 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2291,6 +2291,7 @@ extern int do_execve(const char *, const char __user * const __user *, const char __user * const __user *, struct pt_regs *); extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *); +extern void complete_vfork_done(struct task_struct *tsk); struct task_struct *fork_idle(int); extern void set_task_comm(struct task_struct *tsk, char *from); diff --git a/kernel/fork.c b/kernel/fork.c index e2cd3e2a5ae8..cf3d96379608 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -668,6 +668,14 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) return mm; } +void complete_vfork_done(struct task_struct *tsk) +{ + struct completion *vfork_done = tsk->vfork_done; + + tsk->vfork_done = NULL; + complete(vfork_done); +} + /* Please note the differences between mmput and mm_release. * mmput is called whenever we stop holding onto a mm_struct, * error success whatever. @@ -683,8 +691,6 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) */ void mm_release(struct task_struct *tsk, struct mm_struct *mm) { - struct completion *vfork_done = tsk->vfork_done; - /* Get rid of any futexes when releasing the mm */ #ifdef CONFIG_FUTEX if (unlikely(tsk->robust_list)) { @@ -704,11 +710,8 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) /* Get rid of any cached register state */ deactivate_mm(tsk, mm); - /* notify parent sleeping on vfork() */ - if (vfork_done) { - tsk->vfork_done = NULL; - complete(vfork_done); - } + if (tsk->vfork_done) + complete_vfork_done(tsk); /* * If we're exiting normally, clear a user-space tid field if -- cgit v1.2.3 From d68b46fe16ad59b3a5f51ec73daaa5dc06753798 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov <oleg@redhat.com> Date: Mon, 5 Mar 2012 14:59:13 -0800 Subject: vfork: make it killable Make vfork() killable. Change do_fork(CLONE_VFORK) to do wait_for_completion_killable(). If it fails we do not return to the user-mode and never touch the memory shared with our child. However, in this case we should clear child->vfork_done before return, we use task_lock() in do_fork()->wait_for_vfork_done() and complete_vfork_done() to serialize with each other. 
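Condensed from the patch, both paths take task_lock() on the child before touching ->vfork_done, so a killed parent can clear the pointer without racing against the child's complete():

	/* child side (exec/exit), complete_vfork_done() */
	task_lock(tsk);
	vfork = tsk->vfork_done;
	if (likely(vfork)) {
		tsk->vfork_done = NULL;
		complete(vfork);
	}
	task_unlock(tsk);

	/* parent side after a fatal signal, wait_for_vfork_done() */
	task_lock(child);
	child->vfork_done = NULL;
	task_unlock(child);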
Note: now that we use task_lock() we don't really need completion, we could turn task->vfork_done into "task_struct *wake_up_me" but this needs some complications. NOTE: this and the next patches do not affect in-kernel users of CLONE_VFORK, kernel threads run with all signals ignored including SIGKILL/SIGSTOP. However this is obviously the user-visible change. Not only a fatal signal can kill the vforking parent, a sub-thread can do execve or exit_group() and kill the thread sleeping in vfork(). Signed-off-by: Oleg Nesterov <oleg@redhat.com> Acked-by: Tejun Heo <tj@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- include/linux/sched.h | 2 +- kernel/fork.c | 40 ++++++++++++++++++++++++++++++++-------- 2 files changed, 33 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 1b25a37f2aee..b6467711f12e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2372,7 +2372,7 @@ static inline int thread_group_empty(struct task_struct *p) * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring * subscriptions and synchronises with wait4(). Also used in procfs. Also * pins the final release of task.io_context. Also protects ->cpuset and - * ->cgroup.subsys[]. + * ->cgroup.subsys[]. And ->vfork_done. * * Nests both inside and outside of read_lock(&tasklist_lock). * It must not be nested with write_lock_irq(&tasklist_lock), diff --git a/kernel/fork.c b/kernel/fork.c index cf3d96379608..892c534ce6e3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -670,10 +670,34 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) void complete_vfork_done(struct task_struct *tsk) { - struct completion *vfork_done = tsk->vfork_done; + struct completion *vfork; - tsk->vfork_done = NULL; - complete(vfork_done); + task_lock(tsk); + vfork = tsk->vfork_done; + if (likely(vfork)) { + tsk->vfork_done = NULL; + complete(vfork); + } + task_unlock(tsk); +} + +static int wait_for_vfork_done(struct task_struct *child, + struct completion *vfork) +{ + int killed; + + freezer_do_not_count(); + killed = wait_for_completion_killable(vfork); + freezer_count(); + + if (killed) { + task_lock(child); + child->vfork_done = NULL; + task_unlock(child); + } + + put_task_struct(child); + return killed; } /* Please note the differences between mmput and mm_release. @@ -717,7 +741,8 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) * If we're exiting normally, clear a user-space tid field if * requested. We leave this alone when dying by signal, to leave * the value intact in a core dump, and to save the unnecessary - * trouble otherwise. Userland only wants this done for a sys_exit. + * trouble, say, a killed vfork parent shouldn't touch this mm. + * Userland only wants this done for a sys_exit. 
*/ if (tsk->clear_child_tid) { if (!(tsk->flags & PF_SIGNALED) && @@ -1551,6 +1576,7 @@ long do_fork(unsigned long clone_flags, if (clone_flags & CLONE_VFORK) { p->vfork_done = &vfork; init_completion(&vfork); + get_task_struct(p); } /* @@ -1568,10 +1594,8 @@ long do_fork(unsigned long clone_flags, ptrace_event(trace, nr); if (clone_flags & CLONE_VFORK) { - freezer_do_not_count(); - wait_for_completion(&vfork); - freezer_count(); - ptrace_event(PTRACE_EVENT_VFORK_DONE, nr); + if (!wait_for_vfork_done(p, &vfork)) + ptrace_event(PTRACE_EVENT_VFORK_DONE, nr); } } else { nr = PTR_ERR(p); -- cgit v1.2.3 From 57b59c4a1400fa6c34764eab2e35a8762dc05a09 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov <oleg@redhat.com> Date: Mon, 5 Mar 2012 14:59:13 -0800 Subject: coredump_wait: don't call complete_vfork_done() Now that CLONE_VFORK is killable, coredump_wait() no longer needs complete_vfork_done(). zap_threads() should find and kill all tasks with the same ->mm, this includes our parent if ->vfork_done is set. mm_release() becomes the only caller, unexport complete_vfork_done(). Signed-off-by: Oleg Nesterov <oleg@redhat.com> Acked-by: Tejun Heo <tj@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- fs/exec.c | 14 ++------------ include/linux/sched.h | 1 - kernel/fork.c | 2 +- 3 files changed, 3 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/fs/exec.c b/fs/exec.c index dccdcec913e9..153dee14fe55 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1926,19 +1926,9 @@ static int coredump_wait(int exit_code, struct core_state *core_state) core_waiters = zap_threads(tsk, mm, core_state, exit_code); up_write(&mm->mmap_sem); - if (unlikely(core_waiters < 0)) - goto fail; - - /* - * Make sure nobody is waiting for us to release the VM, - * otherwise we can deadlock when we wait on each other - */ - if (tsk->vfork_done) - complete_vfork_done(tsk); - - if (core_waiters) + if (core_waiters > 0) wait_for_completion(&core_state->startup); -fail: + return core_waiters; } diff --git a/include/linux/sched.h b/include/linux/sched.h index b6467711f12e..11fcafaf4ae4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2291,7 +2291,6 @@ extern int do_execve(const char *, const char __user * const __user *, const char __user * const __user *, struct pt_regs *); extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *); -extern void complete_vfork_done(struct task_struct *tsk); struct task_struct *fork_idle(int); extern void set_task_comm(struct task_struct *tsk, char *from); diff --git a/kernel/fork.c b/kernel/fork.c index 892c534ce6e3..44b0e21af50e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -668,7 +668,7 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) return mm; } -void complete_vfork_done(struct task_struct *tsk) +static void complete_vfork_done(struct task_struct *tsk) { struct completion *vfork; -- cgit v1.2.3 From 6e27f63edbd7ab893258e16500171dd1270a1369 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov <oleg@redhat.com> Date: Mon, 5 Mar 2012 14:59:14 -0800 Subject: vfork: kill PF_STARTING Previously it was (ab)used by utrace. Then it was wrongly used by the scheduler code. Currently it is not used, kill it before it finds the new erroneous user. 
Signed-off-by: Oleg Nesterov <oleg@redhat.com> Acked-by: Tejun Heo <tj@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- include/linux/sched.h | 1 - kernel/fork.c | 9 --------- 2 files changed, 10 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 11fcafaf4ae4..0657368bd78f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1777,7 +1777,6 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t * /* * Per process flags */ -#define PF_STARTING 0x00000002 /* being created */ #define PF_EXITING 0x00000004 /* getting shut down */ #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ #define PF_VCPU 0x00000010 /* I'm a virtual CPU */ diff --git a/kernel/fork.c b/kernel/fork.c index 44b0e21af50e..26a7a6707fa7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1046,7 +1046,6 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p) new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); new_flags |= PF_FORKNOEXEC; - new_flags |= PF_STARTING; p->flags = new_flags; } @@ -1579,14 +1578,6 @@ long do_fork(unsigned long clone_flags, get_task_struct(p); } - /* - * We set PF_STARTING at creation in case tracing wants to - * use this to distinguish a fully live task from one that - * hasn't finished SIGSTOP raising yet. Now we clear it - * and set the child going. - */ - p->flags &= ~PF_STARTING; - wake_up_new_task(p); /* forking complete and child started to run, tell ptracer */ -- cgit v1.2.3 From 7512102cf64d36e3c7444480273623c7aab3563f Mon Sep 17 00:00:00 2001 From: Hugh Dickins <hughd@google.com> Date: Mon, 5 Mar 2012 14:59:18 -0800 Subject: memcg: fix GPF when cgroup removal races with last exit When moving tasks from old memcg (with move_charge_at_immigrate on new memcg), followed by removal of old memcg, hit General Protection Fault in mem_cgroup_lru_del_list() (called from release_pages called from free_pages_and_swap_cache from tlb_flush_mmu from tlb_finish_mmu from exit_mmap from mmput from exit_mm from do_exit). Somewhat reproducible, takes a few hours: the old struct mem_cgroup has been freed and poisoned by SLAB_DEBUG, but mem_cgroup_lru_del_list() is still trying to update its stats, and take page off lru before freeing. A task, or a charge, or a page on lru: each secures a memcg against removal. In this case, the last task has been moved out of the old memcg, and it is exiting: anonymous pages are uncharged one by one from the memcg, as they are zapped from its pagetables, so the charge gets down to 0; but the pages themselves are queued in an mmu_gather for freeing. Most of those pages will be on lru (and force_empty is careful to lru_add_drain_all, to add pages from pagevec to lru first), but not necessarily all: perhaps some have been isolated for page reclaim, perhaps some isolated for other reasons. So, force_empty may find no task, no charge and no page on lru, and let the removal proceed. There would still be no problem if these pages were immediately freed; but typically (and the put_page_testzero protocol demands it) they have to be added back to lru before they are found freeable, then removed from lru and freed. We don't see the issue when adding, because the mem_cgroup_iter() loops keep their own reference to the memcg being scanned; but when it comes to mem_cgroup_lru_del_list(). 
I believe this was not an issue in v3.2: there, PageCgroupAcctLRU and PageCgroupUsed flags were used (like a trick with mirrors) to deflect view of pc->mem_cgroup to the stable root_mem_cgroup when neither set. 38c5d72f3ebe ("memcg: simplify LRU handling by new rule") mercifully removed those convolutions, but left this General Protection Fault. But it's surprisingly easy to restore the old behaviour: just check PageCgroupUsed in mem_cgroup_lru_add_list() (which decides on which lruvec to add), and reset pc to root_mem_cgroup if page is uncharged. A risky change? just going back to how it worked before; testing, and an audit of uses of pc->mem_cgroup, show no problem. And there's a nice bonus: with mem_cgroup_lru_add_list() itself making sure that an uncharged page goes to root lru, mem_cgroup_reset_owner() no longer has any purpose, and we can safely revert 4e5f01c2b9b9 ("memcg: clear pc->mem_cgroup if necessary"). Calling update_page_reclaim_stat() after add_page_to_lru_list() in swap.c is not strictly necessary: the lru_lock there, with RCU before memcg structures are freed, makes mem_cgroup_get_reclaim_stat_from_page safe without that; but it seems cleaner to rely on one dependency less. Signed-off-by: Hugh Dickins <hughd@google.com> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Konstantin Khlebnikov <khlebnikov@openvz.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- include/linux/memcontrol.h | 5 ----- mm/ksm.c | 11 ----------- mm/memcontrol.c | 30 +++++++++++++----------------- mm/migrate.c | 2 -- mm/swap.c | 8 +++++--- mm/swap_state.c | 10 ---------- 6 files changed, 18 insertions(+), 48 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 4d34356fe644..b80de520670b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -129,7 +129,6 @@ extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, extern void mem_cgroup_replace_page_cache(struct page *oldpage, struct page *newpage); -extern void mem_cgroup_reset_owner(struct page *page); #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP extern int do_swap_account; #endif @@ -392,10 +391,6 @@ static inline void mem_cgroup_replace_page_cache(struct page *oldpage, struct page *newpage) { } - -static inline void mem_cgroup_reset_owner(struct page *page) -{ -} #endif /* CONFIG_CGROUP_MEM_CONT */ #if !defined(CONFIG_CGROUP_MEM_RES_CTLR) || !defined(CONFIG_DEBUG_VM) diff --git a/mm/ksm.c b/mm/ksm.c index 1925ffbfb27f..310544a379ae 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -28,7 +28,6 @@ #include <linux/kthread.h> #include <linux/wait.h> #include <linux/slab.h> -#include <linux/memcontrol.h> #include <linux/rbtree.h> #include <linux/memory.h> #include <linux/mmu_notifier.h> @@ -1572,16 +1571,6 @@ struct page *ksm_does_need_to_copy(struct page *page, new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); if (new_page) { - /* - * The memcg-specific accounting when moving - * pages around the LRU lists relies on the - * page's owner (memcg) to be valid. Usually, - * pages are assigned to a new owner before - * being put on the LRU list, but since this - * is not the case here, the stale owner from - * a previous allocation cycle must be reset. 
- */ - mem_cgroup_reset_owner(new_page); copy_user_highpage(new_page, page, address, vma); SetPageDirty(new_page); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1097d8098f8c..d0e57a3cda18 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1042,6 +1042,19 @@ struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page, pc = lookup_page_cgroup(page); memcg = pc->mem_cgroup; + + /* + * Surreptitiously switch any uncharged page to root: + * an uncharged page off lru does nothing to secure + * its former mem_cgroup from sudden removal. + * + * Our caller holds lru_lock, and PageCgroupUsed is updated + * under page_cgroup lock: between them, they make all uses + * of pc->mem_cgroup safe. + */ + if (!PageCgroupUsed(pc) && memcg != root_mem_cgroup) + pc->mem_cgroup = memcg = root_mem_cgroup; + mz = page_cgroup_zoneinfo(memcg, page); /* compound_order() is stabilized through lru_lock */ MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); @@ -3029,23 +3042,6 @@ void mem_cgroup_uncharge_end(void) batch->memcg = NULL; } -/* - * A function for resetting pc->mem_cgroup for newly allocated pages. - * This function should be called if the newpage will be added to LRU - * before start accounting. - */ -void mem_cgroup_reset_owner(struct page *newpage) -{ - struct page_cgroup *pc; - - if (mem_cgroup_disabled()) - return; - - pc = lookup_page_cgroup(newpage); - VM_BUG_ON(PageCgroupUsed(pc)); - pc->mem_cgroup = root_mem_cgroup; -} - #ifdef CONFIG_SWAP /* * called after __delete_from_swap_cache() and drop "page" account. diff --git a/mm/migrate.c b/mm/migrate.c index df141f60289e..1503b6b54ecb 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -839,8 +839,6 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, if (!newpage) return -ENOMEM; - mem_cgroup_reset_owner(newpage); - if (page_count(page) == 1) { /* page was freed from under us. So we are done. */ goto out; diff --git a/mm/swap.c b/mm/swap.c index fff1ff7fb9ad..14380e9fbe33 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -652,7 +652,7 @@ EXPORT_SYMBOL(__pagevec_release); void lru_add_page_tail(struct zone* zone, struct page *page, struct page *page_tail) { - int active; + int uninitialized_var(active); enum lru_list lru; const int file = 0; @@ -672,7 +672,6 @@ void lru_add_page_tail(struct zone* zone, active = 0; lru = LRU_INACTIVE_ANON; } - update_page_reclaim_stat(zone, page_tail, file, active); } else { SetPageUnevictable(page_tail); lru = LRU_UNEVICTABLE; @@ -693,6 +692,9 @@ void lru_add_page_tail(struct zone* zone, list_head = page_tail->lru.prev; list_move_tail(&page_tail->lru, list_head); } + + if (!PageUnevictable(page)) + update_page_reclaim_stat(zone, page_tail, file, active); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -710,8 +712,8 @@ static void __pagevec_lru_add_fn(struct page *page, void *arg) SetPageLRU(page); if (active) SetPageActive(page); - update_page_reclaim_stat(zone, page, file, active); add_page_to_lru_list(zone, page, lru); + update_page_reclaim_stat(zone, page, file, active); } /* diff --git a/mm/swap_state.c b/mm/swap_state.c index 470038a91873..ea6b32d61873 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -300,16 +300,6 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, new_page = alloc_page_vma(gfp_mask, vma, addr); if (!new_page) break; /* Out of memory */ - /* - * The memcg-specific accounting when moving - * pages around the LRU lists relies on the - * page's owner (memcg) to be valid. 
Usually, - * pages are assigned to a new owner before - * being put on the LRU list, but since this - * is not the case here, the stale owner from - * a previous allocation cycle must be reset. - */ - mem_cgroup_reset_owner(new_page); } /* -- cgit v1.2.3 From 5faa5df1fa2024bd750089ff21dcc4191798263d Mon Sep 17 00:00:00 2001 From: Steffen Klassert <steffen.klassert@secunet.com> Date: Tue, 6 Mar 2012 21:20:26 +0000 Subject: inetpeer: Invalidate the inetpeer tree along with the routing cache We initialize the routing metrics with the values cached on the inetpeer in rt_init_metrics(). So if we have the metrics cached on the inetpeer, we ignore the user configured fib_metrics. To fix this issue, we replace the old tree with a fresh initialized inet_peer_base. The old tree is removed later with a delayed work queue. Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- include/net/inetpeer.h | 3 ++ net/ipv4/inetpeer.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++- net/ipv4/route.c | 1 + 3 files changed, 83 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h index 06b795dd5906..ff04a33acf00 100644 --- a/include/net/inetpeer.h +++ b/include/net/inetpeer.h @@ -41,6 +41,7 @@ struct inet_peer { u32 pmtu_orig; u32 pmtu_learned; struct inetpeer_addr_base redirect_learned; + struct list_head gc_list; /* * Once inet_peer is queued for deletion (refcnt == -1), following fields * are not available: rid, ip_id_count, tcp_ts, tcp_ts_stamp @@ -96,6 +97,8 @@ static inline struct inet_peer *inet_getpeer_v6(const struct in6_addr *v6daddr, extern void inet_putpeer(struct inet_peer *p); extern bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout); +extern void inetpeer_invalidate_tree(int family); + /* * temporary check to make sure we dont access rid, ip_id_count, tcp_ts, * tcp_ts_stamp if no refcount is taken on inet_peer diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index bf4a9c4808e1..deea2e96b7f2 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -17,6 +17,7 @@ #include <linux/kernel.h> #include <linux/mm.h> #include <linux/net.h> +#include <linux/workqueue.h> #include <net/ip.h> #include <net/inetpeer.h> #include <net/secure_seq.h> @@ -66,6 +67,11 @@ static struct kmem_cache *peer_cachep __read_mostly; +static LIST_HEAD(gc_list); +static const int gc_delay = 60 * HZ; +static struct delayed_work gc_work; +static DEFINE_SPINLOCK(gc_lock); + #define node_height(x) x->avl_height #define peer_avl_empty ((struct inet_peer *)&peer_fake_node) @@ -102,6 +108,50 @@ int inet_peer_threshold __read_mostly = 65536 + 128; /* start to throw entries m int inet_peer_minttl __read_mostly = 120 * HZ; /* TTL under high load: 120 sec */ int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min */ +static void inetpeer_gc_worker(struct work_struct *work) +{ + struct inet_peer *p, *n; + LIST_HEAD(list); + + spin_lock_bh(&gc_lock); + list_replace_init(&gc_list, &list); + spin_unlock_bh(&gc_lock); + + if (list_empty(&list)) + return; + + list_for_each_entry_safe(p, n, &list, gc_list) { + + if(need_resched()) + cond_resched(); + + if (p->avl_left != peer_avl_empty) { + list_add_tail(&p->avl_left->gc_list, &list); + p->avl_left = peer_avl_empty; + } + + if (p->avl_right != peer_avl_empty) { + list_add_tail(&p->avl_right->gc_list, &list); + p->avl_right = peer_avl_empty; + } + + n = list_entry(p->gc_list.next, struct inet_peer, gc_list); + + if 
From ac3f48de09d8f4b73397047e413fadff7f65cfa7 Mon Sep 17 00:00:00 2001
From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Tue, 6 Mar 2012 21:21:10 +0000
Subject: route: Remove redirect_genid

As we invalidate the inetpeer tree along with the routing cache now,
we don't need a genid to reset the redirect handling when the routing
cache is flushed.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inetpeer.h |  1 -
 net/ipv4/inetpeer.c    |  1 -
 net/ipv4/route.c       | 11 ++---------
 3 files changed, 2 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h
index ff04a33acf00..b94765e38e80 100644
--- a/include/net/inetpeer.h
+++ b/include/net/inetpeer.h
@@ -35,7 +35,6 @@ struct inet_peer {
 	u32			metrics[RTAX_MAX];
 	u32			rate_tokens;	/* rate limiting for ICMP */
-	int			redirect_genid;
 	unsigned long		rate_last;
 	unsigned long		pmtu_expires;
 	u32			pmtu_orig;
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index deea2e96b7f2..d4d61b694fab 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -498,7 +498,6 @@ relookup:
 		p->rate_last = 0;
 		p->pmtu_expires = 0;
 		p->pmtu_orig = 0;
-		p->redirect_genid = 0;
 		memset(&p->redirect_learned, 0, sizeof(p->redirect_learned));
 		INIT_LIST_HEAD(&p->gc_list);

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 23ce0c1287ab..019774796174 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -132,7 +132,6 @@ static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
 static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
 static int ip_rt_min_advmss __read_mostly	= 256;
 static int rt_chain_length_max __read_mostly	= 20;
-static int redirect_genid;

 static struct delayed_work expires_work;
 static unsigned long expires_ljiffies;
@@ -937,7 +936,6 @@ static void rt_cache_invalidate(struct net *net)

 	get_random_bytes(&shuffle, sizeof(shuffle));
 	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
-	redirect_genid++;
 	inetpeer_invalidate_tree(AF_INET);
 }

@@ -1486,10 +1484,8 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,

 				peer = rt->peer;
 				if (peer) {
-					if (peer->redirect_learned.a4 != new_gw ||
-					    peer->redirect_genid != redirect_genid) {
+					if (peer->redirect_learned.a4 != new_gw) {
 						peer->redirect_learned.a4 = new_gw;
-						peer->redirect_genid = redirect_genid;
 						atomic_inc(&__rt_peer_genid);
 					}
 					check_peer_redir(&rt->dst, peer);
@@ -1794,8 +1790,6 @@ static void ipv4_validate_peer(struct rtable *rt)
 		if (peer) {
 			check_peer_pmtu(&rt->dst, peer);

-			if (peer->redirect_genid != redirect_genid)
-				peer->redirect_learned.a4 = 0;
 			if (peer->redirect_learned.a4 &&
 			    peer->redirect_learned.a4 != rt->rt_gateway)
 				check_peer_redir(&rt->dst, peer);
@@ -1959,8 +1953,7 @@ static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
 		dst_init_metrics(&rt->dst, peer->metrics, false);

 		check_peer_pmtu(&rt->dst, peer);
-		if (peer->redirect_genid != redirect_genid)
-			peer->redirect_learned.a4 = 0;
+
 		if (peer->redirect_learned.a4 &&
 		    peer->redirect_learned.a4 != rt->rt_gateway) {
 			rt->rt_gateway = peer->redirect_learned.a4;
-- cgit v1.2.3
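redirect_genid was lazy invalidation: bump a global generation number on flush, and have readers treat any peer whose recorded generation differs as stale. Since the previous patch now drops the whole inetpeer tree on flush, that per-lookup check can go. A toy rendition of the removed pattern, with illustrative names only, not the kernel's actual types:

/*
 * A sketch, not kernel code: generation-based lazy invalidation of
 * cached per-peer redirect data.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t redirect_gen;	/* bumped on every cache flush */

struct peer {
	uint32_t gen;		/* generation the gateway was learned in */
	uint32_t learned_gw;
};

static uint32_t peer_gateway(struct peer *p)
{
	if (p->gen != redirect_gen) {	/* stale since last flush */
		p->learned_gw = 0;
		p->gen = redirect_gen;
	}
	return p->learned_gw;
}

int main(void)
{
	struct peer p = { .gen = 0, .learned_gw = 0x0a000001 };

	redirect_gen++;		/* analogue of rt_cache_invalidate() */
	printf("gateway after flush: 0x%x\n", peer_gateway(&p));
	return 0;
}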
From e34828298ec542294f4b798606ee73e462d322f5 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@opensource.se>
Date: Wed, 29 Feb 2012 22:16:13 +0900
Subject: sh: introduce sh_clk_ops in parallel with clk_ops

Introduce sh_clk_ops in parallel with clk_ops.

Signed-off-by: Magnus Damm <damm@opensource.se>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 include/linux/sh_clk.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/sh_clk.h b/include/linux/sh_clk.h
index 54341d811685..706c005cd4ba 100644
--- a/include/linux/sh_clk.h
+++ b/include/linux/sh_clk.h
@@ -18,6 +18,8 @@ struct clk_mapping {
 	struct kref ref;
 };

+#define sh_clk_ops clk_ops
+
 struct clk_ops {
#ifdef CONFIG_SH_CLK_CPG_LEGACY
 	void (*init)(struct clk *clk);
-- cgit v1.2.3
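The one-line #define is the standard in-tree trick for renaming a widely used type without a flag-day: while the alias is in place, both spellings name the same struct, so call sites can be converted one patch at a time. A tiny standalone illustration (assumed names; not the real <linux/sh_clk.h>):

/* A sketch, not kernel code: a transition alias during a type rename. */
#include <stdio.h>

#define sh_clk_ops clk_ops	/* temporary; removed by the next patch */

struct clk_ops {
	const char *name;
};

int main(void)
{
	struct sh_clk_ops ops = { "cpg" };	/* new spelling */
	struct clk_ops *p = &ops;		/* old spelling, same type */

	printf("%s\n", p->name);
	return 0;
}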
From 84c36ffd7c580e1a63d7284e318670b082260118 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@opensource.se>
Date: Wed, 29 Feb 2012 22:18:19 +0900
Subject: sh: remove clk_ops

Now that all clk_ops users have been renamed, it is safe to rename
clk_ops to sh_clk_ops.

Signed-off-by: Magnus Damm <damm@opensource.se>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 include/linux/sh_clk.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/sh_clk.h b/include/linux/sh_clk.h
index 706c005cd4ba..0a9d8f2ac519 100644
--- a/include/linux/sh_clk.h
+++ b/include/linux/sh_clk.h
@@ -18,9 +18,8 @@ struct clk_mapping {
 	struct kref ref;
 };

-#define sh_clk_ops clk_ops

-struct clk_ops {
+struct sh_clk_ops {
#ifdef CONFIG_SH_CLK_CPG_LEGACY
 	void (*init)(struct clk *clk);
#endif
@@ -39,7 +38,7 @@ struct clk {
 	unsigned short		parent_num;	/* choose between */
 	unsigned char		src_shift;	/* source clock field in the */
 	unsigned char		src_width;	/* configuration register */
-	struct clk_ops		*ops;
+	struct sh_clk_ops	*ops;

 	struct list_head	children;
 	struct list_head	sibling;	/* node for children */
-- cgit v1.2.3
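With the alias dropped, sh_clk_ops is the only spelling left, and every clk carries a pointer to its ops table. A self-contained imitation of the post-rename shape, assuming nothing beyond the members visible in the hunks above:

/* A sketch, not kernel code: the post-rename ops-table layout. */
#include <stdio.h>

struct clk;			/* forward declaration */

struct sh_clk_ops {
	void (*init)(struct clk *clk);
};

struct clk {
	struct sh_clk_ops *ops;
};

static void cpg_init(struct clk *clk)
{
	(void)clk;
	puts("cpg clock initialized");
}

static struct sh_clk_ops cpg_clk_ops = { .init = cpg_init };

int main(void)
{
	struct clk c = { .ops = &cpg_clk_ops };

	if (c.ops->init)
		c.ops->init(&c);
	return 0;
}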