From f339b9dc1f03591761d5d930800db24bc0eda1e1 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 31 May 2011 10:49:20 +0200
Subject: sched: Fix schedstat.nr_wakeups_migrate

While looking over the code I found that with the ttwu rework the
nr_wakeups_migrate test broke since we now switch cpus prior to
calling ttwu_stat(), hence the test is always true.

Cure this by passing the migration state in wake_flags. Also move the
whole test under CONFIG_SMP, its hard to migrate tasks on UP :-)

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/n/tip-pwwxl7gdqs5676f1d4cx6pj7@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8da84b7bc1b8..483c1ed5bc4d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1063,6 +1063,7 @@ struct sched_domain;
  */
 #define WF_SYNC		0x01		/* waker goes to sleep after wakup */
 #define WF_FORK		0x02		/* child wakeup after fork */
+#define WF_MIGRATED	0x04		/* internal use, task got migrated */
 
 #define ENQUEUE_WAKEUP		1
 #define ENQUEUE_HEAD		2
-- 
cgit v1.2.3


From 3a43e05f4d0600e906fa09f4a65d749288c44592 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
Date: Tue, 31 May 2011 08:56:11 +0200
Subject: irq: Handle spurios irq detection for threaded irqs

The detection of spurios interrupts is currently limited to first level
handler. In force-threaded mode we never notice if the threaded irq does
not feel responsible.
This patch catches the return value of the threaded handler and forwards
it to the spurious detector. If the primary handler returns only
IRQ_WAKE_THREAD then the spourious detector ignores it because it gets
called again from the threaded handler.

[ tglx: Report the erroneous return value early and bail out ]

Signed-off-by: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
Link: http://lkml.kernel.org/r/1306824972-27067-2-git-send-email-sebastian@breakpoint.cc
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irqreturn.h |  6 +++---
 kernel/irq/handle.c       |  6 ------
 kernel/irq/manage.c       | 24 ++++++++++++++++++------
 kernel/irq/spurious.c     | 22 ++++++++++++++++++----
 4 files changed, 39 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irqreturn.h b/include/linux/irqreturn.h
index 819acaaac3f5..714ba08dc092 100644
--- a/include/linux/irqreturn.h
+++ b/include/linux/irqreturn.h
@@ -8,9 +8,9 @@
  * @IRQ_WAKE_THREAD	handler requests to wake the handler thread
  */
 enum irqreturn {
-	IRQ_NONE,
-	IRQ_HANDLED,
-	IRQ_WAKE_THREAD,
+	IRQ_NONE		= (0 << 0),
+	IRQ_HANDLED		= (1 << 0),
+	IRQ_WAKE_THREAD		= (1 << 1),
 };
 
 typedef enum irqreturn irqreturn_t;
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 90cb55f6d7eb..470d08c82bbe 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -132,12 +132,6 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
 
 		switch (res) {
 		case IRQ_WAKE_THREAD:
-			/*
-			 * Set result to handled so the spurious check
-			 * does not trigger.
-			 */
-			res = IRQ_HANDLED;
-
 			/*
 			 * Catch drivers which return WAKE_THREAD but
 			 * did not set up a thread function
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index f7ce0021e1c4..d64bafb1afd0 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -723,13 +723,16 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
  * context. So we need to disable bh here to avoid deadlocks and other
  * side effects.
  */
-static void
+static irqreturn_t
 irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
 {
+	irqreturn_t ret;
+
 	local_bh_disable();
-	action->thread_fn(action->irq, action->dev_id);
+	ret = action->thread_fn(action->irq, action->dev_id);
 	irq_finalize_oneshot(desc, action, false);
 	local_bh_enable();
+	return ret;
 }
 
 /*
@@ -737,10 +740,14 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
  * preemtible - many of them need to sleep and wait for slow busses to
  * complete.
  */
-static void irq_thread_fn(struct irq_desc *desc, struct irqaction *action)
+static irqreturn_t irq_thread_fn(struct irq_desc *desc,
+		struct irqaction *action)
 {
-	action->thread_fn(action->irq, action->dev_id);
+	irqreturn_t ret;
+
+	ret = action->thread_fn(action->irq, action->dev_id);
 	irq_finalize_oneshot(desc, action, false);
+	return ret;
 }
 
 /*
@@ -753,7 +760,8 @@ static int irq_thread(void *data)
 	};
 	struct irqaction *action = data;
 	struct irq_desc *desc = irq_to_desc(action->irq);
-	void (*handler_fn)(struct irq_desc *desc, struct irqaction *action);
+	irqreturn_t (*handler_fn)(struct irq_desc *desc,
+			struct irqaction *action);
 	int wake;
 
 	if (force_irqthreads & test_bit(IRQTF_FORCED_THREAD,
@@ -783,8 +791,12 @@ static int irq_thread(void *data)
 			desc->istate |= IRQS_PENDING;
 			raw_spin_unlock_irq(&desc->lock);
 		} else {
+			irqreturn_t action_ret;
+
 			raw_spin_unlock_irq(&desc->lock);
-			handler_fn(desc, action);
+			action_ret = handler_fn(desc, action);
+			if (!noirqdebug)
+				note_interrupt(action->irq, desc, action_ret);
 		}
 
 		wake = atomic_dec_and_test(&desc->threads_active);
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index c9a78ba30b6f..aa57d5da18c1 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -167,6 +167,13 @@ out:
 		  jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
 }
 
+static inline int bad_action_ret(irqreturn_t action_ret)
+{
+	if (likely(action_ret <= (IRQ_HANDLED | IRQ_WAKE_THREAD)))
+		return 0;
+	return 1;
+}
+
 /*
  * If 99,900 of the previous 100,000 interrupts have not been handled
  * then assume that the IRQ is stuck in some manner. Drop a diagnostic
@@ -182,7 +189,7 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc,
 	struct irqaction *action;
 	unsigned long flags;
 
-	if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) {
+	if (bad_action_ret(action_ret)) {
 		printk(KERN_ERR "irq event %d: bogus return value %x\n",
 				irq, action_ret);
 	} else {
@@ -263,7 +270,16 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
 	if (desc->istate & IRQS_POLL_INPROGRESS)
 		return;
 
-	if (unlikely(action_ret != IRQ_HANDLED)) {
+	/* we get here again via the threaded handler */
+	if (action_ret == IRQ_WAKE_THREAD)
+		return;
+
+	if (bad_action_ret(action_ret)) {
+		report_bad_irq(irq, desc, action_ret);
+		return;
+	}
+
+	if (unlikely(action_ret == IRQ_NONE)) {
 		/*
 		 * If we are seeing only the odd spurious IRQ caused by
 		 * bus asynchronicity then don't eventually trigger an error,
@@ -275,8 +291,6 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
 		else
 			desc->irqs_unhandled++;
 		desc->last_unhandled = jiffies;
-		if (unlikely(action_ret != IRQ_NONE))
-			report_bad_irq(irq, desc, action_ret);
 	}
 
 	if (unlikely(try_misrouted_irq(irq, desc, action_ret))) {
-- 
cgit v1.2.3


From d4d84fef6d0366b585b7de13527a0faeca84d9ce Mon Sep 17 00:00:00 2001
From: Chris Metcalf <cmetcalf@tilera.com>
Date: Thu, 2 Jun 2011 10:19:41 -0400
Subject: slub: always align cpu_slab to honor cmpxchg_double requirement

On an architecture without CMPXCHG_LOCAL but with DEBUG_VM enabled,
the VM_BUG_ON() in __pcpu_double_call_return_bool() will cause an early
panic during boot unless we always align cpu_slab properly.

In principle we could remove the alignment-testing VM_BUG_ON() for
architectures that don't have CMPXCHG_LOCAL, but leaving it in means
that new code will tend not to break x86 even if it is introduced
on another platform, and it's low cost to require alignment.

Acked-by: David Rientjes <rientjes@google.com>
Acked-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 include/linux/percpu.h |  3 +++
 mm/slub.c              | 12 ++++--------
 2 files changed, 7 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 8b97308e65df..9ca008f0c542 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -259,6 +259,9 @@ extern void __bad_size_call_parameter(void);
  * Special handling for cmpxchg_double.  cmpxchg_double is passed two
  * percpu variables.  The first has to be aligned to a double word
  * boundary and the second has to follow directly thereafter.
+ * We enforce this on all architectures even if they don't support
+ * a double cmpxchg instruction, since it's a cheap requirement, and it
+ * avoids breaking the requirement for architectures with the instruction.
  */
 #define __pcpu_double_call_return_bool(stem, pcp1, pcp2, ...)		\
 ({									\
diff --git a/mm/slub.c b/mm/slub.c
index 7be0223531b0..35f351f26193 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2320,16 +2320,12 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
 	BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
 			SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu));
 
-#ifdef CONFIG_CMPXCHG_LOCAL
 	/*
-	 * Must align to double word boundary for the double cmpxchg instructions
-	 * to work.
+	 * Must align to double word boundary for the double cmpxchg
+	 * instructions to work; see __pcpu_double_call_return_bool().
 	 */
-	s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu), 2 * sizeof(void *));
-#else
-	/* Regular alignment is sufficient */
-	s->cpu_slab = alloc_percpu(struct kmem_cache_cpu);
-#endif
+	s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu),
+				     2 * sizeof(void *));
 
 	if (!s->cpu_slab)
 		return 0;
-- 
cgit v1.2.3


From 9e1f1de02c2275d7172e18dc4e7c2065777611bf Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 3 Jun 2011 18:24:58 -0400
Subject: more conservative S_NOSEC handling

Caching "we have already removed suid/caps" was overenthusiastic as merged.
On network filesystems we might have had suid/caps set on another client,
silently picked by this client on revalidate, all of that *without* clearing
the S_NOSEC flag.

AFAICS, the only reasonably sane way to deal with that is
	* new superblock flag; unless set, S_NOSEC is not going to be set.
	* local block filesystems set it in their ->mount() (more accurately,
mount_bdev() does, so does btrfs ->mount(), users of mount_bdev() other than
local block ones clear it)
	* if any network filesystem (or a cluster one) wants to use S_NOSEC,
it'll need to set MS_NOSEC in sb->s_flags *AND* take care to clear S_NOSEC when
inode attribute changes are picked from other clients.

It's not an earth-shattering hole (anybody that can set suid on another client
will almost certainly be able to write to the file before doing that anyway),
but it's a bug that needs fixing.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/btrfs/super.c   | 2 +-
 fs/fuse/inode.c    | 2 ++
 fs/ocfs2/super.c   | 2 +-
 fs/super.c         | 2 +-
 include/linux/fs.h | 3 ++-
 mm/filemap.c       | 2 +-
 6 files changed, 8 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 9b2e7e5bc3ef..d158b672a2d2 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -819,7 +819,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 	} else {
 		char b[BDEVNAME_SIZE];
 
-		s->s_flags = flags;
+		s->s_flags = flags | MS_NOSEC;
 		strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
 		error = btrfs_fill_super(s, fs_devices, data,
 					 flags & MS_SILENT ? 1 : 0);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index cc6ec4b2f0ff..38f84cd48b67 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -921,6 +921,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	if (sb->s_flags & MS_MANDLOCK)
 		goto err;
 
+	sb->s_flags &= ~MS_NOSEC;
+
 	if (!parse_fuse_opt((char *) data, &d, is_bdev))
 		goto err;
 
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index cdbaf5e97308..56f61027236b 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1072,7 +1072,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 
 	sb->s_magic = OCFS2_SUPER_MAGIC;
 
-	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+	sb->s_flags = (sb->s_flags & ~(MS_POSIXACL | MS_NOSEC)) |
 		((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
 
 	/* Hard readonly mode only if: bdev_read_only, MS_RDONLY,
diff --git a/fs/super.c b/fs/super.c
index c75593953c52..ab3d672db0de 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -822,7 +822,7 @@ struct dentry *mount_bdev(struct file_system_type *fs_type,
 	} else {
 		char b[BDEVNAME_SIZE];
 
-		s->s_flags = flags;
+		s->s_flags = flags | MS_NOSEC;
 		s->s_mode = mode;
 		strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
 		sb_set_blocksize(s, block_size(bdev));
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c55d6b7cd5d6..646a1836152a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -208,6 +208,7 @@ struct inodes_stat_t {
 #define MS_KERNMOUNT	(1<<22) /* this is a kern_mount call */
 #define MS_I_VERSION	(1<<23) /* Update inode I_version field */
 #define MS_STRICTATIME	(1<<24) /* Always perform atime updates */
+#define MS_NOSEC	(1<<28)
 #define MS_BORN		(1<<29)
 #define MS_ACTIVE	(1<<30)
 #define MS_NOUSER	(1<<31)
@@ -2591,7 +2592,7 @@ static inline int is_sxid(mode_t mode)
 
 static inline void inode_has_no_xattr(struct inode *inode)
 {
-	if (!is_sxid(inode->i_mode))
+	if (!is_sxid(inode->i_mode) && (inode->i_sb->s_flags & MS_NOSEC))
 		inode->i_flags |= S_NOSEC;
 }
 
diff --git a/mm/filemap.c b/mm/filemap.c
index d7b10578a64b..a8251a8d3457 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2000,7 +2000,7 @@ int file_remove_suid(struct file *file)
 		error = security_inode_killpriv(dentry);
 	if (!error && killsuid)
 		error = __remove_suid(dentry, killsuid);
-	if (!error)
+	if (!error && (inode->i_sb->s_flags & MS_NOSEC))
 		inode->i_flags |= S_NOSEC;
 
 	return error;
-- 
cgit v1.2.3


From d7ebe75b065a7c2d58ffc12f9d2e00d5ea4e71eb Mon Sep 17 00:00:00 2001
From: Vince Weaver <vweaver1@eecs.utk.edu>
Date: Fri, 3 Jun 2011 17:59:51 -0400
Subject: perf: Fix comments in include/linux/perf_event.h

Fix include/linux/perf_event.h comments to be consistent with
the actual #define names. This is trivial, but it can be a bit
confusing when first  reading through the file.

Signed-off-by: Vince Weaver <vweaver1@eecs.utk.edu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: paulus@samba.org
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Link: http://lkml.kernel.org/r/alpine.DEB.2.00.1106031757090.29381@cl320.eecs.utk.edu
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_event.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 3412684ce5d5..e0786e35f247 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -137,14 +137,14 @@ enum perf_event_sample_format {
  *
  * struct read_format {
  *	{ u64		value;
- *	  { u64		time_enabled; } && PERF_FORMAT_ENABLED
- *	  { u64		time_running; } && PERF_FORMAT_RUNNING
+ *	  { u64		time_enabled; } && PERF_FORMAT_TOTAL_TIME_ENABLED
+ *	  { u64		time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING
  *	  { u64		id;           } && PERF_FORMAT_ID
  *	} && !PERF_FORMAT_GROUP
  *
  *	{ u64		nr;
- *	  { u64		time_enabled; } && PERF_FORMAT_ENABLED
- *	  { u64		time_running; } && PERF_FORMAT_RUNNING
+ *	  { u64		time_enabled; } && PERF_FORMAT_TOTAL_TIME_ENABLED
+ *	  { u64		time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING
  *	  { u64		value;
  *	    { u64	id;           } && PERF_FORMAT_ID
  *	  }		cntr[nr];
-- 
cgit v1.2.3


From fb04883371f2cb7867d24783e7d590036dc9b548 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 19 May 2011 15:44:27 +0200
Subject: netfilter: add more values to enum ip_conntrack_info
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Following error is raised (and other similar ones) :

net/ipv4/netfilter/nf_nat_standalone.c: In function ‘nf_nat_fn’:
net/ipv4/netfilter/nf_nat_standalone.c:119:2: warning: case value ‘4’
not in enumerated type ‘enum ip_conntrack_info’

gcc barfs on adding two enum values and getting a not enumerated
result :

case IP_CT_RELATED+IP_CT_IS_REPLY:

Add missing enum values

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: David Miller <davem@davemloft.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/nf_conntrack_common.h  |  3 +++
 net/ipv4/netfilter/ipt_CLUSTERIP.c             |  6 +++---
 net/ipv4/netfilter/ipt_MASQUERADE.c            |  2 +-
 net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c |  2 +-
 net/ipv4/netfilter/nf_nat_core.c               |  2 +-
 net/ipv4/netfilter/nf_nat_rule.c               |  2 +-
 net/ipv4/netfilter/nf_nat_standalone.c         |  4 ++--
 net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c |  2 +-
 net/netfilter/nf_conntrack_core.c              |  4 ++--
 net/netfilter/nf_conntrack_ftp.c               |  2 +-
 net/netfilter/nf_conntrack_h323_main.c         | 10 ++++------
 net/netfilter/nf_conntrack_irc.c               |  3 +--
 net/netfilter/nf_conntrack_pptp.c              |  3 +--
 net/netfilter/nf_conntrack_sane.c              |  2 +-
 net/netfilter/nf_conntrack_sip.c               |  2 +-
 net/netfilter/xt_socket.c                      |  4 ++--
 16 files changed, 26 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/nf_conntrack_common.h b/include/linux/netfilter/nf_conntrack_common.h
index 50cdc2559a5a..0d3dd66322ec 100644
--- a/include/linux/netfilter/nf_conntrack_common.h
+++ b/include/linux/netfilter/nf_conntrack_common.h
@@ -18,6 +18,9 @@ enum ip_conntrack_info {
 	/* >= this indicates reply direction */
 	IP_CT_IS_REPLY,
 
+	IP_CT_ESTABLISHED_REPLY = IP_CT_ESTABLISHED + IP_CT_IS_REPLY,
+	IP_CT_RELATED_REPLY = IP_CT_RELATED + IP_CT_IS_REPLY,
+	IP_CT_NEW_REPLY = IP_CT_NEW + IP_CT_IS_REPLY,	
 	/* Number of distinct IP_CT types (no NEW in reply dirn). */
 	IP_CT_NUMBER = IP_CT_IS_REPLY * 2 - 1
 };
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index d609ac3cb9a4..5c9e97c79017 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -307,7 +307,7 @@ clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par)
 	 * error messages (RELATED) and information requests (see below) */
 	if (ip_hdr(skb)->protocol == IPPROTO_ICMP &&
 	    (ctinfo == IP_CT_RELATED ||
-	     ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY))
+	     ctinfo == IP_CT_RELATED_REPLY))
 		return XT_CONTINUE;
 
 	/* ip_conntrack_icmp guarantees us that we only have ICMP_ECHO,
@@ -321,12 +321,12 @@ clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par)
 			ct->mark = hash;
 			break;
 		case IP_CT_RELATED:
-		case IP_CT_RELATED+IP_CT_IS_REPLY:
+		case IP_CT_RELATED_REPLY:
 			/* FIXME: we don't handle expectations at the
 			 * moment.  they can arrive on a different node than
 			 * the master connection (e.g. FTP passive mode) */
 		case IP_CT_ESTABLISHED:
-		case IP_CT_ESTABLISHED+IP_CT_IS_REPLY:
+		case IP_CT_ESTABLISHED_REPLY:
 			break;
 		default:
 			break;
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index d2ed9dc74ebc..9931152a78b5 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -60,7 +60,7 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
 	nat = nfct_nat(ct);
 
 	NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
-			    ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY));
+			    ctinfo == IP_CT_RELATED_REPLY));
 
 	/* Source address is 0.0.0.0 - locally generated packet that is
 	 * probably not supposed to be masqueraded.
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 5a03c02af999..db10075dd88e 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -101,7 +101,7 @@ static unsigned int ipv4_confirm(unsigned int hooknum,
 
 	/* This is where we call the helper: as the packet goes out. */
 	ct = nf_ct_get(skb, &ctinfo);
-	if (!ct || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)
+	if (!ct || ctinfo == IP_CT_RELATED_REPLY)
 		goto out;
 
 	help = nfct_help(ct);
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index 9c71b2755ce3..3346de5d94d0 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -433,7 +433,7 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct,
 
 	/* Must be RELATED */
 	NF_CT_ASSERT(skb->nfctinfo == IP_CT_RELATED ||
-		     skb->nfctinfo == IP_CT_RELATED+IP_CT_IS_REPLY);
+		     skb->nfctinfo == IP_CT_RELATED_REPLY);
 
 	/* Redirects on non-null nats must be dropped, else they'll
 	   start talking to each other without our translation, and be
diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c
index 21c30426480b..733c9abc1cbd 100644
--- a/net/ipv4/netfilter/nf_nat_rule.c
+++ b/net/ipv4/netfilter/nf_nat_rule.c
@@ -53,7 +53,7 @@ ipt_snat_target(struct sk_buff *skb, const struct xt_action_param *par)
 
 	/* Connection must be valid and new. */
 	NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
-			    ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY));
+			    ctinfo == IP_CT_RELATED_REPLY));
 	NF_CT_ASSERT(par->out != NULL);
 
 	return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_SRC);
diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/nf_nat_standalone.c
index 7317bdf1d457..483b76d042da 100644
--- a/net/ipv4/netfilter/nf_nat_standalone.c
+++ b/net/ipv4/netfilter/nf_nat_standalone.c
@@ -116,7 +116,7 @@ nf_nat_fn(unsigned int hooknum,
 
 	switch (ctinfo) {
 	case IP_CT_RELATED:
-	case IP_CT_RELATED+IP_CT_IS_REPLY:
+	case IP_CT_RELATED_REPLY:
 		if (ip_hdr(skb)->protocol == IPPROTO_ICMP) {
 			if (!nf_nat_icmp_reply_translation(ct, ctinfo,
 							   hooknum, skb))
@@ -144,7 +144,7 @@ nf_nat_fn(unsigned int hooknum,
 	default:
 		/* ESTABLISHED */
 		NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED ||
-			     ctinfo == (IP_CT_ESTABLISHED+IP_CT_IS_REPLY));
+			     ctinfo == IP_CT_ESTABLISHED_REPLY);
 	}
 
 	return nf_nat_packet(ct, ctinfo, hooknum, skb);
diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
index c8af58b22562..4111050a9fc5 100644
--- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
@@ -160,7 +160,7 @@ static unsigned int ipv6_confirm(unsigned int hooknum,
 
 	/* This is where we call the helper: as the packet goes out. */
 	ct = nf_ct_get(skb, &ctinfo);
-	if (!ct || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)
+	if (!ct || ctinfo == IP_CT_RELATED_REPLY)
 		goto out;
 
 	help = nfct_help(ct);
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 2e1c11f78419..0bd568929403 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -850,7 +850,7 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
 
 	/* It exists; we have (non-exclusive) reference. */
 	if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
-		*ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
+		*ctinfo = IP_CT_ESTABLISHED_REPLY;
 		/* Please set reply bit if this packet OK */
 		*set_reply = 1;
 	} else {
@@ -1143,7 +1143,7 @@ static void nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
 	/* This ICMP is in reverse direction to the packet which caused it */
 	ct = nf_ct_get(skb, &ctinfo);
 	if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
-		ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
+		ctinfo = IP_CT_RELATED_REPLY;
 	else
 		ctinfo = IP_CT_RELATED;
 
diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c
index e17cb7c7dd8f..6f5801eac999 100644
--- a/net/netfilter/nf_conntrack_ftp.c
+++ b/net/netfilter/nf_conntrack_ftp.c
@@ -368,7 +368,7 @@ static int help(struct sk_buff *skb,
 
 	/* Until there's been traffic both ways, don't look in packets. */
 	if (ctinfo != IP_CT_ESTABLISHED &&
-	    ctinfo != IP_CT_ESTABLISHED + IP_CT_IS_REPLY) {
+	    ctinfo != IP_CT_ESTABLISHED_REPLY) {
 		pr_debug("ftp: Conntrackinfo = %u\n", ctinfo);
 		return NF_ACCEPT;
 	}
diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c
index 18b2ce5c8ced..f03c2d4539f6 100644
--- a/net/netfilter/nf_conntrack_h323_main.c
+++ b/net/netfilter/nf_conntrack_h323_main.c
@@ -571,10 +571,9 @@ static int h245_help(struct sk_buff *skb, unsigned int protoff,
 	int ret;
 
 	/* Until there's been traffic both ways, don't look in packets. */
-	if (ctinfo != IP_CT_ESTABLISHED &&
-	    ctinfo != IP_CT_ESTABLISHED + IP_CT_IS_REPLY) {
+	if (ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY)
 		return NF_ACCEPT;
-	}
+
 	pr_debug("nf_ct_h245: skblen = %u\n", skb->len);
 
 	spin_lock_bh(&nf_h323_lock);
@@ -1125,10 +1124,9 @@ static int q931_help(struct sk_buff *skb, unsigned int protoff,
 	int ret;
 
 	/* Until there's been traffic both ways, don't look in packets. */
-	if (ctinfo != IP_CT_ESTABLISHED &&
-	    ctinfo != IP_CT_ESTABLISHED + IP_CT_IS_REPLY) {
+	if (ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY)
 		return NF_ACCEPT;
-	}
+
 	pr_debug("nf_ct_q931: skblen = %u\n", skb->len);
 
 	spin_lock_bh(&nf_h323_lock);
diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c
index b394aa318776..4f9390b98697 100644
--- a/net/netfilter/nf_conntrack_irc.c
+++ b/net/netfilter/nf_conntrack_irc.c
@@ -125,8 +125,7 @@ static int help(struct sk_buff *skb, unsigned int protoff,
 		return NF_ACCEPT;
 
 	/* Until there's been traffic both ways, don't look in packets. */
-	if (ctinfo != IP_CT_ESTABLISHED &&
-	    ctinfo != IP_CT_ESTABLISHED + IP_CT_IS_REPLY)
+	if (ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY)
 		return NF_ACCEPT;
 
 	/* Not a full tcp header? */
diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c
index 088944824e13..2fd4565144de 100644
--- a/net/netfilter/nf_conntrack_pptp.c
+++ b/net/netfilter/nf_conntrack_pptp.c
@@ -519,8 +519,7 @@ conntrack_pptp_help(struct sk_buff *skb, unsigned int protoff,
 	u_int16_t msg;
 
 	/* don't do any tracking before tcp handshake complete */
-	if (ctinfo != IP_CT_ESTABLISHED &&
-	    ctinfo != IP_CT_ESTABLISHED + IP_CT_IS_REPLY)
+	if (ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY)
 		return NF_ACCEPT;
 
 	nexthdr_off = protoff;
diff --git a/net/netfilter/nf_conntrack_sane.c b/net/netfilter/nf_conntrack_sane.c
index d9e27734b2a2..8501823b3f9b 100644
--- a/net/netfilter/nf_conntrack_sane.c
+++ b/net/netfilter/nf_conntrack_sane.c
@@ -78,7 +78,7 @@ static int help(struct sk_buff *skb,
 	ct_sane_info = &nfct_help(ct)->help.ct_sane_info;
 	/* Until there's been traffic both ways, don't look in packets. */
 	if (ctinfo != IP_CT_ESTABLISHED &&
-	    ctinfo != IP_CT_ESTABLISHED+IP_CT_IS_REPLY)
+	    ctinfo != IP_CT_ESTABLISHED_REPLY)
 		return NF_ACCEPT;
 
 	/* Not a full tcp header? */
diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index cb5a28581782..93faf6a3a637 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -1423,7 +1423,7 @@ static int sip_help_tcp(struct sk_buff *skb, unsigned int protoff,
 	typeof(nf_nat_sip_seq_adjust_hook) nf_nat_sip_seq_adjust;
 
 	if (ctinfo != IP_CT_ESTABLISHED &&
-	    ctinfo != IP_CT_ESTABLISHED + IP_CT_IS_REPLY)
+	    ctinfo != IP_CT_ESTABLISHED_REPLY)
 		return NF_ACCEPT;
 
 	/* No Data ? */
diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
index 9cc46356b577..fe39f7e913df 100644
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -143,9 +143,9 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
 	ct = nf_ct_get(skb, &ctinfo);
 	if (ct && !nf_ct_is_untracked(ct) &&
 	    ((iph->protocol != IPPROTO_ICMP &&
-	      ctinfo == IP_CT_IS_REPLY + IP_CT_ESTABLISHED) ||
+	      ctinfo == IP_CT_ESTABLISHED_REPLY) ||
 	     (iph->protocol == IPPROTO_ICMP &&
-	      ctinfo == IP_CT_IS_REPLY + IP_CT_RELATED)) &&
+	      ctinfo == IP_CT_RELATED_REPLY)) &&
 	    (ct->status & IPS_SRC_NAT_DONE)) {
 
 		daddr = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip;
-- 
cgit v1.2.3


From b084f598df36b62dfae83c10ed17f0b66b50f442 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Tue, 31 May 2011 12:24:58 -0400
Subject: nfsd: fix dependency of nfsd on auth_rpcgss

Commit b0b0c0a26e84 "nfsd: add proc file listing kernel's gss_krb5
enctypes" added an nunnecessary dependency of nfsd on the auth_rpcgss
module.

It's a little ad hoc, but since the only piece of information nfsd needs
from rpcsec_gss_krb5 is a single static string, one solution is just to
share it with an include file.

Cc: stable@kernel.org
Reported-by: Michael Guntsche <mike@it-loops.com>
Cc: Kevin Coffman <kwc@citi.umich.edu>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfsctl.c                         | 19 ++++++-------------
 include/linux/sunrpc/gss_krb5_enctypes.h |  4 ++++
 net/sunrpc/auth_gss/gss_krb5_mech.c      |  3 ++-
 3 files changed, 12 insertions(+), 14 deletions(-)
 create mode 100644 include/linux/sunrpc/gss_krb5_enctypes.h

(limited to 'include/linux')

diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 1f5eae40f34e..2b1449dd2f49 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -13,6 +13,7 @@
 #include <linux/lockd/lockd.h>
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/gss_api.h>
+#include <linux/sunrpc/gss_krb5_enctypes.h>
 
 #include "idmap.h"
 #include "nfsd.h"
@@ -189,18 +190,10 @@ static struct file_operations export_features_operations = {
 	.release	= single_release,
 };
 
-#ifdef CONFIG_SUNRPC_GSS
+#if defined(CONFIG_SUNRPC_GSS) || defined(CONFIG_SUNRPC_GSS_MODULE)
 static int supported_enctypes_show(struct seq_file *m, void *v)
 {
-	struct gss_api_mech *k5mech;
-
-	k5mech = gss_mech_get_by_name("krb5");
-	if (k5mech == NULL)
-		goto out;
-	if (k5mech->gm_upcall_enctypes != NULL)
-		seq_printf(m, k5mech->gm_upcall_enctypes);
-	gss_mech_put(k5mech);
-out:
+	seq_printf(m, KRB5_SUPPORTED_ENCTYPES);
 	return 0;
 }
 
@@ -215,7 +208,7 @@ static struct file_operations supported_enctypes_ops = {
 	.llseek		= seq_lseek,
 	.release	= single_release,
 };
-#endif /* CONFIG_SUNRPC_GSS */
+#endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */
 
 extern int nfsd_pool_stats_open(struct inode *inode, struct file *file);
 extern int nfsd_pool_stats_release(struct inode *inode, struct file *file);
@@ -1427,9 +1420,9 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
 		[NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR},
 		[NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO},
 		[NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO},
-#ifdef CONFIG_SUNRPC_GSS
+#if defined(CONFIG_SUNRPC_GSS) || defined(CONFIG_SUNRPC_GSS_MODULE)
 		[NFSD_SupportedEnctypes] = {"supported_krb5_enctypes", &supported_enctypes_ops, S_IRUGO},
-#endif /* CONFIG_SUNRPC_GSS */
+#endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */
 #ifdef CONFIG_NFSD_V4
 		[NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR},
 		[NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR},
diff --git a/include/linux/sunrpc/gss_krb5_enctypes.h b/include/linux/sunrpc/gss_krb5_enctypes.h
new file mode 100644
index 000000000000..ec6234eee89c
--- /dev/null
+++ b/include/linux/sunrpc/gss_krb5_enctypes.h
@@ -0,0 +1,4 @@
+/*
+ * Dumb way to share this static piece of information with nfsd
+ */
+#define KRB5_SUPPORTED_ENCTYPES "18,17,16,23,3,1,2"
diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c
index 0a9a2ec2e469..c3b75333b821 100644
--- a/net/sunrpc/auth_gss/gss_krb5_mech.c
+++ b/net/sunrpc/auth_gss/gss_krb5_mech.c
@@ -43,6 +43,7 @@
 #include <linux/sunrpc/gss_krb5.h>
 #include <linux/sunrpc/xdr.h>
 #include <linux/crypto.h>
+#include <linux/sunrpc/gss_krb5_enctypes.h>
 
 #ifdef RPC_DEBUG
 # define RPCDBG_FACILITY	RPCDBG_AUTH
@@ -750,7 +751,7 @@ static struct gss_api_mech gss_kerberos_mech = {
 	.gm_ops		= &gss_kerberos_ops,
 	.gm_pf_num	= ARRAY_SIZE(gss_kerberos_pfs),
 	.gm_pfs		= gss_kerberos_pfs,
-	.gm_upcall_enctypes = "18,17,16,23,3,1,2",
+	.gm_upcall_enctypes = KRB5_SUPPORTED_ENCTYPES,
 };
 
 static int __init init_kerberos_module(void)
-- 
cgit v1.2.3


From 5f98ecdbcef1920323d8777c0ba55dbd4335d3cf Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Sun, 5 Jun 2011 11:47:29 +0900
Subject: swiotlb: Export swioltb_nr_tbl and utilize it as appropiate.

By default the io_tlb_nslabs is set to zero, and gets set to
whatever value is passed in via swiotlb_init_with_tbl function.
The default value passed in is 64MB. However, if the user provides
the 'swiotlb=<nslabs>' the default value is ignored and
the value provided by the user is used... Except when the SWIOTLB
is used under Xen - there the default value of 64MB is used and
the Xen-SWIOTLB has no mechanism to get the 'io_tlb_nslabs' filled
out by setup_io_tlb_npages functions. This patch provides a function
for the Xen-SWIOTLB to call to see if the io_tlb_nslabs is set
and if so use that value.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 drivers/xen/swiotlb-xen.c | 12 +++++++++---
 include/linux/swiotlb.h   |  1 +
 lib/swiotlb.c             |  5 +++++
 3 files changed, 15 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
index 54469c3eeacd..5ac3f4843f5c 100644
--- a/drivers/xen/swiotlb-xen.c
+++ b/drivers/xen/swiotlb-xen.c
@@ -147,9 +147,15 @@ void __init xen_swiotlb_init(int verbose)
 {
 	unsigned long bytes;
 	int rc;
-
-	xen_io_tlb_nslabs = (64 * 1024 * 1024 >> IO_TLB_SHIFT);
-	xen_io_tlb_nslabs = ALIGN(xen_io_tlb_nslabs, IO_TLB_SEGSIZE);
+	unsigned long nr_tbl;
+
+	nr_tbl = swioltb_nr_tbl();
+	if (nr_tbl)
+		xen_io_tlb_nslabs = nr_tbl;
+	else {
+		xen_io_tlb_nslabs = (64 * 1024 * 1024 >> IO_TLB_SHIFT);
+		xen_io_tlb_nslabs = ALIGN(xen_io_tlb_nslabs, IO_TLB_SEGSIZE);
+	}
 
 	bytes = xen_io_tlb_nslabs << IO_TLB_SHIFT;
 
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 8c0e349f4a6c..445702c60d04 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -24,6 +24,7 @@ extern int swiotlb_force;
 
 extern void swiotlb_init(int verbose);
 extern void swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose);
+extern unsigned long swioltb_nr_tbl(void);
 
 /*
  * Enumeration for sync targets
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index 93ca08b8a451..99093b396145 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -110,6 +110,11 @@ setup_io_tlb_npages(char *str)
 __setup("swiotlb=", setup_io_tlb_npages);
 /* make io_tlb_overflow tunable too? */
 
+unsigned long swioltb_nr_tbl(void)
+{
+	return io_tlb_nslabs;
+}
+
 /* Note that this doesn't work with highmem page */
 static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev,
 				      volatile void *address)
-- 
cgit v1.2.3


From 3019de124b9f5b1526cb3668b74af14371e21795 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Mon, 6 Jun 2011 16:41:33 -0700
Subject: net: Rework netdev_drivername() to avoid warning.

This interface uses a temporary buffer, but for no real reason.
And now can generate warnings like:

net/sched/sch_generic.c: In function dev_watchdog
net/sched/sch_generic.c:254:10: warning: unused variable drivername

Just return driver->name directly or "".

Reported-by: Connor Hansen <cmdkhh@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  2 +-
 net/core/dev.c            | 16 +++++-----------
 net/sched/sch_generic.c   |  3 +--
 3 files changed, 7 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index ca333e79e10f..54b8b4d7b68f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2555,7 +2555,7 @@ extern void netdev_class_remove_file(struct class_attribute *class_attr);
 
 extern struct kobj_ns_type_operations net_ns_type_operations;
 
-extern char *netdev_drivername(const struct net_device *dev, char *buffer, int len);
+extern const char *netdev_drivername(const struct net_device *dev);
 
 extern void linkwatch_run_queue(void);
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 939307891e71..1af6cb27f67a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6264,29 +6264,23 @@ err_name:
 /**
  *	netdev_drivername - network driver for the device
  *	@dev: network device
- *	@buffer: buffer for resulting name
- *	@len: size of buffer
  *
  *	Determine network driver for device.
  */
-char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
+const char *netdev_drivername(const struct net_device *dev)
 {
 	const struct device_driver *driver;
 	const struct device *parent;
-
-	if (len <= 0 || !buffer)
-		return buffer;
-	buffer[0] = 0;
+	const char *empty = "";
 
 	parent = dev->dev.parent;
-
 	if (!parent)
-		return buffer;
+		return empty;
 
 	driver = parent->driver;
 	if (driver && driver->name)
-		strlcpy(buffer, driver->name, len);
-	return buffer;
+		return driver->name;
+	return empty;
 }
 
 static int __netdev_printk(const char *level, const struct net_device *dev,
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index b1721d71c27c..b4c680900d7a 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -251,9 +251,8 @@ static void dev_watchdog(unsigned long arg)
 			}
 
 			if (some_queue_timedout) {
-				char drivername[64];
 				WARN_ONCE(1, KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit queue %u timed out\n",
-				       dev->name, netdev_drivername(dev, drivername, 64), i);
+				       dev->name, netdev_drivername(dev), i);
 				dev->netdev_ops->ndo_tx_timeout(dev);
 			}
 			if (!mod_timer(&dev->watchdog_timer,
-- 
cgit v1.2.3


From 13fcb7bd322164c67926ffe272846d4860196dc6 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 6 Jun 2011 22:42:06 -0700
Subject: af_packet: prevent information leak

In 2.6.27, commit 393e52e33c6c2 (packet: deliver VLAN TCI to userspace)
added a small information leak.

Add padding field and make sure its zeroed before copy to user.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_packet.h | 2 ++
 net/packet/af_packet.c    | 2 ++
 2 files changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/if_packet.h b/include/linux/if_packet.h
index 6d66ce1791a9..7b318630139f 100644
--- a/include/linux/if_packet.h
+++ b/include/linux/if_packet.h
@@ -62,6 +62,7 @@ struct tpacket_auxdata {
 	__u16		tp_mac;
 	__u16		tp_net;
 	__u16		tp_vlan_tci;
+	__u16		tp_padding;
 };
 
 /* Rx ring - header status */
@@ -101,6 +102,7 @@ struct tpacket2_hdr {
 	__u32		tp_sec;
 	__u32		tp_nsec;
 	__u16		tp_vlan_tci;
+	__u16		tp_padding;
 };
 
 #define TPACKET2_HDRLEN		(TPACKET_ALIGN(sizeof(struct tpacket2_hdr)) + sizeof(struct sockaddr_ll))
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index ba248d93399a..c0c3cda19712 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -804,6 +804,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 		} else {
 			h.h2->tp_vlan_tci = 0;
 		}
+		h.h2->tp_padding = 0;
 		hdrlen = sizeof(*h.h2);
 		break;
 	default:
@@ -1736,6 +1737,7 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
 		} else {
 			aux.tp_vlan_tci = 0;
 		}
+		aux.tp_padding = 0;
 		put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
 	}
 
-- 
cgit v1.2.3


From 21c13a4f7bc185552c4b402b792c3bbb9aa69df0 Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Tue, 7 Jun 2011 11:35:52 -0400
Subject: usb-storage: redo incorrect reads

Some USB mass-storage devices have bugs that cause them not to handle
the first READ(10) command they receive correctly.  The Corsair
Padlock v2 returns completely bogus data for its first read (possibly
it returns the data in encrypted form even though the device is
supposed to be unlocked).  The Feiya SD/SDHC card reader fails to
complete the first READ(10) command after it is plugged in or after a
new card is inserted, returning a status code that indicates it thinks
the command was invalid, which prevents the kernel from retrying the
read.

Since the first read of a new device or a new medium is for the
partition sector, the kernel is unable to retrieve the device's
partition table.  Users have to manually issue an "hdparm -z" or
"blockdev --rereadpt" command before they can access the device.

This patch (as1470) works around the problem.  It adds a new quirk
flag, US_FL_INVALID_READ10, indicating that the first READ(10) should
always be retried immediately, as should any failing READ(10) commands
(provided the preceding READ(10) command succeeded, to avoid getting
stuck in a loop).  The patch also adds appropriate unusual_devs
entries containing the new flag.

Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Tested-by: Sven Geggus <sven-usbst@geggus.net>
Tested-by: Paul Hartman <paul.hartman+linux@gmail.com>
CC: Matthew Dharm <mdharm-usb@one-eyed-alien.net>
CC: <stable@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 Documentation/kernel-parameters.txt |  2 ++
 drivers/usb/storage/transport.c     | 29 +++++++++++++++++++++++++++++
 drivers/usb/storage/unusual_devs.h  | 19 +++++++++++++++++++
 drivers/usb/storage/usb.c           | 13 ++++++++++++-
 drivers/usb/storage/usb.h           |  2 ++
 include/linux/usb_usual.h           |  4 +++-
 6 files changed, 67 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index d9a203b058f1..fd248a318211 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2598,6 +2598,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 					unlock ejectable media);
 				m = MAX_SECTORS_64 (don't transfer more
 					than 64 sectors = 32 KB at a time);
+				n = INITIAL_READ10 (force a retry of the
+					initial READ(10) command);
 				o = CAPACITY_OK (accept the capacity
 					reported by the device);
 				r = IGNORE_RESIDUE (the device reports
diff --git a/drivers/usb/storage/transport.c b/drivers/usb/storage/transport.c
index 00418995d8e9..e8ae21b2d387 100644
--- a/drivers/usb/storage/transport.c
+++ b/drivers/usb/storage/transport.c
@@ -819,6 +819,35 @@ Retry_Sense:
 		}
 	}
 
+	/*
+	 * Some devices don't work or return incorrect data the first
+	 * time they get a READ(10) command, or for the first READ(10)
+	 * after a media change.  If the INITIAL_READ10 flag is set,
+	 * keep track of whether READ(10) commands succeed.  If the
+	 * previous one succeeded and this one failed, set the REDO_READ10
+	 * flag to force a retry.
+	 */
+	if (unlikely((us->fflags & US_FL_INITIAL_READ10) &&
+			srb->cmnd[0] == READ_10)) {
+		if (srb->result == SAM_STAT_GOOD) {
+			set_bit(US_FLIDX_READ10_WORKED, &us->dflags);
+		} else if (test_bit(US_FLIDX_READ10_WORKED, &us->dflags)) {
+			clear_bit(US_FLIDX_READ10_WORKED, &us->dflags);
+			set_bit(US_FLIDX_REDO_READ10, &us->dflags);
+		}
+
+		/*
+		 * Next, if the REDO_READ10 flag is set, return a result
+		 * code that will cause the SCSI core to retry the READ(10)
+		 * command immediately.
+		 */
+		if (test_bit(US_FLIDX_REDO_READ10, &us->dflags)) {
+			clear_bit(US_FLIDX_REDO_READ10, &us->dflags);
+			srb->result = DID_IMM_RETRY << 16;
+			srb->sense_buffer[0] = 0;
+		}
+	}
+
 	/* Did we transfer less than the minimum amount required? */
 	if ((srb->result == SAM_STAT_GOOD || srb->sense_buffer[2] == 0) &&
 			scsi_bufflen(srb) - scsi_get_resid(srb) < srb->underflow)
diff --git a/drivers/usb/storage/unusual_devs.h b/drivers/usb/storage/unusual_devs.h
index c1602b8c5594..ccff3483eebc 100644
--- a/drivers/usb/storage/unusual_devs.h
+++ b/drivers/usb/storage/unusual_devs.h
@@ -1114,6 +1114,16 @@ UNUSUAL_DEV( 0x090c, 0x1132, 0x0000, 0xffff,
 		USB_SC_DEVICE, USB_PR_DEVICE, NULL,
 		US_FL_FIX_CAPACITY ),
 
+/* Reported by Paul Hartman <paul.hartman+linux@gmail.com>
+ * This card reader returns "Illegal Request, Logical Block Address
+ * Out of Range" for the first READ(10) after a new card is inserted.
+ */
+UNUSUAL_DEV(  0x090c, 0x6000, 0x0100, 0x0100,
+		"Feiya",
+		"SD/SDHC Card Reader",
+		USB_SC_DEVICE, USB_PR_DEVICE, NULL,
+		US_FL_INITIAL_READ10 ),
+
 /* This Pentax still camera is not conformant
  * to the USB storage specification: -
  * - It does not like the INQUIRY command. So we must handle this command
@@ -1888,6 +1898,15 @@ UNUSUAL_DEV( 0x1908, 0x3335, 0x0200, 0x0200,
 		USB_SC_DEVICE, USB_PR_DEVICE, NULL,
 		US_FL_NO_READ_DISC_INFO ),
 
+/* Reported by Sven Geggus <sven-usbst@geggus.net>
+ * This encrypted pen drive returns bogus data for the initial READ(10).
+ */
+UNUSUAL_DEV(  0x1b1c, 0x1ab5, 0x0200, 0x0200,
+		"Corsair",
+		"Padlock v2",
+		USB_SC_DEVICE, USB_PR_DEVICE, NULL,
+		US_FL_INITIAL_READ10 ),
+
 /* Patch by Richard Sch�tz <r.schtz@t-online.de>
  * This external hard drive enclosure uses a JMicron chip which
  * needs the US_FL_IGNORE_RESIDUE flag to work properly. */
diff --git a/drivers/usb/storage/usb.c b/drivers/usb/storage/usb.c
index 5ee7ac42e08f..0ca095820f3e 100644
--- a/drivers/usb/storage/usb.c
+++ b/drivers/usb/storage/usb.c
@@ -440,7 +440,8 @@ static void adjust_quirks(struct us_data *us)
 			US_FL_NOT_LOCKABLE | US_FL_MAX_SECTORS_64 |
 			US_FL_CAPACITY_OK | US_FL_IGNORE_RESIDUE |
 			US_FL_SINGLE_LUN | US_FL_NO_WP_DETECT |
-			US_FL_NO_READ_DISC_INFO | US_FL_NO_READ_CAPACITY_16);
+			US_FL_NO_READ_DISC_INFO | US_FL_NO_READ_CAPACITY_16 |
+			US_FL_INITIAL_READ10);
 
 	p = quirks;
 	while (*p) {
@@ -490,6 +491,9 @@ static void adjust_quirks(struct us_data *us)
 		case 'm':
 			f |= US_FL_MAX_SECTORS_64;
 			break;
+		case 'n':
+			f |= US_FL_INITIAL_READ10;
+			break;
 		case 'o':
 			f |= US_FL_CAPACITY_OK;
 			break;
@@ -953,6 +957,13 @@ int usb_stor_probe2(struct us_data *us)
 	if (result)
 		goto BadDevice;
 
+	/*
+	 * If the device returns invalid data for the first READ(10)
+	 * command, indicate the command should be retried.
+	 */
+	if (us->fflags & US_FL_INITIAL_READ10)
+		set_bit(US_FLIDX_REDO_READ10, &us->dflags);
+
 	/* Acquire all the other resources and add the host */
 	result = usb_stor_acquire_resources(us);
 	if (result)
diff --git a/drivers/usb/storage/usb.h b/drivers/usb/storage/usb.h
index 89d3bfff98df..7b0f2113632e 100644
--- a/drivers/usb/storage/usb.h
+++ b/drivers/usb/storage/usb.h
@@ -73,6 +73,8 @@ struct us_unusual_dev {
 #define US_FLIDX_RESETTING	4	/* device reset in progress */
 #define US_FLIDX_TIMED_OUT	5	/* SCSI midlayer timed out  */
 #define US_FLIDX_DONT_SCAN	6	/* don't scan (disconnect)  */
+#define US_FLIDX_REDO_READ10	7	/* redo READ(10) command    */
+#define US_FLIDX_READ10_WORKED	8	/* previous READ(10) succeeded */
 
 #define USB_STOR_STRING_LEN 32
 
diff --git a/include/linux/usb_usual.h b/include/linux/usb_usual.h
index 71693d4a4fe1..17df3600bcef 100644
--- a/include/linux/usb_usual.h
+++ b/include/linux/usb_usual.h
@@ -62,7 +62,9 @@
 	US_FLAG(NO_READ_DISC_INFO,	0x00040000)		\
 		/* cannot handle READ_DISC_INFO */		\
 	US_FLAG(NO_READ_CAPACITY_16,	0x00080000)		\
-		/* cannot handle READ_CAPACITY_16 */
+		/* cannot handle READ_CAPACITY_16 */		\
+	US_FLAG(INITIAL_READ10,	0x00100000)			\
+		/* Initial READ(10) (and others) must be retried */
 
 #define US_FLAG(name, value)	US_FL_##name = value ,
 enum { US_DO_ALL_FLAGS };
-- 
cgit v1.2.3


From ea2c00095c022846dd8dfd211de05154d3e4e1b8 Mon Sep 17 00:00:00 2001
From: "K. Y. Srinivasan" <kys@microsoft.com>
Date: Tue, 17 May 2011 15:25:37 -0700
Subject: Connector: Set the CN_NETLINK_USERS correctly

The CN_NETLINK_USERS must be set to the highest valid index +1.
Thanks to Evgeniy for pointing this out.

Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Acked-by: Evgeniy Polyakov <zbr@ioremap.net>
Cc: stable <stable@kernel.org> [.39]
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/connector.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/connector.h b/include/linux/connector.h
index 7c60d0942adb..f696bccd48cb 100644
--- a/include/linux/connector.h
+++ b/include/linux/connector.h
@@ -44,7 +44,7 @@
 #define CN_VAL_DRBD			0x1
 #define CN_KVP_IDX			0x9	/* HyperV KVP */
 
-#define CN_NETLINK_USERS		9
+#define CN_NETLINK_USERS		10	/* Highest index + 1 */
 
 /*
  * Maximum connector's message size.
-- 
cgit v1.2.3


From 13e12d14e2dccc7995b8f15a5678a338ab4e6a8c Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Wed, 8 Jun 2011 15:18:19 -0700
Subject: vfs: reorganize 'struct inode' layout a bit

This tries to make the 'struct inode' accesses denser in the data cache
by moving a commonly accessed field (i_security) closer to other fields
that are accessed often.

It also makes 'i_state' just an 'unsigned int' rather than 'unsigned
long', since we only use a few bits of that field, and moves it next to
the existing 'i_flags' so that we potentially get better structure
layout (although depending on config options, i_flags may already have
packed in the same word as i_lock, so this improves packing only for the
case of spinlock debugging)

Out 'struct inode' is still way too big, and we should probably move
some other fields around too (the acl fields in particular) for better
data cache access density.  Other fields (like the inode hash) are
likely to be entirely irrelevant under most loads.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/fs.h | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 646a1836152a..1c777878f1ea 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -744,9 +744,13 @@ struct inode {
 
 	spinlock_t		i_lock;	/* i_blocks, i_bytes, maybe i_size */
 	unsigned int		i_flags;
+	unsigned int		i_state;
+#ifdef CONFIG_SECURITY
+	void			*i_security;
+#endif
 	struct mutex		i_mutex;
 
-	unsigned long		i_state;
+
 	unsigned long		dirtied_when;	/* jiffies of first dirtying */
 
 	struct hlist_node	i_hash;
@@ -798,9 +802,6 @@ struct inode {
 	atomic_t		i_readcount; /* struct files open RO */
 #endif
 	atomic_t		i_writecount;
-#ifdef CONFIG_SECURITY
-	void			*i_security;
-#endif
 #ifdef CONFIG_FS_POSIX_ACL
 	struct posix_acl	*i_acl;
 	struct posix_acl	*i_default_acl;
-- 
cgit v1.2.3


From b4c8cc88c18213688268d1d53a51d97ce2f19a64 Mon Sep 17 00:00:00 2001
From: Yegor Yefremov <yegorslists@googlemail.com>
Date: Thu, 9 Jun 2011 15:05:48 -0700
Subject: ethtool.h: fix typos

Signed-off-by: Yegor Yefremov <yegorslists@googlemail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index c6a850ab2ec5..439b173c5882 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -268,7 +268,7 @@ struct ethtool_pauseparam {
 	__u32	cmd;	/* ETHTOOL_{G,S}PAUSEPARAM */
 
 	/* If the link is being auto-negotiated (via ethtool_cmd.autoneg
-	 * being true) the user may set 'autonet' here non-zero to have the
+	 * being true) the user may set 'autoneg' here non-zero to have the
 	 * pause parameters be auto-negotiated too.  In such a case, the
 	 * {rx,tx}_pause values below determine what capabilities are
 	 * advertised.
@@ -811,7 +811,7 @@ bool ethtool_invalid_flags(struct net_device *dev, u32 data, u32 supported);
  * @get_tx_csum: Deprecated as redundant. Report whether transmit checksums
  *	are turned on or off.
  * @set_tx_csum: Deprecated in favour of generic netdev features.  Turn
- *	transmit checksums on or off.  Returns a egative error code or zero.
+ *	transmit checksums on or off.  Returns a negative error code or zero.
  * @get_sg: Deprecated as redundant.  Report whether scatter-gather is
  *	enabled.  
  * @set_sg: Deprecated in favour of generic netdev features.  Turn
@@ -1087,7 +1087,7 @@ struct ethtool_ops {
 /* The following are all involved in forcing a particular link
  * mode for the device for setting things.  When getting the
  * devices settings, these indicate the current mode and whether
- * it was foced up into this mode or autonegotiated.
+ * it was forced up into this mode or autonegotiated.
  */
 
 /* The forced speed, 10Mb, 100Mb, gigabit, 2.5Gb, 10GbE. */
-- 
cgit v1.2.3


From e5ea3f12d41d96edf4ad9374db2b1725e457acec Mon Sep 17 00:00:00 2001
From: Jamie Iles <jamie@jamieiles.com>
Date: Fri, 10 Jun 2011 13:44:49 +0100
Subject: gpio/basic_mmio: add missing include of spinlock_types.h

include/linux/basic_mmio_gpio.h uses a spinlock_t without including any
of the spinlock headers resulting in this compiler warning.

include/linux/basic_mmio_gpio.h:51:2: error: expected specifier-qualifier-list before 'spinlock_t'

Explicitly include linux/spinlock_types.h to fix it.

Signed-off-by: Jamie Iles <jamie@jamieiles.com>
Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
---
 include/linux/basic_mmio_gpio.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/basic_mmio_gpio.h b/include/linux/basic_mmio_gpio.h
index 1ae12710d732..98999cf107ce 100644
--- a/include/linux/basic_mmio_gpio.h
+++ b/include/linux/basic_mmio_gpio.h
@@ -16,6 +16,7 @@
 #include <linux/gpio.h>
 #include <linux/types.h>
 #include <linux/compiler.h>
+#include <linux/spinlock_types.h>
 
 struct bgpio_pdata {
 	int base;
-- 
cgit v1.2.3


From 56a210526adb2854d5b7c398a40260720390ee05 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Sat, 11 Jun 2011 12:29:58 +0100
Subject: linux/seqlock.h should #include asm/processor.h for cpu_relax()

It uses cpu_relax(), and so needs <asm/processor.h>

Without this patch, I see:

   CC      arch/mn10300/kernel/asm-offsets.s
  In file included from include/linux/time.h:8,
                   from include/linux/timex.h:56,
                   from include/linux/sched.h:57,
                   from arch/mn10300/kernel/asm-offsets.c:7:
  include/linux/seqlock.h: In function 'read_seqbegin':
  include/linux/seqlock.h:91: error: implicit declaration of function 'cpu_relax'

whilst building asb2364_defconfig on MN10300.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/seqlock.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index e9811892844f..c6db9fb33c44 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -28,6 +28,7 @@
 
 #include <linux/spinlock.h>
 #include <linux/preempt.h>
+#include <asm/processor.h>
 
 typedef struct {
 	unsigned sequence;
-- 
cgit v1.2.3


From 0b5c9db1b11d3175bb42b80663a9f072f801edf5 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jpirko@redhat.com>
Date: Fri, 10 Jun 2011 06:56:58 +0000
Subject: vlan: Fix the ingress VLAN_FLAG_REORDER_HDR check

Testing of VLAN_FLAG_REORDER_HDR does not belong in vlan_untag
but rather in vlan_do_receive.  Otherwise the vlan header
will not be properly put on the packet in the case of
vlan header accelleration.

As we remove the check from vlan_check_reorder_header
rename it vlan_reorder_header to keep the naming clean.

Fix up the skb->pkt_type early so we don't look at the packet
after adding the vlan tag, which guarantees we don't goof
and look at the wrong field.

Use a simple if statement instead of a complicated switch
statement to decided that we need to increment rx_stats
for a multicast packet.

Hopefully at somepoint we will just declare the case where
VLAN_FLAG_REORDER_HDR is cleared as unsupported and remove
the code.  Until then this keeps it working correctly.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Jiri Pirko <jpirko@redhat.com>
Acked-by: Changli Gao <xiaosuo@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_vlan.h | 25 ++++++++++++++++++---
 include/linux/skbuff.h  |  5 +++++
 net/8021q/vlan_core.c   | 60 +++++++++++++++++++++++++++----------------------
 net/core/dev.c          |  2 +-
 4 files changed, 61 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index dc01681fbb42..affa27380b72 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -225,7 +225,7 @@ static inline int vlan_hwaccel_receive_skb(struct sk_buff *skb,
 }
 
 /**
- * __vlan_put_tag - regular VLAN tag inserting
+ * vlan_insert_tag - regular VLAN tag inserting
  * @skb: skbuff to tag
  * @vlan_tci: VLAN TCI to insert
  *
@@ -234,8 +234,10 @@ static inline int vlan_hwaccel_receive_skb(struct sk_buff *skb,
  *
  * Following the skb_unshare() example, in case of error, the calling function
  * doesn't have to worry about freeing the original skb.
+ *
+ * Does not change skb->protocol so this function can be used during receive.
  */
-static inline struct sk_buff *__vlan_put_tag(struct sk_buff *skb, u16 vlan_tci)
+static inline struct sk_buff *vlan_insert_tag(struct sk_buff *skb, u16 vlan_tci)
 {
 	struct vlan_ethhdr *veth;
 
@@ -255,8 +257,25 @@ static inline struct sk_buff *__vlan_put_tag(struct sk_buff *skb, u16 vlan_tci)
 	/* now, the TCI */
 	veth->h_vlan_TCI = htons(vlan_tci);
 
-	skb->protocol = htons(ETH_P_8021Q);
+	return skb;
+}
 
+/**
+ * __vlan_put_tag - regular VLAN tag inserting
+ * @skb: skbuff to tag
+ * @vlan_tci: VLAN TCI to insert
+ *
+ * Inserts the VLAN tag into @skb as part of the payload
+ * Returns a VLAN tagged skb. If a new skb is created, @skb is freed.
+ *
+ * Following the skb_unshare() example, in case of error, the calling function
+ * doesn't have to worry about freeing the original skb.
+ */
+static inline struct sk_buff *__vlan_put_tag(struct sk_buff *skb, u16 vlan_tci)
+{
+	skb = vlan_insert_tag(skb, vlan_tci);
+	if (skb)
+		skb->protocol = htons(ETH_P_8021Q);
 	return skb;
 }
 
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index e8b78ce14474..c0a4f3ab0cc0 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1256,6 +1256,11 @@ static inline void skb_reserve(struct sk_buff *skb, int len)
 	skb->tail += len;
 }
 
+static inline void skb_reset_mac_len(struct sk_buff *skb)
+{
+	skb->mac_len = skb->network_header - skb->mac_header;
+}
+
 #ifdef NET_SKBUFF_DATA_USES_OFFSET
 static inline unsigned char *skb_transport_header(const struct sk_buff *skb)
 {
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index 41495dc2a4c9..fcc684678af6 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -23,6 +23,31 @@ bool vlan_do_receive(struct sk_buff **skbp)
 		return false;
 
 	skb->dev = vlan_dev;
+	if (skb->pkt_type == PACKET_OTHERHOST) {
+		/* Our lower layer thinks this is not local, let's make sure.
+		 * This allows the VLAN to have a different MAC than the
+		 * underlying device, and still route correctly. */
+		if (!compare_ether_addr(eth_hdr(skb)->h_dest,
+					vlan_dev->dev_addr))
+			skb->pkt_type = PACKET_HOST;
+	}
+
+	if (!(vlan_dev_info(vlan_dev)->flags & VLAN_FLAG_REORDER_HDR)) {
+		unsigned int offset = skb->data - skb_mac_header(skb);
+
+		/*
+		 * vlan_insert_tag expect skb->data pointing to mac header.
+		 * So change skb->data before calling it and change back to
+		 * original position later
+		 */
+		skb_push(skb, offset);
+		skb = *skbp = vlan_insert_tag(skb, skb->vlan_tci);
+		if (!skb)
+			return false;
+		skb_pull(skb, offset + VLAN_HLEN);
+		skb_reset_mac_len(skb);
+	}
+
 	skb->priority = vlan_get_ingress_priority(vlan_dev, skb->vlan_tci);
 	skb->vlan_tci = 0;
 
@@ -31,22 +56,8 @@ bool vlan_do_receive(struct sk_buff **skbp)
 	u64_stats_update_begin(&rx_stats->syncp);
 	rx_stats->rx_packets++;
 	rx_stats->rx_bytes += skb->len;
-
-	switch (skb->pkt_type) {
-	case PACKET_BROADCAST:
-		break;
-	case PACKET_MULTICAST:
+	if (skb->pkt_type == PACKET_MULTICAST)
 		rx_stats->rx_multicast++;
-		break;
-	case PACKET_OTHERHOST:
-		/* Our lower layer thinks this is not local, let's make sure.
-		 * This allows the VLAN to have a different MAC than the
-		 * underlying device, and still route correctly. */
-		if (!compare_ether_addr(eth_hdr(skb)->h_dest,
-					vlan_dev->dev_addr))
-			skb->pkt_type = PACKET_HOST;
-		break;
-	}
 	u64_stats_update_end(&rx_stats->syncp);
 
 	return true;
@@ -89,18 +100,13 @@ gro_result_t vlan_gro_frags(struct napi_struct *napi, struct vlan_group *grp,
 }
 EXPORT_SYMBOL(vlan_gro_frags);
 
-static struct sk_buff *vlan_check_reorder_header(struct sk_buff *skb)
+static struct sk_buff *vlan_reorder_header(struct sk_buff *skb)
 {
-	if (vlan_dev_info(skb->dev)->flags & VLAN_FLAG_REORDER_HDR) {
-		if (skb_cow(skb, skb_headroom(skb)) < 0)
-			skb = NULL;
-		if (skb) {
-			/* Lifted from Gleb's VLAN code... */
-			memmove(skb->data - ETH_HLEN,
-				skb->data - VLAN_ETH_HLEN, 12);
-			skb->mac_header += VLAN_HLEN;
-		}
-	}
+	if (skb_cow(skb, skb_headroom(skb)) < 0)
+		return NULL;
+	memmove(skb->data - ETH_HLEN, skb->data - VLAN_ETH_HLEN, 2 * ETH_ALEN);
+	skb->mac_header += VLAN_HLEN;
+	skb_reset_mac_len(skb);
 	return skb;
 }
 
@@ -161,7 +167,7 @@ struct sk_buff *vlan_untag(struct sk_buff *skb)
 	skb_pull_rcsum(skb, VLAN_HLEN);
 	vlan_set_encap_proto(skb, vhdr);
 
-	skb = vlan_check_reorder_header(skb);
+	skb = vlan_reorder_header(skb);
 	if (unlikely(!skb))
 		goto err_free;
 
diff --git a/net/core/dev.c b/net/core/dev.c
index a54c9f87ddbb..9c58c1ec41a9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3114,7 +3114,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
 
 	skb_reset_network_header(skb);
 	skb_reset_transport_header(skb);
-	skb->mac_len = skb->network_header - skb->mac_header;
+	skb_reset_mac_len(skb);
 
 	pt_prev = NULL;
 
-- 
cgit v1.2.3


From a685e08987d1edf1995b76511d4c98ea0e905377 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 8 Jun 2011 21:13:01 -0400
Subject: Delay struct net freeing while there's a sysfs instance refering to
 it

	* new refcount in struct net, controlling actual freeing of the memory
	* new method in kobj_ns_type_operations (->drop_ns())
	* ->current_ns() semantics change - it's supposed to be followed by
corresponding ->drop_ns().  For struct net in case of CONFIG_NET_NS it bumps
the new refcount; net_drop_ns() decrements it and calls net_free() if the
last reference has been dropped.  Method renamed to ->grab_current_ns().
	* old net_free() callers call net_drop_ns() instead.
	* sysfs_exit_ns() is gone, along with a large part of callchain
leading to it; now that the references stored in ->ns[...] stay valid we
do not need to hunt them down and replace them with NULL.  That fixes
problems in sysfs_lookup() and sysfs_readdir(), along with getting rid
of sb->s_instances abuse.

	Note that struct net *shutdown* logics has not changed - net_cleanup()
is called exactly when it used to be called.  The only thing postponed by
having a sysfs instance refering to that struct net is actual freeing of
memory occupied by struct net.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/sysfs/mount.c            | 37 +++++++++++--------------------------
 fs/sysfs/sysfs.h            |  2 +-
 include/linux/kobject_ns.h  | 10 ++++++----
 include/linux/sysfs.h       |  7 -------
 include/net/net_namespace.h | 10 +++++++++-
 lib/kobject.c               | 26 +++++++++-----------------
 net/core/net-sysfs.c        | 23 +++++++++--------------
 net/core/net_namespace.c    | 12 ++++++++++--
 8 files changed, 55 insertions(+), 72 deletions(-)

(limited to 'include/linux')

diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 266895783b47..e34f0d99ea4e 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -95,6 +95,14 @@ static int sysfs_set_super(struct super_block *sb, void *data)
 	return error;
 }
 
+static void free_sysfs_super_info(struct sysfs_super_info *info)
+{
+	int type;
+	for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++)
+		kobj_ns_drop(type, info->ns[type]);
+	kfree(info);
+}
+
 static struct dentry *sysfs_mount(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data)
 {
@@ -108,11 +116,11 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type,
 		return ERR_PTR(-ENOMEM);
 
 	for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++)
-		info->ns[type] = kobj_ns_current(type);
+		info->ns[type] = kobj_ns_grab_current(type);
 
 	sb = sget(fs_type, sysfs_test_super, sysfs_set_super, info);
 	if (IS_ERR(sb) || sb->s_fs_info != info)
-		kfree(info);
+		free_sysfs_super_info(info);
 	if (IS_ERR(sb))
 		return ERR_CAST(sb);
 	if (!sb->s_root) {
@@ -131,12 +139,11 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type,
 static void sysfs_kill_sb(struct super_block *sb)
 {
 	struct sysfs_super_info *info = sysfs_info(sb);
-
 	/* Remove the superblock from fs_supers/s_instances
 	 * so we can't find it, before freeing sysfs_super_info.
 	 */
 	kill_anon_super(sb);
-	kfree(info);
+	free_sysfs_super_info(info);
 }
 
 static struct file_system_type sysfs_fs_type = {
@@ -145,28 +152,6 @@ static struct file_system_type sysfs_fs_type = {
 	.kill_sb	= sysfs_kill_sb,
 };
 
-void sysfs_exit_ns(enum kobj_ns_type type, const void *ns)
-{
-	struct super_block *sb;
-
-	mutex_lock(&sysfs_mutex);
-	spin_lock(&sb_lock);
-	list_for_each_entry(sb, &sysfs_fs_type.fs_supers, s_instances) {
-		struct sysfs_super_info *info = sysfs_info(sb);
-		/*
-		 * If we see a superblock on the fs_supers/s_instances
-		 * list the unmount has not completed and sb->s_fs_info
-		 * points to a valid struct sysfs_super_info.
-		 */
-		/* Ignore superblocks with the wrong ns */
-		if (info->ns[type] != ns)
-			continue;
-		info->ns[type] = NULL;
-	}
-	spin_unlock(&sb_lock);
-	mutex_unlock(&sysfs_mutex);
-}
-
 int __init sysfs_init(void)
 {
 	int err = -ENOMEM;
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 3d28af31d863..2ed2404f3113 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -136,7 +136,7 @@ struct sysfs_addrm_cxt {
  * instance).
  */
 struct sysfs_super_info {
-	const void *ns[KOBJ_NS_TYPES];
+	void *ns[KOBJ_NS_TYPES];
 };
 #define sysfs_info(SB) ((struct sysfs_super_info *)(SB->s_fs_info))
 extern struct sysfs_dirent sysfs_root;
diff --git a/include/linux/kobject_ns.h b/include/linux/kobject_ns.h
index 82cb5bf461fb..f66b065a8b5f 100644
--- a/include/linux/kobject_ns.h
+++ b/include/linux/kobject_ns.h
@@ -32,15 +32,17 @@ enum kobj_ns_type {
 
 /*
  * Callbacks so sysfs can determine namespaces
- *   @current_ns: return calling task's namespace
+ *   @grab_current_ns: return a new reference to calling task's namespace
  *   @netlink_ns: return namespace to which a sock belongs (right?)
  *   @initial_ns: return the initial namespace (i.e. init_net_ns)
+ *   @drop_ns: drops a reference to namespace
  */
 struct kobj_ns_type_operations {
 	enum kobj_ns_type type;
-	const void *(*current_ns)(void);
+	void *(*grab_current_ns)(void);
 	const void *(*netlink_ns)(struct sock *sk);
 	const void *(*initial_ns)(void);
+	void (*drop_ns)(void *);
 };
 
 int kobj_ns_type_register(const struct kobj_ns_type_operations *ops);
@@ -48,9 +50,9 @@ int kobj_ns_type_registered(enum kobj_ns_type type);
 const struct kobj_ns_type_operations *kobj_child_ns_ops(struct kobject *parent);
 const struct kobj_ns_type_operations *kobj_ns_ops(struct kobject *kobj);
 
-const void *kobj_ns_current(enum kobj_ns_type type);
+void *kobj_ns_grab_current(enum kobj_ns_type type);
 const void *kobj_ns_netlink(enum kobj_ns_type type, struct sock *sk);
 const void *kobj_ns_initial(enum kobj_ns_type type);
-void kobj_ns_exit(enum kobj_ns_type type, const void *ns);
+void kobj_ns_drop(enum kobj_ns_type type, void *ns);
 
 #endif /* _LINUX_KOBJECT_NS_H */
diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index c3acda60eee0..e2696d76a599 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -177,9 +177,6 @@ struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
 struct sysfs_dirent *sysfs_get(struct sysfs_dirent *sd);
 void sysfs_put(struct sysfs_dirent *sd);
 
-/* Called to clear a ns tag when it is no longer valid */
-void sysfs_exit_ns(enum kobj_ns_type type, const void *tag);
-
 int __must_check sysfs_init(void);
 
 #else /* CONFIG_SYSFS */
@@ -338,10 +335,6 @@ static inline void sysfs_put(struct sysfs_dirent *sd)
 {
 }
 
-static inline void sysfs_exit_ns(int type, const void *tag)
-{
-}
-
 static inline int __must_check sysfs_init(void)
 {
 	return 0;
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 2bf9ed9ef26b..aef430d779bd 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -35,8 +35,11 @@ struct netns_ipvs;
 #define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS)
 
 struct net {
+	atomic_t		passive;	/* To decided when the network
+						 * namespace should be freed.
+						 */
 	atomic_t		count;		/* To decided when the network
-						 *  namespace should be freed.
+						 *  namespace should be shut down.
 						 */
 #ifdef NETNS_REFCNT_DEBUG
 	atomic_t		use_count;	/* To track references we
@@ -154,6 +157,9 @@ int net_eq(const struct net *net1, const struct net *net2)
 {
 	return net1 == net2;
 }
+
+extern void net_drop_ns(void *);
+
 #else
 
 static inline struct net *get_net(struct net *net)
@@ -175,6 +181,8 @@ int net_eq(const struct net *net1, const struct net *net2)
 {
 	return 1;
 }
+
+#define net_drop_ns NULL
 #endif
 
 
diff --git a/lib/kobject.c b/lib/kobject.c
index 82dc34c095c2..640bd98a4c8a 100644
--- a/lib/kobject.c
+++ b/lib/kobject.c
@@ -948,14 +948,14 @@ const struct kobj_ns_type_operations *kobj_ns_ops(struct kobject *kobj)
 }
 
 
-const void *kobj_ns_current(enum kobj_ns_type type)
+void *kobj_ns_grab_current(enum kobj_ns_type type)
 {
-	const void *ns = NULL;
+	void *ns = NULL;
 
 	spin_lock(&kobj_ns_type_lock);
 	if ((type > KOBJ_NS_TYPE_NONE) && (type < KOBJ_NS_TYPES) &&
 	    kobj_ns_ops_tbl[type])
-		ns = kobj_ns_ops_tbl[type]->current_ns();
+		ns = kobj_ns_ops_tbl[type]->grab_current_ns();
 	spin_unlock(&kobj_ns_type_lock);
 
 	return ns;
@@ -987,23 +987,15 @@ const void *kobj_ns_initial(enum kobj_ns_type type)
 	return ns;
 }
 
-/*
- * kobj_ns_exit - invalidate a namespace tag
- *
- * @type: the namespace type (i.e. KOBJ_NS_TYPE_NET)
- * @ns: the actual namespace being invalidated
- *
- * This is called when a tag is no longer valid.  For instance,
- * when a network namespace exits, it uses this helper to
- * make sure no sb's sysfs_info points to the now-invalidated
- * netns.
- */
-void kobj_ns_exit(enum kobj_ns_type type, const void *ns)
+void kobj_ns_drop(enum kobj_ns_type type, void *ns)
 {
-	sysfs_exit_ns(type, ns);
+	spin_lock(&kobj_ns_type_lock);
+	if ((type > KOBJ_NS_TYPE_NONE) && (type < KOBJ_NS_TYPES) &&
+	    kobj_ns_ops_tbl[type] && kobj_ns_ops_tbl[type]->drop_ns)
+		kobj_ns_ops_tbl[type]->drop_ns(ns);
+	spin_unlock(&kobj_ns_type_lock);
 }
 
-
 EXPORT_SYMBOL(kobject_get);
 EXPORT_SYMBOL(kobject_put);
 EXPORT_SYMBOL(kobject_del);
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 11b98bc2aa8f..33d2a1fba131 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -1179,9 +1179,14 @@ static void remove_queue_kobjects(struct net_device *net)
 #endif
 }
 
-static const void *net_current_ns(void)
+static void *net_grab_current_ns(void)
 {
-	return current->nsproxy->net_ns;
+	struct net *ns = current->nsproxy->net_ns;
+#ifdef CONFIG_NET_NS
+	if (ns)
+		atomic_inc(&ns->passive);
+#endif
+	return ns;
 }
 
 static const void *net_initial_ns(void)
@@ -1196,22 +1201,13 @@ static const void *net_netlink_ns(struct sock *sk)
 
 struct kobj_ns_type_operations net_ns_type_operations = {
 	.type = KOBJ_NS_TYPE_NET,
-	.current_ns = net_current_ns,
+	.grab_current_ns = net_grab_current_ns,
 	.netlink_ns = net_netlink_ns,
 	.initial_ns = net_initial_ns,
+	.drop_ns = net_drop_ns,
 };
 EXPORT_SYMBOL_GPL(net_ns_type_operations);
 
-static void net_kobj_ns_exit(struct net *net)
-{
-	kobj_ns_exit(KOBJ_NS_TYPE_NET, net);
-}
-
-static struct pernet_operations kobj_net_ops = {
-	.exit = net_kobj_ns_exit,
-};
-
-
 #ifdef CONFIG_HOTPLUG
 static int netdev_uevent(struct device *d, struct kobj_uevent_env *env)
 {
@@ -1339,6 +1335,5 @@ EXPORT_SYMBOL(netdev_class_remove_file);
 int netdev_kobject_init(void)
 {
 	kobj_ns_type_register(&net_ns_type_operations);
-	register_pernet_subsys(&kobj_net_ops);
 	return class_register(&net_class);
 }
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 6c6b86d0da15..cdcbc3cb00a9 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -128,6 +128,7 @@ static __net_init int setup_net(struct net *net)
 	LIST_HEAD(net_exit_list);
 
 	atomic_set(&net->count, 1);
+	atomic_set(&net->passive, 1);
 
 #ifdef NETNS_REFCNT_DEBUG
 	atomic_set(&net->use_count, 0);
@@ -210,6 +211,13 @@ static void net_free(struct net *net)
 	kmem_cache_free(net_cachep, net);
 }
 
+void net_drop_ns(void *p)
+{
+	struct net *ns = p;
+	if (ns && atomic_dec_and_test(&ns->passive))
+		net_free(ns);
+}
+
 struct net *copy_net_ns(unsigned long flags, struct net *old_net)
 {
 	struct net *net;
@@ -230,7 +238,7 @@ struct net *copy_net_ns(unsigned long flags, struct net *old_net)
 	}
 	mutex_unlock(&net_mutex);
 	if (rv < 0) {
-		net_free(net);
+		net_drop_ns(net);
 		return ERR_PTR(rv);
 	}
 	return net;
@@ -286,7 +294,7 @@ static void cleanup_net(struct work_struct *work)
 	/* Finally it is safe to free my network namespace structure */
 	list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) {
 		list_del_init(&net->exit_list);
-		net_free(net);
+		net_drop_ns(net);
 	}
 }
 static DECLARE_WORK(net_cleanup_work, cleanup_net);
-- 
cgit v1.2.3


From 08e8138adebdd511e0955e8d6c051904bb4082af Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Mon, 13 Jun 2011 10:42:49 +0200
Subject: block: Add __attribute__((format(printf...) and fix fallout

Use the compiler to verify format strings and arguments.

Fix fallout.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-throttle.c         |  4 ++--
 block/cfq-iosched.c          | 11 ++++++-----
 include/linux/blktrace_api.h |  3 ++-
 3 files changed, 10 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index a62be8d0dc1b..3689f833afdc 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -927,7 +927,7 @@ static int throtl_dispatch(struct request_queue *q)
 
 	bio_list_init(&bio_list_on_stack);
 
-	throtl_log(td, "dispatch nr_queued=%lu read=%u write=%u",
+	throtl_log(td, "dispatch nr_queued=%d read=%u write=%u",
 			total_nr_queued(td), td->nr_queued[READ],
 			td->nr_queued[WRITE]);
 
@@ -1204,7 +1204,7 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop)
 	}
 
 queue_bio:
-	throtl_log_tg(td, tg, "[%c] bio. bdisp=%u sz=%u bps=%llu"
+	throtl_log_tg(td, tg, "[%c] bio. bdisp=%llu sz=%u bps=%llu"
 			" iodisp=%u iops=%u queued=%d/%d",
 			rw == READ ? 'R' : 'W',
 			tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 545b8d4c829d..f3799432676d 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -988,9 +988,10 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
 
 	cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
 					st->min_vdisktime);
-	cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u"
-			" sect=%u", used_sl, cfqq->slice_dispatch, charge,
-			iops_mode(cfqd), cfqq->nr_sectors);
+	cfq_log_cfqq(cfqq->cfqd, cfqq,
+		     "sl_used=%u disp=%u charge=%u iops=%u sect=%lu",
+		     used_sl, cfqq->slice_dispatch, charge,
+		     iops_mode(cfqd), cfqq->nr_sectors);
 	cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl,
 					  unaccounted_sl);
 	cfq_blkiocg_set_start_empty_time(&cfqg->blkg);
@@ -2023,8 +2024,8 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
 	 */
 	if (sample_valid(cic->ttime_samples) &&
 	    (cfqq->slice_end - jiffies < cic->ttime_mean)) {
-		cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%d",
-				cic->ttime_mean);
+		cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%lu",
+			     cic->ttime_mean);
 		return;
 	}
 
diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index b22fb0d3db0f..8c7c2de7631a 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -169,7 +169,8 @@ extern void blk_trace_shutdown(struct request_queue *);
 extern int do_blk_trace_setup(struct request_queue *q, char *name,
 			      dev_t dev, struct block_device *bdev,
 			      struct blk_user_trace_setup *buts);
-extern void __trace_note_message(struct blk_trace *, const char *fmt, ...);
+extern __attribute__((format(printf, 2, 3)))
+void __trace_note_message(struct blk_trace *, const char *fmt, ...);
 
 /**
  * blk_add_trace_msg - Add a (simple) message to the blktrace stream
-- 
cgit v1.2.3


From de1b794130b130e77ffa975bb58cb843744f9ae5 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 13 Jun 2011 15:38:22 -0400
Subject: jbd2: Fix oops in jbd2_journal_remove_journal_head()

jbd2_journal_remove_journal_head() can oops when trying to access
journal_head returned by bh2jh(). This is caused for example by the
following race:

	TASK1					TASK2
  jbd2_journal_commit_transaction()
    ...
    processing t_forget list
      __jbd2_journal_refile_buffer(jh);
      if (!jh->b_transaction) {
        jbd_unlock_bh_state(bh);
					jbd2_journal_try_to_free_buffers()
					  jbd2_journal_grab_journal_head(bh)
					  jbd_lock_bh_state(bh)
					  __journal_try_to_free_buffer()
					  jbd2_journal_put_journal_head(jh)
        jbd2_journal_remove_journal_head(bh);

jbd2_journal_put_journal_head() in TASK2 sees that b_jcount == 0 and
buffer is not part of any transaction and thus frees journal_head
before TASK1 gets to doing so. Note that even buffer_head can be
released by try_to_free_buffers() after
jbd2_journal_put_journal_head() which adds even larger opportunity for
oops (but I didn't see this happen in reality).

Fix the problem by making transactions hold their own journal_head
reference (in b_jcount). That way we don't have to remove journal_head
explicitely via jbd2_journal_remove_journal_head() and instead just
remove journal_head when b_jcount drops to zero. The result of this is
that [__]jbd2_journal_refile_buffer(),
[__]jbd2_journal_unfile_buffer(), and
__jdb2_journal_remove_checkpoint() can free journal_head which needs
modification of a few callers. Also we have to be careful because once
journal_head is removed, buffer_head might be freed as well. So we
have to get our own buffer_head reference where it matters.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd2/checkpoint.c  | 28 +++++++++-------
 fs/jbd2/commit.c      | 33 +++++++++++--------
 fs/jbd2/journal.c     | 91 ++++++++++++++++-----------------------------------
 fs/jbd2/transaction.c | 67 +++++++++++++++++++------------------
 include/linux/jbd2.h  |  2 --
 5 files changed, 99 insertions(+), 122 deletions(-)

(limited to 'include/linux')

diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 6a79fd0a1a32..2c62c5aae82f 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -97,10 +97,14 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
 
 	if (jh->b_jlist == BJ_None && !buffer_locked(bh) &&
 	    !buffer_dirty(bh) && !buffer_write_io_error(bh)) {
+		/*
+		 * Get our reference so that bh cannot be freed before
+		 * we unlock it
+		 */
+		get_bh(bh);
 		JBUFFER_TRACE(jh, "remove from checkpoint list");
 		ret = __jbd2_journal_remove_checkpoint(jh) + 1;
 		jbd_unlock_bh_state(bh);
-		jbd2_journal_remove_journal_head(bh);
 		BUFFER_TRACE(bh, "release");
 		__brelse(bh);
 	} else {
@@ -223,8 +227,8 @@ restart:
 			spin_lock(&journal->j_list_lock);
 			goto restart;
 		}
+		get_bh(bh);
 		if (buffer_locked(bh)) {
-			atomic_inc(&bh->b_count);
 			spin_unlock(&journal->j_list_lock);
 			jbd_unlock_bh_state(bh);
 			wait_on_buffer(bh);
@@ -243,7 +247,6 @@ restart:
 		 */
 		released = __jbd2_journal_remove_checkpoint(jh);
 		jbd_unlock_bh_state(bh);
-		jbd2_journal_remove_journal_head(bh);
 		__brelse(bh);
 	}
 
@@ -284,7 +287,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
 	int ret = 0;
 
 	if (buffer_locked(bh)) {
-		atomic_inc(&bh->b_count);
+		get_bh(bh);
 		spin_unlock(&journal->j_list_lock);
 		jbd_unlock_bh_state(bh);
 		wait_on_buffer(bh);
@@ -316,12 +319,12 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
 		ret = 1;
 		if (unlikely(buffer_write_io_error(bh)))
 			ret = -EIO;
+		get_bh(bh);
 		J_ASSERT_JH(jh, !buffer_jbddirty(bh));
 		BUFFER_TRACE(bh, "remove from checkpoint");
 		__jbd2_journal_remove_checkpoint(jh);
 		spin_unlock(&journal->j_list_lock);
 		jbd_unlock_bh_state(bh);
-		jbd2_journal_remove_journal_head(bh);
 		__brelse(bh);
 	} else {
 		/*
@@ -554,7 +557,8 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
 /*
  * journal_clean_one_cp_list
  *
- * Find all the written-back checkpoint buffers in the given list and release them.
+ * Find all the written-back checkpoint buffers in the given list and
+ * release them.
  *
  * Called with the journal locked.
  * Called with j_list_lock held.
@@ -663,8 +667,8 @@ out:
  * checkpoint lists.
  *
  * The function returns 1 if it frees the transaction, 0 otherwise.
+ * The function can free jh and bh.
  *
- * This function is called with the journal locked.
  * This function is called with j_list_lock held.
  * This function is called with jbd_lock_bh_state(jh2bh(jh))
  */
@@ -684,13 +688,14 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
 	}
 	journal = transaction->t_journal;
 
+	JBUFFER_TRACE(jh, "removing from transaction");
 	__buffer_unlink(jh);
 	jh->b_cp_transaction = NULL;
+	jbd2_journal_put_journal_head(jh);
 
 	if (transaction->t_checkpoint_list != NULL ||
 	    transaction->t_checkpoint_io_list != NULL)
 		goto out;
-	JBUFFER_TRACE(jh, "transaction has no more buffers");
 
 	/*
 	 * There is one special case to worry about: if we have just pulled the
@@ -701,10 +706,8 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
 	 * The locking here around t_state is a bit sleazy.
 	 * See the comment at the end of jbd2_journal_commit_transaction().
 	 */
-	if (transaction->t_state != T_FINISHED) {
-		JBUFFER_TRACE(jh, "belongs to running/committing transaction");
+	if (transaction->t_state != T_FINISHED)
 		goto out;
-	}
 
 	/* OK, that was the last buffer for the transaction: we can now
 	   safely remove this transaction from the log */
@@ -723,7 +726,6 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
 	wake_up(&journal->j_wait_logspace);
 	ret = 1;
 out:
-	JBUFFER_TRACE(jh, "exit");
 	return ret;
 }
 
@@ -742,6 +744,8 @@ void __jbd2_journal_insert_checkpoint(struct journal_head *jh,
 	J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh)));
 	J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
 
+	/* Get reference for checkpointing transaction */
+	jbd2_journal_grab_journal_head(jh2bh(jh));
 	jh->b_cp_transaction = transaction;
 
 	if (!transaction->t_checkpoint_list) {
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 7f21cf3aaf92..eef6979821a4 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -848,10 +848,16 @@ restart_loop:
 	while (commit_transaction->t_forget) {
 		transaction_t *cp_transaction;
 		struct buffer_head *bh;
+		int try_to_free = 0;
 
 		jh = commit_transaction->t_forget;
 		spin_unlock(&journal->j_list_lock);
 		bh = jh2bh(jh);
+		/*
+		 * Get a reference so that bh cannot be freed before we are
+		 * done with it.
+		 */
+		get_bh(bh);
 		jbd_lock_bh_state(bh);
 		J_ASSERT_JH(jh,	jh->b_transaction == commit_transaction);
 
@@ -914,28 +920,27 @@ restart_loop:
 			__jbd2_journal_insert_checkpoint(jh, commit_transaction);
 			if (is_journal_aborted(journal))
 				clear_buffer_jbddirty(bh);
-			JBUFFER_TRACE(jh, "refile for checkpoint writeback");
-			__jbd2_journal_refile_buffer(jh);
-			jbd_unlock_bh_state(bh);
 		} else {
 			J_ASSERT_BH(bh, !buffer_dirty(bh));
-			/* The buffer on BJ_Forget list and not jbddirty means
+			/*
+			 * The buffer on BJ_Forget list and not jbddirty means
 			 * it has been freed by this transaction and hence it
 			 * could not have been reallocated until this
 			 * transaction has committed. *BUT* it could be
 			 * reallocated once we have written all the data to
 			 * disk and before we process the buffer on BJ_Forget
-			 * list. */
-			JBUFFER_TRACE(jh, "refile or unfile freed buffer");
-			__jbd2_journal_refile_buffer(jh);
-			if (!jh->b_transaction) {
-				jbd_unlock_bh_state(bh);
-				 /* needs a brelse */
-				jbd2_journal_remove_journal_head(bh);
-				release_buffer_page(bh);
-			} else
-				jbd_unlock_bh_state(bh);
+			 * list.
+			 */
+			if (!jh->b_next_transaction)
+				try_to_free = 1;
 		}
+		JBUFFER_TRACE(jh, "refile or unfile buffer");
+		__jbd2_journal_refile_buffer(jh);
+		jbd_unlock_bh_state(bh);
+		if (try_to_free)
+			release_buffer_page(bh);	/* Drops bh reference */
+		else
+			__brelse(bh);
 		cond_resched_lock(&journal->j_list_lock);
 	}
 	spin_unlock(&journal->j_list_lock);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 9a7826990304..0dfa5b598e68 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -2078,10 +2078,9 @@ static void journal_free_journal_head(struct journal_head *jh)
  * When a buffer has its BH_JBD bit set it is immune from being released by
  * core kernel code, mainly via ->b_count.
  *
- * A journal_head may be detached from its buffer_head when the journal_head's
- * b_transaction, b_cp_transaction and b_next_transaction pointers are NULL.
- * Various places in JBD call jbd2_journal_remove_journal_head() to indicate that the
- * journal_head can be dropped if needed.
+ * A journal_head is detached from its buffer_head when the journal_head's
+ * b_jcount reaches zero. Running transaction (b_transaction) and checkpoint
+ * transaction (b_cp_transaction) hold their references to b_jcount.
  *
  * Various places in the kernel want to attach a journal_head to a buffer_head
  * _before_ attaching the journal_head to a transaction.  To protect the
@@ -2094,17 +2093,16 @@ static void journal_free_journal_head(struct journal_head *jh)
  *	(Attach a journal_head if needed.  Increments b_jcount)
  *	struct journal_head *jh = jbd2_journal_add_journal_head(bh);
  *	...
+ *      (Get another reference for transaction)
+ *	jbd2_journal_grab_journal_head(bh);
  *	jh->b_transaction = xxx;
+ *	(Put original reference)
  *	jbd2_journal_put_journal_head(jh);
- *
- * Now, the journal_head's b_jcount is zero, but it is safe from being released
- * because it has a non-zero b_transaction.
  */
 
 /*
  * Give a buffer_head a journal_head.
  *
- * Doesn't need the journal lock.
  * May sleep.
  */
 struct journal_head *jbd2_journal_add_journal_head(struct buffer_head *bh)
@@ -2168,61 +2166,29 @@ static void __journal_remove_journal_head(struct buffer_head *bh)
 	struct journal_head *jh = bh2jh(bh);
 
 	J_ASSERT_JH(jh, jh->b_jcount >= 0);
-
-	get_bh(bh);
-	if (jh->b_jcount == 0) {
-		if (jh->b_transaction == NULL &&
-				jh->b_next_transaction == NULL &&
-				jh->b_cp_transaction == NULL) {
-			J_ASSERT_JH(jh, jh->b_jlist == BJ_None);
-			J_ASSERT_BH(bh, buffer_jbd(bh));
-			J_ASSERT_BH(bh, jh2bh(jh) == bh);
-			BUFFER_TRACE(bh, "remove journal_head");
-			if (jh->b_frozen_data) {
-				printk(KERN_WARNING "%s: freeing "
-						"b_frozen_data\n",
-						__func__);
-				jbd2_free(jh->b_frozen_data, bh->b_size);
-			}
-			if (jh->b_committed_data) {
-				printk(KERN_WARNING "%s: freeing "
-						"b_committed_data\n",
-						__func__);
-				jbd2_free(jh->b_committed_data, bh->b_size);
-			}
-			bh->b_private = NULL;
-			jh->b_bh = NULL;	/* debug, really */
-			clear_buffer_jbd(bh);
-			__brelse(bh);
-			journal_free_journal_head(jh);
-		} else {
-			BUFFER_TRACE(bh, "journal_head was locked");
-		}
+	J_ASSERT_JH(jh, jh->b_transaction == NULL);
+	J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
+	J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
+	J_ASSERT_JH(jh, jh->b_jlist == BJ_None);
+	J_ASSERT_BH(bh, buffer_jbd(bh));
+	J_ASSERT_BH(bh, jh2bh(jh) == bh);
+	BUFFER_TRACE(bh, "remove journal_head");
+	if (jh->b_frozen_data) {
+		printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__);
+		jbd2_free(jh->b_frozen_data, bh->b_size);
 	}
+	if (jh->b_committed_data) {
+		printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__);
+		jbd2_free(jh->b_committed_data, bh->b_size);
+	}
+	bh->b_private = NULL;
+	jh->b_bh = NULL;	/* debug, really */
+	clear_buffer_jbd(bh);
+	journal_free_journal_head(jh);
 }
 
 /*
- * jbd2_journal_remove_journal_head(): if the buffer isn't attached to a transaction
- * and has a zero b_jcount then remove and release its journal_head.   If we did
- * see that the buffer is not used by any transaction we also "logically"
- * decrement ->b_count.
- *
- * We in fact take an additional increment on ->b_count as a convenience,
- * because the caller usually wants to do additional things with the bh
- * after calling here.
- * The caller of jbd2_journal_remove_journal_head() *must* run __brelse(bh) at some
- * time.  Once the caller has run __brelse(), the buffer is eligible for
- * reaping by try_to_free_buffers().
- */
-void jbd2_journal_remove_journal_head(struct buffer_head *bh)
-{
-	jbd_lock_bh_journal_head(bh);
-	__journal_remove_journal_head(bh);
-	jbd_unlock_bh_journal_head(bh);
-}
-
-/*
- * Drop a reference on the passed journal_head.  If it fell to zero then try to
+ * Drop a reference on the passed journal_head.  If it fell to zero then
  * release the journal_head from the buffer_head.
  */
 void jbd2_journal_put_journal_head(struct journal_head *jh)
@@ -2232,11 +2198,12 @@ void jbd2_journal_put_journal_head(struct journal_head *jh)
 	jbd_lock_bh_journal_head(bh);
 	J_ASSERT_JH(jh, jh->b_jcount > 0);
 	--jh->b_jcount;
-	if (!jh->b_jcount && !jh->b_transaction) {
+	if (!jh->b_jcount) {
 		__journal_remove_journal_head(bh);
+		jbd_unlock_bh_journal_head(bh);
 		__brelse(bh);
-	}
-	jbd_unlock_bh_journal_head(bh);
+	} else
+		jbd_unlock_bh_journal_head(bh);
 }
 
 /*
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 547b101049e5..2d7109414cdd 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -30,6 +30,7 @@
 #include <linux/module.h>
 
 static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
+static void __jbd2_journal_unfile_buffer(struct journal_head *jh);
 
 /*
  * jbd2_get_transaction: obtain a new transaction_t object.
@@ -764,7 +765,6 @@ repeat:
 	if (!jh->b_transaction) {
 		JBUFFER_TRACE(jh, "no transaction");
 		J_ASSERT_JH(jh, !jh->b_next_transaction);
-		jh->b_transaction = transaction;
 		JBUFFER_TRACE(jh, "file as BJ_Reserved");
 		spin_lock(&journal->j_list_lock);
 		__jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
@@ -895,8 +895,6 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
 		 * committed and so it's safe to clear the dirty bit.
 		 */
 		clear_buffer_dirty(jh2bh(jh));
-		jh->b_transaction = transaction;
-
 		/* first access by this transaction */
 		jh->b_modified = 0;
 
@@ -1230,8 +1228,6 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
 			__jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
 		} else {
 			__jbd2_journal_unfile_buffer(jh);
-			jbd2_journal_remove_journal_head(bh);
-			__brelse(bh);
 			if (!buffer_jbd(bh)) {
 				spin_unlock(&journal->j_list_lock);
 				jbd_unlock_bh_state(bh);
@@ -1554,19 +1550,32 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
 		mark_buffer_dirty(bh);	/* Expose it to the VM */
 }
 
-void __jbd2_journal_unfile_buffer(struct journal_head *jh)
+/*
+ * Remove buffer from all transactions.
+ *
+ * Called with bh_state lock and j_list_lock
+ *
+ * jh and bh may be already freed when this function returns.
+ */
+static void __jbd2_journal_unfile_buffer(struct journal_head *jh)
 {
 	__jbd2_journal_temp_unlink_buffer(jh);
 	jh->b_transaction = NULL;
+	jbd2_journal_put_journal_head(jh);
 }
 
 void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh)
 {
-	jbd_lock_bh_state(jh2bh(jh));
+	struct buffer_head *bh = jh2bh(jh);
+
+	/* Get reference so that buffer cannot be freed before we unlock it */
+	get_bh(bh);
+	jbd_lock_bh_state(bh);
 	spin_lock(&journal->j_list_lock);
 	__jbd2_journal_unfile_buffer(jh);
 	spin_unlock(&journal->j_list_lock);
-	jbd_unlock_bh_state(jh2bh(jh));
+	jbd_unlock_bh_state(bh);
+	__brelse(bh);
 }
 
 /*
@@ -1593,8 +1602,6 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
 		if (jh->b_jlist == BJ_None) {
 			JBUFFER_TRACE(jh, "remove from checkpoint list");
 			__jbd2_journal_remove_checkpoint(jh);
-			jbd2_journal_remove_journal_head(bh);
-			__brelse(bh);
 		}
 	}
 	spin_unlock(&journal->j_list_lock);
@@ -1657,7 +1664,6 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal,
 		/*
 		 * We take our own ref against the journal_head here to avoid
 		 * having to add tons of locking around each instance of
-		 * jbd2_journal_remove_journal_head() and
 		 * jbd2_journal_put_journal_head().
 		 */
 		jh = jbd2_journal_grab_journal_head(bh);
@@ -1695,10 +1701,9 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
 	int may_free = 1;
 	struct buffer_head *bh = jh2bh(jh);
 
-	__jbd2_journal_unfile_buffer(jh);
-
 	if (jh->b_cp_transaction) {
 		JBUFFER_TRACE(jh, "on running+cp transaction");
+		__jbd2_journal_temp_unlink_buffer(jh);
 		/*
 		 * We don't want to write the buffer anymore, clear the
 		 * bit so that we don't confuse checks in
@@ -1709,8 +1714,7 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
 		may_free = 0;
 	} else {
 		JBUFFER_TRACE(jh, "on running transaction");
-		jbd2_journal_remove_journal_head(bh);
-		__brelse(bh);
+		__jbd2_journal_unfile_buffer(jh);
 	}
 	return may_free;
 }
@@ -1988,6 +1992,8 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
 
 	if (jh->b_transaction)
 		__jbd2_journal_temp_unlink_buffer(jh);
+	else
+		jbd2_journal_grab_journal_head(bh);
 	jh->b_transaction = transaction;
 
 	switch (jlist) {
@@ -2039,9 +2045,10 @@ void jbd2_journal_file_buffer(struct journal_head *jh,
  * already started to be used by a subsequent transaction, refile the
  * buffer on that transaction's metadata list.
  *
- * Called under journal->j_list_lock
- *
+ * Called under j_list_lock
  * Called under jbd_lock_bh_state(jh2bh(jh))
+ *
+ * jh and bh may be already free when this function returns
  */
 void __jbd2_journal_refile_buffer(struct journal_head *jh)
 {
@@ -2065,6 +2072,11 @@ void __jbd2_journal_refile_buffer(struct journal_head *jh)
 
 	was_dirty = test_clear_buffer_jbddirty(bh);
 	__jbd2_journal_temp_unlink_buffer(jh);
+	/*
+	 * We set b_transaction here because b_next_transaction will inherit
+	 * our jh reference and thus __jbd2_journal_file_buffer() must not
+	 * take a new one.
+	 */
 	jh->b_transaction = jh->b_next_transaction;
 	jh->b_next_transaction = NULL;
 	if (buffer_freed(bh))
@@ -2081,30 +2093,21 @@ void __jbd2_journal_refile_buffer(struct journal_head *jh)
 }
 
 /*
- * For the unlocked version of this call, also make sure that any
- * hanging journal_head is cleaned up if necessary.
- *
- * __jbd2_journal_refile_buffer is usually called as part of a single locked
- * operation on a buffer_head, in which the caller is probably going to
- * be hooking the journal_head onto other lists.  In that case it is up
- * to the caller to remove the journal_head if necessary.  For the
- * unlocked jbd2_journal_refile_buffer call, the caller isn't going to be
- * doing anything else to the buffer so we need to do the cleanup
- * ourselves to avoid a jh leak.
- *
- * *** The journal_head may be freed by this call! ***
+ * __jbd2_journal_refile_buffer() with necessary locking added. We take our
+ * bh reference so that we can safely unlock bh.
+ *
+ * The jh and bh may be freed by this call.
  */
 void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
 {
 	struct buffer_head *bh = jh2bh(jh);
 
+	/* Get reference so that buffer cannot be freed before we unlock it */
+	get_bh(bh);
 	jbd_lock_bh_state(bh);
 	spin_lock(&journal->j_list_lock);
-
 	__jbd2_journal_refile_buffer(jh);
 	jbd_unlock_bh_state(bh);
-	jbd2_journal_remove_journal_head(bh);
-
 	spin_unlock(&journal->j_list_lock);
 	__brelse(bh);
 }
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 4ecb7b16b278..d087c2e7b2aa 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1024,7 +1024,6 @@ struct journal_s
 
 /* Filing buffers */
 extern void jbd2_journal_unfile_buffer(journal_t *, struct journal_head *);
-extern void __jbd2_journal_unfile_buffer(struct journal_head *);
 extern void __jbd2_journal_refile_buffer(struct journal_head *);
 extern void jbd2_journal_refile_buffer(journal_t *, struct journal_head *);
 extern void __jbd2_journal_file_buffer(struct journal_head *, transaction_t *, int);
@@ -1165,7 +1164,6 @@ extern void	   jbd2_journal_release_jbd_inode(journal_t *journal, struct jbd2_in
  */
 struct journal_head *jbd2_journal_add_journal_head(struct buffer_head *bh);
 struct journal_head *jbd2_journal_grab_journal_head(struct buffer_head *bh);
-void jbd2_journal_remove_journal_head(struct buffer_head *bh);
 void jbd2_journal_put_journal_head(struct journal_head *jh);
 
 /*
-- 
cgit v1.2.3


From 09223371deac67d08ca0b70bd18787920284c967 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Tue, 14 Jun 2011 13:26:25 +0800
Subject: rcu: Use softirq to address performance regression

Commit a26ac2455ffcf3(rcu: move TREE_RCU from softirq to kthread)
introduced performance regression. In an AIM7 test, this commit degraded
performance by about 40%.

The commit runs rcu callbacks in a kthread instead of softirq. We observed
high rate of context switch which is caused by this. Out test system has
64 CPUs and HZ is 1000, so we saw more than 64k context switch per second
which is caused by RCU's per-CPU kthread.  A trace showed that most of
the time the RCU per-CPU kthread doesn't actually handle any callbacks,
but instead just does a very small amount of work handling grace periods.
This means that RCU's per-CPU kthreads are making the scheduler do quite
a bit of work in order to allow a very small amount of RCU-related
processing to be done.

Alex Shi's analysis determined that this slowdown is due to lock
contention within the scheduler.  Unfortunately, as Peter Zijlstra points
out, the scheduler's real-time semantics require global action, which
means that this contention is inherent in real-time scheduling.  (Yes,
perhaps someone will come up with a workaround -- otherwise, -rt is not
going to do well on large SMP systems -- but this patch will work around
this issue in the meantime.  And "the meantime" might well be forever.)

This patch therefore re-introduces softirq processing to RCU, but only
for core RCU work.  RCU callbacks are still executed in kthread context,
so that only a small amount of RCU work runs in softirq context in the
common case.  This should minimize ksoftirqd execution, allowing us to
skip boosting of ksoftirqd for CONFIG_RCU_BOOST=y kernels.

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Tested-by: "Alex,Shi" <alex.shi@intel.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 Documentation/filesystems/proc.txt  |  1 +
 include/linux/interrupt.h           |  1 +
 include/trace/events/irq.h          |  3 ++-
 kernel/rcutree.c                    | 23 +++++++++++++++++++----
 kernel/rcutree.h                    |  1 +
 kernel/rcutree_plugin.h             |  9 +++++++++
 kernel/softirq.c                    |  2 +-
 tools/perf/util/trace-event-parse.c |  1 +
 8 files changed, 35 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index f48178024067..db3b1aba32a3 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -843,6 +843,7 @@ Provides counts of softirq handlers serviced since boot time, for each cpu.
  TASKLET:          0          0          0        290
    SCHED:      27035      26983      26971      26746
  HRTIMER:          0          0          0          0
+     RCU:       1678       1769       2178       2250
 
 
 1.3 IDE devices in /proc/ide
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 6c12989839d9..f6efed0039ed 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -414,6 +414,7 @@ enum
 	TASKLET_SOFTIRQ,
 	SCHED_SOFTIRQ,
 	HRTIMER_SOFTIRQ,
+	RCU_SOFTIRQ,    /* Preferable RCU should always be the last softirq */
 
 	NR_SOFTIRQS
 };
diff --git a/include/trace/events/irq.h b/include/trace/events/irq.h
index ae045ca7d356..1c09820df585 100644
--- a/include/trace/events/irq.h
+++ b/include/trace/events/irq.h
@@ -20,7 +20,8 @@ struct softirq_action;
 			 softirq_name(BLOCK_IOPOLL),	\
 			 softirq_name(TASKLET),		\
 			 softirq_name(SCHED),		\
-			 softirq_name(HRTIMER))
+			 softirq_name(HRTIMER),		\
+			 softirq_name(RCU))
 
 /**
  * irq_handler_entry - called immediately before the irq action handler
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 0a8ec5b2e208..ae5c9ea68662 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -100,6 +100,7 @@ static char rcu_kthreads_spawnable;
 
 static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
 static void invoke_rcu_cpu_kthread(void);
+static void __invoke_rcu_cpu_kthread(void);
 
 #define RCU_KTHREAD_PRIO 1	/* RT priority for per-CPU kthreads. */
 
@@ -1442,13 +1443,21 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
 	}
 
 	/* If there are callbacks ready, invoke them. */
-	rcu_do_batch(rsp, rdp);
+	if (cpu_has_callbacks_ready_to_invoke(rdp))
+		__invoke_rcu_cpu_kthread();
+}
+
+static void rcu_kthread_do_work(void)
+{
+	rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data));
+	rcu_do_batch(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
+	rcu_preempt_do_callbacks();
 }
 
 /*
  * Do softirq processing for the current CPU.
  */
-static void rcu_process_callbacks(void)
+static void rcu_process_callbacks(struct softirq_action *unused)
 {
 	__rcu_process_callbacks(&rcu_sched_state,
 				&__get_cpu_var(rcu_sched_data));
@@ -1465,7 +1474,7 @@ static void rcu_process_callbacks(void)
  * the current CPU with interrupts disabled, the rcu_cpu_kthread_task
  * cannot disappear out from under us.
  */
-static void invoke_rcu_cpu_kthread(void)
+static void __invoke_rcu_cpu_kthread(void)
 {
 	unsigned long flags;
 
@@ -1479,6 +1488,11 @@ static void invoke_rcu_cpu_kthread(void)
 	local_irq_restore(flags);
 }
 
+static void invoke_rcu_cpu_kthread(void)
+{
+	raise_softirq(RCU_SOFTIRQ);
+}
+
 /*
  * Wake up the specified per-rcu_node-structure kthread.
  * Because the per-rcu_node kthreads are immortal, we don't need
@@ -1613,7 +1627,7 @@ static int rcu_cpu_kthread(void *arg)
 		*workp = 0;
 		local_irq_restore(flags);
 		if (work)
-			rcu_process_callbacks();
+			rcu_kthread_do_work();
 		local_bh_enable();
 		if (*workp != 0)
 			spincnt++;
@@ -2387,6 +2401,7 @@ void __init rcu_init(void)
 	rcu_init_one(&rcu_sched_state, &rcu_sched_data);
 	rcu_init_one(&rcu_bh_state, &rcu_bh_data);
 	__rcu_init_preempt();
+	 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
 
 	/*
 	 * We don't need protection against CPU-hotplug here because
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 7b9a08b4aaea..0fed6b934d2a 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -439,6 +439,7 @@ static void rcu_preempt_offline_cpu(int cpu);
 #endif /* #ifdef CONFIG_HOTPLUG_CPU */
 static void rcu_preempt_check_callbacks(int cpu);
 static void rcu_preempt_process_callbacks(void);
+static void rcu_preempt_do_callbacks(void);
 void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
 #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU)
 static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index ea2e2fb79e81..38d09c5f2b41 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -602,6 +602,11 @@ static void rcu_preempt_process_callbacks(void)
 				&__get_cpu_var(rcu_preempt_data));
 }
 
+static void rcu_preempt_do_callbacks(void)
+{
+	rcu_do_batch(&rcu_preempt_state, &__get_cpu_var(rcu_preempt_data));
+}
+
 /*
  * Queue a preemptible-RCU callback for invocation after a grace period.
  */
@@ -997,6 +1002,10 @@ static void rcu_preempt_process_callbacks(void)
 {
 }
 
+static void rcu_preempt_do_callbacks(void)
+{
+}
+
 /*
  * Wait for an rcu-preempt grace period, but make it happen quickly.
  * But because preemptible RCU does not exist, map to rcu-sched.
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 13960170cad4..40cf63ddd4b3 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -58,7 +58,7 @@ DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
 
 char *softirq_to_name[NR_SOFTIRQS] = {
 	"HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
-	"TASKLET", "SCHED", "HRTIMER"
+	"TASKLET", "SCHED", "HRTIMER", "RCU"
 };
 
 /*
diff --git a/tools/perf/util/trace-event-parse.c b/tools/perf/util/trace-event-parse.c
index 1e88485c16a0..0a7ed5b5e281 100644
--- a/tools/perf/util/trace-event-parse.c
+++ b/tools/perf/util/trace-event-parse.c
@@ -2187,6 +2187,7 @@ static const struct flag flags[] = {
 	{ "TASKLET_SOFTIRQ", 6 },
 	{ "SCHED_SOFTIRQ", 7 },
 	{ "HRTIMER_SOFTIRQ", 8 },
+	{ "RCU_SOFTIRQ", 9 },
 
 	{ "HRTIMER_NORESTART", 0 },
 	{ "HRTIMER_RESTART", 1 },
-- 
cgit v1.2.3


From 0b760113a3a155269a3fba93a409c640031dd68f Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 31 May 2011 15:15:34 -0400
Subject: NLM: Don't hang forever on NLM unlock requests

If the NLM daemon is killed on the NFS server, we can currently end up
hanging forever on an 'unlock' request, instead of aborting. Basically,
if the rpcbind request fails, or the server keeps returning garbage, we
really want to quit instead of retrying.

Tested-by: Vasily Averin <vvs@sw.ru>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: stable@kernel.org
---
 fs/lockd/clntproc.c          | 8 +++++++-
 include/linux/sunrpc/sched.h | 3 ++-
 net/sunrpc/clnt.c            | 3 +++
 net/sunrpc/sched.c           | 1 +
 4 files changed, 13 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index adb45ec9038c..e374050a911c 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -708,7 +708,13 @@ static void nlmclnt_unlock_callback(struct rpc_task *task, void *data)
 
 	if (task->tk_status < 0) {
 		dprintk("lockd: unlock failed (err = %d)\n", -task->tk_status);
-		goto retry_rebind;
+		switch (task->tk_status) {
+		case -EACCES:
+		case -EIO:
+			goto die;
+		default:
+			goto retry_rebind;
+		}
 	}
 	if (status == NLM_LCK_DENIED_GRACE_PERIOD) {
 		rpc_delay(task, NLMCLNT_GRACE_WAIT);
diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index f73c482ec9c6..fe2d8e6b923b 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -84,7 +84,8 @@ struct rpc_task {
 #endif
 	unsigned char		tk_priority : 2,/* Task priority */
 				tk_garb_retry : 2,
-				tk_cred_retry : 2;
+				tk_cred_retry : 2,
+				tk_rebind_retry : 2;
 };
 #define tk_xprt			tk_client->cl_xprt
 
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index b84d7395535e..566bcfd067f6 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1175,6 +1175,9 @@ call_bind_status(struct rpc_task *task)
 			status = -EOPNOTSUPP;
 			break;
 		}
+		if (task->tk_rebind_retry == 0)
+			break;
+		task->tk_rebind_retry--;
 		rpc_delay(task, 3*HZ);
 		goto retry_timeout;
 	case -ETIMEDOUT:
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 6b43ee7221d5..a27406b1654f 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -792,6 +792,7 @@ static void rpc_init_task(struct rpc_task *task, const struct rpc_task_setup *ta
 	/* Initialize retry counters */
 	task->tk_garb_retry = 2;
 	task->tk_cred_retry = 2;
+	task->tk_rebind_retry = 2;
 
 	task->tk_priority = task_setup_data->priority - RPC_PRIORITY_LOW;
 	task->tk_owner = current->tgid;
-- 
cgit v1.2.3


From c9c30dd5f73dccaa326a54dfcf490316946aea87 Mon Sep 17 00:00:00 2001
From: Benny Halevy <benny@tonian.com>
Date: Sat, 11 Jun 2011 17:08:39 -0400
Subject: NFSv4.1: deprecate headerpadsz in CREATE_SESSION

We don't support header padding yet so better off ditching it

Reported-by: Sid Moore <learnmost@gmail.com>
Signed-off-by: Benny Halevy <benny@tonian.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c       |  4 ----
 fs/nfs/nfs4xdr.c        | 10 ++++++----
 include/linux/nfs_xdr.h |  1 -
 3 files changed, 6 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d2c4b59c896d..3ca449789545 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5098,7 +5098,6 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
 	if (mxresp_sz == 0)
 		mxresp_sz = NFS_MAX_FILE_IO_SIZE;
 	/* Fore channel attributes */
-	args->fc_attrs.headerpadsz = 0;
 	args->fc_attrs.max_rqst_sz = mxrqst_sz;
 	args->fc_attrs.max_resp_sz = mxresp_sz;
 	args->fc_attrs.max_ops = NFS4_MAX_OPS;
@@ -5111,7 +5110,6 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
 		args->fc_attrs.max_ops, args->fc_attrs.max_reqs);
 
 	/* Back channel attributes */
-	args->bc_attrs.headerpadsz = 0;
 	args->bc_attrs.max_rqst_sz = PAGE_SIZE;
 	args->bc_attrs.max_resp_sz = PAGE_SIZE;
 	args->bc_attrs.max_resp_sz_cached = 0;
@@ -5131,8 +5129,6 @@ static int nfs4_verify_fore_channel_attrs(struct nfs41_create_session_args *args
 	struct nfs4_channel_attrs *sent = &args->fc_attrs;
 	struct nfs4_channel_attrs *rcvd = &session->fc_attrs;
 
-	if (rcvd->headerpadsz > sent->headerpadsz)
-		return -EINVAL;
 	if (rcvd->max_resp_sz > sent->max_resp_sz)
 		return -EINVAL;
 	/*
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index d869a5e5464b..c4b7d6c04948 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1725,7 +1725,7 @@ static void encode_create_session(struct xdr_stream *xdr,
 	*p++ = cpu_to_be32(args->flags);			/*flags */
 
 	/* Fore Channel */
-	*p++ = cpu_to_be32(args->fc_attrs.headerpadsz);	/* header padding size */
+	*p++ = cpu_to_be32(0);				/* header padding size */
 	*p++ = cpu_to_be32(args->fc_attrs.max_rqst_sz);	/* max req size */
 	*p++ = cpu_to_be32(args->fc_attrs.max_resp_sz);	/* max resp size */
 	*p++ = cpu_to_be32(max_resp_sz_cached);		/* Max resp sz cached */
@@ -1734,7 +1734,7 @@ static void encode_create_session(struct xdr_stream *xdr,
 	*p++ = cpu_to_be32(0);				/* rdmachannel_attrs */
 
 	/* Back Channel */
-	*p++ = cpu_to_be32(args->fc_attrs.headerpadsz);	/* header padding size */
+	*p++ = cpu_to_be32(0);				/* header padding size */
 	*p++ = cpu_to_be32(args->bc_attrs.max_rqst_sz);	/* max req size */
 	*p++ = cpu_to_be32(args->bc_attrs.max_resp_sz);	/* max resp size */
 	*p++ = cpu_to_be32(args->bc_attrs.max_resp_sz_cached);	/* Max resp sz cached */
@@ -4997,12 +4997,14 @@ static int decode_chan_attrs(struct xdr_stream *xdr,
 			     struct nfs4_channel_attrs *attrs)
 {
 	__be32 *p;
-	u32 nr_attrs;
+	u32 nr_attrs, val;
 
 	p = xdr_inline_decode(xdr, 28);
 	if (unlikely(!p))
 		goto out_overflow;
-	attrs->headerpadsz = be32_to_cpup(p++);
+	val = be32_to_cpup(p++);	/* headerpadsz */
+	if (val)
+		return -EINVAL;		/* no support for header padding yet */
 	attrs->max_rqst_sz = be32_to_cpup(p++);
 	attrs->max_resp_sz = be32_to_cpup(p++);
 	attrs->max_resp_sz_cached = be32_to_cpup(p++);
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 5e8444a11adf..00848d86ffb2 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -158,7 +158,6 @@ struct nfs_seqid;
 
 /* nfs41 sessions channel attributes */
 struct nfs4_channel_attrs {
-	u32			headerpadsz;
 	u32			max_rqst_sz;
 	u32			max_resp_sz;
 	u32			max_resp_sz_cached;
-- 
cgit v1.2.3


From a59ec1e7ff98cc4365d5b1bff4e7102e86b5716b Mon Sep 17 00:00:00 2001
From: Michael Hennerich <michael.hennerich@analog.com>
Date: Wed, 15 Jun 2011 15:08:11 -0700
Subject: backlight: new driver for the ADP8870 backlight devices

Signed-off-by: Michael Hennerich <michael.hennerich@analog.com>
Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Cc: Richard Purdie <rpurdie@rpsys.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 .../testing/sysfs-class-backlight-driver-adp8870   |   56 ++
 drivers/video/backlight/Kconfig                    |   12 +
 drivers/video/backlight/Makefile                   |    1 +
 drivers/video/backlight/adp8870_bl.c               | 1011 ++++++++++++++++++++
 include/linux/i2c/adp8870.h                        |  153 +++
 5 files changed, 1233 insertions(+)
 create mode 100644 Documentation/ABI/testing/sysfs-class-backlight-driver-adp8870
 create mode 100644 drivers/video/backlight/adp8870_bl.c
 create mode 100644 include/linux/i2c/adp8870.h

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-class-backlight-driver-adp8870 b/Documentation/ABI/testing/sysfs-class-backlight-driver-adp8870
new file mode 100644
index 000000000000..aa11dbdd794b
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-class-backlight-driver-adp8870
@@ -0,0 +1,56 @@
+What:		/sys/class/backlight/<backlight>/<ambient light zone>_max
+What:		/sys/class/backlight/<backlight>/l1_daylight_max
+What:		/sys/class/backlight/<backlight>/l2_bright_max
+What:		/sys/class/backlight/<backlight>/l3_office_max
+What:		/sys/class/backlight/<backlight>/l4_indoor_max
+What:		/sys/class/backlight/<backlight>/l5_dark_max
+Date:		Mai 2011
+KernelVersion:	2.6.40
+Contact:	device-drivers-devel@blackfin.uclinux.org
+Description:
+		Control the maximum brightness for <ambient light zone>
+		on this <backlight>. Values are between 0 and 127. This file
+		will also show the brightness level stored for this
+		<ambient light zone>.
+
+What:		/sys/class/backlight/<backlight>/<ambient light zone>_dim
+What:		/sys/class/backlight/<backlight>/l2_bright_dim
+What:		/sys/class/backlight/<backlight>/l3_office_dim
+What:		/sys/class/backlight/<backlight>/l4_indoor_dim
+What:		/sys/class/backlight/<backlight>/l5_dark_dim
+Date:		Mai 2011
+KernelVersion:	2.6.40
+Contact:	device-drivers-devel@blackfin.uclinux.org
+Description:
+		Control the dim brightness for <ambient light zone>
+		on this <backlight>. Values are between 0 and 127, typically
+		set to 0. Full off when the backlight is disabled.
+		This file will also show the dim brightness level stored for
+		this <ambient light zone>.
+
+What:		/sys/class/backlight/<backlight>/ambient_light_level
+Date:		Mai 2011
+KernelVersion:	2.6.40
+Contact:	device-drivers-devel@blackfin.uclinux.org
+Description:
+		Get conversion value of the light sensor.
+		This value is updated every 80 ms (when the light sensor
+		is enabled). Returns integer between 0 (dark) and
+		8000 (max ambient brightness)
+
+What:		/sys/class/backlight/<backlight>/ambient_light_zone
+Date:		Mai 2011
+KernelVersion:	2.6.40
+Contact:	device-drivers-devel@blackfin.uclinux.org
+Description:
+		Get/Set current ambient light zone. Reading returns
+		integer between 1..5 (1 = daylight, 2 = bright, ..., 5 = dark).
+		Writing a value between 1..5 forces the backlight controller
+		to enter the corresponding ambient light zone.
+		Writing 0 returns to normal/automatic ambient light level
+		operation. The ambient light sensing feature on these devices
+		is an extension to the API documented in
+		Documentation/ABI/stable/sysfs-class-backlight.
+		It can be enabled by writing the value stored in
+		/sys/class/backlight/<backlight>/max_brightness to
+		/sys/class/backlight/<backlight>/brightness.
\ No newline at end of file
diff --git a/drivers/video/backlight/Kconfig b/drivers/video/backlight/Kconfig
index 0c9373bedd1f..2d93c8d61ad5 100644
--- a/drivers/video/backlight/Kconfig
+++ b/drivers/video/backlight/Kconfig
@@ -302,6 +302,18 @@ config BACKLIGHT_ADP8860
 	  To compile this driver as a module, choose M here: the module will
 	  be called adp8860_bl.
 
+config BACKLIGHT_ADP8870
+	tristate "Backlight Driver for ADP8870 using WLED"
+	depends on BACKLIGHT_CLASS_DEVICE && I2C
+	select NEW_LEDS
+	select LEDS_CLASS
+	help
+	  If you have a LCD backlight connected to the ADP8870,
+	  say Y here to enable this driver.
+
+	  To compile this driver as a module, choose M here: the module will
+	  be called adp8870_bl.
+
 config BACKLIGHT_88PM860X
 	tristate "Backlight Driver for 88PM8606 using WLED"
 	depends on MFD_88PM860X
diff --git a/drivers/video/backlight/Makefile b/drivers/video/backlight/Makefile
index b9ca8490df87..ee72adb8786e 100644
--- a/drivers/video/backlight/Makefile
+++ b/drivers/video/backlight/Makefile
@@ -34,6 +34,7 @@ obj-$(CONFIG_BACKLIGHT_WM831X)	+= wm831x_bl.o
 obj-$(CONFIG_BACKLIGHT_ADX)    += adx_bl.o
 obj-$(CONFIG_BACKLIGHT_ADP5520)	+= adp5520_bl.o
 obj-$(CONFIG_BACKLIGHT_ADP8860)	+= adp8860_bl.o
+obj-$(CONFIG_BACKLIGHT_ADP8870)	+= adp8870_bl.o
 obj-$(CONFIG_BACKLIGHT_88PM860X) += 88pm860x_bl.o
 obj-$(CONFIG_BACKLIGHT_PCF50633)	+= pcf50633-backlight.o
 
diff --git a/drivers/video/backlight/adp8870_bl.c b/drivers/video/backlight/adp8870_bl.c
new file mode 100644
index 000000000000..e320b62576e2
--- /dev/null
+++ b/drivers/video/backlight/adp8870_bl.c
@@ -0,0 +1,1011 @@
+/*
+ * Backlight driver for Analog Devices ADP8870 Backlight Devices
+ *
+ * Copyright 2009-2011 Analog Devices Inc.
+ *
+ * Licensed under the GPL-2 or later.
+ */
+
+#include <linux/module.h>
+#include <linux/version.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/pm.h>
+#include <linux/platform_device.h>
+#include <linux/i2c.h>
+#include <linux/fb.h>
+#include <linux/backlight.h>
+#include <linux/leds.h>
+#include <linux/workqueue.h>
+#include <linux/slab.h>
+
+#include <linux/i2c/adp8870.h>
+#define ADP8870_EXT_FEATURES
+#define ADP8870_USE_LEDS
+
+
+#define ADP8870_MFDVID	0x00  /* Manufacturer and device ID */
+#define ADP8870_MDCR	0x01  /* Device mode and status */
+#define ADP8870_INT_STAT 0x02  /* Interrupts status */
+#define ADP8870_INT_EN	0x03  /* Interrupts enable */
+#define ADP8870_CFGR	0x04  /* Configuration register */
+#define ADP8870_BLSEL	0x05  /* Sink enable backlight or independent */
+#define ADP8870_PWMLED	0x06  /* PWM Enable Selection Register */
+#define ADP8870_BLOFF	0x07  /* Backlight off timeout */
+#define ADP8870_BLDIM	0x08  /* Backlight dim timeout */
+#define ADP8870_BLFR	0x09  /* Backlight fade in and out rates */
+#define ADP8870_BLMX1	0x0A  /* Backlight (Brightness Level 1-daylight) maximum current */
+#define ADP8870_BLDM1	0x0B  /* Backlight (Brightness Level 1-daylight) dim current */
+#define ADP8870_BLMX2	0x0C  /* Backlight (Brightness Level 2-bright) maximum current */
+#define ADP8870_BLDM2	0x0D  /* Backlight (Brightness Level 2-bright) dim current */
+#define ADP8870_BLMX3	0x0E  /* Backlight (Brightness Level 3-office) maximum current */
+#define ADP8870_BLDM3	0x0F  /* Backlight (Brightness Level 3-office) dim current */
+#define ADP8870_BLMX4	0x10  /* Backlight (Brightness Level 4-indoor) maximum current */
+#define ADP8870_BLDM4	0x11  /* Backlight (Brightness Level 4-indoor) dim current */
+#define ADP8870_BLMX5	0x12  /* Backlight (Brightness Level 5-dark) maximum current */
+#define ADP8870_BLDM5	0x13  /* Backlight (Brightness Level 5-dark) dim current */
+#define ADP8870_ISCLAW	0x1A  /* Independent sink current fade law register */
+#define ADP8870_ISCC	0x1B  /* Independent sink current control register */
+#define ADP8870_ISCT1	0x1C  /* Independent Sink Current Timer Register LED[7:5] */
+#define ADP8870_ISCT2	0x1D  /* Independent Sink Current Timer Register LED[4:1] */
+#define ADP8870_ISCF	0x1E  /* Independent sink current fade register */
+#define ADP8870_ISC1	0x1F  /* Independent Sink Current LED1 */
+#define ADP8870_ISC2	0x20  /* Independent Sink Current LED2 */
+#define ADP8870_ISC3	0x21  /* Independent Sink Current LED3 */
+#define ADP8870_ISC4	0x22  /* Independent Sink Current LED4 */
+#define ADP8870_ISC5	0x23  /* Independent Sink Current LED5 */
+#define ADP8870_ISC6	0x24  /* Independent Sink Current LED6 */
+#define ADP8870_ISC7	0x25  /* Independent Sink Current LED7 (Brightness Level 1-daylight) */
+#define ADP8870_ISC7_L2	0x26  /* Independent Sink Current LED7 (Brightness Level 2-bright) */
+#define ADP8870_ISC7_L3	0x27  /* Independent Sink Current LED7 (Brightness Level 3-office) */
+#define ADP8870_ISC7_L4	0x28  /* Independent Sink Current LED7 (Brightness Level 4-indoor) */
+#define ADP8870_ISC7_L5	0x29  /* Independent Sink Current LED7 (Brightness Level 5-dark) */
+#define ADP8870_CMP_CTL	0x2D  /* ALS Comparator Control Register */
+#define ADP8870_ALS1_EN	0x2E  /* Main ALS comparator level enable */
+#define ADP8870_ALS2_EN	0x2F  /* Second ALS comparator level enable */
+#define ADP8870_ALS1_STAT 0x30  /* Main ALS Comparator Status Register */
+#define ADP8870_ALS2_STAT 0x31  /* Second ALS Comparator Status Register */
+#define ADP8870_L2TRP	0x32  /* L2 comparator reference */
+#define ADP8870_L2HYS	0x33  /* L2 hysteresis */
+#define ADP8870_L3TRP	0x34  /* L3 comparator reference */
+#define ADP8870_L3HYS	0x35  /* L3 hysteresis */
+#define ADP8870_L4TRP	0x36  /* L4 comparator reference */
+#define ADP8870_L4HYS	0x37  /* L4 hysteresis */
+#define ADP8870_L5TRP	0x38  /* L5 comparator reference */
+#define ADP8870_L5HYS	0x39  /* L5 hysteresis */
+#define ADP8870_PH1LEVL	0x40  /* First phototransistor ambient light level-low byte register */
+#define ADP8870_PH1LEVH	0x41  /* First phototransistor ambient light level-high byte register */
+#define ADP8870_PH2LEVL	0x42  /* Second phototransistor ambient light level-low byte register */
+#define ADP8870_PH2LEVH	0x43  /* Second phototransistor ambient light level-high byte register */
+
+#define ADP8870_MANUFID		0x3  /* Analog Devices AD8870 Manufacturer and device ID */
+#define ADP8870_DEVID(x)	((x) & 0xF)
+#define ADP8870_MANID(x)	((x) >> 4)
+
+/* MDCR Device mode and status */
+#define D7ALSEN			(1 << 7)
+#define INT_CFG			(1 << 6)
+#define NSTBY			(1 << 5)
+#define DIM_EN			(1 << 4)
+#define GDWN_DIS		(1 << 3)
+#define SIS_EN			(1 << 2)
+#define CMP_AUTOEN		(1 << 1)
+#define BLEN			(1 << 0)
+
+/* ADP8870_ALS1_EN Main ALS comparator level enable */
+#define L5_EN			(1 << 3)
+#define L4_EN			(1 << 2)
+#define L3_EN			(1 << 1)
+#define L2_EN			(1 << 0)
+
+#define CFGR_BLV_SHIFT		3
+#define CFGR_BLV_MASK		0x7
+#define ADP8870_FLAG_LED_MASK	0xFF
+
+#define FADE_VAL(in, out)	((0xF & (in)) | ((0xF & (out)) << 4))
+#define BL_CFGR_VAL(law, blv)	((((blv) & CFGR_BLV_MASK) << CFGR_BLV_SHIFT) | ((0x3 & (law)) << 1))
+#define ALS_CMPR_CFG_VAL(filt)	((0x7 & (filt)) << 1)
+
+struct adp8870_bl {
+	struct i2c_client *client;
+	struct backlight_device *bl;
+	struct adp8870_led *led;
+	struct adp8870_backlight_platform_data *pdata;
+	struct mutex lock;
+	unsigned long cached_daylight_max;
+	int id;
+	int revid;
+	int current_brightness;
+};
+
+struct adp8870_led {
+	struct led_classdev	cdev;
+	struct work_struct	work;
+	struct i2c_client	*client;
+	enum led_brightness	new_brightness;
+	int			id;
+	int			flags;
+};
+
+static int adp8870_read(struct i2c_client *client, int reg, uint8_t *val)
+{
+	int ret;
+
+	ret = i2c_smbus_read_byte_data(client, reg);
+	if (ret < 0) {
+		dev_err(&client->dev, "failed reading at 0x%02x\n", reg);
+		return ret;
+	}
+
+	*val = ret;
+	return 0;
+}
+
+
+static int adp8870_write(struct i2c_client *client, u8 reg, u8 val)
+{
+	int ret = i2c_smbus_write_byte_data(client, reg, val);
+	if (ret)
+		dev_err(&client->dev, "failed to write\n");
+
+	return ret;
+}
+
+static int adp8870_set_bits(struct i2c_client *client, int reg, uint8_t bit_mask)
+{
+	struct adp8870_bl *data = i2c_get_clientdata(client);
+	uint8_t reg_val;
+	int ret;
+
+	mutex_lock(&data->lock);
+
+	ret = adp8870_read(client, reg, &reg_val);
+
+	if (!ret && ((reg_val & bit_mask) == 0)) {
+		reg_val |= bit_mask;
+		ret = adp8870_write(client, reg, reg_val);
+	}
+
+	mutex_unlock(&data->lock);
+	return ret;
+}
+
+static int adp8870_clr_bits(struct i2c_client *client, int reg, uint8_t bit_mask)
+{
+	struct adp8870_bl *data = i2c_get_clientdata(client);
+	uint8_t reg_val;
+	int ret;
+
+	mutex_lock(&data->lock);
+
+	ret = adp8870_read(client, reg, &reg_val);
+
+	if (!ret && (reg_val & bit_mask)) {
+		reg_val &= ~bit_mask;
+		ret = adp8870_write(client, reg, reg_val);
+	}
+
+	mutex_unlock(&data->lock);
+	return ret;
+}
+
+/*
+ * Independent sink / LED
+ */
+#if defined(ADP8870_USE_LEDS)
+static void adp8870_led_work(struct work_struct *work)
+{
+	struct adp8870_led *led = container_of(work, struct adp8870_led, work);
+	adp8870_write(led->client, ADP8870_ISC1 + led->id - 1,
+			 led->new_brightness >> 1);
+}
+
+static void adp8870_led_set(struct led_classdev *led_cdev,
+			   enum led_brightness value)
+{
+	struct adp8870_led *led;
+
+	led = container_of(led_cdev, struct adp8870_led, cdev);
+	led->new_brightness = value;
+	/*
+	 * Use workqueue for IO since I2C operations can sleep.
+	 */
+	schedule_work(&led->work);
+}
+
+static int adp8870_led_setup(struct adp8870_led *led)
+{
+	struct i2c_client *client = led->client;
+	int ret = 0;
+
+	ret = adp8870_write(client, ADP8870_ISC1 + led->id - 1, 0);
+	if (ret)
+		return ret;
+
+	ret = adp8870_set_bits(client, ADP8870_ISCC, 1 << (led->id - 1));
+	if (ret)
+		return ret;
+
+	if (led->id > 4)
+		ret = adp8870_set_bits(client, ADP8870_ISCT1,
+				(led->flags & 0x3) << ((led->id - 5) * 2));
+	else
+		ret = adp8870_set_bits(client, ADP8870_ISCT2,
+				(led->flags & 0x3) << ((led->id - 1) * 2));
+
+	return ret;
+}
+
+static int __devinit adp8870_led_probe(struct i2c_client *client)
+{
+	struct adp8870_backlight_platform_data *pdata =
+		client->dev.platform_data;
+	struct adp8870_bl *data = i2c_get_clientdata(client);
+	struct adp8870_led *led, *led_dat;
+	struct led_info *cur_led;
+	int ret, i;
+
+
+	led = kcalloc(pdata->num_leds, sizeof(*led), GFP_KERNEL);
+	if (led == NULL) {
+		dev_err(&client->dev, "failed to alloc memory\n");
+		return -ENOMEM;
+	}
+
+	ret = adp8870_write(client, ADP8870_ISCLAW, pdata->led_fade_law);
+	if (ret)
+		goto err_free;
+
+	ret = adp8870_write(client, ADP8870_ISCT1,
+			(pdata->led_on_time & 0x3) << 6);
+	if (ret)
+		goto err_free;
+
+	ret = adp8870_write(client, ADP8870_ISCF,
+			FADE_VAL(pdata->led_fade_in, pdata->led_fade_out));
+	if (ret)
+		goto err_free;
+
+	for (i = 0; i < pdata->num_leds; ++i) {
+		cur_led = &pdata->leds[i];
+		led_dat = &led[i];
+
+		led_dat->id = cur_led->flags & ADP8870_FLAG_LED_MASK;
+
+		if (led_dat->id > 7 || led_dat->id < 1) {
+			dev_err(&client->dev, "Invalid LED ID %d\n",
+				led_dat->id);
+			goto err;
+		}
+
+		if (pdata->bl_led_assign & (1 << (led_dat->id - 1))) {
+			dev_err(&client->dev, "LED %d used by Backlight\n",
+				led_dat->id);
+			goto err;
+		}
+
+		led_dat->cdev.name = cur_led->name;
+		led_dat->cdev.default_trigger = cur_led->default_trigger;
+		led_dat->cdev.brightness_set = adp8870_led_set;
+		led_dat->cdev.brightness = LED_OFF;
+		led_dat->flags = cur_led->flags >> FLAG_OFFT_SHIFT;
+		led_dat->client = client;
+		led_dat->new_brightness = LED_OFF;
+		INIT_WORK(&led_dat->work, adp8870_led_work);
+
+		ret = led_classdev_register(&client->dev, &led_dat->cdev);
+		if (ret) {
+			dev_err(&client->dev, "failed to register LED %d\n",
+				led_dat->id);
+			goto err;
+		}
+
+		ret = adp8870_led_setup(led_dat);
+		if (ret) {
+			dev_err(&client->dev, "failed to write\n");
+			i++;
+			goto err;
+		}
+	}
+
+	data->led = led;
+
+	return 0;
+
+ err:
+	for (i = i - 1; i >= 0; --i) {
+		led_classdev_unregister(&led[i].cdev);
+		cancel_work_sync(&led[i].work);
+	}
+
+ err_free:
+	kfree(led);
+
+	return ret;
+}
+
+static int __devexit adp8870_led_remove(struct i2c_client *client)
+{
+	struct adp8870_backlight_platform_data *pdata =
+		client->dev.platform_data;
+	struct adp8870_bl *data = i2c_get_clientdata(client);
+	int i;
+
+	for (i = 0; i < pdata->num_leds; i++) {
+		led_classdev_unregister(&data->led[i].cdev);
+		cancel_work_sync(&data->led[i].work);
+	}
+
+	kfree(data->led);
+	return 0;
+}
+#else
+static int __devinit adp8870_led_probe(struct i2c_client *client)
+{
+	return 0;
+}
+
+static int __devexit adp8870_led_remove(struct i2c_client *client)
+{
+	return 0;
+}
+#endif
+
+static int adp8870_bl_set(struct backlight_device *bl, int brightness)
+{
+	struct adp8870_bl *data = bl_get_data(bl);
+	struct i2c_client *client = data->client;
+	int ret = 0;
+
+	if (data->pdata->en_ambl_sens) {
+		if ((brightness > 0) && (brightness < ADP8870_MAX_BRIGHTNESS)) {
+			/* Disable Ambient Light auto adjust */
+			ret = adp8870_clr_bits(client, ADP8870_MDCR,
+					CMP_AUTOEN);
+			if (ret)
+				return ret;
+			ret = adp8870_write(client, ADP8870_BLMX1, brightness);
+			if (ret)
+				return ret;
+		} else {
+			/*
+			 * MAX_BRIGHTNESS -> Enable Ambient Light auto adjust
+			 * restore daylight l1 sysfs brightness
+			 */
+			ret = adp8870_write(client, ADP8870_BLMX1,
+					 data->cached_daylight_max);
+			if (ret)
+				return ret;
+
+			ret = adp8870_set_bits(client, ADP8870_MDCR,
+					 CMP_AUTOEN);
+			if (ret)
+				return ret;
+		}
+	} else {
+		ret = adp8870_write(client, ADP8870_BLMX1, brightness);
+		if (ret)
+			return ret;
+	}
+
+	if (data->current_brightness && brightness == 0)
+		ret = adp8870_set_bits(client,
+				ADP8870_MDCR, DIM_EN);
+	else if (data->current_brightness == 0 && brightness)
+		ret = adp8870_clr_bits(client,
+				ADP8870_MDCR, DIM_EN);
+
+	if (!ret)
+		data->current_brightness = brightness;
+
+	return ret;
+}
+
+static int adp8870_bl_update_status(struct backlight_device *bl)
+{
+	int brightness = bl->props.brightness;
+	if (bl->props.power != FB_BLANK_UNBLANK)
+		brightness = 0;
+
+	if (bl->props.fb_blank != FB_BLANK_UNBLANK)
+		brightness = 0;
+
+	return adp8870_bl_set(bl, brightness);
+}
+
+static int adp8870_bl_get_brightness(struct backlight_device *bl)
+{
+	struct adp8870_bl *data = bl_get_data(bl);
+
+	return data->current_brightness;
+}
+
+static const struct backlight_ops adp8870_bl_ops = {
+	.update_status	= adp8870_bl_update_status,
+	.get_brightness	= adp8870_bl_get_brightness,
+};
+
+static int adp8870_bl_setup(struct backlight_device *bl)
+{
+	struct adp8870_bl *data = bl_get_data(bl);
+	struct i2c_client *client = data->client;
+	struct adp8870_backlight_platform_data *pdata = data->pdata;
+	int ret = 0;
+
+	ret = adp8870_write(client, ADP8870_BLSEL, ~pdata->bl_led_assign);
+	if (ret)
+		return ret;
+
+	ret = adp8870_write(client, ADP8870_PWMLED, pdata->pwm_assign);
+	if (ret)
+		return ret;
+
+	ret = adp8870_write(client, ADP8870_BLMX1, pdata->l1_daylight_max);
+	if (ret)
+		return ret;
+
+	ret = adp8870_write(client, ADP8870_BLDM1, pdata->l1_daylight_dim);
+	if (ret)
+		return ret;
+
+	if (pdata->en_ambl_sens) {
+		data->cached_daylight_max = pdata->l1_daylight_max;
+		ret = adp8870_write(client, ADP8870_BLMX2,
+						pdata->l2_bright_max);
+		if (ret)
+			return ret;
+		ret = adp8870_write(client, ADP8870_BLDM2,
+						pdata->l2_bright_dim);
+		if (ret)
+			return ret;
+
+		ret = adp8870_write(client, ADP8870_BLMX3,
+						pdata->l3_office_max);
+		if (ret)
+			return ret;
+		ret = adp8870_write(client, ADP8870_BLDM3,
+						pdata->l3_office_dim);
+		if (ret)
+			return ret;
+
+		ret = adp8870_write(client, ADP8870_BLMX4,
+						pdata->l4_indoor_max);
+		if (ret)
+			return ret;
+
+		ret = adp8870_write(client, ADP8870_BLDM4,
+						pdata->l4_indor_dim);
+		if (ret)
+			return ret;
+
+		ret = adp8870_write(client, ADP8870_BLMX5,
+						pdata->l5_dark_max);
+		if (ret)
+			return ret;
+
+		ret = adp8870_write(client, ADP8870_BLDM5,
+						pdata->l5_dark_dim);
+		if (ret)
+			return ret;
+
+		ret = adp8870_write(client, ADP8870_L2TRP, pdata->l2_trip);
+		if (ret)
+			return ret;
+
+		ret = adp8870_write(client, ADP8870_L2HYS, pdata->l2_hyst);
+		if (ret)
+			return ret;
+
+		ret = adp8870_write(client, ADP8870_L3TRP, pdata->l3_trip);
+		if (ret)
+			return ret;
+
+		ret = adp8870_write(client, ADP8870_L3HYS, pdata->l3_hyst);
+		if (ret)
+			return ret;
+
+		ret = adp8870_write(client, ADP8870_L4TRP, pdata->l4_trip);
+		if (ret)
+			return ret;
+
+		ret = adp8870_write(client, ADP8870_L4HYS, pdata->l4_hyst);
+		if (ret)
+			return ret;
+
+		ret = adp8870_write(client, ADP8870_L5TRP, pdata->l5_trip);
+		if (ret)
+			return ret;
+
+		ret = adp8870_write(client, ADP8870_L5HYS, pdata->l5_hyst);
+		if (ret)
+			return ret;
+
+		ret = adp8870_write(client, ADP8870_ALS1_EN, L5_EN | L4_EN |
+						L3_EN | L2_EN);
+		if (ret)
+			return ret;
+
+		ret = adp8870_write(client, ADP8870_CMP_CTL,
+			ALS_CMPR_CFG_VAL(pdata->abml_filt));
+		if (ret)
+			return ret;
+	}
+
+	ret = adp8870_write(client, ADP8870_CFGR,
+			BL_CFGR_VAL(pdata->bl_fade_law, 0));
+	if (ret)
+		return ret;
+
+	ret = adp8870_write(client, ADP8870_BLFR, FADE_VAL(pdata->bl_fade_in,
+			pdata->bl_fade_out));
+	if (ret)
+		return ret;
+	/*
+	 * ADP8870 Rev0 requires GDWN_DIS bit set
+	 */
+
+	ret = adp8870_set_bits(client, ADP8870_MDCR, BLEN | DIM_EN | NSTBY |
+			(data->revid == 0 ? GDWN_DIS : 0));
+
+	return ret;
+}
+
+static ssize_t adp8870_show(struct device *dev, char *buf, int reg)
+{
+	struct adp8870_bl *data = dev_get_drvdata(dev);
+	int error;
+	uint8_t reg_val;
+
+	mutex_lock(&data->lock);
+	error = adp8870_read(data->client, reg, &reg_val);
+	mutex_unlock(&data->lock);
+
+	if (error < 0)
+		return error;
+
+	return sprintf(buf, "%u\n", reg_val);
+}
+
+static ssize_t adp8870_store(struct device *dev, const char *buf,
+			 size_t count, int reg)
+{
+	struct adp8870_bl *data = dev_get_drvdata(dev);
+	unsigned long val;
+	int ret;
+
+	ret = strict_strtoul(buf, 10, &val);
+	if (ret)
+		return ret;
+
+	mutex_lock(&data->lock);
+	adp8870_write(data->client, reg, val);
+	mutex_unlock(&data->lock);
+
+	return count;
+}
+
+static ssize_t adp8870_bl_l5_dark_max_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	return adp8870_show(dev, buf, ADP8870_BLMX5);
+}
+
+static ssize_t adp8870_bl_l5_dark_max_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	return adp8870_store(dev, buf, count, ADP8870_BLMX5);
+}
+static DEVICE_ATTR(l5_dark_max, 0664, adp8870_bl_l5_dark_max_show,
+			adp8870_bl_l5_dark_max_store);
+
+
+static ssize_t adp8870_bl_l4_indoor_max_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	return adp8870_show(dev, buf, ADP8870_BLMX4);
+}
+
+static ssize_t adp8870_bl_l4_indoor_max_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	return adp8870_store(dev, buf, count, ADP8870_BLMX4);
+}
+static DEVICE_ATTR(l4_indoor_max, 0664, adp8870_bl_l4_indoor_max_show,
+			adp8870_bl_l4_indoor_max_store);
+
+
+static ssize_t adp8870_bl_l3_office_max_show(struct device *dev,
+				     struct device_attribute *attr, char *buf)
+{
+	return adp8870_show(dev, buf, ADP8870_BLMX3);
+}
+
+static ssize_t adp8870_bl_l3_office_max_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	return adp8870_store(dev, buf, count, ADP8870_BLMX3);
+}
+
+static DEVICE_ATTR(l3_office_max, 0664, adp8870_bl_l3_office_max_show,
+			adp8870_bl_l3_office_max_store);
+
+static ssize_t adp8870_bl_l2_bright_max_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	return adp8870_show(dev, buf, ADP8870_BLMX2);
+}
+
+static ssize_t adp8870_bl_l2_bright_max_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	return adp8870_store(dev, buf, count, ADP8870_BLMX2);
+}
+static DEVICE_ATTR(l2_bright_max, 0664, adp8870_bl_l2_bright_max_show,
+			adp8870_bl_l2_bright_max_store);
+
+static ssize_t adp8870_bl_l1_daylight_max_show(struct device *dev,
+			struct device_attribute *attr, char *buf)
+{
+	return adp8870_show(dev, buf, ADP8870_BLMX1);
+}
+
+static ssize_t adp8870_bl_l1_daylight_max_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct adp8870_bl *data = dev_get_drvdata(dev);
+	int ret = strict_strtoul(buf, 10, &data->cached_daylight_max);
+	if (ret)
+		return ret;
+
+	return adp8870_store(dev, buf, count, ADP8870_BLMX1);
+}
+static DEVICE_ATTR(l1_daylight_max, 0664, adp8870_bl_l1_daylight_max_show,
+			adp8870_bl_l1_daylight_max_store);
+
+static ssize_t adp8870_bl_l5_dark_dim_show(struct device *dev,
+			struct device_attribute *attr, char *buf)
+{
+	return adp8870_show(dev, buf, ADP8870_BLDM5);
+}
+
+static ssize_t adp8870_bl_l5_dark_dim_store(struct device *dev,
+				     struct device_attribute *attr,
+				     const char *buf, size_t count)
+{
+	return adp8870_store(dev, buf, count, ADP8870_BLDM5);
+}
+static DEVICE_ATTR(l5_dark_dim, 0664, adp8870_bl_l5_dark_dim_show,
+			adp8870_bl_l5_dark_dim_store);
+
+static ssize_t adp8870_bl_l4_indoor_dim_show(struct device *dev,
+			struct device_attribute *attr, char *buf)
+{
+	return adp8870_show(dev, buf, ADP8870_BLDM4);
+}
+
+static ssize_t adp8870_bl_l4_indoor_dim_store(struct device *dev,
+				     struct device_attribute *attr,
+				     const char *buf, size_t count)
+{
+	return adp8870_store(dev, buf, count, ADP8870_BLDM4);
+}
+static DEVICE_ATTR(l4_indoor_dim, 0664, adp8870_bl_l4_indoor_dim_show,
+			adp8870_bl_l4_indoor_dim_store);
+
+
+static ssize_t adp8870_bl_l3_office_dim_show(struct device *dev,
+			struct device_attribute *attr, char *buf)
+{
+	return adp8870_show(dev, buf, ADP8870_BLDM3);
+}
+
+static ssize_t adp8870_bl_l3_office_dim_store(struct device *dev,
+				     struct device_attribute *attr,
+				     const char *buf, size_t count)
+{
+	return adp8870_store(dev, buf, count, ADP8870_BLDM3);
+}
+static DEVICE_ATTR(l3_office_dim, 0664, adp8870_bl_l3_office_dim_show,
+			adp8870_bl_l3_office_dim_store);
+
+static ssize_t adp8870_bl_l2_bright_dim_show(struct device *dev,
+			struct device_attribute *attr, char *buf)
+{
+	return adp8870_show(dev, buf, ADP8870_BLDM2);
+}
+
+static ssize_t adp8870_bl_l2_bright_dim_store(struct device *dev,
+				     struct device_attribute *attr,
+				     const char *buf, size_t count)
+{
+	return adp8870_store(dev, buf, count, ADP8870_BLDM2);
+}
+static DEVICE_ATTR(l2_bright_dim, 0664, adp8870_bl_l2_bright_dim_show,
+			adp8870_bl_l2_bright_dim_store);
+
+static ssize_t adp8870_bl_l1_daylight_dim_show(struct device *dev,
+				     struct device_attribute *attr, char *buf)
+{
+	return adp8870_show(dev, buf, ADP8870_BLDM1);
+}
+
+static ssize_t adp8870_bl_l1_daylight_dim_store(struct device *dev,
+				     struct device_attribute *attr,
+				     const char *buf, size_t count)
+{
+	return adp8870_store(dev, buf, count, ADP8870_BLDM1);
+}
+static DEVICE_ATTR(l1_daylight_dim, 0664, adp8870_bl_l1_daylight_dim_show,
+			adp8870_bl_l1_daylight_dim_store);
+
+#ifdef ADP8870_EXT_FEATURES
+static ssize_t adp8870_bl_ambient_light_level_show(struct device *dev,
+				     struct device_attribute *attr, char *buf)
+{
+	struct adp8870_bl *data = dev_get_drvdata(dev);
+	int error;
+	uint8_t reg_val;
+	uint16_t ret_val;
+
+	mutex_lock(&data->lock);
+	error = adp8870_read(data->client, ADP8870_PH1LEVL, &reg_val);
+	if (error < 0) {
+		mutex_unlock(&data->lock);
+		return error;
+	}
+	ret_val = reg_val;
+	error = adp8870_read(data->client, ADP8870_PH1LEVH, &reg_val);
+	mutex_unlock(&data->lock);
+
+	if (error < 0)
+		return error;
+
+	/* Return 13-bit conversion value for the first light sensor */
+	ret_val += (reg_val & 0x1F) << 8;
+
+	return sprintf(buf, "%u\n", ret_val);
+}
+static DEVICE_ATTR(ambient_light_level, 0444,
+		adp8870_bl_ambient_light_level_show, NULL);
+
+static ssize_t adp8870_bl_ambient_light_zone_show(struct device *dev,
+				     struct device_attribute *attr, char *buf)
+{
+	struct adp8870_bl *data = dev_get_drvdata(dev);
+	int error;
+	uint8_t reg_val;
+
+	mutex_lock(&data->lock);
+	error = adp8870_read(data->client, ADP8870_CFGR, &reg_val);
+	mutex_unlock(&data->lock);
+
+	if (error < 0)
+		return error;
+
+	return sprintf(buf, "%u\n",
+		((reg_val >> CFGR_BLV_SHIFT) & CFGR_BLV_MASK) + 1);
+}
+
+static ssize_t adp8870_bl_ambient_light_zone_store(struct device *dev,
+				     struct device_attribute *attr,
+				     const char *buf, size_t count)
+{
+	struct adp8870_bl *data = dev_get_drvdata(dev);
+	unsigned long val;
+	uint8_t reg_val;
+	int ret;
+
+	ret = strict_strtoul(buf, 10, &val);
+	if (ret)
+		return ret;
+
+	if (val == 0) {
+		/* Enable automatic ambient light sensing */
+		adp8870_set_bits(data->client, ADP8870_MDCR, CMP_AUTOEN);
+	} else if ((val > 0) && (val < 6)) {
+		/* Disable automatic ambient light sensing */
+		adp8870_clr_bits(data->client, ADP8870_MDCR, CMP_AUTOEN);
+
+		/* Set user supplied ambient light zone */
+		mutex_lock(&data->lock);
+		adp8870_read(data->client, ADP8870_CFGR, &reg_val);
+		reg_val &= ~(CFGR_BLV_MASK << CFGR_BLV_SHIFT);
+		reg_val |= (val - 1) << CFGR_BLV_SHIFT;
+		adp8870_write(data->client, ADP8870_CFGR, reg_val);
+		mutex_unlock(&data->lock);
+	}
+
+	return count;
+}
+static DEVICE_ATTR(ambient_light_zone, 0664,
+		adp8870_bl_ambient_light_zone_show,
+		adp8870_bl_ambient_light_zone_store);
+#endif
+
+static struct attribute *adp8870_bl_attributes[] = {
+	&dev_attr_l5_dark_max.attr,
+	&dev_attr_l5_dark_dim.attr,
+	&dev_attr_l4_indoor_max.attr,
+	&dev_attr_l4_indoor_dim.attr,
+	&dev_attr_l3_office_max.attr,
+	&dev_attr_l3_office_dim.attr,
+	&dev_attr_l2_bright_max.attr,
+	&dev_attr_l2_bright_dim.attr,
+	&dev_attr_l1_daylight_max.attr,
+	&dev_attr_l1_daylight_dim.attr,
+#ifdef ADP8870_EXT_FEATURES
+	&dev_attr_ambient_light_level.attr,
+	&dev_attr_ambient_light_zone.attr,
+#endif
+	NULL
+};
+
+static const struct attribute_group adp8870_bl_attr_group = {
+	.attrs = adp8870_bl_attributes,
+};
+
+static int __devinit adp8870_probe(struct i2c_client *client,
+					const struct i2c_device_id *id)
+{
+	struct backlight_properties props;
+	struct backlight_device *bl;
+	struct adp8870_bl *data;
+	struct adp8870_backlight_platform_data *pdata =
+		client->dev.platform_data;
+	uint8_t reg_val;
+	int ret;
+
+	if (!i2c_check_functionality(client->adapter,
+					I2C_FUNC_SMBUS_BYTE_DATA)) {
+		dev_err(&client->dev, "SMBUS Byte Data not Supported\n");
+		return -EIO;
+	}
+
+	if (!pdata) {
+		dev_err(&client->dev, "no platform data?\n");
+		return -EINVAL;
+	}
+
+	ret = adp8870_read(client, ADP8870_MFDVID, &reg_val);
+	if (ret < 0)
+		return -EIO;
+
+	if (ADP8870_MANID(reg_val) != ADP8870_MANUFID) {
+		dev_err(&client->dev, "failed to probe\n");
+		return -ENODEV;
+	}
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (data == NULL)
+		return -ENOMEM;
+
+	data->revid = ADP8870_DEVID(reg_val);
+	data->client = client;
+	data->pdata = pdata;
+	data->id = id->driver_data;
+	data->current_brightness = 0;
+	i2c_set_clientdata(client, data);
+
+	mutex_init(&data->lock);
+
+	memset(&props, 0, sizeof(props));
+	props.max_brightness = props.brightness = ADP8870_MAX_BRIGHTNESS;
+	bl = backlight_device_register(dev_driver_string(&client->dev),
+			&client->dev, data, &adp8870_bl_ops, &props);
+	if (IS_ERR(bl)) {
+		dev_err(&client->dev, "failed to register backlight\n");
+		ret = PTR_ERR(bl);
+		goto out2;
+	}
+
+	data->bl = bl;
+
+	if (pdata->en_ambl_sens)
+		ret = sysfs_create_group(&bl->dev.kobj,
+			&adp8870_bl_attr_group);
+
+	if (ret) {
+		dev_err(&client->dev, "failed to register sysfs\n");
+		goto out1;
+	}
+
+	ret = adp8870_bl_setup(bl);
+	if (ret) {
+		ret = -EIO;
+		goto out;
+	}
+
+	backlight_update_status(bl);
+
+	dev_info(&client->dev, "Rev.%d Backlight\n", data->revid);
+
+	if (pdata->num_leds)
+		adp8870_led_probe(client);
+
+	return 0;
+
+out:
+	if (data->pdata->en_ambl_sens)
+		sysfs_remove_group(&data->bl->dev.kobj,
+			&adp8870_bl_attr_group);
+out1:
+	backlight_device_unregister(bl);
+out2:
+	i2c_set_clientdata(client, NULL);
+	kfree(data);
+
+	return ret;
+}
+
+static int __devexit adp8870_remove(struct i2c_client *client)
+{
+	struct adp8870_bl *data = i2c_get_clientdata(client);
+
+	adp8870_clr_bits(client, ADP8870_MDCR, NSTBY);
+
+	if (data->led)
+		adp8870_led_remove(client);
+
+	if (data->pdata->en_ambl_sens)
+		sysfs_remove_group(&data->bl->dev.kobj,
+			&adp8870_bl_attr_group);
+
+	backlight_device_unregister(data->bl);
+	i2c_set_clientdata(client, NULL);
+	kfree(data);
+
+	return 0;
+}
+
+#ifdef CONFIG_PM
+static int adp8870_i2c_suspend(struct i2c_client *client, pm_message_t message)
+{
+	adp8870_clr_bits(client, ADP8870_MDCR, NSTBY);
+
+	return 0;
+}
+
+static int adp8870_i2c_resume(struct i2c_client *client)
+{
+	adp8870_set_bits(client, ADP8870_MDCR, NSTBY);
+
+	return 0;
+}
+#else
+#define adp8870_i2c_suspend NULL
+#define adp8870_i2c_resume NULL
+#endif
+
+static const struct i2c_device_id adp8870_id[] = {
+	{ "adp8870", 0 },
+	{ }
+};
+MODULE_DEVICE_TABLE(i2c, adp8870_id);
+
+static struct i2c_driver adp8870_driver = {
+	.driver = {
+		.name = KBUILD_MODNAME,
+	},
+	.probe    = adp8870_probe,
+	.remove   = __devexit_p(adp8870_remove),
+	.suspend = adp8870_i2c_suspend,
+	.resume  = adp8870_i2c_resume,
+	.id_table = adp8870_id,
+};
+
+static int __init adp8870_init(void)
+{
+	return i2c_add_driver(&adp8870_driver);
+}
+module_init(adp8870_init);
+
+static void __exit adp8870_exit(void)
+{
+	i2c_del_driver(&adp8870_driver);
+}
+module_exit(adp8870_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Michael Hennerich <hennerich@blackfin.uclinux.org>");
+MODULE_DESCRIPTION("ADP8870 Backlight driver");
+MODULE_ALIAS("platform:adp8870-backlight");
diff --git a/include/linux/i2c/adp8870.h b/include/linux/i2c/adp8870.h
new file mode 100644
index 000000000000..624dceccbd5b
--- /dev/null
+++ b/include/linux/i2c/adp8870.h
@@ -0,0 +1,153 @@
+/*
+ * Definitions and platform data for Analog Devices
+ * Backlight drivers ADP8870
+ *
+ * Copyright 2009-2010 Analog Devices Inc.
+ *
+ * Licensed under the GPL-2 or later.
+ */
+
+#ifndef __LINUX_I2C_ADP8870_H
+#define __LINUX_I2C_ADP8870_H
+
+#define ID_ADP8870		8870
+
+#define ADP8870_MAX_BRIGHTNESS	0x7F
+#define FLAG_OFFT_SHIFT 8
+
+/*
+ * LEDs subdevice platform data
+ */
+
+#define ADP8870_LED_DIS_BLINK	(0 << FLAG_OFFT_SHIFT)
+#define ADP8870_LED_OFFT_600ms	(1 << FLAG_OFFT_SHIFT)
+#define ADP8870_LED_OFFT_1200ms	(2 << FLAG_OFFT_SHIFT)
+#define ADP8870_LED_OFFT_1800ms	(3 << FLAG_OFFT_SHIFT)
+
+#define ADP8870_LED_ONT_200ms	0
+#define ADP8870_LED_ONT_600ms	1
+#define ADP8870_LED_ONT_800ms	2
+#define ADP8870_LED_ONT_1200ms	3
+
+#define ADP8870_LED_D7		(7)
+#define ADP8870_LED_D6		(6)
+#define ADP8870_LED_D5		(5)
+#define ADP8870_LED_D4		(4)
+#define ADP8870_LED_D3		(3)
+#define ADP8870_LED_D2		(2)
+#define ADP8870_LED_D1		(1)
+
+/*
+ * Backlight subdevice platform data
+ */
+
+#define ADP8870_BL_D7		(1 << 6)
+#define ADP8870_BL_D6		(1 << 5)
+#define ADP8870_BL_D5		(1 << 4)
+#define ADP8870_BL_D4		(1 << 3)
+#define ADP8870_BL_D3		(1 << 2)
+#define ADP8870_BL_D2		(1 << 1)
+#define ADP8870_BL_D1		(1 << 0)
+
+#define ADP8870_FADE_T_DIS	0	/* Fade Timer Disabled */
+#define ADP8870_FADE_T_300ms	1	/* 0.3 Sec */
+#define ADP8870_FADE_T_600ms	2
+#define ADP8870_FADE_T_900ms	3
+#define ADP8870_FADE_T_1200ms	4
+#define ADP8870_FADE_T_1500ms	5
+#define ADP8870_FADE_T_1800ms	6
+#define ADP8870_FADE_T_2100ms	7
+#define ADP8870_FADE_T_2400ms	8
+#define ADP8870_FADE_T_2700ms	9
+#define ADP8870_FADE_T_3000ms	10
+#define ADP8870_FADE_T_3500ms	11
+#define ADP8870_FADE_T_4000ms	12
+#define ADP8870_FADE_T_4500ms	13
+#define ADP8870_FADE_T_5000ms	14
+#define ADP8870_FADE_T_5500ms	15	/* 5.5 Sec */
+
+#define ADP8870_FADE_LAW_LINEAR	0
+#define ADP8870_FADE_LAW_SQUARE	1
+#define ADP8870_FADE_LAW_CUBIC1	2
+#define ADP8870_FADE_LAW_CUBIC2	3
+
+#define ADP8870_BL_AMBL_FILT_80ms	0	/* Light sensor filter time */
+#define ADP8870_BL_AMBL_FILT_160ms	1
+#define ADP8870_BL_AMBL_FILT_320ms	2
+#define ADP8870_BL_AMBL_FILT_640ms	3
+#define ADP8870_BL_AMBL_FILT_1280ms	4
+#define ADP8870_BL_AMBL_FILT_2560ms	5
+#define ADP8870_BL_AMBL_FILT_5120ms	6
+#define ADP8870_BL_AMBL_FILT_10240ms	7	/* 10.24 sec */
+
+/*
+ * Blacklight current 0..30mA
+ */
+#define ADP8870_BL_CUR_mA(I)		((I * 127) / 30)
+
+/*
+ * L2 comparator current 0..1106uA
+ */
+#define ADP8870_L2_COMP_CURR_uA(I)	((I * 255) / 1106)
+
+/*
+ * L3 comparator current 0..551uA
+ */
+#define ADP8870_L3_COMP_CURR_uA(I)	((I * 255) / 551)
+
+/*
+ * L4 comparator current 0..275uA
+ */
+#define ADP8870_L4_COMP_CURR_uA(I)	((I * 255) / 275)
+
+/*
+ * L5 comparator current 0..138uA
+ */
+#define ADP8870_L5_COMP_CURR_uA(I)	((I * 255) / 138)
+
+struct adp8870_backlight_platform_data {
+	u8 bl_led_assign;	/* 1 = Backlight 0 = Individual LED */
+	u8 pwm_assign;		/* 1 = Enables PWM mode */
+
+	u8 bl_fade_in;		/* Backlight Fade-In Timer */
+	u8 bl_fade_out;		/* Backlight Fade-Out Timer */
+	u8 bl_fade_law;		/* fade-on/fade-off transfer characteristic */
+
+	u8 en_ambl_sens;	/* 1 = enable ambient light sensor */
+	u8 abml_filt;		/* Light sensor filter time */
+
+	u8 l1_daylight_max;	/* use BL_CUR_mA(I) 0 <= I <= 30 mA */
+	u8 l1_daylight_dim;	/* typ = 0, use BL_CUR_mA(I) 0 <= I <= 30 mA */
+	u8 l2_bright_max;	/* use BL_CUR_mA(I) 0 <= I <= 30 mA */
+	u8 l2_bright_dim;	/* typ = 0, use BL_CUR_mA(I) 0 <= I <= 30 mA */
+	u8 l3_office_max;	/* use BL_CUR_mA(I) 0 <= I <= 30 mA */
+	u8 l3_office_dim;	/* typ = 0, use BL_CUR_mA(I) 0 <= I <= 30 mA */
+	u8 l4_indoor_max;	/* use BL_CUR_mA(I) 0 <= I <= 30 mA */
+	u8 l4_indor_dim;	/* typ = 0, use BL_CUR_mA(I) 0 <= I <= 30 mA */
+	u8 l5_dark_max;		/* use BL_CUR_mA(I) 0 <= I <= 30 mA */
+	u8 l5_dark_dim;		/* typ = 0, use BL_CUR_mA(I) 0 <= I <= 30 mA */
+
+	u8 l2_trip;		/* use L2_COMP_CURR_uA(I) 0 <= I <= 1106 uA */
+	u8 l2_hyst;		/* use L2_COMP_CURR_uA(I) 0 <= I <= 1106 uA */
+	u8 l3_trip;		/* use L3_COMP_CURR_uA(I) 0 <= I <= 551 uA */
+	u8 l3_hyst;		/* use L3_COMP_CURR_uA(I) 0 <= I <= 551 uA */
+	u8 l4_trip;		/* use L4_COMP_CURR_uA(I) 0 <= I <= 275 uA */
+	u8 l4_hyst;		/* use L4_COMP_CURR_uA(I) 0 <= I <= 275 uA */
+	u8 l5_trip;		/* use L5_COMP_CURR_uA(I) 0 <= I <= 138 uA */
+	u8 l5_hyst;		/* use L6_COMP_CURR_uA(I) 0 <= I <= 138 uA */
+
+	/**
+	 * Independent Current Sinks / LEDS
+	 * Sinks not assigned to the Backlight can be exposed to
+	 * user space using the LEDS CLASS interface
+	 */
+
+	int num_leds;
+	struct led_info	*leds;
+	u8 led_fade_in;		/* LED Fade-In Timer */
+	u8 led_fade_out;	/* LED Fade-Out Timer */
+	u8 led_fade_law;	/* fade-on/fade-off transfer characteristic */
+	u8 led_on_time;
+};
+
+#endif /* __LINUX_I2C_ADP8870_H */
-- 
cgit v1.2.3


From a433658c30974fc87ba3ff52d7e4e6299762aa3d Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Wed, 15 Jun 2011 15:08:13 -0700
Subject: vmscan,memcg: memcg aware swap token

Currently, memcg reclaim can disable swap token even if the swap token mm
doesn't belong in its memory cgroup.  It's slightly risky.  If an admin
creates very small mem-cgroup and silly guy runs contentious heavy memory
pressure workload, every tasks are going to lose swap token and then
system may become unresponsive.  That's bad.

This patch adds 'memcg' parameter into disable_swap_token().  and if the
parameter doesn't match swap token, VM doesn't disable it.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Rik van Riel<riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h |  6 ++++
 include/linux/swap.h       |  8 ++---
 mm/memcontrol.c            | 16 ++++-----
 mm/thrash.c                | 87 ++++++++++++++++++++++++++++++++++++----------
 mm/vmscan.c                |  4 +--
 5 files changed, 85 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 9724a38ee69d..50940da6adf3 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -84,6 +84,7 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem);
 
 extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page);
 extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
+extern struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm);
 
 static inline
 int mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *cgroup)
@@ -246,6 +247,11 @@ static inline struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
 	return NULL;
 }
 
+static inline struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
+{
+	return NULL;
+}
+
 static inline int mm_match_cgroup(struct mm_struct *mm, struct mem_cgroup *mem)
 {
 	return 1;
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 384eb5fe530b..e70564647039 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -358,6 +358,7 @@ struct backing_dev_info;
 extern struct mm_struct *swap_token_mm;
 extern void grab_swap_token(struct mm_struct *);
 extern void __put_swap_token(struct mm_struct *);
+extern void disable_swap_token(struct mem_cgroup *memcg);
 
 static inline int has_swap_token(struct mm_struct *mm)
 {
@@ -370,11 +371,6 @@ static inline void put_swap_token(struct mm_struct *mm)
 		__put_swap_token(mm);
 }
 
-static inline void disable_swap_token(void)
-{
-	put_swap_token(swap_token_mm);
-}
-
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
 extern void
 mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout);
@@ -500,7 +496,7 @@ static inline int has_swap_token(struct mm_struct *mm)
 	return 0;
 }
 
-static inline void disable_swap_token(void)
+static inline void disable_swap_token(struct mem_cgroup *memcg)
 {
 }
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index bd9052a5d3ad..e37c44d88d46 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -735,7 +735,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 				struct mem_cgroup, css);
 }
 
-static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
+struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
 {
 	struct mem_cgroup *mem = NULL;
 
@@ -5414,18 +5414,16 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
 				struct cgroup *old_cont,
 				struct task_struct *p)
 {
-	struct mm_struct *mm;
+	struct mm_struct *mm = get_task_mm(p);
 
-	if (!mc.to)
-		/* no need to move charge */
-		return;
-
-	mm = get_task_mm(p);
 	if (mm) {
-		mem_cgroup_move_charge(mm);
+		if (mc.to)
+			mem_cgroup_move_charge(mm);
+		put_swap_token(mm);
 		mmput(mm);
 	}
-	mem_cgroup_clear_mc();
+	if (mc.to)
+		mem_cgroup_clear_mc();
 }
 #else	/* !CONFIG_MMU */
 static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
diff --git a/mm/thrash.c b/mm/thrash.c
index 2372d4ed5dd8..6cdf86511385 100644
--- a/mm/thrash.c
+++ b/mm/thrash.c
@@ -21,11 +21,31 @@
 #include <linux/mm.h>
 #include <linux/sched.h>
 #include <linux/swap.h>
+#include <linux/memcontrol.h>
 
 static DEFINE_SPINLOCK(swap_token_lock);
 struct mm_struct *swap_token_mm;
+struct mem_cgroup *swap_token_memcg;
 static unsigned int global_faults;
 
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm)
+{
+	struct mem_cgroup *memcg;
+
+	memcg = try_get_mem_cgroup_from_mm(mm);
+	if (memcg)
+		css_put(mem_cgroup_css(memcg));
+
+	return memcg;
+}
+#else
+static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm)
+{
+	return NULL;
+}
+#endif
+
 void grab_swap_token(struct mm_struct *mm)
 {
 	int current_interval;
@@ -38,40 +58,69 @@ void grab_swap_token(struct mm_struct *mm)
 		return;
 
 	/* First come first served */
-	if (swap_token_mm == NULL) {
-		mm->token_priority = mm->token_priority + 2;
-		swap_token_mm = mm;
+	if (!swap_token_mm)
+		goto replace_token;
+
+	if (mm == swap_token_mm) {
+		mm->token_priority += 2;
 		goto out;
 	}
 
-	if (mm != swap_token_mm) {
-		if (current_interval < mm->last_interval)
-			mm->token_priority++;
-		else {
-			if (likely(mm->token_priority > 0))
-				mm->token_priority--;
-		}
-		/* Check if we deserve the token */
-		if (mm->token_priority > swap_token_mm->token_priority) {
-			mm->token_priority += 2;
-			swap_token_mm = mm;
-		}
-	} else {
-		/* Token holder came in again! */
-		mm->token_priority += 2;
+	if (current_interval < mm->last_interval)
+		mm->token_priority++;
+	else {
+		if (likely(mm->token_priority > 0))
+			mm->token_priority--;
 	}
 
+	/* Check if we deserve the token */
+	if (mm->token_priority > swap_token_mm->token_priority)
+		goto replace_token;
+
 out:
 	mm->faultstamp = global_faults;
 	mm->last_interval = current_interval;
 	spin_unlock(&swap_token_lock);
+	return;
+
+replace_token:
+	mm->token_priority += 2;
+	swap_token_mm = mm;
+	swap_token_memcg = swap_token_memcg_from_mm(mm);
+	goto out;
 }
 
 /* Called on process exit. */
 void __put_swap_token(struct mm_struct *mm)
 {
 	spin_lock(&swap_token_lock);
-	if (likely(mm == swap_token_mm))
+	if (likely(mm == swap_token_mm)) {
 		swap_token_mm = NULL;
+		swap_token_memcg = NULL;
+	}
 	spin_unlock(&swap_token_lock);
 }
+
+static bool match_memcg(struct mem_cgroup *a, struct mem_cgroup *b)
+{
+	if (!a)
+		return true;
+	if (!b)
+		return true;
+	if (a == b)
+		return true;
+	return false;
+}
+
+void disable_swap_token(struct mem_cgroup *memcg)
+{
+	/* memcg reclaim don't disable unrelated mm token. */
+	if (match_memcg(memcg, swap_token_memcg)) {
+		spin_lock(&swap_token_lock);
+		if (match_memcg(memcg, swap_token_memcg)) {
+			swap_token_mm = NULL;
+			swap_token_memcg = NULL;
+		}
+		spin_unlock(&swap_token_lock);
+	}
+}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index faa0a088f9cc..dbe6ea321df4 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2081,7 +2081,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
 		sc->nr_scanned = 0;
 		if (!priority)
-			disable_swap_token();
+			disable_swap_token(sc->mem_cgroup);
 		total_scanned += shrink_zones(priority, zonelist, sc);
 		/*
 		 * Don't shrink slabs when reclaiming memory from
@@ -2407,7 +2407,7 @@ loop_again:
 
 		/* The swap token gets in the way of swapout... */
 		if (!priority)
-			disable_swap_token();
+			disable_swap_token(NULL);
 
 		all_zones_ok = 1;
 		balanced = 0;
-- 
cgit v1.2.3


From ac5622418bbff9cd3dc607aa57dfb4f62a7f2043 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Wed, 15 Jun 2011 15:08:17 -0700
Subject: kmsg_dump.h: fix build when CONFIG_PRINTK is disabled

Fix <linux/kmsg_dump.h> when CONFIG_PRINTK is not enabled:

  include/linux/kmsg_dump.h:56: error: 'EINVAL' undeclared (first use in this function)
  include/linux/kmsg_dump.h:61: error: 'EINVAL' undeclared (first use in this function)

Looks like commit 595dd3d8bf95 ("kmsg_dump: fix build for
CONFIG_PRINTK=n") uses EINVAL without having the needed header file(s),
but I'm sure that I build tested that patch also.  oh well.

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kmsg_dump.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/kmsg_dump.h b/include/linux/kmsg_dump.h
index 2a0d7d651dc3..ee0c952188de 100644
--- a/include/linux/kmsg_dump.h
+++ b/include/linux/kmsg_dump.h
@@ -12,6 +12,7 @@
 #ifndef _LINUX_KMSG_DUMP_H
 #define _LINUX_KMSG_DUMP_H
 
+#include <linux/errno.h>
 #include <linux/list.h>
 
 enum kmsg_dump_reason {
-- 
cgit v1.2.3


From 32e45ff43eaf5c17f5a82c9ad358d515622c2562 Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Wed, 15 Jun 2011 15:08:20 -0700
Subject: mm: increase RECLAIM_DISTANCE to 30

Recently, Robert Mueller reported (http://lkml.org/lkml/2010/9/12/236)
that zone_reclaim_mode doesn't work properly on his new NUMA server (Dual
Xeon E5520 + Intel S5520UR MB).  He is using Cyrus IMAPd and it's built on
a very traditional single-process model.

  * a master process which reads config files and manages the other
    process
  * multiple imapd processes, one per connection
  * multiple pop3d processes, one per connection
  * multiple lmtpd processes, one per connection
  * periodical "cleanup" processes.

There are thousands of independent processes.  The problem is, recent
Intel motherboard turn on zone_reclaim_mode by default and traditional
prefork model software don't work well on it.  Unfortunatelly, such models
are still typical even in the 21st century.  We can't ignore them.

This patch raises the zone_reclaim_mode threshold to 30.  30 doesn't have
any specific meaning.  but 20 means that one-hop QPI/Hypertransport and
such relatively cheap 2-4 socket machine are often used for traditional
servers as above.  The intention is that these machines don't use
zone_reclaim_mode.

Note: ia64 and Power have arch specific RECLAIM_DISTANCE definitions.
This patch doesn't change such high-end NUMA machine behavior.

Dave Hansen said:

: I know specifically of pieces of x86 hardware that set the information
: in the BIOS to '21' *specifically* so they'll get the zone_reclaim_mode
: behavior which that implies.
:
: They've done performance testing and run very large and scary benchmarks
: to make sure that they _want_ this turned on.  What this means for them
: is that they'll probably be de-optimized, at least on newer versions of
: the kernel.
:
: If you want to do this for particular systems, maybe _that_'s what we
: should do.  Have a list of specific configurations that need the
: defaults overridden either because they're buggy, or they have an
: unusual hardware configuration not really reflected in the distance
: table.

And later said:

: The original change in the hardware tables was for the benefit of a
: benchmark.  Said benchmark isn't going to get run on mainline until the
: next batch of enterprise distros drops, at which point the hardware where
: this was done will be irrelevant for the benchmark.  I'm sure any new
: hardware will just set this distance to another yet arbitrary value to
: make the kernel do what it wants.  :)
:
: Also, when the hardware got _set_ to this initially, I complained.  So, I
: guess I'm getting my way now, with this patch.  I'm cool with it.

Reported-by: Robert Mueller <robm@fastmail.fm>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Acked-by: Christoph Lameter <cl@linux.com>
Acked-by: David Rientjes <rientjes@google.com>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: "Luck, Tony" <tony.luck@intel.com>
Acked-by: Dave Hansen <dave@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/topology.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/topology.h b/include/linux/topology.h
index b91a40e847d2..fc839bfa7935 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -60,7 +60,7 @@ int arch_update_cpu_topology(void);
  * (in whatever arch specific measurement units returned by node_distance())
  * then switch on zone reclaim on boot.
  */
-#define RECLAIM_DISTANCE 20
+#define RECLAIM_DISTANCE 30
 #endif
 #ifndef PENALTY_FOR_NODE_WITH_CPUS
 #define PENALTY_FOR_NODE_WITH_CPUS	(1)
-- 
cgit v1.2.3


From ca39599c633fb02aceac31a7e67563612e4fe347 Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <linux@treblig.org>
Date: Wed, 15 Jun 2011 15:08:27 -0700
Subject: BUILD_BUG_ON_ZERO: fix sparse breakage

BUILD_BUG_ON_ZERO and BUILD_BUG_ON_NULL must return values, even in the
CHECKER case otherwise various users of it become syntactically invalid.

Signed-off-by: Dr. David Alan Gilbert <linux@treblig.org>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kernel.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index fb0e7329fee1..953352a88336 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -671,8 +671,8 @@ struct sysinfo {
 
 #ifdef __CHECKER__
 #define BUILD_BUG_ON_NOT_POWER_OF_2(n)
-#define BUILD_BUG_ON_ZERO(e)
-#define BUILD_BUG_ON_NULL(e)
+#define BUILD_BUG_ON_ZERO(e) (0)
+#define BUILD_BUG_ON_NULL(e) ((void*)0)
 #define BUILD_BUG_ON(condition)
 #else /* __CHECKER__ */
 
-- 
cgit v1.2.3


From bd5dc17be87b3a3073d50b23802647db3ae3fa8e Mon Sep 17 00:00:00 2001
From: Josh Triplett <josh@joshtriplett.org>
Date: Wed, 15 Jun 2011 15:08:28 -0700
Subject: uts: make default hostname configurable, rather than always using
 "(none)"

The "hostname" tool falls back to setting the hostname to "localhost" if
/etc/hostname does not exist.  Distribution init scripts have the same
fallback.  However, if userspace never calls sethostname, such as when
booting with init=/bin/sh, or otherwise booting a minimal system without
the usual init scripts, the default hostname of "(none)" remains,
unhelpfully appearing in various places such as prompts ("root@(none):~#")
and logs.  Furthermore, "(none)" doesn't typically resolve to anything
useful.

Make the default hostname configurable.  This removes the need for the
standard fallback, provides a useful default for systems that never call
sethostname, and makes minimal systems that much more useful with less
configuration.  Distributions could choose to use "localhost" here to
avoid the fallback, while embedded systems may wish to use a specific
target hostname.

Signed-off-by: Josh Triplett <josh@joshtriplett.org>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Acked-by: David Miller <davem@davemloft.net>
Cc: Serge Hallyn <serue@us.ibm.com>
Cc: Kel Modderman <kel@otaku42.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/uts.h | 2 +-
 init/Kconfig        | 9 +++++++++
 2 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/uts.h b/include/linux/uts.h
index 73eb1ed36ec4..6ddbd86377de 100644
--- a/include/linux/uts.h
+++ b/include/linux/uts.h
@@ -9,7 +9,7 @@
 #endif
 
 #ifndef UTS_NODENAME
-#define UTS_NODENAME "(none)"	/* set by sethostname() */
+#define UTS_NODENAME CONFIG_DEFAULT_HOSTNAME /* set by sethostname() */
 #endif
 
 #ifndef UTS_DOMAINNAME
diff --git a/init/Kconfig b/init/Kconfig
index ebafac4231ee..e0a22e42d39a 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -204,6 +204,15 @@ config KERNEL_LZO
 
 endchoice
 
+config DEFAULT_HOSTNAME
+	string "Default hostname"
+	default "(none)"
+	help
+	  This option determines the default system hostname before userspace
+	  calls sethostname(2). The kernel traditionally uses "(none)" here,
+	  but you may wish to use a different default here to make a minimal
+	  system more usable with less configuration.
+
 config SWAP
 	bool "Support for paging of anonymous memory (swap)"
 	depends on MMU && BLOCK
-- 
cgit v1.2.3


From c001fb72a7b705f902bdfdd05b5d2408efe6f848 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Tue, 14 Jun 2011 17:05:11 -0700
Subject: gpio: add GPIOF_ values regardless on kconfig settings

Make GPIOF_ defined values available even when GPIOLIB nor GENERIC_GPIO
is enabled by moving them to <linux/gpio.h>.

Fixes these build errors in linux-next:
sound/soc/codecs/ak4641.c:524: error: 'GPIOF_OUT_INIT_LOW' undeclared (first use in this function)
sound/soc/codecs/wm8915.c:2921: error: 'GPIOF_OUT_INIT_LOW' undeclared (first use in this function)

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
---
 include/asm-generic/gpio.h | 10 ----------
 include/linux/gpio.h       | 11 +++++++++++
 2 files changed, 11 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/asm-generic/gpio.h b/include/asm-generic/gpio.h
index fcdcb5d5c995..d494001b1226 100644
--- a/include/asm-generic/gpio.h
+++ b/include/asm-generic/gpio.h
@@ -170,16 +170,6 @@ extern int __gpio_cansleep(unsigned gpio);
 
 extern int __gpio_to_irq(unsigned gpio);
 
-#define GPIOF_DIR_OUT	(0 << 0)
-#define GPIOF_DIR_IN	(1 << 0)
-
-#define GPIOF_INIT_LOW	(0 << 1)
-#define GPIOF_INIT_HIGH	(1 << 1)
-
-#define GPIOF_IN		(GPIOF_DIR_IN)
-#define GPIOF_OUT_INIT_LOW	(GPIOF_DIR_OUT | GPIOF_INIT_LOW)
-#define GPIOF_OUT_INIT_HIGH	(GPIOF_DIR_OUT | GPIOF_INIT_HIGH)
-
 /**
  * struct gpio - a structure describing a GPIO with configuration
  * @gpio:	the GPIO number
diff --git a/include/linux/gpio.h b/include/linux/gpio.h
index 32d47e710661..17b5a0d80e42 100644
--- a/include/linux/gpio.h
+++ b/include/linux/gpio.h
@@ -3,6 +3,17 @@
 
 /* see Documentation/gpio.txt */
 
+/* make these flag values available regardless of GPIO kconfig options */
+#define GPIOF_DIR_OUT	(0 << 0)
+#define GPIOF_DIR_IN	(1 << 0)
+
+#define GPIOF_INIT_LOW	(0 << 1)
+#define GPIOF_INIT_HIGH	(1 << 1)
+
+#define GPIOF_IN		(GPIOF_DIR_IN)
+#define GPIOF_OUT_INIT_LOW	(GPIOF_DIR_OUT | GPIOF_INIT_LOW)
+#define GPIOF_OUT_INIT_HIGH	(GPIOF_DIR_OUT | GPIOF_INIT_HIGH)
+
 #ifdef CONFIG_GENERIC_GPIO
 #include <asm/gpio.h>
 
-- 
cgit v1.2.3


From b5199515c25cca622495eb9c6a8a1d275e775088 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 16 Jun 2011 16:22:08 +0200
Subject: clocksource: Make watchdog robust vs. interruption

The clocksource watchdog code is interruptible and it has been
observed that this can trigger false positives which disable the TSC.

The reason is that an interrupt storm or a long running interrupt
handler between the read of the watchdog source and the read of the
TSC brings the two far enough apart that the delta is larger than the
unstable treshold. Move both reads into a short interrupt disabled
region to avoid that.

Reported-and-tested-by: Vernon Mauery <vernux@us.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@kernel.org
---
 include/linux/clocksource.h |  1 +
 kernel/time/clocksource.c   | 24 +++++++++++++-----------
 2 files changed, 14 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index d4646b48dc4a..18a1baf31f2d 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -188,6 +188,7 @@ struct clocksource {
 #ifdef CONFIG_CLOCKSOURCE_WATCHDOG
 	/* Watchdog related data, used by the framework */
 	struct list_head wd_list;
+	cycle_t cs_last;
 	cycle_t wd_last;
 #endif
 } ____cacheline_aligned;
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 1c95fd677328..e0980f0d9a0a 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -185,7 +185,6 @@ static struct clocksource *watchdog;
 static struct timer_list watchdog_timer;
 static DECLARE_WORK(watchdog_work, clocksource_watchdog_work);
 static DEFINE_SPINLOCK(watchdog_lock);
-static cycle_t watchdog_last;
 static int watchdog_running;
 
 static int clocksource_watchdog_kthread(void *data);
@@ -254,11 +253,6 @@ static void clocksource_watchdog(unsigned long data)
 	if (!watchdog_running)
 		goto out;
 
-	wdnow = watchdog->read(watchdog);
-	wd_nsec = clocksource_cyc2ns((wdnow - watchdog_last) & watchdog->mask,
-				     watchdog->mult, watchdog->shift);
-	watchdog_last = wdnow;
-
 	list_for_each_entry(cs, &watchdog_list, wd_list) {
 
 		/* Clocksource already marked unstable? */
@@ -268,19 +262,28 @@ static void clocksource_watchdog(unsigned long data)
 			continue;
 		}
 
+		local_irq_disable();
 		csnow = cs->read(cs);
+		wdnow = watchdog->read(watchdog);
+		local_irq_enable();
 
 		/* Clocksource initialized ? */
 		if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) {
 			cs->flags |= CLOCK_SOURCE_WATCHDOG;
-			cs->wd_last = csnow;
+			cs->wd_last = wdnow;
+			cs->cs_last = csnow;
 			continue;
 		}
 
-		/* Check the deviation from the watchdog clocksource. */
-		cs_nsec = clocksource_cyc2ns((csnow - cs->wd_last) &
+		wd_nsec = clocksource_cyc2ns((wdnow - cs->wd_last) & watchdog->mask,
+					     watchdog->mult, watchdog->shift);
+
+		cs_nsec = clocksource_cyc2ns((csnow - cs->cs_last) &
 					     cs->mask, cs->mult, cs->shift);
-		cs->wd_last = csnow;
+		cs->cs_last = csnow;
+		cs->wd_last = wdnow;
+
+		/* Check the deviation from the watchdog clocksource. */
 		if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) {
 			clocksource_unstable(cs, cs_nsec - wd_nsec);
 			continue;
@@ -318,7 +321,6 @@ static inline void clocksource_start_watchdog(void)
 		return;
 	init_timer(&watchdog_timer);
 	watchdog_timer.function = clocksource_watchdog;
-	watchdog_last = watchdog->read(watchdog);
 	watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
 	add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask));
 	watchdog_running = 1;
-- 
cgit v1.2.3


From c16d51a32bbb61ac8fd96f78b5ce2fccfe0fb4c3 Mon Sep 17 00:00:00 2001
From: Shreshtha Kumar Sahu <shreshthakumar.sahu@stericsson.com>
Date: Mon, 13 Jun 2011 10:11:33 +0200
Subject: amba pl011: workaround for uart registers lockup

This workaround aims to break the deadlock situation
which raises during continuous transfer of data for long
duration over uart with hardware flow control. It is
observed that CTS interrupt cannot be cleared in uart
interrupt register (ICR). Hence further transfer over
uart gets blocked.

It is seen that during such deadlock condition ICR
don't get cleared even on multiple write. This leads
pass_counter to decrease and finally reach zero. This
can be taken as trigger point to run this UART_BT_WA.

Workaround backups the register configuration, does soft
reset of UART using BIT-0 of PRCC_K_SOFTRST_SET/CLEAR
registers and restores the registers.

This patch also provides support for uart init and exit
function calls if present.

Signed-off-by: Shreshtha Kumar Sahu <shreshthakumar.sahu@stericsson.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/tty/serial/amba-pl011.c | 123 +++++++++++++++++++++++++++++++++++++++-
 include/linux/amba/serial.h     |   3 +
 2 files changed, 125 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c
index 8dc0541feecc..f5f6831b0a64 100644
--- a/drivers/tty/serial/amba-pl011.c
+++ b/drivers/tty/serial/amba-pl011.c
@@ -50,6 +50,7 @@
 #include <linux/dmaengine.h>
 #include <linux/dma-mapping.h>
 #include <linux/scatterlist.h>
+#include <linux/delay.h>
 
 #include <asm/io.h>
 #include <asm/sizes.h>
@@ -65,6 +66,30 @@
 #define UART_DR_ERROR		(UART011_DR_OE|UART011_DR_BE|UART011_DR_PE|UART011_DR_FE)
 #define UART_DUMMY_DR_RX	(1 << 16)
 
+
+#define UART_WA_SAVE_NR 14
+
+static void pl011_lockup_wa(unsigned long data);
+static const u32 uart_wa_reg[UART_WA_SAVE_NR] = {
+	ST_UART011_DMAWM,
+	ST_UART011_TIMEOUT,
+	ST_UART011_LCRH_RX,
+	UART011_IBRD,
+	UART011_FBRD,
+	ST_UART011_LCRH_TX,
+	UART011_IFLS,
+	ST_UART011_XFCR,
+	ST_UART011_XON1,
+	ST_UART011_XON2,
+	ST_UART011_XOFF1,
+	ST_UART011_XOFF2,
+	UART011_CR,
+	UART011_IMSC
+};
+
+static u32 uart_wa_regdata[UART_WA_SAVE_NR];
+static DECLARE_TASKLET(pl011_lockup_tlet, pl011_lockup_wa, 0);
+
 /* There is by now at least one vendor with differing details, so handle it */
 struct vendor_data {
 	unsigned int		ifls;
@@ -72,6 +97,7 @@ struct vendor_data {
 	unsigned int		lcrh_tx;
 	unsigned int		lcrh_rx;
 	bool			oversampling;
+	bool			interrupt_may_hang;   /* vendor-specific */
 	bool			dma_threshold;
 };
 
@@ -90,9 +116,12 @@ static struct vendor_data vendor_st = {
 	.lcrh_tx		= ST_UART011_LCRH_TX,
 	.lcrh_rx		= ST_UART011_LCRH_RX,
 	.oversampling		= true,
+	.interrupt_may_hang	= true,
 	.dma_threshold		= true,
 };
 
+static struct uart_amba_port *amba_ports[UART_NR];
+
 /* Deals with DMA transactions */
 
 struct pl011_sgbuf {
@@ -132,6 +161,7 @@ struct uart_amba_port {
 	unsigned int		lcrh_rx;	/* vendor-specific */
 	bool			autorts;
 	char			type[12];
+	bool			interrupt_may_hang; /* vendor-specific */
 #ifdef CONFIG_DMA_ENGINE
 	/* DMA stuff */
 	bool			using_tx_dma;
@@ -1008,6 +1038,68 @@ static inline bool pl011_dma_rx_running(struct uart_amba_port *uap)
 #endif
 
 
+/*
+ * pl011_lockup_wa
+ * This workaround aims to break the deadlock situation
+ * when after long transfer over uart in hardware flow
+ * control, uart interrupt registers cannot be cleared.
+ * Hence uart transfer gets blocked.
+ *
+ * It is seen that during such deadlock condition ICR
+ * don't get cleared even on multiple write. This leads
+ * pass_counter to decrease and finally reach zero. This
+ * can be taken as trigger point to run this UART_BT_WA.
+ *
+ */
+static void pl011_lockup_wa(unsigned long data)
+{
+	struct uart_amba_port *uap = amba_ports[0];
+	void __iomem *base = uap->port.membase;
+	struct circ_buf *xmit = &uap->port.state->xmit;
+	struct tty_struct *tty = uap->port.state->port.tty;
+	int buf_empty_retries = 200;
+	int loop;
+
+	/* Stop HCI layer from submitting data for tx */
+	tty->hw_stopped = 1;
+	while (!uart_circ_empty(xmit)) {
+		if (buf_empty_retries-- == 0)
+			break;
+		udelay(100);
+	}
+
+	/* Backup registers */
+	for (loop = 0; loop < UART_WA_SAVE_NR; loop++)
+		uart_wa_regdata[loop] = readl(base + uart_wa_reg[loop]);
+
+	/* Disable UART so that FIFO data is flushed out */
+	writew(0x00, uap->port.membase + UART011_CR);
+
+	/* Soft reset UART module */
+	if (uap->port.dev->platform_data) {
+		struct amba_pl011_data *plat;
+
+		plat = uap->port.dev->platform_data;
+		if (plat->reset)
+			plat->reset();
+	}
+
+	/* Restore registers */
+	for (loop = 0; loop < UART_WA_SAVE_NR; loop++)
+		writew(uart_wa_regdata[loop] ,
+				uap->port.membase + uart_wa_reg[loop]);
+
+	/* Initialise the old status of the modem signals */
+	uap->old_status = readw(uap->port.membase + UART01x_FR) &
+		UART01x_FR_MODEM_ANY;
+
+	if (readl(base + UART011_MIS) & 0x2)
+		printk(KERN_EMERG "UART_BT_WA: ***FAILED***\n");
+
+	/* Start Tx/Rx */
+	tty->hw_stopped = 0;
+}
+
 static void pl011_stop_tx(struct uart_port *port)
 {
 	struct uart_amba_port *uap = (struct uart_amba_port *)port;
@@ -1158,8 +1250,11 @@ static irqreturn_t pl011_int(int irq, void *dev_id)
 			if (status & UART011_TXIS)
 				pl011_tx_chars(uap);
 
-			if (pass_counter-- == 0)
+			if (pass_counter-- == 0) {
+				if (uap->interrupt_may_hang)
+					tasklet_schedule(&pl011_lockup_tlet);
 				break;
+			}
 
 			status = readw(uap->port.membase + UART011_MIS);
 		} while (status != 0);
@@ -1339,6 +1434,14 @@ static int pl011_startup(struct uart_port *port)
 	writew(uap->im, uap->port.membase + UART011_IMSC);
 	spin_unlock_irq(&uap->port.lock);
 
+	if (uap->port.dev->platform_data) {
+		struct amba_pl011_data *plat;
+
+		plat = uap->port.dev->platform_data;
+		if (plat->init)
+			plat->init();
+	}
+
 	return 0;
 
  clk_dis:
@@ -1394,6 +1497,15 @@ static void pl011_shutdown(struct uart_port *port)
 	 * Shut down the clock producer
 	 */
 	clk_disable(uap->clk);
+
+	if (uap->port.dev->platform_data) {
+		struct amba_pl011_data *plat;
+
+		plat = uap->port.dev->platform_data;
+		if (plat->exit)
+			plat->exit();
+	}
+
 }
 
 static void
@@ -1700,6 +1812,14 @@ static int __init pl011_console_setup(struct console *co, char *options)
 	if (!uap)
 		return -ENODEV;
 
+	if (uap->port.dev->platform_data) {
+		struct amba_pl011_data *plat;
+
+		plat = uap->port.dev->platform_data;
+		if (plat->init)
+			plat->init();
+	}
+
 	uap->port.uartclk = clk_get_rate(uap->clk);
 
 	if (options)
@@ -1774,6 +1894,7 @@ static int pl011_probe(struct amba_device *dev, const struct amba_id *id)
 	uap->lcrh_rx = vendor->lcrh_rx;
 	uap->lcrh_tx = vendor->lcrh_tx;
 	uap->fifosize = vendor->fifosize;
+	uap->interrupt_may_hang = vendor->interrupt_may_hang;
 	uap->port.dev = &dev->dev;
 	uap->port.mapbase = dev->res.start;
 	uap->port.membase = base;
diff --git a/include/linux/amba/serial.h b/include/linux/amba/serial.h
index 5479fdc849e9..514ed45c462e 100644
--- a/include/linux/amba/serial.h
+++ b/include/linux/amba/serial.h
@@ -201,6 +201,9 @@ struct amba_pl011_data {
 	bool (*dma_filter)(struct dma_chan *chan, void *filter_param);
 	void *dma_rx_param;
 	void *dma_tx_param;
+        void (*init) (void);
+	void (*exit) (void);
+	void (*reset) (void);
 };
 #endif
 
-- 
cgit v1.2.3


From d8ad7d1123a960cc9f276bd499f9325c6f5e1bd1 Mon Sep 17 00:00:00 2001
From: Takao Indoh <indou.takao@jp.fujitsu.com>
Date: Tue, 29 Mar 2011 12:35:04 -0400
Subject: generic-ipi: Fix kexec boot crash by initializing call_single_queue
 before enabling interrupts

There is a problem that kdump(2nd kernel) sometimes hangs up due
to a pending IPI from 1st kernel. Kernel panic occurs because IPI
comes before call_single_queue is initialized.

To fix the crash, rename init_call_single_data() to call_function_init()
and call it in start_kernel() so that call_single_queue can be
initialized before enabling interrupts.

The details of the crash are:

 (1) 2nd kernel boots up

 (2) A pending IPI from 1st kernel comes when irqs are first enabled
     in start_kernel().

 (3) Kernel tries to handle the interrupt, but call_single_queue
     is not initialized yet at this point. As a result, in the
     generic_smp_call_function_single_interrupt(), NULL pointer
     dereference occurs when list_replace_init() tries to access
     &q->list.next.

Therefore this patch changes the name of init_call_single_data()
to call_function_init() and calls it before local_irq_enable()
in start_kernel().

Signed-off-by: Takao Indoh <indou.takao@jp.fujitsu.com>
Reviewed-by: WANG Cong <xiyou.wangcong@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Milton Miller <miltonm@bga.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: kexec@lists.infradead.org
Link: http://lkml.kernel.org/r/D6CBEE2F420741indou.takao@jp.fujitsu.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/smp.h | 5 ++++-
 init/main.c         | 1 +
 kernel/smp.c        | 5 +----
 3 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/smp.h b/include/linux/smp.h
index 7ad824d510a2..8cc38d3bab0c 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -85,12 +85,15 @@ int smp_call_function_any(const struct cpumask *mask,
  * Generic and arch helpers
  */
 #ifdef CONFIG_USE_GENERIC_SMP_HELPERS
+void __init call_function_init(void);
 void generic_smp_call_function_single_interrupt(void);
 void generic_smp_call_function_interrupt(void);
 void ipi_call_lock(void);
 void ipi_call_unlock(void);
 void ipi_call_lock_irq(void);
 void ipi_call_unlock_irq(void);
+#else
+static inline void call_function_init(void) { }
 #endif
 
 /*
@@ -134,7 +137,7 @@ static inline void smp_send_reschedule(int cpu) { }
 #define smp_prepare_boot_cpu()			do {} while (0)
 #define smp_call_function_many(mask, func, info, wait) \
 			(up_smp_call_function(func, info))
-static inline void init_call_single_data(void) { }
+static inline void call_function_init(void) { }
 
 static inline int
 smp_call_function_any(const struct cpumask *mask, smp_call_func_t func,
diff --git a/init/main.c b/init/main.c
index cafba67c13bf..d7211faed2ad 100644
--- a/init/main.c
+++ b/init/main.c
@@ -542,6 +542,7 @@ asmlinkage void __init start_kernel(void)
 	timekeeping_init();
 	time_init();
 	profile_init();
+	call_function_init();
 	if (!irqs_disabled())
 		printk(KERN_CRIT "start_kernel(): bug: interrupts were "
 				 "enabled early\n");
diff --git a/kernel/smp.c b/kernel/smp.c
index 73a195193558..fb67dfa8394e 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -74,7 +74,7 @@ static struct notifier_block __cpuinitdata hotplug_cfd_notifier = {
 	.notifier_call		= hotplug_cfd,
 };
 
-static int __cpuinit init_call_single_data(void)
+void __init call_function_init(void)
 {
 	void *cpu = (void *)(long)smp_processor_id();
 	int i;
@@ -88,10 +88,7 @@ static int __cpuinit init_call_single_data(void)
 
 	hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu);
 	register_cpu_notifier(&hotplug_cfd_notifier);
-
-	return 0;
 }
-early_initcall(init_call_single_data);
 
 /*
  * csd_lock/csd_unlock used to serialize access to per-cpu csd resources
-- 
cgit v1.2.3


From 879669961b11e7f40b518784863a259f735a72bf Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 17 Jun 2011 11:25:59 +0100
Subject: KEYS/DNS: Fix ____call_usermodehelper() to not lose the session
 keyring

____call_usermodehelper() now erases any credentials set by the
subprocess_inf::init() function.  The problem is that commit
17f60a7da150 ("capabilites: allow the application of capability limits
to usermode helpers") creates and commits new credentials with
prepare_kernel_cred() after the call to the init() function.  This wipes
all keyrings after umh_keys_init() is called.

The best way to deal with this is to put the init() call just prior to
the commit_creds() call, and pass the cred pointer to init().  That
means that umh_keys_init() and suchlike can modify the credentials
_before_ they are published and potentially in use by the rest of the
system.

This prevents request_key() from working as it is prevented from passing
the session keyring it set up with the authorisation token to
/sbin/request-key, and so the latter can't assume the authority to
instantiate the key.  This causes the in-kernel DNS resolver to fail
with ENOKEY unconditionally.

Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: Eric Paris <eparis@redhat.com>
Tested-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exec.c                   |  2 +-
 include/linux/kmod.h        |  8 ++++----
 kernel/kmod.c               | 16 +++++++++-------
 security/keys/request_key.c |  3 +--
 4 files changed, 15 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/fs/exec.c b/fs/exec.c
index 97e0d52d72fd..6075a1e727ae 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1996,7 +1996,7 @@ static void wait_for_dump_helpers(struct file *file)
  * is a special value that we use to trap recursive
  * core dumps
  */
-static int umh_pipe_setup(struct subprocess_info *info)
+static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
 {
 	struct file *rp, *wp;
 	struct fdtable *fdt;
diff --git a/include/linux/kmod.h b/include/linux/kmod.h
index d4a5c84c503d..0da38cf7db7b 100644
--- a/include/linux/kmod.h
+++ b/include/linux/kmod.h
@@ -45,7 +45,7 @@ static inline int request_module_nowait(const char *name, ...) { return -ENOSYS;
 #endif
 
 
-struct key;
+struct cred;
 struct file;
 
 enum umh_wait {
@@ -62,7 +62,7 @@ struct subprocess_info {
 	char **envp;
 	enum umh_wait wait;
 	int retval;
-	int (*init)(struct subprocess_info *info);
+	int (*init)(struct subprocess_info *info, struct cred *new);
 	void (*cleanup)(struct subprocess_info *info);
 	void *data;
 };
@@ -73,7 +73,7 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
 
 /* Set various pieces of state into the subprocess_info structure */
 void call_usermodehelper_setfns(struct subprocess_info *info,
-		    int (*init)(struct subprocess_info *info),
+		    int (*init)(struct subprocess_info *info, struct cred *new),
 		    void (*cleanup)(struct subprocess_info *info),
 		    void *data);
 
@@ -87,7 +87,7 @@ void call_usermodehelper_freeinfo(struct subprocess_info *info);
 static inline int
 call_usermodehelper_fns(char *path, char **argv, char **envp,
 			enum umh_wait wait,
-			int (*init)(struct subprocess_info *info),
+			int (*init)(struct subprocess_info *info, struct cred *new),
 			void (*cleanup)(struct subprocess_info *), void *data)
 {
 	struct subprocess_info *info;
diff --git a/kernel/kmod.c b/kernel/kmod.c
index ad6a81c58b44..47613dfb7b28 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -156,12 +156,6 @@ static int ____call_usermodehelper(void *data)
 	 */
 	set_user_nice(current, 0);
 
-	if (sub_info->init) {
-		retval = sub_info->init(sub_info);
-		if (retval)
-			goto fail;
-	}
-
 	retval = -ENOMEM;
 	new = prepare_kernel_cred(current);
 	if (!new)
@@ -173,6 +167,14 @@ static int ____call_usermodehelper(void *data)
 					     new->cap_inheritable);
 	spin_unlock(&umh_sysctl_lock);
 
+	if (sub_info->init) {
+		retval = sub_info->init(sub_info, new);
+		if (retval) {
+			abort_creds(new);
+			goto fail;
+		}
+	}
+
 	commit_creds(new);
 
 	retval = kernel_execve(sub_info->path,
@@ -388,7 +390,7 @@ EXPORT_SYMBOL(call_usermodehelper_setup);
  * context in which call_usermodehelper_exec is called.
  */
 void call_usermodehelper_setfns(struct subprocess_info *info,
-		    int (*init)(struct subprocess_info *info),
+		    int (*init)(struct subprocess_info *info, struct cred *new),
 		    void (*cleanup)(struct subprocess_info *info),
 		    void *data)
 {
diff --git a/security/keys/request_key.c b/security/keys/request_key.c
index d31862e0aa1c..8e319a416eec 100644
--- a/security/keys/request_key.c
+++ b/security/keys/request_key.c
@@ -71,9 +71,8 @@ EXPORT_SYMBOL(complete_request_key);
  * This is called in context of freshly forked kthread before kernel_execve(),
  * so we can simply install the desired session_keyring at this point.
  */
-static int umh_keys_init(struct subprocess_info *info)
+static int umh_keys_init(struct subprocess_info *info, struct cred *cred)
 {
-	struct cred *cred = (struct cred*)current_cred();
 	struct key *keyring = info->data;
 
 	return install_session_keyring_to_cred(cred, keyring);
-- 
cgit v1.2.3


From cca23d0b5350c9ca0473625c3f5879422ba534a6 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@opensource.se>
Date: Sat, 18 Jun 2011 02:51:52 -0700
Subject: Input: sh_keysc - 8x8 MODE_6 fix

According to the data sheet for G4, AP4 and AG5 KEYSC MODE_6 is 8x8 keys.
Bump up MAXKEYS to 64 too.

Signed-off-by: Magnus Damm <damm@opensource.se>
Reviewed-by: Simon Horman <horms@verge.net.au>
Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 drivers/input/keyboard/sh_keysc.c | 2 +-
 include/linux/input/sh_keysc.h    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/input/keyboard/sh_keysc.c b/drivers/input/keyboard/sh_keysc.c
index 834cf98e7efb..6876700a4469 100644
--- a/drivers/input/keyboard/sh_keysc.c
+++ b/drivers/input/keyboard/sh_keysc.c
@@ -32,7 +32,7 @@ static const struct {
 	[SH_KEYSC_MODE_3] = { 2, 4, 7 },
 	[SH_KEYSC_MODE_4] = { 3, 6, 6 },
 	[SH_KEYSC_MODE_5] = { 4, 6, 7 },
-	[SH_KEYSC_MODE_6] = { 5, 7, 7 },
+	[SH_KEYSC_MODE_6] = { 5, 8, 8 },
 };
 
 struct sh_keysc_priv {
diff --git a/include/linux/input/sh_keysc.h b/include/linux/input/sh_keysc.h
index 649dc7f12925..5d253cd93691 100644
--- a/include/linux/input/sh_keysc.h
+++ b/include/linux/input/sh_keysc.h
@@ -1,7 +1,7 @@
 #ifndef __SH_KEYSC_H__
 #define __SH_KEYSC_H__
 
-#define SH_KEYSC_MAXKEYS 49
+#define SH_KEYSC_MAXKEYS 64
 
 struct sh_keysc_info {
 	enum { SH_KEYSC_MODE_1, SH_KEYSC_MODE_2, SH_KEYSC_MODE_3,
-- 
cgit v1.2.3


From be98ca652faa6468916a9b7608befff215a8ca70 Mon Sep 17 00:00:00 2001
From: Manoj Iyer <manoj.iyer@canonical.com>
Date: Thu, 26 May 2011 11:19:05 -0500
Subject: mmc: Add PCI fixup quirks for Ricoh 1180:e823 reader

Signed-off-by: Manoj Iyer <manoj.iyer@canonical.com>
Cc: <stable@kernel.org>
Signed-off-by: Chris Ball <cjb@laptop.org>
---
 drivers/pci/quirks.c    | 2 ++
 include/linux/pci_ids.h | 1 +
 2 files changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index e8a140669f90..02145e9697a9 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -2761,6 +2761,8 @@ static void ricoh_mmc_fixup_r5c832(struct pci_dev *dev)
 }
 DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_RICOH, PCI_DEVICE_ID_RICOH_R5C832, ricoh_mmc_fixup_r5c832);
 DECLARE_PCI_FIXUP_RESUME_EARLY(PCI_VENDOR_ID_RICOH, PCI_DEVICE_ID_RICOH_R5C832, ricoh_mmc_fixup_r5c832);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_RICOH, PCI_DEVICE_ID_RICOH_R5CE823, ricoh_mmc_fixup_r5c832);
+DECLARE_PCI_FIXUP_RESUME_EARLY(PCI_VENDOR_ID_RICOH, PCI_DEVICE_ID_RICOH_R5CE823, ricoh_mmc_fixup_r5c832);
 #endif /*CONFIG_MMC_RICOH_MMC*/
 
 #if defined(CONFIG_DMAR) || defined(CONFIG_INTR_REMAP)
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index a311008af5e1..f8910e155566 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -1537,6 +1537,7 @@
 #define PCI_DEVICE_ID_RICOH_RL5C476	0x0476
 #define PCI_DEVICE_ID_RICOH_RL5C478	0x0478
 #define PCI_DEVICE_ID_RICOH_R5C822	0x0822
+#define PCI_DEVICE_ID_RICOH_R5CE823	0xe823
 #define PCI_DEVICE_ID_RICOH_R5C832	0x0832
 #define PCI_DEVICE_ID_RICOH_R5C843	0x0843
 
-- 
cgit v1.2.3


From 155d109b5f52ffd749219b27702462dcd9cf4f8d Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Mon, 20 Jun 2011 13:23:14 +0200
Subject: block: add REQ_SECURE to REQ_COMMON_MASK

Add REQ_SECURE flag to REQ_COMMON_MASK so that
init_request_from_bio() can pass it to @req->cmd_flags.

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Acked-by: Adrian Hunter <adrian.hunter@intel.com>
Cc: stable@kernel.org # 2.6.36 and newer
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 include/linux/blk_types.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 2a7cea53ca0d..6395692b2e7a 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -167,7 +167,7 @@ enum rq_flag_bits {
 	(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
 #define REQ_COMMON_MASK \
 	(REQ_WRITE | REQ_FAILFAST_MASK | REQ_SYNC | REQ_META | REQ_DISCARD | \
-	 REQ_NOIDLE | REQ_FLUSH | REQ_FUA)
+	 REQ_NOIDLE | REQ_FLUSH | REQ_FUA | REQ_SECURE)
 #define REQ_CLONE_MASK		REQ_COMMON_MASK
 
 #define REQ_RAHEAD		(1 << __REQ_RAHEAD)
-- 
cgit v1.2.3


From 482e0cd3dbaa70f2a2bead4b5f2c0d203ef654ba Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 19 Jun 2011 13:01:04 -0400
Subject: devcgroup_inode_permission: take "is it a device node" checks to
 inlined wrapper

inode_permission() calls devcgroup_inode_permission() and almost all such
calls are _not_ for device nodes; let's at least keep the common path
straight...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/device_cgroup.h | 10 +++++++++-
 security/device_cgroup.c      |  8 +-------
 2 files changed, 10 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/device_cgroup.h b/include/linux/device_cgroup.h
index 0b0d9c39ed67..7aad1f440867 100644
--- a/include/linux/device_cgroup.h
+++ b/include/linux/device_cgroup.h
@@ -2,8 +2,16 @@
 #include <linux/fs.h>
 
 #ifdef CONFIG_CGROUP_DEVICE
-extern int devcgroup_inode_permission(struct inode *inode, int mask);
+extern int __devcgroup_inode_permission(struct inode *inode, int mask);
 extern int devcgroup_inode_mknod(int mode, dev_t dev);
+static inline int devcgroup_inode_permission(struct inode *inode, int mask)
+{
+	if (likely(!inode->i_rdev))
+		return 0;
+	if (!S_ISBLK(inode->i_mode) && !S_ISCHR(inode->i_mode))
+		return 0;
+	return __devcgroup_inode_permission(inode, mask);
+}
 #else
 static inline int devcgroup_inode_permission(struct inode *inode, int mask)
 { return 0; }
diff --git a/security/device_cgroup.c b/security/device_cgroup.c
index cd1f779fa51d..1be68269e1c2 100644
--- a/security/device_cgroup.c
+++ b/security/device_cgroup.c
@@ -474,17 +474,11 @@ struct cgroup_subsys devices_subsys = {
 	.subsys_id = devices_subsys_id,
 };
 
-int devcgroup_inode_permission(struct inode *inode, int mask)
+int __devcgroup_inode_permission(struct inode *inode, int mask)
 {
 	struct dev_cgroup *dev_cgroup;
 	struct dev_whitelist_item *wh;
 
-	dev_t device = inode->i_rdev;
-	if (!device)
-		return 0;
-	if (!S_ISBLK(inode->i_mode) && !S_ISCHR(inode->i_mode))
-		return 0;
-
 	rcu_read_lock();
 
 	dev_cgroup = task_devcgroup(current);
-- 
cgit v1.2.3


From 19345cb299e8234006c5125151ab723e851a1d24 Mon Sep 17 00:00:00 2001
From: Benny Halevy <benny@tonian.com>
Date: Sun, 19 Jun 2011 18:33:46 -0400
Subject: NFSv4.1: file layout must consider pg_bsize for coalescing

Otherwise we end up overflowing the rpc buffer size on the receive end.

Signed-off-by: Benny Halevy <benny@tonian.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4filelayout.c  | 6 ++++--
 fs/nfs/pagelist.c        | 3 ++-
 include/linux/nfs_page.h | 3 +++
 3 files changed, 9 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 5d6f369b15d0..0bafcc91c27f 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -30,6 +30,7 @@
  */
 
 #include <linux/nfs_fs.h>
+#include <linux/nfs_page.h>
 
 #include "internal.h"
 #include "nfs4filelayout.h"
@@ -666,8 +667,9 @@ filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
 	u64 p_stripe, r_stripe;
 	u32 stripe_unit;
 
-	if (!pnfs_generic_pg_test(pgio, prev, req))
-		return 0;
+	if (!pnfs_generic_pg_test(pgio, prev, req) ||
+	    !nfs_generic_pg_test(pgio, prev, req))
+		return false;
 
 	if (!pgio->pg_lseg)
 		return 1;
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 7913961aff22..009855716286 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -204,7 +204,7 @@ nfs_wait_on_request(struct nfs_page *req)
 			TASK_UNINTERRUPTIBLE);
 }
 
-static bool nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, struct nfs_page *prev, struct nfs_page *req)
+bool nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, struct nfs_page *prev, struct nfs_page *req)
 {
 	/*
 	 * FIXME: ideally we should be able to coalesce all requests
@@ -218,6 +218,7 @@ static bool nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, struct nfs_p
 
 	return desc->pg_count + req->wb_bytes <= desc->pg_bsize;
 }
+EXPORT_SYMBOL_GPL(nfs_generic_pg_test);
 
 /**
  * nfs_pageio_init - initialise a page io descriptor
diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index 3a34e80ae92f..25311b3bedf8 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -92,6 +92,9 @@ extern	int nfs_pageio_add_request(struct nfs_pageio_descriptor *,
 				   struct nfs_page *);
 extern	void nfs_pageio_complete(struct nfs_pageio_descriptor *desc);
 extern	void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *, pgoff_t);
+extern bool nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
+				struct nfs_page *prev,
+				struct nfs_page *req);
 extern  int nfs_wait_on_request(struct nfs_page *);
 extern	void nfs_unlock_request(struct nfs_page *req);
 extern	int nfs_set_page_tag_locked(struct nfs_page *req);
-- 
cgit v1.2.3


From 79568f5be06c91071697c065f01f3ebfbeb25a61 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Mon, 20 Jun 2011 20:13:49 -0700
Subject: vfs: i_state needs to be 'unsigned long' for now

Commit 13e12d14e2dc ("vfs: reorganize 'struct inode' layout a bit")
moved things around a bit changed i_state to be unsigned int instead of
unsigned long.  That was to help structure layout for the 64-bit case,
and shrink 'struct inode' a bit (admittedly that only happened when
spinlock debugging was on and i_flags didn't pack with i_lock).

However, Meelis Roos reports that this results in unaligned exceptions
on sprc, and it turns out that the bit-locking primitives that we use
for the I_NEW bit want to use the bitops.  Which want 'unsigned long',
not 'unsigned int'.

We really should fix the bit locking code to not have that kind of
requirement, but that's a much bigger change.  So for now, revert that
field back to 'unsigned long' (but keep the other re-ordering changes
from the commit that caused this).

Andi points out that we have played games with this in 'struct page', so
it's solvable with other hacks too, but since right now the struct inode
size advantage only happens with some rare config options, it's not
worth fighting.

It _would_ be worth fixing the bitlocking code, though.  Especially
since there is no type safety in the bitlocking code (this never caused
any warnings, and worked fine on x86-64, because the bitlocks take a
'void *' and x86-64 doesn't care that deeply about alignment).  So it's
currently a very easy problem to trigger by mistake and never notice.

Reported-by: Meelis Roos <mroos@linux.ee>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: David Miller <davem@davemloft.net>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/fs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 1c777878f1ea..6e73e2e9ae33 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -744,7 +744,7 @@ struct inode {
 
 	spinlock_t		i_lock;	/* i_blocks, i_bytes, maybe i_size */
 	unsigned int		i_flags;
-	unsigned int		i_state;
+	unsigned long		i_state;
 #ifdef CONFIG_SECURITY
 	void			*i_security;
 #endif
-- 
cgit v1.2.3


From f76b168b6f117a49d36307053e1acbe30580ea5b Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Sat, 18 Jun 2011 20:22:23 +0200
Subject: PM: Rename dev_pm_info.in_suspend to is_prepared

This patch (as1473) renames the "in_suspend" field in struct
dev_pm_info to "is_prepared", in preparation for an upcoming change.
The new name is more descriptive of what the field really means.

Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: stable@kernel.org
---
 drivers/base/power/main.c | 14 +++++++++-----
 drivers/usb/core/driver.c |  6 +++---
 include/linux/device.h    |  4 ++--
 include/linux/pm.h        |  2 +-
 4 files changed, 15 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index aa6320207745..bf5a59ac1957 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -57,7 +57,7 @@ static int async_error;
  */
 void device_pm_init(struct device *dev)
 {
-	dev->power.in_suspend = false;
+	dev->power.is_prepared = false;
 	init_completion(&dev->power.completion);
 	complete_all(&dev->power.completion);
 	dev->power.wakeup = NULL;
@@ -91,7 +91,7 @@ void device_pm_add(struct device *dev)
 	pr_debug("PM: Adding info for %s:%s\n",
 		 dev->bus ? dev->bus->name : "No Bus", dev_name(dev));
 	mutex_lock(&dpm_list_mtx);
-	if (dev->parent && dev->parent->power.in_suspend)
+	if (dev->parent && dev->parent->power.is_prepared)
 		dev_warn(dev, "parent %s should not be sleeping\n",
 			dev_name(dev->parent));
 	list_add_tail(&dev->power.entry, &dpm_list);
@@ -511,7 +511,11 @@ static int device_resume(struct device *dev, pm_message_t state, bool async)
 	dpm_wait(dev->parent, async);
 	device_lock(dev);
 
-	dev->power.in_suspend = false;
+	/*
+	 * This is a fib.  But we'll allow new children to be added below
+	 * a resumed device, even if the device hasn't been completed yet.
+	 */
+	dev->power.is_prepared = false;
 
 	if (dev->pwr_domain) {
 		pm_dev_dbg(dev, state, "power domain ");
@@ -670,7 +674,7 @@ void dpm_complete(pm_message_t state)
 		struct device *dev = to_device(dpm_prepared_list.prev);
 
 		get_device(dev);
-		dev->power.in_suspend = false;
+		dev->power.is_prepared = false;
 		list_move(&dev->power.entry, &list);
 		mutex_unlock(&dpm_list_mtx);
 
@@ -1042,7 +1046,7 @@ int dpm_prepare(pm_message_t state)
 			put_device(dev);
 			break;
 		}
-		dev->power.in_suspend = true;
+		dev->power.is_prepared = true;
 		if (!list_empty(&dev->power.entry))
 			list_move_tail(&dev->power.entry, &dpm_prepared_list);
 		put_device(dev);
diff --git a/drivers/usb/core/driver.c b/drivers/usb/core/driver.c
index e35a17687c05..aa3cc465a601 100644
--- a/drivers/usb/core/driver.c
+++ b/drivers/usb/core/driver.c
@@ -375,7 +375,7 @@ static int usb_unbind_interface(struct device *dev)
 		 * Just re-enable it without affecting the endpoint toggles.
 		 */
 		usb_enable_interface(udev, intf, false);
-	} else if (!error && !intf->dev.power.in_suspend) {
+	} else if (!error && !intf->dev.power.is_prepared) {
 		r = usb_set_interface(udev, intf->altsetting[0].
 				desc.bInterfaceNumber, 0);
 		if (r < 0)
@@ -960,7 +960,7 @@ void usb_rebind_intf(struct usb_interface *intf)
 	}
 
 	/* Try to rebind the interface */
-	if (!intf->dev.power.in_suspend) {
+	if (!intf->dev.power.is_prepared) {
 		intf->needs_binding = 0;
 		rc = device_attach(&intf->dev);
 		if (rc < 0)
@@ -1107,7 +1107,7 @@ static int usb_resume_interface(struct usb_device *udev,
 	if (intf->condition == USB_INTERFACE_UNBOUND) {
 
 		/* Carry out a deferred switch to altsetting 0 */
-		if (intf->needs_altsetting0 && !intf->dev.power.in_suspend) {
+		if (intf->needs_altsetting0 && !intf->dev.power.is_prepared) {
 			usb_set_interface(udev, intf->altsetting[0].
 					desc.bInterfaceNumber, 0);
 			intf->needs_altsetting0 = 0;
diff --git a/include/linux/device.h b/include/linux/device.h
index c66111affca9..553fd37b173b 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -654,13 +654,13 @@ static inline int device_is_registered(struct device *dev)
 
 static inline void device_enable_async_suspend(struct device *dev)
 {
-	if (!dev->power.in_suspend)
+	if (!dev->power.is_prepared)
 		dev->power.async_suspend = true;
 }
 
 static inline void device_disable_async_suspend(struct device *dev)
 {
-	if (!dev->power.in_suspend)
+	if (!dev->power.is_prepared)
 		dev->power.async_suspend = false;
 }
 
diff --git a/include/linux/pm.h b/include/linux/pm.h
index 3160648ccdda..cc536bd80984 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -425,7 +425,7 @@ struct dev_pm_info {
 	pm_message_t		power_state;
 	unsigned int		can_wakeup:1;
 	unsigned int		async_suspend:1;
-	unsigned int		in_suspend:1;	/* Owned by the PM core */
+	bool			is_prepared:1;	/* Owned by the PM core */
 	spinlock_t		lock;
 #ifdef CONFIG_PM_SLEEP
 	struct list_head	entry;
-- 
cgit v1.2.3


From 6d0e0e84f66d32c33511984dd3badd32364b863c Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Sat, 18 Jun 2011 22:42:09 +0200
Subject: PM: Fix async resume following suspend failure

The PM core doesn't handle suspend failures correctly when it comes to
asynchronously suspended devices.  These devices are moved onto the
dpm_suspended_list as soon as the corresponding async thread is
started up, and they remain on the list even if they fail to suspend
or the sleep transition is cancelled before they get suspended.  As a
result, when the PM core unwinds the transition, it tries to resume
the devices even though they were never suspended.

This patch (as1474) fixes the problem by adding a new "is_suspended"
flag to dev_pm_info.  Devices are resumed only if the flag is set.

[rjw:
 * Moved the dev->power.is_suspended check into device_resume(),
   because we need to complete dev->power.completion and clear
   dev->power.is_prepared too for devices whose
   dev->power.is_suspended flags are unset.
 * Fixed __device_suspend() to avoid setting dev->power.is_suspended
   if async_error is different from zero.]

Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: stable@kernel.org
---
 drivers/base/power/main.c | 14 ++++++++++++--
 include/linux/pm.h        |  1 +
 2 files changed, 13 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index bf5a59ac1957..06f09bf89cb2 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -58,6 +58,7 @@ static int async_error;
 void device_pm_init(struct device *dev)
 {
 	dev->power.is_prepared = false;
+	dev->power.is_suspended = false;
 	init_completion(&dev->power.completion);
 	complete_all(&dev->power.completion);
 	dev->power.wakeup = NULL;
@@ -517,6 +518,9 @@ static int device_resume(struct device *dev, pm_message_t state, bool async)
 	 */
 	dev->power.is_prepared = false;
 
+	if (!dev->power.is_suspended)
+		goto Unlock;
+
 	if (dev->pwr_domain) {
 		pm_dev_dbg(dev, state, "power domain ");
 		error = pm_op(dev, &dev->pwr_domain->ops, state);
@@ -552,6 +556,9 @@ static int device_resume(struct device *dev, pm_message_t state, bool async)
 	}
 
  End:
+	dev->power.is_suspended = false;
+
+ Unlock:
 	device_unlock(dev);
 	complete_all(&dev->power.completion);
 
@@ -839,11 +846,11 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async)
 	device_lock(dev);
 
 	if (async_error)
-		goto End;
+		goto Unlock;
 
 	if (pm_wakeup_pending()) {
 		async_error = -EBUSY;
-		goto End;
+		goto Unlock;
 	}
 
 	if (dev->pwr_domain) {
@@ -881,6 +888,9 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async)
 	}
 
  End:
+	dev->power.is_suspended = !error;
+
+ Unlock:
 	device_unlock(dev);
 	complete_all(&dev->power.completion);
 
diff --git a/include/linux/pm.h b/include/linux/pm.h
index cc536bd80984..411e4f4be52b 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -426,6 +426,7 @@ struct dev_pm_info {
 	unsigned int		can_wakeup:1;
 	unsigned int		async_suspend:1;
 	bool			is_prepared:1;	/* Owned by the PM core */
+	bool			is_suspended:1;	/* Ditto */
 	spinlock_t		lock;
 #ifdef CONFIG_PM_SLEEP
 	struct list_head	entry;
-- 
cgit v1.2.3


From 39785eb1d3e6c58cc8bf8f6990956a58037ecc75 Mon Sep 17 00:00:00 2001
From: Timur Tabi <timur@freescale.com>
Date: Thu, 23 Jun 2011 20:20:26 +0000
Subject: fsl-diu-fb: remove check for pixel clock ranges

The Freescale DIU framebuffer driver defines two constants, MIN_PIX_CLK and
MAX_PIX_CLK, that are supposed to represent the lower and upper limits of
the pixel clock.  These values, however, are true only for one platform
clock rate (533MHz) and only for the MPC8610.  So the actual range for
the pixel clock is chip-specific, which means the current values are almost
always wrong.  The chance of an out-of-range pixel clock being used are also
remote.

Rather than try to detect an out-of-range clock in the DIU driver, we depend
on the board-specific pixel clock function (e.g. p1022ds_set_pixel_clock)
to clamp the pixel clock to a supported value.

Signed-off-by: Timur Tabi <timur@freescale.com>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 drivers/video/fsl-diu-fb.c | 16 ----------------
 include/linux/fsl-diu-fb.h |  6 ------
 2 files changed, 22 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/video/fsl-diu-fb.c b/drivers/video/fsl-diu-fb.c
index bedf5be27f05..0acc7d65aeaa 100644
--- a/drivers/video/fsl-diu-fb.c
+++ b/drivers/video/fsl-diu-fb.c
@@ -555,8 +555,6 @@ static void adjust_aoi_size_position(struct fb_var_screeninfo *var,
 static int fsl_diu_check_var(struct fb_var_screeninfo *var,
 				struct fb_info *info)
 {
-	unsigned long htotal, vtotal;
-
 	pr_debug("check_var xres: %d\n", var->xres);
 	pr_debug("check_var yres: %d\n", var->yres);
 
@@ -635,20 +633,6 @@ static int fsl_diu_check_var(struct fb_var_screeninfo *var,
 
 		break;
 	}
-	/* If the pixclock is below the minimum spec'd value then set to
-	 * refresh rate for 60Hz since this is supported by most monitors.
-	 * Refer to Documentation/fb/ for calculations.
-	 */
-	if ((var->pixclock < MIN_PIX_CLK) || (var->pixclock > MAX_PIX_CLK)) {
-		htotal = var->xres + var->right_margin + var->hsync_len +
-		    var->left_margin;
-		vtotal = var->yres + var->lower_margin + var->vsync_len +
-		    var->upper_margin;
-		var->pixclock = (vtotal * htotal * 6UL) / 100UL;
-		var->pixclock = KHZ2PICOS(var->pixclock);
-		pr_debug("pixclock set for 60Hz refresh = %u ps\n",
-			var->pixclock);
-	}
 
 	var->height = -1;
 	var->width = -1;
diff --git a/include/linux/fsl-diu-fb.h b/include/linux/fsl-diu-fb.h
index 781d4671415f..daa9952d2174 100644
--- a/include/linux/fsl-diu-fb.h
+++ b/include/linux/fsl-diu-fb.h
@@ -24,12 +24,6 @@
  * See mpc8610fb_set_par(), map_video_memory(), and unmap_video_memory()
  */
 #define MEM_ALLOC_THRESHOLD (1024*768*4+32)
-/* Minimum value that the pixel clock can be set to in pico seconds
- * This is determined by platform clock/3 where the minimum platform
- * clock is 533MHz. This gives 5629 pico seconds.
- */
-#define MIN_PIX_CLK 5629
-#define MAX_PIX_CLK 96096
 
 #include <linux/types.h>
 
-- 
cgit v1.2.3


From c6830c22603aaecf65405af23f6da2d55892f9cb Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Thu, 16 Jun 2011 17:28:07 +0900
Subject: Fix node_start/end_pfn() definition for mm/page_cgroup.c

commit 21a3c96 uses node_start/end_pfn(nid) for detection start/end
of nodes. But, it's not defined in linux/mmzone.h but defined in
/arch/???/include/mmzone.h which is included only under
CONFIG_NEED_MULTIPLE_NODES=y.

Then, we see
  mm/page_cgroup.c: In function 'page_cgroup_init':
  mm/page_cgroup.c:308: error: implicit declaration of function 'node_start_pfn'
  mm/page_cgroup.c:309: error: implicit declaration of function 'node_end_pfn'

So, fixiing page_cgroup.c is an idea...

But node_start_pfn()/node_end_pfn() is a very generic macro and
should be implemented in the same manner for all archs.
(m32r has different implementation...)

This patch removes definitions of node_start/end_pfn() in each archs
and defines a unified one in linux/mmzone.h. It's not under
CONFIG_NEED_MULTIPLE_NODES, now.

A result of macro expansion is here (mm/page_cgroup.c)

for !NUMA
 start_pfn = ((&contig_page_data)->node_start_pfn);
  end_pfn = ({ pg_data_t *__pgdat = (&contig_page_data); __pgdat->node_start_pfn + __pgdat->node_spanned_pages;});

for NUMA (x86-64)
  start_pfn = ((node_data[nid])->node_start_pfn);
  end_pfn = ({ pg_data_t *__pgdat = (node_data[nid]); __pgdat->node_start_pfn + __pgdat->node_spanned_pages;});

Changelog:
 - fixed to avoid using "nid" twice in node_end_pfn() macro.

Reported-and-acked-by: Randy Dunlap <randy.dunlap@oracle.com>
Reported-and-tested-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/alpha/include/asm/mmzone.h   |  1 -
 arch/m32r/include/asm/mmzone.h    |  8 +-------
 arch/parisc/include/asm/mmzone.h  |  7 -------
 arch/powerpc/include/asm/mmzone.h |  7 -------
 arch/sh/include/asm/mmzone.h      |  4 ----
 arch/sparc/include/asm/mmzone.h   |  2 --
 arch/tile/include/asm/mmzone.h    | 11 -----------
 arch/x86/include/asm/mmzone_32.h  | 11 -----------
 arch/x86/include/asm/mmzone_64.h  |  3 ---
 include/linux/mmzone.h            |  7 +++++++
 10 files changed, 8 insertions(+), 53 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/include/asm/mmzone.h b/arch/alpha/include/asm/mmzone.h
index 8af56ce346ad..445dc42e0334 100644
--- a/arch/alpha/include/asm/mmzone.h
+++ b/arch/alpha/include/asm/mmzone.h
@@ -56,7 +56,6 @@ PLAT_NODE_DATA_LOCALNR(unsigned long p, int n)
  * Given a kernel address, find the home node of the underlying memory.
  */
 #define kvaddr_to_nid(kaddr)	pa_to_nid(__pa(kaddr))
-#define node_start_pfn(nid)	(NODE_DATA(nid)->node_start_pfn)
 
 /*
  * Given a kaddr, LOCAL_BASE_ADDR finds the owning node of the memory
diff --git a/arch/m32r/include/asm/mmzone.h b/arch/m32r/include/asm/mmzone.h
index 9f3b5accda88..115ced33febd 100644
--- a/arch/m32r/include/asm/mmzone.h
+++ b/arch/m32r/include/asm/mmzone.h
@@ -14,12 +14,6 @@ extern struct pglist_data *node_data[];
 #define NODE_DATA(nid)		(node_data[nid])
 
 #define node_localnr(pfn, nid)	((pfn) - NODE_DATA(nid)->node_start_pfn)
-#define node_start_pfn(nid)	(NODE_DATA(nid)->node_start_pfn)
-#define node_end_pfn(nid)						\
-({									\
-	pg_data_t *__pgdat = NODE_DATA(nid);				\
-	__pgdat->node_start_pfn + __pgdat->node_spanned_pages - 1;	\
-})
 
 #define pmd_page(pmd)		(pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
 /*
@@ -44,7 +38,7 @@ static __inline__ int pfn_to_nid(unsigned long pfn)
 	int node;
 
 	for (node = 0 ; node < MAX_NUMNODES ; node++)
-		if (pfn >= node_start_pfn(node) && pfn <= node_end_pfn(node))
+		if (pfn >= node_start_pfn(node) && pfn < node_end_pfn(node))
 			break;
 
 	return node;
diff --git a/arch/parisc/include/asm/mmzone.h b/arch/parisc/include/asm/mmzone.h
index 9608d2cf214a..e67eb9c3d1bf 100644
--- a/arch/parisc/include/asm/mmzone.h
+++ b/arch/parisc/include/asm/mmzone.h
@@ -14,13 +14,6 @@ extern struct node_map_data node_data[];
 
 #define NODE_DATA(nid)          (&node_data[nid].pg_data)
 
-#define node_start_pfn(nid)	(NODE_DATA(nid)->node_start_pfn)
-#define node_end_pfn(nid)						\
-({									\
-	pg_data_t *__pgdat = NODE_DATA(nid);				\
-	__pgdat->node_start_pfn + __pgdat->node_spanned_pages;		\
-})
-
 /* We have these possible memory map layouts:
  * Astro: 0-3.75, 67.75-68, 4-64
  * zx1: 0-1, 257-260, 4-256
diff --git a/arch/powerpc/include/asm/mmzone.h b/arch/powerpc/include/asm/mmzone.h
index fd3fd58bad84..7b589178be46 100644
--- a/arch/powerpc/include/asm/mmzone.h
+++ b/arch/powerpc/include/asm/mmzone.h
@@ -38,13 +38,6 @@ u64 memory_hotplug_max(void);
 #define memory_hotplug_max() memblock_end_of_DRAM()
 #endif
 
-/*
- * Following are macros that each numa implmentation must define.
- */
-
-#define node_start_pfn(nid)	(NODE_DATA(nid)->node_start_pfn)
-#define node_end_pfn(nid)	(NODE_DATA(nid)->node_end_pfn)
-
 #else
 #define memory_hotplug_max() memblock_end_of_DRAM()
 #endif /* CONFIG_NEED_MULTIPLE_NODES */
diff --git a/arch/sh/include/asm/mmzone.h b/arch/sh/include/asm/mmzone.h
index 8887baff5eff..15a8496960e6 100644
--- a/arch/sh/include/asm/mmzone.h
+++ b/arch/sh/include/asm/mmzone.h
@@ -9,10 +9,6 @@
 extern struct pglist_data *node_data[];
 #define NODE_DATA(nid)		(node_data[nid])
 
-#define node_start_pfn(nid)	(NODE_DATA(nid)->node_start_pfn)
-#define node_end_pfn(nid)	(NODE_DATA(nid)->node_start_pfn + \
-				 NODE_DATA(nid)->node_spanned_pages)
-
 static inline int pfn_to_nid(unsigned long pfn)
 {
 	int nid;
diff --git a/arch/sparc/include/asm/mmzone.h b/arch/sparc/include/asm/mmzone.h
index e8c648741ed4..99d9b9f577bf 100644
--- a/arch/sparc/include/asm/mmzone.h
+++ b/arch/sparc/include/asm/mmzone.h
@@ -8,8 +8,6 @@
 extern struct pglist_data *node_data[];
 
 #define NODE_DATA(nid)		(node_data[nid])
-#define node_start_pfn(nid)	(NODE_DATA(nid)->node_start_pfn)
-#define node_end_pfn(nid)	(NODE_DATA(nid)->node_end_pfn)
 
 extern int numa_cpu_lookup_table[];
 extern cpumask_t numa_cpumask_lookup_table[];
diff --git a/arch/tile/include/asm/mmzone.h b/arch/tile/include/asm/mmzone.h
index c6344c4f32ac..9d3dbce8f953 100644
--- a/arch/tile/include/asm/mmzone.h
+++ b/arch/tile/include/asm/mmzone.h
@@ -40,17 +40,6 @@ static inline int pfn_to_nid(unsigned long pfn)
 	return highbits_to_node[__pfn_to_highbits(pfn)];
 }
 
-/*
- * Following are macros that each numa implmentation must define.
- */
-
-#define node_start_pfn(nid)	(NODE_DATA(nid)->node_start_pfn)
-#define node_end_pfn(nid)						\
-({									\
-	pg_data_t *__pgdat = NODE_DATA(nid);				\
-	__pgdat->node_start_pfn + __pgdat->node_spanned_pages;		\
-})
-
 #define kern_addr_valid(kaddr)	virt_addr_valid((void *)kaddr)
 
 static inline int pfn_valid(int pfn)
diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h
index 5e83a416eca8..224e8c5eb307 100644
--- a/arch/x86/include/asm/mmzone_32.h
+++ b/arch/x86/include/asm/mmzone_32.h
@@ -48,17 +48,6 @@ static inline int pfn_to_nid(unsigned long pfn)
 #endif
 }
 
-/*
- * Following are macros that each numa implmentation must define.
- */
-
-#define node_start_pfn(nid)	(NODE_DATA(nid)->node_start_pfn)
-#define node_end_pfn(nid)						\
-({									\
-	pg_data_t *__pgdat = NODE_DATA(nid);				\
-	__pgdat->node_start_pfn + __pgdat->node_spanned_pages;		\
-})
-
 static inline int pfn_valid(int pfn)
 {
 	int nid = pfn_to_nid(pfn);
diff --git a/arch/x86/include/asm/mmzone_64.h b/arch/x86/include/asm/mmzone_64.h
index b3f88d7867c7..129d9aa3ceb3 100644
--- a/arch/x86/include/asm/mmzone_64.h
+++ b/arch/x86/include/asm/mmzone_64.h
@@ -13,8 +13,5 @@ extern struct pglist_data *node_data[];
 
 #define NODE_DATA(nid)		(node_data[nid])
 
-#define node_start_pfn(nid)	(NODE_DATA(nid)->node_start_pfn)
-#define node_end_pfn(nid)       (NODE_DATA(nid)->node_start_pfn +	\
-				 NODE_DATA(nid)->node_spanned_pages)
 #endif
 #endif /* _ASM_X86_MMZONE_64_H */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index c928dac6cad0..9f7c3ebcbbad 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -647,6 +647,13 @@ typedef struct pglist_data {
 #endif
 #define nid_page_nr(nid, pagenr) 	pgdat_page_nr(NODE_DATA(nid),(pagenr))
 
+#define node_start_pfn(nid)	(NODE_DATA(nid)->node_start_pfn)
+
+#define node_end_pfn(nid) ({\
+	pg_data_t *__pgdat = NODE_DATA(nid);\
+	__pgdat->node_start_pfn + __pgdat->node_spanned_pages;\
+})
+
 #include <linux/memory_hotplug.h>
 
 extern struct mutex zonelists_mutex;
-- 
cgit v1.2.3


From 4d258b25d947521c8b913154db61ec55198243f8 Mon Sep 17 00:00:00 2001
From: Vitaliy Ivanov <vitalivanov@gmail.com>
Date: Mon, 27 Jun 2011 19:07:08 +0300
Subject: Fix some kernel-doc warnings

Fix 'make htmldocs' warnings:

  Warning(/include/linux/hrtimer.h:153): No description found for parameter 'clockid'
  Warning(/include/linux/device.h:604): Excess struct/union/enum/typedef member 'of_match' description in 'device'
  Warning(/include/net/sock.h:349): Excess struct/union/enum/typedef member 'sk_rmem_alloc' description in 'sock'

Signed-off-by: Vitaliy Ivanov <vitalivanov@gmail.com>
Acked-by: Grant Likely <grant.likely@secretlab.ca>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/device.h  | 1 -
 include/linux/hrtimer.h | 1 +
 include/net/sock.h      | 1 -
 3 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/device.h b/include/linux/device.h
index 553fd37b173b..e4f62d8896b7 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -530,7 +530,6 @@ struct device_dma_parameters {
  * @dma_mem:	Internal for coherent mem override.
  * @archdata:	For arch-specific additions.
  * @of_node:	Associated device tree node.
- * @of_match:	Matching of_device_id from driver.
  * @devt:	For creating the sysfs "dev".
  * @devres_lock: Spinlock to protect the resource of the device.
  * @devres_head: The resources list of the device.
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 51932e5acf7c..fd0dc30c9f15 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -135,6 +135,7 @@ struct hrtimer_sleeper {
  * @cpu_base:		per cpu clock base
  * @index:		clock type index for per_cpu support when moving a
  *			timer to a base on another cpu.
+ * @clockid:		clock id for per_cpu support
  * @active:		red black tree root node for the active timers
  * @resolution:		the resolution of the clock, in nanoseconds
  * @get_time:		function to retrieve the current time of the clock
diff --git a/include/net/sock.h b/include/net/sock.h
index f2046e404a61..c0b938cb4b1a 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -178,7 +178,6 @@ struct sock_common {
   *	@sk_dst_cache: destination cache
   *	@sk_dst_lock: destination cache lock
   *	@sk_policy: flow policy
-  *	@sk_rmem_alloc: receive queue bytes committed
   *	@sk_receive_queue: incoming packets
   *	@sk_wmem_alloc: transmit queue bytes committed
   *	@sk_write_queue: Packet sending queue
-- 
cgit v1.2.3


From 072441e21ddcd1140606b7d4ef6eab579a86b0b3 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Mon, 27 Jun 2011 16:18:02 -0700
Subject: mm: move shmem prototypes to shmem_fs.h

Before adding any more global entry points into shmem.c, gather such
prototypes into shmem_fs.h.  Remove mm's own declarations from swap.h,
but for now leave the ones in mm.h: because shmem_file_setup() and
shmem_zero_setup() are called from various places, and we should not
force other subsystems to update immediately.

Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/shmem_fs.h | 17 +++++++++++++++++
 include/linux/swap.h     | 10 ----------
 mm/memcontrol.c          |  1 +
 mm/swapfile.c            |  2 +-
 4 files changed, 19 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 2b7fec840517..cae65dc42bcc 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -5,6 +5,13 @@
 #include <linux/mempolicy.h>
 #include <linux/percpu_counter.h>
 
+struct page;
+struct file;
+struct inode;
+struct super_block;
+struct user_struct;
+struct vm_area_struct;
+
 /* inode in-kernel data */
 
 #define SHMEM_NR_DIRECT 16
@@ -45,7 +52,17 @@ static inline struct shmem_inode_info *SHMEM_I(struct inode *inode)
 	return container_of(inode, struct shmem_inode_info, vfs_inode);
 }
 
+/*
+ * Functions in mm/shmem.c called directly from elsewhere:
+ */
 extern int init_tmpfs(void);
 extern int shmem_fill_super(struct super_block *sb, void *data, int silent);
+extern struct file *shmem_file_setup(const char *name,
+					loff_t size, unsigned long flags);
+extern int shmem_zero_setup(struct vm_area_struct *);
+extern int shmem_lock(struct file *file, int lock, struct user_struct *user);
+extern int shmem_unuse(swp_entry_t entry, struct page *page);
+extern void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff,
+					struct page **pagep, swp_entry_t *ent);
 
 #endif
diff --git a/include/linux/swap.h b/include/linux/swap.h
index e70564647039..a273468f8285 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -300,16 +300,6 @@ static inline void scan_unevictable_unregister_node(struct node *node)
 extern int kswapd_run(int nid);
 extern void kswapd_stop(int nid);
 
-#ifdef CONFIG_MMU
-/* linux/mm/shmem.c */
-extern int shmem_unuse(swp_entry_t entry, struct page *page);
-#endif /* CONFIG_MMU */
-
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
-extern void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff,
-					struct page **pagep, swp_entry_t *ent);
-#endif
-
 #ifdef CONFIG_SWAP
 /* linux/mm/page_io.c */
 extern int swap_readpage(struct page *);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index cf7d027a8844..ddffc74cdebe 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -35,6 +35,7 @@
 #include <linux/limits.h>
 #include <linux/mutex.h>
 #include <linux/rbtree.h>
+#include <linux/shmem_fs.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
diff --git a/mm/swapfile.c b/mm/swapfile.c
index d537d29e9b7b..ff8dc1a18cb4 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -14,7 +14,7 @@
 #include <linux/vmalloc.h>
 #include <linux/pagemap.h>
 #include <linux/namei.h>
-#include <linux/shm.h>
+#include <linux/shmem_fs.h>
 #include <linux/blkdev.h>
 #include <linux/random.h>
 #include <linux/writeback.h>
-- 
cgit v1.2.3


From 94c1e62df4494b79782cb9c7279f827212d1de70 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Mon, 27 Jun 2011 16:18:03 -0700
Subject: tmpfs: take control of its truncate_range

2.6.35's new truncate convention gave tmpfs the opportunity to control
its file truncation, no longer enforced from outside by vmtruncate().
We shall want to build upon that, to handle pagecache and swap together.

Slightly redefine the ->truncate_range interface: let it now be called
between the unmap_mapping_range()s, with the filesystem responsible for
doing the truncate_inode_pages_range() from it - just as the filesystem
is nowadays responsible for doing that from its ->setattr.

Let's rename shmem_notify_change() to shmem_setattr().  Instead of
calling the generic truncate_setsize(), bring that code in so we can
call shmem_truncate_range() - which will later be updated to perform its
own variant of truncate_inode_pages_range().

Remove the punch_hole unmap_mapping_range() from shmem_truncate_range():
now that the COW's unmap_mapping_range() comes after ->truncate_range,
there is no need to call it a third time.

Export shmem_truncate_range() and add it to the list in shmem_fs.h, so
that i915_gem_object_truncate() can call it explicitly in future; get
this patch in first, then update drm/i915 once this is available (until
then, i915 will just be doing the truncate_inode_pages() twice).

Though introduced five years ago, no other filesystem is implementing
->truncate_range, and its only other user is madvise(,,MADV_REMOVE): we
expect to convert it to fallocate(,FALLOC_FL_PUNCH_HOLE,,) shortly,
whereupon ->truncate_range can be removed from inode_operations -
shmem_truncate_range() will help i915 across that transition too.

Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/shmem_fs.h |  1 +
 mm/shmem.c               | 51 +++++++++++++++++++++++++++---------------------
 mm/truncate.c            |  4 ++--
 3 files changed, 32 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index cae65dc42bcc..22a20af4d785 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -61,6 +61,7 @@ extern struct file *shmem_file_setup(const char *name,
 					loff_t size, unsigned long flags);
 extern int shmem_zero_setup(struct vm_area_struct *);
 extern int shmem_lock(struct file *file, int lock, struct user_struct *user);
+extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end);
 extern int shmem_unuse(swp_entry_t entry, struct page *page);
 extern void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff,
 					struct page **pagep, swp_entry_t *ent);
diff --git a/mm/shmem.c b/mm/shmem.c
index d221a1cfd7b1..f1714758ea96 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -539,7 +539,7 @@ static void shmem_free_pages(struct list_head *next)
 	} while (next);
 }
 
-static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
+void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
 {
 	struct shmem_inode_info *info = SHMEM_I(inode);
 	unsigned long idx;
@@ -562,6 +562,8 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
 	spinlock_t *punch_lock;
 	unsigned long upper_limit;
 
+	truncate_inode_pages_range(inode->i_mapping, start, end);
+
 	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
 	idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 	if (idx >= info->next_index)
@@ -738,16 +740,8 @@ done2:
 		 * lowered next_index.  Also, though shmem_getpage checks
 		 * i_size before adding to cache, no recheck after: so fix the
 		 * narrow window there too.
-		 *
-		 * Recalling truncate_inode_pages_range and unmap_mapping_range
-		 * every time for punch_hole (which never got a chance to clear
-		 * SHMEM_PAGEIN at the start of vmtruncate_range) is expensive,
-		 * yet hardly ever necessary: try to optimize them out later.
 		 */
 		truncate_inode_pages_range(inode->i_mapping, start, end);
-		if (punch_hole)
-			unmap_mapping_range(inode->i_mapping, start,
-							end - start, 1);
 	}
 
 	spin_lock(&info->lock);
@@ -766,22 +760,23 @@ done2:
 		shmem_free_pages(pages_to_free.next);
 	}
 }
+EXPORT_SYMBOL_GPL(shmem_truncate_range);
 
-static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
+static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = dentry->d_inode;
-	loff_t newsize = attr->ia_size;
 	int error;
 
 	error = inode_change_ok(inode, attr);
 	if (error)
 		return error;
 
-	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)
-					&& newsize != inode->i_size) {
+	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
+		loff_t oldsize = inode->i_size;
+		loff_t newsize = attr->ia_size;
 		struct page *page = NULL;
 
-		if (newsize < inode->i_size) {
+		if (newsize < oldsize) {
 			/*
 			 * If truncating down to a partial page, then
 			 * if that page is already allocated, hold it
@@ -810,12 +805,19 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
 				spin_unlock(&info->lock);
 			}
 		}
-
-		/* XXX(truncate): truncate_setsize should be called last */
-		truncate_setsize(inode, newsize);
+		if (newsize != oldsize) {
+			i_size_write(inode, newsize);
+			inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+		}
+		if (newsize < oldsize) {
+			loff_t holebegin = round_up(newsize, PAGE_SIZE);
+			unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
+			shmem_truncate_range(inode, newsize, (loff_t)-1);
+			/* unmap again to remove racily COWed private pages */
+			unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
+		}
 		if (page)
 			page_cache_release(page);
-		shmem_truncate_range(inode, newsize, (loff_t)-1);
 	}
 
 	setattr_copy(inode, attr);
@@ -832,7 +834,6 @@ static void shmem_evict_inode(struct inode *inode)
 	struct shmem_xattr *xattr, *nxattr;
 
 	if (inode->i_mapping->a_ops == &shmem_aops) {
-		truncate_inode_pages(inode->i_mapping, 0);
 		shmem_unacct_size(info->flags, inode->i_size);
 		inode->i_size = 0;
 		shmem_truncate_range(inode, 0, (loff_t)-1);
@@ -2706,7 +2707,7 @@ static const struct file_operations shmem_file_operations = {
 };
 
 static const struct inode_operations shmem_inode_operations = {
-	.setattr	= shmem_notify_change,
+	.setattr	= shmem_setattr,
 	.truncate_range	= shmem_truncate_range,
 #ifdef CONFIG_TMPFS_XATTR
 	.setxattr	= shmem_setxattr,
@@ -2739,7 +2740,7 @@ static const struct inode_operations shmem_dir_inode_operations = {
 	.removexattr	= shmem_removexattr,
 #endif
 #ifdef CONFIG_TMPFS_POSIX_ACL
-	.setattr	= shmem_notify_change,
+	.setattr	= shmem_setattr,
 	.check_acl	= generic_check_acl,
 #endif
 };
@@ -2752,7 +2753,7 @@ static const struct inode_operations shmem_special_inode_operations = {
 	.removexattr	= shmem_removexattr,
 #endif
 #ifdef CONFIG_TMPFS_POSIX_ACL
-	.setattr	= shmem_notify_change,
+	.setattr	= shmem_setattr,
 	.check_acl	= generic_check_acl,
 #endif
 };
@@ -2908,6 +2909,12 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
 	return 0;
 }
 
+void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
+{
+	truncate_inode_pages_range(inode->i_mapping, start, end);
+}
+EXPORT_SYMBOL_GPL(shmem_truncate_range);
+
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
 /**
  * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file
diff --git a/mm/truncate.c b/mm/truncate.c
index 5b4c3a4847e9..29a9b8a5a31a 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -619,9 +619,9 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
 	mutex_lock(&inode->i_mutex);
 	down_write(&inode->i_alloc_sem);
 	unmap_mapping_range(mapping, offset, (end - offset), 1);
-	truncate_inode_pages_range(mapping, offset, end);
-	unmap_mapping_range(mapping, offset, (end - offset), 1);
 	inode->i_op->truncate_range(inode, offset, end);
+	/* unmap again to remove racily COWed private pages */
+	unmap_mapping_range(mapping, offset, (end - offset), 1);
 	up_write(&inode->i_alloc_sem);
 	mutex_unlock(&inode->i_mutex);
 
-- 
cgit v1.2.3


From d9d90e5eb70e09903dadff42099b6c948f814050 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Mon, 27 Jun 2011 16:18:04 -0700
Subject: tmpfs: add shmem_read_mapping_page_gfp

Although it is used (by i915) on nothing but tmpfs, read_cache_page_gfp()
is unsuited to tmpfs, because it inserts a page into pagecache before
calling the filesystem's ->readpage: tmpfs may have pages in swapcache
which only it knows how to locate and switch to filecache.

At present tmpfs provides a ->readpage method, and copes with this by
copying pages; but soon we can simplify it by removing its ->readpage.
Provide shmem_read_mapping_page_gfp() now, ready for that transition,

Export shmem_read_mapping_page_gfp() and add it to list in shmem_fs.h,
with shmem_read_mapping_page() inline for the common mapping_gfp case.

(shmem_read_mapping_page_gfp or shmem_read_cache_page_gfp? Generally the
read_mapping_page functions use the mapping's ->readpage, and the
read_cache_page functions use the supplied filler, so I think
read_cache_page_gfp was slightly misnamed.)

Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/shmem_fs.h | 17 ++++++++++-------
 mm/shmem.c               | 23 +++++++++++++++++++++++
 2 files changed, 33 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 22a20af4d785..aa08fa8fd79b 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -3,15 +3,9 @@
 
 #include <linux/swap.h>
 #include <linux/mempolicy.h>
+#include <linux/pagemap.h>
 #include <linux/percpu_counter.h>
 
-struct page;
-struct file;
-struct inode;
-struct super_block;
-struct user_struct;
-struct vm_area_struct;
-
 /* inode in-kernel data */
 
 #define SHMEM_NR_DIRECT 16
@@ -61,9 +55,18 @@ extern struct file *shmem_file_setup(const char *name,
 					loff_t size, unsigned long flags);
 extern int shmem_zero_setup(struct vm_area_struct *);
 extern int shmem_lock(struct file *file, int lock, struct user_struct *user);
+extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
+					pgoff_t index, gfp_t gfp_mask);
 extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end);
 extern int shmem_unuse(swp_entry_t entry, struct page *page);
 extern void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff,
 					struct page **pagep, swp_entry_t *ent);
 
+static inline struct page *shmem_read_mapping_page(
+				struct address_space *mapping, pgoff_t index)
+{
+	return shmem_read_mapping_page_gfp(mapping, index,
+					mapping_gfp_mask(mapping));
+}
+
 #endif
diff --git a/mm/shmem.c b/mm/shmem.c
index f1714758ea96..fcedf5464eb7 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -3035,3 +3035,26 @@ int shmem_zero_setup(struct vm_area_struct *vma)
 	vma->vm_flags |= VM_CAN_NONLINEAR;
 	return 0;
 }
+
+/**
+ * shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags.
+ * @mapping:	the page's address_space
+ * @index:	the page index
+ * @gfp:	the page allocator flags to use if allocating
+ *
+ * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)",
+ * with any new page allocations done using the specified allocation flags.
+ * But read_cache_page_gfp() uses the ->readpage() method: which does not
+ * suit tmpfs, since it may have pages in swapcache, and needs to find those
+ * for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
+ *
+ * Provide a stub for those callers to start using now, then later
+ * flesh it out to call shmem_getpage() with additional gfp mask, when
+ * shmem_file_splice_read() is added and shmem_readpage() is removed.
+ */
+struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
+					 pgoff_t index, gfp_t gfp)
+{
+	return read_cache_page_gfp(mapping, index, gfp);
+}
+EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
-- 
cgit v1.2.3


From 507c5f1224014f9956e604ee8703b3bbea7da4a4 Mon Sep 17 00:00:00 2001
From: Chris Metcalf <cmetcalf@tilera.com>
Date: Mon, 27 Jun 2011 16:18:07 -0700
Subject: include/linux/compat.h: declare compat_sys_sendmmsg()

This is required for tilegx to be able to use the compat unistd.h header
where compat_sys_sendmmsg() is now mentioned.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compat.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/compat.h b/include/linux/compat.h
index ddcb7db38e67..846bb1792572 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -467,6 +467,8 @@ asmlinkage long compat_sys_setsockopt(int fd, int level, int optname,
 				      char __user *optval, unsigned int optlen);
 asmlinkage long compat_sys_sendmsg(int fd, struct compat_msghdr __user *msg,
 				   unsigned flags);
+asmlinkage long compat_sys_sendmmsg(int fd, struct compat_mmsghdr __user *mmsg,
+				    unsigned vlen, unsigned int flags);
 asmlinkage long compat_sys_recvmsg(int fd, struct compat_msghdr __user *msg,
 				   unsigned int flags);
 asmlinkage long compat_sys_recv(int fd, void __user *buf, size_t len,
-- 
cgit v1.2.3


From 08142579b6ca35883c1ed066a2681de6f6917062 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 27 Jun 2011 16:18:10 -0700
Subject: mm: fix assertion mapping->nrpages == 0 in end_writeback()

Under heavy memory and filesystem load, users observe the assertion
mapping->nrpages == 0 in end_writeback() trigger.  This can be caused by
page reclaim reclaiming the last page from a mapping in the following
race:

	CPU0				CPU1
  ...
  shrink_page_list()
    __remove_mapping()
      __delete_from_page_cache()
        radix_tree_delete()
					evict_inode()
					  truncate_inode_pages()
					    truncate_inode_pages_range()
					      pagevec_lookup() - finds nothing
					  end_writeback()
					    mapping->nrpages != 0 -> BUG
        page->mapping = NULL
        mapping->nrpages--

Fix the problem by doing a reliable check of mapping->nrpages under
mapping->tree_lock in end_writeback().

Analyzed by Jay <jinshan.xiong@whamcloud.com>, lost in LKML, and dug out
by Miklos Szeredi <mszeredi@suse.de>.

Cc: Jay <jinshan.xiong@whamcloud.com>
Cc: Miklos Szeredi <mszeredi@suse.de>
Signed-off-by: Jan Kara <jack@suse.cz>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/inode.c         | 7 +++++++
 include/linux/fs.h | 1 +
 mm/truncate.c      | 5 +++++
 3 files changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/fs/inode.c b/fs/inode.c
index 0f7e88a7803f..43566d17d1b8 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -423,7 +423,14 @@ EXPORT_SYMBOL(remove_inode_hash);
 void end_writeback(struct inode *inode)
 {
 	might_sleep();
+	/*
+	 * We have to cycle tree_lock here because reclaim can be still in the
+	 * process of removing the last page (in __delete_from_page_cache())
+	 * and we must not free mapping under it.
+	 */
+	spin_lock_irq(&inode->i_data.tree_lock);
 	BUG_ON(inode->i_data.nrpages);
+	spin_unlock_irq(&inode->i_data.tree_lock);
 	BUG_ON(!list_empty(&inode->i_data.private_list));
 	BUG_ON(!(inode->i_state & I_FREEING));
 	BUG_ON(inode->i_state & I_CLEAR);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 6e73e2e9ae33..b5b979247863 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -639,6 +639,7 @@ struct address_space {
 	struct prio_tree_root	i_mmap;		/* tree of private and shared mappings */
 	struct list_head	i_mmap_nonlinear;/*list VM_NONLINEAR mappings */
 	struct mutex		i_mmap_mutex;	/* protect tree, count, list */
+	/* Protected by tree_lock together with the radix tree */
 	unsigned long		nrpages;	/* number of total pages */
 	pgoff_t			writeback_index;/* writeback starts here */
 	const struct address_space_operations *a_ops;	/* methods */
diff --git a/mm/truncate.c b/mm/truncate.c
index 29a9b8a5a31a..e13f22efaad7 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -304,6 +304,11 @@ EXPORT_SYMBOL(truncate_inode_pages_range);
  * @lstart: offset from which to truncate
  *
  * Called under (and serialised by) inode->i_mutex.
+ *
+ * Note: When this function returns, there can be a page in the process of
+ * deletion (inside __delete_from_page_cache()) in the specified range.  Thus
+ * mapping->nrpages can be non-zero when this function returns even after
+ * truncation of the whole mapping.
  */
 void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
 {
-- 
cgit v1.2.3


From 15b493d11fcce3c5547e3d7fb6d90e11ffe12777 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Tue, 28 Jun 2011 09:48:48 +0200
Subject: drbd: fix limit define, we support 1 PiByte now

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 include/linux/drbd_limits.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h
index 246f576c981d..447c36752385 100644
--- a/include/linux/drbd_limits.h
+++ b/include/linux/drbd_limits.h
@@ -117,10 +117,10 @@
 /* drbdsetup XY resize -d Z
  * you are free to reduce the device size to nothing, if you want to.
  * the upper limit with 64bit kernel, enough ram and flexible meta data
- * is 16 TB, currently. */
+ * is 1 PiB, currently. */
 /* DRBD_MAX_SECTORS */
 #define DRBD_DISK_SIZE_SECT_MIN  0
-#define DRBD_DISK_SIZE_SECT_MAX  (16 * (2LLU << 30))
+#define DRBD_DISK_SIZE_SECT_MAX  (1 * (2LLU << 40))
 #define DRBD_DISK_SIZE_SECT_DEF  0 /* = disabled = no user size... */
 
 #define DRBD_ON_IO_ERROR_DEF EP_PASS_ON
-- 
cgit v1.2.3


From e4c2fb0d5776b58049d2556b456144a4db3fe5a9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 5 Jul 2011 10:56:32 +0200
Subject: sched: Disable (revert) SCHED_LOAD_SCALE increase

Alex reported that commit c8b281161df ("sched: Increase
SCHED_LOAD_SCALE resolution") caused a power usage regression
under light load as it increases the number of load-balance
operations and keeps idle cpus from staying idle.

Time has run out to find the root cause for this release so
disable the feature for v3.0 until we can figure out what
causes the problem.

Reported-by: "Alex, Shi" <alex.shi@intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Nikhil Rao <ncrao@google.com>
Cc: Ming Lei <tom.leiming@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/n/tip-m4onxn0sxnyn5iz9o88eskc3@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index a837b20ba190..496770a96487 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -808,7 +808,7 @@ enum cpu_idle_type {
  * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the
  * increased costs.
  */
-#if BITS_PER_LONG > 32
+#if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load  */
 # define SCHED_LOAD_RESOLUTION	10
 # define scale_load(w)		((w) << SCHED_LOAD_RESOLUTION)
 # define scale_load_down(w)	((w) >> SCHED_LOAD_RESOLUTION)
-- 
cgit v1.2.3


From 659fb32d1b67476f4ade25e9ea0e2642a5b9c4b5 Mon Sep 17 00:00:00 2001
From: Simon Guinot <sguinot@lacie.com>
Date: Wed, 6 Jul 2011 12:41:31 -0400
Subject: genirq: replace irq_gc_ack() with {set,clr}_bit variants (fwd)

This fixes a regression introduced by e59347a "arm: orion:
Use generic irq chip".

Depending on the device, interrupts acknowledgement is done by setting
or by clearing a dedicated register. Replace irq_gc_ack() with some
{set,clr}_bit variants allows to handle both cases.

Note that this patch affects the following SoCs: Davinci, Samsung and
Orion. Except for this last, the change is minor: irq_gc_ack() is just
renamed into irq_gc_ack_set_bit().

For the Orion SoCs, the edge GPIO interrupts support is currently
broken. irq_gc_ack() try to acknowledge a such interrupt by setting
the corresponding cause register bit. The Orion GPIO device expect the
opposite. To fix this issue, the irq_gc_ack_clr_bit() variant is used.

Tested on Network Space v2.

Reported-by: Joey Oravec <joravec@drewtech.com>
Signed-off-by: Simon Guinot <sguinot@lacie.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 arch/arm/mach-davinci/irq.c      |  2 +-
 arch/arm/plat-orion/gpio.c       |  2 +-
 arch/arm/plat-s5p/irq-gpioint.c  |  2 +-
 arch/arm/plat-samsung/irq-uart.c |  2 +-
 include/linux/irq.h              |  3 ++-
 kernel/irq/generic-chip.c        | 18 ++++++++++++++++--
 6 files changed, 22 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-davinci/irq.c b/arch/arm/mach-davinci/irq.c
index bfe68ec4e1a6..d8c1af025931 100644
--- a/arch/arm/mach-davinci/irq.c
+++ b/arch/arm/mach-davinci/irq.c
@@ -53,7 +53,7 @@ davinci_alloc_gc(void __iomem *base, unsigned int irq_start, unsigned int num)
 
 	gc = irq_alloc_generic_chip("AINTC", 1, irq_start, base, handle_edge_irq);
 	ct = gc->chip_types;
-	ct->chip.irq_ack = irq_gc_ack;
+	ct->chip.irq_ack = irq_gc_ack_set_bit;
 	ct->chip.irq_mask = irq_gc_mask_clr_bit;
 	ct->chip.irq_unmask = irq_gc_mask_set_bit;
 
diff --git a/arch/arm/plat-orion/gpio.c b/arch/arm/plat-orion/gpio.c
index 5b4fffab1eb4..41ab97ebe4cf 100644
--- a/arch/arm/plat-orion/gpio.c
+++ b/arch/arm/plat-orion/gpio.c
@@ -432,7 +432,7 @@ void __init orion_gpio_init(int gpio_base, int ngpio,
 	ct->regs.mask = ochip->mask_offset + GPIO_EDGE_MASK_OFF;
 	ct->regs.ack = GPIO_EDGE_CAUSE_OFF;
 	ct->type = IRQ_TYPE_EDGE_RISING | IRQ_TYPE_EDGE_FALLING;
-	ct->chip.irq_ack = irq_gc_ack;
+	ct->chip.irq_ack = irq_gc_ack_clr_bit;
 	ct->chip.irq_mask = irq_gc_mask_clr_bit;
 	ct->chip.irq_unmask = irq_gc_mask_set_bit;
 	ct->chip.irq_set_type = gpio_irq_set_type;
diff --git a/arch/arm/plat-s5p/irq-gpioint.c b/arch/arm/plat-s5p/irq-gpioint.c
index 135abda31c9a..327ab9f662e8 100644
--- a/arch/arm/plat-s5p/irq-gpioint.c
+++ b/arch/arm/plat-s5p/irq-gpioint.c
@@ -152,7 +152,7 @@ static __init int s5p_gpioint_add(struct s3c_gpio_chip *chip)
 	if (!gc)
 		return -ENOMEM;
 	ct = gc->chip_types;
-	ct->chip.irq_ack = irq_gc_ack;
+	ct->chip.irq_ack = irq_gc_ack_set_bit;
 	ct->chip.irq_mask = irq_gc_mask_set_bit;
 	ct->chip.irq_unmask = irq_gc_mask_clr_bit;
 	ct->chip.irq_set_type = s5p_gpioint_set_type,
diff --git a/arch/arm/plat-samsung/irq-uart.c b/arch/arm/plat-samsung/irq-uart.c
index 32582c0958e3..0e46588d847b 100644
--- a/arch/arm/plat-samsung/irq-uart.c
+++ b/arch/arm/plat-samsung/irq-uart.c
@@ -55,7 +55,7 @@ static void __init s3c_init_uart_irq(struct s3c_uart_irq *uirq)
 	gc = irq_alloc_generic_chip("s3c-uart", 1, uirq->base_irq, reg_base,
 				    handle_level_irq);
 	ct = gc->chip_types;
-	ct->chip.irq_ack = irq_gc_ack;
+	ct->chip.irq_ack = irq_gc_ack_set_bit;
 	ct->chip.irq_mask = irq_gc_mask_set_bit;
 	ct->chip.irq_unmask = irq_gc_mask_clr_bit;
 	ct->regs.ack = S3C64XX_UINTP;
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 8b4538446636..baa397eb9c33 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -676,7 +676,8 @@ void irq_gc_mask_disable_reg(struct irq_data *d);
 void irq_gc_mask_set_bit(struct irq_data *d);
 void irq_gc_mask_clr_bit(struct irq_data *d);
 void irq_gc_unmask_enable_reg(struct irq_data *d);
-void irq_gc_ack(struct irq_data *d);
+void irq_gc_ack_set_bit(struct irq_data *d);
+void irq_gc_ack_clr_bit(struct irq_data *d);
 void irq_gc_mask_disable_reg_and_ack(struct irq_data *d);
 void irq_gc_eoi(struct irq_data *d);
 int irq_gc_set_wake(struct irq_data *d, unsigned int on);
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index 31a9db711906..3a2cab407b93 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -101,10 +101,10 @@ void irq_gc_unmask_enable_reg(struct irq_data *d)
 }
 
 /**
- * irq_gc_ack - Ack pending interrupt
+ * irq_gc_ack_set_bit - Ack pending interrupt via setting bit
  * @d: irq_data
  */
-void irq_gc_ack(struct irq_data *d)
+void irq_gc_ack_set_bit(struct irq_data *d)
 {
 	struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
 	u32 mask = 1 << (d->irq - gc->irq_base);
@@ -114,6 +114,20 @@ void irq_gc_ack(struct irq_data *d)
 	irq_gc_unlock(gc);
 }
 
+/**
+ * irq_gc_ack_clr_bit - Ack pending interrupt via clearing bit
+ * @d: irq_data
+ */
+void irq_gc_ack_clr_bit(struct irq_data *d)
+{
+	struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
+	u32 mask = ~(1 << (d->irq - gc->irq_base));
+
+	irq_gc_lock(gc);
+	irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack);
+	irq_gc_unlock(gc);
+}
+
 /**
  * irq_gc_mask_disable_reg_and_ack- Mask and ack pending interrupt
  * @d: irq_data
-- 
cgit v1.2.3


From c902ce1bfb40d8b049bd2319b388b4b68b04bc27 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 7 Jul 2011 12:19:48 +0100
Subject: FS-Cache: Add a helper to bulk uncache pages on an inode

Add an FS-Cache helper to bulk uncache pages on an inode.  This will
only work for the circumstance where the pages in the cache correspond
1:1 with the pages attached to an inode's page cache.

This is required for CIFS and NFS: When disabling inode cookie, we were
returning the cookie and setting cifsi->fscache to NULL but failed to
invalidate any previously mapped pages.  This resulted in "Bad page
state" errors and manifested in other kind of errors when running
fsstress.  Fix it by uncaching mapped pages when we disable the inode
cookie.

This patch should fix the following oops and "Bad page state" errors
seen during fsstress testing.

  ------------[ cut here ]------------
  kernel BUG at fs/cachefiles/namei.c:201!
  invalid opcode: 0000 [#1] SMP
  Pid: 5, comm: kworker/u:0 Not tainted 2.6.38.7-30.fc15.x86_64 #1 Bochs Bochs
  RIP: 0010: cachefiles_walk_to_object+0x436/0x745 [cachefiles]
  RSP: 0018:ffff88002ce6dd00  EFLAGS: 00010282
  RAX: ffff88002ef165f0 RBX: ffff88001811f500 RCX: 0000000000000000
  RDX: 0000000000000000 RSI: 0000000000000100 RDI: 0000000000000282
  RBP: ffff88002ce6dda0 R08: 0000000000000100 R09: ffffffff81b3a300
  R10: 0000ffff00066c0a R11: 0000000000000003 R12: ffff88002ae54840
  R13: ffff88002ae54840 R14: ffff880029c29c00 R15: ffff88001811f4b0
  FS:  00007f394dd32720(0000) GS:ffff88002ef00000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
  CR2: 00007fffcb62ddf8 CR3: 000000001825f000 CR4: 00000000000006e0
  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
  DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
  Process kworker/u:0 (pid: 5, threadinfo ffff88002ce6c000, task ffff88002ce55cc0)
  Stack:
   0000000000000246 ffff88002ce55cc0 ffff88002ce6dd58 ffff88001815dc00
   ffff8800185246c0 ffff88001811f618 ffff880029c29d18 ffff88001811f380
   ffff88002ce6dd50 ffffffff814757e4 ffff88002ce6dda0 ffffffff8106ac56
  Call Trace:
   cachefiles_lookup_object+0x78/0xd4 [cachefiles]
   fscache_lookup_object+0x131/0x16d [fscache]
   fscache_object_work_func+0x1bc/0x669 [fscache]
   process_one_work+0x186/0x298
   worker_thread+0xda/0x15d
   kthread+0x84/0x8c
   kernel_thread_helper+0x4/0x10
  RIP  cachefiles_walk_to_object+0x436/0x745 [cachefiles]
  ---[ end trace 1d481c9af1804caa ]---

I tested the uncaching by the following means:

 (1) Create a big file on my NFS server (104857600 bytes).

 (2) Read the file into the cache with md5sum on the NFS client.  Look in
     /proc/fs/fscache/stats:

	Pages  : mrk=25601 unc=0

 (3) Open the file for read/write ("bash 5<>/warthog/bigfile").  Look in proc
     again:

	Pages  : mrk=25601 unc=25601

Reported-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-and-Tested-by: Suresh Jayaraman <sjayaraman@suse.de>
cc: stable@kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/filesystems/caching/netfs-api.txt | 16 +++++++++
 fs/cifs/fscache.c                               |  1 +
 fs/fscache/page.c                               | 44 +++++++++++++++++++++++++
 fs/nfs/fscache.c                                |  8 ++---
 include/linux/fscache.h                         | 21 ++++++++++++
 5 files changed, 85 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/caching/netfs-api.txt b/Documentation/filesystems/caching/netfs-api.txt
index a167ab876c35..7cc6bf2871eb 100644
--- a/Documentation/filesystems/caching/netfs-api.txt
+++ b/Documentation/filesystems/caching/netfs-api.txt
@@ -673,6 +673,22 @@ storage request to complete, or it may attempt to cancel the storage request -
 in which case the page will not be stored in the cache this time.
 
 
+BULK INODE PAGE UNCACHE
+-----------------------
+
+A convenience routine is provided to perform an uncache on all the pages
+attached to an inode.  This assumes that the pages on the inode correspond on a
+1:1 basis with the pages in the cache.
+
+	void fscache_uncache_all_inode_pages(struct fscache_cookie *cookie,
+					     struct inode *inode);
+
+This takes the netfs cookie that the pages were cached with and the inode that
+the pages are attached to.  This function will wait for pages to finish being
+written to the cache and for the cache to finish with the page generally.  No
+error is returned.
+
+
 ==========================
 INDEX AND DATA FILE UPDATE
 ==========================
diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c
index 816696621ec9..42e5363b4102 100644
--- a/fs/cifs/fscache.c
+++ b/fs/cifs/fscache.c
@@ -92,6 +92,7 @@ static void cifs_fscache_disable_inode_cookie(struct inode *inode)
 
 	if (cifsi->fscache) {
 		cFYI(1, "%s: (0x%p)", __func__, cifsi->fscache);
+		fscache_uncache_all_inode_pages(cifsi->fscache, inode);
 		fscache_relinquish_cookie(cifsi->fscache, 1);
 		cifsi->fscache = NULL;
 	}
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index a2a5d19ece6a..2f343b4d7a7d 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -954,3 +954,47 @@ void fscache_mark_pages_cached(struct fscache_retrieval *op,
 	pagevec_reinit(pagevec);
 }
 EXPORT_SYMBOL(fscache_mark_pages_cached);
+
+/*
+ * Uncache all the pages in an inode that are marked PG_fscache, assuming them
+ * to be associated with the given cookie.
+ */
+void __fscache_uncache_all_inode_pages(struct fscache_cookie *cookie,
+				       struct inode *inode)
+{
+	struct address_space *mapping = inode->i_mapping;
+	struct pagevec pvec;
+	pgoff_t next;
+	int i;
+
+	_enter("%p,%p", cookie, inode);
+
+	if (!mapping || mapping->nrpages == 0) {
+		_leave(" [no pages]");
+		return;
+	}
+
+	pagevec_init(&pvec, 0);
+	next = 0;
+	while (next <= (loff_t)-1 &&
+	       pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)
+	       ) {
+		for (i = 0; i < pagevec_count(&pvec); i++) {
+			struct page *page = pvec.pages[i];
+			pgoff_t page_index = page->index;
+
+			ASSERTCMP(page_index, >=, next);
+			next = page_index + 1;
+
+			if (PageFsCache(page)) {
+				__fscache_wait_on_page_write(cookie, page);
+				__fscache_uncache_page(cookie, page);
+			}
+		}
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+
+	_leave("");
+}
+EXPORT_SYMBOL(__fscache_uncache_all_inode_pages);
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index ce153a6b3aec..419119c371bf 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -259,12 +259,10 @@ static void nfs_fscache_disable_inode_cookie(struct inode *inode)
 		dfprintk(FSCACHE,
 			 "NFS: nfsi 0x%p turning cache off\n", NFS_I(inode));
 
-		/* Need to invalidate any mapped pages that were read in before
-		 * turning off the cache.
+		/* Need to uncache any pages attached to this inode that
+		 * fscache knows about before turning off the cache.
 		 */
-		if (inode->i_mapping && inode->i_mapping->nrpages)
-			invalidate_inode_pages2(inode->i_mapping);
-
+		fscache_uncache_all_inode_pages(NFS_I(inode)->fscache, inode);
 		nfs_fscache_zap_inode_cookie(inode);
 	}
 }
diff --git a/include/linux/fscache.h b/include/linux/fscache.h
index 7c4d72f5581f..9ec20dec3353 100644
--- a/include/linux/fscache.h
+++ b/include/linux/fscache.h
@@ -204,6 +204,8 @@ extern bool __fscache_check_page_write(struct fscache_cookie *, struct page *);
 extern void __fscache_wait_on_page_write(struct fscache_cookie *, struct page *);
 extern bool __fscache_maybe_release_page(struct fscache_cookie *, struct page *,
 					 gfp_t);
+extern void __fscache_uncache_all_inode_pages(struct fscache_cookie *,
+					      struct inode *);
 
 /**
  * fscache_register_netfs - Register a filesystem as desiring caching services
@@ -643,4 +645,23 @@ bool fscache_maybe_release_page(struct fscache_cookie *cookie,
 	return false;
 }
 
+/**
+ * fscache_uncache_all_inode_pages - Uncache all an inode's pages
+ * @cookie: The cookie representing the inode's cache object.
+ * @inode: The inode to uncache pages from.
+ *
+ * Uncache all the pages in an inode that are marked PG_fscache, assuming them
+ * to be associated with the given cookie.
+ *
+ * This function may sleep.  It will wait for pages that are being written out
+ * and will wait whilst the PG_fscache mark is removed by the cache.
+ */
+static inline
+void fscache_uncache_all_inode_pages(struct fscache_cookie *cookie,
+				     struct inode *inode)
+{
+	if (fscache_cookie_valid(cookie))
+		__fscache_uncache_all_inode_pages(cookie, inode);
+}
+
 #endif /* _LINUX_FSCACHE_H */
-- 
cgit v1.2.3


From f607e7fc5fb94d92030c4527287e9c149ddf9e65 Mon Sep 17 00:00:00 2001
From: Jean-François Dagenais <dagenaisj@sonatest.com>
Date: Fri, 8 Jul 2011 15:39:44 -0700
Subject: w1: ds1wm: add a reset recovery parameter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This fixes a regression in 3.0 reported by Paul Parsons regarding the
removal of the msleep(1) in the ds1wm_reset() function:

: The linux-3.0-rc4 DS1WM 1-wire driver is logging "bus error, retrying"
: error messages on an HP iPAQ hx4700 PDA (XScale-PXA270):
:
: <snip>
: Driver for 1-wire Dallas network protocol.
: DS1WM w1 busmaster driver - (c) 2004 Szabolcs Gyurko
: 1-Wire driver for the DS2760 battery monitor  chip  - (c) 2004-2005, Szabolcs Gyurko
: ds1wm ds1wm: pass: 1 bus error, retrying
: ds1wm ds1wm: pass: 2 bus error, retrying
: ds1wm ds1wm: pass: 3 bus error, retrying
: ds1wm ds1wm: pass: 4 bus error, retrying
: ds1wm ds1wm: pass: 5 bus error, retrying
: ...
:
: The visible result is that the battery charging LED is erratic; sometimes
: it works, mostly it doesn't.
:
: The linux-2.6.39 DS1WM 1-wire driver worked OK.  I haven't tried 3.0-rc1,
: 3.0-rc2, or 3.0-rc3.

This sleep should not be required on normal circuitry provided the
pull-ups on the bus are correctly adapted to the slaves.  Unfortunately,
this is not always the case.  The sleep is restored but as a parameter to
the probe function in the pdata.

[akpm@linux-foundation.org: coding-style fixes]
Reported-by: Paul Parsons <lost.distance@yahoo.com>
Tested-by: Paul Parsons <lost.distance@yahoo.com>
Signed-off-by: Jean-François Dagenais <dagenaisj@sonatest.com>
Cc: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/mfd/asic3.c        | 1 +
 drivers/mfd/htc-pasic3.c   | 1 +
 drivers/w1/masters/ds1wm.c | 5 +++++
 include/linux/mfd/ds1wm.h  | 7 +++++++
 4 files changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mfd/asic3.c b/drivers/mfd/asic3.c
index c27fd1fc3b86..c71ae09430c5 100644
--- a/drivers/mfd/asic3.c
+++ b/drivers/mfd/asic3.c
@@ -619,6 +619,7 @@ static void asic3_clk_disable(struct asic3 *asic, struct asic3_clk *clk)
 /* MFD cells (SPI, PWM, LED, DS1WM, MMC) */
 static struct ds1wm_driver_data ds1wm_pdata = {
 	.active_high = 1,
+	.reset_recover_delay = 1,
 };
 
 static struct resource ds1wm_resources[] = {
diff --git a/drivers/mfd/htc-pasic3.c b/drivers/mfd/htc-pasic3.c
index 2808bd125d13..04c7093d6499 100644
--- a/drivers/mfd/htc-pasic3.c
+++ b/drivers/mfd/htc-pasic3.c
@@ -99,6 +99,7 @@ static int ds1wm_disable(struct platform_device *pdev)
 
 static struct ds1wm_driver_data ds1wm_pdata = {
 	.active_high = 0,
+	.reset_recover_delay = 1,
 };
 
 static struct resource ds1wm_resources[] __initdata = {
diff --git a/drivers/w1/masters/ds1wm.c b/drivers/w1/masters/ds1wm.c
index ad57593d224a..a0c8965c1a79 100644
--- a/drivers/w1/masters/ds1wm.c
+++ b/drivers/w1/masters/ds1wm.c
@@ -109,6 +109,7 @@ struct ds1wm_data {
 	/* byte to write that makes all intr disabled, */
 	/* considering active_state (IAS) (optimization) */
 	u8       int_en_reg_none;
+	unsigned int reset_recover_delay; /* see ds1wm.h */
 };
 
 static inline void ds1wm_write_register(struct ds1wm_data *ds1wm_data, u32 reg,
@@ -187,6 +188,9 @@ static int ds1wm_reset(struct ds1wm_data *ds1wm_data)
 		return 1;
 	}
 
+	if (ds1wm_data->reset_recover_delay)
+		msleep(ds1wm_data->reset_recover_delay);
+
 	return 0;
 }
 
@@ -490,6 +494,7 @@ static int ds1wm_probe(struct platform_device *pdev)
 	}
 	ds1wm_data->irq = res->start;
 	ds1wm_data->int_en_reg_none = (plat->active_high ? DS1WM_INTEN_IAS : 0);
+	ds1wm_data->reset_recover_delay = plat->reset_recover_delay;
 
 	if (res->flags & IORESOURCE_IRQ_HIGHEDGE)
 		irq_set_irq_type(ds1wm_data->irq, IRQ_TYPE_EDGE_RISING);
diff --git a/include/linux/mfd/ds1wm.h b/include/linux/mfd/ds1wm.h
index be469a357cbb..38a372a0e285 100644
--- a/include/linux/mfd/ds1wm.h
+++ b/include/linux/mfd/ds1wm.h
@@ -3,4 +3,11 @@
 struct ds1wm_driver_data {
 	int active_high;
 	int clock_rate;
+	/* in milliseconds, the amount of time to */
+	/* sleep following a reset pulse. Zero    */
+	/* should work if your bus devices recover*/
+	/* time respects the 1-wire spec since the*/
+	/* ds1wm implements the precise timings of*/
+	/* a reset pulse/presence detect sequence.*/
+	unsigned int reset_recover_delay;
 };
-- 
cgit v1.2.3


From a63fdc5156f2ef5690b6cf03d72b0c4917efbba7 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Tue, 14 Jun 2011 10:57:50 +1000
Subject: mm: Move definition of MIN_MEMORY_BLOCK_SIZE to a header

The macro MIN_MEMORY_BLOCK_SIZE is currently defined twice in two .c
files, and I need it in a third one to fix a powerpc bug, so let's
first move it into a header

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Acked-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/init_64.c  | 3 +--
 drivers/base/memory.c  | 1 -
 include/linux/memory.h | 2 ++
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index d865c4aeec55..bbaaa005bf0e 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -28,6 +28,7 @@
 #include <linux/poison.h>
 #include <linux/dma-mapping.h>
 #include <linux/module.h>
+#include <linux/memory.h>
 #include <linux/memory_hotplug.h>
 #include <linux/nmi.h>
 #include <linux/gfp.h>
@@ -895,8 +896,6 @@ const char *arch_vma_name(struct vm_area_struct *vma)
 }
 
 #ifdef CONFIG_X86_UV
-#define MIN_MEMORY_BLOCK_SIZE   (1 << SECTION_SIZE_BITS)
-
 unsigned long memory_block_size_bytes(void)
 {
 	if (is_uv_system()) {
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 9f9b2359f718..45d7c8fc73bd 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -30,7 +30,6 @@
 static DEFINE_MUTEX(mem_sysfs_mutex);
 
 #define MEMORY_CLASS_NAME	"memory"
-#define MIN_MEMORY_BLOCK_SIZE	(1 << SECTION_SIZE_BITS)
 
 static int sections_per_block;
 
diff --git a/include/linux/memory.h b/include/linux/memory.h
index e1e3b2b84f85..935699b30b7c 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -20,6 +20,8 @@
 #include <linux/compiler.h>
 #include <linux/mutex.h>
 
+#define MIN_MEMORY_BLOCK_SIZE     (1 << SECTION_SIZE_BITS)
+
 struct memory_block {
 	unsigned long start_section_nr;
 	unsigned long end_section_nr;
-- 
cgit v1.2.3


From f39b2dd9d065151a04f5996656d1f27a7eb32d45 Mon Sep 17 00:00:00 2001
From: Philip Rakity <prakity@marvell.com>
Date: Thu, 7 Jul 2011 09:04:55 -0700
Subject: mmc: core: Bus width testing needs to handle suspend/resume

On reading the ext_csd for the first time (in 1 bit mode), save the
ext_csd information needed for bus width compare.

On every pass we make re-reading the ext_csd, compare the data
against the saved ext_csd data.

This fixes a regression introduced in 3.0-rc1 by 08ee80cc397ac1a3
("mmc: core: eMMC bus width may not work on all platforms"), which
incorrectly assumed we would be re-reading the ext_csd at resume-
time.

Signed-off-by: Philip Rakity <prakity@marvell.com>
Tested-by: Jaehoon Chung <jh80.chung@samsung.com>
Signed-off-by: Chris Ball <cjb@laptop.org>
---
 drivers/mmc/core/mmc.c   | 77 +++++++++++++++++++++++++++++++-----------------
 include/linux/mmc/card.h | 13 ++++++++
 2 files changed, 63 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c
index 2a7e43bc796d..aa7d1d79b8c5 100644
--- a/drivers/mmc/core/mmc.c
+++ b/drivers/mmc/core/mmc.c
@@ -247,12 +247,12 @@ static int mmc_read_ext_csd(struct mmc_card *card, u8 *ext_csd)
 		return 0;
 
 	/* Version is coded in the CSD_STRUCTURE byte in the EXT_CSD register */
+	card->ext_csd.raw_ext_csd_structure = ext_csd[EXT_CSD_STRUCTURE];
 	if (card->csd.structure == 3) {
-		int ext_csd_struct = ext_csd[EXT_CSD_STRUCTURE];
-		if (ext_csd_struct > 2) {
+		if (card->ext_csd.raw_ext_csd_structure > 2) {
 			printk(KERN_ERR "%s: unrecognised EXT_CSD structure "
 				"version %d\n", mmc_hostname(card->host),
-					ext_csd_struct);
+					card->ext_csd.raw_ext_csd_structure);
 			err = -EINVAL;
 			goto out;
 		}
@@ -266,6 +266,10 @@ static int mmc_read_ext_csd(struct mmc_card *card, u8 *ext_csd)
 		goto out;
 	}
 
+	card->ext_csd.raw_sectors[0] = ext_csd[EXT_CSD_SEC_CNT + 0];
+	card->ext_csd.raw_sectors[1] = ext_csd[EXT_CSD_SEC_CNT + 1];
+	card->ext_csd.raw_sectors[2] = ext_csd[EXT_CSD_SEC_CNT + 2];
+	card->ext_csd.raw_sectors[3] = ext_csd[EXT_CSD_SEC_CNT + 3];
 	if (card->ext_csd.rev >= 2) {
 		card->ext_csd.sectors =
 			ext_csd[EXT_CSD_SEC_CNT + 0] << 0 |
@@ -277,7 +281,7 @@ static int mmc_read_ext_csd(struct mmc_card *card, u8 *ext_csd)
 		if (card->ext_csd.sectors > (2u * 1024 * 1024 * 1024) / 512)
 			mmc_card_set_blockaddr(card);
 	}
-
+	card->ext_csd.raw_card_type = ext_csd[EXT_CSD_CARD_TYPE];
 	switch (ext_csd[EXT_CSD_CARD_TYPE] & EXT_CSD_CARD_TYPE_MASK) {
 	case EXT_CSD_CARD_TYPE_DDR_52 | EXT_CSD_CARD_TYPE_52 |
 	     EXT_CSD_CARD_TYPE_26:
@@ -307,6 +311,11 @@ static int mmc_read_ext_csd(struct mmc_card *card, u8 *ext_csd)
 			mmc_hostname(card->host));
 	}
 
+	card->ext_csd.raw_s_a_timeout = ext_csd[EXT_CSD_S_A_TIMEOUT];
+	card->ext_csd.raw_erase_timeout_mult =
+		ext_csd[EXT_CSD_ERASE_TIMEOUT_MULT];
+	card->ext_csd.raw_hc_erase_grp_size =
+		ext_csd[EXT_CSD_HC_ERASE_GRP_SIZE];
 	if (card->ext_csd.rev >= 3) {
 		u8 sa_shift = ext_csd[EXT_CSD_S_A_TIMEOUT];
 		card->ext_csd.part_config = ext_csd[EXT_CSD_PART_CONFIG];
@@ -334,6 +343,16 @@ static int mmc_read_ext_csd(struct mmc_card *card, u8 *ext_csd)
 		card->ext_csd.boot_size = ext_csd[EXT_CSD_BOOT_MULT] << 17;
 	}
 
+	card->ext_csd.raw_hc_erase_gap_size =
+		ext_csd[EXT_CSD_PARTITION_ATTRIBUTE];
+	card->ext_csd.raw_sec_trim_mult =
+		ext_csd[EXT_CSD_SEC_TRIM_MULT];
+	card->ext_csd.raw_sec_erase_mult =
+		ext_csd[EXT_CSD_SEC_ERASE_MULT];
+	card->ext_csd.raw_sec_feature_support =
+		ext_csd[EXT_CSD_SEC_FEATURE_SUPPORT];
+	card->ext_csd.raw_trim_mult =
+		ext_csd[EXT_CSD_TRIM_MULT];
 	if (card->ext_csd.rev >= 4) {
 		/*
 		 * Enhanced area feature support -- check whether the eMMC
@@ -341,7 +360,7 @@ static int mmc_read_ext_csd(struct mmc_card *card, u8 *ext_csd)
 		 * area offset and size to user by adding sysfs interface.
 		 */
 		if ((ext_csd[EXT_CSD_PARTITION_SUPPORT] & 0x2) &&
-				(ext_csd[EXT_CSD_PARTITION_ATTRIBUTE] & 0x1)) {
+		    (ext_csd[EXT_CSD_PARTITION_ATTRIBUTE] & 0x1)) {
 			u8 hc_erase_grp_sz =
 				ext_csd[EXT_CSD_HC_ERASE_GRP_SIZE];
 			u8 hc_wp_grp_sz =
@@ -401,17 +420,17 @@ static inline void mmc_free_ext_csd(u8 *ext_csd)
 }
 
 
-static int mmc_compare_ext_csds(struct mmc_card *card, u8 *ext_csd,
-			unsigned bus_width)
+static int mmc_compare_ext_csds(struct mmc_card *card, unsigned bus_width)
 {
 	u8 *bw_ext_csd;
 	int err;
 
+	if (bus_width == MMC_BUS_WIDTH_1)
+		return 0;
+
 	err = mmc_get_ext_csd(card, &bw_ext_csd);
-	if (err)
-		return err;
 
-	if ((ext_csd == NULL || bw_ext_csd == NULL)) {
+	if (err || bw_ext_csd == NULL) {
 		if (bus_width != MMC_BUS_WIDTH_1)
 			err = -EINVAL;
 		goto out;
@@ -421,35 +440,40 @@ static int mmc_compare_ext_csds(struct mmc_card *card, u8 *ext_csd,
 		goto out;
 
 	/* only compare read only fields */
-	err = (!(ext_csd[EXT_CSD_PARTITION_SUPPORT] ==
+	err = (!(card->ext_csd.raw_partition_support ==
 			bw_ext_csd[EXT_CSD_PARTITION_SUPPORT]) &&
-		(ext_csd[EXT_CSD_ERASED_MEM_CONT] ==
+		(card->ext_csd.raw_erased_mem_count ==
 			bw_ext_csd[EXT_CSD_ERASED_MEM_CONT]) &&
-		(ext_csd[EXT_CSD_REV] ==
+		(card->ext_csd.rev ==
 			bw_ext_csd[EXT_CSD_REV]) &&
-		(ext_csd[EXT_CSD_STRUCTURE] ==
+		(card->ext_csd.raw_ext_csd_structure ==
 			bw_ext_csd[EXT_CSD_STRUCTURE]) &&
-		(ext_csd[EXT_CSD_CARD_TYPE] ==
+		(card->ext_csd.raw_card_type ==
 			bw_ext_csd[EXT_CSD_CARD_TYPE]) &&
-		(ext_csd[EXT_CSD_S_A_TIMEOUT] ==
+		(card->ext_csd.raw_s_a_timeout ==
 			bw_ext_csd[EXT_CSD_S_A_TIMEOUT]) &&
-		(ext_csd[EXT_CSD_HC_WP_GRP_SIZE] ==
+		(card->ext_csd.raw_hc_erase_gap_size ==
 			bw_ext_csd[EXT_CSD_HC_WP_GRP_SIZE]) &&
-		(ext_csd[EXT_CSD_ERASE_TIMEOUT_MULT] ==
+		(card->ext_csd.raw_erase_timeout_mult ==
 			bw_ext_csd[EXT_CSD_ERASE_TIMEOUT_MULT]) &&
-		(ext_csd[EXT_CSD_HC_ERASE_GRP_SIZE] ==
+		(card->ext_csd.raw_hc_erase_grp_size ==
 			bw_ext_csd[EXT_CSD_HC_ERASE_GRP_SIZE]) &&
-		(ext_csd[EXT_CSD_SEC_TRIM_MULT] ==
+		(card->ext_csd.raw_sec_trim_mult ==
 			bw_ext_csd[EXT_CSD_SEC_TRIM_MULT]) &&
-		(ext_csd[EXT_CSD_SEC_ERASE_MULT] ==
+		(card->ext_csd.raw_sec_erase_mult ==
 			bw_ext_csd[EXT_CSD_SEC_ERASE_MULT]) &&
-		(ext_csd[EXT_CSD_SEC_FEATURE_SUPPORT] ==
+		(card->ext_csd.raw_sec_feature_support ==
 			bw_ext_csd[EXT_CSD_SEC_FEATURE_SUPPORT]) &&
-		(ext_csd[EXT_CSD_TRIM_MULT] ==
+		(card->ext_csd.raw_trim_mult ==
 			bw_ext_csd[EXT_CSD_TRIM_MULT]) &&
-		memcmp(&ext_csd[EXT_CSD_SEC_CNT],
-		       &bw_ext_csd[EXT_CSD_SEC_CNT],
-		       4) != 0);
+		(card->ext_csd.raw_sectors[0] ==
+			bw_ext_csd[EXT_CSD_SEC_CNT + 0]) &&
+		(card->ext_csd.raw_sectors[1] ==
+			bw_ext_csd[EXT_CSD_SEC_CNT + 1]) &&
+		(card->ext_csd.raw_sectors[2] ==
+			bw_ext_csd[EXT_CSD_SEC_CNT + 2]) &&
+		(card->ext_csd.raw_sectors[3] ==
+			bw_ext_csd[EXT_CSD_SEC_CNT + 3]));
 	if (err)
 		err = -EINVAL;
 
@@ -770,7 +794,6 @@ static int mmc_init_card(struct mmc_host *host, u32 ocr,
 				 */
 				if (!(host->caps & MMC_CAP_BUS_WIDTH_TEST))
 					err = mmc_compare_ext_csds(card,
-						ext_csd,
 						bus_width);
 				else
 					err = mmc_bus_test(card, bus_width);
diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
index c6927a4d157f..6ad43554ac05 100644
--- a/include/linux/mmc/card.h
+++ b/include/linux/mmc/card.h
@@ -64,6 +64,19 @@ struct mmc_ext_csd {
 	unsigned long long	enhanced_area_offset;	/* Units: Byte */
 	unsigned int		enhanced_area_size;	/* Units: KB */
 	unsigned int		boot_size;		/* in bytes */
+	u8			raw_partition_support;	/* 160 */
+	u8			raw_erased_mem_count;	/* 181 */
+	u8			raw_ext_csd_structure;	/* 194 */
+	u8			raw_card_type;		/* 196 */
+	u8			raw_s_a_timeout;		/* 217 */
+	u8			raw_hc_erase_gap_size;	/* 221 */
+	u8			raw_erase_timeout_mult;	/* 223 */
+	u8			raw_hc_erase_grp_size;	/* 224 */
+	u8			raw_sec_trim_mult;	/* 229 */
+	u8			raw_sec_erase_mult;	/* 230 */
+	u8			raw_sec_feature_support;/* 231 */
+	u8			raw_trim_mult;		/* 232 */
+	u8			raw_sectors[4];		/* 212 - 4 bytes */
 };
 
 struct sd_scr {
-- 
cgit v1.2.3


From 62f2a3a48bdc99822a24356e667e52c30df287c9 Mon Sep 17 00:00:00 2001
From: Michał Mirosław <mirq-linux@rere.qmqm.pl>
Date: Wed, 13 Jul 2011 14:10:29 +0000
Subject: net: remove NETIF_F_ALL_TX_OFFLOADS
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There is no software fallback implemented for SCTP or FCoE checksumming,
and so it should not be passed on by software devices like bridge or bonding.

For VLAN devices, this is different. First, the driver for underlying device
should be prepared to get offloaded packets even when the feature is disabled
(especially if it advertises it in vlan_features). Second, devices under
VLANs do not get replaced without tearing down the VLAN first.

This fixes a mess I accidentally introduced while converting bonding to
ndo_fix_features.

NETIF_F_SOFT_FEATURES are removed from BOND_VLAN_FEATURES because they
are unused as of commit 712ae51afd.

Signed-off-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c | 6 +++---
 include/linux/netdevice.h       | 6 ------
 net/8021q/vlan_dev.c            | 6 +++++-
 3 files changed, 8 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index eafe44a528ac..63c22b0bb5ad 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1428,9 +1428,9 @@ out:
 	return features;
 }
 
-#define BOND_VLAN_FEATURES	(NETIF_F_ALL_TX_OFFLOADS | \
-				 NETIF_F_SOFT_FEATURES | \
-				 NETIF_F_LRO)
+#define BOND_VLAN_FEATURES	(NETIF_F_ALL_CSUM | NETIF_F_SG | \
+				 NETIF_F_FRAGLIST | NETIF_F_ALL_TSO | \
+				 NETIF_F_HIGHDMA | NETIF_F_LRO)
 
 static void bond_compute_features(struct bonding *bond)
 {
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 54b8b4d7b68f..9e19477991ad 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1097,12 +1097,6 @@ struct net_device {
 #define NETIF_F_ALL_FCOE	(NETIF_F_FCOE_CRC | NETIF_F_FCOE_MTU | \
 				 NETIF_F_FSO)
 
-#define NETIF_F_ALL_TX_OFFLOADS	(NETIF_F_ALL_CSUM | NETIF_F_SG | \
-				 NETIF_F_FRAGLIST | NETIF_F_ALL_TSO | \
-				 NETIF_F_HIGHDMA | \
-				 NETIF_F_SCTP_CSUM | \
-				 NETIF_F_ALL_FCOE)
-
 	/*
 	 * If one device supports one of these features, then enable them
 	 * for all in netdev_increment_features.
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 86bff9b1ac47..6e82148edfc8 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -528,7 +528,11 @@ static int vlan_dev_init(struct net_device *dev)
 					  (1<<__LINK_STATE_DORMANT))) |
 		      (1<<__LINK_STATE_PRESENT);
 
-	dev->hw_features = NETIF_F_ALL_TX_OFFLOADS;
+	dev->hw_features = NETIF_F_ALL_CSUM | NETIF_F_SG |
+			   NETIF_F_FRAGLIST | NETIF_F_ALL_TSO |
+			   NETIF_F_HIGHDMA | NETIF_F_SCTP_CSUM |
+			   NETIF_F_ALL_FCOE;
+
 	dev->features |= real_dev->vlan_features | NETIF_F_LLTX;
 	dev->gso_max_size = real_dev->gso_max_size;
 
-- 
cgit v1.2.3


From a07c7964a29b6dc515b120f1e1c223ac2f8666f5 Mon Sep 17 00:00:00 2001
From: WANG Cong <xiyou.wangcong@gmail.com>
Date: Sat, 16 Jul 2011 22:22:20 +0000
Subject: include/linux/sdla.h: remove the prototype of sdla()

`make headers_check` complains that

linux-2.6/usr/include/linux/sdla.h:116: userspace cannot reference
function or variable defined in the kernel

this is due to that there is no such a kernel function,

void sdla(void *cfg_info, char *dev, struct frad_conf *conf, int quiet);

I don't know why we have it in a kernel header, so remove it.

Signed-off-by: WANG Cong <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sdla.h | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sdla.h b/include/linux/sdla.h
index 564acd3a71c1..9995c7fc3f60 100644
--- a/include/linux/sdla.h
+++ b/include/linux/sdla.h
@@ -112,11 +112,7 @@ struct sdla_dlci_conf {
    short Tb_max;
 };
 
-#ifndef __KERNEL__
-
-void sdla(void *cfg_info, char *dev, struct frad_conf *conf, int quiet);
-
-#else
+#ifdef __KERNEL__
 
 /* important Z80 window addresses */
 #define SDLA_CONTROL_WND		0xE000
-- 
cgit v1.2.3


From 7765be2fec0f476fcd61812d5f9406b04c765020 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paul.mckenney@linaro.org>
Date: Thu, 14 Jul 2011 12:24:11 -0700
Subject: rcu: Fix RCU_BOOST race handling current->rcu_read_unlock_special

The RCU_BOOST commits for TREE_PREEMPT_RCU introduced an other-task
write to a new RCU_READ_UNLOCK_BOOSTED bit in the task_struct structure's
->rcu_read_unlock_special field, but, as noted by Steven Rostedt, without
correctly synchronizing all accesses to ->rcu_read_unlock_special.
This could result in bits in ->rcu_read_unlock_special being spuriously
set and cleared due to conflicting accesses, which in turn could result
in deadlocks between the rcu_node structure's ->lock and the scheduler's
rq and pi locks.  These deadlocks would result from RCU incorrectly
believing that the just-ended RCU read-side critical section had been
preempted and/or boosted.  If that RCU read-side critical section was
executed with either rq or pi locks held, RCU's ensuing (incorrect)
calls to the scheduler would cause the scheduler to attempt to once
again acquire the rq and pi locks, resulting in deadlock.  More complex
deadlock cycles are also possible, involving multiple rq and pi locks
as well as locks from multiple rcu_node structures.

This commit fixes synchronization by creating ->rcu_boosted field in
task_struct that is accessed and modified only when holding the ->lock
in the rcu_node structure on which the task is queued (on that rcu_node
structure's ->blkd_tasks list).  This results in tasks accessing only
their own current->rcu_read_unlock_special fields, making unsynchronized
access once again legal, and keeping the rcu_read_unlock() fastpath free
of atomic instructions and memory barriers.

The reason that the rcu_read_unlock() fastpath does not need to access
the new current->rcu_boosted field is that this new field cannot
be non-zero unless the RCU_READ_UNLOCK_BLOCKED bit is set in the
current->rcu_read_unlock_special field.  Therefore, rcu_read_unlock()
need only test current->rcu_read_unlock_special: if that is zero, then
current->rcu_boosted must also be zero.

This bug does not affect TINY_PREEMPT_RCU because this implementation
of RCU accesses current->rcu_read_unlock_special with irqs disabled,
thus preventing races on the !SMP systems that TINY_PREEMPT_RCU runs on.

Maybe-reported-by: Dave Jones <davej@redhat.com>
Maybe-reported-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Reported-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/sched.h   | 3 +++
 kernel/rcutree_plugin.h | 8 ++++++--
 2 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 496770a96487..76676a407e4a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1254,6 +1254,9 @@ struct task_struct {
 #ifdef CONFIG_PREEMPT_RCU
 	int rcu_read_lock_nesting;
 	char rcu_read_unlock_special;
+#if defined(CONFIG_RCU_BOOST) && defined(CONFIG_TREE_PREEMPT_RCU)
+	int rcu_boosted;
+#endif /* #if defined(CONFIG_RCU_BOOST) && defined(CONFIG_TREE_PREEMPT_RCU) */
 	struct list_head rcu_node_entry;
 #endif /* #ifdef CONFIG_PREEMPT_RCU */
 #ifdef CONFIG_TREE_PREEMPT_RCU
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 6abef3cfcbc1..3a0ae0355222 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -342,6 +342,11 @@ static void rcu_read_unlock_special(struct task_struct *t)
 #ifdef CONFIG_RCU_BOOST
 		if (&t->rcu_node_entry == rnp->boost_tasks)
 			rnp->boost_tasks = np;
+		/* Snapshot and clear ->rcu_boosted with rcu_node lock held. */
+		if (t->rcu_boosted) {
+			special |= RCU_READ_UNLOCK_BOOSTED;
+			t->rcu_boosted = 0;
+		}
 #endif /* #ifdef CONFIG_RCU_BOOST */
 		t->rcu_blocked_node = NULL;
 
@@ -358,7 +363,6 @@ static void rcu_read_unlock_special(struct task_struct *t)
 #ifdef CONFIG_RCU_BOOST
 		/* Unboost if we were boosted. */
 		if (special & RCU_READ_UNLOCK_BOOSTED) {
-			t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED;
 			rt_mutex_unlock(t->rcu_boost_mutex);
 			t->rcu_boost_mutex = NULL;
 		}
@@ -1176,7 +1180,7 @@ static int rcu_boost(struct rcu_node *rnp)
 	t = container_of(tb, struct task_struct, rcu_node_entry);
 	rt_mutex_init_proxy_locked(&mtx, t);
 	t->rcu_boost_mutex = &mtx;
-	t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED;
+	t->rcu_boosted = 1;
 	raw_spin_unlock_irqrestore(&rnp->lock, flags);
 	rt_mutex_lock(&mtx);  /* Side effect: boosts task t's priority. */
 	rt_mutex_unlock(&mtx);  /* Keep lockdep happy. */
-- 
cgit v1.2.3


From 9c3f75cbd144014bea6af866a154cc2e73ab2287 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 14 Jul 2011 13:00:06 +0200
Subject: sched: Break out cpu_power from the sched_group structure

In order to prepare for non-unique sched_groups per domain, we need to
carry the cpu_power elsewhere, so put a level of indirection in.

Reported-and-tested-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/n/tip-qkho2byuhe4482fuknss40ad@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h | 14 +++++++++-----
 kernel/sched.c        | 32 ++++++++++++++++++++++++++------
 kernel/sched_fair.c   | 46 +++++++++++++++++++++++-----------------------
 3 files changed, 58 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 496770a96487..2e5b3c8e2d3e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -893,16 +893,20 @@ static inline int sd_power_saving_flags(void)
 	return 0;
 }
 
-struct sched_group {
-	struct sched_group *next;	/* Must be a circular list */
-	atomic_t ref;
-
+struct sched_group_power {
 	/*
 	 * CPU power of this group, SCHED_LOAD_SCALE being max power for a
 	 * single CPU.
 	 */
-	unsigned int cpu_power, cpu_power_orig;
+	unsigned int power, power_orig;
+};
+
+struct sched_group {
+	struct sched_group *next;	/* Must be a circular list */
+	atomic_t ref;
+
 	unsigned int group_weight;
+	struct sched_group_power *sgp;
 
 	/*
 	 * The CPUs this group covers.
diff --git a/kernel/sched.c b/kernel/sched.c
index 3dc716f6d8ad..36c10d25d4cd 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6557,7 +6557,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 			break;
 		}
 
-		if (!group->cpu_power) {
+		if (!group->sgp->power) {
 			printk(KERN_CONT "\n");
 			printk(KERN_ERR "ERROR: domain->cpu_power not "
 					"set\n");
@@ -6581,9 +6581,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 		cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
 
 		printk(KERN_CONT " %s", str);
-		if (group->cpu_power != SCHED_POWER_SCALE) {
+		if (group->sgp->power != SCHED_POWER_SCALE) {
 			printk(KERN_CONT " (cpu_power = %d)",
-				group->cpu_power);
+				group->sgp->power);
 		}
 
 		group = group->next;
@@ -6777,8 +6777,10 @@ static struct root_domain *alloc_rootdomain(void)
 static void free_sched_domain(struct rcu_head *rcu)
 {
 	struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
-	if (atomic_dec_and_test(&sd->groups->ref))
+	if (atomic_dec_and_test(&sd->groups->ref)) {
+		kfree(sd->groups->sgp);
 		kfree(sd->groups);
+	}
 	kfree(sd);
 }
 
@@ -6945,6 +6947,7 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
 struct sd_data {
 	struct sched_domain **__percpu sd;
 	struct sched_group **__percpu sg;
+	struct sched_group_power **__percpu sgp;
 };
 
 struct s_data {
@@ -6981,8 +6984,10 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
 	if (child)
 		cpu = cpumask_first(sched_domain_span(child));
 
-	if (sg)
+	if (sg) {
 		*sg = *per_cpu_ptr(sdd->sg, cpu);
+		(*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
+	}
 
 	return cpu;
 }
@@ -7020,7 +7025,7 @@ build_sched_groups(struct sched_domain *sd)
 			continue;
 
 		cpumask_clear(sched_group_cpus(sg));
-		sg->cpu_power = 0;
+		sg->sgp->power = 0;
 
 		for_each_cpu(j, span) {
 			if (get_group(j, sdd, NULL) != group)
@@ -7185,6 +7190,7 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
 	if (cpu == cpumask_first(sched_group_cpus(sg))) {
 		WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg);
 		*per_cpu_ptr(sdd->sg, cpu) = NULL;
+		*per_cpu_ptr(sdd->sgp, cpu) = NULL;
 	}
 }
 
@@ -7234,9 +7240,14 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
 		if (!sdd->sg)
 			return -ENOMEM;
 
+		sdd->sgp = alloc_percpu(struct sched_group_power *);
+		if (!sdd->sgp)
+			return -ENOMEM;
+
 		for_each_cpu(j, cpu_map) {
 			struct sched_domain *sd;
 			struct sched_group *sg;
+			struct sched_group_power *sgp;
 
 		       	sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
 					GFP_KERNEL, cpu_to_node(j));
@@ -7251,6 +7262,13 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
 				return -ENOMEM;
 
 			*per_cpu_ptr(sdd->sg, j) = sg;
+
+			sgp = kzalloc_node(sizeof(struct sched_group_power),
+					GFP_KERNEL, cpu_to_node(j));
+			if (!sgp)
+				return -ENOMEM;
+
+			*per_cpu_ptr(sdd->sgp, j) = sgp;
 		}
 	}
 
@@ -7268,9 +7286,11 @@ static void __sdt_free(const struct cpumask *cpu_map)
 		for_each_cpu(j, cpu_map) {
 			kfree(*per_cpu_ptr(sdd->sd, j));
 			kfree(*per_cpu_ptr(sdd->sg, j));
+			kfree(*per_cpu_ptr(sdd->sgp, j));
 		}
 		free_percpu(sdd->sd);
 		free_percpu(sdd->sg);
+		free_percpu(sdd->sgp);
 	}
 }
 
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 433491c2dc8f..c768588e180b 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1585,7 +1585,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 		}
 
 		/* Adjust by relative CPU power of the group */
-		avg_load = (avg_load * SCHED_POWER_SCALE) / group->cpu_power;
+		avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
 
 		if (local_group) {
 			this_load = avg_load;
@@ -2631,7 +2631,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
 		power >>= SCHED_POWER_SHIFT;
 	}
 
-	sdg->cpu_power_orig = power;
+	sdg->sgp->power_orig = power;
 
 	if (sched_feat(ARCH_POWER))
 		power *= arch_scale_freq_power(sd, cpu);
@@ -2647,7 +2647,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
 		power = 1;
 
 	cpu_rq(cpu)->cpu_power = power;
-	sdg->cpu_power = power;
+	sdg->sgp->power = power;
 }
 
 static void update_group_power(struct sched_domain *sd, int cpu)
@@ -2665,11 +2665,11 @@ static void update_group_power(struct sched_domain *sd, int cpu)
 
 	group = child->groups;
 	do {
-		power += group->cpu_power;
+		power += group->sgp->power;
 		group = group->next;
 	} while (group != child->groups);
 
-	sdg->cpu_power = power;
+	sdg->sgp->power = power;
 }
 
 /*
@@ -2691,7 +2691,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
 	/*
 	 * If ~90% of the cpu_power is still there, we're good.
 	 */
-	if (group->cpu_power * 32 > group->cpu_power_orig * 29)
+	if (group->sgp->power * 32 > group->sgp->power_orig * 29)
 		return 1;
 
 	return 0;
@@ -2771,7 +2771,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 	}
 
 	/* Adjust by relative CPU power of the group */
-	sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->cpu_power;
+	sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power;
 
 	/*
 	 * Consider the group unbalanced when the imbalance is larger
@@ -2788,7 +2788,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 	if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1)
 		sgs->group_imb = 1;
 
-	sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power,
+	sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
 						SCHED_POWER_SCALE);
 	if (!sgs->group_capacity)
 		sgs->group_capacity = fix_small_capacity(sd, group);
@@ -2877,7 +2877,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 			return;
 
 		sds->total_load += sgs.group_load;
-		sds->total_pwr += sg->cpu_power;
+		sds->total_pwr += sg->sgp->power;
 
 		/*
 		 * In case the child domain prefers tasks go to siblings
@@ -2962,7 +2962,7 @@ static int check_asym_packing(struct sched_domain *sd,
 	if (this_cpu > busiest_cpu)
 		return 0;
 
-	*imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power,
+	*imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->sgp->power,
 				       SCHED_POWER_SCALE);
 	return 1;
 }
@@ -2993,7 +2993,7 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
 
 	scaled_busy_load_per_task = sds->busiest_load_per_task
 					 * SCHED_POWER_SCALE;
-	scaled_busy_load_per_task /= sds->busiest->cpu_power;
+	scaled_busy_load_per_task /= sds->busiest->sgp->power;
 
 	if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
 			(scaled_busy_load_per_task * imbn)) {
@@ -3007,28 +3007,28 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
 	 * moving them.
 	 */
 
-	pwr_now += sds->busiest->cpu_power *
+	pwr_now += sds->busiest->sgp->power *
 			min(sds->busiest_load_per_task, sds->max_load);
-	pwr_now += sds->this->cpu_power *
+	pwr_now += sds->this->sgp->power *
 			min(sds->this_load_per_task, sds->this_load);
 	pwr_now /= SCHED_POWER_SCALE;
 
 	/* Amount of load we'd subtract */
 	tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
-		sds->busiest->cpu_power;
+		sds->busiest->sgp->power;
 	if (sds->max_load > tmp)
-		pwr_move += sds->busiest->cpu_power *
+		pwr_move += sds->busiest->sgp->power *
 			min(sds->busiest_load_per_task, sds->max_load - tmp);
 
 	/* Amount of load we'd add */
-	if (sds->max_load * sds->busiest->cpu_power <
+	if (sds->max_load * sds->busiest->sgp->power <
 		sds->busiest_load_per_task * SCHED_POWER_SCALE)
-		tmp = (sds->max_load * sds->busiest->cpu_power) /
-			sds->this->cpu_power;
+		tmp = (sds->max_load * sds->busiest->sgp->power) /
+			sds->this->sgp->power;
 	else
 		tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
-			sds->this->cpu_power;
-	pwr_move += sds->this->cpu_power *
+			sds->this->sgp->power;
+	pwr_move += sds->this->sgp->power *
 			min(sds->this_load_per_task, sds->this_load + tmp);
 	pwr_move /= SCHED_POWER_SCALE;
 
@@ -3074,7 +3074,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
 
 		load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
 
-		load_above_capacity /= sds->busiest->cpu_power;
+		load_above_capacity /= sds->busiest->sgp->power;
 	}
 
 	/*
@@ -3090,8 +3090,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
 	max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
 
 	/* How much load to actually move to equalise the imbalance */
-	*imbalance = min(max_pull * sds->busiest->cpu_power,
-		(sds->avg_load - sds->this_load) * sds->this->cpu_power)
+	*imbalance = min(max_pull * sds->busiest->sgp->power,
+		(sds->avg_load - sds->this_load) * sds->this->sgp->power)
 			/ SCHED_POWER_SCALE;
 
 	/*
-- 
cgit v1.2.3


From e3589f6c81e4764d32a25d2a2a0afe54fa344f5c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 15 Jul 2011 10:35:52 +0200
Subject: sched: Allow for overlapping sched_domain spans

Allow for sched_domain spans that overlap by giving such domains their
own sched_group list instead of sharing the sched_groups amongst
each-other.

This is needed for machines with more than 16 nodes, because
sched_domain_node_span() will generate a node mask from the
16 nearest nodes without regard if these masks have any overlap.

Currently sched_domains have a sched_group that maps to their child
sched_domain span, and since there is no overlap we share the
sched_group between the sched_domains of the various CPUs. If however
there is overlap, we would need to link the sched_group list in
different ways for each cpu, and hence sharing isn't possible.

In order to solve this, allocate private sched_groups for each CPU's
sched_domain but have the sched_groups share a sched_group_power
structure such that we can uniquely track the power.

Reported-and-tested-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/n/tip-08bxqw9wis3qti9u5inifh3y@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h   |   2 +
 kernel/sched.c          | 157 +++++++++++++++++++++++++++++++++++++++---------
 kernel/sched_features.h |   2 +
 3 files changed, 132 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2e5b3c8e2d3e..bde99d5358dc 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -844,6 +844,7 @@ enum cpu_idle_type {
 #define SD_SERIALIZE		0x0400	/* Only a single load balancing instance */
 #define SD_ASYM_PACKING		0x0800  /* Place busy groups earlier in the domain */
 #define SD_PREFER_SIBLING	0x1000	/* Prefer to place tasks in a sibling domain */
+#define SD_OVERLAP		0x2000	/* sched_domains of this level overlap */
 
 enum powersavings_balance_level {
 	POWERSAVINGS_BALANCE_NONE = 0,  /* No power saving load balance */
@@ -894,6 +895,7 @@ static inline int sd_power_saving_flags(void)
 }
 
 struct sched_group_power {
+	atomic_t ref;
 	/*
 	 * CPU power of this group, SCHED_LOAD_SCALE being max power for a
 	 * single CPU.
diff --git a/kernel/sched.c b/kernel/sched.c
index 36c10d25d4cd..921adf6f6fad 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6774,10 +6774,36 @@ static struct root_domain *alloc_rootdomain(void)
 	return rd;
 }
 
+static void free_sched_groups(struct sched_group *sg, int free_sgp)
+{
+	struct sched_group *tmp, *first;
+
+	if (!sg)
+		return;
+
+	first = sg;
+	do {
+		tmp = sg->next;
+
+		if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
+			kfree(sg->sgp);
+
+		kfree(sg);
+		sg = tmp;
+	} while (sg != first);
+}
+
 static void free_sched_domain(struct rcu_head *rcu)
 {
 	struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
-	if (atomic_dec_and_test(&sd->groups->ref)) {
+
+	/*
+	 * If its an overlapping domain it has private groups, iterate and
+	 * nuke them all.
+	 */
+	if (sd->flags & SD_OVERLAP) {
+		free_sched_groups(sd->groups, 1);
+	} else if (atomic_dec_and_test(&sd->groups->ref)) {
 		kfree(sd->groups->sgp);
 		kfree(sd->groups);
 	}
@@ -6967,15 +6993,73 @@ struct sched_domain_topology_level;
 typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
 typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
 
+#define SDTL_OVERLAP	0x01
+
 struct sched_domain_topology_level {
 	sched_domain_init_f init;
 	sched_domain_mask_f mask;
+	int		    flags;
 	struct sd_data      data;
 };
 
-/*
- * Assumes the sched_domain tree is fully constructed
- */
+static int
+build_overlap_sched_groups(struct sched_domain *sd, int cpu)
+{
+	struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
+	const struct cpumask *span = sched_domain_span(sd);
+	struct cpumask *covered = sched_domains_tmpmask;
+	struct sd_data *sdd = sd->private;
+	struct sched_domain *child;
+	int i;
+
+	cpumask_clear(covered);
+
+	for_each_cpu(i, span) {
+		struct cpumask *sg_span;
+
+		if (cpumask_test_cpu(i, covered))
+			continue;
+
+		sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
+				GFP_KERNEL, cpu_to_node(i));
+
+		if (!sg)
+			goto fail;
+
+		sg_span = sched_group_cpus(sg);
+
+		child = *per_cpu_ptr(sdd->sd, i);
+		if (child->child) {
+			child = child->child;
+			cpumask_copy(sg_span, sched_domain_span(child));
+		} else
+			cpumask_set_cpu(i, sg_span);
+
+		cpumask_or(covered, covered, sg_span);
+
+		sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span));
+		atomic_inc(&sg->sgp->ref);
+
+		if (cpumask_test_cpu(cpu, sg_span))
+			groups = sg;
+
+		if (!first)
+			first = sg;
+		if (last)
+			last->next = sg;
+		last = sg;
+		last->next = first;
+	}
+	sd->groups = groups;
+
+	return 0;
+
+fail:
+	free_sched_groups(first, 0);
+
+	return -ENOMEM;
+}
+
 static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
 {
 	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
@@ -6987,23 +7071,21 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
 	if (sg) {
 		*sg = *per_cpu_ptr(sdd->sg, cpu);
 		(*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
+		atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */
 	}
 
 	return cpu;
 }
 
 /*
- * build_sched_groups takes the cpumask we wish to span, and a pointer
- * to a function which identifies what group(along with sched group) a CPU
- * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids
- * (due to the fact that we keep track of groups covered with a struct cpumask).
- *
  * build_sched_groups will build a circular linked list of the groups
  * covered by the given span, and will set each group's ->cpumask correctly,
  * and ->cpu_power to 0.
+ *
+ * Assumes the sched_domain tree is fully constructed
  */
-static void
-build_sched_groups(struct sched_domain *sd)
+static int
+build_sched_groups(struct sched_domain *sd, int cpu)
 {
 	struct sched_group *first = NULL, *last = NULL;
 	struct sd_data *sdd = sd->private;
@@ -7011,6 +7093,12 @@ build_sched_groups(struct sched_domain *sd)
 	struct cpumask *covered;
 	int i;
 
+	get_group(cpu, sdd, &sd->groups);
+	atomic_inc(&sd->groups->ref);
+
+	if (cpu != cpumask_first(sched_domain_span(sd)))
+		return 0;
+
 	lockdep_assert_held(&sched_domains_mutex);
 	covered = sched_domains_tmpmask;
 
@@ -7042,6 +7130,8 @@ build_sched_groups(struct sched_domain *sd)
 		last = sg;
 	}
 	last->next = first;
+
+	return 0;
 }
 
 /*
@@ -7056,12 +7146,17 @@ build_sched_groups(struct sched_domain *sd)
  */
 static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 {
-	WARN_ON(!sd || !sd->groups);
+	struct sched_group *sg = sd->groups;
 
-	if (cpu != group_first_cpu(sd->groups))
-		return;
+	WARN_ON(!sd || !sg);
+
+	do {
+		sg->group_weight = cpumask_weight(sched_group_cpus(sg));
+		sg = sg->next;
+	} while (sg != sd->groups);
 
-	sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups));
+	if (cpu != group_first_cpu(sg))
+		return;
 
 	update_group_power(sd, cpu);
 }
@@ -7182,16 +7277,15 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
 static void claim_allocations(int cpu, struct sched_domain *sd)
 {
 	struct sd_data *sdd = sd->private;
-	struct sched_group *sg = sd->groups;
 
 	WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
 	*per_cpu_ptr(sdd->sd, cpu) = NULL;
 
-	if (cpu == cpumask_first(sched_group_cpus(sg))) {
-		WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg);
+	if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
 		*per_cpu_ptr(sdd->sg, cpu) = NULL;
+
+	if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
 		*per_cpu_ptr(sdd->sgp, cpu) = NULL;
-	}
 }
 
 #ifdef CONFIG_SCHED_SMT
@@ -7216,7 +7310,7 @@ static struct sched_domain_topology_level default_topology[] = {
 #endif
 	{ sd_init_CPU, cpu_cpu_mask, },
 #ifdef CONFIG_NUMA
-	{ sd_init_NODE, cpu_node_mask, },
+	{ sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
 	{ sd_init_ALLNODES, cpu_allnodes_mask, },
 #endif
 	{ NULL, },
@@ -7284,7 +7378,9 @@ static void __sdt_free(const struct cpumask *cpu_map)
 		struct sd_data *sdd = &tl->data;
 
 		for_each_cpu(j, cpu_map) {
-			kfree(*per_cpu_ptr(sdd->sd, j));
+			struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j);
+			if (sd && (sd->flags & SD_OVERLAP))
+				free_sched_groups(sd->groups, 0);
 			kfree(*per_cpu_ptr(sdd->sg, j));
 			kfree(*per_cpu_ptr(sdd->sgp, j));
 		}
@@ -7336,8 +7432,11 @@ static int build_sched_domains(const struct cpumask *cpu_map,
 		struct sched_domain_topology_level *tl;
 
 		sd = NULL;
-		for (tl = sched_domain_topology; tl->init; tl++)
+		for (tl = sched_domain_topology; tl->init; tl++) {
 			sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
+			if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
+				sd->flags |= SD_OVERLAP;
+		}
 
 		while (sd->child)
 			sd = sd->child;
@@ -7349,13 +7448,13 @@ static int build_sched_domains(const struct cpumask *cpu_map,
 	for_each_cpu(i, cpu_map) {
 		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
 			sd->span_weight = cpumask_weight(sched_domain_span(sd));
-			get_group(i, sd->private, &sd->groups);
-			atomic_inc(&sd->groups->ref);
-
-			if (i != cpumask_first(sched_domain_span(sd)))
-				continue;
-
-			build_sched_groups(sd);
+			if (sd->flags & SD_OVERLAP) {
+				if (build_overlap_sched_groups(sd, i))
+					goto error;
+			} else {
+				if (build_sched_groups(sd, i))
+					goto error;
+			}
 		}
 	}
 
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index be40f7371ee1..1e7066d76c26 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -70,3 +70,5 @@ SCHED_FEAT(NONIRQ_POWER, 1)
  * using the scheduler IPI. Reduces rq->lock contention/bounces.
  */
 SCHED_FEAT(TTWU_QUEUE, 1)
+
+SCHED_FEAT(FORCE_SD_OVERLAP, 0)
-- 
cgit v1.2.3