From 35995a4d815586bc968a857f7235707940a2f755 Mon Sep 17 00:00:00 2001 From: Eduard - Gabriel Munteanu Date: Tue, 19 Aug 2008 20:43:25 +0300 Subject: SLUB: Replace __builtin_return_address(0) with _RET_IP_. This patch replaces __builtin_return_address(0) with _RET_IP_, since a previous patch moved _RET_IP_ and _THIS_IP_ to include/linux/kernel.h and they're widely available now. This makes for shorter and easier to read code. [penberg@cs.helsinki.fi: remove _RET_IP_ casts to void pointer] Signed-off-by: Eduard - Gabriel Munteanu Signed-off-by: Pekka Enberg --- include/linux/slab.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 000da12b5cf0..c97ed28559ec 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -253,9 +253,9 @@ static inline void *kmem_cache_alloc_node(struct kmem_cache *cachep, * request comes from. */ #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB) -extern void *__kmalloc_track_caller(size_t, gfp_t, void*); +extern void *__kmalloc_track_caller(size_t, gfp_t, unsigned long); #define kmalloc_track_caller(size, flags) \ - __kmalloc_track_caller(size, flags, __builtin_return_address(0)) + __kmalloc_track_caller(size, flags, _RET_IP_) #else #define kmalloc_track_caller(size, flags) \ __kmalloc(size, flags) @@ -271,10 +271,10 @@ extern void *__kmalloc_track_caller(size_t, gfp_t, void*); * allocation request comes from. */ #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB) -extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, void *); +extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, unsigned long); #define kmalloc_node_track_caller(size, flags, node) \ __kmalloc_node_track_caller(size, flags, node, \ - __builtin_return_address(0)) + _RET_IP_) #else #define kmalloc_node_track_caller(size, flags, node) \ __kmalloc_node(size, flags, node) -- cgit v1.2.3 From b9ce08c01020eb28bfbfa6faf1c740281c5f418e Mon Sep 17 00:00:00 2001 From: Eduard - Gabriel Munteanu Date: Sun, 10 Aug 2008 20:14:03 +0300 Subject: kmemtrace: Core implementation. kmemtrace provides tracing for slab allocator functions, such as kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected data is then fed to the userspace application in order to analyse allocation hotspots, internal fragmentation and so on, making it possible to see how well an allocator performs, as well as debug and profile kernel code. Signed-off-by: Eduard - Gabriel Munteanu Signed-off-by: Pekka Enberg --- Documentation/kernel-parameters.txt | 10 ++ MAINTAINERS | 6 + include/linux/kmemtrace.h | 85 +++++++++ init/main.c | 2 + lib/Kconfig.debug | 28 +++ mm/Makefile | 1 + mm/kmemtrace.c | 335 ++++++++++++++++++++++++++++++++++++ 7 files changed, 467 insertions(+) create mode 100644 include/linux/kmemtrace.h create mode 100644 mm/kmemtrace.c (limited to 'include/linux') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index e0f346d201ed..542c2d8843db 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -49,6 +49,7 @@ parameter is applicable: ISAPNP ISA PnP code is enabled. ISDN Appropriate ISDN support is enabled. JOY Appropriate joystick support is enabled. + KMEMTRACE kmemtrace is enabled. LIBATA Libata driver is enabled LP Printer support is enabled. LOOP Loopback device support is enabled. @@ -1018,6 +1019,15 @@ and is between 256 and 4096 characters. 
It is defined in the file use the HighMem zone if it exists, and the Normal zone if it does not. + kmemtrace.enable= [KNL,KMEMTRACE] Format: { yes | no } + Controls whether kmemtrace is enabled + at boot-time. + + kmemtrace.subbufs=n [KNL,KMEMTRACE] Overrides the number of + subbufs kmemtrace's relay channel has. Set this + higher than default (KMEMTRACE_N_SUBBUFS in code) if + you experience buffer overruns. + movablecore=nn[KMG] [KNL,X86-32,IA-64,PPC,X86-64] This parameter is similar to kernelcore except it specifies the amount of memory used for migratable allocations. diff --git a/MAINTAINERS b/MAINTAINERS index 618c1ef4a397..e2b3c8555051 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2565,6 +2565,12 @@ M: jason.wessel@windriver.com L: kgdb-bugreport@lists.sourceforge.net S: Maintained +KMEMTRACE +P: Eduard - Gabriel Munteanu +M: eduard.munteanu@linux360.ro +L: linux-kernel@vger.kernel.org +S: Maintained + KPROBES P: Ananth N Mavinakayanahalli M: ananth@in.ibm.com diff --git a/include/linux/kmemtrace.h b/include/linux/kmemtrace.h new file mode 100644 index 000000000000..2c332010cb4e --- /dev/null +++ b/include/linux/kmemtrace.h @@ -0,0 +1,85 @@ +/* + * Copyright (C) 2008 Eduard - Gabriel Munteanu + * + * This file is released under GPL version 2. + */ + +#ifndef _LINUX_KMEMTRACE_H +#define _LINUX_KMEMTRACE_H + +#ifdef __KERNEL__ + +#include +#include + +enum kmemtrace_type_id { + KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */ + KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */ + KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */ +}; + +#ifdef CONFIG_KMEMTRACE + +extern void kmemtrace_init(void); + +static inline void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node) +{ + trace_mark(kmemtrace_alloc, "type_id %d call_site %lu ptr %lu " + "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d", + type_id, call_site, (unsigned long) ptr, + bytes_req, bytes_alloc, (unsigned long) gfp_flags, node); +} + +static inline void kmemtrace_mark_free(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr) +{ + trace_mark(kmemtrace_free, "type_id %d call_site %lu ptr %lu", + type_id, call_site, (unsigned long) ptr); +} + +#else /* CONFIG_KMEMTRACE */ + +static inline void kmemtrace_init(void) +{ +} + +static inline void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node) +{ +} + +static inline void kmemtrace_mark_free(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr) +{ +} + +#endif /* CONFIG_KMEMTRACE */ + +static inline void kmemtrace_mark_alloc(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags) +{ + kmemtrace_mark_alloc_node(type_id, call_site, ptr, + bytes_req, bytes_alloc, gfp_flags, -1); +} + +#endif /* __KERNEL__ */ + +#endif /* _LINUX_KMEMTRACE_H */ + diff --git a/init/main.c b/init/main.c index 7e117a231af1..be1fe2242a55 100644 --- a/init/main.c +++ b/init/main.c @@ -69,6 +69,7 @@ #include #include #include +#include #ifdef CONFIG_X86_LOCAL_APIC #include @@ -653,6 +654,7 @@ asmlinkage void __init start_kernel(void) enable_debug_pagealloc(); cpu_hotplug_init(); kmem_cache_init(); + kmemtrace_init(); debug_objects_mem_init(); idr_init_cache(); setup_per_cpu_pageset(); diff --git a/lib/Kconfig.debug 
b/lib/Kconfig.debug index b0f239e443bc..78d669b461d2 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -803,6 +803,34 @@ config FIREWIRE_OHCI_REMOTE_DMA If unsure, say N. +config KMEMTRACE + bool "Kernel memory tracer (kmemtrace)" + depends on RELAY && DEBUG_FS && MARKERS + help + kmemtrace provides tracing for slab allocator functions, such as + kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected + data is then fed to the userspace application in order to analyse + allocation hotspots, internal fragmentation and so on, making it + possible to see how well an allocator performs, as well as debug + and profile kernel code. + + This requires an userspace application to use. See + Documentation/vm/kmemtrace.txt for more information. + + Saying Y will make the kernel somewhat larger and slower. However, + if you disable kmemtrace at run-time or boot-time, the performance + impact is minimal (depending on the arch the kernel is built for). + + If unsure, say N. + +config KMEMTRACE_DEFAULT_ENABLED + bool "Enabled by default at boot" + depends on KMEMTRACE + help + Say Y here to enable kmemtrace at boot-time by default. Whatever + the choice, the behavior can be overridden by a kernel parameter, + as described in documentation. + menuconfig BUILD_DOCSRC bool "Build targets in Documentation/ tree" depends on HEADERS_CHECK diff --git a/mm/Makefile b/mm/Makefile index c06b45a1ff5f..3782eb66d4b3 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -34,3 +34,4 @@ obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_SMP) += allocpercpu.o obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o +obj-$(CONFIG_KMEMTRACE) += kmemtrace.o diff --git a/mm/kmemtrace.c b/mm/kmemtrace.c new file mode 100644 index 000000000000..83ad1cc71a92 --- /dev/null +++ b/mm/kmemtrace.c @@ -0,0 +1,335 @@ +/* + * Copyright (C) 2008 Pekka Enberg, Eduard - Gabriel Munteanu + * + * This file is released under GPL version 2. + */ + +#include +#include +#include +#include +#include +#include +#include + +#define KMEMTRACE_SUBBUF_SIZE 524288 +#define KMEMTRACE_DEF_N_SUBBUFS 20 + +static struct rchan *kmemtrace_chan; +static u32 kmemtrace_buf_overruns; + +static unsigned int kmemtrace_n_subbufs; +#ifdef CONFIG_KMEMTRACE_DEFAULT_ENABLED +static unsigned int kmemtrace_enabled = 1; +#else +static unsigned int kmemtrace_enabled = 0; +#endif + +/* + * The sequence number is used for reordering kmemtrace packets + * in userspace, since they are logged as per-CPU data. + * + * atomic_t should always be a 32-bit signed integer. Wraparound is not + * likely to occur, but userspace can deal with it by expecting a certain + * sequence number in the next packet that will be read. 
+ */ +static atomic_t kmemtrace_seq_num; + +#define KMEMTRACE_ABI_VERSION 1 + +static u32 kmemtrace_abi_version __read_mostly = KMEMTRACE_ABI_VERSION; + +enum kmemtrace_event_id { + KMEMTRACE_EVENT_ALLOC = 0, + KMEMTRACE_EVENT_FREE, +}; + +struct kmemtrace_event { + u8 event_id; + u8 type_id; + u16 event_size; + s32 seq_num; + u64 call_site; + u64 ptr; +} __attribute__ ((__packed__)); + +struct kmemtrace_stats_alloc { + u64 bytes_req; + u64 bytes_alloc; + u32 gfp_flags; + s32 numa_node; +} __attribute__ ((__packed__)); + +static void kmemtrace_probe_alloc(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + unsigned long flags; + struct kmemtrace_event *ev; + struct kmemtrace_stats_alloc *stats; + void *buf; + + local_irq_save(flags); + + buf = relay_reserve(kmemtrace_chan, + sizeof(struct kmemtrace_event) + + sizeof(struct kmemtrace_stats_alloc)); + if (!buf) + goto failed; + + /* + * Don't convert this to use structure initializers, + * C99 does not guarantee the rvalues evaluation order. + */ + + ev = buf; + ev->event_id = KMEMTRACE_EVENT_ALLOC; + ev->type_id = va_arg(*args, int); + ev->event_size = sizeof(struct kmemtrace_event) + + sizeof(struct kmemtrace_stats_alloc); + ev->seq_num = atomic_add_return(1, &kmemtrace_seq_num); + ev->call_site = va_arg(*args, unsigned long); + ev->ptr = va_arg(*args, unsigned long); + + stats = buf + sizeof(struct kmemtrace_event); + stats->bytes_req = va_arg(*args, unsigned long); + stats->bytes_alloc = va_arg(*args, unsigned long); + stats->gfp_flags = va_arg(*args, unsigned long); + stats->numa_node = va_arg(*args, int); + +failed: + local_irq_restore(flags); +} + +static void kmemtrace_probe_free(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + unsigned long flags; + struct kmemtrace_event *ev; + + local_irq_save(flags); + + ev = relay_reserve(kmemtrace_chan, sizeof(struct kmemtrace_event)); + if (!ev) + goto failed; + + /* + * Don't convert this to use structure initializers, + * C99 does not guarantee the rvalues evaluation order. + */ + ev->event_id = KMEMTRACE_EVENT_FREE; + ev->type_id = va_arg(*args, int); + ev->event_size = sizeof(struct kmemtrace_event); + ev->seq_num = atomic_add_return(1, &kmemtrace_seq_num); + ev->call_site = va_arg(*args, unsigned long); + ev->ptr = va_arg(*args, unsigned long); + +failed: + local_irq_restore(flags); +} + +static struct dentry * +kmemtrace_create_buf_file(const char *filename, struct dentry *parent, + int mode, struct rchan_buf *buf, int *is_global) +{ + return debugfs_create_file(filename, mode, parent, buf, + &relay_file_operations); +} + +static int kmemtrace_remove_buf_file(struct dentry *dentry) +{ + debugfs_remove(dentry); + + return 0; +} + +static int kmemtrace_subbuf_start(struct rchan_buf *buf, + void *subbuf, + void *prev_subbuf, + size_t prev_padding) +{ + if (relay_buf_full(buf)) { + /* + * We know it's not SMP-safe, but neither + * debugfs_create_u32() is. 
+ */ + kmemtrace_buf_overruns++; + return 0; + } + + return 1; +} + +static struct rchan_callbacks relay_callbacks = { + .create_buf_file = kmemtrace_create_buf_file, + .remove_buf_file = kmemtrace_remove_buf_file, + .subbuf_start = kmemtrace_subbuf_start, +}; + +static struct dentry *kmemtrace_dir; +static struct dentry *kmemtrace_overruns_dentry; +static struct dentry *kmemtrace_abi_version_dentry; + +static struct dentry *kmemtrace_enabled_dentry; + +static int kmemtrace_start_probes(void) +{ + int err; + + err = marker_probe_register("kmemtrace_alloc", "type_id %d " + "call_site %lu ptr %lu " + "bytes_req %lu bytes_alloc %lu " + "gfp_flags %lu node %d", + kmemtrace_probe_alloc, NULL); + if (err) + return err; + err = marker_probe_register("kmemtrace_free", "type_id %d " + "call_site %lu ptr %lu", + kmemtrace_probe_free, NULL); + + return err; +} + +static void kmemtrace_stop_probes(void) +{ + marker_probe_unregister("kmemtrace_alloc", + kmemtrace_probe_alloc, NULL); + marker_probe_unregister("kmemtrace_free", + kmemtrace_probe_free, NULL); +} + +static int kmemtrace_enabled_get(void *data, u64 *val) +{ + *val = *((int *) data); + + return 0; +} + +static int kmemtrace_enabled_set(void *data, u64 val) +{ + u64 old_val = kmemtrace_enabled; + + *((int *) data) = !!val; + + if (old_val == val) + return 0; + if (val) + kmemtrace_start_probes(); + else + kmemtrace_stop_probes(); + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(kmemtrace_enabled_fops, + kmemtrace_enabled_get, + kmemtrace_enabled_set, "%llu\n"); + +static void kmemtrace_cleanup(void) +{ + if (kmemtrace_enabled_dentry) + debugfs_remove(kmemtrace_enabled_dentry); + + kmemtrace_stop_probes(); + + if (kmemtrace_abi_version_dentry) + debugfs_remove(kmemtrace_abi_version_dentry); + if (kmemtrace_overruns_dentry) + debugfs_remove(kmemtrace_overruns_dentry); + + relay_close(kmemtrace_chan); + kmemtrace_chan = NULL; + + if (kmemtrace_dir) + debugfs_remove(kmemtrace_dir); +} + +static int __init kmemtrace_setup_late(void) +{ + if (!kmemtrace_chan) + goto failed; + + kmemtrace_dir = debugfs_create_dir("kmemtrace", NULL); + if (!kmemtrace_dir) + goto cleanup; + + kmemtrace_abi_version_dentry = + debugfs_create_u32("abi_version", S_IRUSR, + kmemtrace_dir, &kmemtrace_abi_version); + kmemtrace_overruns_dentry = + debugfs_create_u32("total_overruns", S_IRUSR, + kmemtrace_dir, &kmemtrace_buf_overruns); + if (!kmemtrace_overruns_dentry || !kmemtrace_abi_version_dentry) + goto cleanup; + + kmemtrace_enabled_dentry = + debugfs_create_file("enabled", S_IRUSR | S_IWUSR, + kmemtrace_dir, &kmemtrace_enabled, + &kmemtrace_enabled_fops); + if (!kmemtrace_enabled_dentry) + goto cleanup; + + if (relay_late_setup_files(kmemtrace_chan, "cpu", kmemtrace_dir)) + goto cleanup; + + printk(KERN_INFO "kmemtrace: fully up.\n"); + + return 0; + +cleanup: + kmemtrace_cleanup(); +failed: + return 1; +} +late_initcall(kmemtrace_setup_late); + +static int __init kmemtrace_set_boot_enabled(char *str) +{ + if (!str) + return -EINVAL; + + if (!strcmp(str, "yes")) + kmemtrace_enabled = 1; + else if (!strcmp(str, "no")) + kmemtrace_enabled = 0; + else + return -EINVAL; + + return 0; +} +early_param("kmemtrace.enable", kmemtrace_set_boot_enabled); + +static int __init kmemtrace_set_subbufs(char *str) +{ + get_option(&str, &kmemtrace_n_subbufs); + return 0; +} +early_param("kmemtrace.subbufs", kmemtrace_set_subbufs); + +void kmemtrace_init(void) +{ + if (!kmemtrace_enabled) + return; + + if (!kmemtrace_n_subbufs) + kmemtrace_n_subbufs = KMEMTRACE_DEF_N_SUBBUFS; + + kmemtrace_chan 
= relay_open(NULL, NULL, KMEMTRACE_SUBBUF_SIZE, + kmemtrace_n_subbufs, &relay_callbacks, + NULL); + if (unlikely(!kmemtrace_chan)) { + printk(KERN_ERR "kmemtrace: could not open relay channel.\n"); + return; + } + + if (unlikely(kmemtrace_start_probes())) + goto probe_fail; + + printk(KERN_INFO "kmemtrace: early init successful.\n"); + + return; + +probe_fail: + printk(KERN_ERR "kmemtrace: could not register marker probes!\n"); + kmemtrace_cleanup(); +} + -- cgit v1.2.3 From 36555751c6751a5bdfd6d7bdf0648343bb1ef0de Mon Sep 17 00:00:00 2001 From: Eduard - Gabriel Munteanu Date: Sun, 10 Aug 2008 20:14:05 +0300 Subject: kmemtrace: SLAB hooks. This adds hooks for the SLAB allocator, to allow tracing with kmemtrace. We also convert some inline functions to __always_inline to make sure _RET_IP_, which expands to __builtin_return_address(0), always works as expected. Signed-off-by: Eduard - Gabriel Munteanu Signed-off-by: Pekka Enberg --- include/linux/slab_def.h | 68 ++++++++++++++++++++++++++++++++++++++++------ mm/slab.c | 71 +++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 123 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index 39c3a5eb8ebe..7555ce99f6d2 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -14,6 +14,7 @@ #include /* kmalloc_sizes.h needs PAGE_SIZE */ #include /* kmalloc_sizes.h needs L1_CACHE_BYTES */ #include +#include /* Size description struct for general caches. */ struct cache_sizes { @@ -28,8 +29,26 @@ extern struct cache_sizes malloc_sizes[]; void *kmem_cache_alloc(struct kmem_cache *, gfp_t); void *__kmalloc(size_t size, gfp_t flags); -static inline void *kmalloc(size_t size, gfp_t flags) +#ifdef CONFIG_KMEMTRACE +extern void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags); +extern size_t slab_buffer_size(struct kmem_cache *cachep); +#else +static __always_inline void * +kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags) { + return kmem_cache_alloc(cachep, flags); +} +static inline size_t slab_buffer_size(struct kmem_cache *cachep) +{ + return 0; +} +#endif + +static __always_inline void *kmalloc(size_t size, gfp_t flags) +{ + struct kmem_cache *cachep; + void *ret; + if (__builtin_constant_p(size)) { int i = 0; @@ -50,10 +69,17 @@ static inline void *kmalloc(size_t size, gfp_t flags) found: #ifdef CONFIG_ZONE_DMA if (flags & GFP_DMA) - return kmem_cache_alloc(malloc_sizes[i].cs_dmacachep, - flags); + cachep = malloc_sizes[i].cs_dmacachep; + else #endif - return kmem_cache_alloc(malloc_sizes[i].cs_cachep, flags); + cachep = malloc_sizes[i].cs_cachep; + + ret = kmem_cache_alloc_notrace(cachep, flags); + + kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC, _THIS_IP_, ret, + size, slab_buffer_size(cachep), flags); + + return ret; } return __kmalloc(size, flags); } @@ -62,8 +88,25 @@ found: extern void *__kmalloc_node(size_t size, gfp_t flags, int node); extern void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node); -static inline void *kmalloc_node(size_t size, gfp_t flags, int node) +#ifdef CONFIG_KMEMTRACE +extern void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep, + gfp_t flags, + int nodeid); +#else +static __always_inline void * +kmem_cache_alloc_node_notrace(struct kmem_cache *cachep, + gfp_t flags, + int nodeid) +{ + return kmem_cache_alloc_node(cachep, flags, nodeid); +} +#endif + +static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) { + struct kmem_cache *cachep; + void 
*ret; + if (__builtin_constant_p(size)) { int i = 0; @@ -84,11 +127,18 @@ static inline void *kmalloc_node(size_t size, gfp_t flags, int node) found: #ifdef CONFIG_ZONE_DMA if (flags & GFP_DMA) - return kmem_cache_alloc_node(malloc_sizes[i].cs_dmacachep, - flags, node); + cachep = malloc_sizes[i].cs_dmacachep; + else #endif - return kmem_cache_alloc_node(malloc_sizes[i].cs_cachep, - flags, node); + cachep = malloc_sizes[i].cs_cachep; + + ret = kmem_cache_alloc_node_notrace(cachep, flags, node); + + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, _THIS_IP_, + ret, size, slab_buffer_size(cachep), + flags, node); + + return ret; } return __kmalloc_node(size, flags, node); } diff --git a/mm/slab.c b/mm/slab.c index a14787799014..b6d9b8cdefa9 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -112,6 +112,7 @@ #include #include #include +#include #include #include @@ -568,6 +569,14 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) #endif +#ifdef CONFIG_KMEMTRACE +size_t slab_buffer_size(struct kmem_cache *cachep) +{ + return cachep->buffer_size; +} +EXPORT_SYMBOL(slab_buffer_size); +#endif + /* * Do not go above this order unless 0 objects fit into the slab. */ @@ -3613,10 +3622,23 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp) */ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) { - return __cache_alloc(cachep, flags, __builtin_return_address(0)); + void *ret = __cache_alloc(cachep, flags, __builtin_return_address(0)); + + kmemtrace_mark_alloc(KMEMTRACE_TYPE_CACHE, _RET_IP_, ret, + obj_size(cachep), cachep->buffer_size, flags); + + return ret; } EXPORT_SYMBOL(kmem_cache_alloc); +#ifdef CONFIG_KMEMTRACE +void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags) +{ + return __cache_alloc(cachep, flags, __builtin_return_address(0)); +} +EXPORT_SYMBOL(kmem_cache_alloc_notrace); +#endif + /** * kmem_ptr_validate - check if an untrusted pointer might be a slab entry. 
* @cachep: the cache we're checking against @@ -3661,23 +3683,47 @@ out: #ifdef CONFIG_NUMA void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { - return __cache_alloc_node(cachep, flags, nodeid, - __builtin_return_address(0)); + void *ret = __cache_alloc_node(cachep, flags, nodeid, + __builtin_return_address(0)); + + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_CACHE, _RET_IP_, ret, + obj_size(cachep), cachep->buffer_size, + flags, nodeid); + + return ret; } EXPORT_SYMBOL(kmem_cache_alloc_node); +#ifdef CONFIG_KMEMTRACE +void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep, + gfp_t flags, + int nodeid) +{ + return __cache_alloc_node(cachep, flags, nodeid, + __builtin_return_address(0)); +} +EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); +#endif + static __always_inline void * __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) { struct kmem_cache *cachep; + void *ret; cachep = kmem_find_general_cachep(size, flags); if (unlikely(ZERO_OR_NULL_PTR(cachep))) return cachep; - return kmem_cache_alloc_node(cachep, flags, node); + ret = kmem_cache_alloc_node_notrace(cachep, flags, node); + + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, + (unsigned long) caller, ret, + size, cachep->buffer_size, flags, node); + + return ret; } -#ifdef CONFIG_DEBUG_SLAB +#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE) void *__kmalloc_node(size_t size, gfp_t flags, int node) { return __do_kmalloc_node(size, flags, node, @@ -3710,6 +3756,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, void *caller) { struct kmem_cache *cachep; + void *ret; /* If you want to save a few bytes .text space: replace * __ with kmem_. @@ -3719,11 +3766,17 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, cachep = __find_general_cachep(size, flags); if (unlikely(ZERO_OR_NULL_PTR(cachep))) return cachep; - return __cache_alloc(cachep, flags, caller); + ret = __cache_alloc(cachep, flags, caller); + + kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC, + (unsigned long) caller, ret, + size, cachep->buffer_size, flags); + + return ret; } -#ifdef CONFIG_DEBUG_SLAB +#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE) void *__kmalloc(size_t size, gfp_t flags) { return __do_kmalloc(size, flags, __builtin_return_address(0)); @@ -3762,6 +3815,8 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp) debug_check_no_obj_freed(objp, obj_size(cachep)); __cache_free(cachep, objp); local_irq_restore(flags); + + kmemtrace_mark_free(KMEMTRACE_TYPE_CACHE, _RET_IP_, objp); } EXPORT_SYMBOL(kmem_cache_free); @@ -3788,6 +3843,8 @@ void kfree(const void *objp) debug_check_no_obj_freed(objp, obj_size(c)); __cache_free(c, (void *)objp); local_irq_restore(flags); + + kmemtrace_mark_free(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, objp); } EXPORT_SYMBOL(kfree); -- cgit v1.2.3 From 3eae2cb24a96509e0a38cc48dc1538a2826f4e33 Mon Sep 17 00:00:00 2001 From: Eduard - Gabriel Munteanu Date: Sun, 10 Aug 2008 20:14:07 +0300 Subject: kmemtrace: SLOB hooks. This adds hooks for the SLOB allocator, to allow tracing with kmemtrace. We also convert some inline functions to __always_inline to make sure _RET_IP_, which expands to __builtin_return_address(0), always works as expected. 
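
As a quick illustration of the __always_inline point above (this snippet is not part of the patch; the wrapper name and the reuse of size for bytes_alloc are made up for the example), _RET_IP_ only reports the real call site when the wrapper is guaranteed to be inlined into its caller:

#include <linux/slab.h>
#include <linux/kmemtrace.h>

/*
 * Illustrative wrapper only. If this were a plain out-of-line function,
 * _RET_IP_ would resolve to an address inside the wrapper itself; forcing
 * it inline makes __builtin_return_address(0) point at the code that
 * called the wrapper, which is the call site kmemtrace wants to record.
 */
static __always_inline void *traced_kmalloc(size_t size, gfp_t flags)
{
	void *ret = __kmalloc(size, flags);

	/* bytes_alloc is approximated as size here for brevity. */
	kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, ret,
			     size, size, flags);
	return ret;
}
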
Acked-by: Matt Mackall Signed-off-by: Eduard - Gabriel Munteanu Signed-off-by: Pekka Enberg --- include/linux/slob_def.h | 9 +++++---- mm/slob.c | 37 +++++++++++++++++++++++++++++++------ 2 files changed, 36 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slob_def.h b/include/linux/slob_def.h index 59a3fa476ab9..0ec00b39d006 100644 --- a/include/linux/slob_def.h +++ b/include/linux/slob_def.h @@ -3,14 +3,15 @@ void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node); -static inline void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) +static __always_inline void *kmem_cache_alloc(struct kmem_cache *cachep, + gfp_t flags) { return kmem_cache_alloc_node(cachep, flags, -1); } void *__kmalloc_node(size_t size, gfp_t flags, int node); -static inline void *kmalloc_node(size_t size, gfp_t flags, int node) +static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) { return __kmalloc_node(size, flags, node); } @@ -23,12 +24,12 @@ static inline void *kmalloc_node(size_t size, gfp_t flags, int node) * kmalloc is the normal method of allocating memory * in the kernel. */ -static inline void *kmalloc(size_t size, gfp_t flags) +static __always_inline void *kmalloc(size_t size, gfp_t flags) { return __kmalloc_node(size, flags, -1); } -static inline void *__kmalloc(size_t size, gfp_t flags) +static __always_inline void *__kmalloc(size_t size, gfp_t flags) { return kmalloc(size, flags); } diff --git a/mm/slob.c b/mm/slob.c index cb675d126791..55de44ae5d30 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -65,6 +65,7 @@ #include #include #include +#include #include /* @@ -463,27 +464,38 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node) { unsigned int *m; int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); + void *ret; if (size < PAGE_SIZE - align) { if (!size) return ZERO_SIZE_PTR; m = slob_alloc(size + align, gfp, align, node); + if (!m) return NULL; *m = size; - return (void *)m + align; + ret = (void *)m + align; + + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, + _RET_IP_, ret, + size, size + align, gfp, node); } else { - void *ret; + unsigned int order = get_order(size); - ret = slob_new_page(gfp | __GFP_COMP, get_order(size), node); + ret = slob_new_page(gfp | __GFP_COMP, order, node); if (ret) { struct page *page; page = virt_to_page(ret); page->private = size; } - return ret; + + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, + _RET_IP_, ret, + size, PAGE_SIZE << order, gfp, node); } + + return ret; } EXPORT_SYMBOL(__kmalloc_node); @@ -501,6 +513,8 @@ void kfree(const void *block) slob_free(m, *m + align); } else put_page(&sp->page); + + kmemtrace_mark_free(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, block); } EXPORT_SYMBOL(kfree); @@ -569,10 +583,19 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) { void *b; - if (c->size < PAGE_SIZE) + if (c->size < PAGE_SIZE) { b = slob_alloc(c->size, flags, c->align, node); - else + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_CACHE, + _RET_IP_, b, c->size, + SLOB_UNITS(c->size) * SLOB_UNIT, + flags, node); + } else { b = slob_new_page(flags, get_order(c->size), node); + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_CACHE, + _RET_IP_, b, c->size, + PAGE_SIZE << get_order(c->size), + flags, node); + } if (c->ctor) c->ctor(b); @@ -608,6 +631,8 @@ void kmem_cache_free(struct kmem_cache *c, void *b) } else { __kmem_cache_free(b, c->size); } + + kmemtrace_mark_free(KMEMTRACE_TYPE_CACHE, _RET_IP_, b); } EXPORT_SYMBOL(kmem_cache_free); -- cgit v1.2.3 From 
5b882be4e00e53a44f47ad7eb997cac2938848bf Mon Sep 17 00:00:00 2001 From: Eduard - Gabriel Munteanu Date: Tue, 19 Aug 2008 20:43:26 +0300 Subject: kmemtrace: SLUB hooks. This adds hooks for the SLUB allocator, to allow tracing with kmemtrace. Signed-off-by: Eduard - Gabriel Munteanu Signed-off-by: Pekka Enberg --- include/linux/slub_def.h | 53 ++++++++++++++++++++++++++++++++++++--- mm/slub.c | 65 +++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 109 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 2f5c16b1aacd..dc28432b5b9a 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -10,6 +10,7 @@ #include #include #include +#include enum stat_item { ALLOC_FASTPATH, /* Allocation from cpu slab */ @@ -204,13 +205,31 @@ static __always_inline struct kmem_cache *kmalloc_slab(size_t size) void *kmem_cache_alloc(struct kmem_cache *, gfp_t); void *__kmalloc(size_t size, gfp_t flags); +#ifdef CONFIG_KMEMTRACE +extern void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags); +#else +static __always_inline void * +kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags) +{ + return kmem_cache_alloc(s, gfpflags); +} +#endif + static __always_inline void *kmalloc_large(size_t size, gfp_t flags) { - return (void *)__get_free_pages(flags | __GFP_COMP, get_order(size)); + unsigned int order = get_order(size); + void *ret = (void *) __get_free_pages(flags | __GFP_COMP, order); + + kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC, _THIS_IP_, ret, + size, PAGE_SIZE << order, flags); + + return ret; } static __always_inline void *kmalloc(size_t size, gfp_t flags) { + void *ret; + if (__builtin_constant_p(size)) { if (size > PAGE_SIZE) return kmalloc_large(size, flags); @@ -221,7 +240,13 @@ static __always_inline void *kmalloc(size_t size, gfp_t flags) if (!s) return ZERO_SIZE_PTR; - return kmem_cache_alloc(s, flags); + ret = kmem_cache_alloc_notrace(s, flags); + + kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC, + _THIS_IP_, ret, + size, s->size, flags); + + return ret; } } return __kmalloc(size, flags); @@ -231,8 +256,24 @@ static __always_inline void *kmalloc(size_t size, gfp_t flags) void *__kmalloc_node(size_t size, gfp_t flags, int node); void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node); +#ifdef CONFIG_KMEMTRACE +extern void *kmem_cache_alloc_node_notrace(struct kmem_cache *s, + gfp_t gfpflags, + int node); +#else +static __always_inline void * +kmem_cache_alloc_node_notrace(struct kmem_cache *s, + gfp_t gfpflags, + int node) +{ + return kmem_cache_alloc_node(s, gfpflags, node); +} +#endif + static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) { + void *ret; + if (__builtin_constant_p(size) && size <= PAGE_SIZE && !(flags & SLUB_DMA)) { struct kmem_cache *s = kmalloc_slab(size); @@ -240,7 +281,13 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) if (!s) return ZERO_SIZE_PTR; - return kmem_cache_alloc_node(s, flags, node); + ret = kmem_cache_alloc_node_notrace(s, flags, node); + + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, + _THIS_IP_, ret, + size, s->size, flags, node); + + return ret; } return __kmalloc_node(size, flags, node); } diff --git a/mm/slub.c b/mm/slub.c index 06da86654875..4c48a0146afd 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -24,6 +24,7 @@ #include #include #include +#include /* * Lock order: @@ -1613,18 +1614,46 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, void 
*kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) { - return slab_alloc(s, gfpflags, -1, _RET_IP_); + void *ret = slab_alloc(s, gfpflags, -1, _RET_IP_); + + kmemtrace_mark_alloc(KMEMTRACE_TYPE_CACHE, _RET_IP_, ret, + s->objsize, s->size, gfpflags); + + return ret; } EXPORT_SYMBOL(kmem_cache_alloc); +#ifdef CONFIG_KMEMTRACE +void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags) +{ + return slab_alloc(s, gfpflags, -1, _RET_IP_); +} +EXPORT_SYMBOL(kmem_cache_alloc_notrace); +#endif + #ifdef CONFIG_NUMA void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) { - return slab_alloc(s, gfpflags, node, _RET_IP_); + void *ret = slab_alloc(s, gfpflags, node, _RET_IP_); + + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_CACHE, _RET_IP_, ret, + s->objsize, s->size, gfpflags, node); + + return ret; } EXPORT_SYMBOL(kmem_cache_alloc_node); #endif +#ifdef CONFIG_KMEMTRACE +void *kmem_cache_alloc_node_notrace(struct kmem_cache *s, + gfp_t gfpflags, + int node) +{ + return slab_alloc(s, gfpflags, node, _RET_IP_); +} +EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); +#endif + /* * Slow patch handling. This may still be called frequently since objects * have a longer lifetime than the cpu slabs in most processing loads. @@ -1732,6 +1761,8 @@ void kmem_cache_free(struct kmem_cache *s, void *x) page = virt_to_head_page(x); slab_free(s, page, x, _RET_IP_); + + kmemtrace_mark_free(KMEMTRACE_TYPE_CACHE, _RET_IP_, x); } EXPORT_SYMBOL(kmem_cache_free); @@ -2650,6 +2681,7 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags) void *__kmalloc(size_t size, gfp_t flags) { struct kmem_cache *s; + void *ret; if (unlikely(size > PAGE_SIZE)) return kmalloc_large(size, flags); @@ -2659,7 +2691,12 @@ void *__kmalloc(size_t size, gfp_t flags) if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - return slab_alloc(s, flags, -1, _RET_IP_); + ret = slab_alloc(s, flags, -1, _RET_IP_); + + kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, ret, + size, s->size, flags); + + return ret; } EXPORT_SYMBOL(__kmalloc); @@ -2678,16 +2715,30 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node) void *__kmalloc_node(size_t size, gfp_t flags, int node) { struct kmem_cache *s; + void *ret; - if (unlikely(size > PAGE_SIZE)) - return kmalloc_large_node(size, flags, node); + if (unlikely(size > PAGE_SIZE)) { + ret = kmalloc_large_node(size, flags, node); + + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, + _RET_IP_, ret, + size, PAGE_SIZE << get_order(size), + flags, node); + + return ret; + } s = get_slab(size, flags); if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - return slab_alloc(s, flags, node, _RET_IP_); + ret = slab_alloc(s, flags, node, _RET_IP_); + + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, ret, + size, s->size, flags, node); + + return ret; } EXPORT_SYMBOL(__kmalloc_node); #endif @@ -2745,6 +2796,8 @@ void kfree(const void *x) return; } slab_free(page->slab, page, object, _RET_IP_); + + kmemtrace_mark_free(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, x); } EXPORT_SYMBOL(kfree); -- cgit v1.2.3 From 73cd6af0413225b0ada8b8881c3e0cfd26506dfa Mon Sep 17 00:00:00 2001 From: Eduard - Gabriel Munteanu Date: Tue, 19 Aug 2008 20:43:24 +0300 Subject: kmemtrace: Better alternative to "kmemtrace: fix printk format warnings". Fix the problem "kmemtrace: fix printk format warnings" attempted to fix, but resulted in marker-probe format mismatch warnings. Instead of carrying size_t into probes, we get rid of it by casting to unsigned long, just as we did with gfp_t. 
This way, we don't need to change marker format strings and we don't have to rely on other format specifiers like "%zu", making for consistent use of more generic data types (since there are no format specifiers for gfp_t, for example). Signed-off-by: Eduard - Gabriel Munteanu Signed-off-by: Pekka Enberg --- include/linux/kmemtrace.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kmemtrace.h b/include/linux/kmemtrace.h index 2c332010cb4e..5bea8ead6a6b 100644 --- a/include/linux/kmemtrace.h +++ b/include/linux/kmemtrace.h @@ -33,7 +33,8 @@ static inline void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, trace_mark(kmemtrace_alloc, "type_id %d call_site %lu ptr %lu " "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d", type_id, call_site, (unsigned long) ptr, - bytes_req, bytes_alloc, (unsigned long) gfp_flags, node); + (unsigned long) bytes_req, (unsigned long) bytes_alloc, + (unsigned long) gfp_flags, node); } static inline void kmemtrace_mark_free(enum kmemtrace_type_id type_id, -- cgit v1.2.3 From 36994e58a48fb8f9651c7dc845a6de298aba5bfc Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 29 Dec 2008 13:42:23 -0800 Subject: tracing/kmemtrace: normalize the raw tracer event to the unified tracing API Impact: new tracer plugin This patch adapts kmemtrace raw events tracing to the unified tracing API. To enable and use this tracer, just do the following: echo kmemtrace > /debugfs/tracing/current_tracer cat /debugfs/tracing/trace You will have the following output: # tracer: kmemtrace # # # ALLOC TYPE REQ GIVEN FLAGS POINTER NODE CALLER # FREE | | | | | | | | # | type_id 1 call_site 18446744071565527833 ptr 18446612134395152256 type_id 0 call_site 18446744071565585597 ptr 18446612134405955584 bytes_req 4096 bytes_alloc 4096 gfp_flags 208 node -1 type_id 1 call_site 18446744071565585534 ptr 18446612134405955584 type_id 0 call_site 18446744071565585597 ptr 18446612134405955584 bytes_req 4096 bytes_alloc 4096 gfp_flags 208 node -1 type_id 0 call_site 18446744071565636711 ptr 18446612134345164672 bytes_req 240 bytes_alloc 240 gfp_flags 208 node -1 type_id 1 call_site 18446744071565585534 ptr 18446612134405955584 type_id 0 call_site 18446744071565585597 ptr 18446612134405955584 bytes_req 4096 bytes_alloc 4096 gfp_flags 208 node -1 type_id 0 call_site 18446744071565636711 ptr 18446612134345164912 bytes_req 240 bytes_alloc 240 gfp_flags 208 node -1 type_id 1 call_site 18446744071565585534 ptr 18446612134405955584 type_id 0 call_site 18446744071565585597 ptr 18446612134405955584 bytes_req 4096 bytes_alloc 4096 gfp_flags 208 node -1 type_id 0 call_site 18446744071565636711 ptr 18446612134345165152 bytes_req 240 bytes_alloc 240 gfp_flags 208 node -1 type_id 0 call_site 18446744071566144042 ptr 18446612134346191680 bytes_req 1304 bytes_alloc 1312 gfp_flags 208 node -1 type_id 1 call_site 18446744071565585534 ptr 18446612134405955584 type_id 0 call_site 18446744071565585597 ptr 18446612134405955584 bytes_req 4096 bytes_alloc 4096 gfp_flags 208 node -1 type_id 1 call_site 18446744071565585534 ptr 18446612134405955584 That was to stay backward compatible with the format output produced in inux/tracepoint.h. This is the default ouput, but note that I tried something else. 
If you change an option: echo kmem_minimalistic > /debugfs/trace_options and then cat /debugfs/trace, you will have the following output: # tracer: kmemtrace # # # ALLOC TYPE REQ GIVEN FLAGS POINTER NODE CALLER # FREE | | | | | | | | # | - C 0xffff88007c088780 file_free_rcu + K 4096 4096 000000d0 0xffff88007cad6000 -1 getname - C 0xffff88007cad6000 putname + K 4096 4096 000000d0 0xffff88007cad6000 -1 getname + K 240 240 000000d0 0xffff8800790dc780 -1 d_alloc - C 0xffff88007cad6000 putname + K 4096 4096 000000d0 0xffff88007cad6000 -1 getname + K 240 240 000000d0 0xffff8800790dc870 -1 d_alloc - C 0xffff88007cad6000 putname + K 4096 4096 000000d0 0xffff88007cad6000 -1 getname + K 240 240 000000d0 0xffff8800790dc960 -1 d_alloc + K 1304 1312 000000d0 0xffff8800791d7340 -1 reiserfs_alloc_inode - C 0xffff88007cad6000 putname + K 4096 4096 000000d0 0xffff88007cad6000 -1 getname - C 0xffff88007cad6000 putname + K 992 1000 000000d0 0xffff880079045b58 -1 alloc_inode + K 768 1024 000080d0 0xffff88007c096400 -1 alloc_pipe_info + K 240 240 000000d0 0xffff8800790dca50 -1 d_alloc + K 272 320 000080d0 0xffff88007c088780 -1 get_empty_filp + K 272 320 000080d0 0xffff88007c088000 -1 get_empty_filp Yeah I shall confess kmem_minimalistic should be: kmem_alternative. Whatever, I find it more readable but this a personal opinion of course. We can drop it if you want. On the ALLOC/FREE column, + means an allocation and - a free. On the type column, you have K = kmalloc, C = cache, P = page I would like the flags to be GFP_* strings but that would not be easy to not break the column with strings.... About the node...it seems to always be -1. I don't know why but that shouldn't be difficult to find. I moved linux/tracepoint.h to trace/tracepoint.h as well. I think that would be more easy to find the tracer headers if they are all in their common directory. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- include/linux/kmemtrace.h | 86 ------------ include/linux/slab_def.h | 2 +- include/linux/slub_def.h | 2 +- include/trace/kmemtrace.h | 75 ++++++++++ init/main.c | 2 +- kernel/trace/Kconfig | 22 +++ kernel/trace/Makefile | 1 + kernel/trace/kmemtrace.c | 343 ++++++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace.h | 25 ++++ lib/Kconfig.debug | 20 --- mm/kmemtrace.c | 2 +- mm/slob.c | 2 +- mm/slub.c | 2 +- 13 files changed, 472 insertions(+), 112 deletions(-) delete mode 100644 include/linux/kmemtrace.h create mode 100644 include/trace/kmemtrace.h create mode 100644 kernel/trace/kmemtrace.c (limited to 'include/linux') diff --git a/include/linux/kmemtrace.h b/include/linux/kmemtrace.h deleted file mode 100644 index 5bea8ead6a6b..000000000000 --- a/include/linux/kmemtrace.h +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (C) 2008 Eduard - Gabriel Munteanu - * - * This file is released under GPL version 2. - */ - -#ifndef _LINUX_KMEMTRACE_H -#define _LINUX_KMEMTRACE_H - -#ifdef __KERNEL__ - -#include -#include - -enum kmemtrace_type_id { - KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */ - KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */ - KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. 
*/ -}; - -#ifdef CONFIG_KMEMTRACE - -extern void kmemtrace_init(void); - -static inline void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, - unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node) -{ - trace_mark(kmemtrace_alloc, "type_id %d call_site %lu ptr %lu " - "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d", - type_id, call_site, (unsigned long) ptr, - (unsigned long) bytes_req, (unsigned long) bytes_alloc, - (unsigned long) gfp_flags, node); -} - -static inline void kmemtrace_mark_free(enum kmemtrace_type_id type_id, - unsigned long call_site, - const void *ptr) -{ - trace_mark(kmemtrace_free, "type_id %d call_site %lu ptr %lu", - type_id, call_site, (unsigned long) ptr); -} - -#else /* CONFIG_KMEMTRACE */ - -static inline void kmemtrace_init(void) -{ -} - -static inline void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, - unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node) -{ -} - -static inline void kmemtrace_mark_free(enum kmemtrace_type_id type_id, - unsigned long call_site, - const void *ptr) -{ -} - -#endif /* CONFIG_KMEMTRACE */ - -static inline void kmemtrace_mark_alloc(enum kmemtrace_type_id type_id, - unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags) -{ - kmemtrace_mark_alloc_node(type_id, call_site, ptr, - bytes_req, bytes_alloc, gfp_flags, -1); -} - -#endif /* __KERNEL__ */ - -#endif /* _LINUX_KMEMTRACE_H */ - diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index 7555ce99f6d2..455f9affea9a 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -14,7 +14,7 @@ #include /* kmalloc_sizes.h needs PAGE_SIZE */ #include /* kmalloc_sizes.h needs L1_CACHE_BYTES */ #include -#include +#include /* Size description struct for general caches. */ struct cache_sizes { diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index dc28432b5b9a..6b657f7dcb2b 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -10,7 +10,7 @@ #include #include #include -#include +#include enum stat_item { ALLOC_FASTPATH, /* Allocation from cpu slab */ diff --git a/include/trace/kmemtrace.h b/include/trace/kmemtrace.h new file mode 100644 index 000000000000..ad8b7857855a --- /dev/null +++ b/include/trace/kmemtrace.h @@ -0,0 +1,75 @@ +/* + * Copyright (C) 2008 Eduard - Gabriel Munteanu + * + * This file is released under GPL version 2. + */ + +#ifndef _LINUX_KMEMTRACE_H +#define _LINUX_KMEMTRACE_H + +#ifdef __KERNEL__ + +#include +#include + +enum kmemtrace_type_id { + KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */ + KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */ + KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. 
*/ +}; + +#ifdef CONFIG_KMEMTRACE + +extern void kmemtrace_init(void); + +extern void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node); + +extern void kmemtrace_mark_free(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr); + +#else /* CONFIG_KMEMTRACE */ + +static inline void kmemtrace_init(void) +{ +} + +static inline void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node) +{ +} + +static inline void kmemtrace_mark_free(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr) +{ +} + +#endif /* CONFIG_KMEMTRACE */ + +static inline void kmemtrace_mark_alloc(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags) +{ + kmemtrace_mark_alloc_node(type_id, call_site, ptr, + bytes_req, bytes_alloc, gfp_flags, -1); +} + +#endif /* __KERNEL__ */ + +#endif /* _LINUX_KMEMTRACE_H */ + diff --git a/init/main.c b/init/main.c index 9711586aa7c9..beca7aaddb22 100644 --- a/init/main.c +++ b/init/main.c @@ -70,7 +70,7 @@ #include #include #include -#include +#include #ifdef CONFIG_X86_LOCAL_APIC #include diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index e2a4ff6fc3a6..27fb74b06b3c 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -264,6 +264,28 @@ config HW_BRANCH_TRACER This tracer records all branches on the system in a circular buffer giving access to the last N branches for each cpu. +config KMEMTRACE + bool "Trace SLAB allocations" + select TRACING + depends on RELAY + help + kmemtrace provides tracing for slab allocator functions, such as + kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected + data is then fed to the userspace application in order to analyse + allocation hotspots, internal fragmentation and so on, making it + possible to see how well an allocator performs, as well as debug + and profile kernel code. + + This requires an userspace application to use. See + Documentation/vm/kmemtrace.txt for more information. + + Saying Y will make the kernel somewhat larger and slower. However, + if you disable kmemtrace at run-time or boot-time, the performance + impact is minimal (depending on the arch the kernel is built for). + + If unsure, say N. 
+ + config DYNAMIC_FTRACE bool "enable/disable ftrace tracepoints dynamically" depends on FUNCTION_TRACER diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 349d5a93653f..513dc86b5dfa 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -33,5 +33,6 @@ obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o obj-$(CONFIG_POWER_TRACER) += trace_power.o +obj-$(CONFIG_KMEMTRACE) += kmemtrace.o libftrace-y := ftrace.o diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c new file mode 100644 index 000000000000..d69cbe3c2a4b --- /dev/null +++ b/kernel/trace/kmemtrace.c @@ -0,0 +1,343 @@ +/* + * Memory allocator tracing + * + * Copyright (C) 2008 Eduard - Gabriel Munteanu + * Copyright (C) 2008 Pekka Enberg + * Copyright (C) 2008 Frederic Weisbecker + */ + +#include +#include +#include +#include +#include + +#include "trace.h" +#include "trace_output.h" + +/* Select an alternative, minimalistic output than the original one */ +#define TRACE_KMEM_OPT_MINIMAL 0x1 + +static struct tracer_opt kmem_opts[] = { + /* Default disable the minimalistic output */ + { TRACER_OPT(kmem_minimalistic, TRACE_KMEM_OPT_MINIMAL) }, + { } +}; + +static struct tracer_flags kmem_tracer_flags = { + .val = 0, + .opts = kmem_opts +}; + + +static bool kmem_tracing_enabled __read_mostly; +static struct trace_array *kmemtrace_array; + +static int kmem_trace_init(struct trace_array *tr) +{ + int cpu; + kmemtrace_array = tr; + + for_each_cpu_mask(cpu, cpu_possible_map) + tracing_reset(tr, cpu); + + kmem_tracing_enabled = true; + + return 0; +} + +static void kmem_trace_reset(struct trace_array *tr) +{ + kmem_tracing_enabled = false; +} + +static void kmemtrace_headers(struct seq_file *s) +{ + /* Don't need headers for the original kmemtrace output */ + if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)) + return; + + seq_printf(s, "#\n"); + seq_printf(s, "# ALLOC TYPE REQ GIVEN FLAGS " + " POINTER NODE CALLER\n"); + seq_printf(s, "# FREE | | | | " + " | | | |\n"); + seq_printf(s, "# |\n\n"); +} + +/* + * The two following functions give the original output from kmemtrace, + * or something close to....perhaps they need some missing things + */ +static enum print_line_t +kmemtrace_print_alloc_original(struct trace_iterator *iter, + struct kmemtrace_alloc_entry *entry) +{ + struct trace_seq *s = &iter->seq; + int ret; + + /* Taken from the old linux/kmemtrace.h */ + ret = trace_seq_printf(s, "type_id %d call_site %lu ptr %lu " + "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d\n", + entry->type_id, entry->call_site, (unsigned long) entry->ptr, + (unsigned long) entry->bytes_req, (unsigned long) entry->bytes_alloc, + (unsigned long) entry->gfp_flags, entry->node); + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t +kmemtrace_print_free_original(struct trace_iterator *iter, + struct kmemtrace_free_entry *entry) +{ + struct trace_seq *s = &iter->seq; + int ret; + + /* Taken from the old linux/kmemtrace.h */ + ret = trace_seq_printf(s, "type_id %d call_site %lu ptr %lu\n", + entry->type_id, entry->call_site, (unsigned long) entry->ptr); + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + + +/* The two other following provide a more minimalistic output */ +static enum print_line_t +kmemtrace_print_alloc_compress(struct trace_iterator *iter, + struct kmemtrace_alloc_entry *entry) +{ + 
struct trace_seq *s = &iter->seq; + int ret; + + /* Alloc entry */ + ret = trace_seq_printf(s, " + "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Type */ + switch (entry->type_id) { + case KMEMTRACE_TYPE_KMALLOC: + ret = trace_seq_printf(s, "K "); + break; + case KMEMTRACE_TYPE_CACHE: + ret = trace_seq_printf(s, "C "); + break; + case KMEMTRACE_TYPE_PAGES: + ret = trace_seq_printf(s, "P "); + break; + default: + ret = trace_seq_printf(s, "? "); + } + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Requested */ + ret = trace_seq_printf(s, "%4d ", entry->bytes_req); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Allocated */ + ret = trace_seq_printf(s, "%4d ", entry->bytes_alloc); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Flags + * TODO: would be better to see the name of the GFP flag names + */ + ret = trace_seq_printf(s, "%08x ", entry->gfp_flags); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Pointer to allocated */ + ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Node */ + ret = trace_seq_printf(s, "%4d ", entry->node); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Call site */ + ret = seq_print_ip_sym(s, entry->call_site, 0); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + if (!trace_seq_printf(s, "\n")) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t +kmemtrace_print_free_compress(struct trace_iterator *iter, + struct kmemtrace_free_entry *entry) +{ + struct trace_seq *s = &iter->seq; + int ret; + + /* Free entry */ + ret = trace_seq_printf(s, " - "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Type */ + switch (entry->type_id) { + case KMEMTRACE_TYPE_KMALLOC: + ret = trace_seq_printf(s, "K "); + break; + case KMEMTRACE_TYPE_CACHE: + ret = trace_seq_printf(s, "C "); + break; + case KMEMTRACE_TYPE_PAGES: + ret = trace_seq_printf(s, "P "); + break; + default: + ret = trace_seq_printf(s, "? 
"); + } + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Skip requested/allocated/flags */ + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Pointer to allocated */ + ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Skip node */ + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Call site */ + ret = seq_print_ip_sym(s, entry->call_site, 0); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + if (!trace_seq_printf(s, "\n")) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter) +{ + struct trace_entry *entry = iter->ent; + + switch (entry->type) { + case TRACE_KMEM_ALLOC: { + struct kmemtrace_alloc_entry *field; + trace_assign_type(field, entry); + if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL) + return kmemtrace_print_alloc_compress(iter, field); + else + return kmemtrace_print_alloc_original(iter, field); + } + + case TRACE_KMEM_FREE: { + struct kmemtrace_free_entry *field; + trace_assign_type(field, entry); + if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL) + return kmemtrace_print_free_compress(iter, field); + else + return kmemtrace_print_free_original(iter, field); + } + + default: + return TRACE_TYPE_UNHANDLED; + } +} + +/* Trace allocations */ +void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node) +{ + struct ring_buffer_event *event; + struct kmemtrace_alloc_entry *entry; + struct trace_array *tr = kmemtrace_array; + unsigned long irq_flags; + + if (!kmem_tracing_enabled) + return; + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + return; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, 0, 0); + + entry->ent.type = TRACE_KMEM_ALLOC; + entry->call_site = call_site; + entry->ptr = ptr; + entry->bytes_req = bytes_req; + entry->bytes_alloc = bytes_alloc; + entry->gfp_flags = gfp_flags; + entry->node = node; + + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + + trace_wake_up(); +} + +void kmemtrace_mark_free(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr) +{ + struct ring_buffer_event *event; + struct kmemtrace_free_entry *entry; + struct trace_array *tr = kmemtrace_array; + unsigned long irq_flags; + + if (!kmem_tracing_enabled) + return; + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + return; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, 0, 0); + + entry->ent.type = TRACE_KMEM_FREE; + entry->type_id = type_id; + entry->call_site = call_site; + entry->ptr = ptr; + + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + + trace_wake_up(); +} + +static struct tracer kmem_tracer __read_mostly = { + .name = "kmemtrace", + .init = kmem_trace_init, + .reset = kmem_trace_reset, + .print_line = kmemtrace_print_line, + .print_header = kmemtrace_headers, + .flags = &kmem_tracer_flags +}; + +static int __init init_kmem_tracer(void) +{ + return register_tracer(&kmem_tracer); +} + +device_initcall(init_kmem_tracer); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index cc7a4f864036..534505bb39b0 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -9,6 +9,7 @@ #include #include #include +#include enum 
trace_type { __TRACE_FIRST_TYPE = 0, @@ -29,6 +30,8 @@ enum trace_type { TRACE_GRAPH_ENT, TRACE_USER_STACK, TRACE_HW_BRANCHES, + TRACE_KMEM_ALLOC, + TRACE_KMEM_FREE, TRACE_POWER, __TRACE_LAST_TYPE @@ -170,6 +173,24 @@ struct trace_power { struct power_trace state_data; }; +struct kmemtrace_alloc_entry { + struct trace_entry ent; + enum kmemtrace_type_id type_id; + unsigned long call_site; + const void *ptr; + size_t bytes_req; + size_t bytes_alloc; + gfp_t gfp_flags; + int node; +}; + +struct kmemtrace_free_entry { + struct trace_entry ent; + enum kmemtrace_type_id type_id; + unsigned long call_site; + const void *ptr; +}; + /* * trace_flag_type is an enumeration that holds different * states when a trace occurs. These are: @@ -280,6 +301,10 @@ extern void __ftrace_bad_type(void); TRACE_GRAPH_RET); \ IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\ IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \ + IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \ + TRACE_KMEM_ALLOC); \ + IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ + TRACE_KMEM_FREE); \ __ftrace_bad_type(); \ } while (0) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index b5417e23ba94..b0f239e443bc 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -803,26 +803,6 @@ config FIREWIRE_OHCI_REMOTE_DMA If unsure, say N. -config KMEMTRACE - bool "Kernel memory tracer (kmemtrace)" - depends on RELAY && DEBUG_FS && MARKERS - help - kmemtrace provides tracing for slab allocator functions, such as - kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected - data is then fed to the userspace application in order to analyse - allocation hotspots, internal fragmentation and so on, making it - possible to see how well an allocator performs, as well as debug - and profile kernel code. - - This requires an userspace application to use. See - Documentation/vm/kmemtrace.txt for more information. - - Saying Y will make the kernel somewhat larger and slower. However, - if you disable kmemtrace at run-time or boot-time, the performance - impact is minimal (depending on the arch the kernel is built for). - - If unsure, say N. - menuconfig BUILD_DOCSRC bool "Build targets in Documentation/ tree" depends on HEADERS_CHECK diff --git a/mm/kmemtrace.c b/mm/kmemtrace.c index 2a70a805027c..0573b5080cc4 100644 --- a/mm/kmemtrace.c +++ b/mm/kmemtrace.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #define KMEMTRACE_SUBBUF_SIZE 524288 #define KMEMTRACE_DEF_N_SUBBUFS 20 diff --git a/mm/slob.c b/mm/slob.c index 0f1a49f40690..4d1c0fc33b6b 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -65,7 +65,7 @@ #include #include #include -#include +#include #include /* diff --git a/mm/slub.c b/mm/slub.c index cc4001fee7ac..7bf8cf8ec082 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include #include -- cgit v1.2.3 From f00012074b1a1a67d9c8603617bbbab267347ca6 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 9 Jan 2009 11:29:42 +0800 Subject: ftrace, ia64: Add macro for ftrace_caller Define FTRACE_ADDR. In IA64, a function pointer isn't a 'unsigned long' but a 'struct {unsigned long ip, unsigned long gp}'. 
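
For context on what an architecture override might look like (an illustrative sketch that follows the {ip, gp} description above, not the literal IA64 header), an ABI that uses function descriptors can define FTRACE_ADDR so that it yields the actual entry point rather than the address of the descriptor:

/* Illustrative only: layout mirrors the descriptor described above. */
struct fnptr {
	unsigned long ip;	/* entry point of the function */
	unsigned long gp;	/* global pointer used by the callee */
};

/*
 * With descriptors, (unsigned long)ftrace_caller would be the address of
 * the descriptor, not of the code, so the arch header supplies its own
 * FTRACE_ADDR before the generic fallback below is reached.
 */
#define FTRACE_ADDR	(((struct fnptr *)ftrace_caller)->ip)
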
Signed-off-by: Shaohua Li Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 4 ++++ kernel/trace/ftrace.c | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 677432b9cb7e..054721487574 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -126,6 +126,10 @@ extern int ftrace_update_ftrace_func(ftrace_func_t func); extern void ftrace_caller(void); extern void ftrace_call(void); extern void mcount_call(void); + +#ifndef FTRACE_ADDR +#define FTRACE_ADDR ((unsigned long)ftrace_caller) +#endif #ifdef CONFIG_FUNCTION_GRAPH_TRACER extern void ftrace_graph_caller(void); extern int ftrace_enable_ftrace_graph_caller(void); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 76bb884b6e16..9f536108d3f3 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -455,7 +455,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) unsigned long ip, fl; unsigned long ftrace_addr; - ftrace_addr = (unsigned long)ftrace_caller; + ftrace_addr = (unsigned long)FTRACE_ADDR; ip = rec->ip; -- cgit v1.2.3 From e162b39a368f0401e41b558f430c354d12a85b37 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Thu, 15 Jan 2009 11:08:40 -0800 Subject: softlockup: decouple hung tasks check from softlockup detection Decoupling allows: * hung tasks check to happen at very low priority * hung tasks check and softlockup to be enabled/disabled independently at compile and/or run-time * individual panic settings to be enabled disabled independently at compile and/or run-time * softlockup threshold to be reduced without increasing hung tasks poll frequency (hung task check is expensive relative to softlock watchdog) * hung task check to be zero over-head when disabled at run-time Signed-off-by: Mandeep Singh Baines Signed-off-by: Ingo Molnar --- include/linux/sched.h | 14 +++- kernel/Makefile | 1 + kernel/hung_task.c | 198 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/softlockup.c | 100 ------------------------- kernel/sysctl.c | 15 +++- lib/Kconfig.debug | 38 ++++++++++ 6 files changed, 261 insertions(+), 105 deletions(-) create mode 100644 kernel/hung_task.c (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 54cbabf3b871..f2f94d532302 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -297,9 +297,6 @@ extern int proc_dosoftlockup_thresh(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); extern unsigned int softlockup_panic; -extern unsigned long sysctl_hung_task_check_count; -extern unsigned long sysctl_hung_task_timeout_secs; -extern unsigned long sysctl_hung_task_warnings; extern int softlockup_thresh; #else static inline void softlockup_tick(void) @@ -316,6 +313,15 @@ static inline void touch_all_softlockup_watchdogs(void) } #endif +#ifdef CONFIG_DETECT_HUNG_TASK +extern unsigned int sysctl_hung_task_panic; +extern unsigned long sysctl_hung_task_check_count; +extern unsigned long sysctl_hung_task_timeout_secs; +extern unsigned long sysctl_hung_task_warnings; +extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, + size_t *lenp, loff_t *ppos); +#endif /* Attach to any functions which should be ignored in wchan output. 
*/ #define __sched __attribute__((__section__(".sched.text"))) @@ -1236,7 +1242,7 @@ struct task_struct { /* ipc stuff */ struct sysv_sem sysvsem; #endif -#ifdef CONFIG_DETECT_SOFTLOCKUP +#ifdef CONFIG_DETECT_HUNG_TASK /* hung task detection */ unsigned long last_switch_timestamp; unsigned long last_switch_count; diff --git a/kernel/Makefile b/kernel/Makefile index 2aebc4cd7878..979745f1b4bc 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -74,6 +74,7 @@ obj-$(CONFIG_AUDIT_TREE) += audit_tree.o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o +obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o diff --git a/kernel/hung_task.c b/kernel/hung_task.c new file mode 100644 index 000000000000..ba5a77cad3bb --- /dev/null +++ b/kernel/hung_task.c @@ -0,0 +1,198 @@ +/* + * Detect Hung Task + * + * kernel/hung_task.c - kernel thread for detecting tasks stuck in D state + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Have a reasonable limit on the number of tasks checked: + */ +unsigned long __read_mostly sysctl_hung_task_check_count = 1024; + +/* + * Zero means infinite timeout - no checking done: + */ +unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120; +static unsigned long __read_mostly hung_task_poll_jiffies; + +unsigned long __read_mostly sysctl_hung_task_warnings = 10; + +static int __read_mostly did_panic; + +static struct task_struct *watchdog_task; + +/* + * Should we panic (and reboot, if panic_timeout= is set) when a + * hung task is detected: + */ +unsigned int __read_mostly sysctl_hung_task_panic = + CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE; + +static int __init hung_task_panic_setup(char *str) +{ + sysctl_hung_task_panic = simple_strtoul(str, NULL, 0); + + return 1; +} +__setup("hung_task_panic=", hung_task_panic_setup); + +static int +hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr) +{ + did_panic = 1; + + return NOTIFY_DONE; +} + +static struct notifier_block panic_block = { + .notifier_call = hung_task_panic, +}; + +/* + * Returns seconds, approximately. We don't need nanosecond + * resolution, and we don't need to waste time with a big divide when + * 2^30ns == 1.074s. 
+ */ +static unsigned long get_timestamp(void) +{ + int this_cpu = raw_smp_processor_id(); + + return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ +} + +static void check_hung_task(struct task_struct *t, unsigned long now) +{ + unsigned long switch_count = t->nvcsw + t->nivcsw; + + if (t->flags & PF_FROZEN) + return; + + if (switch_count != t->last_switch_count || !t->last_switch_timestamp) { + t->last_switch_count = switch_count; + t->last_switch_timestamp = now; + return; + } + if ((long)(now - t->last_switch_timestamp) < + sysctl_hung_task_timeout_secs) + return; + if (!sysctl_hung_task_warnings) + return; + sysctl_hung_task_warnings--; + + /* + * Ok, the task did not get scheduled for more than 2 minutes, + * complain: + */ + printk(KERN_ERR "INFO: task %s:%d blocked for more than " + "%ld seconds.\n", t->comm, t->pid, + sysctl_hung_task_timeout_secs); + printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" + " disables this message.\n"); + sched_show_task(t); + __debug_show_held_locks(t); + + t->last_switch_timestamp = now; + touch_nmi_watchdog(); + + if (sysctl_hung_task_panic) + panic("hung_task: blocked tasks"); +} + +/* + * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for + * a really long time (120 seconds). If that happens, print out + * a warning. + */ +static void check_hung_uninterruptible_tasks(void) +{ + int max_count = sysctl_hung_task_check_count; + unsigned long now = get_timestamp(); + struct task_struct *g, *t; + + /* + * If the system crashed already then all bets are off, + * do not report extra hung tasks: + */ + if (test_taint(TAINT_DIE) || did_panic) + return; + + read_lock(&tasklist_lock); + do_each_thread(g, t) { + if (!--max_count) + goto unlock; + /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ + if (t->state == TASK_UNINTERRUPTIBLE) + check_hung_task(t, now); + } while_each_thread(g, t); + unlock: + read_unlock(&tasklist_lock); +} + +static void update_poll_jiffies(void) +{ + /* timeout of 0 will disable the watchdog */ + if (sysctl_hung_task_timeout_secs == 0) + hung_task_poll_jiffies = MAX_SCHEDULE_TIMEOUT; + else + hung_task_poll_jiffies = sysctl_hung_task_timeout_secs * HZ / 2; +} + +/* + * Process updating of timeout sysctl + */ +int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int ret; + + ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); + + if (ret || !write) + goto out; + + update_poll_jiffies(); + + wake_up_process(watchdog_task); + + out: + return ret; +} + +/* + * kthread which checks for tasks stuck in D state + */ +static int watchdog(void *dummy) +{ + set_user_nice(current, 0); + update_poll_jiffies(); + + for ( ; ; ) { + while (schedule_timeout_interruptible(hung_task_poll_jiffies)); + check_hung_uninterruptible_tasks(); + } + + return 0; +} + +static int __init hung_task_init(void) +{ + atomic_notifier_chain_register(&panic_notifier_list, &panic_block); + watchdog_task = kthread_run(watchdog, NULL, "khungtaskd"); + + return 0; +} + +module_init(hung_task_init); diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 85d5a2455103..88796c330838 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -165,98 +165,12 @@ void softlockup_tick(void) panic("softlockup: hung tasks"); } -/* - * Have a reasonable limit on the number of tasks checked: - */ -unsigned long __read_mostly sysctl_hung_task_check_count = 1024; - -/* - * Zero means infinite timeout - no checking done: - 
*/ -unsigned long __read_mostly sysctl_hung_task_timeout_secs = 480; - -unsigned long __read_mostly sysctl_hung_task_warnings = 10; - -/* - * Only do the hung-tasks check on one CPU: - */ -static int check_cpu __read_mostly = -1; - -static void check_hung_task(struct task_struct *t, unsigned long now) -{ - unsigned long switch_count = t->nvcsw + t->nivcsw; - - if (t->flags & PF_FROZEN) - return; - - if (switch_count != t->last_switch_count || !t->last_switch_timestamp) { - t->last_switch_count = switch_count; - t->last_switch_timestamp = now; - return; - } - if ((long)(now - t->last_switch_timestamp) < - sysctl_hung_task_timeout_secs) - return; - if (!sysctl_hung_task_warnings) - return; - sysctl_hung_task_warnings--; - - /* - * Ok, the task did not get scheduled for more than 2 minutes, - * complain: - */ - printk(KERN_ERR "INFO: task %s:%d blocked for more than " - "%ld seconds.\n", t->comm, t->pid, - sysctl_hung_task_timeout_secs); - printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" - " disables this message.\n"); - sched_show_task(t); - __debug_show_held_locks(t); - - t->last_switch_timestamp = now; - touch_nmi_watchdog(); - - if (softlockup_panic) - panic("softlockup: blocked tasks"); -} - -/* - * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for - * a really long time (120 seconds). If that happens, print out - * a warning. - */ -static void check_hung_uninterruptible_tasks(int this_cpu) -{ - int max_count = sysctl_hung_task_check_count; - unsigned long now = get_timestamp(this_cpu); - struct task_struct *g, *t; - - /* - * If the system crashed already then all bets are off, - * do not report extra hung tasks: - */ - if (test_taint(TAINT_DIE) || did_panic) - return; - - read_lock(&tasklist_lock); - do_each_thread(g, t) { - if (!--max_count) - goto unlock; - /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ - if (t->state == TASK_UNINTERRUPTIBLE) - check_hung_task(t, now); - } while_each_thread(g, t); - unlock: - read_unlock(&tasklist_lock); -} - /* * The watchdog thread - runs every second and touches the timestamp. */ static int watchdog(void *__bind_cpu) { struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; - int this_cpu = (long)__bind_cpu; sched_setscheduler(current, SCHED_FIFO, ¶m); @@ -276,11 +190,6 @@ static int watchdog(void *__bind_cpu) if (kthread_should_stop()) break; - if (this_cpu == check_cpu) { - if (sysctl_hung_task_timeout_secs) - check_hung_uninterruptible_tasks(this_cpu); - } - set_current_state(TASK_INTERRUPTIBLE); } __set_current_state(TASK_RUNNING); @@ -312,18 +221,9 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) break; case CPU_ONLINE: case CPU_ONLINE_FROZEN: - check_cpu = cpumask_any(cpu_online_mask); wake_up_process(per_cpu(watchdog_task, hotcpu)); break; #ifdef CONFIG_HOTPLUG_CPU - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - if (hotcpu == check_cpu) { - /* Pick any other online cpu. 
*/ - check_cpu = cpumask_any_but(cpu_online_mask, hotcpu); - } - break; - case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: if (!per_cpu(watchdog_task, hotcpu)) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 596dc31a7116..2481ed30d2b5 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -805,6 +805,19 @@ static struct ctl_table kern_table[] = { .extra1 = &neg_one, .extra2 = &sixty, }, +#endif +#ifdef CONFIG_DETECT_HUNG_TASK + { + .ctl_name = CTL_UNNUMBERED, + .procname = "hung_task_panic", + .data = &sysctl_hung_task_panic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &one, + }, { .ctl_name = CTL_UNNUMBERED, .procname = "hung_task_check_count", @@ -820,7 +833,7 @@ static struct ctl_table kern_table[] = { .data = &sysctl_hung_task_timeout_secs, .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, + .proc_handler = &proc_dohung_task_timeout_secs, .strategy = &sysctl_intvec, }, { diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 4c9ae6085c75..883ecea22f37 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -186,6 +186,44 @@ config BOOTPARAM_SOFTLOCKUP_PANIC_VALUE default 0 if !BOOTPARAM_SOFTLOCKUP_PANIC default 1 if BOOTPARAM_SOFTLOCKUP_PANIC +config DETECT_HUNG_TASK + bool "Detect Hung Tasks" + depends on DEBUG_KERNEL + default y + help + Say Y here to enable the kernel to detect "hung tasks", + which are bugs that cause the task to be stuck in + uninterruptible "D" state indefinitiley. + + When a hung task is detected, the kernel will print the + current stack trace (which you should report), but the + task will stay in uninterruptible state. If lockdep is + enabled then all held locks will also be reported. This + feature has negligible overhead. + +config BOOTPARAM_HUNG_TASK_PANIC + bool "Panic (Reboot) On Hung Tasks" + depends on DETECT_HUNG_TASK + help + Say Y here to enable the kernel to panic on "hung tasks", + which are bugs that cause the kernel to leave a task stuck + in uninterruptible "D" state. + + The panic can be used in combination with panic_timeout, + to cause the system to reboot automatically after a + hung task has been detected. This feature is useful for + high-availability systems that have uptime guarantees and + where a hung tasks must be resolved ASAP. + + Say N if unsure. + +config BOOTPARAM_HUNG_TASK_PANIC_VALUE + int + depends on DETECT_HUNG_TASK + range 0 1 + default 0 if !BOOTPARAM_HUNG_TASK_PANIC + default 1 if BOOTPARAM_HUNG_TASK_PANIC + config SCHED_DEBUG bool "Collect scheduler debugging info" depends on DEBUG_KERNEL && PROC_FS -- cgit v1.2.3 From b1818748b0cf9427e48acf9713295e829a0d715f Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Mon, 19 Jan 2009 10:31:01 +0100 Subject: x86, ftrace, hw-branch-tracer: dump trace on oops Dump the branch trace on an oops (based on ftrace_dump_on_oops). 
Signed-off-by: Markus Metzger Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack.c | 6 ++++++ include/linux/ftrace.h | 13 +++++++++++++ kernel/trace/trace.h | 1 - kernel/trace/trace_hw_branches.c | 29 ++++++++++++++++++++++------- 4 files changed, 41 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 6b1f6f6f8661..077c9ea655fc 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -14,6 +14,7 @@ #include #include #include +#include #include @@ -195,6 +196,11 @@ unsigned __kprobes long oops_begin(void) int cpu; unsigned long flags; + /* notify the hw-branch tracer so it may disable tracing and + add the last trace to the trace buffer - + the earlier this happens, the more useful the trace. */ + trace_hw_branch_oops(); + oops_enter(); /* racy, but better than risking deadlock. */ diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 054721487574..9f7880d87c39 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -496,4 +496,17 @@ static inline int test_tsk_trace_graph(struct task_struct *tsk) #endif /* CONFIG_TRACING */ + +#ifdef CONFIG_HW_BRANCH_TRACER + +void trace_hw_branch(u64 from, u64 to); +void trace_hw_branch_oops(void); + +#else /* CONFIG_HW_BRANCH_TRACER */ + +static inline void trace_hw_branch(u64 from, u64 to) {} +static inline void trace_hw_branch_oops(void) {} + +#endif /* CONFIG_HW_BRANCH_TRACER */ + #endif /* _LINUX_FTRACE_H */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 54b72781e920..b96037d970df 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -438,7 +438,6 @@ void trace_function(struct trace_array *tr, void trace_graph_return(struct ftrace_graph_ret *trace); int trace_graph_entry(struct ftrace_graph_ent *trace); -void trace_hw_branch(struct trace_array *tr, u64 from, u64 to); void tracing_start_cmdline_record(void); void tracing_stop_cmdline_record(void); diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index 398195397c75..e56df2c7d679 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -40,6 +40,7 @@ static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer); #define this_buffer per_cpu(buffer, smp_processor_id()) static int __read_mostly trace_hw_branches_enabled; +static struct trace_array *hw_branch_trace __read_mostly; /* @@ -128,6 +129,8 @@ static struct notifier_block bts_hotcpu_notifier __cpuinitdata = { static int bts_trace_init(struct trace_array *tr) { + hw_branch_trace = tr; + register_hotcpu_notifier(&bts_hotcpu_notifier); tracing_reset_online_cpus(tr); bts_trace_start(tr); @@ -170,8 +173,9 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) return TRACE_TYPE_UNHANDLED; } -void trace_hw_branch(struct trace_array *tr, u64 from, u64 to) +void trace_hw_branch(u64 from, u64 to) { + struct trace_array *tr = hw_branch_trace; struct ring_buffer_event *event; struct hw_branch_entry *entry; unsigned long irq1, irq2; @@ -204,8 +208,7 @@ void trace_hw_branch(struct trace_array *tr, u64 from, u64 to) local_irq_restore(irq1); } -static void trace_bts_at(struct trace_array *tr, - const struct bts_trace *trace, void *at) +static void trace_bts_at(const struct bts_trace *trace, void *at) { struct bts_struct bts; int err = 0; @@ -220,7 +223,7 @@ static void trace_bts_at(struct trace_array *tr, switch (bts.qualifier) { case BTS_BRANCH: - trace_hw_branch(tr, bts.variant.lbr.from, 
bts.variant.lbr.to); + trace_hw_branch(bts.variant.lbr.from, bts.variant.lbr.to); break; } } @@ -236,12 +239,15 @@ static void trace_bts_cpu(void *arg) const struct bts_trace *trace; unsigned char *at; - if (!this_tracer) + if (unlikely(!tr)) return; if (unlikely(atomic_read(&tr->data[raw_smp_processor_id()]->disabled))) return; + if (unlikely(!this_tracer)) + return; + ds_suspend_bts(this_tracer); trace = ds_read_bts(this_tracer); if (!trace) @@ -249,11 +255,11 @@ static void trace_bts_cpu(void *arg) for (at = trace->ds.top; (void *)at < trace->ds.end; at += trace->ds.size) - trace_bts_at(tr, trace, at); + trace_bts_at(trace, at); for (at = trace->ds.begin; (void *)at < trace->ds.top; at += trace->ds.size) - trace_bts_at(tr, trace, at); + trace_bts_at(trace, at); out: ds_resume_bts(this_tracer); @@ -268,6 +274,15 @@ static void trace_bts_prepare(struct trace_iterator *iter) mutex_unlock(&bts_tracer_mutex); } +void trace_hw_branch_oops(void) +{ + mutex_lock(&bts_tracer_mutex); + + trace_bts_cpu(hw_branch_trace); + + mutex_unlock(&bts_tracer_mutex); +} + struct tracer bts_tracer __read_mostly = { .name = "hw-branch-tracer", -- cgit v1.2.3 From 7e49fcce1bdadd723ae6a0b3b324c4daced61563 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 22 Jan 2009 19:01:40 -0500 Subject: trace, lockdep: manual preempt count adding for local_bh_disable Impact: fix to preempt trace triggering lockdep check_flag failure In local_bh_disable, the use of add_preempt_count causes the preempt tracer to start recording the time preemption is off. But because it already modified the preempt_count to show softirqs disabled, and before it called the lockdep code to handle this, it causes a state that lockdep can not handle. The preempt tracer will reset the ring buffer on start of a trace, and the ring buffer reset code does a spin_lock_irqsave. This calls into lockdep and lockdep will fail when it detects the invalid state of having softirqs disabled but the internal current->softirqs_enabled is still set. The fix is to manually add the SOFTIRQ_OFFSET to preempt count and call the preempt tracer code outside the lockdep critical area. Thanks to Peter Zijlstra for suggesting this solution. 
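Put compactly, the ordering __local_bh_disable() ends up with after this patch is (annotated summary of the hunk below, not new code):

	raw_local_irq_save(flags);
	preempt_count() += SOFTIRQ_OFFSET;	/* raw update, no tracer hook */
	if (softirq_count() == SOFTIRQ_OFFSET)
		trace_softirqs_off(ip);		/* lockdep sees a consistent state */
	raw_local_irq_restore(flags);

	if (preempt_count() == SOFTIRQ_OFFSET)
		trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
		/* the preempt tracer only runs here, outside the lockdep path */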
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 ++ kernel/sched.c | 8 ++++---- kernel/softirq.c | 13 ++++++++++++- 3 files changed, 18 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4cae9b81a1f8..33085b88f87b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -137,6 +137,8 @@ extern unsigned long nr_uninterruptible(void); extern unsigned long nr_active(void); extern unsigned long nr_iowait(void); +extern unsigned long get_parent_ip(unsigned long addr); + struct seq_file; struct cfs_rq; struct task_group; diff --git a/kernel/sched.c b/kernel/sched.c index 52bbf1c842a8..c154825ae753 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4399,10 +4399,7 @@ void scheduler_tick(void) #endif } -#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ - defined(CONFIG_PREEMPT_TRACER)) - -static inline unsigned long get_parent_ip(unsigned long addr) +unsigned long get_parent_ip(unsigned long addr) { if (in_lock_functions(addr)) { addr = CALLER_ADDR2; @@ -4412,6 +4409,9 @@ static inline unsigned long get_parent_ip(unsigned long addr) return addr; } +#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ + defined(CONFIG_PREEMPT_TRACER)) + void __kprobes add_preempt_count(int val) { #ifdef CONFIG_DEBUG_PREEMPT diff --git a/kernel/softirq.c b/kernel/softirq.c index bdbe9de9cd8d..6edfc2c11d99 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -79,13 +80,23 @@ static void __local_bh_disable(unsigned long ip) WARN_ON_ONCE(in_irq()); raw_local_irq_save(flags); - add_preempt_count(SOFTIRQ_OFFSET); + /* + * The preempt tracer hooks into add_preempt_count and will break + * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET + * is set and before current->softirq_enabled is cleared. + * We must manually increment preempt_count here and manually + * call the trace_preempt_off later. + */ + preempt_count() += SOFTIRQ_OFFSET; /* * Were softirqs turned off above: */ if (softirq_count() == SOFTIRQ_OFFSET) trace_softirqs_off(ip); raw_local_irq_restore(flags); + + if (preempt_count() == SOFTIRQ_OFFSET) + trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); } #else /* !CONFIG_TRACE_IRQFLAGS */ static inline void __local_bh_disable(unsigned long ip) -- cgit v1.2.3 From 9011262a37cb438f0fa9394b5e83840db8f9680a Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 23 Jan 2009 12:06:23 -0200 Subject: ftrace: add ftrace_vprintk Impact: new helper function Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 8 ++++++++ kernel/trace/trace.c | 9 +++++++++ 2 files changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 9f7880d87c39..7840e718c6c7 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -302,6 +302,9 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); extern int __ftrace_printk(unsigned long ip, const char *fmt, ...) __attribute__ ((format (printf, 2, 3))); +# define ftrace_vprintk(fmt, ap) __ftrace_printk(_THIS_IP_, fmt, ap) +extern int +__ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap); extern void ftrace_dump(void); #else static inline void @@ -317,6 +320,11 @@ ftrace_printk(const char *fmt, ...) 
{ return 0; } +static inline int +ftrace_vprintk(const char *fmt, va_list ap) +{ + return 0; +} static inline void ftrace_dump(void) { } #endif diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2129ab9d2a48..2f8ac1f008f5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2951,6 +2951,15 @@ int __ftrace_printk(unsigned long ip, const char *fmt, ...) } EXPORT_SYMBOL_GPL(__ftrace_printk); +int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) +{ + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + + return trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap); +} +EXPORT_SYMBOL_GPL(__ftrace_vprintk); + static int trace_panic_handler(struct notifier_block *this, unsigned long event, void *unused) { -- cgit v1.2.3 From 157f9c00e88529ed84bd7d581a40d411e5414cf0 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 26 Jan 2009 15:00:56 -0200 Subject: tracing/blktrace: fix up checkpatch reported problems in ftrace plugin patch Also make sure sparse (make C=2 block/blktrace.o) is happy too. Reported-by: Ingo Molnar Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Ingo Molnar --- block/blktrace.c | 40 +++++++++++++++++++++++++--------------- fs/partitions/check.c | 5 +---- include/linux/blktrace_api.h | 5 +++++ 3 files changed, 31 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/block/blktrace.c b/block/blktrace.c index 630f167f8240..1b2267c798b6 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -37,7 +37,7 @@ static int __read_mostly blk_tracer_enabled; static struct tracer_opt blk_tracer_opts[] = { /* Default disable the minimalistic output */ - { TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC ) }, + { TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC) }, { } }; @@ -169,7 +169,8 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, pid_t pid; int cpu, pc = 0; - if (unlikely(bt->trace_state != Blktrace_running || !blk_tracer_enabled)) + if (unlikely(bt->trace_state != Blktrace_running || + !blk_tracer_enabled)) return; what |= ddir_act[rw & WRITE]; @@ -192,7 +193,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, sizeof(*t) + pdu_len, &flags); if (!event) return; - + ent = ring_buffer_event_data(event); t = (struct blk_io_trace *)ent; pc = preempt_count(); @@ -234,7 +235,7 @@ record_it: if (blk_tr) { ring_buffer_unlock_commit(blk_tr->buffer, event, flags); if (pid != 0 && - (blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC) == 0 && + !(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC) && (trace_flags & TRACE_ITER_STACKTRACE) != 0) __trace_stack(blk_tr, NULL, flags, 5, pc); trace_wake_up(); @@ -955,19 +956,27 @@ static void blk_unregister_tracepoints(void) static void fill_rwbs(char *rwbs, const struct blk_io_trace *t) { - int i = 0; + int i = 0; - if (t->action & BLK_TC_DISCARD) rwbs[i++] = 'D'; - else if (t->action & BLK_TC_WRITE) rwbs[i++] = 'W'; - else if (t->bytes) rwbs[i++] = 'R'; - else rwbs[i++] = 'N'; + if (t->action & BLK_TC_DISCARD) + rwbs[i++] = 'D'; + else if (t->action & BLK_TC_WRITE) + rwbs[i++] = 'W'; + else if (t->bytes) + rwbs[i++] = 'R'; + else + rwbs[i++] = 'N'; - if (t->action & BLK_TC_AHEAD) rwbs[i++] = 'A'; - if (t->action & BLK_TC_BARRIER) rwbs[i++] = 'B'; - if (t->action & BLK_TC_SYNC) rwbs[i++] = 'S'; - if (t->action & BLK_TC_META) rwbs[i++] = 'M'; + if (t->action & BLK_TC_AHEAD) + rwbs[i++] = 'A'; + if (t->action & BLK_TC_BARRIER) + rwbs[i++] = 'B'; + if (t->action & BLK_TC_SYNC) + rwbs[i++] = 'S'; + if (t->action & BLK_TC_META) + 
rwbs[i++] = 'M'; - rwbs[i] = '\0'; + rwbs[i] = '\0'; } static inline @@ -1049,7 +1058,8 @@ static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) return trace_seq_printf(s, "[%s]\n", cmd); } -static int blk_log_with_error(struct trace_seq *s, const struct trace_entry *ent) +static int blk_log_with_error(struct trace_seq *s, + const struct trace_entry *ent) { if (t_sec(ent)) return trace_seq_printf(s, "%llu + %u [%d]\n", t_sector(ent), diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 01714efdc65a..8a17f7edcc74 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "check.h" @@ -268,10 +269,6 @@ ssize_t part_fail_store(struct device *dev, } #endif -#ifdef CONFIG_BLK_DEV_IO_TRACE -extern struct attribute_group blk_trace_attr_group; -#endif - static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL); static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL); static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index 1dba3493d520..59b4b2e8ab67 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -142,6 +142,9 @@ struct blk_user_trace_setup { #ifdef __KERNEL__ #if defined(CONFIG_BLK_DEV_IO_TRACE) + +#include + struct blk_trace { int trace_state; struct rchan *rchan; @@ -192,6 +195,8 @@ extern int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, extern int blk_trace_startstop(struct request_queue *q, int start); extern int blk_trace_remove(struct request_queue *q); +extern struct attribute_group blk_trace_attr_group; + #else /* !CONFIG_BLK_DEV_IO_TRACE */ #define blk_trace_ioctl(bdev, cmd, arg) (-ENOTTY) #define blk_trace_shutdown(q) do { } while (0) -- cgit v1.2.3 From 5e54f5986a579b8445aa1d5ad3435c2cf7568bed Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Fri, 30 Jan 2009 15:29:54 -0800 Subject: softlockup: remove unused definition for spawn_softlockup_task The definition of spawn_softlockup_task in sched.h became unnecessary once it was converted to the early_initcall() interface. Signed-off-by: Mandeep Singh Baines Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index f2f94d532302..2a2811c6239d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -302,9 +302,6 @@ extern int softlockup_thresh; static inline void softlockup_tick(void) { } -static inline void spawn_softlockup_task(void) -{ -} static inline void touch_softlockup_watchdog(void) { } -- cgit v1.2.3 From 9705ecc5c1f8f34f756164a711b4cc61110c0283 Mon Sep 17 00:00:00 2001 From: Balaji Rao Date: Tue, 27 Jan 2009 19:23:12 +0530 Subject: pcf50633_charger: Enable periodic charging restart The battery charger state machine switches into charging mode when the battery voltage falls below 96% of a battery float voltage. But the voltage drop in Li-ion batteries is marginal(1~2 %) till about 80% of its capacity - which means, after a BATFULL, charging won't be restarted until 80%. This work_struct function restarts charging at regular intervals to make sure the battery doesn't discharge too much. 
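For boards that want the periodic restart, the only new wiring needed is a non-zero interval (in jiffies) in the pcf50633 platform data; a minimal sketch, with made-up values and variable name:

	#include <linux/mfd/pcf50633/core.h>

	static struct pcf50633_platform_data example_pcf_pdata = {
		/* poke the charger roughly every ten minutes while in BATFULL */
		.charging_restart_interval	= 10 * 60 * HZ,
	};

Leaving the field at zero keeps the old behaviour, since the BATFULL handler only schedules charging_restart_work for a positive interval.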
Signed-off-by: Balaji Rao Cc: Andy Green Signed-off-by: Anton Vorontsov --- drivers/power/pcf50633-charger.c | 73 +++++++++++++++++++++++++++++++++++++-- include/linux/mfd/pcf50633/core.h | 2 ++ 2 files changed, 73 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/power/pcf50633-charger.c b/drivers/power/pcf50633-charger.c index 41aec2acbb91..1fe1e851a8dd 100644 --- a/drivers/power/pcf50633-charger.c +++ b/drivers/power/pcf50633-charger.c @@ -36,6 +36,8 @@ struct pcf50633_mbc { struct power_supply usb; struct power_supply adapter; + + struct delayed_work charging_restart_work; }; int pcf50633_mbc_usb_curlim_set(struct pcf50633 *pcf, int ma) @@ -43,6 +45,8 @@ int pcf50633_mbc_usb_curlim_set(struct pcf50633 *pcf, int ma) struct pcf50633_mbc *mbc = platform_get_drvdata(pcf->mbc_pdev); int ret = 0; u8 bits; + int charging_start = 1; + u8 mbcs2, chgmod; if (ma >= 1000) bits = PCF50633_MBCC7_USB_1000mA; @@ -50,8 +54,10 @@ int pcf50633_mbc_usb_curlim_set(struct pcf50633 *pcf, int ma) bits = PCF50633_MBCC7_USB_500mA; else if (ma >= 100) bits = PCF50633_MBCC7_USB_100mA; - else + else { bits = PCF50633_MBCC7_USB_SUSPEND; + charging_start = 0; + } ret = pcf50633_reg_set_bit_mask(pcf, PCF50633_REG_MBCC7, PCF50633_MBCC7_USB_MASK, bits); @@ -60,6 +66,22 @@ int pcf50633_mbc_usb_curlim_set(struct pcf50633 *pcf, int ma) else dev_info(pcf->dev, "usb curlim to %d mA\n", ma); + /* Manual charging start */ + mbcs2 = pcf50633_reg_read(pcf, PCF50633_REG_MBCS2); + chgmod = (mbcs2 & PCF50633_MBCS2_MBC_MASK); + + /* If chgmod == BATFULL, setting chgena has no effect. + * We need to set resume instead. + */ + if (chgmod != PCF50633_MBCS2_MBC_BAT_FULL) + pcf50633_reg_set_bit_mask(pcf, PCF50633_REG_MBCC1, + PCF50633_MBCC1_CHGENA, PCF50633_MBCC1_CHGENA); + else + pcf50633_reg_set_bit_mask(pcf, PCF50633_REG_MBCC1, + PCF50633_MBCC1_RESUME, PCF50633_MBCC1_RESUME); + + mbc->usb_active = charging_start; + power_supply_changed(&mbc->usb); return ret; @@ -160,10 +182,44 @@ static struct attribute_group mbc_attr_group = { .attrs = pcf50633_mbc_sysfs_entries, }; +/* MBC state machine switches into charging mode when the battery voltage + * falls below 96% of a battery float voltage. But the voltage drop in Li-ion + * batteries is marginal(1~2 %) till about 80% of its capacity - which means, + * after a BATFULL, charging won't be restarted until 80%. 
+ * + * This work_struct function restarts charging at regular intervals to make + * sure we don't discharge too much + */ + +static void pcf50633_mbc_charging_restart(struct work_struct *work) +{ + struct pcf50633_mbc *mbc; + u8 mbcs2, chgmod; + + mbc = container_of(work, struct pcf50633_mbc, + charging_restart_work.work); + + mbcs2 = pcf50633_reg_read(mbc->pcf, PCF50633_REG_MBCS2); + chgmod = (mbcs2 & PCF50633_MBCS2_MBC_MASK); + + if (chgmod != PCF50633_MBCS2_MBC_BAT_FULL) + return; + + /* Restart charging */ + pcf50633_reg_set_bit_mask(mbc->pcf, PCF50633_REG_MBCC1, + PCF50633_MBCC1_RESUME, PCF50633_MBCC1_RESUME); + mbc->usb_active = 1; + power_supply_changed(&mbc->usb); + + dev_info(mbc->pcf->dev, "Charging restarted\n"); +} + static void pcf50633_mbc_irq_handler(int irq, void *data) { struct pcf50633_mbc *mbc = data; + int chg_restart_interval = + mbc->pcf->pdata->charging_restart_interval; /* USB */ if (irq == PCF50633_IRQ_USBINS) { @@ -172,6 +228,7 @@ pcf50633_mbc_irq_handler(int irq, void *data) mbc->usb_online = 0; mbc->usb_active = 0; pcf50633_mbc_usb_curlim_set(mbc->pcf, 0); + cancel_delayed_work_sync(&mbc->charging_restart_work); } /* Adapter */ @@ -186,7 +243,14 @@ pcf50633_mbc_irq_handler(int irq, void *data) if (irq == PCF50633_IRQ_BATFULL) { mbc->usb_active = 0; mbc->adapter_active = 0; - } + + if (chg_restart_interval > 0) + schedule_delayed_work(&mbc->charging_restart_work, + chg_restart_interval); + } else if (irq == PCF50633_IRQ_USBLIMON) + mbc->usb_active = 0; + else if (irq == PCF50633_IRQ_USBLIMOFF) + mbc->usb_active = 1; power_supply_changed(&mbc->usb); power_supply_changed(&mbc->adapter); @@ -303,6 +367,9 @@ static int __devinit pcf50633_mbc_probe(struct platform_device *pdev) return ret; } + INIT_DELAYED_WORK(&mbc->charging_restart_work, + pcf50633_mbc_charging_restart); + ret = sysfs_create_group(&pdev->dev.kobj, &mbc_attr_group); if (ret) dev_err(mbc->pcf->dev, "failed to create sysfs entries\n"); @@ -328,6 +395,8 @@ static int __devexit pcf50633_mbc_remove(struct platform_device *pdev) power_supply_unregister(&mbc->usb); power_supply_unregister(&mbc->adapter); + cancel_delayed_work_sync(&mbc->charging_restart_work); + kfree(mbc); return 0; diff --git a/include/linux/mfd/pcf50633/core.h b/include/linux/mfd/pcf50633/core.h index 4455b212d75a..c8f51c3c0a72 100644 --- a/include/linux/mfd/pcf50633/core.h +++ b/include/linux/mfd/pcf50633/core.h @@ -29,6 +29,8 @@ struct pcf50633_platform_data { char **batteries; int num_batteries; + int charging_restart_interval; + /* Callbacks */ void (*probe_done)(struct pcf50633 *); void (*mbc_event_callback)(struct pcf50633 *, int); -- cgit v1.2.3 From cc52a29e6245acd9032fcfa0ffcab4cc612de986 Mon Sep 17 00:00:00 2001 From: Balaji Rao Date: Tue, 27 Jan 2009 19:22:55 +0530 Subject: pcf50633_charger: Remove unused mbc_set_status function The 'pcf50633_mbc_set_status' function is unused, so remove it. 
Signed-off-by: Balaji Rao Cc: Andy Green Signed-off-by: Anton Vorontsov --- drivers/power/pcf50633-charger.c | 15 --------------- include/linux/mfd/pcf50633/mbc.h | 1 - 2 files changed, 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/power/pcf50633-charger.c b/drivers/power/pcf50633-charger.c index 1fe1e851a8dd..e8b278f71781 100644 --- a/drivers/power/pcf50633-charger.c +++ b/drivers/power/pcf50633-charger.c @@ -106,21 +106,6 @@ int pcf50633_mbc_get_status(struct pcf50633 *pcf) } EXPORT_SYMBOL_GPL(pcf50633_mbc_get_status); -void pcf50633_mbc_set_status(struct pcf50633 *pcf, int what, int status) -{ - struct pcf50633_mbc *mbc = platform_get_drvdata(pcf->mbc_pdev); - - if (what & PCF50633_MBC_USB_ONLINE) - mbc->usb_online = !!status; - if (what & PCF50633_MBC_USB_ACTIVE) - mbc->usb_active = !!status; - if (what & PCF50633_MBC_ADAPTER_ONLINE) - mbc->adapter_online = !!status; - if (what & PCF50633_MBC_ADAPTER_ACTIVE) - mbc->adapter_active = !!status; -} -EXPORT_SYMBOL_GPL(pcf50633_mbc_set_status); - static ssize_t show_chgmode(struct device *dev, struct device_attribute *attr, char *buf) { diff --git a/include/linux/mfd/pcf50633/mbc.h b/include/linux/mfd/pcf50633/mbc.h index 6e17619b773a..4119579acf2c 100644 --- a/include/linux/mfd/pcf50633/mbc.h +++ b/include/linux/mfd/pcf50633/mbc.h @@ -128,7 +128,6 @@ enum pcf50633_reg_mbcs3 { int pcf50633_mbc_usb_curlim_set(struct pcf50633 *pcf, int ma); int pcf50633_mbc_get_status(struct pcf50633 *); -void pcf50633_mbc_set_status(struct pcf50633 *, int what, int status); #endif -- cgit v1.2.3 From 5bf2b994bfe11bfe86231050897b2d881ca544d9 Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Sun, 18 Jan 2009 17:40:27 +0100 Subject: pda_power: Add optional OTG transceiver and voltage regulator support This patch allows machines to use an OTG transceiver driver instead of supplying a custom is_usb_online callback to check USB power. Also, in the case that the OTG transceiver handles charger control when connected to USB, a regulator named "ac_draw" can be supplied instead of the custom set_charge callback to control the charger when connected to AC. The check for (transceiver->state == OTG_STATE_B_PERIPHERAL) in otg_is_usb_online is probably too simple, I'm just using this with a peripheral only device and gpio_vbus + bq24022. I'm not sure which other OTG states can supply power. 
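For illustration, a hedged sketch of the board side (device and symbol names here are assumptions, not taken from this patch): with a gpio_vbus-style transceiver registered and a charger regulator mapped to the "ac_draw" supply of the pda-power device, the platform data can drop both callbacks:

	#include <linux/platform_device.h>
	#include <linux/pda_power.h>

	static struct pda_power_pdata example_power_pdata = {
		/* no is_usb_online()/set_charge(): the driver falls back to
		 * otg_is_usb_online() and the "ac_draw" regulator instead */
		.ac_max_uA	= 500000,	/* draw up to 500 mA when on AC */
	};

	static struct platform_device example_power_dev = {
		.name			= "pda-power",	/* must match the driver name */
		.id			= -1,
		.dev.platform_data	= &example_power_pdata,
	};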
Signed-off-by: Philipp Zabel Signed-off-by: Anton Vorontsov --- drivers/power/pda_power.c | 89 ++++++++++++++++++++++++++++++++++++++++------- include/linux/pda_power.h | 2 ++ 2 files changed, 79 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/power/pda_power.c b/drivers/power/pda_power.c index b56a704409d2..a232de6a5703 100644 --- a/drivers/power/pda_power.c +++ b/drivers/power/pda_power.c @@ -12,11 +12,14 @@ #include #include +#include #include #include #include +#include #include #include +#include static inline unsigned int get_irq_flags(struct resource *res) { @@ -35,6 +38,11 @@ static struct timer_list supply_timer; static struct timer_list polling_timer; static int polling; +#ifdef CONFIG_USB_OTG_UTILS +static struct otg_transceiver *transceiver; +#endif +static struct regulator *ac_draw; + enum { PDA_PSY_OFFLINE = 0, PDA_PSY_ONLINE = 1, @@ -104,18 +112,35 @@ static void update_status(void) static void update_charger(void) { - if (!pdata->set_charge) - return; - - if (new_ac_status > 0) { - dev_dbg(dev, "charger on (AC)\n"); - pdata->set_charge(PDA_POWER_CHARGE_AC); - } else if (new_usb_status > 0) { - dev_dbg(dev, "charger on (USB)\n"); - pdata->set_charge(PDA_POWER_CHARGE_USB); - } else { - dev_dbg(dev, "charger off\n"); - pdata->set_charge(0); + static int regulator_enabled; + int max_uA = pdata->ac_max_uA; + + if (pdata->set_charge) { + if (new_ac_status > 0) { + dev_dbg(dev, "charger on (AC)\n"); + pdata->set_charge(PDA_POWER_CHARGE_AC); + } else if (new_usb_status > 0) { + dev_dbg(dev, "charger on (USB)\n"); + pdata->set_charge(PDA_POWER_CHARGE_USB); + } else { + dev_dbg(dev, "charger off\n"); + pdata->set_charge(0); + } + } else if (ac_draw) { + if (new_ac_status > 0) { + regulator_set_current_limit(ac_draw, max_uA, max_uA); + if (!regulator_enabled) { + dev_dbg(dev, "charger on (AC)\n"); + regulator_enable(ac_draw); + regulator_enabled = 1; + } + } else { + if (regulator_enabled) { + dev_dbg(dev, "charger off\n"); + regulator_disable(ac_draw); + regulator_enabled = 0; + } + } } } @@ -194,6 +219,13 @@ static void polling_timer_func(unsigned long unused) jiffies + msecs_to_jiffies(pdata->polling_interval)); } +#ifdef CONFIG_USB_OTG_UTILS +static int otg_is_usb_online(void) +{ + return (transceiver->state == OTG_STATE_B_PERIPHERAL); +} +#endif + static int pda_power_probe(struct platform_device *pdev) { int ret = 0; @@ -227,6 +259,9 @@ static int pda_power_probe(struct platform_device *pdev) if (!pdata->polling_interval) pdata->polling_interval = 2000; + if (!pdata->ac_max_uA) + pdata->ac_max_uA = 500000; + setup_timer(&charger_timer, charger_timer_func, 0); setup_timer(&supply_timer, supply_timer_func, 0); @@ -240,6 +275,13 @@ static int pda_power_probe(struct platform_device *pdev) pda_psy_usb.num_supplicants = pdata->num_supplicants; } + ac_draw = regulator_get(dev, "ac_draw"); + if (IS_ERR(ac_draw)) { + dev_dbg(dev, "couldn't get ac_draw regulator\n"); + ac_draw = NULL; + ret = PTR_ERR(ac_draw); + } + if (pdata->is_ac_online) { ret = power_supply_register(&pdev->dev, &pda_psy_ac); if (ret) { @@ -261,6 +303,13 @@ static int pda_power_probe(struct platform_device *pdev) } } +#ifdef CONFIG_USB_OTG_UTILS + transceiver = otg_get_transceiver(); + if (transceiver && !pdata->is_usb_online) { + pdata->is_usb_online = otg_is_usb_online; + } +#endif + if (pdata->is_usb_online) { ret = power_supply_register(&pdev->dev, &pda_psy_usb); if (ret) { @@ -300,10 +349,18 @@ usb_irq_failed: usb_supply_failed: if (pdata->is_ac_online && ac_irq) 
free_irq(ac_irq->start, &pda_psy_ac); +#ifdef CONFIG_USB_OTG_UTILS + if (transceiver) + otg_put_transceiver(transceiver); +#endif ac_irq_failed: if (pdata->is_ac_online) power_supply_unregister(&pda_psy_ac); ac_supply_failed: + if (ac_draw) { + regulator_put(ac_draw); + ac_draw = NULL; + } if (pdata->exit) pdata->exit(dev); init_failed: @@ -327,6 +384,14 @@ static int pda_power_remove(struct platform_device *pdev) power_supply_unregister(&pda_psy_usb); if (pdata->is_ac_online) power_supply_unregister(&pda_psy_ac); +#ifdef CONFIG_USB_OTG_UTILS + if (transceiver) + otg_put_transceiver(transceiver); +#endif + if (ac_draw) { + regulator_put(ac_draw); + ac_draw = NULL; + } if (pdata->exit) pdata->exit(dev); diff --git a/include/linux/pda_power.h b/include/linux/pda_power.h index cb7d10f30763..d4cf7a2ceb3e 100644 --- a/include/linux/pda_power.h +++ b/include/linux/pda_power.h @@ -31,6 +31,8 @@ struct pda_power_pdata { unsigned int wait_for_status; /* msecs, default is 500 */ unsigned int wait_for_charger; /* msecs, default is 500 */ unsigned int polling_interval; /* msecs, default is 2000 */ + + unsigned long ac_max_uA; /* current to draw when on AC */ }; #endif /* __PDA_POWER_H__ */ -- cgit v1.2.3 From f036be96dd9ce442ffb9ab33e3c165f5178815c0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 5 Feb 2009 13:45:43 +0100 Subject: printk: introduce printk_once() This pattern shows up frequently in the kernel: static int once = 1; ... if (once) { once = 0; printk(KERN_ERR "message\n"); } ... So add a printk_once() helper macro that reduces this to a single line of: printk_once(KERN_ERR "message\n"); It works analogously to WARN_ONCE() & friends. (We use a macro not an inline because vararg expansion in inlines looks awkward and the macro is simple enough.) Signed-off-by: Ingo Molnar --- include/linux/kernel.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 343df9ef2412..3c183d9864ae 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -242,6 +242,19 @@ extern struct ratelimit_state printk_ratelimit_state; extern int printk_ratelimit(void); extern bool printk_timed_ratelimit(unsigned long *caller_jiffies, unsigned int interval_msec); + +/* + * Print a one-time message (analogous to WARN_ONCE() et al): + */ +#define printk_once(x...) ({ \ + static int __print_once = 1; \ + \ + if (__print_once) { \ + __print_once = 0; \ + printk(x); \ + } \ +}) + #else static inline int vprintk(const char *s, va_list args) __attribute__ ((format (printf, 1, 0))); @@ -253,6 +266,10 @@ static inline int printk_ratelimit(void) { return 0; } static inline bool printk_timed_ratelimit(unsigned long *caller_jiffies, \ unsigned int interval_msec) \ { return false; } + +/* No effect, but we still get type checking even in the !PRINTK case: */ +#define printk_once(x...) printk(x) + #endif extern int printk_needs_cpu(int cpu); -- cgit v1.2.3 From 0a9877514c4fed10a70720293b37213dd172ee3e Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 5 Feb 2009 16:12:56 -0200 Subject: ring_buffer: remove unused flags parameter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: API change, cleanup >From ring_buffer_{lock_reserve,unlock_commit}. 
$ codiff /tmp/vmlinux.before /tmp/vmlinux.after linux-2.6-tip/kernel/trace/trace.c: trace_vprintk | -14 trace_graph_return | -14 trace_graph_entry | -10 trace_function | -8 __ftrace_trace_stack | -8 ftrace_trace_userstack | -8 tracing_sched_switch_trace | -8 ftrace_trace_special | -12 tracing_sched_wakeup_trace | -8 9 functions changed, 90 bytes removed, diff: -90 linux-2.6-tip/block/blktrace.c: __blk_add_trace | -1 1 function changed, 1 bytes removed, diff: -1 /tmp/vmlinux.after: 10 functions changed, 91 bytes removed, diff: -91 Signed-off-by: Arnaldo Carvalho de Melo Acked-by: Frédéric Weisbecker Signed-off-by: Ingo Molnar --- block/blktrace.c | 8 +++--- include/linux/ring_buffer.h | 9 +++---- kernel/trace/kmemtrace.c | 12 +++------ kernel/trace/ring_buffer.c | 9 ++----- kernel/trace/trace.c | 56 ++++++++++++++-------------------------- kernel/trace/trace_boot.c | 12 +++------ kernel/trace/trace_branch.c | 7 +++-- kernel/trace/trace_hw_branches.c | 6 ++--- kernel/trace/trace_mmiotrace.c | 12 +++------ kernel/trace/trace_power.c | 12 +++------ 10 files changed, 51 insertions(+), 92 deletions(-) (limited to 'include/linux') diff --git a/block/blktrace.c b/block/blktrace.c index d9d7146ee023..8e52f24cc8f9 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -165,7 +165,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, struct task_struct *tsk = current; struct ring_buffer_event *event = NULL; struct blk_io_trace *t; - unsigned long flags; + unsigned long flags = 0; unsigned long *sequence; pid_t pid; int cpu, pc = 0; @@ -191,7 +191,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, tracing_record_cmdline(current); event = ring_buffer_lock_reserve(blk_tr->buffer, - sizeof(*t) + pdu_len, &flags); + sizeof(*t) + pdu_len); if (!event) return; @@ -241,11 +241,11 @@ record_it: memcpy((void *) t + sizeof(*t), pdu_data, pdu_len); if (blk_tr) { - ring_buffer_unlock_commit(blk_tr->buffer, event, flags); + ring_buffer_unlock_commit(blk_tr->buffer, event); if (pid != 0 && !(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC) && (trace_flags & TRACE_ITER_STACKTRACE) != 0) - __trace_stack(blk_tr, flags, 5, pc); + __trace_stack(blk_tr, 0, 5, pc); trace_wake_up(); return; } diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index b3b359660082..3110d92e7d81 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -74,13 +74,10 @@ void ring_buffer_free(struct ring_buffer *buffer); int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size); -struct ring_buffer_event * -ring_buffer_lock_reserve(struct ring_buffer *buffer, - unsigned long length, - unsigned long *flags); +struct ring_buffer_event *ring_buffer_lock_reserve(struct ring_buffer *buffer, + unsigned long length); int ring_buffer_unlock_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event, - unsigned long flags); + struct ring_buffer_event *event); int ring_buffer_write(struct ring_buffer *buffer, unsigned long length, void *data); diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index f04c0625f1cd..256749d1032a 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -272,13 +272,11 @@ void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, struct ring_buffer_event *event; struct kmemtrace_alloc_entry *entry; struct trace_array *tr = kmemtrace_array; - unsigned long irq_flags; if (!kmem_tracing_enabled) return; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - 
&irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) return; entry = ring_buffer_event_data(event); @@ -292,7 +290,7 @@ void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, entry->gfp_flags = gfp_flags; entry->node = node; - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); trace_wake_up(); } @@ -305,13 +303,11 @@ void kmemtrace_mark_free(enum kmemtrace_type_id type_id, struct ring_buffer_event *event; struct kmemtrace_free_entry *entry; struct trace_array *tr = kmemtrace_array; - unsigned long irq_flags; if (!kmem_tracing_enabled) return; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) return; entry = ring_buffer_event_data(event); @@ -322,7 +318,7 @@ void kmemtrace_mark_free(enum kmemtrace_type_id type_id, entry->call_site = call_site; entry->ptr = ptr; - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); trace_wake_up(); } diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b36d7374ceef..aee76b3eeed2 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1257,7 +1257,6 @@ static DEFINE_PER_CPU(int, rb_need_resched); * ring_buffer_lock_reserve - reserve a part of the buffer * @buffer: the ring buffer to reserve from * @length: the length of the data to reserve (excluding event header) - * @flags: a pointer to save the interrupt flags * * Returns a reseverd event on the ring buffer to copy directly to. * The user of this interface will need to get the body to write into @@ -1270,9 +1269,7 @@ static DEFINE_PER_CPU(int, rb_need_resched); * If NULL is returned, then nothing has been allocated or locked. */ struct ring_buffer_event * -ring_buffer_lock_reserve(struct ring_buffer *buffer, - unsigned long length, - unsigned long *flags) +ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; @@ -1339,15 +1336,13 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, * ring_buffer_unlock_commit - commit a reserved * @buffer: The buffer to commit to * @event: The event pointer to commit. - * @flags: the interrupt flags received from ring_buffer_lock_reserve. * * This commits the data to the ring buffer, and releases any locks held. * * Must be paired with ring_buffer_lock_reserve. 
*/ int ring_buffer_unlock_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event, - unsigned long flags) + struct ring_buffer_event *event) { struct ring_buffer_per_cpu *cpu_buffer; int cpu = raw_smp_processor_id(); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3536ef41575d..eb453a238a6f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -783,14 +783,12 @@ trace_function(struct trace_array *tr, { struct ring_buffer_event *event; struct ftrace_entry *entry; - unsigned long irq_flags; /* If we are reading the ring buffer, don't trace */ if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) return; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) return; entry = ring_buffer_event_data(event); @@ -798,7 +796,7 @@ trace_function(struct trace_array *tr, entry->ent.type = TRACE_FN; entry->ip = ip; entry->parent_ip = parent_ip; - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); } #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -809,20 +807,18 @@ static void __trace_graph_entry(struct trace_array *tr, { struct ring_buffer_event *event; struct ftrace_graph_ent_entry *entry; - unsigned long irq_flags; if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) return; - event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry)); if (!event) return; entry = ring_buffer_event_data(event); tracing_generic_entry_update(&entry->ent, flags, pc); entry->ent.type = TRACE_GRAPH_ENT; entry->graph_ent = *trace; - ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags); + ring_buffer_unlock_commit(global_trace.buffer, event); } static void __trace_graph_return(struct trace_array *tr, @@ -832,20 +828,18 @@ static void __trace_graph_return(struct trace_array *tr, { struct ring_buffer_event *event; struct ftrace_graph_ret_entry *entry; - unsigned long irq_flags; if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) return; - event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry)); if (!event) return; entry = ring_buffer_event_data(event); tracing_generic_entry_update(&entry->ent, flags, pc); entry->ent.type = TRACE_GRAPH_RET; entry->ret = *trace; - ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags); + ring_buffer_unlock_commit(global_trace.buffer, event); } #endif @@ -866,10 +860,8 @@ static void __ftrace_trace_stack(struct trace_array *tr, struct ring_buffer_event *event; struct stack_entry *entry; struct stack_trace trace; - unsigned long irq_flags; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) return; entry = ring_buffer_event_data(event); @@ -884,7 +876,7 @@ static void __ftrace_trace_stack(struct trace_array *tr, trace.entries = entry->caller; save_stack_trace(&trace); - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); #endif } @@ -912,13 +904,11 @@ static void ftrace_trace_userstack(struct trace_array *tr, struct ring_buffer_event *event; struct userstack_entry *entry; struct stack_trace trace; - unsigned long irq_flags; if (!(trace_flags & TRACE_ITER_USERSTACKTRACE)) return; - event = 
ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) return; entry = ring_buffer_event_data(event); @@ -933,7 +923,7 @@ static void ftrace_trace_userstack(struct trace_array *tr, trace.entries = entry->caller; save_stack_trace_user(&trace); - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); #endif } @@ -950,10 +940,8 @@ ftrace_trace_special(void *__tr, struct ring_buffer_event *event; struct trace_array *tr = __tr; struct special_entry *entry; - unsigned long irq_flags; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) return; entry = ring_buffer_event_data(event); @@ -962,9 +950,9 @@ ftrace_trace_special(void *__tr, entry->arg1 = arg1; entry->arg2 = arg2; entry->arg3 = arg3; - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); - ftrace_trace_stack(tr, irq_flags, 4, pc); - ftrace_trace_userstack(tr, irq_flags, pc); + ring_buffer_unlock_commit(tr->buffer, event); + ftrace_trace_stack(tr, 0, 4, pc); + ftrace_trace_userstack(tr, 0, pc); trace_wake_up(); } @@ -984,10 +972,8 @@ tracing_sched_switch_trace(struct trace_array *tr, { struct ring_buffer_event *event; struct ctx_switch_entry *entry; - unsigned long irq_flags; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) return; entry = ring_buffer_event_data(event); @@ -1000,7 +986,7 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->next_prio = next->prio; entry->next_state = next->state; entry->next_cpu = task_cpu(next); - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); ftrace_trace_stack(tr, flags, 5, pc); ftrace_trace_userstack(tr, flags, pc); } @@ -1013,10 +999,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr, { struct ring_buffer_event *event; struct ctx_switch_entry *entry; - unsigned long irq_flags; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) return; entry = ring_buffer_event_data(event); @@ -1029,7 +1013,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->next_prio = wakee->prio; entry->next_state = wakee->state; entry->next_cpu = task_cpu(wakee); - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); ftrace_trace_stack(tr, flags, 6, pc); ftrace_trace_userstack(tr, flags, pc); @@ -2841,7 +2825,7 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) trace_buf[len] = 0; size = sizeof(*entry) + len + 1; - event = ring_buffer_lock_reserve(tr->buffer, size, &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, size); if (!event) goto out_unlock; entry = ring_buffer_event_data(event); @@ -2852,7 +2836,7 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) memcpy(&entry->buf, trace_buf, len); entry->buf[len] = 0; - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); out_unlock: spin_unlock_irqrestore(&trace_buf_lock, irq_flags); diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 1f07895977a0..4e08debf662d 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -132,7 +132,6 @@ void 
trace_boot_call(struct boot_trace_call *bt, initcall_t fn) { struct ring_buffer_event *event; struct trace_boot_call *entry; - unsigned long irq_flags; struct trace_array *tr = boot_trace; if (!tr || !pre_initcalls_finished) @@ -144,15 +143,14 @@ void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) sprint_symbol(bt->func, (unsigned long)fn); preempt_disable(); - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) goto out; entry = ring_buffer_event_data(event); tracing_generic_entry_update(&entry->ent, 0, 0); entry->ent.type = TRACE_BOOT_CALL; entry->boot_call = *bt; - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); trace_wake_up(); @@ -164,7 +162,6 @@ void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) { struct ring_buffer_event *event; struct trace_boot_ret *entry; - unsigned long irq_flags; struct trace_array *tr = boot_trace; if (!tr || !pre_initcalls_finished) @@ -173,15 +170,14 @@ void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) sprint_symbol(bt->func, (unsigned long)fn); preempt_disable(); - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) goto out; entry = ring_buffer_event_data(event); tracing_generic_entry_update(&entry->ent, 0, 0); entry->ent.type = TRACE_BOOT_RET; entry->boot_ret = *bt; - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); trace_wake_up(); diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 027e83690615..770e52acfc10 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -33,7 +33,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) struct trace_array *tr = branch_tracer; struct ring_buffer_event *event; struct trace_branch *entry; - unsigned long flags, irq_flags; + unsigned long flags; int cpu, pc; const char *p; @@ -52,8 +52,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) goto out; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) goto out; @@ -75,7 +74,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) entry->line = f->line; entry->correct = val == expect; - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); out: atomic_dec(&tr->data[cpu]->disabled); diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index fff3545fc866..e720c001db2b 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -175,7 +175,7 @@ void trace_hw_branch(u64 from, u64 to) struct trace_array *tr = hw_branch_trace; struct ring_buffer_event *event; struct hw_branch_entry *entry; - unsigned long irq1, irq2; + unsigned long irq1; int cpu; if (unlikely(!tr)) @@ -189,7 +189,7 @@ void trace_hw_branch(u64 from, u64 to) if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) goto out; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq2); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) goto out; entry = ring_buffer_event_data(event); @@ -198,7 +198,7 @@ void trace_hw_branch(u64 from, u64 to) entry->ent.cpu = cpu; 
entry->from = from; entry->to = to; - ring_buffer_unlock_commit(tr->buffer, event, irq2); + ring_buffer_unlock_commit(tr->buffer, event); out: atomic_dec(&tr->data[cpu]->disabled); diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index ec78e244242e..104ddebc11d1 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -307,10 +307,8 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, { struct ring_buffer_event *event; struct trace_mmiotrace_rw *entry; - unsigned long irq_flags; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) { atomic_inc(&dropped_count); return; @@ -319,7 +317,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, tracing_generic_entry_update(&entry->ent, 0, preempt_count()); entry->ent.type = TRACE_MMIO_RW; entry->rw = *rw; - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); trace_wake_up(); } @@ -337,10 +335,8 @@ static void __trace_mmiotrace_map(struct trace_array *tr, { struct ring_buffer_event *event; struct trace_mmiotrace_map *entry; - unsigned long irq_flags; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) { atomic_inc(&dropped_count); return; @@ -349,7 +345,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr, tracing_generic_entry_update(&entry->ent, 0, preempt_count()); entry->ent.type = TRACE_MMIO_MAP; entry->map = *map; - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); trace_wake_up(); } diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c index faa6ab7a1f5c..3b1a292d12d2 100644 --- a/kernel/trace/trace_power.c +++ b/kernel/trace/trace_power.c @@ -115,7 +115,6 @@ void trace_power_end(struct power_trace *it) struct ring_buffer_event *event; struct trace_power *entry; struct trace_array_cpu *data; - unsigned long irq_flags; struct trace_array *tr = power_trace; if (!trace_power_enabled) @@ -125,15 +124,14 @@ void trace_power_end(struct power_trace *it) it->end = ktime_get(); data = tr->data[smp_processor_id()]; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) goto out; entry = ring_buffer_event_data(event); tracing_generic_entry_update(&entry->ent, 0, 0); entry->ent.type = TRACE_POWER; entry->state_data = *it; - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); trace_wake_up(); @@ -148,7 +146,6 @@ void trace_power_mark(struct power_trace *it, unsigned int type, struct ring_buffer_event *event; struct trace_power *entry; struct trace_array_cpu *data; - unsigned long irq_flags; struct trace_array *tr = power_trace; if (!trace_power_enabled) @@ -162,15 +159,14 @@ void trace_power_mark(struct power_trace *it, unsigned int type, it->end = it->stamp; data = tr->data[smp_processor_id()]; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) goto out; entry = ring_buffer_event_data(event); tracing_generic_entry_update(&entry->ent, 0, 0); entry->ent.type = TRACE_POWER; entry->state_data = *it; - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); 
trace_wake_up(); -- cgit v1.2.3 From 78d904b46a72fcf15ea6a39672bbef92953876b5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 5 Feb 2009 18:43:07 -0500 Subject: ring-buffer: add NMI protection for spinlocks Impact: prevent deadlock in NMI The ring buffers are not yet totally lockless with writing to the buffer. When a writer crosses a page, it grabs a per cpu spinlock to protect against a reader. The spinlocks taken by a writer are not to protect against other writers, since a writer can only write to its own per cpu buffer. The spinlocks protect against readers that can touch any cpu buffer. The writers are made to be reentrant with the spinlocks disabling interrupts. The problem arises when an NMI writes to the buffer, and that write crosses a page boundary. If it grabs a spinlock, it can be racing with another writer (since disabling interrupts does not protect against NMIs) or with a reader on the same CPU. Luckily, most of the users are not reentrant, which protects against this issue. But if a user of the ring buffer becomes reentrant (which the ring buffers do allow) and the NMI also writes to the ring buffer, we risk a deadlock. This patch moves the ftrace_nmi_enter called by nmi_enter() to the ring buffer code. It renames the ftrace_nmi_enter used by arch-specific code to arch_ftrace_nmi_enter and updates the Kconfig to handle it. When an NMI occurs, it will set a per cpu variable in the ring buffer code and will clear it when the NMI exits. If a write to the ring buffer crosses page boundaries inside an NMI, a trylock is used on the spin lock instead. If the spinlock fails to be acquired, then the entry is discarded. This bug appeared in the ftrace work in the RT tree, where event tracing is reentrant. This workaround solved the deadlocks that appeared there.
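For illustration only, the locking decision described above boils down to the sketch below (the helper name rb_lock_for_nmi_safety is invented here; in the patch the logic is open-coded in __rb_reserve_next):

	/*
	 * Sketch: take the reader-protection lock when a write crosses a
	 * page, but never spin on it from NMI context, because the holder
	 * may be the very writer or reader this NMI interrupted.
	 */
	static bool rb_lock_for_nmi_safety(struct ring_buffer_per_cpu *cpu_buffer)
	{
		if (unlikely(__get_cpu_var(rb_in_nmi)))
			return __raw_spin_trylock(&cpu_buffer->lock);

		__raw_spin_lock(&cpu_buffer->lock);
		return true;
	}

When the trylock fails, the reservation is discarded rather than risking a deadlock.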
Signed-off-by: Steven Rostedt --- arch/x86/Kconfig | 1 + arch/x86/kernel/ftrace.c | 8 ++++---- include/linux/ftrace_irq.h | 10 +++++++++- kernel/trace/Kconfig | 8 ++++++++ kernel/trace/ring_buffer.c | 48 ++++++++++++++++++++++++++++++++++++++++++++-- 5 files changed, 68 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 73f7fe8fd4d1..a6be725cb049 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -34,6 +34,7 @@ config X86 select HAVE_FUNCTION_TRACER select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACE_MCOUNT_TEST + select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE || FUNCTION_GRAPH_TRACER select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) select HAVE_ARCH_KGDB if !X86_VOYAGER select HAVE_ARCH_TRACEHOOK diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 4d33224c055f..4c683587055b 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -113,7 +113,7 @@ static void ftrace_mod_code(void) MCOUNT_INSN_SIZE); } -void ftrace_nmi_enter(void) +void arch_ftrace_nmi_enter(void) { atomic_inc(&in_nmi); /* Must have in_nmi seen before reading write flag */ @@ -124,7 +124,7 @@ void ftrace_nmi_enter(void) } } -void ftrace_nmi_exit(void) +void arch_ftrace_nmi_exit(void) { /* Finish all executions before clearing in_nmi */ smp_wmb(); @@ -376,12 +376,12 @@ int ftrace_disable_ftrace_graph_caller(void) */ static atomic_t in_nmi; -void ftrace_nmi_enter(void) +void arch_ftrace_nmi_enter(void) { atomic_inc(&in_nmi); } -void ftrace_nmi_exit(void) +void arch_ftrace_nmi_exit(void) { atomic_dec(&in_nmi); } diff --git a/include/linux/ftrace_irq.h b/include/linux/ftrace_irq.h index 366a054d0b05..29de6779a963 100644 --- a/include/linux/ftrace_irq.h +++ b/include/linux/ftrace_irq.h @@ -2,7 +2,15 @@ #define _LINUX_FTRACE_IRQ_H -#if defined(CONFIG_DYNAMIC_FTRACE) || defined(CONFIG_FUNCTION_GRAPH_TRACER) +#ifdef CONFIG_FTRACE_NMI_ENTER +extern void arch_ftrace_nmi_enter(void); +extern void arch_ftrace_nmi_exit(void); +#else +static inline void arch_ftrace_nmi_enter(void) { } +static inline void arch_ftrace_nmi_exit(void) { } +#endif + +#ifdef CONFIG_RING_BUFFER extern void ftrace_nmi_enter(void); extern void ftrace_nmi_exit(void); #else diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 28f2644484d9..25131a5d5e4f 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -9,6 +9,9 @@ config USER_STACKTRACE_SUPPORT config NOP_TRACER bool +config HAVE_FTRACE_NMI_ENTER + bool + config HAVE_FUNCTION_TRACER bool @@ -37,6 +40,11 @@ config TRACER_MAX_TRACE config RING_BUFFER bool +config FTRACE_NMI_ENTER + bool + depends on HAVE_FTRACE_NMI_ENTER + default y + config TRACING bool select DEBUG_FS diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b36d7374ceef..a60a6a852f42 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -4,6 +4,7 @@ * Copyright (C) 2008 Steven Rostedt */ #include +#include #include #include #include @@ -18,6 +19,35 @@ #include "trace.h" +/* + * Since the write to the buffer is still not fully lockless, + * we must be careful with NMIs. The locks in the writers + * are taken when a write crosses to a new page. The locks + * protect against races with the readers (this will soon + * be fixed with a lockless solution). + * + * Because we can not protect against NMIs, and we want to + * keep traces reentrant, we need to manage what happens + * when we are in an NMI. 
+ */ +static DEFINE_PER_CPU(int, rb_in_nmi); + +void ftrace_nmi_enter(void) +{ + __get_cpu_var(rb_in_nmi)++; + /* call arch specific handler too */ + arch_ftrace_nmi_enter(); +} + +void ftrace_nmi_exit(void) +{ + arch_ftrace_nmi_exit(); + __get_cpu_var(rb_in_nmi)--; + /* NMIs are not recursive */ + WARN_ON_ONCE(__get_cpu_var(rb_in_nmi)); +} + + /* * A fast way to enable or disable all ring buffers is to * call tracing_on or tracing_off. Turning off the ring buffers @@ -982,6 +1012,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer *buffer = cpu_buffer->buffer; struct ring_buffer_event *event; unsigned long flags; + bool lock_taken = false; commit_page = cpu_buffer->commit_page; /* we just need to protect against interrupts */ @@ -995,7 +1026,19 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *next_page = tail_page; local_irq_save(flags); - __raw_spin_lock(&cpu_buffer->lock); + /* + * NMIs can happen after we take the lock. + * If we are in an NMI, only take the lock + * if it is not already taken. Otherwise + * simply fail. + */ + if (unlikely(__get_cpu_var(rb_in_nmi))) { + if (!__raw_spin_trylock(&cpu_buffer->lock)) + goto out_unlock; + } else + __raw_spin_lock(&cpu_buffer->lock); + + lock_taken = true; rb_inc_page(cpu_buffer, &next_page); @@ -1097,7 +1140,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, if (tail <= BUF_PAGE_SIZE) local_set(&tail_page->write, tail); - __raw_spin_unlock(&cpu_buffer->lock); + if (likely(lock_taken)) + __raw_spin_unlock(&cpu_buffer->lock); local_irq_restore(flags); return NULL; } -- cgit v1.2.3 From d8b891a2db13c8ed296158d6f8c4e335896d0cef Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 5 Feb 2009 19:54:51 -0500 Subject: ring-buffer: allow tracing_off to be used in core kernel code tracing_off() is the fastest way to stop recording to the ring buffers. This may be used in places like panic and die, just before the ftrace_dump is called. This patch adds the appropriate CPP conditionals to make it a stub function when the ring buffer is not configured it. Signed-off-by: Steven Rostedt --- include/linux/ring_buffer.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index b3b359660082..ac94c066f6e9 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -124,9 +124,18 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu); u64 ring_buffer_time_stamp(int cpu); void ring_buffer_normalize_time_stamp(int cpu, u64 *ts); +/* + * The below functions are fine to use outside the tracing facility. + */ +#ifdef CONFIG_RING_BUFFER void tracing_on(void); void tracing_off(void); void tracing_off_permanent(void); +#else +static inline void tracing_on(void) { } +static inline void tracing_off(void) { } +static inline void tracing_off_permanent(void) { } +#endif void *ring_buffer_alloc_read_page(struct ring_buffer *buffer); void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data); -- cgit v1.2.3 From 375b38b4214f29109a393ab762d468054bf52354 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 6 Feb 2009 00:51:37 -0500 Subject: nmi: add generic nmi tracking state This code adds an in_nmi() macro that uses the current tasks preempt count to track when it is in NMI context. Other parts of the kernel can use this to determine if the context is in NMI context or not. 
This code was inspired by the -rt patch in_nmi version that was written by Peter Zijlstra, who borrowed that code from Mathieu Desnoyers. Reported-by: Andrew Morton Signed-off-by: Steven Rostedt --- include/linux/hardirq.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index f83288347dda..f3cf86e1465b 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -61,6 +61,12 @@ #error PREEMPT_ACTIVE is too low! #endif +#define NMI_OFFSET (PREEMPT_ACTIVE << 1) + +#if NMI_OFFSET >= 0x80000000 +#error PREEMPT_ACTIVE too high! +#endif + #define hardirq_count() (preempt_count() & HARDIRQ_MASK) #define softirq_count() (preempt_count() & SOFTIRQ_MASK) #define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK)) @@ -73,6 +79,11 @@ #define in_softirq() (softirq_count()) #define in_interrupt() (irq_count()) +/* + * Are we in NMI context? + */ +#define in_nmi() (preempt_count() & NMI_OFFSET) + #if defined(CONFIG_PREEMPT) # define PREEMPT_INATOMIC_BASE kernel_locked() # define PREEMPT_CHECK_OFFSET 1 @@ -167,6 +178,8 @@ extern void irq_exit(void); #define nmi_enter() \ do { \ ftrace_nmi_enter(); \ + BUG_ON(in_nmi()); \ + add_preempt_count(NMI_OFFSET); \ lockdep_off(); \ rcu_nmi_enter(); \ __irq_enter(); \ @@ -177,6 +190,8 @@ extern void irq_exit(void); __irq_exit(); \ rcu_nmi_exit(); \ lockdep_on(); \ + BUG_ON(!in_nmi()); \ + sub_preempt_count(NMI_OFFSET); \ ftrace_nmi_exit(); \ } while (0) -- cgit v1.2.3 From a81bd80a0b0a405dc0483e2c428332d69da2c79f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 6 Feb 2009 01:45:16 -0500 Subject: ring-buffer: use generic version of in_nmi Impact: clean up Now that a generic in_nmi is available, this patch removes the special code in the ring_buffer and implements the in_nmi generic version instead. With this change, I was also able to rename the "arch_ftrace_nmi_enter" back to "ftrace_nmi_enter" and remove the code from the ring buffer. 
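As a usage illustration (the structure and caller below are hypothetical, not part of these patches, and interrupt handling is elided for brevity), any code with a path that must not spin from NMI context can now simply test the generic in_nmi():

	#include <linux/hardirq.h>
	#include <linux/spinlock.h>

	struct my_log {
		spinlock_t lock;
	};

	static void my_log_event(struct my_log *log)
	{
		if (in_nmi()) {
			/* NMI context: only try the lock, never spin on it. */
			if (!spin_trylock(&log->lock))
				return;
		} else
			spin_lock(&log->lock);

		/* ... record the event ... */
		spin_unlock(&log->lock);
	}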
Signed-off-by: Steven Rostedt --- arch/x86/kernel/ftrace.c | 4 ++-- include/linux/ftrace_irq.h | 8 -------- kernel/trace/ring_buffer.c | 43 +++++++++++++------------------------------ 3 files changed, 15 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 918073c6681b..d74d75e0952d 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -113,7 +113,7 @@ static void ftrace_mod_code(void) MCOUNT_INSN_SIZE); } -void arch_ftrace_nmi_enter(void) +void ftrace_nmi_enter(void) { atomic_inc(&nmi_running); /* Must have nmi_running seen before reading write flag */ @@ -124,7 +124,7 @@ void arch_ftrace_nmi_enter(void) } } -void arch_ftrace_nmi_exit(void) +void ftrace_nmi_exit(void) { /* Finish all executions before clearing nmi_running */ smp_wmb(); diff --git a/include/linux/ftrace_irq.h b/include/linux/ftrace_irq.h index 29de6779a963..dca7bf8cffe2 100644 --- a/include/linux/ftrace_irq.h +++ b/include/linux/ftrace_irq.h @@ -3,14 +3,6 @@ #ifdef CONFIG_FTRACE_NMI_ENTER -extern void arch_ftrace_nmi_enter(void); -extern void arch_ftrace_nmi_exit(void); -#else -static inline void arch_ftrace_nmi_enter(void) { } -static inline void arch_ftrace_nmi_exit(void) { } -#endif - -#ifdef CONFIG_RING_BUFFER extern void ftrace_nmi_enter(void); extern void ftrace_nmi_exit(void); #else diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a60a6a852f42..5ee344417cd5 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -19,35 +20,6 @@ #include "trace.h" -/* - * Since the write to the buffer is still not fully lockless, - * we must be careful with NMIs. The locks in the writers - * are taken when a write crosses to a new page. The locks - * protect against races with the readers (this will soon - * be fixed with a lockless solution). - * - * Because we can not protect against NMIs, and we want to - * keep traces reentrant, we need to manage what happens - * when we are in an NMI. - */ -static DEFINE_PER_CPU(int, rb_in_nmi); - -void ftrace_nmi_enter(void) -{ - __get_cpu_var(rb_in_nmi)++; - /* call arch specific handler too */ - arch_ftrace_nmi_enter(); -} - -void ftrace_nmi_exit(void) -{ - arch_ftrace_nmi_exit(); - __get_cpu_var(rb_in_nmi)--; - /* NMIs are not recursive */ - WARN_ON_ONCE(__get_cpu_var(rb_in_nmi)); -} - - /* * A fast way to enable or disable all ring buffers is to * call tracing_on or tracing_off. Turning off the ring buffers @@ -1027,12 +999,23 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, local_irq_save(flags); /* + * Since the write to the buffer is still not + * fully lockless, we must be careful with NMIs. + * The locks in the writers are taken when a write + * crosses to a new page. The locks protect against + * races with the readers (this will soon be fixed + * with a lockless solution). + * + * Because we can not protect against NMIs, and we + * want to keep traces reentrant, we need to manage + * what happens when we are in an NMI. + * * NMIs can happen after we take the lock. * If we are in an NMI, only take the lock * if it is not already taken. Otherwise * simply fail. 
*/ - if (unlikely(__get_cpu_var(rb_in_nmi))) { + if (unlikely(in_nmi())) { if (!__raw_spin_trylock(&cpu_buffer->lock)) goto out_unlock; } else -- cgit v1.2.3 From 57794a9d48b63e34acbe63282628c9f029603308 Mon Sep 17 00:00:00 2001 From: Wenji Huang Date: Fri, 6 Feb 2009 17:33:27 +0800 Subject: trace: trivial fixes in comment typos. Impact: clean up Fixed several typos in the comments. Signed-off-by: Wenji Huang Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 2 +- kernel/trace/ftrace.c | 6 +++--- kernel/trace/trace.h | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 7840e718c6c7..5e302d636fc2 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -140,7 +140,7 @@ static inline int ftrace_disable_ftrace_graph_caller(void) { return 0; } #endif /** - * ftrace_make_nop - convert code into top + * ftrace_make_nop - convert code into nop * @mod: module structure if called by module load initialization * @rec: the mcount call site record * @addr: the address that the call site should be calling diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 68610031780b..1796e018fbff 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -465,7 +465,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) * it is not enabled then do nothing. * * If this record is not to be traced and - * it is enabled then disabled it. + * it is enabled then disable it. * */ if (rec->flags & FTRACE_FL_NOTRACE) { @@ -485,7 +485,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) if (fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) return 0; - /* Record is not filtered and is not enabled do nothing */ + /* Record is not filtered or enabled, do nothing */ if (!fl) return 0; @@ -507,7 +507,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) } else { - /* if record is not enabled do nothing */ + /* if record is not enabled, do nothing */ if (!(rec->flags & FTRACE_FL_ENABLED)) return 0; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 5efc4c707f7e..f92aba52a894 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -616,12 +616,12 @@ extern struct tracer nop_trace; * preempt_enable (after a disable), a schedule might take place * causing an infinite recursion. * - * To prevent this, we read the need_recshed flag before + * To prevent this, we read the need_resched flag before * disabling preemption. When we want to enable preemption we * check the flag, if it is set, then we call preempt_enable_no_resched. * Otherwise, we call preempt_enable. * - * The rational for doing the above is that if need resched is set + * The rational for doing the above is that if need_resched is set * and we have yet to reschedule, we are either in an atomic location * (where we do not need to check for scheduling) or we are inside * the scheduler and do not want to resched. @@ -642,7 +642,7 @@ static inline int ftrace_preempt_disable(void) * * This is a scheduler safe way to enable preemption and not miss * any preemption checks. The disabled saved the state of preemption. - * If resched is set, then we were either inside an atomic or + * If resched is set, then we are either inside an atomic or * are inside the scheduler (we would have already scheduled * otherwise). In this case, we do not want to call normal * preempt_enable, but preempt_enable_no_resched instead. 
-- cgit v1.2.3 From 1292211058aaf872eeb2a0e2677d237916b4501f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 7 Feb 2009 22:16:12 +0100 Subject: tracing/power: move the power trace headers to a dedicated file Impact: cleanup Move the power tracer headers to trace/power.h to keep ftrace.h and power bits more easy to maintain as separated topics. Signed-off-by: Frederic Weisbecker Cc: Arjan van de Ven Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 2 +- arch/x86/kernel/process.c | 2 +- include/linux/ftrace.h | 30 ------------------------- include/trace/power.h | 35 ++++++++++++++++++++++++++++++ kernel/trace/trace.h | 1 + kernel/trace/trace_power.c | 2 +- 6 files changed, 39 insertions(+), 33 deletions(-) create mode 100644 include/trace/power.h (limited to 'include/linux') diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 4b1c319d30c3..7ed925edf4d2 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -33,7 +33,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index e68bb9e30864..026819ffcb0c 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include #include diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 5e302d636fc2..106b7909d500 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -339,36 +339,6 @@ ftrace_init_module(struct module *mod, unsigned long *start, unsigned long *end) { } #endif -enum { - POWER_NONE = 0, - POWER_CSTATE = 1, - POWER_PSTATE = 2, -}; - -struct power_trace { -#ifdef CONFIG_POWER_TRACER - ktime_t stamp; - ktime_t end; - int type; - int state; -#endif -}; - -#ifdef CONFIG_POWER_TRACER -extern void trace_power_start(struct power_trace *it, unsigned int type, - unsigned int state); -extern void trace_power_mark(struct power_trace *it, unsigned int type, - unsigned int state); -extern void trace_power_end(struct power_trace *it); -#else -static inline void trace_power_start(struct power_trace *it, unsigned int type, - unsigned int state) { } -static inline void trace_power_mark(struct power_trace *it, unsigned int type, - unsigned int state) { } -static inline void trace_power_end(struct power_trace *it) { } -#endif - - /* * Structure that defines an entry function trace. 
*/ diff --git a/include/trace/power.h b/include/trace/power.h new file mode 100644 index 000000000000..c7cefbcdaea4 --- /dev/null +++ b/include/trace/power.h @@ -0,0 +1,35 @@ +#ifndef _TRACE_POWER_H +#define _TRACE_POWER_H + +#include + +enum { + POWER_NONE = 0, + POWER_CSTATE = 1, + POWER_PSTATE = 2, +}; + +struct power_trace { +#ifdef CONFIG_POWER_TRACER + ktime_t stamp; + ktime_t end; + int type; + int state; +#endif +}; + +#ifdef CONFIG_POWER_TRACER +extern void trace_power_start(struct power_trace *it, unsigned int type, + unsigned int state); +extern void trace_power_mark(struct power_trace *it, unsigned int type, + unsigned int state); +extern void trace_power_end(struct power_trace *it); +#else +static inline void trace_power_start(struct power_trace *it, unsigned int type, + unsigned int state) { } +static inline void trace_power_mark(struct power_trace *it, unsigned int type, + unsigned int state) { } +static inline void trace_power_end(struct power_trace *it) { } +#endif + +#endif /* _TRACE_POWER_H */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index a011ec062225..1ecfb9d2b365 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -10,6 +10,7 @@ #include #include #include +#include enum trace_type { __TRACE_FIRST_TYPE = 0, diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c index bfc21f8079ab..b1d0d087d3a6 100644 --- a/kernel/trace/trace_power.c +++ b/kernel/trace/trace_power.c @@ -11,7 +11,7 @@ #include #include -#include +#include #include #include -- cgit v1.2.3 From 17406b82d621930cca8ccc1272cdac9a7dae8e40 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Fri, 6 Feb 2009 15:37:47 -0800 Subject: softlockup: remove timestamp checking from hung_task Impact: saves sizeof(long) bytes per task_struct By guaranteeing that sysctl_hung_task_timeout_secs have elapsed between tasklist scans we can avoid using timestamps. 
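Condensed, the watchdog loop that results from this (see the hung_task.c diff below for the real code) uses the sleep itself as the time base:

	/*
	 * Sketch: each scan happens at least 'timeout' seconds after the
	 * previous one, so an unchanged context-switch count is enough to
	 * conclude the task was blocked for the whole interval -- no
	 * per-task timestamp needed.
	 */
	for ( ; ; ) {
		unsigned long timeout = sysctl_hung_task_timeout_secs;

		while (schedule_timeout_interruptible(timeout_jiffies(timeout)))
			timeout = sysctl_hung_task_timeout_secs;

		check_hung_uninterruptible_tasks(timeout);
	}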
Signed-off-by: Mandeep Singh Baines Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 - kernel/fork.c | 8 +++----- kernel/hung_task.c | 48 +++++++++--------------------------------------- 3 files changed, 12 insertions(+), 45 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 2a2811c6239d..e0d723fea9f5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1241,7 +1241,6 @@ struct task_struct { #endif #ifdef CONFIG_DETECT_HUNG_TASK /* hung task detection */ - unsigned long last_switch_timestamp; unsigned long last_switch_count; #endif /* CPU-specific state of this task */ diff --git a/kernel/fork.c b/kernel/fork.c index fb9444282836..bf582f75014b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -639,6 +639,9 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) tsk->min_flt = tsk->maj_flt = 0; tsk->nvcsw = tsk->nivcsw = 0; +#ifdef CONFIG_DETECT_HUNG_TASK + tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw; +#endif tsk->mm = NULL; tsk->active_mm = NULL; @@ -1041,11 +1044,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->default_timer_slack_ns = current->timer_slack_ns; -#ifdef CONFIG_DETECT_HUNG_TASK - p->last_switch_count = 0; - p->last_switch_timestamp = 0; -#endif - task_io_accounting_init(&p->ioac); acct_clear_integrals(p); diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 3951a80e7cbe..0c924de58cb2 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -34,7 +34,6 @@ unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT; * Zero means infinite timeout - no checking done: */ unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120; -static unsigned long __read_mostly hung_task_poll_jiffies; unsigned long __read_mostly sysctl_hung_task_warnings = 10; @@ -69,33 +68,17 @@ static struct notifier_block panic_block = { .notifier_call = hung_task_panic, }; -/* - * Returns seconds, approximately. We don't need nanosecond - * resolution, and we don't need to waste time with a big divide when - * 2^30ns == 1.074s. 
- */ -static unsigned long get_timestamp(void) -{ - int this_cpu = raw_smp_processor_id(); - - return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ -} - -static void check_hung_task(struct task_struct *t, unsigned long now, - unsigned long timeout) +static void check_hung_task(struct task_struct *t, unsigned long timeout) { unsigned long switch_count = t->nvcsw + t->nivcsw; if (t->flags & PF_FROZEN) return; - if (switch_count != t->last_switch_count || !t->last_switch_timestamp) { + if (switch_count != t->last_switch_count) { t->last_switch_count = switch_count; - t->last_switch_timestamp = now; return; } - if ((long)(now - t->last_switch_timestamp) < timeout) - return; if (!sysctl_hung_task_warnings) return; sysctl_hung_task_warnings--; @@ -111,7 +94,6 @@ static void check_hung_task(struct task_struct *t, unsigned long now, sched_show_task(t); __debug_show_held_locks(t); - t->last_switch_timestamp = now; touch_nmi_watchdog(); if (sysctl_hung_task_panic) @@ -145,7 +127,6 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) { int max_count = sysctl_hung_task_check_count; int batch_count = HUNG_TASK_BATCHING; - unsigned long now = get_timestamp(); struct task_struct *g, *t; /* @@ -168,19 +149,16 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) } /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ if (t->state == TASK_UNINTERRUPTIBLE) - check_hung_task(t, now, timeout); + check_hung_task(t, timeout); } while_each_thread(g, t); unlock: rcu_read_unlock(); } -static void update_poll_jiffies(void) +static unsigned long timeout_jiffies(unsigned long timeout) { /* timeout of 0 will disable the watchdog */ - if (sysctl_hung_task_timeout_secs == 0) - hung_task_poll_jiffies = MAX_SCHEDULE_TIMEOUT; - else - hung_task_poll_jiffies = sysctl_hung_task_timeout_secs * HZ / 2; + return timeout ? timeout * HZ : MAX_SCHEDULE_TIMEOUT; } /* @@ -197,8 +175,6 @@ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, if (ret || !write) goto out; - update_poll_jiffies(); - wake_up_process(watchdog_task); out: @@ -211,20 +187,14 @@ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, static int watchdog(void *dummy) { set_user_nice(current, 0); - update_poll_jiffies(); for ( ; ; ) { - unsigned long timeout; + unsigned long timeout = sysctl_hung_task_timeout_secs; - while (schedule_timeout_interruptible(hung_task_poll_jiffies)); + while (schedule_timeout_interruptible(timeout_jiffies(timeout))) + timeout = sysctl_hung_task_timeout_secs; - /* - * Need to cache timeout here to avoid timeout being set - * to 0 via sysctl while inside check_hung_*_tasks(). - */ - timeout = sysctl_hung_task_timeout_secs; - if (timeout) - check_hung_uninterruptible_tasks(timeout); + check_hung_uninterruptible_tasks(timeout); } return 0; -- cgit v1.2.3 From c3706f005c3aaf570e71f0f083fdbb59a5a9fa2e Mon Sep 17 00:00:00 2001 From: Wenji Huang Date: Tue, 10 Feb 2009 01:03:18 -0500 Subject: tracing: fix typos in comments Impact: clean up. Fix typos in the comments. 
Signed-off-by: Wenji Huang Signed-off-by: Steven Rostedt --- include/linux/ring_buffer.h | 2 +- kernel/trace/ring_buffer.c | 8 ++++---- kernel/trace/trace.c | 2 +- kernel/trace/trace_hw_branches.c | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 3c103d636da3..8e6646a54acf 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -8,7 +8,7 @@ struct ring_buffer; struct ring_buffer_iter; /* - * Don't reference this struct directly, use functions below. + * Don't refer to this struct directly, use functions below. */ struct ring_buffer_event { u32 type:2, len:3, time_delta:27; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 10d202ea06f3..fa64e1f003eb 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -91,7 +91,7 @@ EXPORT_SYMBOL_GPL(tracing_off); * tracing_off_permanent - permanently disable ring buffers * * This function, once called, will disable all ring buffers - * permanenty. + * permanently. */ void tracing_off_permanent(void) { @@ -210,7 +210,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data); struct buffer_data_page { u64 time_stamp; /* page time stamp */ - local_t commit; /* write commited index */ + local_t commit; /* write committed index */ unsigned char data[]; /* data of buffer page */ }; @@ -260,7 +260,7 @@ struct ring_buffer_per_cpu { struct list_head pages; struct buffer_page *head_page; /* read from head */ struct buffer_page *tail_page; /* write to tail */ - struct buffer_page *commit_page; /* commited pages */ + struct buffer_page *commit_page; /* committed pages */ struct buffer_page *reader_page; unsigned long overrun; unsigned long entries; @@ -303,7 +303,7 @@ struct ring_buffer_iter { * check_pages - integrity check of buffer pages * @cpu_buffer: CPU buffer with pages to test * - * As a safty measure we check to make sure the data pages have not + * As a safety measure we check to make sure the data pages have not * been corrupted. */ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d89821283b47..d7c175a442df 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1963,7 +1963,7 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf, struct tracer_opt *trace_opts = current_trace->flags->opts; - /* calulate max size */ + /* calculate max size */ for (i = 0; trace_options[i]; i++) { len += strlen(trace_options[i]); len += 3; /* "no" and space */ diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index e3e7db61c067..0794dd33f27b 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -75,7 +75,7 @@ static void bts_trace_start(struct trace_array *tr) } /* - * Start tracing on the current cpu. + * Stop tracing on the current cpu. * The argument is ignored. * * pre: bts_tracer_mutex must be locked. -- cgit v1.2.3 From 5a5fb7dbe88dd57dc2bef0f3be9da991e789612d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 12 Feb 2009 10:53:37 -0500 Subject: preempt-count: force hardirq-count to max of 10 To add a bit in the preempt_count to be set when in NMI context, we found that some archs did not have enough bits to spare. This is due to the hardirq_count being a mask that can hold NR_IRQS. Some archs allow for over 16000 IRQs, and that would require a mask of 14 bits. The sofitrq mask is 8 bits and the preempt disable mask is also 8 bits. 
The PREEMP_ACTIVE bit is bit 30, and bit 31 would make the preempt_count (which is type int) a negative number. A negative preempt_count is a sign of failure. Add them up 14+8+8+1+1 you get 32 bits. No room for the NMI bit. But the hardirq_count is to track the number of nested IRQs, not the number of total IRQs. This originally took the paranoid approach of setting the max nesting to NR_IRQS. But when we have archs with over 1000 IRQs, it is not practical to think they will ever all nest on a single CPU. Not to mention that this would most definitely cause a stack overflow. This patch sets a max of 10 bits to be used for IRQ nesting. I did a 'git grep HARDIRQ' to examine all users of HARDIRQ_BITS and HARDIRQ_MASK, and found that making it a max of 10 would not hurt anyone. I did find that the m68k expected it to be 8 bits, so I allow for the archs to set the number to be less than 10. I removed the setting of HARDIRQ_BITS from the archs that set it to more than 10. This includes ALPHA, ia64 and avr32. This will always allow room for the NMI bit, and if we need to allow for NMI nesting, we have 4 bits to play with. Signed-off-by: Steven Rostedt --- arch/alpha/include/asm/hardirq.h | 13 ----------- arch/avr32/include/asm/hardirq.h | 11 --------- arch/ia64/include/asm/hardirq.h | 10 --------- include/linux/hardirq.h | 48 ++++++++++++++++++++-------------------- 4 files changed, 24 insertions(+), 58 deletions(-) (limited to 'include/linux') diff --git a/arch/alpha/include/asm/hardirq.h b/arch/alpha/include/asm/hardirq.h index d953e234daa8..88971460fa6c 100644 --- a/arch/alpha/include/asm/hardirq.h +++ b/arch/alpha/include/asm/hardirq.h @@ -14,17 +14,4 @@ typedef struct { void ack_bad_irq(unsigned int irq); -#define HARDIRQ_BITS 12 - -/* - * The hardirq mask has to be large enough to have - * space for potentially nestable IRQ sources in the system - * to nest on a single CPU. On Alpha, interrupts are masked at the CPU - * by IPL as well as at the system level. We only have 8 IPLs (UNIX PALcode) - * so we really only have 8 nestable IRQs, but allow some overhead - */ -#if (1 << HARDIRQ_BITS) < 16 -#error HARDIRQ_BITS is too low! -#endif - #endif /* _ALPHA_HARDIRQ_H */ diff --git a/arch/avr32/include/asm/hardirq.h b/arch/avr32/include/asm/hardirq.h index 267354356f60..015bc75ea798 100644 --- a/arch/avr32/include/asm/hardirq.h +++ b/arch/avr32/include/asm/hardirq.h @@ -20,15 +20,4 @@ void ack_bad_irq(unsigned int irq); #endif /* __ASSEMBLY__ */ -#define HARDIRQ_BITS 12 - -/* - * The hardirq mask has to be large enough to have - * space for potentially all IRQ sources in the system - * nesting on a single CPU: - */ -#if (1 << HARDIRQ_BITS) < NR_IRQS -# error HARDIRQ_BITS is too low! -#endif - #endif /* __ASM_AVR32_HARDIRQ_H */ diff --git a/arch/ia64/include/asm/hardirq.h b/arch/ia64/include/asm/hardirq.h index 140e495b8e0e..d514cd9edb49 100644 --- a/arch/ia64/include/asm/hardirq.h +++ b/arch/ia64/include/asm/hardirq.h @@ -20,16 +20,6 @@ #define local_softirq_pending() (local_cpu_data->softirq_pending) -#define HARDIRQ_BITS 14 - -/* - * The hardirq mask has to be large enough to have space for potentially all IRQ sources - * in the system nesting on a single CPU: - */ -#if (1 << HARDIRQ_BITS) < NR_IRQS -# error HARDIRQ_BITS is too low! 
-#endif - extern void __iomem *ipi_base_addr; void ack_bad_irq(unsigned int irq); diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index f3cf86e1465b..9841221f53f2 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -15,61 +15,61 @@ * - bits 0-7 are the preemption count (max preemption depth: 256) * - bits 8-15 are the softirq count (max # of softirqs: 256) * - * The hardirq count can be overridden per architecture, the default is: + * The hardirq count can in theory reach the same as NR_IRQS. + * In reality, the number of nested IRQS is limited to the stack + * size as well. For archs with over 1000 IRQS it is not practical + * to expect that they will all nest. We give a max of 10 bits for + * hardirq nesting. An arch may choose to give less than 10 bits. + * m68k expects it to be 8. * - * - bits 16-27 are the hardirq count (max # of hardirqs: 4096) - * - ( bit 28 is the PREEMPT_ACTIVE flag. ) + * - bits 16-25 are the hardirq count (max # of nested hardirqs: 1024) + * - bit 26 is the NMI_MASK + * - bit 28 is the PREEMPT_ACTIVE flag * * PREEMPT_MASK: 0x000000ff * SOFTIRQ_MASK: 0x0000ff00 - * HARDIRQ_MASK: 0x0fff0000 + * HARDIRQ_MASK: 0x03ff0000 + * NMI_MASK: 0x04000000 */ #define PREEMPT_BITS 8 #define SOFTIRQ_BITS 8 +#define NMI_BITS 1 -#ifndef HARDIRQ_BITS -#define HARDIRQ_BITS 12 +#define MAX_HARDIRQ_BITS 10 -#ifndef MAX_HARDIRQS_PER_CPU -#define MAX_HARDIRQS_PER_CPU NR_IRQS +#ifndef HARDIRQ_BITS +# define HARDIRQ_BITS MAX_HARDIRQ_BITS #endif -/* - * The hardirq mask has to be large enough to have space for potentially - * all IRQ sources in the system nesting on a single CPU. - */ -#if (1 << HARDIRQ_BITS) < MAX_HARDIRQS_PER_CPU -# error HARDIRQ_BITS is too low! -#endif +#if HARDIRQ_BITS > MAX_HARDIRQ_BITS +#error HARDIRQ_BITS too high! #endif #define PREEMPT_SHIFT 0 #define SOFTIRQ_SHIFT (PREEMPT_SHIFT + PREEMPT_BITS) #define HARDIRQ_SHIFT (SOFTIRQ_SHIFT + SOFTIRQ_BITS) +#define NMI_SHIFT (HARDIRQ_SHIFT + HARDIRQ_BITS) #define __IRQ_MASK(x) ((1UL << (x))-1) #define PREEMPT_MASK (__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT) #define SOFTIRQ_MASK (__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT) #define HARDIRQ_MASK (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT) +#define NMI_MASK (__IRQ_MASK(NMI_BITS) << NMI_SHIFT) #define PREEMPT_OFFSET (1UL << PREEMPT_SHIFT) #define SOFTIRQ_OFFSET (1UL << SOFTIRQ_SHIFT) #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT) +#define NMI_OFFSET (1UL << NMI_SHIFT) -#if PREEMPT_ACTIVE < (1 << (HARDIRQ_SHIFT + HARDIRQ_BITS)) +#if PREEMPT_ACTIVE < (1 << (NMI_SHIFT + NMI_BITS)) #error PREEMPT_ACTIVE is too low! #endif -#define NMI_OFFSET (PREEMPT_ACTIVE << 1) - -#if NMI_OFFSET >= 0x80000000 -#error PREEMPT_ACTIVE too high! -#endif - #define hardirq_count() (preempt_count() & HARDIRQ_MASK) #define softirq_count() (preempt_count() & SOFTIRQ_MASK) -#define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK)) +#define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \ + | NMI_MASK)) /* * Are we doing bottom half or hardware interrupt processing? @@ -82,7 +82,7 @@ /* * Are we in NMI context? 
*/ -#define in_nmi() (preempt_count() & NMI_OFFSET) +#define in_nmi() (preempt_count() & NMI_MASK) #if defined(CONFIG_PREEMPT) # define PREEMPT_INATOMIC_BASE kernel_locked() -- cgit v1.2.3 From 2a7b8df04c11a70105c1abe67d006455d3bdc944 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 12 Feb 2009 14:16:46 -0500 Subject: sched: do not account for NMIs Impact: avoid corruption in system time accounting Martin Schwidefsky told me that there was an issue with NMIs and system accounting. The problem is that the accounting code is not reentrant, and if an NMI goes off after an interrupt it can corrupt the accounting. For now, the best we can do is to treat NMIs like SMIs and they are not accounted for. This patch changes nmi_enter to not call __irq_enter and to do the preempt-count and tracing calls directly. Signed-off-by: Steven Rostedt --- include/linux/hardirq.h | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 9841221f53f2..faa1cf848bcd 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -175,24 +175,24 @@ extern void irq_enter(void); */ extern void irq_exit(void); -#define nmi_enter() \ - do { \ - ftrace_nmi_enter(); \ - BUG_ON(in_nmi()); \ - add_preempt_count(NMI_OFFSET); \ - lockdep_off(); \ - rcu_nmi_enter(); \ - __irq_enter(); \ +#define nmi_enter() \ + do { \ + ftrace_nmi_enter(); \ + BUG_ON(in_nmi()); \ + add_preempt_count(NMI_OFFSET + HARDIRQ_OFFSET); \ + lockdep_off(); \ + rcu_nmi_enter(); \ + trace_hardirq_enter(); \ } while (0) -#define nmi_exit() \ - do { \ - __irq_exit(); \ - rcu_nmi_exit(); \ - lockdep_on(); \ - BUG_ON(!in_nmi()); \ - sub_preempt_count(NMI_OFFSET); \ - ftrace_nmi_exit(); \ +#define nmi_exit() \ + do { \ + trace_hardirq_exit(); \ + rcu_nmi_exit(); \ + lockdep_on(); \ + BUG_ON(!in_nmi()); \ + sub_preempt_count(NMI_OFFSET + HARDIRQ_OFFSET); \ + ftrace_nmi_exit(); \ } while (0) #endif /* LINUX_HARDIRQ_H */ -- cgit v1.2.3 From f6180773d90595650e11de0118bb112018290915 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 14 Feb 2009 00:40:25 -0500 Subject: ftrace: add command interface for function selection Allow for other tracers to add their own commands for function selection. This interface gives a trace the ability to name a command for function selection. Right now it is pretty limited in what it offers, but this is a building step for more features. The :mod: command is converted to this interface and also serves as a template for other implementations. 
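For a feel of the new interface, a tracer could register a hypothetical "foo" command like this (sketch only, modelled on the :mod: template in the diff below; the command name and handler are invented):

	static int
	ftrace_foo_callback(char *func, char *cmd, char *param, int enable)
	{
		/* Invoked for 'echo "<func>:foo:<param>" > set_ftrace_filter'. */
		if (!param)
			return -EINVAL;

		/* ... act on 'func' using 'param' ... */
		return 0;
	}

	static struct ftrace_func_command ftrace_foo_cmd = {
		.name	= "foo",
		.func	= ftrace_foo_callback,
	};

	static int __init ftrace_foo_cmd_init(void)
	{
		return register_ftrace_command(&ftrace_foo_cmd);
	}
	device_initcall(ftrace_foo_cmd_init);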
Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 16 ++++++++ kernel/trace/ftrace.c | 106 ++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 111 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 106b7909d500..f0a0ecc63b5c 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -95,6 +95,13 @@ stack_trace_sysctl(struct ctl_table *table, int write, loff_t *ppos); #endif +struct ftrace_func_command { + struct list_head list; + char *name; + int (*func)(char *func, char *cmd, + char *params, int enable); +}; + #ifdef CONFIG_DYNAMIC_FTRACE /* asm/ftrace.h must be defined for archs supporting dynamic ftrace */ #include @@ -119,6 +126,9 @@ struct dyn_ftrace { int ftrace_force_update(void); void ftrace_set_filter(unsigned char *buf, int len, int reset); +int register_ftrace_command(struct ftrace_func_command *cmd); +int unregister_ftrace_command(struct ftrace_func_command *cmd); + /* defined in arch */ extern int ftrace_ip_converted(unsigned long ip); extern int ftrace_dyn_arch_init(void *data); @@ -202,6 +212,12 @@ extern void ftrace_enable_daemon(void); # define ftrace_disable_daemon() do { } while (0) # define ftrace_enable_daemon() do { } while (0) static inline void ftrace_release(void *start, unsigned long size) { } +static inline int register_ftrace_command(struct ftrace_func_command *cmd) +{ +} +static inline int unregister_ftrace_command(char *cmd_name) +{ +} #endif /* CONFIG_DYNAMIC_FTRACE */ /* totally disable ftrace - can not re-enable after this */ diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 340f88b68d9e..45a44c402566 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1239,9 +1239,93 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable) spin_unlock(&ftrace_lock); } +/* + * We register the module command as a template to show others how + * to register the a command as well. + */ + +static int +ftrace_mod_callback(char *func, char *cmd, char *param, int enable) +{ + char *mod; + + /* + * cmd == 'mod' because we only registered this func + * for the 'mod' ftrace_func_command. + * But if you register one func with multiple commands, + * you can tell which command was used by the cmd + * parameter. 
+ */ + + /* we must have a module name */ + if (!param) + return -EINVAL; + + mod = strsep(¶m, ":"); + if (!strlen(mod)) + return -EINVAL; + + ftrace_match_module_records(func, mod, enable); + return 0; +} + +static struct ftrace_func_command ftrace_mod_cmd = { + .name = "mod", + .func = ftrace_mod_callback, +}; + +static int __init ftrace_mod_cmd_init(void) +{ + return register_ftrace_command(&ftrace_mod_cmd); +} +device_initcall(ftrace_mod_cmd_init); + +static LIST_HEAD(ftrace_commands); +static DEFINE_MUTEX(ftrace_cmd_mutex); + +int register_ftrace_command(struct ftrace_func_command *cmd) +{ + struct ftrace_func_command *p; + int ret = 0; + + mutex_lock(&ftrace_cmd_mutex); + list_for_each_entry(p, &ftrace_commands, list) { + if (strcmp(cmd->name, p->name) == 0) { + ret = -EBUSY; + goto out_unlock; + } + } + list_add(&cmd->list, &ftrace_commands); + out_unlock: + mutex_unlock(&ftrace_cmd_mutex); + + return ret; +} + +int unregister_ftrace_command(struct ftrace_func_command *cmd) +{ + struct ftrace_func_command *p, *n; + int ret = -ENODEV; + + mutex_lock(&ftrace_cmd_mutex); + list_for_each_entry_safe(p, n, &ftrace_commands, list) { + if (strcmp(cmd->name, p->name) == 0) { + ret = 0; + list_del_init(&p->list); + goto out_unlock; + } + } + out_unlock: + mutex_unlock(&ftrace_cmd_mutex); + + return ret; +} + static int ftrace_process_regex(char *buff, int len, int enable) { - char *func, *mod, *command, *next = buff; + struct ftrace_func_command *p; + char *func, *command, *next = buff; + int ret = -EINVAL; func = strsep(&next, ":"); @@ -1250,21 +1334,21 @@ static int ftrace_process_regex(char *buff, int len, int enable) return 0; } - /* command fonud */ + /* command found */ command = strsep(&next, ":"); - if (strcmp(command, "mod") == 0) { - /* only match modules */ - if (!next) - return -EINVAL; - - mod = strsep(&next, ":"); - ftrace_match_module_records(func, mod, enable); - return 0; + mutex_lock(&ftrace_cmd_mutex); + list_for_each_entry(p, &ftrace_commands, list) { + if (strcmp(p->name, command) == 0) { + ret = p->func(func, command, next, enable); + goto out_unlock; + } } + out_unlock: + mutex_unlock(&ftrace_cmd_mutex); - return -EINVAL; + return ret; } static ssize_t -- cgit v1.2.3 From 59df055f1991c9fc0c71a9230663c39188f6972f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 14 Feb 2009 15:29:06 -0500 Subject: ftrace: trace different functions with a different tracer Impact: new feature Currently, the function tracer only gives you an ability to hook a tracer to all functions being traced. The dynamic function trace allows you to pick and choose which of those functions will be traced, but all functions being traced will call all tracers that registered with the function tracer. This patch adds a new feature that allows a tracer to hook to specific functions, even when all functions are being traced. It allows for different functions to call different tracer hooks. The way this is accomplished is by a special function that will hook to the function tracer and will set up a hash table knowing which tracer hook to call with which function. This is the most general and easiest method to accomplish this. Later, an arch may choose to supply their own method in changing the mcount call of a function to call a different tracer. But that will be an exercise for the future. 
To register a function: struct ftrace_hook_ops { void (*func)(unsigned long ip, unsigned long parent_ip, void **data); int (*callback)(unsigned long ip, void **data); void (*free)(void **data); }; int register_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, void *data); glob is a simple glob to search for the functions to hook. ops is a pointer to the operations (listed below) data is the default data to be passed to the hook functions when traced ops: func is the hook function to call when the functions are traced callback is a callback function that is called when setting up the hash. That is, if the tracer needs to do something special for each function, that is being traced, and wants to give each function its own data. The address of the entry data is passed to this callback, so that the callback may wish to update the entry to whatever it would like. free is a callback for when the entry is freed. In case the tracer allocated any data, it is give the chance to free it. To unregister we have three functions: void unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, void *data) This will unregister all hooks that match glob, point to ops, and have its data matching data. (note, if glob is NULL, blank or '*', all functions will be tested). void unregister_ftrace_function_hook_func(char *glob, struct ftrace_hook_ops *ops) This will unregister all functions matching glob that has an entry pointing to ops. void unregister_ftrace_function_hook_all(char *glob) This simply unregisters all funcs. Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 18 ++++ kernel/trace/ftrace.c | 247 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 265 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index f0a0ecc63b5c..13918c4400ad 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -106,6 +106,24 @@ struct ftrace_func_command { /* asm/ftrace.h must be defined for archs supporting dynamic ftrace */ #include +struct ftrace_hook_ops { + void (*func)(unsigned long ip, + unsigned long parent_ip, + void **data); + int (*callback)(unsigned long ip, void **data); + void (*free)(void **data); +}; + +extern int +register_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, + void *data); +extern void +unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, + void *data); +extern void +unregister_ftrace_function_hook_func(char *glob, struct ftrace_hook_ops *ops); +extern void unregister_ftrace_function_hook_all(char *glob); + enum { FTRACE_FL_FREE = (1 << 0), FTRACE_FL_FAILED = (1 << 1), diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 157d4f68b0e0..0b80e325f296 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -27,6 +27,7 @@ #include #include #include +#include #include @@ -1245,6 +1246,252 @@ static int __init ftrace_mod_cmd_init(void) } device_initcall(ftrace_mod_cmd_init); +#define FTRACE_HASH_BITS 7 +#define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS) +static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly; + +struct ftrace_func_hook { + struct hlist_node node; + struct ftrace_hook_ops *ops; + unsigned long flags; + unsigned long ip; + void *data; + struct rcu_head rcu; +}; + +static void +function_trace_hook_call(unsigned long ip, unsigned long parent_ip) +{ + struct ftrace_func_hook *entry; + struct hlist_head *hhd; + struct hlist_node *n; + unsigned long key; + int resched; + + key = hash_long(ip, 
FTRACE_HASH_BITS); + + hhd = &ftrace_func_hash[key]; + + if (hlist_empty(hhd)) + return; + + /* + * Disable preemption for these calls to prevent a RCU grace + * period. This syncs the hash iteration and freeing of items + * on the hash. rcu_read_lock is too dangerous here. + */ + resched = ftrace_preempt_disable(); + hlist_for_each_entry_rcu(entry, n, hhd, node) { + if (entry->ip == ip) + entry->ops->func(ip, parent_ip, &entry->data); + } + ftrace_preempt_enable(resched); +} + +static struct ftrace_ops trace_hook_ops __read_mostly = +{ + .func = function_trace_hook_call, +}; + +static int ftrace_hook_registered; + +static void __enable_ftrace_function_hook(void) +{ + int i; + + if (ftrace_hook_registered) + return; + + for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { + struct hlist_head *hhd = &ftrace_func_hash[i]; + if (hhd->first) + break; + } + /* Nothing registered? */ + if (i == FTRACE_FUNC_HASHSIZE) + return; + + __register_ftrace_function(&trace_hook_ops); + ftrace_startup(0); + ftrace_hook_registered = 1; +} + +static void __disable_ftrace_function_hook(void) +{ + int i; + + if (!ftrace_hook_registered) + return; + + for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { + struct hlist_head *hhd = &ftrace_func_hash[i]; + if (hhd->first) + return; + } + + /* no more funcs left */ + __unregister_ftrace_function(&trace_hook_ops); + ftrace_shutdown(0); + ftrace_hook_registered = 0; +} + + +static void ftrace_free_entry_rcu(struct rcu_head *rhp) +{ + struct ftrace_func_hook *entry = + container_of(rhp, struct ftrace_func_hook, rcu); + + if (entry->ops->free) + entry->ops->free(&entry->data); + kfree(entry); +} + + +int +register_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, + void *data) +{ + struct ftrace_func_hook *entry; + struct ftrace_page *pg; + struct dyn_ftrace *rec; + unsigned long key; + int type, len, not; + int count = 0; + char *search; + + type = ftrace_setup_glob(glob, strlen(glob), &search, ¬); + len = strlen(search); + + /* we do not support '!' for function hooks */ + if (WARN_ON(not)) + return -EINVAL; + + mutex_lock(&ftrace_lock); + do_for_each_ftrace_rec(pg, rec) { + + if (rec->flags & FTRACE_FL_FAILED) + continue; + + if (!ftrace_match_record(rec, search, len, type)) + continue; + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) { + /* If we did not hook to any, then return error */ + if (!count) + count = -ENOMEM; + goto out_unlock; + } + + count++; + + entry->data = data; + + /* + * The caller might want to do something special + * for each function we find. We call the callback + * to give the caller an opportunity to do so. 
+ */ + if (ops->callback) { + if (ops->callback(rec->ip, &entry->data) < 0) { + /* caller does not like this func */ + kfree(entry); + continue; + } + } + + entry->ops = ops; + entry->ip = rec->ip; + + key = hash_long(entry->ip, FTRACE_HASH_BITS); + hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]); + + } while_for_each_ftrace_rec(); + __enable_ftrace_function_hook(); + + out_unlock: + mutex_unlock(&ftrace_lock); + + return count; +} + +enum { + HOOK_TEST_FUNC = 1, + HOOK_TEST_DATA = 2 +}; + +static void +__unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, + void *data, int flags) +{ + struct ftrace_func_hook *entry; + struct hlist_node *n, *tmp; + char str[KSYM_SYMBOL_LEN]; + int type = MATCH_FULL; + int i, len = 0; + char *search; + + if (glob && (strcmp(glob, "*") || !strlen(glob))) + glob = NULL; + else { + int not; + + type = ftrace_setup_glob(glob, strlen(glob), &search, ¬); + len = strlen(search); + + /* we do not support '!' for function hooks */ + if (WARN_ON(not)) + return; + } + + mutex_lock(&ftrace_lock); + for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { + struct hlist_head *hhd = &ftrace_func_hash[i]; + + hlist_for_each_entry_safe(entry, n, tmp, hhd, node) { + + /* break up if statements for readability */ + if ((flags & HOOK_TEST_FUNC) && entry->ops != ops) + continue; + + if ((flags & HOOK_TEST_DATA) && entry->data != data) + continue; + + /* do this last, since it is the most expensive */ + if (glob) { + kallsyms_lookup(entry->ip, NULL, NULL, + NULL, str); + if (!ftrace_match(str, glob, len, type)) + continue; + } + + hlist_del(&entry->node); + call_rcu(&entry->rcu, ftrace_free_entry_rcu); + } + } + __disable_ftrace_function_hook(); + mutex_unlock(&ftrace_lock); +} + +void +unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, + void *data) +{ + __unregister_ftrace_function_hook(glob, ops, data, + HOOK_TEST_FUNC | HOOK_TEST_DATA); +} + +void +unregister_ftrace_function_hook_func(char *glob, struct ftrace_hook_ops *ops) +{ + __unregister_ftrace_function_hook(glob, ops, NULL, HOOK_TEST_FUNC); +} + +void unregister_ftrace_function_hook_all(char *glob) +{ + __unregister_ftrace_function_hook(glob, NULL, NULL, 0); +} + static LIST_HEAD(ftrace_commands); static DEFINE_MUTEX(ftrace_cmd_mutex); -- cgit v1.2.3 From 988ae9d6b2bc3ebdc1a488490250a6812f85e9d4 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 14 Feb 2009 19:17:02 -0500 Subject: ring-buffer: add tracing_is_on to test if ring buffer is enabled This patch adds the tracing_is_on() interface to tell if the ring buffer is turned on or not. 
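As a usage illustration only (not part of this patch): a caller that prepares data for the ring buffer can skip that work entirely when the buffers have been switched off. Everything named my_* below is a hypothetical helper; only tracing_is_on(), declared in include/linux/ring_buffer.h by this patch, comes from the kernel.

static void my_maybe_trace(struct my_dev *dev)
{
	struct my_dev_state s;

	/*
	 * Buffers are off (tracing_off() or tracing_off_permanent()
	 * was called), so nothing we record would be kept; skip the
	 * expensive snapshot.
	 */
	if (!tracing_is_on())
		return;

	my_collect_state(dev, &s);	/* hypothetical: snapshot device state */
	my_emit_trace(&s);		/* hypothetical: hand the snapshot to the tracer */
}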
Signed-off-by: Steven Rostedt --- include/linux/ring_buffer.h | 2 ++ kernel/trace/ring_buffer.c | 9 +++++++++ 2 files changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 8e6646a54acf..f5e793d69bd3 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -128,10 +128,12 @@ void ring_buffer_normalize_time_stamp(int cpu, u64 *ts); void tracing_on(void); void tracing_off(void); void tracing_off_permanent(void); +int tracing_is_on(void); #else static inline void tracing_on(void) { } static inline void tracing_off(void) { } static inline void tracing_off_permanent(void) { } +static inline int tracing_is_on(void) { return 0; } #endif void *ring_buffer_alloc_read_page(struct ring_buffer *buffer); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2b4626ce95d6..8f19f1aa42b0 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -98,6 +98,15 @@ void tracing_off_permanent(void) set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags); } +/** + * tracing_is_on - show state of ring buffers enabled + */ +int tracing_is_on(void) +{ + return ring_buffer_flags == RB_BUFFERS_ON; +} +EXPORT_SYMBOL_GPL(tracing_is_on); + #include "trace.h" /* Up this if you want to test the TIME_EXTENTS and normalization */ -- cgit v1.2.3 From 809dcf29ce4e1723709910878e050bd187617e0e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 16 Feb 2009 23:06:01 -0500 Subject: ftrace: add pretty print to selected fuction traces This patch adds a call back for the tracers that have hooks to selected functions. This allows the tracer to show better output in the set_ftrace_filter file. Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 6 ++++++ kernel/trace/ftrace.c | 3 +++ 2 files changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 13918c4400ad..b331e216d8a1 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -106,12 +106,18 @@ struct ftrace_func_command { /* asm/ftrace.h must be defined for archs supporting dynamic ftrace */ #include +struct seq_file; + struct ftrace_hook_ops { void (*func)(unsigned long ip, unsigned long parent_ip, void **data); int (*callback)(unsigned long ip, void **data); void (*free)(void **data); + int (*print)(struct seq_file *m, + unsigned long ip, + struct ftrace_hook_ops *ops, + void *data); }; extern int diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1e058848cddb..6533c1d20155 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -834,6 +834,9 @@ static int t_hash_show(struct seq_file *m, void *v) rec = hlist_entry(hnd, struct ftrace_func_hook, node); + if (rec->ops->print) + return rec->ops->print(m, rec->ip, rec->ops, rec->data); + kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); seq_printf(m, "%s:", str); -- cgit v1.2.3 From 97d0bb8dcd8c2812e1927cdb51d7b1f9c98352b5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 17 Feb 2009 11:47:39 +0100 Subject: ftrace: fix !CONFIG_FTRACE [un_]register_ftrace_command() prototypes Impact: build fix Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index b331e216d8a1..63281228ce3e 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -238,9 +238,11 @@ extern void ftrace_enable_daemon(void); static inline void ftrace_release(void *start, unsigned long 
size) { } static inline int register_ftrace_command(struct ftrace_func_command *cmd) { + return -EINVAL; } static inline int unregister_ftrace_command(char *cmd_name) { + return -EINVAL; } #endif /* CONFIG_DYNAMIC_FTRACE */ -- cgit v1.2.3 From b6887d7916e44c1d8913084fb6aa5004d9473f1a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 17 Feb 2009 12:32:04 -0500 Subject: ftrace: rename _hook to _probe Impact: clean up Ingo Molnar did not like the _hook naming convention used by the select function tracer. Luis Claudio R. Goncalves suggested using the "_probe" extension. This patch implements the change of calling the functions and variables "_hook" and replacing them with "_probe". Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 12 +++---- kernel/trace/ftrace.c | 78 +++++++++++++++++++++--------------------- kernel/trace/trace_functions.c | 26 +++++++------- 3 files changed, 58 insertions(+), 58 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 63281228ce3e..9d224c43e634 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -108,7 +108,7 @@ struct ftrace_func_command { struct seq_file; -struct ftrace_hook_ops { +struct ftrace_probe_ops { void (*func)(unsigned long ip, unsigned long parent_ip, void **data); @@ -116,19 +116,19 @@ struct ftrace_hook_ops { void (*free)(void **data); int (*print)(struct seq_file *m, unsigned long ip, - struct ftrace_hook_ops *ops, + struct ftrace_probe_ops *ops, void *data); }; extern int -register_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, +register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, void *data); extern void -unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, +unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, void *data); extern void -unregister_ftrace_function_hook_func(char *glob, struct ftrace_hook_ops *ops); -extern void unregister_ftrace_function_hook_all(char *glob); +unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops); +extern void unregister_ftrace_function_probe_all(char *glob); enum { FTRACE_FL_FREE = (1 << 0), diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index af9d95c0e4de..330a059f6ed7 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -255,9 +255,9 @@ static struct pid * const ftrace_swapper_pid = &init_struct_pid; static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly; -struct ftrace_func_hook { +struct ftrace_func_probe { struct hlist_node node; - struct ftrace_hook_ops *ops; + struct ftrace_probe_ops *ops; unsigned long flags; unsigned long ip; void *data; @@ -830,11 +830,11 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos) static int t_hash_show(struct seq_file *m, void *v) { - struct ftrace_func_hook *rec; + struct ftrace_func_probe *rec; struct hlist_node *hnd = v; char str[KSYM_SYMBOL_LEN]; - rec = hlist_entry(hnd, struct ftrace_func_hook, node); + rec = hlist_entry(hnd, struct ftrace_func_probe, node); if (rec->ops->print) return rec->ops->print(m, rec->ip, rec->ops, rec->data); @@ -1351,9 +1351,9 @@ static int __init ftrace_mod_cmd_init(void) device_initcall(ftrace_mod_cmd_init); static void -function_trace_hook_call(unsigned long ip, unsigned long parent_ip) +function_trace_probe_call(unsigned long ip, unsigned long parent_ip) { - struct ftrace_func_hook *entry; + struct ftrace_func_probe *entry; struct hlist_head *hhd; struct hlist_node *n; unsigned long key; @@ 
-1379,18 +1379,18 @@ function_trace_hook_call(unsigned long ip, unsigned long parent_ip) ftrace_preempt_enable(resched); } -static struct ftrace_ops trace_hook_ops __read_mostly = +static struct ftrace_ops trace_probe_ops __read_mostly = { - .func = function_trace_hook_call, + .func = function_trace_probe_call, }; -static int ftrace_hook_registered; +static int ftrace_probe_registered; -static void __enable_ftrace_function_hook(void) +static void __enable_ftrace_function_probe(void) { int i; - if (ftrace_hook_registered) + if (ftrace_probe_registered) return; for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { @@ -1402,16 +1402,16 @@ static void __enable_ftrace_function_hook(void) if (i == FTRACE_FUNC_HASHSIZE) return; - __register_ftrace_function(&trace_hook_ops); + __register_ftrace_function(&trace_probe_ops); ftrace_startup(0); - ftrace_hook_registered = 1; + ftrace_probe_registered = 1; } -static void __disable_ftrace_function_hook(void) +static void __disable_ftrace_function_probe(void) { int i; - if (!ftrace_hook_registered) + if (!ftrace_probe_registered) return; for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { @@ -1421,16 +1421,16 @@ static void __disable_ftrace_function_hook(void) } /* no more funcs left */ - __unregister_ftrace_function(&trace_hook_ops); + __unregister_ftrace_function(&trace_probe_ops); ftrace_shutdown(0); - ftrace_hook_registered = 0; + ftrace_probe_registered = 0; } static void ftrace_free_entry_rcu(struct rcu_head *rhp) { - struct ftrace_func_hook *entry = - container_of(rhp, struct ftrace_func_hook, rcu); + struct ftrace_func_probe *entry = + container_of(rhp, struct ftrace_func_probe, rcu); if (entry->ops->free) entry->ops->free(&entry->data); @@ -1439,10 +1439,10 @@ static void ftrace_free_entry_rcu(struct rcu_head *rhp) int -register_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, +register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, void *data) { - struct ftrace_func_hook *entry; + struct ftrace_func_probe *entry; struct ftrace_page *pg; struct dyn_ftrace *rec; int type, len, not; @@ -1453,7 +1453,7 @@ register_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, type = ftrace_setup_glob(glob, strlen(glob), &search, ¬); len = strlen(search); - /* we do not support '!' for function hooks */ + /* we do not support '!' 
for function probes */ if (WARN_ON(not)) return -EINVAL; @@ -1468,7 +1468,7 @@ register_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, entry = kmalloc(sizeof(*entry), GFP_KERNEL); if (!entry) { - /* If we did not hook to any, then return error */ + /* If we did not process any, then return error */ if (!count) count = -ENOMEM; goto out_unlock; @@ -1498,7 +1498,7 @@ register_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]); } while_for_each_ftrace_rec(); - __enable_ftrace_function_hook(); + __enable_ftrace_function_probe(); out_unlock: mutex_unlock(&ftrace_lock); @@ -1507,15 +1507,15 @@ register_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, } enum { - HOOK_TEST_FUNC = 1, - HOOK_TEST_DATA = 2 + PROBE_TEST_FUNC = 1, + PROBE_TEST_DATA = 2 }; static void -__unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, +__unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, void *data, int flags) { - struct ftrace_func_hook *entry; + struct ftrace_func_probe *entry; struct hlist_node *n, *tmp; char str[KSYM_SYMBOL_LEN]; int type = MATCH_FULL; @@ -1530,7 +1530,7 @@ __unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, type = ftrace_setup_glob(glob, strlen(glob), &search, ¬); len = strlen(search); - /* we do not support '!' for function hooks */ + /* we do not support '!' for function probes */ if (WARN_ON(not)) return; } @@ -1542,10 +1542,10 @@ __unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, hlist_for_each_entry_safe(entry, n, tmp, hhd, node) { /* break up if statements for readability */ - if ((flags & HOOK_TEST_FUNC) && entry->ops != ops) + if ((flags & PROBE_TEST_FUNC) && entry->ops != ops) continue; - if ((flags & HOOK_TEST_DATA) && entry->data != data) + if ((flags & PROBE_TEST_DATA) && entry->data != data) continue; /* do this last, since it is the most expensive */ @@ -1560,27 +1560,27 @@ __unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, call_rcu(&entry->rcu, ftrace_free_entry_rcu); } } - __disable_ftrace_function_hook(); + __disable_ftrace_function_probe(); mutex_unlock(&ftrace_lock); } void -unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, +unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, void *data) { - __unregister_ftrace_function_hook(glob, ops, data, - HOOK_TEST_FUNC | HOOK_TEST_DATA); + __unregister_ftrace_function_probe(glob, ops, data, + PROBE_TEST_FUNC | PROBE_TEST_DATA); } void -unregister_ftrace_function_hook_func(char *glob, struct ftrace_hook_ops *ops) +unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops) { - __unregister_ftrace_function_hook(glob, ops, NULL, HOOK_TEST_FUNC); + __unregister_ftrace_function_probe(glob, ops, NULL, PROBE_TEST_FUNC); } -void unregister_ftrace_function_hook_all(char *glob) +void unregister_ftrace_function_probe_all(char *glob) { - __unregister_ftrace_function_hook(glob, NULL, NULL, 0); + __unregister_ftrace_function_probe(glob, NULL, NULL, 0); } static LIST_HEAD(ftrace_commands); diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 021a574c5988..6ea73ed03bfa 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -269,21 +269,21 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data) static int ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, - struct ftrace_hook_ops *ops, void 
*data); + struct ftrace_probe_ops *ops, void *data); -static struct ftrace_hook_ops traceon_hook_ops = { +static struct ftrace_probe_ops traceon_probe_ops = { .func = ftrace_traceon, .print = ftrace_trace_onoff_print, }; -static struct ftrace_hook_ops traceoff_hook_ops = { +static struct ftrace_probe_ops traceoff_probe_ops = { .func = ftrace_traceoff, .print = ftrace_trace_onoff_print, }; static int ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, - struct ftrace_hook_ops *ops, void *data) + struct ftrace_probe_ops *ops, void *data) { char str[KSYM_SYMBOL_LEN]; long count = (long)data; @@ -291,7 +291,7 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, kallsyms_lookup(ip, NULL, NULL, NULL, str); seq_printf(m, "%s:", str); - if (ops == &traceon_hook_ops) + if (ops == &traceon_probe_ops) seq_printf(m, "traceon"); else seq_printf(m, "traceoff"); @@ -306,15 +306,15 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, static int ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param) { - struct ftrace_hook_ops *ops; + struct ftrace_probe_ops *ops; /* we register both traceon and traceoff to this callback */ if (strcmp(cmd, "traceon") == 0) - ops = &traceon_hook_ops; + ops = &traceon_probe_ops; else - ops = &traceoff_hook_ops; + ops = &traceoff_probe_ops; - unregister_ftrace_function_hook_func(glob, ops); + unregister_ftrace_function_probe_func(glob, ops); return 0; } @@ -322,7 +322,7 @@ ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param) static int ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable) { - struct ftrace_hook_ops *ops; + struct ftrace_probe_ops *ops; void *count = (void *)-1; char *number; int ret; @@ -336,9 +336,9 @@ ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable) /* we register both traceon and traceoff to this callback */ if (strcmp(cmd, "traceon") == 0) - ops = &traceon_hook_ops; + ops = &traceon_probe_ops; else - ops = &traceoff_hook_ops; + ops = &traceoff_probe_ops; if (!param) goto out_reg; @@ -357,7 +357,7 @@ ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable) return ret; out_reg: - ret = register_ftrace_function_hook(glob, ops, count); + ret = register_ftrace_function_probe(glob, ops, count); return ret; } -- cgit v1.2.3 From fe1200b63d158b28eef6d4de1e5b5f99c681ba2f Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 17 Feb 2009 12:05:07 -0500 Subject: SLUB: Introduce and use SLUB_MAX_SIZE and SLUB_PAGE_SHIFT constants As a preparational patch to bump up page allocator pass-through threshold, introduce two new constants SLUB_MAX_SIZE and SLUB_PAGE_SHIFT and convert mm/slub.c to use them. Reported-by: "Zhang, Yanmin" Tested-by: "Zhang, Yanmin" Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- include/linux/slub_def.h | 19 ++++++++++++++++--- mm/slub.c | 16 ++++++++-------- 2 files changed, 24 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 2f5c16b1aacd..986e09dcfd8f 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -120,11 +120,24 @@ struct kmem_cache { #define KMALLOC_SHIFT_LOW ilog2(KMALLOC_MIN_SIZE) +/* + * Maximum kmalloc object size handled by SLUB. Larger object allocations + * are passed through to the page allocator. The page allocator "fastpath" + * is relatively slow so we need this value sufficiently high so that + * performance critical objects are allocated through the SLUB fastpath. 
+ * + * This should be dropped to PAGE_SIZE / 2 once the page allocator + * "fastpath" becomes competitive with the slab allocator fastpaths. + */ +#define SLUB_MAX_SIZE (PAGE_SIZE) + +#define SLUB_PAGE_SHIFT (PAGE_SHIFT + 1) + /* * We keep the general caches in an array of slab caches that are used for * 2^x bytes of allocations. */ -extern struct kmem_cache kmalloc_caches[PAGE_SHIFT + 1]; +extern struct kmem_cache kmalloc_caches[SLUB_PAGE_SHIFT]; /* * Sorry that the following has to be that ugly but some versions of GCC @@ -212,7 +225,7 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags) static __always_inline void *kmalloc(size_t size, gfp_t flags) { if (__builtin_constant_p(size)) { - if (size > PAGE_SIZE) + if (size > SLUB_MAX_SIZE) return kmalloc_large(size, flags); if (!(flags & SLUB_DMA)) { @@ -234,7 +247,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node); static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) { if (__builtin_constant_p(size) && - size <= PAGE_SIZE && !(flags & SLUB_DMA)) { + size <= SLUB_MAX_SIZE && !(flags & SLUB_DMA)) { struct kmem_cache *s = kmalloc_slab(size); if (!s) diff --git a/mm/slub.c b/mm/slub.c index bdc9abb08a23..5a5e7f5bf799 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2475,7 +2475,7 @@ EXPORT_SYMBOL(kmem_cache_destroy); * Kmalloc subsystem *******************************************************************/ -struct kmem_cache kmalloc_caches[PAGE_SHIFT + 1] __cacheline_aligned; +struct kmem_cache kmalloc_caches[SLUB_PAGE_SHIFT] __cacheline_aligned; EXPORT_SYMBOL(kmalloc_caches); static int __init setup_slub_min_order(char *str) @@ -2537,7 +2537,7 @@ panic: } #ifdef CONFIG_ZONE_DMA -static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT + 1]; +static struct kmem_cache *kmalloc_caches_dma[SLUB_PAGE_SHIFT]; static void sysfs_add_func(struct work_struct *w) { @@ -2658,7 +2658,7 @@ void *__kmalloc(size_t size, gfp_t flags) { struct kmem_cache *s; - if (unlikely(size > PAGE_SIZE)) + if (unlikely(size > SLUB_MAX_SIZE)) return kmalloc_large(size, flags); s = get_slab(size, flags); @@ -2686,7 +2686,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) { struct kmem_cache *s; - if (unlikely(size > PAGE_SIZE)) + if (unlikely(size > SLUB_MAX_SIZE)) return kmalloc_large_node(size, flags, node); s = get_slab(size, flags); @@ -2985,7 +2985,7 @@ void __init kmem_cache_init(void) caches++; } - for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++) { + for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { create_kmalloc_cache(&kmalloc_caches[i], "kmalloc", 1 << i, GFP_KERNEL); caches++; @@ -3022,7 +3022,7 @@ void __init kmem_cache_init(void) slab_state = UP; /* Provide the correct kmalloc names now that the caches are up */ - for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++) + for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) kmalloc_caches[i]. 
name = kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i); @@ -3222,7 +3222,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) { struct kmem_cache *s; - if (unlikely(size > PAGE_SIZE)) + if (unlikely(size > SLUB_MAX_SIZE)) return kmalloc_large(size, gfpflags); s = get_slab(size, gfpflags); @@ -3238,7 +3238,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, { struct kmem_cache *s; - if (unlikely(size > PAGE_SIZE)) + if (unlikely(size > SLUB_MAX_SIZE)) return kmalloc_large_node(size, gfpflags, node); s = get_slab(size, gfpflags); -- cgit v1.2.3 From 6503e5df08008b9a47022b5e9ebba658c8fa69af Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Thu, 27 Nov 2008 17:48:13 +0000 Subject: thermal: use integers rather than strings for thermal values The thermal API currently uses strings to pass values to userspace. This makes it difficult to use from within the kernel. Change the interface to use integers and fix up the consumers. Signed-off-by: Matthew Garrett Acked-by: Zhang Rui Acked-by: Thomas Renninger Signed-off-by: Len Brown --- drivers/acpi/fan.c | 20 ++++---- drivers/acpi/processor_thermal.c | 20 ++++---- drivers/acpi/thermal.c | 80 ++++++++++++++++++++------------ drivers/acpi/video.c | 22 +++++---- drivers/platform/x86/intel_menlow.c | 29 ++++-------- drivers/thermal/thermal_sys.c | 91 +++++++++++++++++++++++++++++++------ include/linux/thermal.h | 32 +++++++++---- 7 files changed, 198 insertions(+), 96 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/fan.c b/drivers/acpi/fan.c index eaaee1660bdf..ae41cf3cf4e5 100644 --- a/drivers/acpi/fan.c +++ b/drivers/acpi/fan.c @@ -68,31 +68,35 @@ static struct acpi_driver acpi_fan_driver = { }; /* thermal cooling device callbacks */ -static int fan_get_max_state(struct thermal_cooling_device *cdev, char *buf) +static int fan_get_max_state(struct thermal_cooling_device *cdev, unsigned long + *state) { /* ACPI fan device only support two states: ON/OFF */ - return sprintf(buf, "1\n"); + *state = 1; + return 0; } -static int fan_get_cur_state(struct thermal_cooling_device *cdev, char *buf) +static int fan_get_cur_state(struct thermal_cooling_device *cdev, unsigned long + *state) { struct acpi_device *device = cdev->devdata; - int state; int result; + int acpi_state; if (!device) return -EINVAL; - result = acpi_bus_get_power(device->handle, &state); + result = acpi_bus_get_power(device->handle, &acpi_state); if (result) return result; - return sprintf(buf, "%s\n", state == ACPI_STATE_D3 ? "0" : - (state == ACPI_STATE_D0 ? "1" : "unknown")); + *state = (acpi_state == ACPI_STATE_D3 ? 0 : + (acpi_state == ACPI_STATE_D0 ? 
1 : -1)); + return 0; } static int -fan_set_cur_state(struct thermal_cooling_device *cdev, unsigned int state) +fan_set_cur_state(struct thermal_cooling_device *cdev, unsigned long state) { struct acpi_device *device = cdev->devdata; int result; diff --git a/drivers/acpi/processor_thermal.c b/drivers/acpi/processor_thermal.c index b1eb376fae45..0e47e299a9ac 100644 --- a/drivers/acpi/processor_thermal.c +++ b/drivers/acpi/processor_thermal.c @@ -373,7 +373,8 @@ static int acpi_processor_max_state(struct acpi_processor *pr) return max_state; } static int -processor_get_max_state(struct thermal_cooling_device *cdev, char *buf) +processor_get_max_state(struct thermal_cooling_device *cdev, + unsigned long *state) { struct acpi_device *device = cdev->devdata; struct acpi_processor *pr = acpi_driver_data(device); @@ -381,28 +382,29 @@ processor_get_max_state(struct thermal_cooling_device *cdev, char *buf) if (!device || !pr) return -EINVAL; - return sprintf(buf, "%d\n", acpi_processor_max_state(pr)); + *state = acpi_processor_max_state(pr); + return 0; } static int -processor_get_cur_state(struct thermal_cooling_device *cdev, char *buf) +processor_get_cur_state(struct thermal_cooling_device *cdev, + unsigned long *cur_state) { struct acpi_device *device = cdev->devdata; struct acpi_processor *pr = acpi_driver_data(device); - int cur_state; if (!device || !pr) return -EINVAL; - cur_state = cpufreq_get_cur_state(pr->id); + *cur_state = cpufreq_get_cur_state(pr->id); if (pr->flags.throttling) - cur_state += pr->throttling.state; - - return sprintf(buf, "%d\n", cur_state); + *cur_state += pr->throttling.state; + return 0; } static int -processor_set_cur_state(struct thermal_cooling_device *cdev, unsigned int state) +processor_set_cur_state(struct thermal_cooling_device *cdev, + unsigned long state) { struct acpi_device *device = cdev->devdata; struct acpi_processor *pr = acpi_driver_data(device); diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c index 99e6f1f8ea45..1c410ef859c6 100644 --- a/drivers/acpi/thermal.c +++ b/drivers/acpi/thermal.c @@ -954,7 +954,8 @@ static void acpi_thermal_check(void *data) /* sys I/F for generic thermal sysfs support */ #define KELVIN_TO_MILLICELSIUS(t) (t * 100 - 273200) -static int thermal_get_temp(struct thermal_zone_device *thermal, char *buf) +static int thermal_get_temp(struct thermal_zone_device *thermal, + unsigned long *temp) { struct acpi_thermal *tz = thermal->devdata; int result; @@ -966,25 +967,28 @@ static int thermal_get_temp(struct thermal_zone_device *thermal, char *buf) if (result) return result; - return sprintf(buf, "%ld\n", KELVIN_TO_MILLICELSIUS(tz->temperature)); + *temp = KELVIN_TO_MILLICELSIUS(tz->temperature); + return 0; } static const char enabled[] = "kernel"; static const char disabled[] = "user"; static int thermal_get_mode(struct thermal_zone_device *thermal, - char *buf) + enum thermal_device_mode *mode) { struct acpi_thermal *tz = thermal->devdata; if (!tz) return -EINVAL; - return sprintf(buf, "%s\n", tz->tz_enabled ? - enabled : disabled); + *mode = tz->tz_enabled ? 
THERMAL_DEVICE_ENABLED : + THERMAL_DEVICE_DISABLED; + + return 0; } static int thermal_set_mode(struct thermal_zone_device *thermal, - const char *buf) + enum thermal_device_mode mode) { struct acpi_thermal *tz = thermal->devdata; int enable; @@ -995,9 +999,9 @@ static int thermal_set_mode(struct thermal_zone_device *thermal, /* * enable/disable thermal management from ACPI thermal driver */ - if (!strncmp(buf, enabled, sizeof enabled - 1)) + if (mode == THERMAL_DEVICE_ENABLED) enable = 1; - else if (!strncmp(buf, disabled, sizeof disabled - 1)) + else if (mode == THERMAL_DEVICE_DISABLED) enable = 0; else return -EINVAL; @@ -1013,7 +1017,7 @@ static int thermal_set_mode(struct thermal_zone_device *thermal, } static int thermal_get_trip_type(struct thermal_zone_device *thermal, - int trip, char *buf) + int trip, enum thermal_trip_type *type) { struct acpi_thermal *tz = thermal->devdata; int i; @@ -1022,27 +1026,35 @@ static int thermal_get_trip_type(struct thermal_zone_device *thermal, return -EINVAL; if (tz->trips.critical.flags.valid) { - if (!trip) - return sprintf(buf, "critical\n"); + if (!trip) { + *type = THERMAL_TRIP_CRITICAL; + return 0; + } trip--; } if (tz->trips.hot.flags.valid) { - if (!trip) - return sprintf(buf, "hot\n"); + if (!trip) { + *type = THERMAL_TRIP_HOT; + return 0; + } trip--; } if (tz->trips.passive.flags.valid) { - if (!trip) - return sprintf(buf, "passive\n"); + if (!trip) { + *type = THERMAL_TRIP_PASSIVE; + return 0; + } trip--; } for (i = 0; i < ACPI_THERMAL_MAX_ACTIVE && tz->trips.active[i].flags.valid; i++) { - if (!trip) - return sprintf(buf, "active%d\n", i); + if (!trip) { + *type = THERMAL_TRIP_ACTIVE; + return 0; + } trip--; } @@ -1050,7 +1062,7 @@ static int thermal_get_trip_type(struct thermal_zone_device *thermal, } static int thermal_get_trip_temp(struct thermal_zone_device *thermal, - int trip, char *buf) + int trip, unsigned long *temp) { struct acpi_thermal *tz = thermal->devdata; int i; @@ -1059,31 +1071,39 @@ static int thermal_get_trip_temp(struct thermal_zone_device *thermal, return -EINVAL; if (tz->trips.critical.flags.valid) { - if (!trip) - return sprintf(buf, "%ld\n", KELVIN_TO_MILLICELSIUS( - tz->trips.critical.temperature)); + if (!trip) { + *temp = KELVIN_TO_MILLICELSIUS( + tz->trips.critical.temperature); + return 0; + } trip--; } if (tz->trips.hot.flags.valid) { - if (!trip) - return sprintf(buf, "%ld\n", KELVIN_TO_MILLICELSIUS( - tz->trips.hot.temperature)); + if (!trip) { + *temp = KELVIN_TO_MILLICELSIUS( + tz->trips.hot.temperature); + return 0; + } trip--; } if (tz->trips.passive.flags.valid) { - if (!trip) - return sprintf(buf, "%ld\n", KELVIN_TO_MILLICELSIUS( - tz->trips.passive.temperature)); + if (!trip) { + *temp = KELVIN_TO_MILLICELSIUS( + tz->trips.passive.temperature); + return 0; + } trip--; } for (i = 0; i < ACPI_THERMAL_MAX_ACTIVE && tz->trips.active[i].flags.valid; i++) { - if (!trip) - return sprintf(buf, "%ld\n", KELVIN_TO_MILLICELSIUS( - tz->trips.active[i].temperature)); + if (!trip) { + *temp = KELVIN_TO_MILLICELSIUS( + tz->trips.active[i].temperature); + return 0; + } trip--; } diff --git a/drivers/acpi/video.c b/drivers/acpi/video.c index bb5ed059114a..5259d502add6 100644 --- a/drivers/acpi/video.c +++ b/drivers/acpi/video.c @@ -358,32 +358,36 @@ static struct output_properties acpi_output_properties = { /* thermal cooling device callbacks */ -static int video_get_max_state(struct thermal_cooling_device *cdev, char *buf) +static int video_get_max_state(struct thermal_cooling_device *cdev, unsigned + long 
*state) { struct acpi_device *device = cdev->devdata; struct acpi_video_device *video = acpi_driver_data(device); - return sprintf(buf, "%d\n", video->brightness->count - 3); + *state = video->brightness->count - 3; + return 0; } -static int video_get_cur_state(struct thermal_cooling_device *cdev, char *buf) +static int video_get_cur_state(struct thermal_cooling_device *cdev, unsigned + long *state) { struct acpi_device *device = cdev->devdata; struct acpi_video_device *video = acpi_driver_data(device); unsigned long long level; - int state; + int offset; acpi_video_device_lcd_get_level_current(video, &level); - for (state = 2; state < video->brightness->count; state++) - if (level == video->brightness->levels[state]) - return sprintf(buf, "%d\n", - video->brightness->count - state - 1); + for (offset = 2; offset < video->brightness->count; offset++) + if (level == video->brightness->levels[offset]) { + *state = video->brightness->count - offset - 1; + return 0; + } return -EINVAL; } static int -video_set_cur_state(struct thermal_cooling_device *cdev, unsigned int state) +video_set_cur_state(struct thermal_cooling_device *cdev, unsigned long state) { struct acpi_device *device = cdev->devdata; struct acpi_video_device *video = acpi_driver_data(device); diff --git a/drivers/platform/x86/intel_menlow.c b/drivers/platform/x86/intel_menlow.c index 27b7662955bb..29432a50be45 100644 --- a/drivers/platform/x86/intel_menlow.c +++ b/drivers/platform/x86/intel_menlow.c @@ -57,8 +57,8 @@ MODULE_LICENSE("GPL"); * In that case max_cstate would be n-1 * GTHS returning '0' would mean that no bandwidth control states are supported */ -static int memory_get_int_max_bandwidth(struct thermal_cooling_device *cdev, - unsigned long *max_state) +static int memory_get_max_bandwidth(struct thermal_cooling_device *cdev, + unsigned long *max_state) { struct acpi_device *device = cdev->devdata; acpi_handle handle = device->handle; @@ -83,22 +83,12 @@ static int memory_get_int_max_bandwidth(struct thermal_cooling_device *cdev, return 0; } -static int memory_get_max_bandwidth(struct thermal_cooling_device *cdev, - char *buf) -{ - unsigned long value; - if (memory_get_int_max_bandwidth(cdev, &value)) - return -EINVAL; - - return sprintf(buf, "%ld\n", value); -} - static int memory_get_cur_bandwidth(struct thermal_cooling_device *cdev, - char *buf) + unsigned long *value) { struct acpi_device *device = cdev->devdata; acpi_handle handle = device->handle; - unsigned long long value; + unsigned long long result; struct acpi_object_list arg_list; union acpi_object arg; acpi_status status = AE_OK; @@ -108,15 +98,16 @@ static int memory_get_cur_bandwidth(struct thermal_cooling_device *cdev, arg.type = ACPI_TYPE_INTEGER; arg.integer.value = MEMORY_ARG_CUR_BANDWIDTH; status = acpi_evaluate_integer(handle, MEMORY_GET_BANDWIDTH, - &arg_list, &value); + &arg_list, &result); if (ACPI_FAILURE(status)) return -EFAULT; - return sprintf(buf, "%llu\n", value); + *value = result; + return 0; } static int memory_set_cur_bandwidth(struct thermal_cooling_device *cdev, - unsigned int state) + unsigned long state) { struct acpi_device *device = cdev->devdata; acpi_handle handle = device->handle; @@ -126,7 +117,7 @@ static int memory_set_cur_bandwidth(struct thermal_cooling_device *cdev, unsigned long long temp; unsigned long max_state; - if (memory_get_int_max_bandwidth(cdev, &max_state)) + if (memory_get_max_bandwidth(cdev, &max_state)) return -EFAULT; if (state > max_state) @@ -142,7 +133,7 @@ static int memory_set_cur_bandwidth(struct 
thermal_cooling_device *cdev, &temp); printk(KERN_INFO - "Bandwidth value was %d: status is %d\n", state, status); + "Bandwidth value was %ld: status is %d\n", state, status); if (ACPI_FAILURE(status)) return -EFAULT; diff --git a/drivers/thermal/thermal_sys.c b/drivers/thermal/thermal_sys.c index 8171ca17b936..bd139adc6d32 100644 --- a/drivers/thermal/thermal_sys.c +++ b/drivers/thermal/thermal_sys.c @@ -104,22 +104,36 @@ static ssize_t temp_show(struct device *dev, struct device_attribute *attr, char *buf) { struct thermal_zone_device *tz = to_thermal_zone(dev); + long temperature; + int ret; if (!tz->ops->get_temp) return -EPERM; - return tz->ops->get_temp(tz, buf); + ret = tz->ops->get_temp(tz, &temperature); + + if (ret) + return ret; + + return sprintf(buf, "%ld\n", temperature); } static ssize_t mode_show(struct device *dev, struct device_attribute *attr, char *buf) { struct thermal_zone_device *tz = to_thermal_zone(dev); + enum thermal_device_mode mode; + int result; if (!tz->ops->get_mode) return -EPERM; - return tz->ops->get_mode(tz, buf); + result = tz->ops->get_mode(tz, &mode); + if (result) + return result; + + return sprintf(buf, "%s\n", mode == THERMAL_DEVICE_ENABLED ? "enabled" + : "disabled"); } static ssize_t @@ -132,7 +146,13 @@ mode_store(struct device *dev, struct device_attribute *attr, if (!tz->ops->set_mode) return -EPERM; - result = tz->ops->set_mode(tz, buf); + if (!strncmp(buf, "enabled", sizeof("enabled"))) + result = tz->ops->set_mode(tz, THERMAL_DEVICE_ENABLED); + else if (!strncmp(buf, "disabled", sizeof("disabled"))) + result = tz->ops->set_mode(tz, THERMAL_DEVICE_DISABLED); + else + result = -EINVAL; + if (result) return result; @@ -144,7 +164,8 @@ trip_point_type_show(struct device *dev, struct device_attribute *attr, char *buf) { struct thermal_zone_device *tz = to_thermal_zone(dev); - int trip; + enum thermal_trip_type type; + int trip, result; if (!tz->ops->get_trip_type) return -EPERM; @@ -152,7 +173,22 @@ trip_point_type_show(struct device *dev, struct device_attribute *attr, if (!sscanf(attr->attr.name, "trip_point_%d_type", &trip)) return -EINVAL; - return tz->ops->get_trip_type(tz, trip, buf); + result = tz->ops->get_trip_type(tz, trip, &type); + if (result) + return result; + + switch (type) { + case THERMAL_TRIP_CRITICAL: + return sprintf(buf, "critical"); + case THERMAL_TRIP_HOT: + return sprintf(buf, "hot"); + case THERMAL_TRIP_PASSIVE: + return sprintf(buf, "passive"); + case THERMAL_TRIP_ACTIVE: + return sprintf(buf, "active"); + default: + return sprintf(buf, "unknown"); + } } static ssize_t @@ -160,7 +196,8 @@ trip_point_temp_show(struct device *dev, struct device_attribute *attr, char *buf) { struct thermal_zone_device *tz = to_thermal_zone(dev); - int trip; + int trip, ret; + long temperature; if (!tz->ops->get_trip_temp) return -EPERM; @@ -168,7 +205,12 @@ trip_point_temp_show(struct device *dev, struct device_attribute *attr, if (!sscanf(attr->attr.name, "trip_point_%d_temp", &trip)) return -EINVAL; - return tz->ops->get_trip_temp(tz, trip, buf); + ret = tz->ops->get_trip_temp(tz, trip, &temperature); + + if (ret) + return ret; + + return sprintf(buf, "%ld\n", temperature); } static DEVICE_ATTR(type, 0444, type_show, NULL); @@ -236,8 +278,13 @@ thermal_cooling_device_max_state_show(struct device *dev, struct device_attribute *attr, char *buf) { struct thermal_cooling_device *cdev = to_cooling_device(dev); + unsigned long state; + int ret; - return cdev->ops->get_max_state(cdev, buf); + ret = cdev->ops->get_max_state(cdev, &state); + if 
(ret) + return ret; + return sprintf(buf, "%ld\n", state); } static ssize_t @@ -245,8 +292,13 @@ thermal_cooling_device_cur_state_show(struct device *dev, struct device_attribute *attr, char *buf) { struct thermal_cooling_device *cdev = to_cooling_device(dev); + unsigned long state; + int ret; - return cdev->ops->get_cur_state(cdev, buf); + ret = cdev->ops->get_cur_state(cdev, &state); + if (ret) + return ret; + return sprintf(buf, "%ld\n", state); } static ssize_t @@ -255,10 +307,10 @@ thermal_cooling_device_cur_state_store(struct device *dev, const char *buf, size_t count) { struct thermal_cooling_device *cdev = to_cooling_device(dev); - int state; + unsigned long state; int result; - if (!sscanf(buf, "%d\n", &state)) + if (!sscanf(buf, "%ld\n", &state)) return -EINVAL; if (state < 0) @@ -312,13 +364,20 @@ static DEVICE_ATTR(name, 0444, name_show, NULL); static ssize_t temp_input_show(struct device *dev, struct device_attribute *attr, char *buf) { + long temperature; + int ret; struct thermal_hwmon_attr *hwmon_attr = container_of(attr, struct thermal_hwmon_attr, attr); struct thermal_zone_device *tz = container_of(hwmon_attr, struct thermal_zone_device, temp_input); - return tz->ops->get_temp(tz, buf); + ret = tz->ops->get_temp(tz, &temperature); + + if (ret) + return ret; + + return sprintf(buf, "%ld\n", temperature); } static ssize_t @@ -330,8 +389,14 @@ temp_crit_show(struct device *dev, struct device_attribute *attr, struct thermal_zone_device *tz = container_of(hwmon_attr, struct thermal_zone_device, temp_crit); + long temperature; + int ret; + + ret = tz->ops->get_trip_temp(tz, 0, &temperature); + if (ret) + return ret; - return tz->ops->get_trip_temp(tz, 0, buf); + return sprintf(buf, "%ld\n", temperature); } diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 917707e6151d..4cb3292fb6e4 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -31,23 +31,39 @@ struct thermal_zone_device; struct thermal_cooling_device; +enum thermal_device_mode { + THERMAL_DEVICE_DISABLED = 0, + THERMAL_DEVICE_ENABLED, +}; + +enum thermal_trip_type { + THERMAL_TRIP_ACTIVE = 0, + THERMAL_TRIP_PASSIVE, + THERMAL_TRIP_HOT, + THERMAL_TRIP_CRITICAL, +}; + struct thermal_zone_device_ops { int (*bind) (struct thermal_zone_device *, struct thermal_cooling_device *); int (*unbind) (struct thermal_zone_device *, struct thermal_cooling_device *); - int (*get_temp) (struct thermal_zone_device *, char *); - int (*get_mode) (struct thermal_zone_device *, char *); - int (*set_mode) (struct thermal_zone_device *, const char *); - int (*get_trip_type) (struct thermal_zone_device *, int, char *); - int (*get_trip_temp) (struct thermal_zone_device *, int, char *); + int (*get_temp) (struct thermal_zone_device *, unsigned long *); + int (*get_mode) (struct thermal_zone_device *, + enum thermal_device_mode *); + int (*set_mode) (struct thermal_zone_device *, + enum thermal_device_mode); + int (*get_trip_type) (struct thermal_zone_device *, int, + enum thermal_trip_type *); + int (*get_trip_temp) (struct thermal_zone_device *, int, + unsigned long *); int (*get_crit_temp) (struct thermal_zone_device *, unsigned long *); }; struct thermal_cooling_device_ops { - int (*get_max_state) (struct thermal_cooling_device *, char *); - int (*get_cur_state) (struct thermal_cooling_device *, char *); - int (*set_cur_state) (struct thermal_cooling_device *, unsigned int); + int (*get_max_state) (struct thermal_cooling_device *, unsigned long *); + int (*get_cur_state) (struct thermal_cooling_device *, 
unsigned long *); + int (*set_cur_state) (struct thermal_cooling_device *, unsigned long); }; #define THERMAL_TRIPS_NONE -1 -- cgit v1.2.3 From 000ab691172db3921efa3cb7f17fc79235a1de7f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 17 Feb 2009 13:35:06 -0500 Subject: ftrace: allow archs to preform pre and post process for code modification This patch creates the weak functions: ftrace_arch_code_modify_prepare and ftrace_arch_code_modify_post_process that are called before and after the stop machine is called to modify the kernel text. If the arch needs to do pre or post processing, it only needs to define these functions. [ Update: Ingo Molnar suggested using the name ftrace_arch_code_modify_* over using ftrace_arch_modify_* ] Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 3 +++ kernel/trace/ftrace.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 677432b9cb7e..fdb2a89ae543 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -99,6 +99,9 @@ stack_trace_sysctl(struct ctl_table *table, int write, /* asm/ftrace.h must be defined for archs supporting dynamic ftrace */ #include +int ftrace_arch_code_modify_prepare(void); +int ftrace_arch_code_modify_post_process(void); + enum { FTRACE_FL_FREE = (1 << 0), FTRACE_FL_FAILED = (1 << 1), diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index fdf913dfc7e8..72316d9647bd 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -585,6 +585,24 @@ ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) return 1; } +/* + * archs can override this function if they must do something + * before the modifying code is performed. + */ +int __weak ftrace_arch_code_modify_prepare(void) +{ + return 0; +} + +/* + * archs can override this function if they must do something + * after the modifying code is performed. + */ +int __weak ftrace_arch_code_modify_post_process(void) +{ + return 0; +} + static int __ftrace_modify_code(void *data) { int *command = data; @@ -607,7 +625,17 @@ static int __ftrace_modify_code(void *data) static void ftrace_run_update_code(int command) { + int ret; + + ret = ftrace_arch_code_modify_prepare(); + FTRACE_WARN_ON(ret); + if (ret) + return; + stop_machine(__ftrace_modify_code, &command, NULL); + + ret = ftrace_arch_code_modify_post_process(); + FTRACE_WARN_ON(ret); } static ftrace_func_t saved_ftrace_func; -- cgit v1.2.3 From b1569e99c795bf83b4ddf41c4f1c42761ab7f75e Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Wed, 3 Dec 2008 17:55:32 +0000 Subject: ACPI: move thermal trip handling to generic thermal layer The ACPI code currently carries its own thermal trip handling, meaning that any other thermal implementation will need to reimplement it. Move the code to the generic thermal layer. 
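To illustrate what this buys non-ACPI users (a sketch only, not part of this patch): a platform driver can now report temperature and trip points and leave trip handling, polling and the passive cooling math to the generic layer. The "examplechip" name, the read_chip_millicelsius() helper, the trip temperatures and the tc1/tc2/delay values below are all made up; the callbacks and the extended thermal_zone_device_register() arguments follow the signatures introduced by this series.

static int examplechip_get_temp(struct thermal_zone_device *tz,
				unsigned long *temp)
{
	*temp = read_chip_millicelsius(tz->devdata);	/* millidegrees C */
	return 0;
}

static int examplechip_get_trip_type(struct thermal_zone_device *tz,
				     int trip, enum thermal_trip_type *type)
{
	*type = trip ? THERMAL_TRIP_PASSIVE : THERMAL_TRIP_CRITICAL;
	return 0;
}

static int examplechip_get_trip_temp(struct thermal_zone_device *tz,
				     int trip, unsigned long *temp)
{
	*temp = trip ? 85000 : 105000;	/* trip 0 critical, trip 1 passive */
	return 0;
}

static struct thermal_zone_device_ops examplechip_ops = {
	.get_temp	= examplechip_get_temp,
	.get_trip_type	= examplechip_get_trip_type,
	.get_trip_temp	= examplechip_get_trip_temp,
};

static struct thermal_zone_device *examplechip_tzd;

static int examplechip_register(void *chip)
{
	/* two trips, tc1=5, tc2=10, 1 s passive poll, 5 s normal poll */
	examplechip_tzd = thermal_zone_device_register("examplechip", 2, chip,
						       &examplechip_ops,
						       5, 10, 1000, 5000);
	return IS_ERR(examplechip_tzd) ? PTR_ERR(examplechip_tzd) : 0;
}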
Signed-off-by: Matthew Garrett Signed-off-by: Len Brown --- drivers/acpi/thermal.c | 458 +++++------------------------------------- drivers/thermal/thermal_sys.c | 188 ++++++++++++++++- include/linux/thermal.h | 15 +- 3 files changed, 248 insertions(+), 413 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c index 1c410ef859c6..0ec48d2f85c5 100644 --- a/drivers/acpi/thermal.c +++ b/drivers/acpi/thermal.c @@ -37,7 +37,6 @@ #include #include #include -#include #include #include #include @@ -190,7 +189,6 @@ struct acpi_thermal { struct acpi_thermal_state state; struct acpi_thermal_trips trips; struct acpi_handle_list devices; - struct timer_list timer; struct thermal_zone_device *thermal_zone; int tz_enabled; struct mutex lock; @@ -290,6 +288,11 @@ static int acpi_thermal_set_polling(struct acpi_thermal *tz, int seconds) tz->polling_frequency = seconds * 10; /* Convert value to deci-seconds */ + tz->thermal_zone->polling_delay = seconds * 1000; + + if (tz->tz_enabled) + thermal_zone_device_update(tz->thermal_zone); + ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Polling frequency set to %lu seconds\n", tz->polling_frequency/10)); @@ -569,386 +572,11 @@ static int acpi_thermal_get_trip_points(struct acpi_thermal *tz) return acpi_thermal_trips_update(tz, ACPI_TRIPS_INIT); } -static int acpi_thermal_critical(struct acpi_thermal *tz) -{ - if (!tz || !tz->trips.critical.flags.valid) - return -EINVAL; - - if (tz->temperature >= tz->trips.critical.temperature) { - printk(KERN_WARNING PREFIX "Critical trip point\n"); - tz->trips.critical.flags.enabled = 1; - } else if (tz->trips.critical.flags.enabled) - tz->trips.critical.flags.enabled = 0; - - acpi_bus_generate_proc_event(tz->device, ACPI_THERMAL_NOTIFY_CRITICAL, - tz->trips.critical.flags.enabled); - acpi_bus_generate_netlink_event(tz->device->pnp.device_class, - dev_name(&tz->device->dev), - ACPI_THERMAL_NOTIFY_CRITICAL, - tz->trips.critical.flags.enabled); - - /* take no action if nocrt is set */ - if(!nocrt) { - printk(KERN_EMERG - "Critical temperature reached (%ld C), shutting down.\n", - KELVIN_TO_CELSIUS(tz->temperature)); - orderly_poweroff(true); - } - - return 0; -} - -static int acpi_thermal_hot(struct acpi_thermal *tz) -{ - if (!tz || !tz->trips.hot.flags.valid) - return -EINVAL; - - if (tz->temperature >= tz->trips.hot.temperature) { - printk(KERN_WARNING PREFIX "Hot trip point\n"); - tz->trips.hot.flags.enabled = 1; - } else if (tz->trips.hot.flags.enabled) - tz->trips.hot.flags.enabled = 0; - - acpi_bus_generate_proc_event(tz->device, ACPI_THERMAL_NOTIFY_HOT, - tz->trips.hot.flags.enabled); - acpi_bus_generate_netlink_event(tz->device->pnp.device_class, - dev_name(&tz->device->dev), - ACPI_THERMAL_NOTIFY_HOT, - tz->trips.hot.flags.enabled); - - /* TBD: Call user-mode "sleep(S4)" function if nocrt is cleared */ - - return 0; -} - -static void acpi_thermal_passive(struct acpi_thermal *tz) -{ - int result = 1; - struct acpi_thermal_passive *passive = NULL; - int trend = 0; - int i = 0; - - - if (!tz || !tz->trips.passive.flags.valid) - return; - - passive = &(tz->trips.passive); - - /* - * Above Trip? - * ----------- - * Calculate the thermal trend (using the passive cooling equation) - * and modify the performance limit for all passive cooling devices - * accordingly. Note that we assume symmetry. 
- */ - if (tz->temperature >= passive->temperature) { - trend = - (passive->tc1 * (tz->temperature - tz->last_temperature)) + - (passive->tc2 * (tz->temperature - passive->temperature)); - ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "trend[%d]=(tc1[%lu]*(tmp[%lu]-last[%lu]))+(tc2[%lu]*(tmp[%lu]-psv[%lu]))\n", - trend, passive->tc1, tz->temperature, - tz->last_temperature, passive->tc2, - tz->temperature, passive->temperature)); - passive->flags.enabled = 1; - /* Heating up? */ - if (trend > 0) - for (i = 0; i < passive->devices.count; i++) - acpi_processor_set_thermal_limit(passive-> - devices. - handles[i], - ACPI_PROCESSOR_LIMIT_INCREMENT); - /* Cooling off? */ - else if (trend < 0) { - for (i = 0; i < passive->devices.count; i++) - /* - * assume that we are on highest - * freq/lowest thrott and can leave - * passive mode, even in error case - */ - if (!acpi_processor_set_thermal_limit - (passive->devices.handles[i], - ACPI_PROCESSOR_LIMIT_DECREMENT)) - result = 0; - /* - * Leave cooling mode, even if the temp might - * higher than trip point This is because some - * machines might have long thermal polling - * frequencies (tsp) defined. We will fall back - * into passive mode in next cycle (probably quicker) - */ - if (result) { - passive->flags.enabled = 0; - ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "Disabling passive cooling, still above threshold," - " but we are cooling down\n")); - } - } - return; - } - - /* - * Below Trip? - * ----------- - * Implement passive cooling hysteresis to slowly increase performance - * and avoid thrashing around the passive trip point. Note that we - * assume symmetry. - */ - if (!passive->flags.enabled) - return; - for (i = 0; i < passive->devices.count; i++) - if (!acpi_processor_set_thermal_limit - (passive->devices.handles[i], - ACPI_PROCESSOR_LIMIT_DECREMENT)) - result = 0; - if (result) { - passive->flags.enabled = 0; - ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "Disabling passive cooling (zone is cool)\n")); - } -} - -static void acpi_thermal_active(struct acpi_thermal *tz) -{ - int result = 0; - struct acpi_thermal_active *active = NULL; - int i = 0; - int j = 0; - unsigned long maxtemp = 0; - - - if (!tz) - return; - - for (i = 0; i < ACPI_THERMAL_MAX_ACTIVE; i++) { - active = &(tz->trips.active[i]); - if (!active || !active->flags.valid) - break; - if (tz->temperature >= active->temperature) { - /* - * Above Threshold? - * ---------------- - * If not already enabled, turn ON all cooling devices - * associated with this active threshold. - */ - if (active->temperature > maxtemp) - tz->state.active_index = i; - maxtemp = active->temperature; - if (active->flags.enabled) - continue; - for (j = 0; j < active->devices.count; j++) { - result = - acpi_bus_set_power(active->devices. - handles[j], - ACPI_STATE_D0); - if (result) { - printk(KERN_WARNING PREFIX - "Unable to turn cooling device [%p] 'on'\n", - active->devices. - handles[j]); - continue; - } - active->flags.enabled = 1; - ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "Cooling device [%p] now 'on'\n", - active->devices.handles[j])); - } - continue; - } - if (!active->flags.enabled) - continue; - /* - * Below Threshold? - * ---------------- - * Turn OFF all cooling devices associated with this - * threshold. 
- */ - for (j = 0; j < active->devices.count; j++) { - result = acpi_bus_set_power(active->devices.handles[j], - ACPI_STATE_D3); - if (result) { - printk(KERN_WARNING PREFIX - "Unable to turn cooling device [%p] 'off'\n", - active->devices.handles[j]); - continue; - } - active->flags.enabled = 0; - ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "Cooling device [%p] now 'off'\n", - active->devices.handles[j])); - } - } -} - -static void acpi_thermal_check(void *context); - -static void acpi_thermal_run(unsigned long data) -{ - struct acpi_thermal *tz = (struct acpi_thermal *)data; - if (!tz->zombie) - acpi_os_execute(OSL_GPE_HANDLER, acpi_thermal_check, (void *)data); -} - -static void acpi_thermal_active_off(void *data) -{ - int result = 0; - struct acpi_thermal *tz = data; - int i = 0; - int j = 0; - struct acpi_thermal_active *active = NULL; - - if (!tz) { - printk(KERN_ERR PREFIX "Invalid (NULL) context\n"); - return; - } - - result = acpi_thermal_get_temperature(tz); - if (result) - return; - - for (i = 0; i < ACPI_THERMAL_MAX_ACTIVE; i++) { - active = &(tz->trips.active[i]); - if (!active || !active->flags.valid) - break; - if (tz->temperature >= active->temperature) { - /* - * If the thermal temperature is greater than the - * active threshod, unnecessary to turn off the - * the active cooling device. - */ - continue; - } - /* - * Below Threshold? - * ---------------- - * Turn OFF all cooling devices associated with this - * threshold. - */ - for (j = 0; j < active->devices.count; j++) - result = acpi_bus_set_power(active->devices.handles[j], - ACPI_STATE_D3); - } -} - static void acpi_thermal_check(void *data) { - int result = 0; struct acpi_thermal *tz = data; - unsigned long sleep_time = 0; - unsigned long timeout_jiffies = 0; - int i = 0; - struct acpi_thermal_state state; - - - if (!tz) { - printk(KERN_ERR PREFIX "Invalid (NULL) context\n"); - return; - } - - /* Check if someone else is already running */ - if (!mutex_trylock(&tz->lock)) - return; - - state = tz->state; - - result = acpi_thermal_get_temperature(tz); - if (result) - goto unlock; - - if (!tz->tz_enabled) - goto unlock; - - memset(&tz->state, 0, sizeof(tz->state)); - - /* - * Check Trip Points - * ----------------- - * Compare the current temperature to the trip point values to see - * if we've entered one of the thermal policy states. Note that - * this function determines when a state is entered, but the - * individual policy decides when it is exited (e.g. hysteresis). - */ - if (tz->trips.critical.flags.valid) - state.critical |= - (tz->temperature >= tz->trips.critical.temperature); - if (tz->trips.hot.flags.valid) - state.hot |= (tz->temperature >= tz->trips.hot.temperature); - if (tz->trips.passive.flags.valid) - state.passive |= - (tz->temperature >= tz->trips.passive.temperature); - for (i = 0; i < ACPI_THERMAL_MAX_ACTIVE; i++) - if (tz->trips.active[i].flags.valid) - state.active |= - (tz->temperature >= - tz->trips.active[i].temperature); - - /* - * Invoke Policy - * ------------- - * Separated from the above check to allow individual policy to - * determine when to exit a given state. - */ - if (state.critical) - acpi_thermal_critical(tz); - if (state.hot) - acpi_thermal_hot(tz); - if (state.passive) - acpi_thermal_passive(tz); - if (state.active) - acpi_thermal_active(tz); - - /* - * Calculate State - * --------------- - * Again, separated from the above two to allow independent policy - * decisions. 
- */ - tz->state.critical = tz->trips.critical.flags.enabled; - tz->state.hot = tz->trips.hot.flags.enabled; - tz->state.passive = tz->trips.passive.flags.enabled; - tz->state.active = 0; - for (i = 0; i < ACPI_THERMAL_MAX_ACTIVE; i++) - tz->state.active |= tz->trips.active[i].flags.enabled; - - /* - * Calculate Sleep Time - * -------------------- - * If we're in the passive state, use _TSP's value. Otherwise - * use the default polling frequency (e.g. _TZP). If no polling - * frequency is specified then we'll wait forever (at least until - * a thermal event occurs). Note that _TSP and _TZD values are - * given in 1/10th seconds (we must covert to milliseconds). - */ - if (tz->state.passive) { - sleep_time = tz->trips.passive.tsp * 100; - timeout_jiffies = jiffies + (HZ * sleep_time) / 1000; - } else if (tz->polling_frequency > 0) { - sleep_time = tz->polling_frequency * 100; - timeout_jiffies = round_jiffies(jiffies + (HZ * sleep_time) / 1000); - } - - ACPI_DEBUG_PRINT((ACPI_DB_INFO, "%s: temperature[%lu] sleep[%lu]\n", - tz->name, tz->temperature, sleep_time)); - /* - * Schedule Next Poll - * ------------------ - */ - if (!sleep_time) { - if (timer_pending(&(tz->timer))) - del_timer(&(tz->timer)); - } else { - if (timer_pending(&(tz->timer))) - mod_timer(&(tz->timer), timeout_jiffies); - else { - tz->timer.data = (unsigned long)tz; - tz->timer.function = acpi_thermal_run; - tz->timer.expires = timeout_jiffies; - add_timer(&(tz->timer)); - } - } - unlock: - mutex_unlock(&tz->lock); + thermal_zone_device_update(tz->thermal_zone); } /* sys I/F for generic thermal sysfs support */ @@ -1122,6 +750,29 @@ static int thermal_get_crit_temp(struct thermal_zone_device *thermal, return -EINVAL; } +static int thermal_notify(struct thermal_zone_device *thermal, int trip, + enum thermal_trip_type trip_type) +{ + u8 type = 0; + struct acpi_thermal *tz = thermal->devdata; + + if (trip_type == THERMAL_TRIP_CRITICAL) + type = ACPI_THERMAL_NOTIFY_CRITICAL; + else if (trip_type == THERMAL_TRIP_HOT) + type = ACPI_THERMAL_NOTIFY_HOT; + else + return 0; + + acpi_bus_generate_proc_event(tz->device, type, 1); + acpi_bus_generate_netlink_event(tz->device->pnp.device_class, + tz->device->dev.bus_id, type, 1); + + if (trip_type == THERMAL_TRIP_CRITICAL && nocrt) + return 1; + + return 0; +} + typedef int (*cb)(struct thermal_zone_device *, int, struct thermal_cooling_device *); static int acpi_thermal_cooling_device_cb(struct thermal_zone_device *thermal, @@ -1214,6 +865,7 @@ static struct thermal_zone_device_ops acpi_thermal_zone_ops = { .get_trip_type = thermal_get_trip_type, .get_trip_temp = thermal_get_trip_temp, .get_crit_temp = thermal_get_crit_temp, + .notify = thermal_notify, }; static int acpi_thermal_register_thermal_zone(struct acpi_thermal *tz) @@ -1234,8 +886,21 @@ static int acpi_thermal_register_thermal_zone(struct acpi_thermal *tz) for (i = 0; i < ACPI_THERMAL_MAX_ACTIVE && tz->trips.active[i].flags.valid; i++, trips++); - tz->thermal_zone = thermal_zone_device_register("acpitz", - trips, tz, &acpi_thermal_zone_ops); + + if (tz->trips.passive.flags.valid) + tz->thermal_zone = + thermal_zone_device_register("acpitz", trips, tz, + &acpi_thermal_zone_ops, + tz->trips.passive.tc1, + tz->trips.passive.tc2, + tz->trips.passive.tsp*100, + tz->polling_frequency*100); + else + tz->thermal_zone = + thermal_zone_device_register("acpitz", trips, tz, + &acpi_thermal_zone_ops, + 0, 0, 0, + tz->polling_frequency); if (IS_ERR(tz->thermal_zone)) return -ENODEV; @@ -1467,13 +1132,13 @@ static int 
acpi_thermal_polling_seq_show(struct seq_file *seq, void *offset) if (!tz) goto end; - if (!tz->polling_frequency) { + if (!tz->thermal_zone->polling_delay) { seq_puts(seq, "\n"); goto end; } - seq_printf(seq, "polling frequency: %lu seconds\n", - (tz->polling_frequency / 10)); + seq_printf(seq, "polling frequency: %d seconds\n", + (tz->thermal_zone->polling_delay / 1000)); end: return 0; @@ -1703,12 +1368,6 @@ static int acpi_thermal_add(struct acpi_device *device) if (result) goto unregister_thermal_zone; - init_timer(&tz->timer); - - acpi_thermal_active_off(tz); - - acpi_thermal_check(tz); - status = acpi_install_notify_handler(device->handle, ACPI_DEVICE_NOTIFY, acpi_thermal_notify, tz); @@ -1737,36 +1396,15 @@ static int acpi_thermal_remove(struct acpi_device *device, int type) acpi_status status = AE_OK; struct acpi_thermal *tz = NULL; - if (!device || !acpi_driver_data(device)) return -EINVAL; tz = acpi_driver_data(device); - /* avoid timer adding new defer task */ - tz->zombie = 1; - /* wait for running timer (on other CPUs) finish */ - del_timer_sync(&(tz->timer)); - /* synchronize deferred task */ - acpi_os_wait_events_complete(NULL); - /* deferred task may reinsert timer */ - del_timer_sync(&(tz->timer)); - status = acpi_remove_notify_handler(device->handle, ACPI_DEVICE_NOTIFY, acpi_thermal_notify); - /* Terminate policy */ - if (tz->trips.passive.flags.valid && tz->trips.passive.flags.enabled) { - tz->trips.passive.flags.enabled = 0; - acpi_thermal_passive(tz); - } - if (tz->trips.active[0].flags.valid - && tz->trips.active[0].flags.enabled) { - tz->trips.active[0].flags.enabled = 0; - acpi_thermal_active(tz); - } - acpi_thermal_remove_fs(device); acpi_thermal_unregister_thermal_zone(tz); mutex_destroy(&tz->lock); diff --git a/drivers/thermal/thermal_sys.c b/drivers/thermal/thermal_sys.c index bd139adc6d32..6378741882f3 100644 --- a/drivers/thermal/thermal_sys.c +++ b/drivers/thermal/thermal_sys.c @@ -30,6 +30,7 @@ #include #include #include +#include MODULE_AUTHOR("Zhang Rui"); MODULE_DESCRIPTION("Generic thermal management sysfs support"); @@ -517,6 +518,97 @@ thermal_remove_hwmon_sysfs(struct thermal_zone_device *tz) } #endif +static void thermal_zone_device_set_polling(struct thermal_zone_device *tz, + int delay) +{ + cancel_delayed_work(&(tz->poll_queue)); + + if (!delay) + return; + + if (delay > 1000) + schedule_delayed_work(&(tz->poll_queue), + round_jiffies(msecs_to_jiffies(delay))); + else + schedule_delayed_work(&(tz->poll_queue), + msecs_to_jiffies(delay)); +} + +static void thermal_zone_device_passive(struct thermal_zone_device *tz, + int temp, int trip_temp, int trip) +{ + int trend = 0; + struct thermal_cooling_device_instance *instance; + struct thermal_cooling_device *cdev; + long state, max_state; + + /* + * Above Trip? + * ----------- + * Calculate the thermal trend (using the passive cooling equation) + * and modify the performance limit for all passive cooling devices + * accordingly. Note that we assume symmetry. + */ + if (temp >= trip_temp) { + tz->passive = true; + + trend = (tz->tc1 * (temp - tz->last_temperature)) + + (tz->tc2 * (temp - trip_temp)); + + /* Heating up? */ + if (trend > 0) { + list_for_each_entry(instance, &tz->cooling_devices, + node) { + if (instance->trip != trip) + continue; + cdev = instance->cdev; + cdev->ops->get_cur_state(cdev, &state); + cdev->ops->get_max_state(cdev, &max_state); + if (state++ < max_state) + cdev->ops->set_cur_state(cdev, state); + } + } else if (trend < 0) { /* Cooling off? 
*/ + list_for_each_entry(instance, &tz->cooling_devices, + node) { + if (instance->trip != trip) + continue; + cdev = instance->cdev; + cdev->ops->get_cur_state(cdev, &state); + cdev->ops->get_max_state(cdev, &max_state); + if (state > 0) + cdev->ops->set_cur_state(cdev, --state); + } + } + return; + } + + /* + * Below Trip? + * ----------- + * Implement passive cooling hysteresis to slowly increase performance + * and avoid thrashing around the passive trip point. Note that we + * assume symmetry. + */ + list_for_each_entry(instance, &tz->cooling_devices, node) { + if (instance->trip != trip) + continue; + cdev = instance->cdev; + cdev->ops->get_cur_state(cdev, &state); + cdev->ops->get_max_state(cdev, &max_state); + if (state > 0) + cdev->ops->set_cur_state(cdev, --state); + if (state == 0) + tz->passive = false; + } +} + +static void thermal_zone_device_check(struct work_struct *work) +{ + struct thermal_zone_device *tz = container_of(work, struct + thermal_zone_device, + poll_queue.work); + thermal_zone_device_update(tz); +} /** * thermal_zone_bind_cooling_device - bind a cooling device to a thermal zone @@ -786,21 +878,102 @@ void thermal_cooling_device_unregister(struct EXPORT_SYMBOL(thermal_cooling_device_unregister); +/** + * thermal_zone_device_update - force an update of a thermal zone's state + * @ttz: the thermal zone to update + */ + +void thermal_zone_device_update(struct thermal_zone_device *tz) +{ + int count, ret = 0; + long temp, trip_temp; + enum thermal_trip_type trip_type; + struct thermal_cooling_device_instance *instance; + struct thermal_cooling_device *cdev; + + mutex_lock(&tz->lock); + + tz->ops->get_temp(tz, &temp); + + for (count = 0; count < tz->trips; count++) { + tz->ops->get_trip_type(tz, count, &trip_type); + tz->ops->get_trip_temp(tz, count, &trip_temp); + + switch (trip_type) { + case THERMAL_TRIP_CRITICAL: + if (temp > trip_temp) { + if (tz->ops->notify) + ret = tz->ops->notify(tz, count, + trip_type); + if (!ret) { + printk(KERN_EMERG + "Critical temperature reached (%ld C), shutting down.\n", + temp/1000); + orderly_poweroff(true); + } + } + break; + case THERMAL_TRIP_HOT: + if (temp > trip_temp) + if (tz->ops->notify) + tz->ops->notify(tz, count, trip_type); + break; + case THERMAL_TRIP_ACTIVE: + list_for_each_entry(instance, &tz->cooling_devices, + node) { + if (instance->trip != count) + continue; + + cdev = instance->cdev; + + if (temp > trip_temp) + cdev->ops->set_cur_state(cdev, 1); + else + cdev->ops->set_cur_state(cdev, 0); + } + break; + case THERMAL_TRIP_PASSIVE: + if (temp > trip_temp || tz->passive) + thermal_zone_device_passive(tz, temp, + trip_temp, count); + break; + } + } + tz->last_temperature = temp; + if (tz->passive) + thermal_zone_device_set_polling(tz, tz->passive_delay); + else if (tz->polling_delay) + thermal_zone_device_set_polling(tz, tz->polling_delay); + mutex_unlock(&tz->lock); +} +EXPORT_SYMBOL(thermal_zone_device_update); + /** * thermal_zone_device_register - register a new thermal zone device * @type: the thermal zone device type * @trips: the number of trip points the thermal zone support * @devdata: private device data * @ops: standard thermal zone device callbacks + * @tc1: thermal coefficient 1 for passive calculations + * @tc2: thermal coefficient 2 for passive calculations + * @passive_delay: number of milliseconds to wait between polls when + * performing passive cooling + * @polling_delay: number of milliseconds to wait between polls when checking + * whether trip points have been crossed (0 for interrupt + * 
driven systems) * * thermal_zone_device_unregister() must be called when the device is no - * longer needed. + * longer needed. The passive cooling formula uses tc1 and tc2 as described in + * section 11.1.5.1 of the ACPI specification 3.0. */ struct thermal_zone_device *thermal_zone_device_register(char *type, int trips, void *devdata, struct thermal_zone_device_ops - *ops) + *ops, int tc1, int + tc2, + int passive_delay, + int polling_delay) { struct thermal_zone_device *tz; struct thermal_cooling_device *pos; @@ -834,6 +1007,11 @@ struct thermal_zone_device *thermal_zone_device_register(char *type, tz->device.class = &thermal_class; tz->devdata = devdata; tz->trips = trips; + tz->tc1 = tc1; + tz->tc2 = tc2; + tz->passive_delay = passive_delay; + tz->polling_delay = polling_delay; + dev_set_name(&tz->device, "thermal_zone%d", tz->id); result = device_register(&tz->device); if (result) { @@ -879,6 +1057,10 @@ struct thermal_zone_device *thermal_zone_device_register(char *type, } mutex_unlock(&thermal_list_lock); + INIT_DELAYED_WORK(&(tz->poll_queue), thermal_zone_device_check); + + thermal_zone_device_update(tz); + if (!result) return tz; @@ -918,6 +1100,8 @@ void thermal_zone_device_unregister(struct thermal_zone_device *tz) tz->ops->unbind(tz, cdev); mutex_unlock(&thermal_list_lock); + thermal_zone_device_set_polling(tz, 0); + if (tz->type[0]) device_remove_file(&tz->device, &dev_attr_type); device_remove_file(&tz->device, &dev_attr_temp); diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 4cb3292fb6e4..a81c61521ba4 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -27,6 +27,7 @@ #include #include +#include struct thermal_zone_device; struct thermal_cooling_device; @@ -58,6 +59,8 @@ struct thermal_zone_device_ops { int (*get_trip_temp) (struct thermal_zone_device *, int, unsigned long *); int (*get_crit_temp) (struct thermal_zone_device *, unsigned long *); + int (*notify) (struct thermal_zone_device *, int, + enum thermal_trip_type); }; struct thermal_cooling_device_ops { @@ -104,11 +107,18 @@ struct thermal_zone_device { struct device device; void *devdata; int trips; + int tc1; + int tc2; + int passive_delay; + int polling_delay; + int last_temperature; + bool passive; struct thermal_zone_device_ops *ops; struct list_head cooling_devices; struct idr idr; struct mutex lock; /* protect cooling devices list */ struct list_head node; + struct delayed_work poll_queue; #if defined(CONFIG_THERMAL_HWMON) struct list_head hwmon_node; struct thermal_hwmon_device *hwmon; @@ -120,13 +130,16 @@ struct thermal_zone_device { struct thermal_zone_device *thermal_zone_device_register(char *, int, void *, struct thermal_zone_device_ops - *); + *, int tc1, int tc2, + int passive_freq, + int polling_freq); void thermal_zone_device_unregister(struct thermal_zone_device *); int thermal_zone_bind_cooling_device(struct thermal_zone_device *, int, struct thermal_cooling_device *); int thermal_zone_unbind_cooling_device(struct thermal_zone_device *, int, struct thermal_cooling_device *); +void thermal_zone_device_update(struct thermal_zone_device *); struct thermal_cooling_device *thermal_cooling_device_register(char *, void *, struct thermal_cooling_device_ops -- cgit v1.2.3 From 7c37730cd31ddb2d3a1da142af9b18c29b8c433b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 24 Feb 2009 12:07:53 -0500 Subject: tracing: add DEFINE_TRACE_FMT to tracepoint.h This patch creates a DEFINE_TRACE_FMT to map to DECLARE_TRACE. 
This allows for the developers to place format strings and args in with their tracepoint declaration. A tracer may now override the DEFINE_TRACE_FMT macro and use it to record a default format. Signed-off-by: Steven Rostedt --- include/linux/tracepoint.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 757005458366..34ae464effff 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -153,4 +153,7 @@ static inline void tracepoint_synchronize_unregister(void) synchronize_sched(); } +#define DEFINE_TRACE_FMT(name, proto, args, fmt) \ + DECLARE_TRACE(name, TPPROTO(proto), TPARGS(args)) + #endif -- cgit v1.2.3 From 6e2756376c706e4da3454a272947983f92e80a7e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 25 Feb 2009 13:59:48 +0100 Subject: generic-ipi: remove CSD_FLAG_WAIT Oleg noticed that we don't strictly need CSD_FLAG_WAIT, rework the code so that we can use CSD_FLAG_LOCK for both purposes. Signed-off-by: Peter Zijlstra Cc: Oleg Nesterov Cc: Linus Torvalds Cc: Nick Piggin Cc: Jens Axboe Cc: "Paul E. McKenney" Cc: Rusty Russell Signed-off-by: Ingo Molnar --- block/blk-softirq.c | 2 +- include/linux/smp.h | 3 +- kernel/sched.c | 2 +- kernel/smp.c | 90 ++++++++++++++--------------------------------------- kernel/softirq.c | 2 +- 5 files changed, 28 insertions(+), 71 deletions(-) (limited to 'include/linux') diff --git a/block/blk-softirq.c b/block/blk-softirq.c index ce0efc6b26dc..ee9c21602228 100644 --- a/block/blk-softirq.c +++ b/block/blk-softirq.c @@ -64,7 +64,7 @@ static int raise_blk_irq(int cpu, struct request *rq) data->info = rq; data->flags = 0; - __smp_call_function_single(cpu, data); + __smp_call_function_single(cpu, data, 0); return 0; } diff --git a/include/linux/smp.h b/include/linux/smp.h index 715196b09d67..00866d7fdf34 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -82,7 +82,8 @@ smp_call_function_mask(cpumask_t mask, void(*func)(void *info), void *info, return 0; } -void __smp_call_function_single(int cpuid, struct call_single_data *data); +void __smp_call_function_single(int cpuid, struct call_single_data *data, + int wait); /* * Generic and arch helpers diff --git a/kernel/sched.c b/kernel/sched.c index 410eec404133..d4c2749a2998 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1093,7 +1093,7 @@ static void hrtick_start(struct rq *rq, u64 delay) if (rq == this_rq()) { hrtimer_restart(timer); } else if (!rq->hrtick_csd_pending) { - __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd); + __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0); rq->hrtick_csd_pending = 1; } } diff --git a/kernel/smp.c b/kernel/smp.c index 7a0ce25829dc..f5308258891a 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -23,8 +23,7 @@ static struct { }; enum { - CSD_FLAG_WAIT = 0x01, - CSD_FLAG_LOCK = 0x02, + CSD_FLAG_LOCK = 0x01, }; struct call_function_data { @@ -94,31 +93,6 @@ static int __cpuinit init_call_single_data(void) } early_initcall(init_call_single_data); -/* - * csd_wait/csd_complete are used for synchronous ipi calls - */ -static void csd_wait_prepare(struct call_single_data *data) -{ - data->flags |= CSD_FLAG_WAIT; -} - -static void csd_complete(struct call_single_data *data) -{ - if (data->flags & CSD_FLAG_WAIT) { - /* - * ensure we're all done before saying we are - */ - smp_mb(); - data->flags &= ~CSD_FLAG_WAIT; - } -} - -static void csd_wait(struct call_single_data *data) -{ - while (data->flags & CSD_FLAG_WAIT) - cpu_relax(); -} - /* 
* csd_lock/csd_unlock used to serialize access to per-cpu csd resources * @@ -126,10 +100,15 @@ static void csd_wait(struct call_single_data *data) * function call. For multi-cpu calls its even more interesting as we'll have * to ensure no other cpu is observing our csd. */ -static void csd_lock(struct call_single_data *data) +static void csd_lock_wait(struct call_single_data *data) { while (data->flags & CSD_FLAG_LOCK) cpu_relax(); +} + +static void csd_lock(struct call_single_data *data) +{ + csd_lock_wait(data); data->flags = CSD_FLAG_LOCK; /* @@ -155,11 +134,12 @@ static void csd_unlock(struct call_single_data *data) * Insert a previously allocated call_single_data element for execution * on the given CPU. data must already have ->func, ->info, and ->flags set. */ -static void generic_exec_single(int cpu, struct call_single_data *data) +static +void generic_exec_single(int cpu, struct call_single_data *data, int wait) { struct call_single_queue *dst = &per_cpu(call_single_queue, cpu); - int wait = data->flags & CSD_FLAG_WAIT, ipi; unsigned long flags; + int ipi; spin_lock_irqsave(&dst->lock, flags); ipi = list_empty(&dst->list); @@ -182,7 +162,7 @@ static void generic_exec_single(int cpu, struct call_single_data *data) arch_send_call_function_single_ipi(cpu); if (wait) - csd_wait(data); + csd_lock_wait(data); } /* @@ -232,7 +212,6 @@ void generic_smp_call_function_interrupt(void) if (refs) continue; - csd_complete(&data->csd); csd_unlock(&data->csd); } @@ -270,9 +249,6 @@ void generic_smp_call_function_single_interrupt(void) data->func(data->info); - if (data_flags & CSD_FLAG_WAIT) - csd_complete(data); - /* * Unlocked CSDs are valid through generic_exec_single() */ @@ -313,36 +289,16 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, func(info); local_irq_restore(flags); } else if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) { - struct call_single_data *data; + struct call_single_data *data = &d; - if (!wait) { - /* - * We are calling a function on a single CPU - * and we are not going to wait for it to finish. - * We use a per cpu data to pass the information to - * that CPU. Since all callers of this code will - * use the same data, we must synchronize the - * callers to prevent a new caller from corrupting - * the data before the callee can access it. - * - * The CSD_FLAG_LOCK is used to let us know when - * the IPI handler is done with the data. - * The first caller will set it, and the callee - * will clear it. The next caller must wait for - * it to clear before we set it again. This - * will make sure the callee is done with the - * data before a new caller will use it. - */ + if (!wait) data = &__get_cpu_var(csd_data); - csd_lock(data); - } else { - data = &d; - csd_wait_prepare(data); - } + + csd_lock(data); data->func = func; data->info = info; - generic_exec_single(cpu, data); + generic_exec_single(cpu, data, wait); } else { err = -ENXIO; /* CPU not online */ } @@ -362,12 +318,15 @@ EXPORT_SYMBOL(smp_call_function_single); * instance. * */ -void __smp_call_function_single(int cpu, struct call_single_data *data) +void __smp_call_function_single(int cpu, struct call_single_data *data, + int wait) { + csd_lock(data); + /* Can deadlock when called with interrupts disabled */ - WARN_ON((data->flags & CSD_FLAG_WAIT) && irqs_disabled()); + WARN_ON(wait && irqs_disabled()); - generic_exec_single(cpu, data); + generic_exec_single(cpu, data, wait); } /* FIXME: Shim for archs using old arch_send_call_function_ipi API. 
*/ @@ -425,9 +384,6 @@ void smp_call_function_many(const struct cpumask *mask, csd_lock(&data->csd); spin_lock_irqsave(&data->lock, flags); - if (wait) - csd_wait_prepare(&data->csd); - data->csd.func = func; data->csd.info = info; cpumask_and(data->cpumask, mask, cpu_online_mask); @@ -456,7 +412,7 @@ void smp_call_function_many(const struct cpumask *mask, /* optionally wait for the CPUs to complete */ if (wait) - csd_wait(&data->csd); + csd_lock_wait(&data->csd); } EXPORT_SYMBOL(smp_call_function_many); diff --git a/kernel/softirq.c b/kernel/softirq.c index bdbe9de9cd8d..48c3d5d627a8 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -496,7 +496,7 @@ static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softir cp->flags = 0; cp->priv = softirq; - __smp_call_function_single(cpu, cp); + __smp_call_function_single(cpu, cp, 0); return 0; } return 1; -- cgit v1.2.3 From eef62a6826b8ab530cefff5aa55c1661a209c803 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 25 Feb 2009 15:49:52 -0500 Subject: tracing: rename DEFINE_TRACE_FMT to just TRACE_FORMAT There's been a bit confusion to whether DEFINE/DECLARE_TRACE_FMT should be a DEFINE or a DECLARE. Ingo Molnar suggested simply calling it TRACE_FORMAT. Reported-by: Ingo Molnar Signed-off-by: Steven Rostedt --- include/linux/tracepoint.h | 2 +- include/trace/sched_event_types.h | 26 +++++++++++++------------- kernel/trace/trace_events.h | 4 ++-- 3 files changed, 16 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 34ae464effff..3de09fa8e01d 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -153,7 +153,7 @@ static inline void tracepoint_synchronize_unregister(void) synchronize_sched(); } -#define DEFINE_TRACE_FMT(name, proto, args, fmt) \ +#define TRACE_FORMAT(name, proto, args, fmt) \ DECLARE_TRACE(name, TPPROTO(proto), TPARGS(args)) #endif diff --git a/include/trace/sched_event_types.h b/include/trace/sched_event_types.h index a4f662940f4e..a3d3d66a51c8 100644 --- a/include/trace/sched_event_types.h +++ b/include/trace/sched_event_types.h @@ -1,72 +1,72 @@ /* use instead */ -#ifndef DEFINE_TRACE_FMT +#ifndef TRACE_FORMAT # error Do not include this file directly. # error Unless you know what you are doing. 
#endif -DEFINE_TRACE_FMT(sched_kthread_stop, +TRACE_FORMAT(sched_kthread_stop, TPPROTO(struct task_struct *t), TPARGS(t), TPFMT("task %s:%d", t->comm, t->pid)); -DEFINE_TRACE_FMT(sched_kthread_stop_ret, +TRACE_FORMAT(sched_kthread_stop_ret, TPPROTO(int ret), TPARGS(ret), TPFMT("ret=%d", ret)); -DEFINE_TRACE_FMT(sched_wait_task, +TRACE_FORMAT(sched_wait_task, TPPROTO(struct rq *rq, struct task_struct *p), TPARGS(rq, p), TPFMT("task %s:%d", p->comm, p->pid)); -DEFINE_TRACE_FMT(sched_wakeup, +TRACE_FORMAT(sched_wakeup, TPPROTO(struct rq *rq, struct task_struct *p, int success), TPARGS(rq, p, success), TPFMT("task %s:%d %s", p->comm, p->pid, success?"succeeded":"failed")); -DEFINE_TRACE_FMT(sched_wakeup_new, +TRACE_FORMAT(sched_wakeup_new, TPPROTO(struct rq *rq, struct task_struct *p, int success), TPARGS(rq, p, success), TPFMT("task %s:%d", p->comm, p->pid, success?"succeeded":"failed")); -DEFINE_TRACE_FMT(sched_switch, +TRACE_FORMAT(sched_switch, TPPROTO(struct rq *rq, struct task_struct *prev, struct task_struct *next), TPARGS(rq, prev, next), TPFMT("task %s:%d ==> %s:%d", prev->comm, prev->pid, next->comm, next->pid)); -DEFINE_TRACE_FMT(sched_migrate_task, +TRACE_FORMAT(sched_migrate_task, TPPROTO(struct task_struct *p, int orig_cpu, int dest_cpu), TPARGS(p, orig_cpu, dest_cpu), TPFMT("task %s:%d from: %d to: %d", p->comm, p->pid, orig_cpu, dest_cpu)); -DEFINE_TRACE_FMT(sched_process_free, +TRACE_FORMAT(sched_process_free, TPPROTO(struct task_struct *p), TPARGS(p), TPFMT("task %s:%d", p->comm, p->pid)); -DEFINE_TRACE_FMT(sched_process_exit, +TRACE_FORMAT(sched_process_exit, TPPROTO(struct task_struct *p), TPARGS(p), TPFMT("task %s:%d", p->comm, p->pid)); -DEFINE_TRACE_FMT(sched_process_wait, +TRACE_FORMAT(sched_process_wait, TPPROTO(struct pid *pid), TPARGS(pid), TPFMT("pid %d", pid)); -DEFINE_TRACE_FMT(sched_process_fork, +TRACE_FORMAT(sched_process_fork, TPPROTO(struct task_struct *parent, struct task_struct *child), TPARGS(parent, child), TPFMT("parent %s:%d child %s:%d", parent->comm, parent->pid, child->comm, child->pid)); -DEFINE_TRACE_FMT(sched_signal_send, +TRACE_FORMAT(sched_signal_send, TPPROTO(int sig, struct task_struct *p), TPARGS(sig, p), TPFMT("sig: %d task %s:%d", sig, p->comm, p->pid)); diff --git a/kernel/trace/trace_events.h b/kernel/trace/trace_events.h index cb8455b3ac9a..deb95e5006c8 100644 --- a/kernel/trace/trace_events.h +++ b/kernel/trace/trace_events.h @@ -17,8 +17,8 @@ struct ftrace_event_call { #undef TPFMT #define TPFMT(fmt, args...) fmt "\n", ##args -#undef DEFINE_TRACE_FMT -#define DEFINE_TRACE_FMT(call, proto, args, fmt) \ +#undef TRACE_FORMAT +#define TRACE_FORMAT(call, proto, args, fmt) \ static void ftrace_event_##call(proto) \ { \ event_trace_printk(_RET_IP_, "(" #call ") " fmt); \ -- cgit v1.2.3 From 3cdfdf91fcc77cfc82592e2b5c2ab35abe819c41 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 25 Feb 2009 15:54:30 -0500 Subject: tracing: wrap arguments with PARAMS Peter Zijlstra warned that TPPROTO and TPARGS might become something other than a simple copy of itself. To prevent this from having side effects in the TRACE_FORMAT macro in tracepoint.h, we add a PARAMS() macro to be defined as just a wrapper. 
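For illustration only (editorial sketch, not part of the patch): PARAMS() is a plain variadic pass-through, so a comma-containing prototype can be forwarded to another macro as a single argument without inheriting whatever TPPROTO()/TPARGS() might later expand to. The DECLARE_PROBE() and trace_my_event names below are hypothetical.

	#define PARAMS(args...)			args
	#define DECLARE_PROBE(name, proto)	void trace_##name(proto)

	/* "int cpu, void *info" travels through PARAMS() untouched: */
	DECLARE_PROBE(my_event, PARAMS(int cpu, void *info));
	/* expands to: void trace_my_event(int cpu, void *info); */
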
Reported-by: Peter Zijlstra Signed-off-by: Steven Rostedt --- include/linux/tracepoint.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 3de09fa8e01d..62d13391a240 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -153,7 +153,8 @@ static inline void tracepoint_synchronize_unregister(void) synchronize_sched(); } +#define PARAMS(args...) args #define TRACE_FORMAT(name, proto, args, fmt) \ - DECLARE_TRACE(name, TPPROTO(proto), TPARGS(args)) + DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) #endif -- cgit v1.2.3 From 14131f2f98ac350ee9e73faed916d2238a8b6a0d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 26 Feb 2009 18:47:11 +0100 Subject: tracing: implement trace_clock_*() APIs Impact: implement new tracing timestamp APIs Add three trace clock variants, with differing scalability/precision tradeoffs: - local: CPU-local trace clock - medium: scalable global clock with some jitter - global: globally monotonic, serialized clock Make the ring-buffer use the local trace clock internally. Acked-by: Peter Zijlstra Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/trace_clock.h | 19 +++++++++ kernel/trace/Makefile | 1 + kernel/trace/ring_buffer.c | 5 +-- kernel/trace/trace_clock.c | 101 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 123 insertions(+), 3 deletions(-) create mode 100644 include/linux/trace_clock.h create mode 100644 kernel/trace/trace_clock.c (limited to 'include/linux') diff --git a/include/linux/trace_clock.h b/include/linux/trace_clock.h new file mode 100644 index 000000000000..7a8130384087 --- /dev/null +++ b/include/linux/trace_clock.h @@ -0,0 +1,19 @@ +#ifndef _LINUX_TRACE_CLOCK_H +#define _LINUX_TRACE_CLOCK_H + +/* + * 3 trace clock variants, with differing scalability/precision + * tradeoffs: + * + * - local: CPU-local trace clock + * - medium: scalable global clock with some jitter + * - global: globally monotonic, serialized clock + */ +#include +#include + +extern u64 notrace trace_clock_local(void); +extern u64 notrace trace_clock(void); +extern u64 notrace trace_clock_global(void); + +#endif /* _LINUX_TRACE_CLOCK_H */ diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 664b6c0dc75a..c931fe0560cb 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -19,6 +19,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o obj-$(CONFIG_RING_BUFFER) += ring_buffer.o obj-$(CONFIG_TRACING) += trace.o +obj-$(CONFIG_TRACING) += trace_clock.o obj-$(CONFIG_TRACING) += trace_output.o obj-$(CONFIG_TRACING) += trace_stat.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 8f19f1aa42b0..a8c275c01e83 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -4,6 +4,7 @@ * Copyright (C) 2008 Steven Rostedt */ #include +#include #include #include #include @@ -12,7 +13,6 @@ #include #include #include -#include /* used for sched_clock() (for now) */ #include #include #include @@ -112,14 +112,13 @@ EXPORT_SYMBOL_GPL(tracing_is_on); /* Up this if you want to test the TIME_EXTENTS and normalization */ #define DEBUG_SHIFT 0 -/* FIXME!!! 
*/ u64 ring_buffer_time_stamp(int cpu) { u64 time; preempt_disable_notrace(); /* shift to debug/test normalization and TIME_EXTENTS */ - time = sched_clock() << DEBUG_SHIFT; + time = trace_clock_local() << DEBUG_SHIFT; preempt_enable_no_resched_notrace(); return time; diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c new file mode 100644 index 000000000000..2d4953f93560 --- /dev/null +++ b/kernel/trace/trace_clock.c @@ -0,0 +1,101 @@ +/* + * tracing clocks + * + * Copyright (C) 2009 Red Hat, Inc., Ingo Molnar + * + * Implements 3 trace clock variants, with differing scalability/precision + * tradeoffs: + * + * - local: CPU-local trace clock + * - medium: scalable global clock with some jitter + * - global: globally monotonic, serialized clock + * + * Tracer plugins will chose a default from these clocks. + */ +#include +#include +#include +#include +#include +#include + +/* + * trace_clock_local(): the simplest and least coherent tracing clock. + * + * Useful for tracing that does not cross to other CPUs nor + * does it go through idle events. + */ +u64 notrace trace_clock_local(void) +{ + /* + * sched_clock() is an architecture implemented, fast, scalable, + * lockless clock. It is not guaranteed to be coherent across + * CPUs, nor across CPU idle events. + */ + return sched_clock(); +} + +/* + * trace_clock(): 'inbetween' trace clock. Not completely serialized, + * but not completely incorrect when crossing CPUs either. + * + * This is based on cpu_clock(), which will allow at most ~1 jiffy of + * jitter between CPUs. So it's a pretty scalable clock, but there + * can be offsets in the trace data. + */ +u64 notrace trace_clock(void) +{ + return cpu_clock(raw_smp_processor_id()); +} + + +/* + * trace_clock_global(): special globally coherent trace clock + * + * It has higher overhead than the other trace clocks but is still + * an order of magnitude faster than GTOD derived hardware clocks. + * + * Used by plugins that need globally coherent timestamps. + */ + +static u64 prev_trace_clock_time; + +static raw_spinlock_t trace_clock_lock ____cacheline_aligned_in_smp = + (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + +u64 notrace trace_clock_global(void) +{ + unsigned long flags; + int this_cpu; + u64 now; + + raw_local_irq_save(flags); + + this_cpu = raw_smp_processor_id(); + now = cpu_clock(this_cpu); + /* + * If in an NMI context then dont risk lockups and return the + * cpu_clock() time: + */ + if (unlikely(in_nmi())) + goto out; + + __raw_spin_lock(&trace_clock_lock); + + /* + * TODO: if this happens often then maybe we should reset + * my_scd->clock to prev_trace_clock_time+1, to make sure + * we start ticking with the local clock from now on? + */ + if ((s64)(now - prev_trace_clock_time) < 0) + now = prev_trace_clock_time + 1; + + prev_trace_clock_time = now; + + __raw_spin_unlock(&trace_clock_lock); + + out: + raw_local_irq_restore(flags); + + return now; +} -- cgit v1.2.3 From 629928041c53771f9902753d50fef6b35f36d33d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 28 Feb 2009 02:47:59 -0500 Subject: tracing: create the C style tracing for the sched subsystem This patch utilizes the TRACE_EVENT_FORMAT macro to enable the C style faster tracing for the sched subsystem trace points. 
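As a rough sketch of the declaration pattern this enables (editorial, not part of the patch; the event name and fields are made up, the structure mirrors the sched entries in the diff below):

	TRACE_EVENT_FORMAT(my_subsys_event,
		TPPROTO(int cpu, unsigned long len),
		TPARGS(cpu, len),
		TPFMT("cpu %d len %lu", cpu, len),
		TRACE_STRUCT(
			TRACE_FIELD(int, cpu, cpu)
			TRACE_FIELD(unsigned long, len, len)
		),
		TPRAWFMT("cpu %d len %lu")
		);
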
Signed-off-by: Steven Rostedt --- include/linux/tracepoint.h | 3 + include/trace/sched_event_types.h | 119 ++++++++++++++++++++++++++++++-------- 2 files changed, 97 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 62d13391a240..152b2f03fb86 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -157,4 +157,7 @@ static inline void tracepoint_synchronize_unregister(void) #define TRACE_FORMAT(name, proto, args, fmt) \ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) +#define TRACE_EVENT_FORMAT(name, proto, args, fmt, struct, tpfmt) \ + TRACE_FORMAT(name, PARAMS(proto), PARAMS(args), PARAMS(fmt)) + #endif diff --git a/include/trace/sched_event_types.h b/include/trace/sched_event_types.h index 2ada206565a3..ba059c10b58a 100644 --- a/include/trace/sched_event_types.h +++ b/include/trace/sched_event_types.h @@ -1,6 +1,6 @@ /* use instead */ -#ifndef TRACE_FORMAT +#ifndef TRACE_EVENT_FORMAT # error Do not include this file directly. # error Unless you know what you are doing. #endif @@ -8,70 +8,139 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM sched -TRACE_FORMAT(sched_kthread_stop, +TRACE_EVENT_FORMAT(sched_kthread_stop, TPPROTO(struct task_struct *t), TPARGS(t), - TPFMT("task %s:%d", t->comm, t->pid)); + TPFMT("task %s:%d", t->comm, t->pid), + TRACE_STRUCT( + TRACE_FIELD(pid_t, pid, t->pid) + ), + TPRAWFMT("task %d") + ); -TRACE_FORMAT(sched_kthread_stop_ret, +TRACE_EVENT_FORMAT(sched_kthread_stop_ret, TPPROTO(int ret), TPARGS(ret), - TPFMT("ret=%d", ret)); + TPFMT("ret=%d", ret), + TRACE_STRUCT( + TRACE_FIELD(int, ret, ret) + ), + TPRAWFMT("ret=%d") + ); -TRACE_FORMAT(sched_wait_task, +TRACE_EVENT_FORMAT(sched_wait_task, TPPROTO(struct rq *rq, struct task_struct *p), TPARGS(rq, p), - TPFMT("task %s:%d", p->comm, p->pid)); + TPFMT("task %s:%d", p->comm, p->pid), + TRACE_STRUCT( + TRACE_FIELD(pid_t, pid, p->pid) + ), + TPRAWFMT("task %d") + ); -TRACE_FORMAT(sched_wakeup, +TRACE_EVENT_FORMAT(sched_wakeup, TPPROTO(struct rq *rq, struct task_struct *p, int success), TPARGS(rq, p, success), TPFMT("task %s:%d %s", - p->comm, p->pid, success?"succeeded":"failed")); + p->comm, p->pid, success ? "succeeded" : "failed"), + TRACE_STRUCT( + TRACE_FIELD(pid_t, pid, p->pid) + TRACE_FIELD(int, success, success) + ), + TPRAWFMT("task %d success=%d") + ); -TRACE_FORMAT(sched_wakeup_new, +TRACE_EVENT_FORMAT(sched_wakeup_new, TPPROTO(struct rq *rq, struct task_struct *p, int success), TPARGS(rq, p, success), TPFMT("task %s:%d", - p->comm, p->pid, success?"succeeded":"failed")); + p->comm, p->pid, success ? 
"succeeded" : "failed"), + TRACE_STRUCT( + TRACE_FIELD(pid_t, pid, p->pid) + TRACE_FIELD(int, success, success) + ), + TPRAWFMT("task %d success=%d") + ); -TRACE_FORMAT(sched_switch, +TRACE_EVENT_FORMAT(sched_switch, TPPROTO(struct rq *rq, struct task_struct *prev, struct task_struct *next), TPARGS(rq, prev, next), TPFMT("task %s:%d ==> %s:%d", - prev->comm, prev->pid, next->comm, next->pid)); + prev->comm, prev->pid, next->comm, next->pid), + TRACE_STRUCT( + TRACE_FIELD(pid_t, prev_pid, prev->pid) + TRACE_FIELD(int, prev_prio, prev->prio) + TRACE_FIELD(pid_t, next_pid, next->pid) + TRACE_FIELD(int, next_prio, next->prio) + ), + TPRAWFMT("prev %d:%d ==> next %d:%d") + ); -TRACE_FORMAT(sched_migrate_task, +TRACE_EVENT_FORMAT(sched_migrate_task, TPPROTO(struct task_struct *p, int orig_cpu, int dest_cpu), TPARGS(p, orig_cpu, dest_cpu), TPFMT("task %s:%d from: %d to: %d", - p->comm, p->pid, orig_cpu, dest_cpu)); + p->comm, p->pid, orig_cpu, dest_cpu), + TRACE_STRUCT( + TRACE_FIELD(pid_t, pid, p->pid) + TRACE_FIELD(int, orig_cpu, orig_cpu) + TRACE_FIELD(int, dest_cpu, dest_cpu) + ), + TPRAWFMT("task %d from: %d to: %d") + ); -TRACE_FORMAT(sched_process_free, +TRACE_EVENT_FORMAT(sched_process_free, TPPROTO(struct task_struct *p), TPARGS(p), - TPFMT("task %s:%d", p->comm, p->pid)); + TPFMT("task %s:%d", p->comm, p->pid), + TRACE_STRUCT( + TRACE_FIELD(pid_t, pid, p->pid) + ), + TPRAWFMT("task %d") + ); -TRACE_FORMAT(sched_process_exit, +TRACE_EVENT_FORMAT(sched_process_exit, TPPROTO(struct task_struct *p), TPARGS(p), - TPFMT("task %s:%d", p->comm, p->pid)); + TPFMT("task %s:%d", p->comm, p->pid), + TRACE_STRUCT( + TRACE_FIELD(pid_t, pid, p->pid) + ), + TPRAWFMT("task %d") + ); -TRACE_FORMAT(sched_process_wait, +TRACE_EVENT_FORMAT(sched_process_wait, TPPROTO(struct pid *pid), TPARGS(pid), - TPFMT("pid %d", pid)); + TPFMT("pid %d", pid_nr(pid)), + TRACE_STRUCT( + TRACE_FIELD(pid_t, pid, pid_nr(pid)) + ), + TPRAWFMT("task %d") + ); -TRACE_FORMAT(sched_process_fork, +TRACE_EVENT_FORMAT(sched_process_fork, TPPROTO(struct task_struct *parent, struct task_struct *child), TPARGS(parent, child), TPFMT("parent %s:%d child %s:%d", - parent->comm, parent->pid, child->comm, child->pid)); + parent->comm, parent->pid, child->comm, child->pid), + TRACE_STRUCT( + TRACE_FIELD(pid_t, parent, parent->pid) + TRACE_FIELD(pid_t, child, child->pid) + ), + TPRAWFMT("parent %d child %d") + ); -TRACE_FORMAT(sched_signal_send, +TRACE_EVENT_FORMAT(sched_signal_send, TPPROTO(int sig, struct task_struct *p), TPARGS(sig, p), - TPFMT("sig: %d task %s:%d", sig, p->comm, p->pid)); + TPFMT("sig: %d task %s:%d", sig, p->comm, p->pid), + TRACE_STRUCT( + TRACE_FIELD(int, sig, sig) + TRACE_FIELD(pid_t, pid, p->pid) + ), + TPRAWFMT("sig: %d task %d") + ); #undef TRACE_SYSTEM -- cgit v1.2.3 From c79a61f55773d2519fd0525bf58385f7d20752d3 Mon Sep 17 00:00:00 2001 From: Uwe Kleine-Koenig Date: Fri, 27 Feb 2009 21:30:03 +0100 Subject: tracing: make CALLER_ADDRx overwriteable The current definition of CALLER_ADDRx isn't suitable for all platforms. E.g. for ARM __builtin_return_address(N) doesn't work for N > 0 and AFAIK for powerpc there are no frame pointers needed to have a working __builtin_return_address. This patch allows defining the CALLER_ADDRx macros in and let these take precedence. Because now is included unconditionally in all archs that don't already had this include get an empty one for free. 
Signed-off-by: Uwe Kleine-Koenig Cc: Peter Zijlstra Cc: Ingo Molnar Reviewed-by: KOSAKI Motohiro Signed-off-by: Steven Rostedt --- arch/alpha/include/asm/ftrace.h | 1 + arch/avr32/include/asm/ftrace.h | 1 + arch/blackfin/include/asm/ftrace.h | 1 + arch/cris/include/asm/ftrace.h | 1 + arch/h8300/include/asm/ftrace.h | 1 + arch/m68k/include/asm/ftrace.h | 1 + arch/mips/include/asm/ftrace.h | 1 + arch/parisc/include/asm/ftrace.h | 1 + arch/um/include/asm/ftrace.h | 1 + arch/xtensa/include/asm/ftrace.h | 1 + include/asm-frv/ftrace.h | 1 + include/asm-m32r/ftrace.h | 1 + include/asm-mn10300/ftrace.h | 1 + include/linux/ftrace.h | 41 +++++++++++++++++++------------------- 14 files changed, 34 insertions(+), 20 deletions(-) create mode 100644 arch/alpha/include/asm/ftrace.h create mode 100644 arch/avr32/include/asm/ftrace.h create mode 100644 arch/blackfin/include/asm/ftrace.h create mode 100644 arch/cris/include/asm/ftrace.h create mode 100644 arch/h8300/include/asm/ftrace.h create mode 100644 arch/m68k/include/asm/ftrace.h create mode 100644 arch/mips/include/asm/ftrace.h create mode 100644 arch/parisc/include/asm/ftrace.h create mode 100644 arch/um/include/asm/ftrace.h create mode 100644 arch/xtensa/include/asm/ftrace.h create mode 100644 include/asm-frv/ftrace.h create mode 100644 include/asm-m32r/ftrace.h create mode 100644 include/asm-mn10300/ftrace.h (limited to 'include/linux') diff --git a/arch/alpha/include/asm/ftrace.h b/arch/alpha/include/asm/ftrace.h new file mode 100644 index 000000000000..40a8c178f10d --- /dev/null +++ b/arch/alpha/include/asm/ftrace.h @@ -0,0 +1 @@ +/* empty */ diff --git a/arch/avr32/include/asm/ftrace.h b/arch/avr32/include/asm/ftrace.h new file mode 100644 index 000000000000..40a8c178f10d --- /dev/null +++ b/arch/avr32/include/asm/ftrace.h @@ -0,0 +1 @@ +/* empty */ diff --git a/arch/blackfin/include/asm/ftrace.h b/arch/blackfin/include/asm/ftrace.h new file mode 100644 index 000000000000..40a8c178f10d --- /dev/null +++ b/arch/blackfin/include/asm/ftrace.h @@ -0,0 +1 @@ +/* empty */ diff --git a/arch/cris/include/asm/ftrace.h b/arch/cris/include/asm/ftrace.h new file mode 100644 index 000000000000..40a8c178f10d --- /dev/null +++ b/arch/cris/include/asm/ftrace.h @@ -0,0 +1 @@ +/* empty */ diff --git a/arch/h8300/include/asm/ftrace.h b/arch/h8300/include/asm/ftrace.h new file mode 100644 index 000000000000..40a8c178f10d --- /dev/null +++ b/arch/h8300/include/asm/ftrace.h @@ -0,0 +1 @@ +/* empty */ diff --git a/arch/m68k/include/asm/ftrace.h b/arch/m68k/include/asm/ftrace.h new file mode 100644 index 000000000000..40a8c178f10d --- /dev/null +++ b/arch/m68k/include/asm/ftrace.h @@ -0,0 +1 @@ +/* empty */ diff --git a/arch/mips/include/asm/ftrace.h b/arch/mips/include/asm/ftrace.h new file mode 100644 index 000000000000..40a8c178f10d --- /dev/null +++ b/arch/mips/include/asm/ftrace.h @@ -0,0 +1 @@ +/* empty */ diff --git a/arch/parisc/include/asm/ftrace.h b/arch/parisc/include/asm/ftrace.h new file mode 100644 index 000000000000..40a8c178f10d --- /dev/null +++ b/arch/parisc/include/asm/ftrace.h @@ -0,0 +1 @@ +/* empty */ diff --git a/arch/um/include/asm/ftrace.h b/arch/um/include/asm/ftrace.h new file mode 100644 index 000000000000..40a8c178f10d --- /dev/null +++ b/arch/um/include/asm/ftrace.h @@ -0,0 +1 @@ +/* empty */ diff --git a/arch/xtensa/include/asm/ftrace.h b/arch/xtensa/include/asm/ftrace.h new file mode 100644 index 000000000000..40a8c178f10d --- /dev/null +++ b/arch/xtensa/include/asm/ftrace.h @@ -0,0 +1 @@ +/* empty */ diff --git 
a/include/asm-frv/ftrace.h b/include/asm-frv/ftrace.h new file mode 100644 index 000000000000..40a8c178f10d --- /dev/null +++ b/include/asm-frv/ftrace.h @@ -0,0 +1 @@ +/* empty */ diff --git a/include/asm-m32r/ftrace.h b/include/asm-m32r/ftrace.h new file mode 100644 index 000000000000..40a8c178f10d --- /dev/null +++ b/include/asm-m32r/ftrace.h @@ -0,0 +1 @@ +/* empty */ diff --git a/include/asm-mn10300/ftrace.h b/include/asm-mn10300/ftrace.h new file mode 100644 index 000000000000..40a8c178f10d --- /dev/null +++ b/include/asm-mn10300/ftrace.h @@ -0,0 +1 @@ +/* empty */ diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 847bb3c48dd0..1f69ac7c1587 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -11,6 +11,8 @@ #include #include +#include + #ifdef CONFIG_FUNCTION_TRACER extern int ftrace_enabled; @@ -103,8 +105,6 @@ struct ftrace_func_command { }; #ifdef CONFIG_DYNAMIC_FTRACE -/* asm/ftrace.h must be defined for archs supporting dynamic ftrace */ -#include int ftrace_arch_code_modify_prepare(void); int ftrace_arch_code_modify_post_process(void); @@ -282,24 +282,25 @@ static inline void __ftrace_enabled_restore(int enabled) #endif } -#ifdef CONFIG_FRAME_POINTER -/* TODO: need to fix this for ARM */ -# define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) -# define CALLER_ADDR1 ((unsigned long)__builtin_return_address(1)) -# define CALLER_ADDR2 ((unsigned long)__builtin_return_address(2)) -# define CALLER_ADDR3 ((unsigned long)__builtin_return_address(3)) -# define CALLER_ADDR4 ((unsigned long)__builtin_return_address(4)) -# define CALLER_ADDR5 ((unsigned long)__builtin_return_address(5)) -# define CALLER_ADDR6 ((unsigned long)__builtin_return_address(6)) -#else -# define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) -# define CALLER_ADDR1 0UL -# define CALLER_ADDR2 0UL -# define CALLER_ADDR3 0UL -# define CALLER_ADDR4 0UL -# define CALLER_ADDR5 0UL -# define CALLER_ADDR6 0UL -#endif +#ifndef HAVE_ARCH_CALLER_ADDR +# ifdef CONFIG_FRAME_POINTER +# define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) +# define CALLER_ADDR1 ((unsigned long)__builtin_return_address(1)) +# define CALLER_ADDR2 ((unsigned long)__builtin_return_address(2)) +# define CALLER_ADDR3 ((unsigned long)__builtin_return_address(3)) +# define CALLER_ADDR4 ((unsigned long)__builtin_return_address(4)) +# define CALLER_ADDR5 ((unsigned long)__builtin_return_address(5)) +# define CALLER_ADDR6 ((unsigned long)__builtin_return_address(6)) +# else +# define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) +# define CALLER_ADDR1 0UL +# define CALLER_ADDR2 0UL +# define CALLER_ADDR3 0UL +# define CALLER_ADDR4 0UL +# define CALLER_ADDR5 0UL +# define CALLER_ADDR6 0UL +# endif +#endif /* ifndef HAVE_ARCH_CALLER_ADDR */ #ifdef CONFIG_IRQSOFF_TRACER extern void time_hardirqs_on(unsigned long a0, unsigned long a1); -- cgit v1.2.3 From ef7a4a161472b952941bf78855a9cd95703c024e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 3 Mar 2009 00:27:49 -0500 Subject: ring-buffer: fix ring_buffer_read_page The ring_buffer_read_page was broken if it were to only copy part of the page. This patch fixes that up as well as adds a parameter to allow a length field, in order to only copy part of the buffer page. 
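For illustration only (editorial sketch mirroring the kerneldoc example in the diff below): a consumer reading up to one page of data from a given CPU's buffer with the new length parameter. process_page() stands in for the caller's own handler and is hypothetical.

	static int read_one_page(struct ring_buffer *buffer, int cpu)
	{
		void *rpage = ring_buffer_alloc_read_page(buffer);
		int ret;

		if (!rpage)
			return -ENOMEM;

		ret = ring_buffer_read_page(buffer, &rpage, PAGE_SIZE, cpu, 0);
		if (ret >= 0)
			process_page(rpage, ret);	/* caller-supplied handler (hypothetical) */

		ring_buffer_free_read_page(buffer, rpage);
		return ret;
	}
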
Signed-off-by: Steven Rostedt --- include/linux/ring_buffer.h | 7 +++- kernel/trace/ring_buffer.c | 92 +++++++++++++++++++++++++++++---------------- 2 files changed, 64 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index f5e793d69bd3..79fcbc4b09d6 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -121,6 +121,9 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu); u64 ring_buffer_time_stamp(int cpu); void ring_buffer_normalize_time_stamp(int cpu, u64 *ts); +size_t ring_buffer_page_len(void *page); + + /* * The below functions are fine to use outside the tracing facility. */ @@ -138,8 +141,8 @@ static inline int tracing_is_on(void) { return 0; } void *ring_buffer_alloc_read_page(struct ring_buffer *buffer); void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data); -int ring_buffer_read_page(struct ring_buffer *buffer, - void **data_page, int cpu, int full); +int ring_buffer_read_page(struct ring_buffer *buffer, void **data_page, + size_t len, int cpu, int full); enum ring_buffer_flags { RB_FL_OVERWRITE = 1 << 0, diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 9baad7ee4b36..2ad6bae95a3d 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -234,6 +234,11 @@ static void rb_init_page(struct buffer_data_page *bpage) local_set(&bpage->commit, 0); } +size_t ring_buffer_page_len(void *page) +{ + return local_read(&((struct buffer_data_page *)page)->commit); +} + /* * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing * this issue out. @@ -2378,8 +2383,8 @@ static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer, */ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer) { - unsigned long addr; struct buffer_data_page *bpage; + unsigned long addr; addr = __get_free_page(GFP_KERNEL); if (!addr) @@ -2387,6 +2392,8 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer) bpage = (void *)addr; + rb_init_page(bpage); + return bpage; } @@ -2406,6 +2413,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data) * ring_buffer_read_page - extract a page from the ring buffer * @buffer: buffer to extract from * @data_page: the page to use allocated from ring_buffer_alloc_read_page + * @len: amount to extract * @cpu: the cpu of the buffer to extract * @full: should the extraction only happen when the page is full. * @@ -2418,7 +2426,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data) * rpage = ring_buffer_alloc_read_page(buffer); * if (!rpage) * return error; - * ret = ring_buffer_read_page(buffer, &rpage, cpu, 0); + * ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0); * if (ret >= 0) * process_page(rpage, ret); * @@ -2435,71 +2443,89 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data) * <0 if no data has been transferred. 
*/ int ring_buffer_read_page(struct ring_buffer *buffer, - void **data_page, int cpu, int full) + void **data_page, size_t len, int cpu, int full) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; struct ring_buffer_event *event; struct buffer_data_page *bpage; + struct buffer_page *reader; unsigned long flags; + unsigned int commit; unsigned int read; int ret = -1; if (!data_page) - return 0; + return -1; bpage = *data_page; if (!bpage) - return 0; + return -1; spin_lock_irqsave(&cpu_buffer->reader_lock, flags); - /* - * rb_buffer_peek will get the next ring buffer if - * the current reader page is empty. - */ - event = rb_buffer_peek(buffer, cpu, NULL); - if (!event) + reader = rb_get_reader_page(cpu_buffer); + if (!reader) goto out; - /* check for data */ - if (!local_read(&cpu_buffer->reader_page->page->commit)) - goto out; + event = rb_reader_event(cpu_buffer); + + read = reader->read; + commit = rb_page_commit(reader); - read = cpu_buffer->reader_page->read; /* - * If the writer is already off of the read page, then simply - * switch the read page with the given page. Otherwise - * we need to copy the data from the reader to the writer. + * If len > what's left on the page, and the writer is also off of + * the read page, then simply switch the read page with the given + * page. Otherwise we need to copy the data from the reader to the + * writer. */ - if (cpu_buffer->reader_page == cpu_buffer->commit_page) { - unsigned int commit = rb_page_commit(cpu_buffer->reader_page); + if ((len < (commit - read)) || + cpu_buffer->reader_page == cpu_buffer->commit_page) { struct buffer_data_page *rpage = cpu_buffer->reader_page->page; + unsigned int pos = read; + unsigned int size; if (full) goto out; - /* The writer is still on the reader page, we must copy */ - memcpy(bpage->data + read, rpage->data + read, commit - read); - /* consume what was read */ - cpu_buffer->reader_page->read = commit; + if (len > (commit - read)) + len = (commit - read); + + size = rb_event_length(event); + + if (len < size) + goto out; + + /* Need to copy one event at a time */ + do { + memcpy(bpage->data + pos, rpage->data + pos, size); + + len -= size; + + rb_advance_reader(cpu_buffer); + pos = reader->read; + + event = rb_reader_event(cpu_buffer); + size = rb_event_length(event); + } while (len > size); /* update bpage */ - local_set(&bpage->commit, commit); - if (!read) - bpage->time_stamp = rpage->time_stamp; + local_set(&bpage->commit, pos); + bpage->time_stamp = rpage->time_stamp; + } else { /* swap the pages */ rb_init_page(bpage); - bpage = cpu_buffer->reader_page->page; - cpu_buffer->reader_page->page = *data_page; - local_set(&cpu_buffer->reader_page->write, 0); - cpu_buffer->reader_page->read = 0; + bpage = reader->page; + reader->page = *data_page; + local_set(&reader->write, 0); + reader->read = 0; *data_page = bpage; + + /* update the entry counter */ + rb_remove_entries(cpu_buffer, bpage, read); } ret = read; - /* update the entry counter */ - rb_remove_entries(cpu_buffer, bpage, read); out: spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); -- cgit v1.2.3 From 5e1607a00bd082972629d3d68c95c8bcf902b55a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 5 Mar 2009 10:24:48 +0100 Subject: tracing: rename ftrace_printk() => trace_printk() Impact: cleanup Use a more generic name - this also allows the prototype to move to kernel.h and be generally available to kernel developers who want to do some quick tracing. 
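For illustration only (editorial sketch): the intended quick-debugging use after the rename. my_driver_poll() is a made-up function, and at this point the prototype still lives in linux/ftrace.h (a later patch below moves it to kernel.h).

	#include <linux/ftrace.h>

	static void my_driver_poll(int budget)
	{
		/* lightweight annotation in a fast path where printk() is too heavy */
		trace_printk("polling, budget=%d\n", budget);
		/* ... fast-path work ... */
	}
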
Signed-off-by: Ingo Molnar --- Documentation/ftrace.txt | 6 +++--- include/linux/ftrace.h | 18 +++++++++--------- kernel/trace/trace.c | 8 ++++---- kernel/trace/trace.h | 2 +- 4 files changed, 17 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/Documentation/ftrace.txt b/Documentation/ftrace.txt index 2041ee951c1a..22614bef6359 100644 --- a/Documentation/ftrace.txt +++ b/Documentation/ftrace.txt @@ -1466,11 +1466,11 @@ want, depending on your needs. You can put some comments on specific functions by using -ftrace_printk() For example, if you want to put a comment inside +trace_printk() For example, if you want to put a comment inside the __might_sleep() function, you just have to include - and call ftrace_printk() inside __might_sleep() + and call trace_printk() inside __might_sleep() -ftrace_printk("I'm a comment!\n") +trace_printk("I'm a comment!\n") will produce: diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 1f69ac7c1587..fbb9c364e166 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -329,11 +329,11 @@ extern void ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); /** - * ftrace_printk - printf formatting in the ftrace buffer + * trace_printk - printf formatting in the ftrace buffer * @fmt: the printf format for printing * - * Note: __ftrace_printk is an internal function for ftrace_printk and - * the @ip is passed in via the ftrace_printk macro. + * Note: __trace_printk is an internal function for trace_printk and + * the @ip is passed in via the trace_printk macro. * * This function allows a kernel developer to debug fast path sections * that printk is not appropriate for. By scattering in various @@ -341,14 +341,14 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); * where problems are occurring. * * This is intended as a debugging tool for the developer only. - * Please refrain from leaving ftrace_printks scattered around in + * Please refrain from leaving trace_printks scattered around in * your code. */ -# define ftrace_printk(fmt...) __ftrace_printk(_THIS_IP_, fmt) +# define trace_printk(fmt...) __trace_printk(_THIS_IP_, fmt) extern int -__ftrace_printk(unsigned long ip, const char *fmt, ...) +__trace_printk(unsigned long ip, const char *fmt, ...) __attribute__ ((format (printf, 2, 3))); -# define ftrace_vprintk(fmt, ap) __ftrace_printk(_THIS_IP_, fmt, ap) +# define ftrace_vprintk(fmt, ap) __trace_printk(_THIS_IP_, fmt, ap) extern int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap); extern void ftrace_dump(void); @@ -356,13 +356,13 @@ extern void ftrace_dump(void); static inline void ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { } static inline int -ftrace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 2))); +trace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 2))); static inline void tracing_start(void) { } static inline void tracing_stop(void) { } static inline void ftrace_off_permanent(void) { } static inline int -ftrace_printk(const char *fmt, ...) +trace_printk(const char *fmt, ...) { return 0; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d1ef43999d9e..c0e9c1263393 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -48,7 +48,7 @@ unsigned long __read_mostly tracing_thresh; * We need to change this state when a selftest is running. 
* A selftest will lurk into the ring-buffer to count the * entries inserted during the selftest although some concurrent - * insertions into the ring-buffer such as ftrace_printk could occurred + * insertions into the ring-buffer such as trace_printk could occurred * at the same time, giving false positive or negative results. */ static bool __read_mostly tracing_selftest_running; @@ -291,7 +291,7 @@ static const char *trace_options[] = { "block", "stacktrace", "sched-tree", - "ftrace_printk", + "trace_printk", "ftrace_preempt", "branch", "annotate", @@ -3768,7 +3768,7 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) } EXPORT_SYMBOL_GPL(trace_vprintk); -int __ftrace_printk(unsigned long ip, const char *fmt, ...) +int __trace_printk(unsigned long ip, const char *fmt, ...) { int ret; va_list ap; @@ -3781,7 +3781,7 @@ int __ftrace_printk(unsigned long ip, const char *fmt, ...) va_end(ap); return ret; } -EXPORT_SYMBOL_GPL(__ftrace_printk); +EXPORT_SYMBOL_GPL(__trace_printk); int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 12cd119cca32..8beff03fda68 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -115,7 +115,7 @@ struct userstack_entry { }; /* - * ftrace_printk entry: + * trace_printk entry: */ struct print_entry { struct trace_entry ent; -- cgit v1.2.3 From 526211bc58c4b3265352801c5a7f469af5c34711 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 5 Mar 2009 10:28:45 +0100 Subject: tracing: move utility functions from ftrace.h to kernel.h Make common utility functions such as trace_printk() and tracing_start()/tracing_stop() generally available to kernel code. Cc: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 58 ++------------------------------------------------ include/linux/kernel.h | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+), 56 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index fbb9c364e166..5b64303ec9f2 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -318,62 +318,6 @@ static inline void __ftrace_enabled_restore(int enabled) # define trace_preempt_off(a0, a1) do { } while (0) #endif -#ifdef CONFIG_TRACING -extern int ftrace_dump_on_oops; - -extern void tracing_start(void); -extern void tracing_stop(void); -extern void ftrace_off_permanent(void); - -extern void -ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); - -/** - * trace_printk - printf formatting in the ftrace buffer - * @fmt: the printf format for printing - * - * Note: __trace_printk is an internal function for trace_printk and - * the @ip is passed in via the trace_printk macro. - * - * This function allows a kernel developer to debug fast path sections - * that printk is not appropriate for. By scattering in various - * printk like tracing in the code, a developer can quickly see - * where problems are occurring. - * - * This is intended as a debugging tool for the developer only. - * Please refrain from leaving trace_printks scattered around in - * your code. - */ -# define trace_printk(fmt...) __trace_printk(_THIS_IP_, fmt) -extern int -__trace_printk(unsigned long ip, const char *fmt, ...) 
- __attribute__ ((format (printf, 2, 3))); -# define ftrace_vprintk(fmt, ap) __trace_printk(_THIS_IP_, fmt, ap) -extern int -__ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap); -extern void ftrace_dump(void); -#else -static inline void -ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { } -static inline int -trace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 2))); - -static inline void tracing_start(void) { } -static inline void tracing_stop(void) { } -static inline void ftrace_off_permanent(void) { } -static inline int -trace_printk(const char *fmt, ...) -{ - return 0; -} -static inline int -ftrace_vprintk(const char *fmt, va_list ap) -{ - return 0; -} -static inline void ftrace_dump(void) { } -#endif - #ifdef CONFIG_FTRACE_MCOUNT_RECORD extern void ftrace_init(void); extern void ftrace_init_module(struct module *mod, @@ -542,6 +486,8 @@ static inline int test_tsk_trace_graph(struct task_struct *tsk) return tsk->trace & TSK_TRACE_FL_GRAPH; } +extern int ftrace_dump_on_oops; + #endif /* CONFIG_TRACING */ diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 7fa371898e3e..08bf5da86676 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -367,6 +367,64 @@ static inline char *pack_hex_byte(char *buf, u8 byte) ({ if (0) printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); 0; }) #endif +/* + * General tracing related utility functions - trace_printk(), + * tracing_start()/tracing_stop: + */ +#ifdef CONFIG_TRACING +extern void tracing_start(void); +extern void tracing_stop(void); +extern void ftrace_off_permanent(void); + +extern void +ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); + +/** + * trace_printk - printf formatting in the ftrace buffer + * @fmt: the printf format for printing + * + * Note: __trace_printk is an internal function for trace_printk and + * the @ip is passed in via the trace_printk macro. + * + * This function allows a kernel developer to debug fast path sections + * that printk is not appropriate for. By scattering in various + * printk like tracing in the code, a developer can quickly see + * where problems are occurring. + * + * This is intended as a debugging tool for the developer only. + * Please refrain from leaving trace_printks scattered around in + * your code. + */ +# define trace_printk(fmt...) __trace_printk(_THIS_IP_, fmt) +extern int +__trace_printk(unsigned long ip, const char *fmt, ...) + __attribute__ ((format (printf, 2, 3))); +# define ftrace_vprintk(fmt, ap) __trace_printk(_THIS_IP_, fmt, ap) +extern int +__ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap); +extern void ftrace_dump(void); +#else +static inline void +ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { } +static inline int +trace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 2))); + +static inline void tracing_start(void) { } +static inline void tracing_stop(void) { } +static inline void ftrace_off_permanent(void) { } +static inline int +trace_printk(const char *fmt, ...) +{ + return 0; +} +static inline int +ftrace_vprintk(const char *fmt, va_list ap) +{ + return 0; +} +static inline void ftrace_dump(void) { } +#endif + /* * Display an IP address in readable format. 
*/ -- cgit v1.2.3 From 0012693ad4f636c720fed3802027f9427962f540 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 5 Mar 2009 01:49:22 +0100 Subject: tracing/function-graph-tracer: use the more lightweight local clock Impact: decrease hangs risks with the graph tracer on slow systems Since the function graph tracer can spend too much time on timer interrupts, it's better now to use the more lightweight local clock. Anyway, the function graph traces are more reliable on a per cpu trace. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Peter Zijlstra LKML-Reference: <49af243d.06e9300a.53ad.ffff840c@mx.google.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ftrace.c | 2 +- include/linux/ftrace.h | 13 +++++++------ kernel/trace/trace_functions_graph.c | 2 +- 3 files changed, 9 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 3925ec0184b1..a85da1764b1c 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -436,7 +436,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) return; } - calltime = cpu_clock(raw_smp_processor_id()); + calltime = trace_clock_local(); if (ftrace_push_return_trace(old, calltime, self_addr, &trace.depth) == -EBUSY) { diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 1f69ac7c1587..6ea62acbe4b6 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -1,15 +1,16 @@ #ifndef _LINUX_FTRACE_H #define _LINUX_FTRACE_H -#include -#include -#include -#include -#include -#include +#include #include +#include #include +#include +#include #include +#include +#include +#include #include diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index c009553a8e81..e527f2f66c73 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -112,7 +112,7 @@ unsigned long ftrace_return_to_handler(void) unsigned long ret; ftrace_pop_return_trace(&trace, &ret); - trace.rettime = cpu_clock(raw_smp_processor_id()); + trace.rettime = trace_clock_local(); ftrace_graph_return(&trace); if (unlikely(!ret)) { -- cgit v1.2.3 From 2002c258faaa8f89543df284fdbaa9e4b171547f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 5 Mar 2009 10:35:56 -0500 Subject: tracing: add tracing_on/tracing_off to kernel.h Impact: cleanup The functions tracing_start/tracing_stop have been moved to kernel.h. These are not the functions a developer most likely wants to use when they want to insert a place to stop tracing and restart it from user space. tracing_start/tracing_stop was created to work with things like suspend to ram, where even calling smp_processor_id() can crash the system. The tracing_start/tracing_stop was used to stop the tracer from doing anything. These are still light weight functions, but add a bit more overhead to be able to stop the tracers. They also have no interface back to userland. That is, if the kernel calls tracing_stop, userland can not start tracing. What a developer most likely wants to use is tracing_on/tracing_off. These are very light weight functions (simply sets or clears a bit). These functions just stop recording into the ring buffer. The tracers don't even know that this happens except that they would receive NULL from the ring_buffer_lock_reserve function. Also, there's a way for the user land to enable or disable this bit. 
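A minimal sketch of the intended pattern (the driver function and its variables
are hypothetical; only trace_printk(), tracing_off() and the tracing_on file
come from this series):

#include <linux/kernel.h>

/* Hypothetical fast-path check in a driver. */
static void check_seqno(unsigned int seen, unsigned int expected)
{
        if (seen != expected) {
                /*
                 * Record the anomaly, then freeze the ring buffer so the
                 * events leading up to it survive for inspection.
                 */
                trace_printk("seqno anomaly: seen=%u expected=%u\n",
                             seen, expected);
                tracing_off();
        }
}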
In debugfs/tracing/tracing_on, a user may echo "0" (same as tracing_off()) or echo "1" (same as tracing_on()) into this file. This becomes handy when a kernel developer is debugging and wants tracing to turn off when it hits an anomaly. Then the developer can examine the trace, and restart tracing if they want to try again (echo 1 > tracing_on). This patch moves the prototypes for tracing_on/tracing_off to kernel.h and comments their use, so that a kernel developer will know how to use them. Signed-off-by: Steven Rostedt --- include/linux/kernel.h | 29 ++++++++++++++++++++++++++++- include/linux/ring_buffer.h | 15 --------------- 2 files changed, 28 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 08bf5da86676..d4614a8a034b 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -369,8 +369,35 @@ static inline char *pack_hex_byte(char *buf, u8 byte) /* * General tracing related utility functions - trace_printk(), - * tracing_start()/tracing_stop: + * tracing_on/tracing_off and tracing_start()/tracing_stop + * + * Use tracing_on/tracing_off when you want to quickly turn on or off + * tracing. It simply enables or disables the recording of the trace events. + * This also corresponds to the user space debugfs/tracing/tracing_on + * file, which gives a means for the kernel and userspace to interact. + * Place a tracing_off() in the kernel where you want tracing to end. + * From user space, examine the trace, and then echo 1 > tracing_on + * to continue tracing. + * + * tracing_stop/tracing_start has slightly more overhead. It is used + * by things like suspend to ram where disabling the recording of the + * trace is not enough, but tracing must actually stop because things + * like calling smp_processor_id() may crash the system. + * + * Most likely, you want to use tracing_on/tracing_off. */ +#ifdef CONFIG_RING_BUFFER +void tracing_on(void); +void tracing_off(void); +/* trace_off_permanent stops recording with no way to bring it back */ +void tracing_off_permanent(void); +int tracing_is_on(void); +#else +static inline void tracing_on(void) { } +static inline void tracing_off(void) { } +static inline void tracing_off_permanent(void) { } +static inline int tracing_is_on(void) { return 0; } +#endif #ifdef CONFIG_TRACING extern void tracing_start(void); extern void tracing_stop(void); diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 79fcbc4b09d6..b1a0068a5557 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -124,21 +124,6 @@ void ring_buffer_normalize_time_stamp(int cpu, u64 *ts); size_t ring_buffer_page_len(void *page); -/* - * The below functions are fine to use outside the tracing facility. 
- */ -#ifdef CONFIG_RING_BUFFER -void tracing_on(void); -void tracing_off(void); -void tracing_off_permanent(void); -int tracing_is_on(void); -#else -static inline void tracing_on(void) { } -static inline void tracing_off(void) { } -static inline void tracing_off_permanent(void) { } -static inline int tracing_is_on(void) { return 0; } -#endif - void *ring_buffer_alloc_read_page(struct ring_buffer *buffer); void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data); int ring_buffer_read_page(struct ring_buffer *buffer, void **data_page, -- cgit v1.2.3 From 0e39ac444636ff5be39b26f1cb56d79594654dda Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 6 Mar 2009 10:35:52 -0500 Subject: tracing, Text Edit Lock - Architecture Independent Code This is an architecture independant synchronization around kernel text modifications through use of a global mutex. A mutex has been chosen so that kprobes, the main user of this, can sleep during memory allocation between the memory read of the instructions it must replace and the memory write of the breakpoint. Other user of this interface: immediate values. Paravirt and alternatives are always done when SMP is inactive, so there is no need to use locks. Signed-off-by: Mathieu Desnoyers LKML-Reference: <49B142D8.7020601@redhat.com> Signed-off-by: Ingo Molnar --- include/linux/memory.h | 6 ++++++ mm/memory.c | 10 ++++++++++ 2 files changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memory.h b/include/linux/memory.h index 3fdc10806d31..86a6c0f0518d 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -99,4 +99,10 @@ enum mem_add_context { BOOT, HOTPLUG }; #define hotplug_memory_notifier(fn, pri) do { } while (0) #endif +/* + * Kernel text modification mutex, used for code patching. Users of this lock + * can sleep. + */ +extern struct mutex text_mutex; + #endif /* _LINUX_MEMORY_H_ */ diff --git a/mm/memory.c b/mm/memory.c index baa999e87cd2..05fab3bc5b4b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -48,6 +48,8 @@ #include #include #include +#include +#include #include #include #include @@ -99,6 +101,14 @@ int randomize_va_space __read_mostly = 2; #endif +/* + * mutex protecting text section modification (dynamic code patching). + * some users need to sleep (allocating memory...) while they hold this lock. + * + * NOT exported to modules - patching kernel text is a really delicate matter. + */ +DEFINE_MUTEX(text_mutex); + static int __init disable_randmaps(char *s) { randomize_va_space = 0; -- cgit v1.2.3 From 4370aa4aa75391a5e2e06bccb0919109f725ed8e Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 6 Mar 2009 17:21:46 +0100 Subject: vsprintf: add binary printf Impact: add new APIs for binary trace printk infrastructure vbin_printf(): write args to binary buffer, string is copied when "%s" is occurred. 
bstr_printf(): read from binary buffer for args and format a string [fweisbec@gmail.com: rebase] Signed-off-by: Lai Jiangshan Signed-off-by: Frederic Weisbecker Cc: Linus Torvalds LKML-Reference: <1236356510-8381-2-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/string.h | 7 + lib/Kconfig | 3 + lib/vsprintf.c | 442 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 452 insertions(+) (limited to 'include/linux') diff --git a/include/linux/string.h b/include/linux/string.h index d18fc198aa2f..27ac31784ad2 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -10,6 +10,7 @@ #include /* for inline */ #include /* for size_t */ #include /* for NULL */ +#include extern char *strndup_user(const char __user *, long); @@ -111,6 +112,12 @@ extern void argv_free(char **argv); extern bool sysfs_streq(const char *s1, const char *s2); +#ifdef CONFIG_BINARY_PRINTF +int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args); +int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf); +int bprintf(u32 *bin_buf, size_t size, const char *fmt, ...) __printf(3, 4); +#endif + extern ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, const void *from, size_t available); diff --git a/lib/Kconfig b/lib/Kconfig index 03c2c24b9083..97d62cf091a7 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -2,6 +2,9 @@ # Library configuration # +config BINARY_PRINTF + def_bool n + menu "Library routines" config BITREVERSE diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 0fbd0121d91d..3543bbe8b1bc 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -1058,6 +1058,448 @@ int sprintf(char * buf, const char *fmt, ...) } EXPORT_SYMBOL(sprintf); +#ifdef CONFIG_BINARY_PRINTF +/* + * bprintf service: + * vbin_printf() - VA arguments to binary data + * bstr_printf() - Binary data to text string + */ + +/** + * vbin_printf - Parse a format string and place args' binary value in a buffer + * @bin_buf: The buffer to place args' binary value + * @size: The size of the buffer(by words(32bits), not characters) + * @fmt: The format string to use + * @args: Arguments for the format string + * + * The format follows C99 vsnprintf, except %n is ignored, and its argument + * is skiped. + * + * The return value is the number of words(32bits) which would be generated for + * the given input. + * + * NOTE: + * If the return value is greater than @size, the resulting bin_buf is NOT + * valid for bstr_printf(). 
+ */ +int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args) +{ + char *str, *end; + int qualifier; + + str = (char *)bin_buf; + end = (char *)(bin_buf + size); + +#define save_arg(type) \ +do { \ + if (sizeof(type) == 8) { \ + unsigned long long value; \ + str = PTR_ALIGN(str, sizeof(u32)); \ + value = va_arg(args, unsigned long long); \ + if (str + sizeof(type) <= end) { \ + *(u32 *)str = *(u32 *)&value; \ + *(u32 *)(str + 4) = *((u32 *)&value + 1); \ + } \ + } else { \ + unsigned long value; \ + str = PTR_ALIGN(str, sizeof(type)); \ + value = va_arg(args, int); \ + if (str + sizeof(type) <= end) \ + *(typeof(type) *)str = (type)value; \ + } \ + str += sizeof(type); \ +} while (0) + + for (; *fmt ; ++fmt) { + if (*fmt != '%') + continue; + +repeat: + /* parse flags */ + ++fmt; /* this also skips first '%' */ + if (*fmt == '-' || *fmt == '+' || *fmt == ' ' + || *fmt == '#' || *fmt == '0') + goto repeat; + + /* parse field width */ + if (isdigit(*fmt)) + skip_atoi(&fmt); + else if (*fmt == '*') { + ++fmt; + /* it's the next argument */ + save_arg(int); + } + + /* parse the precision */ + if (*fmt == '.') { + ++fmt; + if (isdigit(*fmt)) + skip_atoi(&fmt); + else if (*fmt == '*') { + ++fmt; + /* it's the next argument */ + save_arg(int); + } + } + + /* parse the conversion qualifier */ + qualifier = -1; + if (*fmt == 'h' || *fmt == 'l' || *fmt == 'L' || + *fmt == 'Z' || *fmt == 'z' || *fmt == 't') { + qualifier = *fmt; + ++fmt; + if (qualifier == 'l' && *fmt == 'l') { + qualifier = 'L'; + ++fmt; + } + } + + /* parse format type */ + switch (*fmt) { + case 'c': + save_arg(char); + continue; + case 's': { + /* save the string argument */ + const char *save_str = va_arg(args, char *); + size_t len; + if ((unsigned long)save_str > (unsigned long)-PAGE_SIZE + || (unsigned long)save_str < PAGE_SIZE) + save_str = ""; + len = strlen(save_str); + if (str + len + 1 < end) + memcpy(str, save_str, len + 1); + str += len + 1; + continue; + } + case 'p': + save_arg(void *); + /* skip all alphanumeric pointer suffixes */ + while (isalnum(fmt[1])) + fmt++; + continue; + case 'n': { + /* skip %n 's argument */ + void *skip_arg; + if (qualifier == 'l') + skip_arg = va_arg(args, long *); + else if (qualifier == 'Z' || qualifier == 'z') + skip_arg = va_arg(args, size_t *); + else + skip_arg = va_arg(args, int *); + continue; + } + case 'o': + case 'x': + case 'X': + case 'd': + case 'i': + case 'u': + /* save arg for case: 'o', 'x', 'X', 'd', 'i', 'u' */ + if (qualifier == 'L') + save_arg(long long); + else if (qualifier == 'l') + save_arg(unsigned long); + else if (qualifier == 'Z' || qualifier == 'z') + save_arg(size_t); + else if (qualifier == 't') + save_arg(ptrdiff_t); + else if (qualifier == 'h') + save_arg(short); + else + save_arg(int); + continue; + default: + if (!*fmt) + --fmt; + continue; + } + } +#undef save_arg + + return (u32 *)(PTR_ALIGN(str, sizeof(u32))) - bin_buf; +} +EXPORT_SYMBOL_GPL(vbin_printf); + +/** + * bstr_printf - Format a string from binary arguments and place it in a buffer + * @buf: The buffer to place the result into + * @size: The size of the buffer, including the trailing null space + * @fmt: The format string to use + * @bin_buf: Binary arguments for the format string + * + * This function like C99 vsnprintf, but the difference is that vsnprintf gets + * arguments from stack, and bstr_printf gets arguments from @bin_buf which is + * a binary buffer that generated by vbin_printf. 
+ * + * The format follows C99 vsnprintf, but has some extensions: + * %pS output the name of a text symbol + * %pF output the name of a function pointer + * %pR output the address range in a struct resource + * %n is ignored + * + * The return value is the number of characters which would + * be generated for the given input, excluding the trailing + * '\0', as per ISO C99. If you want to have the exact + * number of characters written into @buf as return value + * (not including the trailing '\0'), use vscnprintf(). If the + * return is greater than or equal to @size, the resulting + * string is truncated. + */ +int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) +{ + unsigned long long num; + int base; + char *str, *end, c; + const char *args = (const char *)bin_buf; + + int flags; + int field_width; + int precision; + int qualifier; + + if (unlikely((int) size < 0)) { + /* There can be only one.. */ + static char warn = 1; + WARN_ON(warn); + warn = 0; + return 0; + } + + str = buf; + end = buf + size; + +#define get_arg(type) \ +({ \ + typeof(type) value; \ + if (sizeof(type) == 8) { \ + args = PTR_ALIGN(args, sizeof(u32)); \ + *(u32 *)&value = *(u32 *)args; \ + *((u32 *)&value + 1) = *(u32 *)(args + 4); \ + } else { \ + args = PTR_ALIGN(args, sizeof(type)); \ + value = *(typeof(type) *)args; \ + } \ + args += sizeof(type); \ + value; \ +}) + + /* Make sure end is always >= buf */ + if (end < buf) { + end = ((void *)-1); + size = end - buf; + } + + for (; *fmt ; ++fmt) { + if (*fmt != '%') { + if (str < end) + *str = *fmt; + ++str; + continue; + } + + /* process flags */ + flags = 0; +repeat: + ++fmt; /* this also skips first '%' */ + switch (*fmt) { + case '-': + flags |= LEFT; + goto repeat; + case '+': + flags |= PLUS; + goto repeat; + case ' ': + flags |= SPACE; + goto repeat; + case '#': + flags |= SPECIAL; + goto repeat; + case '0': + flags |= ZEROPAD; + goto repeat; + } + + /* get field width */ + field_width = -1; + if (isdigit(*fmt)) + field_width = skip_atoi(&fmt); + else if (*fmt == '*') { + ++fmt; + /* it's the next argument */ + field_width = get_arg(int); + if (field_width < 0) { + field_width = -field_width; + flags |= LEFT; + } + } + + /* get the precision */ + precision = -1; + if (*fmt == '.') { + ++fmt; + if (isdigit(*fmt)) + precision = skip_atoi(&fmt); + else if (*fmt == '*') { + ++fmt; + /* it's the next argument */ + precision = get_arg(int); + } + if (precision < 0) + precision = 0; + } + + /* get the conversion qualifier */ + qualifier = -1; + if (*fmt == 'h' || *fmt == 'l' || *fmt == 'L' || + *fmt == 'Z' || *fmt == 'z' || *fmt == 't') { + qualifier = *fmt; + ++fmt; + if (qualifier == 'l' && *fmt == 'l') { + qualifier = 'L'; + ++fmt; + } + } + + /* default base */ + base = 10; + + switch (*fmt) { + case 'c': + if (!(flags & LEFT)) { + while (--field_width > 0) { + if (str < end) + *str = ' '; + ++str; + } + } + c = (unsigned char) get_arg(char); + if (str < end) + *str = c; + ++str; + while (--field_width > 0) { + if (str < end) + *str = ' '; + ++str; + } + continue; + + case 's':{ + const char *str_arg = args; + size_t len = strlen(str_arg); + args += len + 1; + str = string(str, end, (char *)str_arg, field_width, + precision, flags); + continue; + } + + case 'p': + str = pointer(fmt+1, str, end, get_arg(void *), + field_width, precision, flags); + /* Skip all alphanumeric pointer suffixes */ + while (isalnum(fmt[1])) + fmt++; + continue; + + case 'n': + /* skip %n */ + continue; + + case '%': + if (str < end) + *str = '%'; + ++str; + 
continue; + + /* integer number formats - set up the flags and "break" */ + case 'o': + base = 8; + break; + + case 'x': + flags |= SMALL; + case 'X': + base = 16; + break; + + case 'd': + case 'i': + flags |= SIGN; + case 'u': + break; + + default: + if (str < end) + *str = '%'; + ++str; + if (*fmt) { + if (str < end) + *str = *fmt; + ++str; + } else { + --fmt; + } + continue; + } + if (qualifier == 'L') + num = get_arg(long long); + else if (qualifier == 'l') { + num = get_arg(unsigned long); + if (flags & SIGN) + num = (signed long) num; + } else if (qualifier == 'Z' || qualifier == 'z') { + num = get_arg(size_t); + } else if (qualifier == 't') { + num = get_arg(ptrdiff_t); + } else if (qualifier == 'h') { + num = (unsigned short) get_arg(short); + if (flags & SIGN) + num = (signed short) num; + } else { + num = get_arg(unsigned int); + if (flags & SIGN) + num = (signed int) num; + } + str = number(str, end, num, base, + field_width, precision, flags); + } + if (size > 0) { + if (str < end) + *str = '\0'; + else + end[-1] = '\0'; + } +#undef get_arg + + /* the trailing null byte doesn't count towards the total */ + return str - buf; +} +EXPORT_SYMBOL_GPL(bstr_printf); + +/** + * bprintf - Parse a format string and place args' binary value in a buffer + * @bin_buf: The buffer to place args' binary value + * @size: The size of the buffer(by words(32bits), not characters) + * @fmt: The format string to use + * @...: Arguments for the format string + * + * The function returns the number of words(u32) written + * into @bin_buf. + */ +int bprintf(u32 *bin_buf, size_t size, const char *fmt, ...) +{ + va_list args; + int ret; + + va_start(args, fmt); + ret = vbin_printf(bin_buf, size, fmt, args); + va_end(args); + return ret; +} +EXPORT_SYMBOL_GPL(bprintf); + +#endif /* CONFIG_BINARY_PRINTF */ + /** * vsscanf - Unformat a buffer into a list of arguments * @buf: input buffer -- cgit v1.2.3 From 1427cdf0592368bdec57276edaf714040ee8744f Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 6 Mar 2009 17:21:47 +0100 Subject: tracing: infrastructure for supporting binary record Impact: save on memory for tracing Current tracers are typically using a struct(like struct ftrace_entry, struct ctx_switch_entry, struct special_entr etc...)to record a binary event. These structs can only record a their own kind of events. A new kind of tracer need a new struct and a lot of code too handle it. So we need a generic binary record for events. This infrastructure is for this purpose. 
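The binary record infrastructure builds directly on the vbin_printf()/
bstr_printf() pair added by the previous patch; a minimal sketch of that round
trip, with a hypothetical caller and buffer sizes but the signatures shown
above:

#include <linux/kernel.h>
#include <linux/string.h>       /* bprintf(), bstr_printf() under CONFIG_BINARY_PRINTF */

static void binary_printf_roundtrip(int cpu, unsigned long addr)
{
        u32 bin[32];            /* packed argument words, not characters */
        char text[128];
        int words;

        /* Pack just the raw argument values; no text is formatted here. */
        words = bprintf(bin, ARRAY_SIZE(bin), "cpu=%d addr=%lx\n", cpu, addr);
        if (words < 0 || words > ARRAY_SIZE(bin))
                return;         /* bin would not be valid for bstr_printf() */

        /* Later, e.g. at trace read time, expand against the same format. */
        bstr_printf(text, sizeof(text), "cpu=%d addr=%lx\n", bin);
}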
[fweisbec@gmail.com: rebase against latest -tip, make it safe while sched tracing as reported by Steven Rostedt] Signed-off-by: Lai Jiangshan Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt LKML-Reference: <1236356510-8381-3-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 3 ++ kernel/trace/Kconfig | 6 +++ kernel/trace/Makefile | 1 + kernel/trace/trace.c | 56 ++++++++++++++++++++++++++++ kernel/trace/trace.h | 12 ++++++ kernel/trace/trace_bprintk.c | 87 ++++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace_output.c | 75 ++++++++++++++++++++++++++++++++++++++ 7 files changed, 240 insertions(+) create mode 100644 kernel/trace/trace_bprintk.c (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 498769425eb2..1c9cdca02580 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -223,6 +223,9 @@ extern int ftrace_make_nop(struct module *mod, */ extern int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr); +#ifdef CONFIG_TRACE_BPRINTK +extern int trace_vbprintk(unsigned long ip, const char *fmt, va_list args); +#endif /* May be defined in arch */ extern int ftrace_arch_read_dyn_info(char *buf, int size); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 058d949a3214..ad8d3617d0a6 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -97,6 +97,12 @@ config FUNCTION_GRAPH_TRACER This is done by setting the current return address on the current task structure into a stack of calls. +config TRACE_BPRINTK + bool "Binary printk for tracing" + default y + depends on TRACING + select BINARY_PRINTF + config IRQSOFF_TRACER bool "Interrupts-off Latency Tracer" default n diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index f44736c7574a..46557ef4c379 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -22,6 +22,7 @@ obj-$(CONFIG_TRACING) += trace.o obj-$(CONFIG_TRACING) += trace_clock.o obj-$(CONFIG_TRACING) += trace_output.o obj-$(CONFIG_TRACING) += trace_stat.o +obj-$(CONFIG_TRACE_BPRINTK) += trace_bprintk.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e6144acf2b75..ff53509e19f8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3792,6 +3792,62 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) } EXPORT_SYMBOL_GPL(__ftrace_vprintk); +/** + * trace_vbprintk - write binary msg to tracing buffer + * + * Caller must insure @fmt are valid when msg is in tracing buffer. 
+ */ +int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) +{ + static DEFINE_SPINLOCK(trace_buf_lock); + static u32 trace_buf[TRACE_BUF_SIZE]; + + struct ring_buffer_event *event; + struct trace_array *tr = &global_trace; + struct trace_array_cpu *data; + struct bprintk_entry *entry; + unsigned long flags; + int resched; + int cpu, len = 0, size, pc; + + if (tracing_disabled || !trace_bprintk_enable) + return 0; + + pc = preempt_count(); + resched = ftrace_preempt_disable(); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + + if (unlikely(atomic_read(&data->disabled))) + goto out; + + spin_lock_irqsave(&trace_buf_lock, flags); + len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args); + + if (len > TRACE_BUF_SIZE || len < 0) + goto out_unlock; + + size = sizeof(*entry) + sizeof(u32) * len; + event = trace_buffer_lock_reserve(tr, TRACE_BPRINTK, size, flags, pc); + if (!event) + goto out_unlock; + entry = ring_buffer_event_data(event); + entry->ip = ip; + entry->fmt = fmt; + + memcpy(entry->buf, trace_buf, sizeof(u32) * len); + ring_buffer_unlock_commit(tr->buffer, event); + +out_unlock: + spin_unlock_irqrestore(&trace_buf_lock, flags); + +out: + ftrace_preempt_enable(resched); + + return len; +} +EXPORT_SYMBOL_GPL(trace_vbprintk); + static int trace_panic_handler(struct notifier_block *this, unsigned long event, void *unused) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8beff03fda68..0f5077f8f957 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -20,6 +20,7 @@ enum trace_type { TRACE_WAKE, TRACE_STACK, TRACE_PRINT, + TRACE_BPRINTK, TRACE_SPECIAL, TRACE_MMIO_RW, TRACE_MMIO_MAP, @@ -124,6 +125,16 @@ struct print_entry { char buf[]; }; +struct bprintk_entry { + struct trace_entry ent; + unsigned long ip; + const char *fmt; + u32 buf[]; +}; +#ifdef CONFIG_TRACE_BPRINTK +extern int trace_bprintk_enable; +#endif + #define TRACE_OLD_SIZE 88 struct trace_field_cont { @@ -285,6 +296,7 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \ IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ + IF_ASSIGN(var, ent, struct bprintk_entry, TRACE_BPRINTK);\ IF_ASSIGN(var, ent, struct special_entry, 0); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ TRACE_MMIO_RW); \ diff --git a/kernel/trace/trace_bprintk.c b/kernel/trace/trace_bprintk.c new file mode 100644 index 000000000000..1f8e532c3fb9 --- /dev/null +++ b/kernel/trace/trace_bprintk.c @@ -0,0 +1,87 @@ +/* + * trace binary printk + * + * Copyright (C) 2008 Lai Jiangshan + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "trace.h" + +/* binary printk basic */ +static DEFINE_MUTEX(btrace_mutex); +static int btrace_metadata_count; + +static inline void lock_btrace(void) +{ + mutex_lock(&btrace_mutex); +} + +static inline void unlock_btrace(void) +{ + mutex_unlock(&btrace_mutex); +} + +static void get_btrace_metadata(void) +{ + lock_btrace(); + btrace_metadata_count++; + unlock_btrace(); +} + +static void put_btrace_metadata(void) +{ + lock_btrace(); + btrace_metadata_count--; + unlock_btrace(); +} + +/* events tracer */ +int trace_bprintk_enable; + +static void start_bprintk_trace(struct trace_array *tr) +{ + get_btrace_metadata(); + tracing_reset_online_cpus(tr); + trace_bprintk_enable = 1; +} + +static void stop_bprintk_trace(struct trace_array *tr) +{ + trace_bprintk_enable = 0; + 
tracing_reset_online_cpus(tr); + put_btrace_metadata(); +} + +static int init_bprintk_trace(struct trace_array *tr) +{ + start_bprintk_trace(tr); + return 0; +} + +static struct tracer bprintk_trace __read_mostly = +{ + .name = "events", + .init = init_bprintk_trace, + .reset = stop_bprintk_trace, + .start = start_bprintk_trace, + .stop = stop_bprintk_trace, +}; + +static __init int init_bprintk(void) +{ + return register_tracer(&bprintk_trace); +} + +device_initcall(init_bprintk); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 306fef84c503..4ab71201862e 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -53,6 +53,26 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...) return len; } +static int +trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) +{ + int len = (PAGE_SIZE - 1) - s->len; + int ret; + + if (!len) + return 0; + + ret = bstr_printf(s->buffer + s->len, len, fmt, binary); + + /* If we can't write it all, don't bother writing anything */ + if (ret >= len) + return 0; + + s->len += ret; + + return len; +} + /** * trace_seq_puts - trace sequence printing of simple string * @s: trace sequence descriptor @@ -855,6 +875,60 @@ static struct trace_event trace_print_event = { .raw = trace_print_raw, }; +/* TRACE_BPRINTK */ +static enum print_line_t +trace_bprintk_print(struct trace_iterator *iter, int flags) +{ + struct trace_entry *entry = iter->ent; + struct trace_seq *s = &iter->seq; + struct bprintk_entry *field; + + trace_assign_type(field, entry); + + if (!seq_print_ip_sym(s, field->ip, flags)) + goto partial; + + if (!trace_seq_puts(s, ": ")) + goto partial; + + if (!trace_seq_bprintf(s, field->fmt, field->buf)) + goto partial; + + return TRACE_TYPE_HANDLED; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static enum print_line_t +trace_bprintk_raw(struct trace_iterator *iter, int flags) +{ + struct trace_entry *entry = iter->ent; + struct trace_seq *s = &iter->seq; + struct bprintk_entry *field; + + trace_assign_type(field, entry); + + if (!trace_seq_printf(s, ": %lx : ", field->ip)) + goto partial; + + if (!trace_seq_bprintf(s, field->fmt, field->buf)) + goto partial; + + return TRACE_TYPE_HANDLED; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static struct trace_event trace_bprintk_event = { + .type = TRACE_BPRINTK, + .trace = trace_bprintk_print, + .raw = trace_bprintk_raw, + .hex = trace_nop_print, + .binary = trace_nop_print, +}; + static struct trace_event *events[] __initdata = { &trace_fn_event, &trace_ctx_event, @@ -863,6 +937,7 @@ static struct trace_event *events[] __initdata = { &trace_stack_event, &trace_user_stack_event, &trace_print_event, + &trace_bprintk_event, NULL }; -- cgit v1.2.3 From 1ba28e02a18cbdbea123836f6c98efb09cbf59ec Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 6 Mar 2009 17:21:48 +0100 Subject: tracing: add trace_bprintk() Impact: add a generic printk() for tracing, like trace_printk() trace_bprintk() uses the infrastructure to record events on ring_buffer. 
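A hedged sketch of a call site (the function and its values are hypothetical;
the macro is the one this patch adds to include/linux/ftrace.h). The format
must be a string literal, since the macro stores a pointer to it in the
__trace_bprintk_fmt section and only the packed argument words go through the
ring buffer:

#include <linux/types.h>
#include <linux/ftrace.h>

static void complete_request(unsigned int tag, u64 latency_ns)
{
        /*
         * Roughly one pointer in __trace_bprintk_fmt plus three u32
         * argument words in the ring buffer; no string copy at trace time.
         */
        trace_bprintk("req %u done in %llu ns\n", tag, latency_ns);
}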
[ fweisbec@gmail.com: ported to latest -tip, made it work if !CONFIG_MODULES, never free the format strings from modules because we can't keep track of them and conditionnaly create the ftrace format strings section (reported by Steven Rostedt) ] Signed-off-by: Lai Jiangshan Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt LKML-Reference: <1236356510-8381-4-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- include/asm-generic/vmlinux.lds.h | 9 ++++ include/linux/ftrace.h | 21 ++++++++++ include/linux/module.h | 5 +++ kernel/module.c | 6 +++ kernel/trace/trace.c | 15 +++++++ kernel/trace/trace_bprintk.c | 87 ++++++++++++++++++++++++++++++++++----- 6 files changed, 133 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 0add6b28c366..48ade3168b13 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -69,6 +69,14 @@ #define FTRACE_EVENTS() #endif +#ifdef CONFIG_TRACING +#define TRACE_PRINTKS() VMLINUX_SYMBOL(__start___trace_bprintk_fmt) = .; \ + *(__trace_printk_fmt) /* Trace_printk fmt' pointer */ \ + VMLINUX_SYMBOL(__stop___trace_bprintk_fmt) = .; +#else +#define TRACE_PRINTKS() +#endif + /* .data section */ #define DATA_DATA \ *(.data) \ @@ -100,6 +108,7 @@ *(__vermagic) /* Kernel version magic */ \ *(__markers_strings) /* Markers: strings */ \ *(__tracepoints_strings)/* Tracepoints: strings */ \ + TRACE_PRINTKS() \ } \ \ .rodata1 : AT(ADDR(.rodata1) - LOAD_OFFSET) { \ diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 1c9cdca02580..1cc8ca453a9b 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -225,6 +225,27 @@ extern int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr); #ifdef CONFIG_TRACE_BPRINTK extern int trace_vbprintk(unsigned long ip, const char *fmt, va_list args); +extern int __trace_bprintk(unsigned long ip, const char *fmt, ...) + __attribute__ ((format (printf, 2, 3))); + +static inline void ____trace_bprintk_check_format(const char *fmt, ...) + __attribute__ ((format (printf, 1, 2))); +static inline void ____trace_bprintk_check_format(const char *fmt, ...) {} +#define __trace_bprintk_check_format(fmt, args...) \ +do { \ + if (0) \ + ____trace_bprintk_check_format(fmt, ##args); \ +} while (0) + +#define trace_bprintk(fmt, args...) \ +do { \ + static char *__attribute__((section("__trace_bprintk_fmt"))) \ + trace_bprintk_fmt = fmt; \ + __trace_bprintk_check_format(fmt, ##args); \ + __trace_bprintk(_THIS_IP_, trace_bprintk_fmt, ##args); \ +} while (0) +#else +#define trace_bprintk trace_printk #endif /* May be defined in arch */ diff --git a/include/linux/module.h b/include/linux/module.h index 145a75528cc1..8cbec972d8e7 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -329,6 +329,11 @@ struct module unsigned int num_tracepoints; #endif +#ifdef CONFIG_TRACE_BPRINTK + const char **trace_bprintk_fmt_start; + unsigned int num_trace_bprintk_fmt; +#endif + #ifdef CONFIG_MODULE_UNLOAD /* What modules depend on me? 
*/ struct list_head modules_which_use_me; diff --git a/kernel/module.c b/kernel/module.c index 22d7379709da..2dece104f9a1 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2158,6 +2158,12 @@ static noinline struct module *load_module(void __user *umod, &mod->num_tracepoints); #endif +#ifdef CONFIG_TRACE_BPRINTK + mod->trace_bprintk_fmt_start = section_objs(hdr, sechdrs, secstrings, + "__trace_bprintk_fmt", sizeof(char *), + &mod->num_trace_bprintk_fmt); +#endif + #ifdef CONFIG_MODVERSIONS if ((mod->num_syms && !mod->crcs) || (mod->num_gpl_syms && !mod->gpl_crcs) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ff53509e19f8..46b3cd7a5752 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3848,6 +3848,21 @@ out: } EXPORT_SYMBOL_GPL(trace_vbprintk); +int __trace_bprintk(unsigned long ip, const char *fmt, ...) +{ + int ret; + va_list ap; + + if (!fmt) + return 0; + + va_start(ap, fmt); + ret = trace_vbprintk(ip, fmt, ap); + va_end(ap); + return ret; +} +EXPORT_SYMBOL_GPL(__trace_bprintk); + static int trace_panic_handler(struct notifier_block *this, unsigned long event, void *unused) { diff --git a/kernel/trace/trace_bprintk.c b/kernel/trace/trace_bprintk.c index 1f8e532c3fb9..f4c245a5cd33 100644 --- a/kernel/trace/trace_bprintk.c +++ b/kernel/trace/trace_bprintk.c @@ -19,9 +19,21 @@ #include "trace.h" +#ifdef CONFIG_MODULES + /* binary printk basic */ static DEFINE_MUTEX(btrace_mutex); -static int btrace_metadata_count; +/* + * modules trace_bprintk()'s formats are autosaved in struct trace_bprintk_fmt + * which are queued on trace_bprintk_fmt_list. + */ +static LIST_HEAD(trace_bprintk_fmt_list); + +struct trace_bprintk_fmt { + struct list_head list; + char fmt[0]; +}; + static inline void lock_btrace(void) { @@ -33,26 +45,75 @@ static inline void unlock_btrace(void) mutex_unlock(&btrace_mutex); } -static void get_btrace_metadata(void) + +static inline struct trace_bprintk_fmt *lookup_format(const char *fmt) { - lock_btrace(); - btrace_metadata_count++; - unlock_btrace(); + struct trace_bprintk_fmt *pos; + list_for_each_entry(pos, &trace_bprintk_fmt_list, list) { + if (!strcmp(pos->fmt, fmt)) + return pos; + } + return NULL; } -static void put_btrace_metadata(void) +static +void hold_module_trace_bprintk_format(const char **start, const char **end) { + const char **iter; lock_btrace(); - btrace_metadata_count--; + for (iter = start; iter < end; iter++) { + struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter); + if (tb_fmt) { + *iter = tb_fmt->fmt; + continue; + } + + tb_fmt = kmalloc(offsetof(struct trace_bprintk_fmt, fmt) + + strlen(*iter) + 1, GFP_KERNEL); + if (tb_fmt) { + list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); + strcpy(tb_fmt->fmt, *iter); + *iter = tb_fmt->fmt; + } else + *iter = NULL; + } unlock_btrace(); } +static int module_trace_bprintk_format_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct module *mod = data; + if (mod->num_trace_bprintk_fmt) { + const char **start = mod->trace_bprintk_fmt_start; + const char **end = start + mod->num_trace_bprintk_fmt; + + if (val == MODULE_STATE_COMING) + hold_module_trace_bprintk_format(start, end); + } + return 0; +} + +#else /* !CONFIG_MODULES */ +__init static int +module_trace_bprintk_format_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + return 0; +} +#endif /* CONFIG_MODULES */ + + +__initdata_or_module static +struct notifier_block module_trace_bprintk_format_nb = { + .notifier_call = module_trace_bprintk_format_notify, +}; + /* 
events tracer */ int trace_bprintk_enable; static void start_bprintk_trace(struct trace_array *tr) { - get_btrace_metadata(); tracing_reset_online_cpus(tr); trace_bprintk_enable = 1; } @@ -61,7 +122,6 @@ static void stop_bprintk_trace(struct trace_array *tr) { trace_bprintk_enable = 0; tracing_reset_online_cpus(tr); - put_btrace_metadata(); } static int init_bprintk_trace(struct trace_array *tr) @@ -81,7 +141,14 @@ static struct tracer bprintk_trace __read_mostly = static __init int init_bprintk(void) { - return register_tracer(&bprintk_trace); + int ret = register_module_notifier(&module_trace_bprintk_format_nb); + if (ret) + return ret; + + ret = register_tracer(&bprintk_trace); + if (ret) + unregister_module_notifier(&module_trace_bprintk_format_nb); + return ret; } device_initcall(init_bprintk); -- cgit v1.2.3 From 769b0441f438c4bb4872cb8560eb6fe51bcc09ee Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 6 Mar 2009 17:21:49 +0100 Subject: tracing/core: drop the old trace_printk() implementation in favour of trace_bprintk() Impact: faster and lighter tracing Now that we have trace_bprintk() which is faster and consume lesser memory than trace_printk() and has the same purpose, we can now drop the old implementation in favour of the binary one from trace_bprintk(), which means we move all the implementation of trace_bprintk() to trace_printk(), so the Api doesn't change except that we must now use trace_seq_bprintk() to print the TRACE_PRINT entries. Some changes result of this: - Previously, trace_bprintk depended of a single tracer and couldn't work without. This tracer has been dropped and the whole implementation of trace_printk() (like the module formats management) is now integrated in the tracing core (comes with CONFIG_TRACING), though we keep the file trace_printk (previously trace_bprintk.c) where we can find the module management. Thus we don't overflow trace.c - changes some parts to use trace_seq_bprintk() to print TRACE_PRINT entries. - change a bit trace_printk/trace_vprintk macros to support non-builtin formats constants, and fix 'const' qualifiers warnings. But this is all transparent for developers. - etc... 
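For callers nothing changes; a hypothetical sketch of what a trace_printk()
call costs after this consolidation:

#include <linux/kernel.h>

static void note_wakeup(int cpu, int prio)
{
        /*
         * The literal lands in the __trace_printk_fmt section; the ring
         * buffer gets only the print_entry (ip, depth, fmt pointer) plus
         * two u32 argument words.  Text is built at read time via
         * trace_seq_bprintf().
         */
        trace_printk("wakeup on cpu %d prio %d\n", cpu, prio);
}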
V2: - Rebase against last changes - Fix mispell on the changelog V3: - Rebase against last changes (moving trace_printk() to kernel.h) Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt LKML-Reference: <1236356510-8381-5-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 25 ----- include/linux/kernel.h | 34 +++++- include/linux/module.h | 2 +- kernel/trace/Kconfig | 7 +- kernel/trace/Makefile | 2 +- kernel/trace/trace.c | 212 ++++++++++------------------------- kernel/trace/trace.h | 14 +-- kernel/trace/trace_bprintk.c | 154 ------------------------- kernel/trace/trace_functions_graph.c | 6 +- kernel/trace/trace_mmiotrace.c | 9 +- kernel/trace/trace_output.c | 70 ++---------- kernel/trace/trace_output.h | 2 + kernel/trace/trace_printk.c | 138 +++++++++++++++++++++++ 13 files changed, 262 insertions(+), 413 deletions(-) delete mode 100644 kernel/trace/trace_bprintk.c create mode 100644 kernel/trace/trace_printk.c (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 1cc8ca453a9b..e1583f2639b0 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -223,31 +223,6 @@ extern int ftrace_make_nop(struct module *mod, */ extern int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr); -#ifdef CONFIG_TRACE_BPRINTK -extern int trace_vbprintk(unsigned long ip, const char *fmt, va_list args); -extern int __trace_bprintk(unsigned long ip, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); - -static inline void ____trace_bprintk_check_format(const char *fmt, ...) - __attribute__ ((format (printf, 1, 2))); -static inline void ____trace_bprintk_check_format(const char *fmt, ...) {} -#define __trace_bprintk_check_format(fmt, args...) \ -do { \ - if (0) \ - ____trace_bprintk_check_format(fmt, ##args); \ -} while (0) - -#define trace_bprintk(fmt, args...) \ -do { \ - static char *__attribute__((section("__trace_bprintk_fmt"))) \ - trace_bprintk_fmt = fmt; \ - __trace_bprintk_check_format(fmt, ##args); \ - __trace_bprintk(_THIS_IP_, trace_bprintk_fmt, ##args); \ -} while (0) -#else -#define trace_bprintk trace_printk -#endif - /* May be defined in arch */ extern int ftrace_arch_read_dyn_info(char *buf, int size); diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 7aef15c4645e..4e726b9a71ec 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -423,6 +423,16 @@ extern void ftrace_off_permanent(void); extern void ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); +static inline void __attribute__ ((format (printf, 1, 2))) +____trace_printk_check_format(const char *fmt, ...) +{ +} +#define __trace_printk_check_format(fmt, args...) \ +do { \ + if (0) \ + ____trace_printk_check_format(fmt, ##args); \ +} while (0) + /** * trace_printk - printf formatting in the ftrace buffer * @fmt: the printf format for printing @@ -439,13 +449,31 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); * Please refrain from leaving trace_printks scattered around in * your code. */ -# define trace_printk(fmt...) __trace_printk(_THIS_IP_, fmt) + +#define trace_printk(fmt, args...) \ +do { \ + static const char *trace_printk_fmt \ + __attribute__((section("__trace_printk_fmt"))); \ + trace_printk_fmt = fmt; \ + __trace_printk_check_format(fmt, ##args); \ + __trace_printk(_THIS_IP_, trace_printk_fmt, ##args); \ +} while (0) + extern int __trace_printk(unsigned long ip, const char *fmt, ...) 
__attribute__ ((format (printf, 2, 3))); -# define ftrace_vprintk(fmt, ap) __trace_printk(_THIS_IP_, fmt, ap) + +#define ftrace_vprintk(fmt, vargs) \ +do { \ + static const char *trace_printk_fmt \ + __attribute__((section("__trace_printk_fmt"))); \ + trace_printk_fmt = fmt; \ + __ftrace_vprintk(_THIS_IP_, trace_printk_fmt, vargs); \ +} while (0) + extern int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap); + extern void ftrace_dump(void); #else static inline void @@ -467,7 +495,7 @@ ftrace_vprintk(const char *fmt, va_list ap) return 0; } static inline void ftrace_dump(void) { } -#endif +#endif /* CONFIG_TRACING */ /* * Display an IP address in readable format. diff --git a/include/linux/module.h b/include/linux/module.h index 8cbec972d8e7..22d9878e868c 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -329,7 +329,7 @@ struct module unsigned int num_tracepoints; #endif -#ifdef CONFIG_TRACE_BPRINTK +#ifdef CONFIG_TRACING const char **trace_bprintk_fmt_start; unsigned int num_trace_bprintk_fmt; #endif diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index ad8d3617d0a6..8e4a2a61cd75 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -52,6 +52,7 @@ config TRACING select STACKTRACE if STACKTRACE_SUPPORT select TRACEPOINTS select NOP_TRACER + select BINARY_PRINTF # # Minimum requirements an architecture has to meet for us to @@ -97,12 +98,6 @@ config FUNCTION_GRAPH_TRACER This is done by setting the current return address on the current task structure into a stack of calls. -config TRACE_BPRINTK - bool "Binary printk for tracing" - default y - depends on TRACING - select BINARY_PRINTF - config IRQSOFF_TRACER bool "Interrupts-off Latency Tracer" default n diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 46557ef4c379..c7a2943796eb 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -22,7 +22,7 @@ obj-$(CONFIG_TRACING) += trace.o obj-$(CONFIG_TRACING) += trace_clock.o obj-$(CONFIG_TRACING) += trace_output.o obj-$(CONFIG_TRACING) += trace_stat.o -obj-$(CONFIG_TRACE_BPRINTK) += trace_bprintk.o +obj-$(CONFIG_TRACING) += trace_printk.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 46b3cd7a5752..cc94f8642485 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1169,6 +1169,67 @@ void trace_graph_return(struct ftrace_graph_ret *trace) } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + +/** + * trace_vprintk - write binary msg to tracing buffer + * + */ +int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) +{ + static DEFINE_SPINLOCK(trace_buf_lock); + static u32 trace_buf[TRACE_BUF_SIZE]; + + struct ring_buffer_event *event; + struct trace_array *tr = &global_trace; + struct trace_array_cpu *data; + struct print_entry *entry; + unsigned long flags; + int resched; + int cpu, len = 0, size, pc; + + if (unlikely(tracing_selftest_running || tracing_disabled)) + return 0; + + /* Don't pollute graph traces with trace_vprintk internals */ + pause_graph_tracing(); + + pc = preempt_count(); + resched = ftrace_preempt_disable(); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + + if (unlikely(atomic_read(&data->disabled))) + goto out; + + spin_lock_irqsave(&trace_buf_lock, flags); + len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args); + + if (len > TRACE_BUF_SIZE || len < 0) + goto out_unlock; + + size = 
sizeof(*entry) + sizeof(u32) * len; + event = trace_buffer_lock_reserve(tr, TRACE_PRINT, size, flags, pc); + if (!event) + goto out_unlock; + entry = ring_buffer_event_data(event); + entry->ip = ip; + entry->depth = depth; + entry->fmt = fmt; + + memcpy(entry->buf, trace_buf, sizeof(u32) * len); + ring_buffer_unlock_commit(tr->buffer, event); + +out_unlock: + spin_unlock_irqrestore(&trace_buf_lock, flags); + +out: + ftrace_preempt_enable(resched); + unpause_graph_tracing(); + + return len; +} +EXPORT_SYMBOL_GPL(trace_vprintk); + enum trace_file_type { TRACE_FILE_LAT_FMT = 1, TRACE_FILE_ANNOTATE = 2, @@ -1564,7 +1625,7 @@ static enum print_line_t print_printk_msg_only(struct trace_iterator *iter) trace_assign_type(field, entry); - ret = trace_seq_printf(s, "%s", field->buf); + ret = trace_seq_bprintf(s, field->fmt, field->buf); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -3714,155 +3775,6 @@ static __init int tracer_init_debugfs(void) return 0; } -int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) -{ - static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED; - static char trace_buf[TRACE_BUF_SIZE]; - - struct ring_buffer_event *event; - struct trace_array *tr = &global_trace; - struct trace_array_cpu *data; - int cpu, len = 0, size, pc; - struct print_entry *entry; - unsigned long irq_flags; - - if (tracing_disabled || tracing_selftest_running) - return 0; - - pc = preempt_count(); - preempt_disable_notrace(); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - - if (unlikely(atomic_read(&data->disabled))) - goto out; - - pause_graph_tracing(); - raw_local_irq_save(irq_flags); - __raw_spin_lock(&trace_buf_lock); - len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); - - len = min(len, TRACE_BUF_SIZE-1); - trace_buf[len] = 0; - - size = sizeof(*entry) + len + 1; - event = trace_buffer_lock_reserve(tr, TRACE_PRINT, size, irq_flags, pc); - if (!event) - goto out_unlock; - entry = ring_buffer_event_data(event); - entry->ip = ip; - entry->depth = depth; - - memcpy(&entry->buf, trace_buf, len); - entry->buf[len] = 0; - ring_buffer_unlock_commit(tr->buffer, event); - - out_unlock: - __raw_spin_unlock(&trace_buf_lock); - raw_local_irq_restore(irq_flags); - unpause_graph_tracing(); - out: - preempt_enable_notrace(); - - return len; -} -EXPORT_SYMBOL_GPL(trace_vprintk); - -int __trace_printk(unsigned long ip, const char *fmt, ...) -{ - int ret; - va_list ap; - - if (!(trace_flags & TRACE_ITER_PRINTK)) - return 0; - - va_start(ap, fmt); - ret = trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap); - va_end(ap); - return ret; -} -EXPORT_SYMBOL_GPL(__trace_printk); - -int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) -{ - if (!(trace_flags & TRACE_ITER_PRINTK)) - return 0; - - return trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap); -} -EXPORT_SYMBOL_GPL(__ftrace_vprintk); - -/** - * trace_vbprintk - write binary msg to tracing buffer - * - * Caller must insure @fmt are valid when msg is in tracing buffer. 
- */ -int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) -{ - static DEFINE_SPINLOCK(trace_buf_lock); - static u32 trace_buf[TRACE_BUF_SIZE]; - - struct ring_buffer_event *event; - struct trace_array *tr = &global_trace; - struct trace_array_cpu *data; - struct bprintk_entry *entry; - unsigned long flags; - int resched; - int cpu, len = 0, size, pc; - - if (tracing_disabled || !trace_bprintk_enable) - return 0; - - pc = preempt_count(); - resched = ftrace_preempt_disable(); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - - if (unlikely(atomic_read(&data->disabled))) - goto out; - - spin_lock_irqsave(&trace_buf_lock, flags); - len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args); - - if (len > TRACE_BUF_SIZE || len < 0) - goto out_unlock; - - size = sizeof(*entry) + sizeof(u32) * len; - event = trace_buffer_lock_reserve(tr, TRACE_BPRINTK, size, flags, pc); - if (!event) - goto out_unlock; - entry = ring_buffer_event_data(event); - entry->ip = ip; - entry->fmt = fmt; - - memcpy(entry->buf, trace_buf, sizeof(u32) * len); - ring_buffer_unlock_commit(tr->buffer, event); - -out_unlock: - spin_unlock_irqrestore(&trace_buf_lock, flags); - -out: - ftrace_preempt_enable(resched); - - return len; -} -EXPORT_SYMBOL_GPL(trace_vbprintk); - -int __trace_bprintk(unsigned long ip, const char *fmt, ...) -{ - int ret; - va_list ap; - - if (!fmt) - return 0; - - va_start(ap, fmt); - ret = trace_vbprintk(ip, fmt, ap); - va_end(ap); - return ret; -} -EXPORT_SYMBOL_GPL(__trace_bprintk); - static int trace_panic_handler(struct notifier_block *this, unsigned long event, void *unused) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 0f5077f8f957..6140922392c8 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -20,7 +20,6 @@ enum trace_type { TRACE_WAKE, TRACE_STACK, TRACE_PRINT, - TRACE_BPRINTK, TRACE_SPECIAL, TRACE_MMIO_RW, TRACE_MMIO_MAP, @@ -120,16 +119,10 @@ struct userstack_entry { */ struct print_entry { struct trace_entry ent; - unsigned long ip; + unsigned long ip; int depth; - char buf[]; -}; - -struct bprintk_entry { - struct trace_entry ent; - unsigned long ip; - const char *fmt; - u32 buf[]; + const char *fmt; + u32 buf[]; }; #ifdef CONFIG_TRACE_BPRINTK extern int trace_bprintk_enable; @@ -296,7 +289,6 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \ IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ - IF_ASSIGN(var, ent, struct bprintk_entry, TRACE_BPRINTK);\ IF_ASSIGN(var, ent, struct special_entry, 0); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ TRACE_MMIO_RW); \ diff --git a/kernel/trace/trace_bprintk.c b/kernel/trace/trace_bprintk.c deleted file mode 100644 index f4c245a5cd33..000000000000 --- a/kernel/trace/trace_bprintk.c +++ /dev/null @@ -1,154 +0,0 @@ -/* - * trace binary printk - * - * Copyright (C) 2008 Lai Jiangshan - * - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "trace.h" - -#ifdef CONFIG_MODULES - -/* binary printk basic */ -static DEFINE_MUTEX(btrace_mutex); -/* - * modules trace_bprintk()'s formats are autosaved in struct trace_bprintk_fmt - * which are queued on trace_bprintk_fmt_list. 
- */ -static LIST_HEAD(trace_bprintk_fmt_list); - -struct trace_bprintk_fmt { - struct list_head list; - char fmt[0]; -}; - - -static inline void lock_btrace(void) -{ - mutex_lock(&btrace_mutex); -} - -static inline void unlock_btrace(void) -{ - mutex_unlock(&btrace_mutex); -} - - -static inline struct trace_bprintk_fmt *lookup_format(const char *fmt) -{ - struct trace_bprintk_fmt *pos; - list_for_each_entry(pos, &trace_bprintk_fmt_list, list) { - if (!strcmp(pos->fmt, fmt)) - return pos; - } - return NULL; -} - -static -void hold_module_trace_bprintk_format(const char **start, const char **end) -{ - const char **iter; - lock_btrace(); - for (iter = start; iter < end; iter++) { - struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter); - if (tb_fmt) { - *iter = tb_fmt->fmt; - continue; - } - - tb_fmt = kmalloc(offsetof(struct trace_bprintk_fmt, fmt) - + strlen(*iter) + 1, GFP_KERNEL); - if (tb_fmt) { - list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); - strcpy(tb_fmt->fmt, *iter); - *iter = tb_fmt->fmt; - } else - *iter = NULL; - } - unlock_btrace(); -} - -static int module_trace_bprintk_format_notify(struct notifier_block *self, - unsigned long val, void *data) -{ - struct module *mod = data; - if (mod->num_trace_bprintk_fmt) { - const char **start = mod->trace_bprintk_fmt_start; - const char **end = start + mod->num_trace_bprintk_fmt; - - if (val == MODULE_STATE_COMING) - hold_module_trace_bprintk_format(start, end); - } - return 0; -} - -#else /* !CONFIG_MODULES */ -__init static int -module_trace_bprintk_format_notify(struct notifier_block *self, - unsigned long val, void *data) -{ - return 0; -} -#endif /* CONFIG_MODULES */ - - -__initdata_or_module static -struct notifier_block module_trace_bprintk_format_nb = { - .notifier_call = module_trace_bprintk_format_notify, -}; - -/* events tracer */ -int trace_bprintk_enable; - -static void start_bprintk_trace(struct trace_array *tr) -{ - tracing_reset_online_cpus(tr); - trace_bprintk_enable = 1; -} - -static void stop_bprintk_trace(struct trace_array *tr) -{ - trace_bprintk_enable = 0; - tracing_reset_online_cpus(tr); -} - -static int init_bprintk_trace(struct trace_array *tr) -{ - start_bprintk_trace(tr); - return 0; -} - -static struct tracer bprintk_trace __read_mostly = -{ - .name = "events", - .init = init_bprintk_trace, - .reset = stop_bprintk_trace, - .start = start_bprintk_trace, - .stop = stop_bprintk_trace, -}; - -static __init int init_bprintk(void) -{ - int ret = register_module_notifier(&module_trace_bprintk_format_nb); - if (ret) - return ret; - - ret = register_tracer(&bprintk_trace); - if (ret) - unregister_module_notifier(&module_trace_bprintk_format_nb); - return ret; -} - -device_initcall(init_bprintk); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index e527f2f66c73..453ebd3b636e 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -742,7 +742,11 @@ print_graph_comment(struct print_entry *trace, struct trace_seq *s, } /* The comment */ - ret = trace_seq_printf(s, "/* %s", trace->buf); + ret = trace_seq_printf(s, "/* "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_bprintf(s, trace->fmt, trace->buf); if (!ret) return TRACE_TYPE_PARTIAL_LINE; diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index c401b908e805..23e346a734ca 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -254,15 +254,18 @@ static enum print_line_t mmio_print_mark(struct 
trace_iterator *iter) { struct trace_entry *entry = iter->ent; struct print_entry *print = (struct print_entry *)entry; - const char *msg = print->buf; struct trace_seq *s = &iter->seq; unsigned long long t = ns2usecs(iter->ts); - unsigned long usec_rem = do_div(t, 1000000ULL); + unsigned long usec_rem = do_div(t, USEC_PER_SEC); unsigned secs = (unsigned long)t; int ret; /* The trailing newline must be in the message. */ - ret = trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg); + ret = trace_seq_printf(s, "MARK %u.%06lu ", secs, usec_rem); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_bprintf(s, print->fmt, print->buf); if (!ret) return TRACE_TYPE_PARTIAL_LINE; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 4ab71201862e..ef8fd661b217 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -53,8 +53,7 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...) return len; } -static int -trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) +int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) { int len = (PAGE_SIZE - 1) - s->len; int ret; @@ -834,54 +833,12 @@ static struct trace_event trace_user_stack_event = { }; /* TRACE_PRINT */ -static enum print_line_t trace_print_print(struct trace_iterator *iter, - int flags) -{ - struct print_entry *field; - struct trace_seq *s = &iter->seq; - - trace_assign_type(field, iter->ent); - - if (!seq_print_ip_sym(s, field->ip, flags)) - goto partial; - - if (!trace_seq_printf(s, ": %s", field->buf)) - goto partial; - - return TRACE_TYPE_HANDLED; - - partial: - return TRACE_TYPE_PARTIAL_LINE; -} - -static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags) -{ - struct print_entry *field; - - trace_assign_type(field, iter->ent); - - if (!trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf)) - goto partial; - - return TRACE_TYPE_HANDLED; - - partial: - return TRACE_TYPE_PARTIAL_LINE; -} - -static struct trace_event trace_print_event = { - .type = TRACE_PRINT, - .trace = trace_print_print, - .raw = trace_print_raw, -}; - -/* TRACE_BPRINTK */ static enum print_line_t -trace_bprintk_print(struct trace_iterator *iter, int flags) +trace_print_print(struct trace_iterator *iter, int flags) { struct trace_entry *entry = iter->ent; struct trace_seq *s = &iter->seq; - struct bprintk_entry *field; + struct print_entry *field; trace_assign_type(field, entry); @@ -900,14 +857,13 @@ trace_bprintk_print(struct trace_iterator *iter, int flags) return TRACE_TYPE_PARTIAL_LINE; } -static enum print_line_t -trace_bprintk_raw(struct trace_iterator *iter, int flags) + +static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags) { - struct trace_entry *entry = iter->ent; + struct print_entry *field; struct trace_seq *s = &iter->seq; - struct bprintk_entry *field; - trace_assign_type(field, entry); + trace_assign_type(field, iter->ent); if (!trace_seq_printf(s, ": %lx : ", field->ip)) goto partial; @@ -921,12 +877,11 @@ trace_bprintk_raw(struct trace_iterator *iter, int flags) return TRACE_TYPE_PARTIAL_LINE; } -static struct trace_event trace_bprintk_event = { - .type = TRACE_BPRINTK, - .trace = trace_bprintk_print, - .raw = trace_bprintk_raw, - .hex = trace_nop_print, - .binary = trace_nop_print, + +static struct trace_event trace_print_event = { + .type = TRACE_PRINT, + .trace = trace_print_print, + .raw = trace_print_raw, }; static struct trace_event *events[] __initdata = { @@ -937,7 +892,6 
@@ static struct trace_event *events[] __initdata = { &trace_stack_event, &trace_user_stack_event, &trace_print_event, - &trace_bprintk_event, NULL }; diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 8a34d688ed63..3b90e6ade1aa 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -18,6 +18,8 @@ struct trace_event { extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) __attribute__ ((format (printf, 2, 3))); extern int +trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary); +extern int seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags); extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c new file mode 100644 index 000000000000..a50aea22e929 --- /dev/null +++ b/kernel/trace/trace_printk.c @@ -0,0 +1,138 @@ +/* + * trace binary printk + * + * Copyright (C) 2008 Lai Jiangshan + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "trace.h" + +#ifdef CONFIG_MODULES + +/* + * modules trace_printk()'s formats are autosaved in struct trace_bprintk_fmt + * which are queued on trace_bprintk_fmt_list. + */ +static LIST_HEAD(trace_bprintk_fmt_list); + +/* serialize accesses to trace_bprintk_fmt_list */ +static DEFINE_MUTEX(btrace_mutex); + +struct trace_bprintk_fmt { + struct list_head list; + char fmt[0]; +}; + +static inline struct trace_bprintk_fmt *lookup_format(const char *fmt) +{ + struct trace_bprintk_fmt *pos; + list_for_each_entry(pos, &trace_bprintk_fmt_list, list) { + if (!strcmp(pos->fmt, fmt)) + return pos; + } + return NULL; +} + +static +void hold_module_trace_bprintk_format(const char **start, const char **end) +{ + const char **iter; + + mutex_lock(&btrace_mutex); + for (iter = start; iter < end; iter++) { + struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter); + if (tb_fmt) { + *iter = tb_fmt->fmt; + continue; + } + + tb_fmt = kmalloc(offsetof(struct trace_bprintk_fmt, fmt) + + strlen(*iter) + 1, GFP_KERNEL); + if (tb_fmt) { + list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); + strcpy(tb_fmt->fmt, *iter); + *iter = tb_fmt->fmt; + } else + *iter = NULL; + } + mutex_unlock(&btrace_mutex); +} + +static int module_trace_bprintk_format_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct module *mod = data; + if (mod->num_trace_bprintk_fmt) { + const char **start = mod->trace_bprintk_fmt_start; + const char **end = start + mod->num_trace_bprintk_fmt; + + if (val == MODULE_STATE_COMING) + hold_module_trace_bprintk_format(start, end); + } + return 0; +} + +#else /* !CONFIG_MODULES */ +__init static int +module_trace_bprintk_format_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + return 0; +} +#endif /* CONFIG_MODULES */ + + +__initdata_or_module static +struct notifier_block module_trace_bprintk_format_nb = { + .notifier_call = module_trace_bprintk_format_notify, +}; + +int __trace_printk(unsigned long ip, const char *fmt, ...) 
+ { + int ret; + va_list ap; + + if (unlikely(!fmt)) + return 0; + + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + + va_start(ap, fmt); + ret = trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap); + va_end(ap); + return ret; +} +EXPORT_SYMBOL_GPL(__trace_printk); + +int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) + { + if (unlikely(!fmt)) + return 0; + + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + + return trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap); +} +EXPORT_SYMBOL_GPL(__ftrace_vprintk); + + +static __init int init_trace_printk(void) +{ + return register_module_notifier(&module_trace_bprintk_format_nb); +} + +early_initcall(init_trace_printk); -- cgit v1.2.3 From 73969ff0eda233f140bcbed1251431387b43f383 Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Wed, 4 Mar 2009 23:27:14 -0800 Subject: Input: generic driver for rotary encoders on GPIOs This patch adds a generic driver for rotary encoders connected to GPIO pins of a system. It relies on gpiolib and generic hardware irqs. The documentation that also comes with this patch explains the concept and how to use the driver. Signed-off-by: Daniel Mack Tested-by: H Hartley Sweeten Signed-off-by: Dmitry Torokhov --- Documentation/input/rotary-encoder.txt | 101 +++++++++++++++ drivers/input/misc/Kconfig | 11 ++ drivers/input/misc/Makefile | 2 + drivers/input/misc/rotary_encoder.c | 221 +++++++++++++++++++++++++++++++++ include/linux/rotary_encoder.h | 13 ++ 5 files changed, 348 insertions(+) create mode 100644 Documentation/input/rotary-encoder.txt create mode 100644 drivers/input/misc/rotary_encoder.c create mode 100644 include/linux/rotary_encoder.h (limited to 'include/linux') diff --git a/Documentation/input/rotary-encoder.txt b/Documentation/input/rotary-encoder.txt new file mode 100644 index 000000000000..435102a26d96 --- /dev/null +++ b/Documentation/input/rotary-encoder.txt @@ -0,0 +1,101 @@ +rotary-encoder - a generic driver for GPIO connected devices +Daniel Mack , Feb 2009 + +0. Function +----------- + +Rotary encoders are devices which are connected to the CPU or other +peripherals with two wires. The outputs are phase-shifted by 90 degrees +and by triggering on falling and rising edges, the turn direction can +be determined. + +The phase diagram of these two outputs look like this: + + _____ _____ _____ + | | | | | | + Channel A ____| |_____| |_____| |____ + + : : : : : : : : : : : : + __ _____ _____ _____ + | | | | | | | + Channel B |_____| |_____| |_____| |__ + + : : : : : : : : : : : : + Event a b c d a b c d a b c d + + |<-------->| + one step + + +For more information, please see + http://en.wikipedia.org/wiki/Rotary_encoder + + +1. Events / state machine +------------------------- + +a) Rising edge on channel A, channel B in low state + This state is used to recognize a clockwise turn + +b) Rising edge on channel B, channel A in high state + When entering this state, the encoder is put into 'armed' state, + meaning that there it has seen half the way of a one-step transition. + +c) Falling edge on channel A, channel B in high state + This state is used to recognize a counter-clockwise turn + +d) Falling edge on channel B, channel A in low state + Parking position. If the encoder enters this state, a full transition + should have happend, unless it flipped back on half the way. The + 'armed' state tells us about that. + +2. 
Platform requirements +------------------------ + +As there is no hardware dependent call in this driver, the platform it is +used with must support gpiolib. Another requirement is that IRQs must be +able to fire on both edges. + + +3. Board integration +-------------------- + +To use this driver in your system, register a platform_device with the +name 'rotary-encoder' and associate the IRQs and some specific platform +data with it. + +struct rotary_encoder_platform_data is declared in +include/linux/rotary-encoder.h and needs to be filled with the number of +steps the encoder has and can carry information about externally inverted +signals (because of used invertig buffer or other reasons). + +Because GPIO to IRQ mapping is platform specific, this information must +be given in seperately to the driver. See the example below. + +------------------ + +/* board support file example */ + +#include +#include + +#define GPIO_ROTARY_A 1 +#define GPIO_ROTARY_B 2 + +static struct rotary_encoder_platform_data my_rotary_encoder_info = { + .steps = 24, + .axis = ABS_X, + .gpio_a = GPIO_ROTARY_A, + .gpio_b = GPIO_ROTARY_B, + .inverted_a = 0, + .inverted_b = 0, +}; + +static struct platform_device rotary_encoder_device = { + .name = "rotary-encoder", + .id = 0, + .dev = { + .platform_data = &my_rotary_encoder_info, + } +}; + diff --git a/drivers/input/misc/Kconfig b/drivers/input/misc/Kconfig index 67e5553f699a..806d2e66d249 100644 --- a/drivers/input/misc/Kconfig +++ b/drivers/input/misc/Kconfig @@ -227,4 +227,15 @@ config INPUT_PCF50633_PMU Say Y to include support for delivering PMU events via input layer on NXP PCF50633. +config INPUT_GPIO_ROTARY_ENCODER + tristate "Rotary encoders connected to GPIO pins" + depends on GPIOLIB && GENERIC_GPIO + help + Say Y here to add support for rotary encoders connected to GPIO lines. + Check file:Documentation/incput/rotary_encoder.txt for more + information. + + To compile this driver as a module, choose M here: the + module will be called rotary_encoder. + endif diff --git a/drivers/input/misc/Makefile b/drivers/input/misc/Makefile index bb62e6efacf3..e86cee66c914 100644 --- a/drivers/input/misc/Makefile +++ b/drivers/input/misc/Makefile @@ -22,3 +22,5 @@ obj-$(CONFIG_INPUT_UINPUT) += uinput.o obj-$(CONFIG_INPUT_APANEL) += apanel.o obj-$(CONFIG_INPUT_SGI_BTNS) += sgi_btns.o obj-$(CONFIG_INPUT_PCF50633_PMU) += pcf50633-input.o +obj-$(CONFIG_INPUT_GPIO_ROTARY_ENCODER) += rotary_encoder.o + diff --git a/drivers/input/misc/rotary_encoder.c b/drivers/input/misc/rotary_encoder.c new file mode 100644 index 000000000000..5bb3ab51b8c6 --- /dev/null +++ b/drivers/input/misc/rotary_encoder.c @@ -0,0 +1,221 @@ +/* + * rotary_encoder.c + * + * (c) 2009 Daniel Mack + * + * state machine code inspired by code from Tim Ruetz + * + * A generic driver for rotary encoders connected to GPIO lines. + * See file:Documentation/input/rotary_encoder.txt for more information + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DRV_NAME "rotary-encoder" + +struct rotary_encoder { + unsigned int irq_a; + unsigned int irq_b; + unsigned int pos; + unsigned int armed; + unsigned int dir; + struct input_dev *input; + struct rotary_encoder_platform_data *pdata; +}; + +static irqreturn_t rotary_encoder_irq(int irq, void *dev_id) +{ + struct rotary_encoder *encoder = dev_id; + struct rotary_encoder_platform_data *pdata = encoder->pdata; + int a = !!gpio_get_value(pdata->gpio_a); + int b = !!gpio_get_value(pdata->gpio_b); + int state; + + a ^= pdata->inverted_a; + b ^= pdata->inverted_b; + state = (a << 1) | b; + + switch (state) { + + case 0x0: + if (!encoder->armed) + break; + + if (encoder->dir) { + /* turning counter-clockwise */ + encoder->pos += pdata->steps; + encoder->pos--; + encoder->pos %= pdata->steps; + } else { + /* turning clockwise */ + encoder->pos++; + encoder->pos %= pdata->steps; + } + + input_report_abs(encoder->input, pdata->axis, encoder->pos); + input_sync(encoder->input); + + encoder->armed = 0; + break; + + case 0x1: + case 0x2: + if (encoder->armed) + encoder->dir = state - 1; + break; + + case 0x3: + encoder->armed = 1; + break; + } + + return IRQ_HANDLED; +} + +static int __devinit rotary_encoder_probe(struct platform_device *pdev) +{ + struct rotary_encoder_platform_data *pdata = pdev->dev.platform_data; + struct rotary_encoder *encoder; + struct input_dev *input; + int err; + + if (!pdata || !pdata->steps) { + dev_err(&pdev->dev, "invalid platform data\n"); + return -ENOENT; + } + + encoder = kzalloc(sizeof(struct rotary_encoder), GFP_KERNEL); + input = input_allocate_device(); + if (!encoder || !input) { + dev_err(&pdev->dev, "failed to allocate memory for device\n"); + err = -ENOMEM; + goto exit_free_mem; + } + + encoder->input = input; + encoder->pdata = pdata; + encoder->irq_a = gpio_to_irq(pdata->gpio_a); + encoder->irq_b = gpio_to_irq(pdata->gpio_b); + + /* create and register the input driver */ + input->name = pdev->name; + input->id.bustype = BUS_HOST; + input->dev.parent = &pdev->dev; + input->evbit[0] = BIT_MASK(EV_ABS); + input_set_abs_params(encoder->input, + pdata->axis, 0, pdata->steps, 0, 1); + + err = input_register_device(input); + if (err) { + dev_err(&pdev->dev, "failed to register input device\n"); + goto exit_free_mem; + } + + /* request the GPIOs */ + err = gpio_request(pdata->gpio_a, DRV_NAME); + if (err) { + dev_err(&pdev->dev, "unable to request GPIO %d\n", + pdata->gpio_a); + goto exit_unregister_input; + } + + err = gpio_request(pdata->gpio_b, DRV_NAME); + if (err) { + dev_err(&pdev->dev, "unable to request GPIO %d\n", + pdata->gpio_b); + goto exit_free_gpio_a; + } + + /* request the IRQs */ + err = request_irq(encoder->irq_a, &rotary_encoder_irq, + IORESOURCE_IRQ_HIGHEDGE | IORESOURCE_IRQ_LOWEDGE, + DRV_NAME, encoder); + if (err) { + dev_err(&pdev->dev, "unable to request IRQ %d\n", + encoder->irq_a); + goto exit_free_gpio_b; + } + + err = request_irq(encoder->irq_b, &rotary_encoder_irq, + IORESOURCE_IRQ_HIGHEDGE | IORESOURCE_IRQ_LOWEDGE, + DRV_NAME, encoder); + if (err) { + dev_err(&pdev->dev, "unable to request IRQ %d\n", + encoder->irq_b); + goto exit_free_irq_a; + } + + platform_set_drvdata(pdev, encoder); + + return 0; + +exit_free_irq_a: + free_irq(encoder->irq_a, encoder); +exit_free_gpio_b: + gpio_free(pdata->gpio_b); +exit_free_gpio_a: + gpio_free(pdata->gpio_a); +exit_unregister_input: + input_unregister_device(input); + input = NULL; /* so we 
don't try to free it */ +exit_free_mem: + input_free_device(input); + kfree(encoder); + return err; +} + +static int __devexit rotary_encoder_remove(struct platform_device *pdev) +{ + struct rotary_encoder *encoder = platform_get_drvdata(pdev); + struct rotary_encoder_platform_data *pdata = pdev->dev.platform_data; + + free_irq(encoder->irq_a, encoder); + free_irq(encoder->irq_b, encoder); + gpio_free(pdata->gpio_a); + gpio_free(pdata->gpio_b); + input_unregister_device(encoder->input); + platform_set_drvdata(pdev, NULL); + kfree(encoder); + + return 0; +} + +static struct platform_driver rotary_encoder_driver = { + .probe = rotary_encoder_probe, + .remove = __devexit_p(rotary_encoder_remove), + .driver = { + .name = DRV_NAME, + .owner = THIS_MODULE, + } +}; + +static int __init rotary_encoder_init(void) +{ + return platform_driver_register(&rotary_encoder_driver); +} + +static void __exit rotary_encoder_exit(void) +{ + platform_driver_unregister(&rotary_encoder_driver); +} + +module_init(rotary_encoder_init); +module_exit(rotary_encoder_exit); + +MODULE_ALIAS("platform:" DRV_NAME); +MODULE_DESCRIPTION("GPIO rotary encoder driver"); +MODULE_AUTHOR("Daniel Mack "); +MODULE_LICENSE("GPL v2"); + diff --git a/include/linux/rotary_encoder.h b/include/linux/rotary_encoder.h new file mode 100644 index 000000000000..12d63a30c347 --- /dev/null +++ b/include/linux/rotary_encoder.h @@ -0,0 +1,13 @@ +#ifndef __ROTARY_ENCODER_H__ +#define __ROTARY_ENCODER_H__ + +struct rotary_encoder_platform_data { + unsigned int steps; + unsigned int axis; + unsigned int gpio_a; + unsigned int gpio_b; + unsigned int inverted_a; + unsigned int inverted_b; +}; + +#endif /* __ROTARY_ENCODER_H__ */ -- cgit v1.2.3 From 7bffc23e56e92c14b787bf4d95249a32085bfed5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 9 Mar 2009 10:11:36 +0100 Subject: tracing: optimize trace_printk() Impact: micro-optimization trace_printk() does this unconditionally: trace_printk_fmt = fmt; Where trace_printk_fmt is an entry into a global array. This is very SMP-unfriendly. So only write it once per bootup. Cc: Frederic Weisbecker Cc: Steven Rostedt Cc: Peter Zijlstra LKML-Reference: <1236356510-8381-5-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/kernel.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 4e726b9a71ec..7742798c9208 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -454,7 +454,10 @@ do { \ do { \ static const char *trace_printk_fmt \ __attribute__((section("__trace_printk_fmt"))); \ - trace_printk_fmt = fmt; \ + \ + if (!trace_printk_fmt) \ + trace_printk_fmt = fmt; \ + \ __trace_printk_check_format(fmt, ##args); \ __trace_printk(_THIS_IP_, trace_printk_fmt, ##args); \ } while (0) @@ -467,7 +470,10 @@ __trace_printk(unsigned long ip, const char *fmt, ...) 
do { \ static const char *trace_printk_fmt \ __attribute__((section("__trace_printk_fmt"))); \ - trace_printk_fmt = fmt; \ + \ + if (!trace_printk_fmt) \ + trace_printk_fmt = fmt; \ + \ __ftrace_vprintk(_THIS_IP_, trace_printk_fmt, vargs); \ } while (0) -- cgit v1.2.3 From b4be468cc1e65110d9144751bf7079dad6f142b7 Mon Sep 17 00:00:00 2001 From: Michael Hennerich Date: Mon, 9 Mar 2009 20:12:52 -0700 Subject: Input: add AD7879 Touchscreen driver [randy.dunlap@oracle.com: don't use bus_id] [dtor@mail.ru: locking and other fixups] Signed-off-by: Michael Hennerich Signed-off-by: Bryan Wu Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/Kconfig | 32 ++ drivers/input/touchscreen/Makefile | 1 + drivers/input/touchscreen/ad7879.c | 782 +++++++++++++++++++++++++++++++++++++ include/linux/spi/ad7879.h | 35 ++ 4 files changed, 850 insertions(+) create mode 100644 drivers/input/touchscreen/ad7879.c create mode 100644 include/linux/spi/ad7879.h (limited to 'include/linux') diff --git a/drivers/input/touchscreen/Kconfig b/drivers/input/touchscreen/Kconfig index afeb7554bfe2..b01fd61dadcc 100644 --- a/drivers/input/touchscreen/Kconfig +++ b/drivers/input/touchscreen/Kconfig @@ -42,6 +42,38 @@ config TOUCHSCREEN_AD7877 To compile this driver as a module, choose M here: the module will be called ad7877. +config TOUCHSCREEN_AD7879_I2C + tristate "AD7879 based touchscreens: AD7879-1 I2C Interface" + depends on I2C + select TOUCHSCREEN_AD7879 + help + Say Y here if you have a touchscreen interface using the + AD7879-1 controller, and your board-specific initialization + code includes that in its table of I2C devices. + + If unsure, say N (but it's safe to say "Y"). + + To compile this driver as a module, choose M here: the + module will be called ad7879. + +config TOUCHSCREEN_AD7879_SPI + tristate "AD7879 based touchscreens: AD7879 SPI Interface" + depends on SPI_MASTER && TOUCHSCREEN_AD7879_I2C = n + select TOUCHSCREEN_AD7879 + help + Say Y here if you have a touchscreen interface using the + AD7879 controller, and your board-specific initialization + code includes that in its table of SPI devices. + + If unsure, say N (but it's safe to say "Y"). + + To compile this driver as a module, choose M here: the + module will be called ad7879. + +config TOUCHSCREEN_AD7879 + tristate + default n + config TOUCHSCREEN_BITSY tristate "Compaq iPAQ H3600 (Bitsy) touchscreen" depends on SA1100_BITSY diff --git a/drivers/input/touchscreen/Makefile b/drivers/input/touchscreen/Makefile index 45dfcd61be11..6700f7b9d165 100644 --- a/drivers/input/touchscreen/Makefile +++ b/drivers/input/touchscreen/Makefile @@ -7,6 +7,7 @@ wm97xx-ts-y := wm97xx-core.o obj-$(CONFIG_TOUCHSCREEN_AD7877) += ad7877.o +obj-$(CONFIG_TOUCHSCREEN_AD7879) += ad7879.o obj-$(CONFIG_TOUCHSCREEN_ADS7846) += ads7846.o obj-$(CONFIG_TOUCHSCREEN_ATMEL_TSADCC) += atmel_tsadcc.o obj-$(CONFIG_TOUCHSCREEN_BITSY) += h3600_ts_input.o diff --git a/drivers/input/touchscreen/ad7879.c b/drivers/input/touchscreen/ad7879.c new file mode 100644 index 000000000000..ea4c61d68683 --- /dev/null +++ b/drivers/input/touchscreen/ad7879.c @@ -0,0 +1,782 @@ +/* + * Copyright (C) 2008 Michael Hennerich, Analog Devices Inc. 
+ * + * Description: AD7879 based touchscreen, and GPIO driver (I2C/SPI Interface) + * + * Bugs: Enter bugs at http://blackfin.uclinux.org/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see the file COPYING, or write + * to the Free Software Foundation, Inc., + * 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * History: + * Copyright (c) 2005 David Brownell + * Copyright (c) 2006 Nokia Corporation + * Various changes: Imre Deak + * + * Using code from: + * - corgi_ts.c + * Copyright (C) 2004-2005 Richard Purdie + * - omap_ts.[hc], ads7846.h, ts_osk.c + * Copyright (C) 2002 MontaVista Software + * Copyright (C) 2004 Texas Instruments + * Copyright (C) 2005 Dirk Behme + * - ad7877.c + * Copyright (C) 2006-2008 Analog Devices Inc. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define AD7879_REG_ZEROS 0 +#define AD7879_REG_CTRL1 1 +#define AD7879_REG_CTRL2 2 +#define AD7879_REG_CTRL3 3 +#define AD7879_REG_AUX1HIGH 4 +#define AD7879_REG_AUX1LOW 5 +#define AD7879_REG_TEMP1HIGH 6 +#define AD7879_REG_TEMP1LOW 7 +#define AD7879_REG_XPLUS 8 +#define AD7879_REG_YPLUS 9 +#define AD7879_REG_Z1 10 +#define AD7879_REG_Z2 11 +#define AD7879_REG_AUXVBAT 12 +#define AD7879_REG_TEMP 13 +#define AD7879_REG_REVID 14 + +/* Control REG 1 */ +#define AD7879_TMR(x) ((x & 0xFF) << 0) +#define AD7879_ACQ(x) ((x & 0x3) << 8) +#define AD7879_MODE_NOC (0 << 10) /* Do not convert */ +#define AD7879_MODE_SCC (1 << 10) /* Single channel conversion */ +#define AD7879_MODE_SEQ0 (2 << 10) /* Sequence 0 in Slave Mode */ +#define AD7879_MODE_SEQ1 (3 << 10) /* Sequence 1 in Master Mode */ +#define AD7879_MODE_INT (1 << 15) /* PENIRQ disabled INT enabled */ + +/* Control REG 2 */ +#define AD7879_FCD(x) ((x & 0x3) << 0) +#define AD7879_RESET (1 << 4) +#define AD7879_MFS(x) ((x & 0x3) << 5) +#define AD7879_AVG(x) ((x & 0x3) << 7) +#define AD7879_SER (1 << 9) /* non-differential */ +#define AD7879_DFR (0 << 9) /* differential */ +#define AD7879_GPIOPOL (1 << 10) +#define AD7879_GPIODIR (1 << 11) +#define AD7879_GPIO_DATA (1 << 12) +#define AD7879_GPIO_EN (1 << 13) +#define AD7879_PM(x) ((x & 0x3) << 14) +#define AD7879_PM_SHUTDOWN (0) +#define AD7879_PM_DYN (1) +#define AD7879_PM_FULLON (2) + +/* Control REG 3 */ +#define AD7879_TEMPMASK_BIT (1<<15) +#define AD7879_AUXVBATMASK_BIT (1<<14) +#define AD7879_INTMODE_BIT (1<<13) +#define AD7879_GPIOALERTMASK_BIT (1<<12) +#define AD7879_AUXLOW_BIT (1<<11) +#define AD7879_AUXHIGH_BIT (1<<10) +#define AD7879_TEMPLOW_BIT (1<<9) +#define AD7879_TEMPHIGH_BIT (1<<8) +#define AD7879_YPLUS_BIT (1<<7) +#define AD7879_XPLUS_BIT (1<<6) +#define AD7879_Z1_BIT (1<<5) +#define AD7879_Z2_BIT (1<<4) +#define AD7879_AUX_BIT (1<<3) +#define AD7879_VBAT_BIT (1<<2) +#define AD7879_TEMP_BIT (1<<1) + +enum { + AD7879_SEQ_XPOS = 0, + AD7879_SEQ_YPOS = 1, + AD7879_SEQ_Z1 = 2, + AD7879_SEQ_Z2 = 3, + AD7879_NR_SENSE = 4, +}; + +#define MAX_12BIT ((1<<12)-1) 
+#define TS_PEN_UP_TIMEOUT msecs_to_jiffies(50) + +#if defined(CONFIG_TOUCHSCREEN_AD7879_SPI) || defined(CONFIG_TOUCHSCREEN_AD7879_SPI_MODULE) +#define AD7879_DEVID 0x7A +typedef struct spi_device bus_device; +#elif defined(CONFIG_TOUCHSCREEN_AD7879_I2C) || defined(CONFIG_TOUCHSCREEN_AD7879_I2C_MODULE) +#define AD7879_DEVID 0x79 +typedef struct i2c_client bus_device; +#endif + +struct ad7879 { + bus_device *bus; + struct input_dev *input; + struct work_struct work; + struct timer_list timer; + + struct mutex mutex; + unsigned disabled:1; /* P: mutex */ + +#if defined(CONFIG_TOUCHSCREEN_AD7879_SPI) || defined(CONFIG_TOUCHSCREEN_AD7879_SPI_MODULE) + struct spi_message msg; + struct spi_transfer xfer[AD7879_NR_SENSE + 1]; + u16 cmd; +#endif + u16 conversion_data[AD7879_NR_SENSE]; + char phys[32]; + u8 first_conversion_delay; + u8 acquisition_time; + u8 averaging; + u8 pen_down_acc_interval; + u8 median; + u16 x_plate_ohms; + u16 pressure_max; + u16 gpio_init; + u16 cmd_crtl1; + u16 cmd_crtl2; + u16 cmd_crtl3; + unsigned gpio:1; +}; + +static int ad7879_read(bus_device *, u8); +static int ad7879_write(bus_device *, u8, u16); +static void ad7879_collect(struct ad7879 *); + +static void ad7879_report(struct ad7879 *ts) +{ + struct input_dev *input_dev = ts->input; + unsigned Rt; + u16 x, y, z1, z2; + + x = ts->conversion_data[AD7879_SEQ_XPOS] & MAX_12BIT; + y = ts->conversion_data[AD7879_SEQ_YPOS] & MAX_12BIT; + z1 = ts->conversion_data[AD7879_SEQ_Z1] & MAX_12BIT; + z2 = ts->conversion_data[AD7879_SEQ_Z2] & MAX_12BIT; + + /* + * The samples processed here are already preprocessed by the AD7879. + * The preprocessing function consists of a median and an averaging filter. + * The combination of these two techniques provides a robust solution, + * discarding the spurious noise in the signal and keeping only the data of interest. + * The size of both filters is programmable. (dev.platform_data, see linux/spi/ad7879.h) + * Other user-programmable conversion controls include variable acquisition time, + * and first conversion delay. Up to 16 averages can be taken per conversion. + */ + + if (likely(x && z1)) { + /* compute touch pressure resistance using equation #1 */ + Rt = (z2 - z1) * x * ts->x_plate_ohms; + Rt /= z1; + Rt = (Rt + 2047) >> 12; + + input_report_abs(input_dev, ABS_X, x); + input_report_abs(input_dev, ABS_Y, y); + input_report_abs(input_dev, ABS_PRESSURE, Rt); + input_sync(input_dev); + } +} + +static void ad7879_work(struct work_struct *work) +{ + struct ad7879 *ts = container_of(work, struct ad7879, work); + + /* use keventd context to read the result registers */ + ad7879_collect(ts); + ad7879_report(ts); + mod_timer(&ts->timer, jiffies + TS_PEN_UP_TIMEOUT); +} + +static void ad7879_ts_event_release(struct ad7879 *ts) +{ + struct input_dev *input_dev = ts->input; + + input_report_abs(input_dev, ABS_PRESSURE, 0); + input_sync(input_dev); +} + +static void ad7879_timer(unsigned long handle) +{ + struct ad7879 *ts = (void *)handle; + + ad7879_ts_event_release(ts); +} + +static irqreturn_t ad7879_irq(int irq, void *handle) +{ + struct ad7879 *ts = handle; + + /* The repeated conversion sequencer controlled by TMR kicked off too fast. + * We ignore the last and process the sample sequence currently in the queue. 
+ * It can't be older than 9.4ms + */ + + if (!work_pending(&ts->work)) + schedule_work(&ts->work); + + return IRQ_HANDLED; +} + +static void ad7879_setup(struct ad7879 *ts) +{ + ts->cmd_crtl3 = AD7879_YPLUS_BIT | + AD7879_XPLUS_BIT | + AD7879_Z2_BIT | + AD7879_Z1_BIT | + AD7879_TEMPMASK_BIT | + AD7879_AUXVBATMASK_BIT | + AD7879_GPIOALERTMASK_BIT; + + ts->cmd_crtl2 = AD7879_PM(AD7879_PM_DYN) | AD7879_DFR | + AD7879_AVG(ts->averaging) | + AD7879_MFS(ts->median) | + AD7879_FCD(ts->first_conversion_delay) | + ts->gpio_init; + + ts->cmd_crtl1 = AD7879_MODE_INT | AD7879_MODE_SEQ1 | + AD7879_ACQ(ts->acquisition_time) | + AD7879_TMR(ts->pen_down_acc_interval); + + ad7879_write(ts->bus, AD7879_REG_CTRL2, ts->cmd_crtl2); + ad7879_write(ts->bus, AD7879_REG_CTRL3, ts->cmd_crtl3); + ad7879_write(ts->bus, AD7879_REG_CTRL1, ts->cmd_crtl1); +} + +static void ad7879_disable(struct ad7879 *ts) +{ + mutex_lock(&ts->mutex); + + if (!ts->disabled) { + + ts->disabled = 1; + disable_irq(ts->bus->irq); + + cancel_work_sync(&ts->work); + + if (del_timer_sync(&ts->timer)) + ad7879_ts_event_release(ts); + + ad7879_write(ts->bus, AD7879_REG_CTRL2, + AD7879_PM(AD7879_PM_SHUTDOWN)); + } + + mutex_unlock(&ts->mutex); +} + +static void ad7879_enable(struct ad7879 *ts) +{ + mutex_lock(&ts->mutex); + + if (ts->disabled) { + ad7879_setup(ts); + ts->disabled = 0; + enable_irq(ts->bus->irq); + } + + mutex_unlock(&ts->mutex); +} + +static ssize_t ad7879_disable_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct ad7879 *ts = dev_get_drvdata(dev); + + return sprintf(buf, "%u\n", ts->disabled); +} + +static ssize_t ad7879_disable_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct ad7879 *ts = dev_get_drvdata(dev); + unsigned long val; + int error; + + error = strict_strtoul(buf, 10, &val); + if (error) + return error; + + if (val) + ad7879_disable(ts); + else + ad7879_enable(ts); + + return count; +} + +static DEVICE_ATTR(disable, 0664, ad7879_disable_show, ad7879_disable_store); + +static ssize_t ad7879_gpio_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct ad7879 *ts = dev_get_drvdata(dev); + + return sprintf(buf, "%u\n", ts->gpio); +} + +static ssize_t ad7879_gpio_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct ad7879 *ts = dev_get_drvdata(dev); + unsigned long val; + int error; + + error = strict_strtoul(buf, 10, &val); + if (error) + return error; + + mutex_lock(&ts->mutex); + ts->gpio = !!val; + error = ad7879_write(ts->bus, AD7879_REG_CTRL2, + ts->gpio ? + ts->cmd_crtl2 & ~AD7879_GPIO_DATA : + ts->cmd_crtl2 | AD7879_GPIO_DATA); + mutex_unlock(&ts->mutex); + + return error ? 
: count; +} + +static DEVICE_ATTR(gpio, 0664, ad7879_gpio_show, ad7879_gpio_store); + +static struct attribute *ad7879_attributes[] = { + &dev_attr_disable.attr, + &dev_attr_gpio.attr, + NULL +}; + +static const struct attribute_group ad7879_attr_group = { + .attrs = ad7879_attributes, +}; + +static int __devinit ad7879_construct(bus_device *bus, struct ad7879 *ts) +{ + struct input_dev *input_dev; + struct ad7879_platform_data *pdata = bus->dev.platform_data; + int err; + u16 revid; + + if (!bus->irq) { + dev_err(&bus->dev, "no IRQ?\n"); + return -ENODEV; + } + + if (!pdata) { + dev_err(&bus->dev, "no platform data?\n"); + return -ENODEV; + } + + input_dev = input_allocate_device(); + if (!input_dev) + return -ENOMEM; + + ts->input = input_dev; + + setup_timer(&ts->timer, ad7879_timer, (unsigned long) ts); + INIT_WORK(&ts->work, ad7879_work); + mutex_init(&ts->mutex); + + ts->x_plate_ohms = pdata->x_plate_ohms ? : 400; + ts->pressure_max = pdata->pressure_max ? : ~0; + + ts->first_conversion_delay = pdata->first_conversion_delay; + ts->acquisition_time = pdata->acquisition_time; + ts->averaging = pdata->averaging; + ts->pen_down_acc_interval = pdata->pen_down_acc_interval; + ts->median = pdata->median; + + if (pdata->gpio_output) + ts->gpio_init = AD7879_GPIO_EN | + (pdata->gpio_default ? 0 : AD7879_GPIO_DATA); + else + ts->gpio_init = AD7879_GPIO_EN | AD7879_GPIODIR; + + snprintf(ts->phys, sizeof(ts->phys), "%s/input0", dev_name(&bus->dev)); + + input_dev->name = "AD7879 Touchscreen"; + input_dev->phys = ts->phys; + input_dev->dev.parent = &bus->dev; + + __set_bit(EV_ABS, input_dev->evbit); + __set_bit(ABS_X, input_dev->absbit); + __set_bit(ABS_Y, input_dev->absbit); + __set_bit(ABS_PRESSURE, input_dev->absbit); + + input_set_abs_params(input_dev, ABS_X, + pdata->x_min ? : 0, + pdata->x_max ? : MAX_12BIT, + 0, 0); + input_set_abs_params(input_dev, ABS_Y, + pdata->y_min ? : 0, + pdata->y_max ? 
: MAX_12BIT, + 0, 0); + input_set_abs_params(input_dev, ABS_PRESSURE, + pdata->pressure_min, pdata->pressure_max, 0, 0); + + err = ad7879_write(bus, AD7879_REG_CTRL2, AD7879_RESET); + + if (err < 0) { + dev_err(&bus->dev, "Failed to write %s\n", input_dev->name); + goto err_free_mem; + } + + revid = ad7879_read(bus, AD7879_REG_REVID); + + if ((revid & 0xFF) != AD7879_DEVID) { + dev_err(&bus->dev, "Failed to probe %s\n", input_dev->name); + err = -ENODEV; + goto err_free_mem; + } + + ad7879_setup(ts); + + err = request_irq(bus->irq, ad7879_irq, + IRQF_TRIGGER_FALLING | IRQF_SAMPLE_RANDOM, + bus->dev.driver->name, ts); + + if (err) { + dev_err(&bus->dev, "irq %d busy?\n", bus->irq); + goto err_free_mem; + } + + err = sysfs_create_group(&bus->dev.kobj, &ad7879_attr_group); + if (err) + goto err_free_irq; + + err = input_register_device(input_dev); + if (err) + goto err_remove_attr; + + dev_info(&bus->dev, "Rev.%d touchscreen, irq %d\n", + revid >> 8, bus->irq); + + return 0; + +err_remove_attr: + sysfs_remove_group(&bus->dev.kobj, &ad7879_attr_group); +err_free_irq: + free_irq(bus->irq, ts); +err_free_mem: + input_free_device(input_dev); + + return err; +} + +static int __devexit ad7879_destroy(bus_device *bus, struct ad7879 *ts) +{ + ad7879_disable(ts); + sysfs_remove_group(&ts->bus->dev.kobj, &ad7879_attr_group); + free_irq(ts->bus->irq, ts); + input_unregister_device(ts->input); + dev_dbg(&bus->dev, "unregistered touchscreen\n"); + + return 0; +} + +#ifdef CONFIG_PM +static int ad7879_suspend(bus_device *bus, pm_message_t message) +{ + struct ad7879 *ts = dev_get_drvdata(&bus->dev); + + ad7879_disable(ts); + + return 0; +} + +static int ad7879_resume(bus_device *bus) +{ + struct ad7879 *ts = dev_get_drvdata(&bus->dev); + + ad7879_enable(ts); + + return 0; +} +#else +#define ad7879_suspend NULL +#define ad7879_resume NULL +#endif + +#if defined(CONFIG_TOUCHSCREEN_AD7879_SPI) || defined(CONFIG_TOUCHSCREEN_AD7879_SPI_MODULE) +#define MAX_SPI_FREQ_HZ 5000000 +#define AD7879_CMD_MAGIC 0xE000 +#define AD7879_CMD_READ (1 << 10) +#define AD7879_WRITECMD(reg) (AD7879_CMD_MAGIC | (reg & 0xF)) +#define AD7879_READCMD(reg) (AD7879_CMD_MAGIC | AD7879_CMD_READ | (reg & 0xF)) + +struct ser_req { + u16 command; + u16 data; + struct spi_message msg; + struct spi_transfer xfer[2]; +}; + +/* + * ad7879_read/write are only used for initial setup and for sysfs controls. + * The main traffic is done in ad7879_collect(). + */ + +static int ad7879_read(struct spi_device *spi, u8 reg) +{ + struct ser_req *req; + int status, ret; + + req = kzalloc(sizeof *req, GFP_KERNEL); + if (!req) + return -ENOMEM; + + spi_message_init(&req->msg); + + req->command = (u16) AD7879_READCMD(reg); + req->xfer[0].tx_buf = &req->command; + req->xfer[0].len = 2; + + req->xfer[1].rx_buf = &req->data; + req->xfer[1].len = 2; + + spi_message_add_tail(&req->xfer[0], &req->msg); + spi_message_add_tail(&req->xfer[1], &req->msg); + + status = spi_sync(spi, &req->msg); + ret = status ? 
: req->data; + + kfree(req); + + return ret; +} + +static int ad7879_write(struct spi_device *spi, u8 reg, u16 val) +{ + struct ser_req *req; + int status; + + req = kzalloc(sizeof *req, GFP_KERNEL); + if (!req) + return -ENOMEM; + + spi_message_init(&req->msg); + + req->command = (u16) AD7879_WRITECMD(reg); + req->xfer[0].tx_buf = &req->command; + req->xfer[0].len = 2; + + req->data = val; + req->xfer[1].tx_buf = &req->data; + req->xfer[1].len = 2; + + spi_message_add_tail(&req->xfer[0], &req->msg); + spi_message_add_tail(&req->xfer[1], &req->msg); + + status = spi_sync(spi, &req->msg); + + kfree(req); + + return status; +} + +static void ad7879_collect(struct ad7879 *ts) +{ + int status = spi_sync(ts->bus, &ts->msg); + + if (status) + dev_err(&ts->bus->dev, "spi_sync --> %d\n", status); +} + +static void ad7879_setup_ts_def_msg(struct ad7879 *ts) +{ + struct spi_message *m; + int i; + + ts->cmd = (u16) AD7879_READCMD(AD7879_REG_XPLUS); + + m = &ts->msg; + spi_message_init(m); + ts->xfer[0].tx_buf = &ts->cmd; + ts->xfer[0].len = 2; + + spi_message_add_tail(&ts->xfer[0], m); + + for (i = 0; i < AD7879_NR_SENSE; i++) { + ts->xfer[i + 1].rx_buf = &ts->conversion_data[i]; + ts->xfer[i + 1].len = 2; + spi_message_add_tail(&ts->xfer[i + 1], m); + } +} + +static int __devinit ad7879_probe(struct spi_device *spi) +{ + struct ad7879 *ts; + int error; + + /* don't exceed max specified SPI CLK frequency */ + if (spi->max_speed_hz > MAX_SPI_FREQ_HZ) { + dev_err(&spi->dev, "SPI CLK %d Hz?\n", spi->max_speed_hz); + return -EINVAL; + } + + ts = kzalloc(sizeof(struct ad7879), GFP_KERNEL); + if (!ts) + return -ENOMEM; + + dev_set_drvdata(&spi->dev, ts); + ts->bus = spi; + + ad7879_setup_ts_def_msg(ts); + + error = ad7879_construct(spi, ts); + if (error) { + dev_set_drvdata(&spi->dev, NULL); + kfree(ts); + } + + return 0; +} + +static int __devexit ad7879_remove(struct spi_device *spi) +{ + struct ad7879 *ts = dev_get_drvdata(&spi->dev); + + ad7879_destroy(spi, ts); + dev_set_drvdata(&spi->dev, NULL); + kfree(ts); + + return 0; +} + +static struct spi_driver ad7879_driver = { + .driver = { + .name = "ad7879", + .bus = &spi_bus_type, + .owner = THIS_MODULE, + }, + .probe = ad7879_probe, + .remove = __devexit_p(ad7879_remove), + .suspend = ad7879_suspend, + .resume = ad7879_resume, +}; + +static int __init ad7879_init(void) +{ + return spi_register_driver(&ad7879_driver); +} +module_init(ad7879_init); + +static void __exit ad7879_exit(void) +{ + spi_unregister_driver(&ad7879_driver); +} +module_exit(ad7879_exit); + +#elif defined(CONFIG_TOUCHSCREEN_AD7879_I2C) || defined(CONFIG_TOUCHSCREEN_AD7879_I2C_MODULE) + +/* All registers are word-sized. + * AD7879 uses a high-byte first convention. 
+ */ +static int ad7879_read(struct i2c_client *client, u8 reg) +{ + return swab16(i2c_smbus_read_word_data(client, reg)); +} + +static int ad7879_write(struct i2c_client *client, u8 reg, u16 val) +{ + return i2c_smbus_write_word_data(client, reg, swab16(val)); +} + +static void ad7879_collect(struct ad7879 *ts) +{ + int i; + + for (i = 0; i < AD7879_NR_SENSE; i++) + ts->conversion_data[i] = ad7879_read(ts->bus, + AD7879_REG_XPLUS + i); +} + +static int __devinit ad7879_probe(struct i2c_client *client, + const struct i2c_device_id *id) +{ + struct ad7879 *ts; + int error; + + if (!i2c_check_functionality(client->adapter, + I2C_FUNC_SMBUS_WORD_DATA)) { + dev_err(&client->dev, "SMBUS Word Data not Supported\n"); + return -EIO; + } + + ts = kzalloc(sizeof(struct ad7879), GFP_KERNEL); + if (!ts) + return -ENOMEM; + + i2c_set_clientdata(client, ts); + ts->bus = client; + + error = ad7879_construct(client, ts); + if (error) { + i2c_set_clientdata(client, NULL); + kfree(ts); + } + + return 0; +} + +static int __devexit ad7879_remove(struct i2c_client *client) +{ + struct ad7879 *ts = dev_get_drvdata(&client->dev); + + ad7879_destroy(client, ts); + i2c_set_clientdata(client, NULL); + kfree(ts); + + return 0; +} + +static const struct i2c_device_id ad7879_id[] = { + { "ad7879", 0 }, + { } +}; +MODULE_DEVICE_TABLE(i2c, ad7879_id); + +static struct i2c_driver ad7879_driver = { + .driver = { + .name = "ad7879", + .owner = THIS_MODULE, + }, + .probe = ad7879_probe, + .remove = __devexit_p(ad7879_remove), + .suspend = ad7879_suspend, + .resume = ad7879_resume, + .id_table = ad7879_id, +}; + +static int __init ad7879_init(void) +{ + return i2c_add_driver(&ad7879_driver); +} +module_init(ad7879_init); + +static void __exit ad7879_exit(void) +{ + i2c_del_driver(&ad7879_driver); +} +module_exit(ad7879_exit); +#endif + +MODULE_AUTHOR("Michael Hennerich "); +MODULE_DESCRIPTION("AD7879(-1) touchscreen Driver"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/spi/ad7879.h b/include/linux/spi/ad7879.h new file mode 100644 index 000000000000..4231104c9afa --- /dev/null +++ b/include/linux/spi/ad7879.h @@ -0,0 +1,35 @@ +/* linux/spi/ad7879.h */ + +/* Touchscreen characteristics vary between boards and models. The + * platform_data for the device's "struct device" holds this information. + * + * It's OK if the min/max values are zero. + */ +struct ad7879_platform_data { + u16 model; /* 7879 */ + u16 x_plate_ohms; + u16 x_min, x_max; + u16 y_min, y_max; + u16 pressure_min, pressure_max; + + /* [0..255] 0=OFF Starts at 1=550us and goes + * all the way to 9.440ms in steps of 35us. + */ + u8 pen_down_acc_interval; + /* [0..15] Starts at 0=128us and goes all the + * way to 4.096ms in steps of 128us. + */ + u8 first_conversion_delay; + /* [0..3] 0 = 2us, 1 = 4us, 2 = 8us, 3 = 16us */ + u8 acquisition_time; + /* [0..3] Average X middle samples 0 = 2, 1 = 4, 2 = 8, 3 = 16 */ + u8 averaging; + /* [0..3] Perform X measurements 0 = OFF, + * 1 = 4, 2 = 8, 3 = 16 (median > averaging) + */ + u8 median; + /* 1 = AUX/VBAT/GPIO set to GPIO Output */ + u8 gpio_output; + /* Initial GPIO pin state (valid if gpio_output = 1) */ + u8 gpio_default; +}; -- cgit v1.2.3 From 2939b0469d04ba9ac791aca9a81625d7eb50662b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 9 Mar 2009 15:47:18 -0400 Subject: tracing: replace TP with TP_ Impact: clean up The macros TPPROTO, TPARGS, TPFMT, TPRAWFMT, and TPCMD all look a bit ugly. This patch adds an underscore to their names. 
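For illustration, a minimal before/after sketch of the rename, using the same hypothetical
subsys_eventname tracepoint that Documentation/tracepoints.txt already uses (no new tracepoint
is added by this patch; this just mirrors the documentation hunk below):

	/* before */
	DECLARE_TRACE(subsys_eventname,
		TPPROTO(int firstarg, struct task_struct *p),
		TPARGS(firstarg, p));

	/* after */
	DECLARE_TRACE(subsys_eventname,
		TP_PROTO(int firstarg, struct task_struct *p),
		TP_ARGS(firstarg, p));

No functional change is intended; the macros only gain an underscore in their names.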
Signed-off-by: Steven Rostedt --- Documentation/tracepoints.txt | 8 +-- include/linux/tracepoint.h | 6 +-- include/trace/block.h | 70 ++++++++++++------------ include/trace/irq_event_types.h | 16 +++--- include/trace/lockdep_event_types.h | 24 ++++----- include/trace/power.h | 12 ++--- include/trace/sched_event_types.h | 98 +++++++++++++++++----------------- include/trace/workqueue.h | 16 +++--- kernel/trace/trace_event_types.h | 28 +++++----- kernel/trace/trace_events_stage_2.h | 6 +-- kernel/trace/trace_events_stage_3.h | 8 +-- kernel/trace/trace_export.c | 8 +-- kernel/trace/trace_format.h | 55 ------------------- samples/tracepoints/tp-samples-trace.h | 8 +-- 14 files changed, 154 insertions(+), 209 deletions(-) delete mode 100644 kernel/trace/trace_format.h (limited to 'include/linux') diff --git a/Documentation/tracepoints.txt b/Documentation/tracepoints.txt index 6f0a044f5b5e..4ff43c6de299 100644 --- a/Documentation/tracepoints.txt +++ b/Documentation/tracepoints.txt @@ -45,8 +45,8 @@ In include/trace/subsys.h : #include DECLARE_TRACE(subsys_eventname, - TPPROTO(int firstarg, struct task_struct *p), - TPARGS(firstarg, p)); + TP_PROTO(int firstarg, struct task_struct *p), + TP_ARGS(firstarg, p)); In subsys/file.c (where the tracing statement must be added) : @@ -66,10 +66,10 @@ Where : - subsys is the name of your subsystem. - eventname is the name of the event to trace. -- TPPROTO(int firstarg, struct task_struct *p) is the prototype of the +- TP_PROTO(int firstarg, struct task_struct *p) is the prototype of the function called by this tracepoint. -- TPARGS(firstarg, p) are the parameters names, same as found in the +- TP_ARGS(firstarg, p) are the parameters names, same as found in the prototype. Connecting a function (probe) to a tracepoint is done by providing a diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 152b2f03fb86..3bcc3e171443 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -31,8 +31,8 @@ struct tracepoint { * Keep in sync with vmlinux.lds.h. */ -#define TPPROTO(args...) args -#define TPARGS(args...) args +#define TP_PROTO(args...) args +#define TP_ARGS(args...) 
args #ifdef CONFIG_TRACEPOINTS @@ -65,7 +65,7 @@ struct tracepoint { { \ if (unlikely(__tracepoint_##name.state)) \ __DO_TRACE(&__tracepoint_##name, \ - TPPROTO(proto), TPARGS(args)); \ + TP_PROTO(proto), TP_ARGS(args)); \ } \ static inline int register_trace_##name(void (*probe)(proto)) \ { \ diff --git a/include/trace/block.h b/include/trace/block.h index 25c6a1fd5b77..25b7068b819e 100644 --- a/include/trace/block.h +++ b/include/trace/block.h @@ -5,72 +5,72 @@ #include DECLARE_TRACE(block_rq_abort, - TPPROTO(struct request_queue *q, struct request *rq), - TPARGS(q, rq)); + TP_PROTO(struct request_queue *q, struct request *rq), + TP_ARGS(q, rq)); DECLARE_TRACE(block_rq_insert, - TPPROTO(struct request_queue *q, struct request *rq), - TPARGS(q, rq)); + TP_PROTO(struct request_queue *q, struct request *rq), + TP_ARGS(q, rq)); DECLARE_TRACE(block_rq_issue, - TPPROTO(struct request_queue *q, struct request *rq), - TPARGS(q, rq)); + TP_PROTO(struct request_queue *q, struct request *rq), + TP_ARGS(q, rq)); DECLARE_TRACE(block_rq_requeue, - TPPROTO(struct request_queue *q, struct request *rq), - TPARGS(q, rq)); + TP_PROTO(struct request_queue *q, struct request *rq), + TP_ARGS(q, rq)); DECLARE_TRACE(block_rq_complete, - TPPROTO(struct request_queue *q, struct request *rq), - TPARGS(q, rq)); + TP_PROTO(struct request_queue *q, struct request *rq), + TP_ARGS(q, rq)); DECLARE_TRACE(block_bio_bounce, - TPPROTO(struct request_queue *q, struct bio *bio), - TPARGS(q, bio)); + TP_PROTO(struct request_queue *q, struct bio *bio), + TP_ARGS(q, bio)); DECLARE_TRACE(block_bio_complete, - TPPROTO(struct request_queue *q, struct bio *bio), - TPARGS(q, bio)); + TP_PROTO(struct request_queue *q, struct bio *bio), + TP_ARGS(q, bio)); DECLARE_TRACE(block_bio_backmerge, - TPPROTO(struct request_queue *q, struct bio *bio), - TPARGS(q, bio)); + TP_PROTO(struct request_queue *q, struct bio *bio), + TP_ARGS(q, bio)); DECLARE_TRACE(block_bio_frontmerge, - TPPROTO(struct request_queue *q, struct bio *bio), - TPARGS(q, bio)); + TP_PROTO(struct request_queue *q, struct bio *bio), + TP_ARGS(q, bio)); DECLARE_TRACE(block_bio_queue, - TPPROTO(struct request_queue *q, struct bio *bio), - TPARGS(q, bio)); + TP_PROTO(struct request_queue *q, struct bio *bio), + TP_ARGS(q, bio)); DECLARE_TRACE(block_getrq, - TPPROTO(struct request_queue *q, struct bio *bio, int rw), - TPARGS(q, bio, rw)); + TP_PROTO(struct request_queue *q, struct bio *bio, int rw), + TP_ARGS(q, bio, rw)); DECLARE_TRACE(block_sleeprq, - TPPROTO(struct request_queue *q, struct bio *bio, int rw), - TPARGS(q, bio, rw)); + TP_PROTO(struct request_queue *q, struct bio *bio, int rw), + TP_ARGS(q, bio, rw)); DECLARE_TRACE(block_plug, - TPPROTO(struct request_queue *q), - TPARGS(q)); + TP_PROTO(struct request_queue *q), + TP_ARGS(q)); DECLARE_TRACE(block_unplug_timer, - TPPROTO(struct request_queue *q), - TPARGS(q)); + TP_PROTO(struct request_queue *q), + TP_ARGS(q)); DECLARE_TRACE(block_unplug_io, - TPPROTO(struct request_queue *q), - TPARGS(q)); + TP_PROTO(struct request_queue *q), + TP_ARGS(q)); DECLARE_TRACE(block_split, - TPPROTO(struct request_queue *q, struct bio *bio, unsigned int pdu), - TPARGS(q, bio, pdu)); + TP_PROTO(struct request_queue *q, struct bio *bio, unsigned int pdu), + TP_ARGS(q, bio, pdu)); DECLARE_TRACE(block_remap, - TPPROTO(struct request_queue *q, struct bio *bio, dev_t dev, - sector_t from, sector_t to), - TPARGS(q, bio, dev, from, to)); + TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev, + sector_t from, sector_t to), + 
TP_ARGS(q, bio, dev, from, to)); #endif diff --git a/include/trace/irq_event_types.h b/include/trace/irq_event_types.h index 65850bc5ea06..0147d9eef5f4 100644 --- a/include/trace/irq_event_types.h +++ b/include/trace/irq_event_types.h @@ -9,25 +9,25 @@ #define TRACE_SYSTEM irq TRACE_EVENT_FORMAT(irq_handler_entry, - TPPROTO(int irq, struct irqaction *action), - TPARGS(irq, action), - TPFMT("irq=%d handler=%s", irq, action->name), + TP_PROTO(int irq, struct irqaction *action), + TP_ARGS(irq, action), + TP_FMT("irq=%d handler=%s", irq, action->name), TRACE_STRUCT( TRACE_FIELD(int, irq, irq) ), - TPRAWFMT("irq %d") + TP_RAW_FMT("irq %d") ); TRACE_EVENT_FORMAT(irq_handler_exit, - TPPROTO(int irq, struct irqaction *action, int ret), - TPARGS(irq, action, ret), - TPFMT("irq=%d handler=%s return=%s", + TP_PROTO(int irq, struct irqaction *action, int ret), + TP_ARGS(irq, action, ret), + TP_FMT("irq=%d handler=%s return=%s", irq, action->name, ret ? "handled" : "unhandled"), TRACE_STRUCT( TRACE_FIELD(int, irq, irq) TRACE_FIELD(int, ret, ret) ), - TPRAWFMT("irq %d ret %d") + TP_RAW_FMT("irq %d ret %d") ); #undef TRACE_SYSTEM diff --git a/include/trace/lockdep_event_types.h b/include/trace/lockdep_event_types.h index f713d74a82b4..1f00e8b3543e 100644 --- a/include/trace/lockdep_event_types.h +++ b/include/trace/lockdep_event_types.h @@ -10,32 +10,32 @@ #ifdef CONFIG_LOCKDEP TRACE_FORMAT(lock_acquire, - TPPROTO(struct lockdep_map *lock, unsigned int subclass, + TP_PROTO(struct lockdep_map *lock, unsigned int subclass, int trylock, int read, int check, struct lockdep_map *next_lock, unsigned long ip), - TPARGS(lock, subclass, trylock, read, check, next_lock, ip), - TPFMT("%s%s%s", trylock ? "try " : "", + TP_ARGS(lock, subclass, trylock, read, check, next_lock, ip), + TP_FMT("%s%s%s", trylock ? "try " : "", read ? 
"read " : "", lock->name) ); TRACE_FORMAT(lock_release, - TPPROTO(struct lockdep_map *lock, int nested, unsigned long ip), - TPARGS(lock, nested, ip), - TPFMT("%s", lock->name) + TP_PROTO(struct lockdep_map *lock, int nested, unsigned long ip), + TP_ARGS(lock, nested, ip), + TP_FMT("%s", lock->name) ); #ifdef CONFIG_LOCK_STAT TRACE_FORMAT(lock_contended, - TPPROTO(struct lockdep_map *lock, unsigned long ip), - TPARGS(lock, ip), - TPFMT("%s", lock->name) + TP_PROTO(struct lockdep_map *lock, unsigned long ip), + TP_ARGS(lock, ip), + TP_FMT("%s", lock->name) ); TRACE_FORMAT(lock_acquired, - TPPROTO(struct lockdep_map *lock, unsigned long ip), - TPARGS(lock, ip), - TPFMT("%s", lock->name) + TP_PROTO(struct lockdep_map *lock, unsigned long ip), + TP_ARGS(lock, ip), + TP_FMT("%s", lock->name) ); #endif diff --git a/include/trace/power.h b/include/trace/power.h index 38aca537e497..ef204666e983 100644 --- a/include/trace/power.h +++ b/include/trace/power.h @@ -18,15 +18,15 @@ struct power_trace { }; DECLARE_TRACE(power_start, - TPPROTO(struct power_trace *it, unsigned int type, unsigned int state), - TPARGS(it, type, state)); + TP_PROTO(struct power_trace *it, unsigned int type, unsigned int state), + TP_ARGS(it, type, state)); DECLARE_TRACE(power_mark, - TPPROTO(struct power_trace *it, unsigned int type, unsigned int state), - TPARGS(it, type, state)); + TP_PROTO(struct power_trace *it, unsigned int type, unsigned int state), + TP_ARGS(it, type, state)); DECLARE_TRACE(power_end, - TPPROTO(struct power_trace *it), - TPARGS(it)); + TP_PROTO(struct power_trace *it), + TP_ARGS(it)); #endif /* _TRACE_POWER_H */ diff --git a/include/trace/sched_event_types.h b/include/trace/sched_event_types.h index a6de5c1601a0..71b14828a957 100644 --- a/include/trace/sched_event_types.h +++ b/include/trace/sched_event_types.h @@ -9,143 +9,143 @@ #define TRACE_SYSTEM sched TRACE_EVENT_FORMAT(sched_kthread_stop, - TPPROTO(struct task_struct *t), - TPARGS(t), - TPFMT("task %s:%d", t->comm, t->pid), + TP_PROTO(struct task_struct *t), + TP_ARGS(t), + TP_FMT("task %s:%d", t->comm, t->pid), TRACE_STRUCT( TRACE_FIELD(pid_t, pid, t->pid) ), - TPRAWFMT("task %d") + TP_RAW_FMT("task %d") ); TRACE_EVENT_FORMAT(sched_kthread_stop_ret, - TPPROTO(int ret), - TPARGS(ret), - TPFMT("ret=%d", ret), + TP_PROTO(int ret), + TP_ARGS(ret), + TP_FMT("ret=%d", ret), TRACE_STRUCT( TRACE_FIELD(int, ret, ret) ), - TPRAWFMT("ret=%d") + TP_RAW_FMT("ret=%d") ); TRACE_EVENT_FORMAT(sched_wait_task, - TPPROTO(struct rq *rq, struct task_struct *p), - TPARGS(rq, p), - TPFMT("task %s:%d", p->comm, p->pid), + TP_PROTO(struct rq *rq, struct task_struct *p), + TP_ARGS(rq, p), + TP_FMT("task %s:%d", p->comm, p->pid), TRACE_STRUCT( TRACE_FIELD(pid_t, pid, p->pid) ), - TPRAWFMT("task %d") + TP_RAW_FMT("task %d") ); TRACE_EVENT_FORMAT(sched_wakeup, - TPPROTO(struct rq *rq, struct task_struct *p, int success), - TPARGS(rq, p, success), - TPFMT("task %s:%d %s", + TP_PROTO(struct rq *rq, struct task_struct *p, int success), + TP_ARGS(rq, p, success), + TP_FMT("task %s:%d %s", p->comm, p->pid, success ? "succeeded" : "failed"), TRACE_STRUCT( TRACE_FIELD(pid_t, pid, p->pid) TRACE_FIELD(int, success, success) ), - TPRAWFMT("task %d success=%d") + TP_RAW_FMT("task %d success=%d") ); TRACE_EVENT_FORMAT(sched_wakeup_new, - TPPROTO(struct rq *rq, struct task_struct *p, int success), - TPARGS(rq, p, success), - TPFMT("task %s:%d", + TP_PROTO(struct rq *rq, struct task_struct *p, int success), + TP_ARGS(rq, p, success), + TP_FMT("task %s:%d", p->comm, p->pid, success ? 
"succeeded" : "failed"), TRACE_STRUCT( TRACE_FIELD(pid_t, pid, p->pid) TRACE_FIELD(int, success, success) ), - TPRAWFMT("task %d success=%d") + TP_RAW_FMT("task %d success=%d") ); TRACE_EVENT_FORMAT(sched_switch, - TPPROTO(struct rq *rq, struct task_struct *prev, + TP_PROTO(struct rq *rq, struct task_struct *prev, struct task_struct *next), - TPARGS(rq, prev, next), - TPFMT("task %s:%d ==> %s:%d", + TP_ARGS(rq, prev, next), + TP_FMT("task %s:%d ==> %s:%d", prev->comm, prev->pid, next->comm, next->pid), TRACE_STRUCT( TRACE_FIELD(pid_t, prev_pid, prev->pid) TRACE_FIELD(int, prev_prio, prev->prio) TRACE_FIELD_SPECIAL(char next_comm[TASK_COMM_LEN], next_comm, - TPCMD(memcpy(TRACE_ENTRY->next_comm, + TP_CMD(memcpy(TRACE_ENTRY->next_comm, next->comm, TASK_COMM_LEN))) TRACE_FIELD(pid_t, next_pid, next->pid) TRACE_FIELD(int, next_prio, next->prio) ), - TPRAWFMT("prev %d:%d ==> next %s:%d:%d") + TP_RAW_FMT("prev %d:%d ==> next %s:%d:%d") ); TRACE_EVENT_FORMAT(sched_migrate_task, - TPPROTO(struct task_struct *p, int orig_cpu, int dest_cpu), - TPARGS(p, orig_cpu, dest_cpu), - TPFMT("task %s:%d from: %d to: %d", + TP_PROTO(struct task_struct *p, int orig_cpu, int dest_cpu), + TP_ARGS(p, orig_cpu, dest_cpu), + TP_FMT("task %s:%d from: %d to: %d", p->comm, p->pid, orig_cpu, dest_cpu), TRACE_STRUCT( TRACE_FIELD(pid_t, pid, p->pid) TRACE_FIELD(int, orig_cpu, orig_cpu) TRACE_FIELD(int, dest_cpu, dest_cpu) ), - TPRAWFMT("task %d from: %d to: %d") + TP_RAW_FMT("task %d from: %d to: %d") ); TRACE_EVENT_FORMAT(sched_process_free, - TPPROTO(struct task_struct *p), - TPARGS(p), - TPFMT("task %s:%d", p->comm, p->pid), + TP_PROTO(struct task_struct *p), + TP_ARGS(p), + TP_FMT("task %s:%d", p->comm, p->pid), TRACE_STRUCT( TRACE_FIELD(pid_t, pid, p->pid) ), - TPRAWFMT("task %d") + TP_RAW_FMT("task %d") ); TRACE_EVENT_FORMAT(sched_process_exit, - TPPROTO(struct task_struct *p), - TPARGS(p), - TPFMT("task %s:%d", p->comm, p->pid), + TP_PROTO(struct task_struct *p), + TP_ARGS(p), + TP_FMT("task %s:%d", p->comm, p->pid), TRACE_STRUCT( TRACE_FIELD(pid_t, pid, p->pid) ), - TPRAWFMT("task %d") + TP_RAW_FMT("task %d") ); TRACE_EVENT_FORMAT(sched_process_wait, - TPPROTO(struct pid *pid), - TPARGS(pid), - TPFMT("pid %d", pid_nr(pid)), + TP_PROTO(struct pid *pid), + TP_ARGS(pid), + TP_FMT("pid %d", pid_nr(pid)), TRACE_STRUCT( TRACE_FIELD(pid_t, pid, pid_nr(pid)) ), - TPRAWFMT("task %d") + TP_RAW_FMT("task %d") ); TRACE_EVENT_FORMAT(sched_process_fork, - TPPROTO(struct task_struct *parent, struct task_struct *child), - TPARGS(parent, child), - TPFMT("parent %s:%d child %s:%d", + TP_PROTO(struct task_struct *parent, struct task_struct *child), + TP_ARGS(parent, child), + TP_FMT("parent %s:%d child %s:%d", parent->comm, parent->pid, child->comm, child->pid), TRACE_STRUCT( TRACE_FIELD(pid_t, parent, parent->pid) TRACE_FIELD(pid_t, child, child->pid) ), - TPRAWFMT("parent %d child %d") + TP_RAW_FMT("parent %d child %d") ); TRACE_EVENT_FORMAT(sched_signal_send, - TPPROTO(int sig, struct task_struct *p), - TPARGS(sig, p), - TPFMT("sig: %d task %s:%d", sig, p->comm, p->pid), + TP_PROTO(int sig, struct task_struct *p), + TP_ARGS(sig, p), + TP_FMT("sig: %d task %s:%d", sig, p->comm, p->pid), TRACE_STRUCT( TRACE_FIELD(int, sig, sig) TRACE_FIELD(pid_t, pid, p->pid) ), - TPRAWFMT("sig: %d task %d") + TP_RAW_FMT("sig: %d task %d") ); #undef TRACE_SYSTEM diff --git a/include/trace/workqueue.h b/include/trace/workqueue.h index 867829df4571..7626523deeba 100644 --- a/include/trace/workqueue.h +++ b/include/trace/workqueue.h @@ -6,20 +6,20 @@ 
#include DECLARE_TRACE(workqueue_insertion, - TPPROTO(struct task_struct *wq_thread, struct work_struct *work), - TPARGS(wq_thread, work)); + TP_PROTO(struct task_struct *wq_thread, struct work_struct *work), + TP_ARGS(wq_thread, work)); DECLARE_TRACE(workqueue_execution, - TPPROTO(struct task_struct *wq_thread, struct work_struct *work), - TPARGS(wq_thread, work)); + TP_PROTO(struct task_struct *wq_thread, struct work_struct *work), + TP_ARGS(wq_thread, work)); /* Trace the creation of one workqueue thread on a cpu */ DECLARE_TRACE(workqueue_creation, - TPPROTO(struct task_struct *wq_thread, int cpu), - TPARGS(wq_thread, cpu)); + TP_PROTO(struct task_struct *wq_thread, int cpu), + TP_ARGS(wq_thread, cpu)); DECLARE_TRACE(workqueue_destruction, - TPPROTO(struct task_struct *wq_thread), - TPARGS(wq_thread)); + TP_PROTO(struct task_struct *wq_thread), + TP_ARGS(wq_thread)); #endif /* __TRACE_WORKQUEUE_H */ diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h index fb4eba166433..d94179aa1fc2 100644 --- a/kernel/trace/trace_event_types.h +++ b/kernel/trace/trace_event_types.h @@ -10,7 +10,7 @@ TRACE_EVENT_FORMAT(function, TRACE_FN, ftrace_entry, ignore, TRACE_FIELD(unsigned long, ip, ip) TRACE_FIELD(unsigned long, parent_ip, parent_ip) ), - TPRAWFMT(" %lx <-- %lx") + TP_RAW_FMT(" %lx <-- %lx") ); TRACE_EVENT_FORMAT(funcgraph_entry, TRACE_GRAPH_ENT, @@ -19,7 +19,7 @@ TRACE_EVENT_FORMAT(funcgraph_entry, TRACE_GRAPH_ENT, TRACE_FIELD(unsigned long, graph_ent.func, func) TRACE_FIELD(int, graph_ent.depth, depth) ), - TPRAWFMT("--> %lx (%d)") + TP_RAW_FMT("--> %lx (%d)") ); TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET, @@ -28,7 +28,7 @@ TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET, TRACE_FIELD(unsigned long, ret.func, func) TRACE_FIELD(int, ret.depth, depth) ), - TPRAWFMT("<-- %lx (%d)") + TP_RAW_FMT("<-- %lx (%d)") ); TRACE_EVENT_FORMAT(wakeup, TRACE_WAKE, ctx_switch_entry, ignore, @@ -41,7 +41,7 @@ TRACE_EVENT_FORMAT(wakeup, TRACE_WAKE, ctx_switch_entry, ignore, TRACE_FIELD(unsigned char, next_state, next_state) TRACE_FIELD(unsigned int, next_cpu, next_cpu) ), - TPRAWFMT("%u:%u:%u ==+ %u:%u:%u [%03u]") + TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]") ); TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore, @@ -54,7 +54,7 @@ TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore, TRACE_FIELD(unsigned char, next_state, next_state) TRACE_FIELD(unsigned int, next_cpu, next_cpu) ), - TPRAWFMT("%u:%u:%u ==+ %u:%u:%u [%03u]") + TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]") ); TRACE_EVENT_FORMAT(special, TRACE_SPECIAL, special_entry, ignore, @@ -63,7 +63,7 @@ TRACE_EVENT_FORMAT(special, TRACE_SPECIAL, special_entry, ignore, TRACE_FIELD(unsigned long, arg2, arg2) TRACE_FIELD(unsigned long, arg3, arg3) ), - TPRAWFMT("(%08lx) (%08lx) (%08lx)") + TP_RAW_FMT("(%08lx) (%08lx) (%08lx)") ); /* @@ -83,7 +83,7 @@ TRACE_EVENT_FORMAT(kernel_stack, TRACE_STACK, stack_entry, ignore, TRACE_FIELD(unsigned long, caller[6], stack6) TRACE_FIELD(unsigned long, caller[7], stack7) ), - TPRAWFMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" + TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n") ); @@ -98,7 +98,7 @@ TRACE_EVENT_FORMAT(user_stack, TRACE_USER_STACK, userstack_entry, ignore, TRACE_FIELD(unsigned long, caller[6], stack6) TRACE_FIELD(unsigned long, caller[7], stack7) ), - TPRAWFMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" + TP_RAW_FMT("\t=> 
(%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n") ); @@ -108,7 +108,7 @@ TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore, TRACE_FIELD(unsigned int, depth, depth) TRACE_FIELD_ZERO_CHAR(buf) ), - TPRAWFMT("%08lx (%d) %s") + TP_RAW_FMT("%08lx (%d) %s") ); TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore, @@ -118,7 +118,7 @@ TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore, TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file, file) TRACE_FIELD(char, correct, correct) ), - TPRAWFMT("%u:%s:%s (%u)") + TP_RAW_FMT("%u:%s:%s (%u)") ); TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore, @@ -126,7 +126,7 @@ TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore, TRACE_FIELD(u64, from, from) TRACE_FIELD(u64, to, to) ), - TPRAWFMT("from: %llx to: %llx") + TP_RAW_FMT("from: %llx to: %llx") ); TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore, @@ -136,7 +136,7 @@ TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore, TRACE_FIELD(int, state_data.type, type) TRACE_FIELD(int, state_data.state, state) ), - TPRAWFMT("%llx->%llx type:%u state:%u") + TP_RAW_FMT("%llx->%llx type:%u state:%u") ); TRACE_EVENT_FORMAT(kmem_alloc, TRACE_KMEM_ALLOC, kmemtrace_alloc_entry, ignore, @@ -149,7 +149,7 @@ TRACE_EVENT_FORMAT(kmem_alloc, TRACE_KMEM_ALLOC, kmemtrace_alloc_entry, ignore, TRACE_FIELD(gfp_t, gfp_flags, gfp_flags) TRACE_FIELD(int, node, node) ), - TPRAWFMT("type:%u call_site:%lx ptr:%p req:%lu alloc:%lu" + TP_RAW_FMT("type:%u call_site:%lx ptr:%p req:%lu alloc:%lu" " flags:%x node:%d") ); @@ -159,7 +159,7 @@ TRACE_EVENT_FORMAT(kmem_free, TRACE_KMEM_FREE, kmemtrace_free_entry, ignore, TRACE_FIELD(unsigned long, call_site, call_site) TRACE_FIELD(const void *, ptr, ptr) ), - TPRAWFMT("type:%u call_site:%lx ptr:%p") + TP_RAW_FMT("type:%u call_site:%lx ptr:%p") ); #undef TRACE_SYSTEM diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h index d24a97e74aea..8e2e0f56c2a8 100644 --- a/kernel/trace/trace_events_stage_2.h +++ b/kernel/trace/trace_events_stage_2.h @@ -20,7 +20,7 @@ * * field = (typeof(field))entry; * - * ret = trace_seq_printf(s, "%s", "\n"); + * ret = trace_seq_printf(s, "%s", "\n"); * if (!ret) * return TRACE_TYPE_PARTIAL_LINE; * @@ -44,8 +44,8 @@ field->item, -#undef TPRAWFMT -#define TPRAWFMT(args...) args +#undef TP_RAW_FMT +#define TP_RAW_FMT(args...) args #undef TRACE_EVENT_FORMAT #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index 2c8d76c7dbed..557ca52bcdcd 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -106,8 +106,8 @@ * */ -#undef TPFMT -#define TPFMT(fmt, args...) fmt "\n", ##args +#undef TP_FMT +#define TP_FMT(fmt, args...) fmt "\n", ##args #define _TRACE_FORMAT(call, proto, args, fmt) \ static void ftrace_event_##call(proto) \ @@ -152,8 +152,8 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ #define TRACE_FIELD(type, item, assign)\ entry->item = assign; -#undef TPCMD -#define TPCMD(cmd...) cmd +#undef TP_CMD +#define TP_CMD(cmd...) cmd #undef TRACE_ENTRY #define TRACE_ENTRY entry diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 7162ab49d05d..e62bc10f8103 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -26,8 +26,8 @@ return 0; -#undef TPRAWFMT -#define TPRAWFMT(args...) 
args +#undef TP_RAW_FMT +#define TP_RAW_FMT(args...) args #undef TRACE_EVENT_FORMAT #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ @@ -57,8 +57,8 @@ ftrace_format_##call(struct trace_seq *s) \ #define TRACE_FIELD(type, item, assign)\ entry->item = assign; -#undef TPCMD -#define TPCMD(cmd...) cmd +#undef TP_CMD +#define TP_CMD(cmd...) cmd #undef TRACE_ENTRY #define TRACE_ENTRY entry diff --git a/kernel/trace/trace_format.h b/kernel/trace/trace_format.h deleted file mode 100644 index 97e59a9c82ea..000000000000 --- a/kernel/trace/trace_format.h +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Setup the showing format of trace point. - * - * int - * ftrace_format_##call(struct trace_seq *s) - * { - * struct ftrace_raw_##call field; - * int ret; - * - * ret = trace_seq_printf(s, #type " " #item ";" - * " size:%d; offset:%d;\n", - * sizeof(field.type), - * offsetof(struct ftrace_raw_##call, - * item)); - * - * } - */ - -#undef TRACE_STRUCT -#define TRACE_STRUCT(args...) args - -#undef TRACE_FIELD -#define TRACE_FIELD(type, item, assign) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%u;\tsize:%u;\n", \ - (unsigned int)offsetof(typeof(field), item), \ - (unsigned int)sizeof(field.item)); \ - if (!ret) \ - return 0; - - -#undef TRACE_FIELD_SPECIAL -#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ - ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t" \ - "offset:%u;\tsize:%u;\n", \ - (unsigned int)offsetof(typeof(field), item), \ - (unsigned int)sizeof(field.item)); \ - if (!ret) \ - return 0; - -#undef TRACE_EVENT_FORMAT -#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ -static int \ -ftrace_format_##call(struct trace_seq *s) \ -{ \ - struct ftrace_raw_##call field; \ - int ret; \ - \ - tstruct; \ - \ - trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \ - \ - return ret; \ -} - diff --git a/samples/tracepoints/tp-samples-trace.h b/samples/tracepoints/tp-samples-trace.h index 01724e04c556..dffdc49878af 100644 --- a/samples/tracepoints/tp-samples-trace.h +++ b/samples/tracepoints/tp-samples-trace.h @@ -5,9 +5,9 @@ #include DECLARE_TRACE(subsys_event, - TPPROTO(struct inode *inode, struct file *file), - TPARGS(inode, file)); + TP_PROTO(struct inode *inode, struct file *file), + TP_ARGS(inode, file)); DECLARE_TRACE(subsys_eventb, - TPPROTO(void), - TPARGS()); + TP_PROTO(void), + TP_ARGS()); #endif -- cgit v1.2.3 From da4d03020c2af32f73e8bfbab0a66620d85bb9bb Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 9 Mar 2009 17:14:30 -0400 Subject: tracing: new format for specialized trace points Impact: clean up and enhancement The TRACE_EVENT_FORMAT macro looks quite ugly and is limited in its ability to save data as well as to print the record out. Working with Ingo Molnar, we came up with a new format that is much more pleasing to the eye of C developers. This new macro is more C style than the old macro, and is more obvious to what it does. Here's the example. The only updated macro in this patch is the sched_switch trace point. 
The old method looked like this: TRACE_EVENT_FORMAT(sched_switch, TP_PROTO(struct rq *rq, struct task_struct *prev, struct task_struct *next), TP_ARGS(rq, prev, next), TP_FMT("task %s:%d ==> %s:%d", prev->comm, prev->pid, next->comm, next->pid), TRACE_STRUCT( TRACE_FIELD(pid_t, prev_pid, prev->pid) TRACE_FIELD(int, prev_prio, prev->prio) TRACE_FIELD_SPECIAL(char next_comm[TASK_COMM_LEN], next_comm, TP_CMD(memcpy(TRACE_ENTRY->next_comm, next->comm, TASK_COMM_LEN))) TRACE_FIELD(pid_t, next_pid, next->pid) TRACE_FIELD(int, next_prio, next->prio) ), TP_RAW_FMT("prev %d:%d ==> next %s:%d:%d") ); The above method is hard to read and requires two format fields. The new method: /* * Tracepoint for task switches, performed by the scheduler: * * (NOTE: the 'rq' argument is not used by generic trace events, * but used by the latency tracer plugin. ) */ TRACE_EVENT(sched_switch, TP_PROTO(struct rq *rq, struct task_struct *prev, struct task_struct *next), TP_ARGS(rq, prev, next), TP_STRUCT__entry( __array( char, prev_comm, TASK_COMM_LEN ) __field( pid_t, prev_pid ) __field( int, prev_prio ) __array( char, next_comm, TASK_COMM_LEN ) __field( pid_t, next_pid ) __field( int, next_prio ) ), TP_printk("task %s:%d [%d] ==> %s:%d [%d]", __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, __entry->next_comm, __entry->next_pid, __entry->next_prio), TP_fast_assign( memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); __entry->prev_pid = prev->pid; __entry->prev_prio = prev->prio; memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); __entry->next_pid = next->pid; __entry->next_prio = next->prio; ) ); This macro is called TRACE_EVENT, it is broken up into 5 parts: TP_PROTO: the proto type of the trace point TP_ARGS: the arguments of the trace point TP_STRUCT_entry: the structure layout of the entry in the ring buffer TP_printk: the printk format TP_fast_assign: the method used to write the entry into the ring buffer The structure is the definition of how the event will be saved in the ring buffer. The printk is used by the internal tracing in case of an oops, and the kernel needs to print out the format of the record to the console. This the TP_printk gives a means to show the records in a human readable format. It is also used to print out the data from the trace file. The TP_fast_assign is executed directly. It is basically like a C function, where the __entry is the handle to the record. 
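To make the five parts concrete, here is a minimal, hypothetical event written in the
same shape as the sched_switch definition above (the event name and fields are
illustrative only and are not taken from this patch):

TRACE_EVENT(subsys_example,

	TP_PROTO(int cpu, unsigned long count),

	TP_ARGS(cpu, count),

	TP_STRUCT__entry(
		__field(	int,		cpu	)
		__field(	unsigned long,	count	)
	),

	TP_printk("cpu=%d count=%lu", __entry->cpu, __entry->count),

	TP_fast_assign(
		__entry->cpu	= cpu;
		__entry->count	= count;
	)
);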
Signed-off-by: Steven Rostedt --- include/linux/tracepoint.h | 3 + include/trace/sched_event_types.h | 48 +++++++---- kernel/trace/trace.h | 5 -- kernel/trace/trace_event_types.h | 3 +- kernel/trace/trace_events.c | 159 +----------------------------------- kernel/trace/trace_events_stage_1.h | 28 ++++--- kernel/trace/trace_events_stage_2.h | 89 ++++++++++++++++---- kernel/trace/trace_events_stage_3.h | 34 +++----- kernel/trace/trace_export.c | 23 +++++- 9 files changed, 159 insertions(+), 233 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 3bcc3e171443..6b4f1bb3701e 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -160,4 +160,7 @@ static inline void tracepoint_synchronize_unregister(void) #define TRACE_EVENT_FORMAT(name, proto, args, fmt, struct, tpfmt) \ TRACE_FORMAT(name, PARAMS(proto), PARAMS(args), PARAMS(fmt)) +#define TRACE_EVENT(name, proto, args, struct, print, assign) \ + DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) + #endif diff --git a/include/trace/sched_event_types.h b/include/trace/sched_event_types.h index 71b14828a957..aa77fb754038 100644 --- a/include/trace/sched_event_types.h +++ b/include/trace/sched_event_types.h @@ -62,25 +62,41 @@ TRACE_EVENT_FORMAT(sched_wakeup_new, TP_RAW_FMT("task %d success=%d") ); -TRACE_EVENT_FORMAT(sched_switch, +/* + * Tracepoint for task switches, performed by the scheduler: + * + * (NOTE: the 'rq' argument is not used by generic trace events, + * but used by the latency tracer plugin. ) + */ +TRACE_EVENT(sched_switch, + TP_PROTO(struct rq *rq, struct task_struct *prev, - struct task_struct *next), + struct task_struct *next), + TP_ARGS(rq, prev, next), - TP_FMT("task %s:%d ==> %s:%d", - prev->comm, prev->pid, next->comm, next->pid), - TRACE_STRUCT( - TRACE_FIELD(pid_t, prev_pid, prev->pid) - TRACE_FIELD(int, prev_prio, prev->prio) - TRACE_FIELD_SPECIAL(char next_comm[TASK_COMM_LEN], - next_comm, - TP_CMD(memcpy(TRACE_ENTRY->next_comm, - next->comm, - TASK_COMM_LEN))) - TRACE_FIELD(pid_t, next_pid, next->pid) - TRACE_FIELD(int, next_prio, next->prio) + + TP_STRUCT__entry( + __array( char, prev_comm, TASK_COMM_LEN ) + __field( pid_t, prev_pid ) + __field( int, prev_prio ) + __array( char, next_comm, TASK_COMM_LEN ) + __field( pid_t, next_pid ) + __field( int, next_prio ) ), - TP_RAW_FMT("prev %d:%d ==> next %s:%d:%d") - ); + + TP_printk("task %s:%d [%d] ==> %s:%d [%d]", + __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, + __entry->next_comm, __entry->next_pid, __entry->next_prio), + + TP_fast_assign( + memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); + __entry->prev_pid = prev->pid; + __entry->prev_prio = prev->prio; + memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); + __entry->next_pid = next->pid; + __entry->next_prio = next->prio; + ) +); TRACE_EVENT_FORMAT(sched_migrate_task, TP_PROTO(struct task_struct *p, int orig_cpu, int dest_cpu), diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2bfb7d11fc17..c5e1d8865fe4 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -751,12 +751,7 @@ struct ftrace_event_call { int (*regfunc)(void); void (*unregfunc)(void); int id; - struct dentry *raw_dir; - int raw_enabled; - int type; int (*raw_init)(void); - int (*raw_reg)(void); - void (*raw_unreg)(void); int (*show_format)(struct trace_seq *s); }; diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h index d94179aa1fc2..5cca4c978bde 100644 --- a/kernel/trace/trace_event_types.h +++ 
b/kernel/trace/trace_event_types.h @@ -106,9 +106,10 @@ TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore, TRACE_STRUCT( TRACE_FIELD(unsigned long, ip, ip) TRACE_FIELD(unsigned int, depth, depth) + TRACE_FIELD(char *, fmt, fmt) TRACE_FIELD_ZERO_CHAR(buf) ), - TP_RAW_FMT("%08lx (%d) %s") + TP_RAW_FMT("%08lx (%d) fmt:%p %s") ); TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore, diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index fa32ca320767..1880a6438097 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -59,22 +59,12 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call, call->enabled = 0; call->unregfunc(); } - if (call->raw_enabled) { - call->raw_enabled = 0; - call->raw_unreg(); - } break; case 1: - if (!call->enabled && - (call->type & TRACE_EVENT_TYPE_PRINTF)) { + if (!call->enabled) { call->enabled = 1; call->regfunc(); } - if (!call->raw_enabled && - (call->type & TRACE_EVENT_TYPE_RAW)) { - call->raw_enabled = 1; - call->raw_reg(); - } break; } } @@ -300,7 +290,7 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, struct ftrace_event_call *call = filp->private_data; char *buf; - if (call->enabled || call->raw_enabled) + if (call->enabled) buf = "1\n"; else buf = "0\n"; @@ -346,107 +336,6 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, return cnt; } -static ssize_t -event_type_read(struct file *filp, char __user *ubuf, size_t cnt, - loff_t *ppos) -{ - struct ftrace_event_call *call = filp->private_data; - char buf[16]; - int r = 0; - - if (call->type & TRACE_EVENT_TYPE_PRINTF) - r += sprintf(buf, "printf\n"); - - if (call->type & TRACE_EVENT_TYPE_RAW) - r += sprintf(buf+r, "raw\n"); - - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -} - -static ssize_t -event_type_write(struct file *filp, const char __user *ubuf, size_t cnt, - loff_t *ppos) -{ - struct ftrace_event_call *call = filp->private_data; - char buf[64]; - - /* - * If there's only one type, we can't change it. - * And currently we always have printf type, and we - * may or may not have raw type. - * - * This is a redundant check, the file should be read - * only if this is the case anyway. - */ - - if (!call->raw_init) - return -EPERM; - - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - if (!strncmp(buf, "printf", 6) && - (!buf[6] || isspace(buf[6]))) { - - call->type = TRACE_EVENT_TYPE_PRINTF; - - /* - * If raw enabled, the disable it and enable - * printf type. - */ - if (call->raw_enabled) { - call->raw_enabled = 0; - call->raw_unreg(); - - call->enabled = 1; - call->regfunc(); - } - - } else if (!strncmp(buf, "raw", 3) && - (!buf[3] || isspace(buf[3]))) { - - call->type = TRACE_EVENT_TYPE_RAW; - - /* - * If printf enabled, the disable it and enable - * raw type. 
- */ - if (call->enabled) { - call->enabled = 0; - call->unregfunc(); - - call->raw_enabled = 1; - call->raw_reg(); - } - } else - return -EINVAL; - - *ppos += cnt; - - return cnt; -} - -static ssize_t -event_available_types_read(struct file *filp, char __user *ubuf, size_t cnt, - loff_t *ppos) -{ - struct ftrace_event_call *call = filp->private_data; - char buf[16]; - int r = 0; - - r += sprintf(buf, "printf\n"); - - if (call->raw_init) - r += sprintf(buf+r, "raw\n"); - - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -} - #undef FIELD #define FIELD(type, name) \ #type, #name, (unsigned int)offsetof(typeof(field), name), \ @@ -470,6 +359,7 @@ static int trace_write_header(struct trace_seq *s) FIELD(int, pid), FIELD(int, tgid)); } + static ssize_t event_format_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -527,13 +417,6 @@ static const struct seq_operations show_set_event_seq_ops = { .stop = t_stop, }; -static const struct file_operations ftrace_avail_fops = { - .open = ftrace_event_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - static const struct file_operations ftrace_set_event_fops = { .open = ftrace_event_seq_open, .read = seq_read, @@ -548,17 +431,6 @@ static const struct file_operations ftrace_enable_fops = { .write = event_enable_write, }; -static const struct file_operations ftrace_type_fops = { - .open = tracing_open_generic, - .read = event_type_read, - .write = event_type_write, -}; - -static const struct file_operations ftrace_available_types_fops = { - .open = tracing_open_generic, - .read = event_available_types_read, -}; - static const struct file_operations ftrace_event_format_fops = { .open = tracing_open_generic, .read = event_format_read, @@ -647,9 +519,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) } } - /* default the output to printf */ - call->type = TRACE_EVENT_TYPE_PRINTF; - call->dir = debugfs_create_dir(call->name, d_events); if (!call->dir) { pr_warning("Could not create debugfs " @@ -665,21 +534,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) "'%s/enable' entry\n", call->name); } - /* Only let type be writable, if we can change it */ - entry = debugfs_create_file("type", - call->raw_init ? 
0644 : 0444, - call->dir, call, - &ftrace_type_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'%s/type' entry\n", call->name); - - entry = debugfs_create_file("available_types", 0444, call->dir, call, - &ftrace_available_types_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'%s/available_types' entry\n", call->name); - /* A trace may not want to export its format */ if (!call->show_format) return 0; @@ -704,13 +558,6 @@ static __init int event_trace_init(void) if (!d_tracer) return 0; - entry = debugfs_create_file("available_events", 0444, d_tracer, - (void *)&show_event_seq_ops, - &ftrace_avail_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'available_events' entry\n"); - entry = debugfs_create_file("set_event", 0644, d_tracer, (void *)&show_set_event_seq_ops, &ftrace_set_event_fops); diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h index 3830a731424c..edfcbd3a0d1b 100644 --- a/kernel/trace/trace_events_stage_1.h +++ b/kernel/trace/trace_events_stage_1.h @@ -18,19 +18,23 @@ #define TRACE_FORMAT(call, proto, args, fmt) #undef TRACE_EVENT_FORMAT -#define TRACE_EVENT_FORMAT(name, proto, args, fmt, tstruct, tpfmt) \ - struct ftrace_raw_##name { \ - struct trace_entry ent; \ - tstruct \ - }; \ - static struct ftrace_event_call event_##name +#define TRACE_EVENT_FORMAT(name, proto, args, fmt, tstruct, tpfmt) + +#undef __array +#define __array(type, item, len) type item[len]; -#undef TRACE_STRUCT -#define TRACE_STRUCT(args...) args +#undef __field +#define __field(type, item) type item; -#define TRACE_FIELD(type, item, assign) \ - type item; -#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ - type_item; +#undef TP_STRUCT__entry +#define TP_STRUCT__entry(args...) args + +#undef TRACE_EVENT +#define TRACE_EVENT(name, proto, args, tstruct, print, assign) \ + struct ftrace_raw_##name { \ + struct trace_entry ent; \ + tstruct \ + }; \ + static struct ftrace_event_call event_##name #include diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h index 8e2e0f56c2a8..d91bf4c56661 100644 --- a/kernel/trace/trace_events_stage_2.h +++ b/kernel/trace/trace_events_stage_2.h @@ -32,23 +32,14 @@ * in binary. */ -#undef TRACE_STRUCT -#define TRACE_STRUCT(args...) args +#undef __entry +#define __entry field -#undef TRACE_FIELD -#define TRACE_FIELD(type, item, assign) \ - field->item, +#undef TP_printk +#define TP_printk(fmt, args...) fmt "\n", args -#undef TRACE_FIELD_SPECIAL -#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ - field->item, - - -#undef TP_RAW_FMT -#define TP_RAW_FMT(args...) args - -#undef TRACE_EVENT_FORMAT -#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ +#undef TRACE_EVENT +#define TRACE_EVENT(call, proto, args, tstruct, print, assign) \ enum print_line_t \ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ { \ @@ -66,14 +57,76 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ \ field = (typeof(field))entry; \ \ - ret = trace_seq_printf(s, tpfmt "%s", tstruct "\n"); \ + ret = trace_seq_printf(s, print); \ if (!ret) \ return TRACE_TYPE_PARTIAL_LINE; \ \ return TRACE_TYPE_HANDLED; \ } - + #include -#include "trace_format.h" +/* + * Setup the showing format of trace point. 
+ * + * int + * ftrace_format_##call(struct trace_seq *s) + * { + * struct ftrace_raw_##call field; + * int ret; + * + * ret = trace_seq_printf(s, #type " " #item ";" + * " size:%d; offset:%d;\n", + * sizeof(field.type), + * offsetof(struct ftrace_raw_##call, + * item)); + * + * } + */ + +#undef TP_STRUCT__entry +#define TP_STRUCT__entry(args...) args + +#undef __field +#define __field(type, item) \ + ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ + "offset:%u;\tsize:%u;\n", \ + (unsigned int)offsetof(typeof(field), item), \ + (unsigned int)sizeof(field.item)); \ + if (!ret) \ + return 0; + +#undef __array +#define __array(type, item, len) \ + ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ + "offset:%u;\tsize:%u;\n", \ + (unsigned int)offsetof(typeof(field), item), \ + (unsigned int)sizeof(field.item)); \ + if (!ret) \ + return 0; + +#undef __entry +#define __entry "REC" + +#undef TP_printk +#define TP_printk(fmt, args...) "%s, %s\n", #fmt, #args + +#undef TP_fast_assign +#define TP_fast_assign(args...) args + +#undef TRACE_EVENT +#define TRACE_EVENT(call, proto, args, tstruct, print, func) \ +static int \ +ftrace_format_##call(struct trace_seq *s) \ +{ \ + struct ftrace_raw_##call field; \ + int ret; \ + \ + tstruct; \ + \ + trace_seq_printf(s, "\nprint fmt: " print); \ + \ + return ret; \ +} + #include diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index 41b82b93c9c7..8e398d864096 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -144,27 +144,15 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ .unregfunc = ftrace_unreg_event_##call, \ } -#undef TRACE_FIELD -#define TRACE_FIELD(type, item, assign)\ - entry->item = assign; - -#undef TRACE_FIELD -#define TRACE_FIELD(type, item, assign)\ - entry->item = assign; - -#undef TP_CMD -#define TP_CMD(cmd...) 
cmd - -#undef TRACE_ENTRY -#define TRACE_ENTRY entry +#undef TRACE_EVENT_FORMAT +#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, raw) \ + TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt)) -#undef TRACE_FIELD_SPECIAL -#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ - cmd; +#undef __entry +#define __entry entry -#undef TRACE_EVENT_FORMAT -#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ -_TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt)) \ +#undef TRACE_EVENT +#define TRACE_EVENT(call, proto, args, tstruct, print, assign) \ \ static struct ftrace_event_call event_##call; \ \ @@ -185,7 +173,7 @@ static void ftrace_raw_event_##call(proto) \ return; \ entry = ring_buffer_event_data(event); \ \ - tstruct; \ + assign; \ \ trace_current_buffer_unlock_commit(event, irq_flags, pc); \ } \ @@ -226,10 +214,8 @@ __attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_events"))) event_##call = { \ .name = #call, \ .system = __stringify(TRACE_SYSTEM), \ - .regfunc = ftrace_reg_event_##call, \ - .unregfunc = ftrace_unreg_event_##call, \ .raw_init = ftrace_raw_init_event_##call, \ - .raw_reg = ftrace_raw_reg_event_##call, \ - .raw_unreg = ftrace_raw_unreg_event_##call, \ + .regfunc = ftrace_raw_reg_event_##call, \ + .unregfunc = ftrace_raw_unreg_event_##call, \ .show_format = ftrace_format_##call, \ } diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index e62bc10f8103..23ae78430d58 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -15,7 +15,28 @@ #include "trace_output.h" -#include "trace_format.h" + +#undef TRACE_STRUCT +#define TRACE_STRUCT(args...) args + +#undef TRACE_FIELD +#define TRACE_FIELD(type, item, assign) \ + ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ + "offset:%u;\tsize:%u;\n", \ + (unsigned int)offsetof(typeof(field), item), \ + (unsigned int)sizeof(field.item)); \ + if (!ret) \ + return 0; + + +#undef TRACE_FIELD_SPECIAL +#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ + ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t" \ + "offset:%u;\tsize:%u;\n", \ + (unsigned int)offsetof(typeof(field), item), \ + (unsigned int)sizeof(field.item)); \ + if (!ret) \ + return 0; #undef TRACE_FIELD_ZERO_CHAR #define TRACE_FIELD_ZERO_CHAR(item) \ -- cgit v1.2.3 From 157587d7ac555458da9f682e3250135e468470a6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 10 Mar 2009 00:15:34 -0400 Subject: tracing: remove obsolete TRACE_EVENT_FORMAT macro Impact: clean up The TRACE_EVENT_FORMAT macro is no longer used by trace points and only the DECLARE_TRACE, TRACE_FORMAT or TRACE_EVENT macros should be used by them. Although the TRACE_EVENT_FORMAT macro is still used by the internal tracing utility, it should not be used in core kernel code. 
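For a tracepoint that does not need the binary record machinery, DECLARE_TRACE is
still used directly. A sketch following the pattern of
samples/tracepoints/tp-samples-trace.h (the event name here is made up):

DECLARE_TRACE(subsys_example_event,
	TP_PROTO(struct inode *inode, struct file *file),
	TP_ARGS(inode, file));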
Signed-off-by: Steven Rostedt --- include/linux/tracepoint.h | 3 --- include/trace/lockdep_event_types.h | 2 +- include/trace/sched_event_types.h | 2 +- kernel/trace/trace_events_stage_1.h | 3 --- kernel/trace/trace_events_stage_3.h | 6 +----- 5 files changed, 3 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 6b4f1bb3701e..69b56988813d 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -157,9 +157,6 @@ static inline void tracepoint_synchronize_unregister(void) #define TRACE_FORMAT(name, proto, args, fmt) \ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) -#define TRACE_EVENT_FORMAT(name, proto, args, fmt, struct, tpfmt) \ - TRACE_FORMAT(name, PARAMS(proto), PARAMS(args), PARAMS(fmt)) - #define TRACE_EVENT(name, proto, args, struct, print, assign) \ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) diff --git a/include/trace/lockdep_event_types.h b/include/trace/lockdep_event_types.h index 1f00e8b3543e..adccfcd2ec8f 100644 --- a/include/trace/lockdep_event_types.h +++ b/include/trace/lockdep_event_types.h @@ -1,5 +1,5 @@ -#ifndef TRACE_EVENT_FORMAT +#ifndef TRACE_FORMAT # error Do not include this file directly. # error Unless you know what you are doing. #endif diff --git a/include/trace/sched_event_types.h b/include/trace/sched_event_types.h index 0bbbf410e01f..fb37af672c88 100644 --- a/include/trace/sched_event_types.h +++ b/include/trace/sched_event_types.h @@ -1,6 +1,6 @@ /* use instead */ -#ifndef TRACE_EVENT_FORMAT +#ifndef TRACE_EVENT # error Do not include this file directly. # error Unless you know what you are doing. #endif diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h index edfcbd3a0d1b..15e9bf965a18 100644 --- a/kernel/trace/trace_events_stage_1.h +++ b/kernel/trace/trace_events_stage_1.h @@ -17,9 +17,6 @@ #undef TRACE_FORMAT #define TRACE_FORMAT(call, proto, args, fmt) -#undef TRACE_EVENT_FORMAT -#define TRACE_EVENT_FORMAT(name, proto, args, fmt, tstruct, tpfmt) - #undef __array #define __array(type, item, len) type item[len]; diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index 8e398d864096..3ba55d4ab073 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -35,7 +35,7 @@ * } * * - * For those macros defined with TRACE_EVENT_FORMAT: + * For those macros defined with TRACE_EVENT: * * static struct ftrace_event_call event_; * @@ -144,10 +144,6 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ .unregfunc = ftrace_unreg_event_##call, \ } -#undef TRACE_EVENT_FORMAT -#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, raw) \ - TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt)) - #undef __entry #define __entry entry -- cgit v1.2.3 From 30a8fecc2d34f086df34fe2f2b926f080e002600 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 10 Mar 2009 12:41:38 -0400 Subject: tracing: flip the TP_printk and TP_fast_assign in the TRACE_EVENT macro Impact: clean up In trying to stay consistant with the C style format in the TRACE_EVENT macro, it makes more sense to do the printk after the assigning of the variables. 
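In other words, a TRACE_EVENT() definition now reads top to bottom as: declare the
entry layout, fill it in, then print it. A hypothetical event in the new ordering,
mirroring the irq_handler_exit hunk below (names are illustrative only):

TRACE_EVENT(subsys_example,

	TP_PROTO(int irq, int ret),

	TP_ARGS(irq, ret),

	TP_STRUCT__entry(
		__field(	int,	irq	)
		__field(	int,	ret	)
	),

	TP_fast_assign(
		__entry->irq	= irq;
		__entry->ret	= ret;
	),

	TP_printk("irq=%d return=%s",
		__entry->irq, __entry->ret ? "handled" : "unhandled")
);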
Reported-by: Ingo Molnar Signed-off-by: Steven Rostedt --- include/linux/tracepoint.h | 2 +- include/trace/irq_event_types.h | 8 +-- include/trace/sched_event_types.h | 102 ++++++++++++++++++------------------ kernel/trace/trace_events_stage_1.h | 2 +- kernel/trace/trace_events_stage_2.h | 4 +- kernel/trace/trace_events_stage_3.h | 2 +- 6 files changed, 60 insertions(+), 60 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 69b56988813d..c7b09452514b 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -157,7 +157,7 @@ static inline void tracepoint_synchronize_unregister(void) #define TRACE_FORMAT(name, proto, args, fmt) \ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) -#define TRACE_EVENT(name, proto, args, struct, print, assign) \ +#define TRACE_EVENT(name, proto, args, struct, assign, print) \ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) #endif diff --git a/include/trace/irq_event_types.h b/include/trace/irq_event_types.h index 43bcb74dd49f..214bb928fe9e 100644 --- a/include/trace/irq_event_types.h +++ b/include/trace/irq_event_types.h @@ -31,13 +31,13 @@ TRACE_EVENT(irq_handler_exit, __field( int, ret ) ), - TP_printk("irq=%d return=%s", - __entry->irq, __entry->ret ? "handled" : "unhandled"), - TP_fast_assign( __entry->irq = irq; __entry->ret = ret; - ) + ), + + TP_printk("irq=%d return=%s", + __entry->irq, __entry->ret ? "handled" : "unhandled") ); #undef TRACE_SYSTEM diff --git a/include/trace/sched_event_types.h b/include/trace/sched_event_types.h index fb37af672c88..63547dc1125f 100644 --- a/include/trace/sched_event_types.h +++ b/include/trace/sched_event_types.h @@ -22,12 +22,12 @@ TRACE_EVENT(sched_kthread_stop, __field( pid_t, pid ) ), - TP_printk("task %s:%d", __entry->comm, __entry->pid), - TP_fast_assign( memcpy(__entry->comm, t->comm, TASK_COMM_LEN); __entry->pid = t->pid; - ) + ), + + TP_printk("task %s:%d", __entry->comm, __entry->pid) ); /* @@ -43,11 +43,11 @@ TRACE_EVENT(sched_kthread_stop_ret, __field( int, ret ) ), - TP_printk("ret %d", __entry->ret), - TP_fast_assign( __entry->ret = ret; - ) + ), + + TP_printk("ret %d", __entry->ret) ); /* @@ -68,14 +68,14 @@ TRACE_EVENT(sched_wait_task, __field( int, prio ) ), - TP_printk("task %s:%d [%d]", - __entry->comm, __entry->pid, __entry->prio), - TP_fast_assign( memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; __entry->prio = p->prio; - ) + ), + + TP_printk("task %s:%d [%d]", + __entry->comm, __entry->pid, __entry->prio) ); /* @@ -97,16 +97,16 @@ TRACE_EVENT(sched_wakeup, __field( int, success ) ), - TP_printk("task %s:%d [%d] success=%d", - __entry->comm, __entry->pid, __entry->prio, - __entry->success), - TP_fast_assign( memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; __entry->prio = p->prio; __entry->success = success; - ) + ), + + TP_printk("task %s:%d [%d] success=%d", + __entry->comm, __entry->pid, __entry->prio, + __entry->success) ); /* @@ -128,16 +128,16 @@ TRACE_EVENT(sched_wakeup_new, __field( int, success ) ), - TP_printk("task %s:%d [%d] success=%d", - __entry->comm, __entry->pid, __entry->prio, - __entry->success), - TP_fast_assign( memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; __entry->prio = p->prio; __entry->success = success; - ) + ), + + TP_printk("task %s:%d [%d] success=%d", + __entry->comm, __entry->pid, __entry->prio, + __entry->success) ); /* @@ -162,10 +162,6 @@ TRACE_EVENT(sched_switch, __field( int, next_prio ) ), - TP_printk("task %s:%d 
[%d] ==> %s:%d [%d]", - __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, - __entry->next_comm, __entry->next_pid, __entry->next_prio), - TP_fast_assign( memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); __entry->prev_pid = prev->pid; @@ -173,7 +169,11 @@ TRACE_EVENT(sched_switch, memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); __entry->next_pid = next->pid; __entry->next_prio = next->prio; - ) + ), + + TP_printk("task %s:%d [%d] ==> %s:%d [%d]", + __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, + __entry->next_comm, __entry->next_pid, __entry->next_prio) ); /* @@ -193,17 +193,17 @@ TRACE_EVENT(sched_migrate_task, __field( int, dest_cpu ) ), - TP_printk("task %s:%d [%d] from: %d to: %d", - __entry->comm, __entry->pid, __entry->prio, - __entry->orig_cpu, __entry->dest_cpu), - TP_fast_assign( memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; __entry->prio = p->prio; __entry->orig_cpu = orig_cpu; __entry->dest_cpu = dest_cpu; - ) + ), + + TP_printk("task %s:%d [%d] from: %d to: %d", + __entry->comm, __entry->pid, __entry->prio, + __entry->orig_cpu, __entry->dest_cpu) ); /* @@ -221,14 +221,14 @@ TRACE_EVENT(sched_process_free, __field( int, prio ) ), - TP_printk("task %s:%d [%d]", - __entry->comm, __entry->pid, __entry->prio), - TP_fast_assign( memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; __entry->prio = p->prio; - ) + ), + + TP_printk("task %s:%d [%d]", + __entry->comm, __entry->pid, __entry->prio) ); /* @@ -246,14 +246,14 @@ TRACE_EVENT(sched_process_exit, __field( int, prio ) ), - TP_printk("task %s:%d [%d]", - __entry->comm, __entry->pid, __entry->prio), - TP_fast_assign( memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; __entry->prio = p->prio; - ) + ), + + TP_printk("task %s:%d [%d]", + __entry->comm, __entry->pid, __entry->prio) ); /* @@ -271,14 +271,14 @@ TRACE_EVENT(sched_process_wait, __field( int, prio ) ), - TP_printk("task %s:%d [%d]", - __entry->comm, __entry->pid, __entry->prio), - TP_fast_assign( memcpy(__entry->comm, current->comm, TASK_COMM_LEN); __entry->pid = pid_nr(pid); __entry->prio = current->prio; - ) + ), + + TP_printk("task %s:%d [%d]", + __entry->comm, __entry->pid, __entry->prio) ); /* @@ -297,16 +297,16 @@ TRACE_EVENT(sched_process_fork, __field( pid_t, child_pid ) ), - TP_printk("parent %s:%d child %s:%d", - __entry->parent_comm, __entry->parent_pid, - __entry->child_comm, __entry->child_pid), - TP_fast_assign( memcpy(__entry->parent_comm, parent->comm, TASK_COMM_LEN); __entry->parent_pid = parent->pid; memcpy(__entry->child_comm, child->comm, TASK_COMM_LEN); __entry->child_pid = child->pid; - ) + ), + + TP_printk("parent %s:%d child %s:%d", + __entry->parent_comm, __entry->parent_pid, + __entry->child_comm, __entry->child_pid) ); /* @@ -324,14 +324,14 @@ TRACE_EVENT(sched_signal_send, __field( pid_t, pid ) ), - TP_printk("sig: %d task %s:%d", - __entry->sig, __entry->comm, __entry->pid), - TP_fast_assign( memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; __entry->sig = sig; - ) + ), + + TP_printk("sig: %d task %s:%d", + __entry->sig, __entry->comm, __entry->pid) ); #undef TRACE_SYSTEM diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h index 15e9bf965a18..82f68443c556 100644 --- a/kernel/trace/trace_events_stage_1.h +++ b/kernel/trace/trace_events_stage_1.h @@ -27,7 +27,7 @@ #define TP_STRUCT__entry(args...) 
args #undef TRACE_EVENT -#define TRACE_EVENT(name, proto, args, tstruct, print, assign) \ +#define TRACE_EVENT(name, proto, args, tstruct, assign, print) \ struct ftrace_raw_##name { \ struct trace_entry ent; \ tstruct \ diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h index d91bf4c56661..1ad9f8d2fe45 100644 --- a/kernel/trace/trace_events_stage_2.h +++ b/kernel/trace/trace_events_stage_2.h @@ -39,7 +39,7 @@ #define TP_printk(fmt, args...) fmt "\n", args #undef TRACE_EVENT -#define TRACE_EVENT(call, proto, args, tstruct, print, assign) \ +#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ enum print_line_t \ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ { \ @@ -115,7 +115,7 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ #define TP_fast_assign(args...) args #undef TRACE_EVENT -#define TRACE_EVENT(call, proto, args, tstruct, print, func) \ +#define TRACE_EVENT(call, proto, args, tstruct, func, print) \ static int \ ftrace_format_##call(struct trace_seq *s) \ { \ diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index 3ba55d4ab073..d6de06b9201a 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -148,7 +148,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ #define __entry entry #undef TRACE_EVENT -#define TRACE_EVENT(call, proto, args, tstruct, print, assign) \ +#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ \ static struct ftrace_event_call event_##call; \ \ -- cgit v1.2.3 From 823f9124fb2e33eeb624d139978a52089f8a02ae Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 10 Mar 2009 12:58:51 -0400 Subject: tracing: document TRACE_EVENT macro in tracepoint.h Impact: clean up / comments Kosaki Motohiro asked about an explanation to the TRACE_EVENT macro. Ingo Molnar replied with a nice description. This patch takes the description that Ingo wrote (with some slight modifications) and adds it to the tracepoint.h file. Reported-by: KOSAKI Motohiro Signed-off-by: Steven Rostedt --- include/linux/tracepoint.h | 103 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index c7b09452514b..119ece224c21 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -157,6 +157,109 @@ static inline void tracepoint_synchronize_unregister(void) #define TRACE_FORMAT(name, proto, args, fmt) \ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) + +/* + * For use with the TRACE_EVENT macro: + * + * We define a tracepoint, its arguments, its printk format + * and its 'fast binay record' layout. + * + * Firstly, name your tracepoint via TRACE_EVENT(name : the + * 'subsystem_event' notation is fine. + * + * Think about this whole construct as the + * 'trace_sched_switch() function' from now on. + * + * + * TRACE_EVENT(sched_switch, + * + * * + * * A function has a regular function arguments + * * prototype, declare it via TP_PROTO(): + * * + * + * TP_PROTO(struct rq *rq, struct task_struct *prev, + * struct task_struct *next), + * + * * + * * Define the call signature of the 'function'. + * * (Design sidenote: we use this instead of a + * * TP_PROTO1/TP_PROTO2/TP_PROTO3 ugliness.) + * * + * + * TP_ARGS(rq, prev, next), + * + * * + * * Fast binary tracing: define the trace record via + * * TP_STRUCT__entry(). 
You can think about it like a + * * regular C structure local variable definition. + * * + * * This is how the trace record is structured and will + * * be saved into the ring buffer. These are the fields + * * that will be exposed to user-space in + * * /debug/tracing/events/<*>/format. + * * + * * The declared 'local variable' is called '__entry' + * * + * * __field(pid_t, prev_prid) is equivalent to a standard declariton: + * * + * * pid_t prev_pid; + * * + * * __array(char, prev_comm, TASK_COMM_LEN) is equivalent to: + * * + * * char prev_comm[TASK_COMM_LEN]; + * * + * + * TP_STRUCT__entry( + * __array( char, prev_comm, TASK_COMM_LEN ) + * __field( pid_t, prev_pid ) + * __field( int, prev_prio ) + * __array( char, next_comm, TASK_COMM_LEN ) + * __field( pid_t, next_pid ) + * __field( int, next_prio ) + * ), + * + * * + * * Assign the entry into the trace record, by embedding + * * a full C statement block into TP_fast_assign(). You + * * can refer to the trace record as '__entry' - + * * otherwise you can put arbitrary C code in here. + * * + * * Note: this C code will execute every time a trace event + * * happens, on an active tracepoint. + * * + * + * TP_fast_assign( + * memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); + * __entry->prev_pid = prev->pid; + * __entry->prev_prio = prev->prio; + * memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); + * __entry->next_pid = next->pid; + * __entry->next_prio = next->prio; + * ) + * + * * + * * Formatted output of a trace record via TP_printk(). + * * This is how the tracepoint will appear under ftrace + * * plugins that make use of this tracepoint. + * * + * * (raw-binary tracing wont actually perform this step.) + * * + * + * TP_printk("task %s:%d [%d] ==> %s:%d [%d]", + * __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, + * __entry->next_comm, __entry->next_pid, __entry->next_prio), + * + * ); + * + * This macro construct is thus used for the regular printk format + * tracing setup, it is used to construct a function pointer based + * tracepoint callback (this is used by programmatic plugins and + * can also by used by generic instrumentation like SystemTap), and + * it is also used to expose a structured trace record in + * /debug/tracing/events/. + */ + #define TRACE_EVENT(name, proto, args, struct, assign, print) \ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) -- cgit v1.2.3 From ef18012b248b47ec9a12c3a83ca5e99782d39c5d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 10 Mar 2009 14:10:56 -0400 Subject: tracing: remove funky whitespace in the trace code Impact: clean up There existed a lot of 's in the tracing code. This patch removes them. 
Signed-off-by: Steven Rostedt --- include/linux/tracepoint.h | 16 +++--- kernel/trace/blktrace.c | 10 ++-- kernel/trace/trace.c | 2 +- kernel/trace/trace_branch.c | 2 +- kernel/trace/trace_events_stage_3.h | 98 ++++++++++++++++++------------------ kernel/trace/trace_export.c | 2 +- kernel/trace/trace_functions_graph.c | 6 +-- kernel/trace/trace_output.c | 14 +++--- kernel/trace/trace_workqueue.c | 6 +-- 9 files changed, 78 insertions(+), 78 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 119ece224c21..d35a7ee7611f 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -178,8 +178,8 @@ static inline void tracepoint_synchronize_unregister(void) * * prototype, declare it via TP_PROTO(): * * * - * TP_PROTO(struct rq *rq, struct task_struct *prev, - * struct task_struct *next), + * TP_PROTO(struct rq *rq, struct task_struct *prev, + * struct task_struct *next), * * * * * Define the call signature of the 'function'. @@ -187,7 +187,7 @@ static inline void tracepoint_synchronize_unregister(void) * * TP_PROTO1/TP_PROTO2/TP_PROTO3 ugliness.) * * * - * TP_ARGS(rq, prev, next), + * TP_ARGS(rq, prev, next), * * * * * Fast binary tracing: define the trace record via @@ -229,13 +229,13 @@ static inline void tracepoint_synchronize_unregister(void) * * happens, on an active tracepoint. * * * - * TP_fast_assign( - * memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); - * __entry->prev_pid = prev->pid; - * __entry->prev_prio = prev->prio; + * TP_fast_assign( + * memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); + * __entry->prev_pid = prev->pid; + * __entry->prev_prio = prev->prio; * memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); * __entry->next_pid = next->pid; - * __entry->next_prio = next->prio; + * __entry->next_prio = next->prio; * ) * * * diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index e39679a72a3b..bec69d3678c1 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -33,7 +33,7 @@ static struct trace_array *blk_tr; static int __read_mostly blk_tracer_enabled; /* Select an alternative, minimalistic output than the original one */ -#define TRACE_BLK_OPT_CLASSIC 0x1 +#define TRACE_BLK_OPT_CLASSIC 0x1 static struct tracer_opt blk_tracer_opts[] = { /* Default disable the minimalistic output */ @@ -564,7 +564,7 @@ EXPORT_SYMBOL_GPL(blk_trace_startstop); /** * blk_trace_ioctl: - handle the ioctls associated with tracing * @bdev: the block device - * @cmd: the ioctl cmd + * @cmd: the ioctl cmd * @arg: the argument data, if any * **/ @@ -1128,9 +1128,9 @@ static void blk_tracer_reset(struct trace_array *tr) static struct { const char *act[2]; - int (*print)(struct trace_seq *s, const struct trace_entry *ent); + int (*print)(struct trace_seq *s, const struct trace_entry *ent); } what2act[] __read_mostly = { - [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic }, + [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic }, [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic }, [__BLK_TA_FRONTMERGE] = {{ "F", "frontmerge" }, blk_log_generic }, [__BLK_TA_GETRQ] = {{ "G", "getrq" }, blk_log_generic }, @@ -1229,7 +1229,7 @@ static struct tracer blk_tracer __read_mostly = { }; static struct trace_event trace_blk_event = { - .type = TRACE_BLK, + .type = TRACE_BLK, .trace = blk_trace_event_print, .binary = blk_trace_event_print_binary, }; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index cc94f8642485..8c6a902db40a 100644 --- a/kernel/trace/trace.c +++ 
b/kernel/trace/trace.c @@ -799,7 +799,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, entry->preempt_count = pc & 0xff; entry->pid = (tsk) ? tsk->pid : 0; - entry->tgid = (tsk) ? tsk->tgid : 0; + entry->tgid = (tsk) ? tsk->tgid : 0; entry->flags = #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index aaa0755268b9..ad8c22efff41 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -157,7 +157,7 @@ static enum print_line_t trace_branch_print(struct trace_iterator *iter, static struct trace_event trace_branch_event = { - .type = TRACE_BRANCH, + .type = TRACE_BRANCH, .trace = trace_branch_print, }; diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index 6ee1de59f19d..ae2e323df0c7 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -5,23 +5,23 @@ * * static void ftrace_event_(proto) * { - * event_trace_printk(_RET_IP_, ": " ); + * event_trace_printk(_RET_IP_, ": " ); * } * * static int ftrace_reg_event_(void) * { - * int ret; + * int ret; * - * ret = register_trace_(ftrace_event_); - * if (!ret) - * pr_info("event trace: Could not activate trace point " - * "probe to "); - * return ret; + * ret = register_trace_(ftrace_event_); + * if (!ret) + * pr_info("event trace: Could not activate trace point " + * "probe to "); + * return ret; * } * * static void ftrace_unreg_event_(void) * { - * unregister_trace_(ftrace_event_); + * unregister_trace_(ftrace_event_); * } * * For those macros defined with TRACE_FORMAT: @@ -29,9 +29,9 @@ * static struct ftrace_event_call __used * __attribute__((__aligned__(4))) * __attribute__((section("_ftrace_events"))) event_ = { - * .name = "", - * .regfunc = ftrace_reg_event_, - * .unregfunc = ftrace_unreg_event_, + * .name = "", + * .regfunc = ftrace_reg_event_, + * .unregfunc = ftrace_unreg_event_, * } * * @@ -41,66 +41,66 @@ * * static void ftrace_raw_event_(proto) * { - * struct ring_buffer_event *event; - * struct ftrace_raw_ *entry; <-- defined in stage 1 - * unsigned long irq_flags; - * int pc; - * - * local_save_flags(irq_flags); - * pc = preempt_count(); - * - * event = trace_current_buffer_lock_reserve(event_.id, - * sizeof(struct ftrace_raw_), - * irq_flags, pc); - * if (!event) - * return; - * entry = ring_buffer_event_data(event); - * - * ; <-- Here we assign the entries by the __field and + * struct ring_buffer_event *event; + * struct ftrace_raw_ *entry; <-- defined in stage 1 + * unsigned long irq_flags; + * int pc; + * + * local_save_flags(irq_flags); + * pc = preempt_count(); + * + * event = trace_current_buffer_lock_reserve(event_.id, + * sizeof(struct ftrace_raw_), + * irq_flags, pc); + * if (!event) + * return; + * entry = ring_buffer_event_data(event); + * + * ; <-- Here we assign the entries by the __field and * __array macros. 
* - * trace_current_buffer_unlock_commit(event, irq_flags, pc); + * trace_current_buffer_unlock_commit(event, irq_flags, pc); * } * * static int ftrace_raw_reg_event_(void) * { - * int ret; + * int ret; * - * ret = register_trace_(ftrace_raw_event_); - * if (!ret) - * pr_info("event trace: Could not activate trace point " - * "probe to "); - * return ret; + * ret = register_trace_(ftrace_raw_event_); + * if (!ret) + * pr_info("event trace: Could not activate trace point " + * "probe to "); + * return ret; * } * * static void ftrace_unreg_event_(void) * { - * unregister_trace_(ftrace_raw_event_); + * unregister_trace_(ftrace_raw_event_); * } * * static struct trace_event ftrace_event_type_ = { - * .trace = ftrace_raw_output_, <-- stage 2 + * .trace = ftrace_raw_output_, <-- stage 2 * }; * * static int ftrace_raw_init_event_(void) * { - * int id; + * int id; * - * id = register_ftrace_event(&ftrace_event_type_); - * if (!id) - * return -ENODEV; - * event_.id = id; - * return 0; + * id = register_ftrace_event(&ftrace_event_type_); + * if (!id) + * return -ENODEV; + * event_.id = id; + * return 0; * } * * static struct ftrace_event_call __used * __attribute__((__aligned__(4))) * __attribute__((section("_ftrace_events"))) event_ = { - * .name = "", + * .name = "", * .system = "", - * .raw_init = ftrace_raw_init_event_, - * .regfunc = ftrace_reg_event_, - * .unregfunc = ftrace_unreg_event_, + * .raw_init = ftrace_raw_init_event_, + * .regfunc = ftrace_reg_event_, + * .unregfunc = ftrace_unreg_event_, * .show_format = ftrace_format_, * } * @@ -138,7 +138,7 @@ _TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt)) \ static struct ftrace_event_call __used \ __attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_events"))) event_##call = { \ - .name = #call, \ + .name = #call, \ .system = __stringify(TRACE_SYSTEM), \ .regfunc = ftrace_reg_event_##call, \ .unregfunc = ftrace_unreg_event_##call, \ @@ -163,7 +163,7 @@ static void ftrace_raw_event_##call(proto) \ pc = preempt_count(); \ \ event = trace_current_buffer_lock_reserve(event_##call.id, \ - sizeof(struct ftrace_raw_##call), \ + sizeof(struct ftrace_raw_##call), \ irq_flags, pc); \ if (!event) \ return; \ @@ -208,7 +208,7 @@ static int ftrace_raw_init_event_##call(void) \ static struct ftrace_event_call __used \ __attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_events"))) event_##call = { \ - .name = #call, \ + .name = #call, \ .system = __stringify(TRACE_SYSTEM), \ .raw_init = ftrace_raw_init_event_##call, \ .regfunc = ftrace_raw_reg_event_##call, \ diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 23ae78430d58..4d9952d3df50 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -94,7 +94,7 @@ ftrace_format_##call(struct trace_seq *s) \ static struct ftrace_event_call __used \ __attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_events"))) event_##call = { \ - .name = #call, \ + .name = #call, \ .id = proto, \ .system = __stringify(TRACE_SYSTEM), \ .show_format = ftrace_format_##call, \ diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 453ebd3b636e..d1493b853e41 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -841,12 +841,12 @@ static void graph_trace_close(struct trace_iterator *iter) } static struct tracer graph_trace __read_mostly = { - .name = "function_graph", + .name = "function_graph", .open = graph_trace_open, .close = graph_trace_close, .wait_pipe 
= poll_wait_pipe, - .init = graph_trace_init, - .reset = graph_trace_reset, + .init = graph_trace_init, + .reset = graph_trace_reset, .print_line = print_graph_function, .print_header = print_graph_headers, .flags = &tracer_flags, diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index ef8fd661b217..491832af9ba1 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -565,7 +565,7 @@ static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags) } static struct trace_event trace_fn_event = { - .type = TRACE_FN, + .type = TRACE_FN, .trace = trace_fn_trace, .raw = trace_fn_raw, .hex = trace_fn_hex, @@ -696,7 +696,7 @@ static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter, } static struct trace_event trace_ctx_event = { - .type = TRACE_CTX, + .type = TRACE_CTX, .trace = trace_ctx_print, .raw = trace_ctx_raw, .hex = trace_ctx_hex, @@ -704,7 +704,7 @@ static struct trace_event trace_ctx_event = { }; static struct trace_event trace_wake_event = { - .type = TRACE_WAKE, + .type = TRACE_WAKE, .trace = trace_wake_print, .raw = trace_wake_raw, .hex = trace_wake_hex, @@ -759,7 +759,7 @@ static enum print_line_t trace_special_bin(struct trace_iterator *iter, } static struct trace_event trace_special_event = { - .type = TRACE_SPECIAL, + .type = TRACE_SPECIAL, .trace = trace_special_print, .raw = trace_special_print, .hex = trace_special_hex, @@ -796,7 +796,7 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, } static struct trace_event trace_stack_event = { - .type = TRACE_STACK, + .type = TRACE_STACK, .trace = trace_stack_print, .raw = trace_special_print, .hex = trace_special_hex, @@ -825,7 +825,7 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, } static struct trace_event trace_user_stack_event = { - .type = TRACE_USER_STACK, + .type = TRACE_USER_STACK, .trace = trace_user_stack_print, .raw = trace_special_print, .hex = trace_special_hex, @@ -879,7 +879,7 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags) static struct trace_event trace_print_event = { - .type = TRACE_PRINT, + .type = TRACE_PRINT, .trace = trace_print_print, .raw = trace_print_raw, }; diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c index 4664990fe9c5..e542483df623 100644 --- a/kernel/trace/trace_workqueue.c +++ b/kernel/trace/trace_workqueue.c @@ -19,14 +19,14 @@ struct cpu_workqueue_stats { /* Useful to know if we print the cpu headers */ bool first_entry; int cpu; - pid_t pid; + pid_t pid; /* Can be inserted from interrupt or user context, need to be atomic */ - atomic_t inserted; + atomic_t inserted; /* * Don't need to be atomic, works are serialized in a single workqueue thread * on a single CPU. */ - unsigned int executed; + unsigned int executed; }; /* List of workqueue threads on one cpu */ -- cgit v1.2.3 From 9e6e70f8d8b6698e0017c56b86525aabe9c7cd4c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Mar 2009 14:10:24 -0400 Subject: NFSv4: Support NFSv4 optional attributes in the struct nfs_fattr Currently, filling struct nfs_fattr is more or less an all or nothing operation, since NFSv2 and NFSv3 have only mandatory attributes. In NFSv4, some attributes are optional, and so we may simply not be able to fill in those fields. Furthermore, NFSv4 allows you to specify which attributes you are interested in retrieving, thus permitting you to optimise away retrieval of attributes that you know will no change... 
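The recurring pattern in this patch is to test the per-attribute NFS_ATTR_FATTR_*
bits in fattr->valid before applying a field to the inode, instead of assuming the
server returned every attribute. A minimal sketch of that guard style, using flags
that appear in the hunks below:

	if (fattr->valid & NFS_ATTR_FATTR_MTIME)
		inode->i_mtime = fattr->mtime;
	if (fattr->valid & NFS_ATTR_FATTR_SIZE)
		inode->i_size = nfs_size_to_loff_t(fattr->size);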
Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 243 ++++++++++++++++++++++++++++++------------------ fs/nfs/nfs2xdr.c | 2 +- fs/nfs/nfs3xdr.c | 6 +- fs/nfs/nfs4xdr.c | 6 +- include/linux/nfs_xdr.h | 48 ++++++++-- 5 files changed, 202 insertions(+), 103 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 268ce3a46220..b7656bd3706f 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -249,13 +249,10 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) struct inode *inode = ERR_PTR(-ENOENT); unsigned long hash; - if ((fattr->valid & NFS_ATTR_FATTR) == 0) + if ((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0) goto out_no_inode; - - if (!fattr->nlink) { - printk("NFS: Buggy server - nlink == 0!\n"); + if ((fattr->valid & NFS_ATTR_FATTR_TYPE) == 0) goto out_no_inode; - } hash = nfs_fattr_to_ino_t(fattr); @@ -291,7 +288,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) && fattr->size <= NFS_LIMIT_READDIRPLUS) set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); /* Deal with crossing mountpoints */ - if (!nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) { + if ((fattr->valid & NFS_ATTR_FATTR_FSID) + && !nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) { if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) inode->i_op = &nfs_referral_inode_operations; else @@ -304,28 +302,45 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) else init_special_inode(inode, inode->i_mode, fattr->rdev); + memset(&inode->i_atime, 0, sizeof(inode->i_atime)); + memset(&inode->i_mtime, 0, sizeof(inode->i_mtime)); + memset(&inode->i_ctime, 0, sizeof(inode->i_ctime)); + nfsi->change_attr = 0; + inode->i_size = 0; + inode->i_nlink = 0; + inode->i_uid = -2; + inode->i_gid = -2; + inode->i_blocks = 0; + memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); + nfsi->read_cache_jiffies = fattr->time_start; nfsi->attr_gencount = fattr->gencount; - inode->i_atime = fattr->atime; - inode->i_mtime = fattr->mtime; - inode->i_ctime = fattr->ctime; - if (fattr->valid & NFS_ATTR_FATTR_V4) + if (fattr->valid & NFS_ATTR_FATTR_ATIME) + inode->i_atime = fattr->atime; + if (fattr->valid & NFS_ATTR_FATTR_MTIME) + inode->i_mtime = fattr->mtime; + if (fattr->valid & NFS_ATTR_FATTR_CTIME) + inode->i_ctime = fattr->ctime; + if (fattr->valid & NFS_ATTR_FATTR_CHANGE) nfsi->change_attr = fattr->change_attr; - inode->i_size = nfs_size_to_loff_t(fattr->size); - inode->i_nlink = fattr->nlink; - inode->i_uid = fattr->uid; - inode->i_gid = fattr->gid; - if (fattr->valid & (NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4)) { + if (fattr->valid & NFS_ATTR_FATTR_SIZE) + inode->i_size = nfs_size_to_loff_t(fattr->size); + if (fattr->valid & NFS_ATTR_FATTR_NLINK) + inode->i_nlink = fattr->nlink; + if (fattr->valid & NFS_ATTR_FATTR_OWNER) + inode->i_uid = fattr->uid; + if (fattr->valid & NFS_ATTR_FATTR_GROUP) + inode->i_gid = fattr->gid; + if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) + inode->i_blocks = fattr->du.nfs2.blocks; + if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { /* * report the blocks in 512byte units */ inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); - } else { - inode->i_blocks = fattr->du.nfs2.blocks; } nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = now; - memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); nfsi->access_cache = RB_ROOT; unlock_new_inode(inode); @@ -812,25 +827,31 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) { struct nfs_inode *nfsi = 
NFS_I(inode); - if ((fattr->valid & NFS_ATTR_WCC_V4) != 0 && - nfsi->change_attr == fattr->pre_change_attr) { + if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE) + && (fattr->valid & NFS_ATTR_FATTR_CHANGE) + && nfsi->change_attr == fattr->pre_change_attr) { nfsi->change_attr = fattr->change_attr; if (S_ISDIR(inode->i_mode)) nfsi->cache_validity |= NFS_INO_INVALID_DATA; } /* If we have atomic WCC data, we may update some attributes */ - if ((fattr->valid & NFS_ATTR_WCC) != 0) { - if (timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) + if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME) + && (fattr->valid & NFS_ATTR_FATTR_CTIME) + && timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); - if (timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) { + + if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME) + && (fattr->valid & NFS_ATTR_FATTR_MTIME) + && timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) { memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); if (S_ISDIR(inode->i_mode)) nfsi->cache_validity |= NFS_INO_INVALID_DATA; - } - if (i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) && - nfsi->npages == 0) - i_size_write(inode, nfs_size_to_loff_t(fattr->size)); } + if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE) + && (fattr->valid & NFS_ATTR_FATTR_SIZE) + && i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) + && nfsi->npages == 0) + i_size_write(inode, nfs_size_to_loff_t(fattr->size)); } /** @@ -850,35 +871,39 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat /* Has the inode gone and changed behind our back? */ - if (nfsi->fileid != fattr->fileid - || (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) { + if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid) + return -EIO; + if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) return -EIO; - } - if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 && + if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && nfsi->change_attr != fattr->change_attr) invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; /* Verify a few of the more important attributes */ - if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) + if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime)) invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; - cur_size = i_size_read(inode); - new_isize = nfs_size_to_loff_t(fattr->size); - if (cur_size != new_isize && nfsi->npages == 0) - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + if (fattr->valid & NFS_ATTR_FATTR_SIZE) { + cur_size = i_size_read(inode); + new_isize = nfs_size_to_loff_t(fattr->size); + if (cur_size != new_isize && nfsi->npages == 0) + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + } /* Have any file permissions changed? */ - if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO) - || inode->i_uid != fattr->uid - || inode->i_gid != fattr->gid) + if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) + invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; + if ((fattr->valid & NFS_ATTR_FATTR_OWNER) && inode->i_uid != fattr->uid) + invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; + if ((fattr->valid & NFS_ATTR_FATTR_GROUP) && inode->i_gid != fattr->gid) invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; /* Has the link count changed? 
*/ - if (inode->i_nlink != fattr->nlink) + if ((fattr->valid & NFS_ATTR_FATTR_NLINK) && inode->i_nlink != fattr->nlink) invalid |= NFS_INO_INVALID_ATTR; - if (!timespec_equal(&inode->i_atime, &fattr->atime)) + if ((fattr->valid & NFS_ATTR_FATTR_ATIME) && !timespec_equal(&inode->i_atime, &fattr->atime)) invalid |= NFS_INO_INVALID_ATIME; if (invalid != 0) @@ -890,11 +915,15 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat static int nfs_ctime_need_update(const struct inode *inode, const struct nfs_fattr *fattr) { + if (!(fattr->valid & NFS_ATTR_FATTR_CTIME)) + return 0; return timespec_compare(&fattr->ctime, &inode->i_ctime) > 0; } static int nfs_size_need_update(const struct inode *inode, const struct nfs_fattr *fattr) { + if (!(fattr->valid & NFS_ATTR_FATTR_SIZE)) + return 0; return nfs_size_to_loff_t(fattr->size) > i_size_read(inode); } @@ -1030,20 +1059,31 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa /* Don't do a WCC update if these attributes are already stale */ if ((fattr->valid & NFS_ATTR_FATTR) == 0 || !nfs_inode_attrs_need_update(inode, fattr)) { - fattr->valid &= ~(NFS_ATTR_WCC_V4|NFS_ATTR_WCC); + fattr->valid &= ~(NFS_ATTR_FATTR_PRECHANGE + | NFS_ATTR_FATTR_PRESIZE + | NFS_ATTR_FATTR_PREMTIME + | NFS_ATTR_FATTR_PRECTIME); goto out_noforce; } - if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 && - (fattr->valid & NFS_ATTR_WCC_V4) == 0) { + if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && + (fattr->valid & NFS_ATTR_FATTR_PRECHANGE) == 0) { fattr->pre_change_attr = NFS_I(inode)->change_attr; - fattr->valid |= NFS_ATTR_WCC_V4; + fattr->valid |= NFS_ATTR_FATTR_PRECHANGE; } - if ((fattr->valid & NFS_ATTR_FATTR) != 0 && - (fattr->valid & NFS_ATTR_WCC) == 0) { + if ((fattr->valid & NFS_ATTR_FATTR_CTIME) != 0 && + (fattr->valid & NFS_ATTR_FATTR_PRECTIME) == 0) { memcpy(&fattr->pre_ctime, &inode->i_ctime, sizeof(fattr->pre_ctime)); + fattr->valid |= NFS_ATTR_FATTR_PRECTIME; + } + if ((fattr->valid & NFS_ATTR_FATTR_MTIME) != 0 && + (fattr->valid & NFS_ATTR_FATTR_PREMTIME) == 0) { memcpy(&fattr->pre_mtime, &inode->i_mtime, sizeof(fattr->pre_mtime)); + fattr->valid |= NFS_ATTR_FATTR_PREMTIME; + } + if ((fattr->valid & NFS_ATTR_FATTR_SIZE) != 0 && + (fattr->valid & NFS_ATTR_FATTR_PRESIZE) == 0) { fattr->pre_size = i_size_read(inode); - fattr->valid |= NFS_ATTR_WCC; + fattr->valid |= NFS_ATTR_FATTR_PRESIZE; } out_noforce: status = nfs_post_op_update_inode_locked(inode, fattr); @@ -1075,18 +1115,18 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) __func__, inode->i_sb->s_id, inode->i_ino, atomic_read(&inode->i_count), fattr->valid); - if (nfsi->fileid != fattr->fileid) + if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid) goto out_fileid; /* * Make sure the inode's type hasn't changed. */ - if ((inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) + if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) goto out_changed; server = NFS_SERVER(inode); /* Update the fsid? 
*/ - if (S_ISDIR(inode->i_mode) && + if (S_ISDIR(inode->i_mode) && (fattr->valid & NFS_ATTR_FATTR_FSID) && !nfs_fsid_equal(&server->fsid, &fattr->fsid) && !test_bit(NFS_INO_MOUNTPOINT, &nfsi->flags)) server->fsid = fattr->fsid; @@ -1096,14 +1136,27 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) */ nfsi->read_cache_jiffies = fattr->time_start; - nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ATIME - | NFS_INO_REVAL_PAGECACHE); + if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) || (fattr->valid & (NFS_ATTR_FATTR_MTIME|NFS_ATTR_FATTR_CTIME))) + nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ATIME + | NFS_INO_REVAL_PAGECACHE); /* Do atomic weak cache consistency updates */ nfs_wcc_update_inode(inode, fattr); /* More cache consistency checks */ - if (!(fattr->valid & NFS_ATTR_FATTR_V4)) { + if (fattr->valid & NFS_ATTR_FATTR_CHANGE) { + if (nfsi->change_attr != fattr->change_attr) { + dprintk("NFS: change_attr change on server for file %s/%ld\n", + inode->i_sb->s_id, inode->i_ino); + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + if (S_ISDIR(inode->i_mode)) + nfs_force_lookup_revalidate(inode); + nfsi->change_attr = fattr->change_attr; + } + } + + if (fattr->valid & NFS_ATTR_FATTR_MTIME) { /* NFSv2/v3: Check if the mtime agrees */ if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) { dprintk("NFS: mtime change on server for file %s/%ld\n", @@ -1111,7 +1164,10 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; if (S_ISDIR(inode->i_mode)) nfs_force_lookup_revalidate(inode); + memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); } + } + if (fattr->valid & NFS_ATTR_FATTR_CTIME) { /* If ctime has changed we should definitely clear access+acl caches */ if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) { invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; @@ -1122,59 +1178,66 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) invalid |= NFS_INO_INVALID_DATA; nfs_force_lookup_revalidate(inode); } + memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); } - } else if (nfsi->change_attr != fattr->change_attr) { - dprintk("NFS: change_attr change on server for file %s/%ld\n", - inode->i_sb->s_id, inode->i_ino); - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; - if (S_ISDIR(inode->i_mode)) - nfs_force_lookup_revalidate(inode); } /* Check if our cached file size is stale */ - new_isize = nfs_size_to_loff_t(fattr->size); - cur_isize = i_size_read(inode); - if (new_isize != cur_isize) { - /* Do we perhaps have any outstanding writes, or has - * the file grown beyond our last write? */ - if (nfsi->npages == 0 || new_isize > cur_isize) { - i_size_write(inode, new_isize); - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; + if (fattr->valid & NFS_ATTR_FATTR_SIZE) { + new_isize = nfs_size_to_loff_t(fattr->size); + cur_isize = i_size_read(inode); + if (new_isize != cur_isize) { + /* Do we perhaps have any outstanding writes, or has + * the file grown beyond our last write? 
*/ + if (nfsi->npages == 0 || new_isize > cur_isize) { + i_size_write(inode, new_isize); + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; + } + dprintk("NFS: isize change on server for file %s/%ld\n", + inode->i_sb->s_id, inode->i_ino); } - dprintk("NFS: isize change on server for file %s/%ld\n", - inode->i_sb->s_id, inode->i_ino); } - memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); - memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); - memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime)); - nfsi->change_attr = fattr->change_attr; + if (fattr->valid & NFS_ATTR_FATTR_ATIME) + memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime)); - if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO) || - inode->i_uid != fattr->uid || - inode->i_gid != fattr->gid) - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; - - if (inode->i_nlink != fattr->nlink) { - invalid |= NFS_INO_INVALID_ATTR; - if (S_ISDIR(inode->i_mode)) - invalid |= NFS_INO_INVALID_DATA; + if (fattr->valid & NFS_ATTR_FATTR_MODE) { + if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) { + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + inode->i_mode = fattr->mode; + } + } + if (fattr->valid & NFS_ATTR_FATTR_OWNER) { + if (inode->i_uid != fattr->uid) { + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + inode->i_uid = fattr->uid; + } + } + if (fattr->valid & NFS_ATTR_FATTR_GROUP) { + if (inode->i_gid != fattr->gid) { + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + inode->i_gid = fattr->gid; + } } - inode->i_mode = fattr->mode; - inode->i_nlink = fattr->nlink; - inode->i_uid = fattr->uid; - inode->i_gid = fattr->gid; + if (fattr->valid & NFS_ATTR_FATTR_NLINK) { + if (inode->i_nlink != fattr->nlink) { + invalid |= NFS_INO_INVALID_ATTR; + if (S_ISDIR(inode->i_mode)) + invalid |= NFS_INO_INVALID_DATA; + inode->i_nlink = fattr->nlink; + } + } - if (fattr->valid & (NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4)) { + if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { /* * report the blocks in 512byte units */ inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); - } else { - inode->i_blocks = fattr->du.nfs2.blocks; } + if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) + inode->i_blocks = fattr->du.nfs2.blocks; /* Update attrtimeo value if we're out of the unstable period */ if (invalid & NFS_INO_INVALID_ATTR) { diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 28bab67d1519..bea99992c302 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -136,7 +136,7 @@ xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr) p = xdr_decode_time(p, &fattr->atime); p = xdr_decode_time(p, &fattr->mtime); p = xdr_decode_time(p, &fattr->ctime); - fattr->valid |= NFS_ATTR_FATTR; + fattr->valid |= NFS_ATTR_FATTR_V2; fattr->rdev = new_decode_dev(rdev); if (fattr->type == NFCHR && rdev == NFS2_FIFO_DEV) { fattr->type = NFFIFO; diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 6cdeacffde46..c0f7d02aced9 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -177,7 +177,7 @@ xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr) p = xdr_decode_time3(p, &fattr->ctime); /* Update the mode bits */ - fattr->valid |= (NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3); + fattr->valid |= NFS_ATTR_FATTR_V3; return p; } @@ -233,7 +233,9 @@ xdr_decode_wcc_attr(__be32 *p, struct nfs_fattr *fattr) p = xdr_decode_hyper(p, &fattr->pre_size); p = xdr_decode_time3(p, &fattr->pre_mtime); p = xdr_decode_time3(p, 
&fattr->pre_ctime); - fattr->valid |= NFS_ATTR_WCC; + fattr->valid |= NFS_ATTR_FATTR_PRESIZE + | NFS_ATTR_FATTR_PREMTIME + | NFS_ATTR_FATTR_PRECTIME; return p; } diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 5f0ee3e2bd84..7d220da3db36 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -3012,7 +3012,7 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons if ((status = decode_attr_type(xdr, bitmap, &type)) != 0) goto xdr_error; fattr->type = nfs_type2fmt[type].nfs2type; - fmode = nfs_type2fmt[type].mode; + fattr->mode = nfs_type2fmt[type].mode; if ((status = decode_attr_change(xdr, bitmap, &fattr->change_attr)) != 0) goto xdr_error; @@ -3026,7 +3026,7 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons struct nfs4_fs_locations, fattr))) != 0) goto xdr_error; - if ((status = decode_attr_mode(xdr, bitmap, &fattr->mode)) != 0) + if ((status = decode_attr_mode(xdr, bitmap, &fmode)) != 0) goto xdr_error; fattr->mode |= fmode; if ((status = decode_attr_nlink(xdr, bitmap, &fattr->nlink)) != 0) @@ -3050,7 +3050,7 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons if (fattr->fileid == 0 && fileid != 0) fattr->fileid = fileid; if ((status = verify_attr_len(xdr, savep, attrlen)) == 0) - fattr->valid = NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4; + fattr->valid = NFS_ATTR_FATTR_V4; xdr_error: dprintk("%s: xdr returned %d\n", __func__, -status); return status; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 2e5f00066afd..b99295e07cdf 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -27,7 +27,7 @@ static inline int nfs_fsid_equal(const struct nfs_fsid *a, const struct nfs_fsid } struct nfs_fattr { - unsigned short valid; /* which fields are valid */ + unsigned int valid; /* which fields are valid */ __u64 pre_size; /* pre_op_attr.size */ struct timespec pre_mtime; /* pre_op_attr.mtime */ struct timespec pre_ctime; /* pre_op_attr.ctime */ @@ -59,12 +59,46 @@ struct nfs_fattr { unsigned long gencount; }; -#define NFS_ATTR_WCC 0x0001 /* pre-op WCC data */ -#define NFS_ATTR_FATTR 0x0002 /* post-op attributes */ -#define NFS_ATTR_FATTR_V3 0x0004 /* NFSv3 attributes */ -#define NFS_ATTR_FATTR_V4 0x0008 /* NFSv4 change attribute */ -#define NFS_ATTR_WCC_V4 0x0010 /* pre-op change attribute */ -#define NFS_ATTR_FATTR_V4_REFERRAL 0x0020 /* NFSv4 referral */ +#define NFS_ATTR_FATTR_TYPE (1U << 0) +#define NFS_ATTR_FATTR_MODE (1U << 1) +#define NFS_ATTR_FATTR_NLINK (1U << 2) +#define NFS_ATTR_FATTR_OWNER (1U << 3) +#define NFS_ATTR_FATTR_GROUP (1U << 4) +#define NFS_ATTR_FATTR_RDEV (1U << 5) +#define NFS_ATTR_FATTR_SIZE (1U << 6) +#define NFS_ATTR_FATTR_PRESIZE (1U << 7) +#define NFS_ATTR_FATTR_BLOCKS_USED (1U << 8) +#define NFS_ATTR_FATTR_SPACE_USED (1U << 9) +#define NFS_ATTR_FATTR_FSID (1U << 10) +#define NFS_ATTR_FATTR_FILEID (1U << 11) +#define NFS_ATTR_FATTR_ATIME (1U << 12) +#define NFS_ATTR_FATTR_MTIME (1U << 13) +#define NFS_ATTR_FATTR_CTIME (1U << 14) +#define NFS_ATTR_FATTR_PREMTIME (1U << 15) +#define NFS_ATTR_FATTR_PRECTIME (1U << 16) +#define NFS_ATTR_FATTR_CHANGE (1U << 17) +#define NFS_ATTR_FATTR_PRECHANGE (1U << 18) +#define NFS_ATTR_FATTR_V4_REFERRAL (1U << 19) /* NFSv4 referral */ + +#define NFS_ATTR_FATTR (NFS_ATTR_FATTR_TYPE \ + | NFS_ATTR_FATTR_MODE \ + | NFS_ATTR_FATTR_NLINK \ + | NFS_ATTR_FATTR_OWNER \ + | NFS_ATTR_FATTR_GROUP \ + | NFS_ATTR_FATTR_RDEV \ + | NFS_ATTR_FATTR_SIZE \ + | NFS_ATTR_FATTR_FSID \ + | NFS_ATTR_FATTR_FILEID \ 
+ | NFS_ATTR_FATTR_ATIME \ + | NFS_ATTR_FATTR_MTIME \ + | NFS_ATTR_FATTR_CTIME) +#define NFS_ATTR_FATTR_V2 (NFS_ATTR_FATTR \ + | NFS_ATTR_FATTR_BLOCKS_USED) +#define NFS_ATTR_FATTR_V3 (NFS_ATTR_FATTR \ + | NFS_ATTR_FATTR_SPACE_USED) +#define NFS_ATTR_FATTR_V4 (NFS_ATTR_FATTR \ + | NFS_ATTR_FATTR_SPACE_USED \ + | NFS_ATTR_FATTR_CHANGE) /* * Info on the file system -- cgit v1.2.3 From 1ca277d88dafdbc3c5a69d32590e7184b9af6371 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Mar 2009 14:10:25 -0400 Subject: NFS: Shrink the struct nfs_fattr We don't need the bitmap[] field anymore, since the 'valid' field tells us all we need to know about which attributes were filled in... Also move the pre-op attributes in order to improve the structure packing. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4xdr.c | 3 --- include/linux/nfs_xdr.h | 7 +++---- 2 files changed, 3 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 7d220da3db36..9f1df8361974 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -3002,9 +3002,6 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) goto xdr_error; - fattr->bitmap[0] = bitmap[0]; - fattr->bitmap[1] = bitmap[1]; - if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) goto xdr_error; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index b99295e07cdf..6013acb0131f 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -28,9 +28,6 @@ static inline int nfs_fsid_equal(const struct nfs_fsid *a, const struct nfs_fsid struct nfs_fattr { unsigned int valid; /* which fields are valid */ - __u64 pre_size; /* pre_op_attr.size */ - struct timespec pre_mtime; /* pre_op_attr.mtime */ - struct timespec pre_ctime; /* pre_op_attr.ctime */ enum nfs_ftype type; /* always use NFSv2 types */ __u32 mode; __u32 nlink; @@ -52,9 +49,11 @@ struct nfs_fattr { struct timespec atime; struct timespec mtime; struct timespec ctime; - __u32 bitmap[2]; /* NFSv4 returned attribute bitmap */ __u64 change_attr; /* NFSv4 change attribute */ __u64 pre_change_attr;/* pre-op NFSv4 change attribute */ + __u64 pre_size; /* pre_op_attr.size */ + struct timespec pre_mtime; /* pre_op_attr.mtime */ + struct timespec pre_ctime; /* pre_op_attr.ctime */ unsigned long time_start; unsigned long gencount; }; -- cgit v1.2.3 From bca794785c2c12ecddeb09e70165b8ff80baa6ae Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Mar 2009 14:10:26 -0400 Subject: NFS: Fix the type of struct nfs_fattr->mode There is no point in using anything other than umode_t, since we copy the content pretty much directly into inode->i_mode. 
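Because the file type now lives entirely in the S_IFMT bits of fattr->mode, callers can answer type questions with the standard S_IS*() macros, as the getroot.c hunks below do. A standalone illustration of folding a wire file type into a single mode value (made-up enum names and a trimmed table, not the kernel's nfs_type2fmt):

#include <stdio.h>
#include <sys/stat.h>

enum demo_nftype { DEMO_NFNON, DEMO_NFREG, DEMO_NFDIR, DEMO_NFLNK, DEMO_NFMAX };

static const mode_t demo_type2fmt[DEMO_NFMAX] = {
	[DEMO_NFNON] = 0,
	[DEMO_NFREG] = S_IFREG,
	[DEMO_NFDIR] = S_IFDIR,
	[DEMO_NFLNK] = S_IFLNK,
};

int main(void)
{
	unsigned int wire_type = DEMO_NFDIR;	/* type decoded from the reply */
	mode_t perm_bits = 0755;		/* permission bits from the reply */
	mode_t mode = (perm_bits & ~S_IFMT) | demo_type2fmt[wire_type];

	/* the single mode value now answers type questions too */
	printf("S_ISDIR: %d\n", S_ISDIR(mode) != 0);
	return 0;
}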
Signed-off-by: Trond Myklebust --- fs/nfs/getroot.c | 4 ++-- fs/nfs/nfs2xdr.c | 7 +++---- fs/nfs/nfs3xdr.c | 31 +++++++++++++------------------ fs/nfs/nfs4xdr.c | 40 +++++++++++++++++++--------------------- include/linux/nfs_xdr.h | 3 +-- 5 files changed, 38 insertions(+), 47 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index b7c9b2df1f29..46177cb87064 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -156,7 +156,7 @@ int nfs4_path_walk(struct nfs_server *server, return ret; } - if (fattr.type != NFDIR) { + if (!S_ISDIR(fattr.mode)) { printk(KERN_ERR "nfs4_get_root:" " getroot encountered non-directory\n"); return -ENOTDIR; @@ -213,7 +213,7 @@ eat_dot_dir: return ret; } - if (fattr.type != NFDIR) { + if (!S_ISDIR(fattr.mode)) { printk(KERN_ERR "nfs4_get_root:" " lookupfh encountered non-directory\n"); return -ENOTDIR; diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index bea99992c302..c862c9340f9a 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -120,8 +120,8 @@ xdr_decode_time(__be32 *p, struct timespec *timep) static __be32 * xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr) { - u32 rdev; - fattr->type = (enum nfs_ftype) ntohl(*p++); + u32 rdev, type; + type = ntohl(*p++); fattr->mode = ntohl(*p++); fattr->nlink = ntohl(*p++); fattr->uid = ntohl(*p++); @@ -138,8 +138,7 @@ xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr) p = xdr_decode_time(p, &fattr->ctime); fattr->valid |= NFS_ATTR_FATTR_V2; fattr->rdev = new_decode_dev(rdev); - if (fattr->type == NFCHR && rdev == NFS2_FIFO_DEV) { - fattr->type = NFFIFO; + if (type == NFCHR && rdev == NFS2_FIFO_DEV) { fattr->mode = (fattr->mode & ~S_IFMT) | S_IFIFO; fattr->rdev = 0; } diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index c0f7d02aced9..e6a1932c7110 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -91,19 +91,15 @@ /* * Map file type to S_IFMT bits */ -static struct { - unsigned int mode; - unsigned int nfs2type; -} nfs_type2fmt[] = { - { 0, NFNON }, - { S_IFREG, NFREG }, - { S_IFDIR, NFDIR }, - { S_IFBLK, NFBLK }, - { S_IFCHR, NFCHR }, - { S_IFLNK, NFLNK }, - { S_IFSOCK, NFSOCK }, - { S_IFIFO, NFFIFO }, - { 0, NFBAD } +static const umode_t nfs_type2fmt[] = { + [NF3BAD] = 0, + [NF3REG] = S_IFREG, + [NF3DIR] = S_IFDIR, + [NF3BLK] = S_IFBLK, + [NF3CHR] = S_IFCHR, + [NF3LNK] = S_IFLNK, + [NF3SOCK] = S_IFSOCK, + [NF3FIFO] = S_IFIFO, }; /* @@ -148,13 +144,12 @@ static __be32 * xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr) { unsigned int type, major, minor; - int fmode; + umode_t fmode; type = ntohl(*p++); - if (type >= NF3BAD) - type = NF3BAD; - fmode = nfs_type2fmt[type].mode; - fattr->type = nfs_type2fmt[type].nfs2type; + if (type > NF3FIFO) + type = NF3NON; + fmode = nfs_type2fmt[type]; fattr->mode = (ntohl(*p++) & ~S_IFMT) | fmode; fattr->nlink = ntohl(*p++); fattr->uid = ntohl(*p++); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 9f1df8361974..c1906d2a226b 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -522,20 +522,17 @@ static int nfs4_stat_to_errno(int); decode_lookup_maxsz + \ decode_fs_locations_maxsz) -static struct { - unsigned int mode; - unsigned int nfs2type; -} nfs_type2fmt[] = { - { 0, NFNON }, - { S_IFREG, NFREG }, - { S_IFDIR, NFDIR }, - { S_IFBLK, NFBLK }, - { S_IFCHR, NFCHR }, - { S_IFLNK, NFLNK }, - { S_IFSOCK, NFSOCK }, - { S_IFIFO, NFFIFO }, - { 0, NFNON }, - { 0, NFNON }, +static const umode_t nfs_type2fmt[] = { + [NF4BAD] = 0, + [NF4REG] = S_IFREG, + [NF4DIR] = S_IFDIR, + [NF4BLK] = S_IFBLK, + [NF4CHR] = S_IFCHR, + 
[NF4LNK] = S_IFLNK, + [NF4SOCK] = S_IFSOCK, + [NF4FIFO] = S_IFIFO, + [NF4ATTRDIR] = 0, + [NF4NAMEDATTR] = 0, }; struct compound_hdr { @@ -2173,7 +2170,7 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t * } bitmap[0] &= ~FATTR4_WORD0_TYPE; } - dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type].nfs2type); + dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type]); return 0; } @@ -2580,8 +2577,9 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32 return status; } -static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *mode) +static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *mode) { + uint32_t tmp; __be32 *p; *mode = 0; @@ -2589,8 +2587,8 @@ static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t * return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_MODE)) { READ_BUF(4); - READ32(*mode); - *mode &= ~S_IFMT; + READ32(tmp); + *mode = tmp & ~S_IFMT; bitmap[1] &= ~FATTR4_WORD1_MODE; } dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode); @@ -2994,7 +2992,8 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons uint32_t attrlen, bitmap[2] = {0}, type; - int status, fmode = 0; + int status; + umode_t fmode = 0; uint64_t fileid; if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) @@ -3008,8 +3007,7 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons if ((status = decode_attr_type(xdr, bitmap, &type)) != 0) goto xdr_error; - fattr->type = nfs_type2fmt[type].nfs2type; - fattr->mode = nfs_type2fmt[type].mode; + fattr->mode = nfs_type2fmt[type]; if ((status = decode_attr_change(xdr, bitmap, &fattr->change_attr)) != 0) goto xdr_error; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 6013acb0131f..0691b9c188d9 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -28,8 +28,7 @@ static inline int nfs_fsid_equal(const struct nfs_fsid *a, const struct nfs_fsid struct nfs_fattr { unsigned int valid; /* which fields are valid */ - enum nfs_ftype type; /* always use NFSv2 types */ - __u32 mode; + umode_t mode; __u32 nlink; __u32 uid; __u32 gid; -- cgit v1.2.3 From a65318bf3afc93ce49227e849d213799b072c5fd Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Mar 2009 14:10:28 -0400 Subject: NFSv4: Simplify some cache consistency post-op GETATTRs Certain asynchronous operations such as write() do not expect (or care) that other metadata such as the file owner, mode, acls, ... change. All they want to do is update and/or check the change attribute, ctime, and mtime. By skipping the file owner and group update, we also avoid having to do a potential idmapper upcall for these asynchronous RPC calls. 
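The mechanism is a second, reduced attribute bitmask: the server-supported set ANDed down to just the change attribute, size, ctime and mtime, which the asynchronous paths then pass instead of the full attr_bitmask. A tiny standalone sketch of that masking step (placeholder bit positions, not the real FATTR4_WORD* constants):

#include <stdio.h>
#include <stdint.h>

/* placeholder bit positions for illustration only */
#define DEMO_WORD0_CHANGE        (1U << 3)
#define DEMO_WORD0_SIZE          (1U << 4)
#define DEMO_WORD1_TIME_METADATA (1U << 7)
#define DEMO_WORD1_TIME_MODIFY   (1U << 8)

int main(void)
{
	uint32_t supported[2] = { 0xffffffffU, 0xffffffffU };	/* from server caps */
	uint32_t cc[2];

	/* keep only the attributes needed for cache consistency checks */
	cc[0] = supported[0] & (DEMO_WORD0_CHANGE | DEMO_WORD0_SIZE);
	cc[1] = supported[1] & (DEMO_WORD1_TIME_METADATA | DEMO_WORD1_TIME_MODIFY);

	printf("cache-consistency mask: %#x %#x\n", cc[0], cc[1]);
	return 0;
}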
Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 13 ++++++++----- include/linux/nfs_fs_sb.h | 5 +++++ 2 files changed, 13 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index aa433d077945..101f5f4c304f 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1439,7 +1439,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait) if (calldata->arg.seqid == NULL) goto out_free_calldata; calldata->arg.fmode = 0; - calldata->arg.bitmask = server->attr_bitmask; + calldata->arg.bitmask = server->cache_consistency_bitmask; calldata->res.fattr = &calldata->fattr; calldata->res.seqid = calldata->arg.seqid; calldata->res.server = server; @@ -1600,6 +1600,9 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f server->caps |= NFS_CAP_HARDLINKS; if (res.has_symlinks != 0) server->caps |= NFS_CAP_SYMLINKS; + memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask)); + server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE; + server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; server->acl_bitmask = res.acl_bitmask; } return status; @@ -2079,7 +2082,7 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir) struct nfs_removeargs *args = msg->rpc_argp; struct nfs_removeres *res = msg->rpc_resp; - args->bitmask = server->attr_bitmask; + args->bitmask = server->cache_consistency_bitmask; res->server = server; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE]; } @@ -2323,7 +2326,7 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, .pages = &page, .pgbase = 0, .count = count, - .bitmask = NFS_SERVER(dentry->d_inode)->attr_bitmask, + .bitmask = NFS_SERVER(dentry->d_inode)->cache_consistency_bitmask, }; struct nfs4_readdir_res res; struct rpc_message msg = { @@ -2552,7 +2555,7 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag { struct nfs_server *server = NFS_SERVER(data->inode); - data->args.bitmask = server->attr_bitmask; + data->args.bitmask = server->cache_consistency_bitmask; data->res.server = server; data->timestamp = jiffies; @@ -2575,7 +2578,7 @@ static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_messa { struct nfs_server *server = NFS_SERVER(data->inode); - data->args.bitmask = server->attr_bitmask; + data->args.bitmask = server->cache_consistency_bitmask; data->res.server = server; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT]; } diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 9bb81aec91cf..29b1e40dce99 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -106,6 +106,11 @@ struct nfs_server { u32 attr_bitmask[2];/* V4 bitmask representing the set of attributes supported on this filesystem */ + u32 cache_consistency_bitmask[2]; + /* V4 bitmask representing the subset + of change attribute, size, ctime + and mtime attributes supported by + the server */ u32 acl_bitmask; /* V4 bitmask representing the ACEs that are supported on this filesystem */ -- cgit v1.2.3 From fb8a1f11b64e213d94dfa1cebb2a42a7b8c115c4 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Mar 2009 14:10:29 -0400 Subject: NFS: cleanup - remove struct nfs_inode->ncommit Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 1 - fs/nfs/write.c | 25 ++++++++++++++++--------- include/linux/nfs_fs.h | 3 +-- 3 files changed, 17 insertions(+), 
12 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index b7656bd3706f..00f116cdadc6 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1345,7 +1345,6 @@ static void init_once(void *foo) INIT_LIST_HEAD(&nfsi->access_cache_entry_lru); INIT_LIST_HEAD(&nfsi->access_cache_inode_lru); INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC); - nfsi->ncommit = 0; nfsi->npages = 0; atomic_set(&nfsi->silly_count, 1); INIT_HLIST_HEAD(&nfsi->silly_list); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 9f9845859fc1..1a999939fedf 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -404,7 +404,6 @@ nfs_mark_request_commit(struct nfs_page *req) struct nfs_inode *nfsi = NFS_I(inode); spin_lock(&inode->i_lock); - nfsi->ncommit++; set_bit(PG_CLEAN, &(req)->wb_flags); radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, @@ -523,6 +522,12 @@ static void nfs_cancel_commit_list(struct list_head *head) } } +static int +nfs_need_commit(struct nfs_inode *nfsi) +{ + return radix_tree_tagged(&nfsi->nfs_page_tree, NFS_PAGE_TAG_COMMIT); +} + #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) /* * nfs_scan_commit - Scan an inode for commit requests @@ -538,16 +543,18 @@ static int nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) { struct nfs_inode *nfsi = NFS_I(inode); - int res = 0; - if (nfsi->ncommit != 0) { - res = nfs_scan_list(nfsi, dst, idx_start, npages, - NFS_PAGE_TAG_COMMIT); - nfsi->ncommit -= res; - } - return res; + if (!nfs_need_commit(nfsi)) + return 0; + + return nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT); } #else +static inline int nfs_need_commit(struct nfs_inode *nfsi) +{ + return 0; +} + static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) { return 0; @@ -820,7 +827,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req, data->args.stable = NFS_UNSTABLE; if (how & FLUSH_STABLE) { data->args.stable = NFS_DATA_SYNC; - if (!NFS_I(inode)->ncommit) + if (!nfs_need_commit(NFS_I(inode))) data->args.stable = NFS_FILE_SYNC; } diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index db867b04ac3c..c9fecd3e8f0f 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -166,8 +166,7 @@ struct nfs_inode { */ struct radix_tree_root nfs_page_tree; - unsigned long ncommit, - npages; + unsigned long npages; /* Open contexts for shared mmap writes */ struct list_head open_files; -- cgit v1.2.3 From 72cb77f4a5ace37b12dcb47a0e8637a2c28ad881 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Mar 2009 14:10:30 -0400 Subject: NFS: Throttle page dirtying while we're flushing to disk The following patch is a combination of a patch by myself and Peter Staubach. Trond: If we allow other processes to dirty pages while a process is doing a consistency sync to disk, we can end up never making progress. Peter: Attached is a patch which addresses a continuing problem with the NFS client generating out of order WRITE requests. While this is compliant with all of the current protocol specifications, there are servers in the market which can not handle out of order WRITE requests very well. Also, this may lead to sub-optimal block allocations in the underlying file system on the server. This may cause the read throughputs to be reduced when reading the file from the server. Peter: There has been a lot of work recently done to address out of order issues on a systemic level. 
However, the NFS client is still susceptible to the problem. Out of order WRITE requests can occur when pdflush is in the middle of writing out pages while the process dirtying the pages calls generic_file_buffered_write which calls generic_perform_write which calls balance_dirty_pages_rate_limited which ends up calling writeback_inodes which ends up calling back into the NFS client to writes out dirty pages for the same file that pdflush happens to be working with. Signed-off-by: Peter Staubach [modification by Trond to merge the two similar patches] Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 9 +++++++++ fs/nfs/inode.c | 12 ++++++++++++ fs/nfs/internal.h | 1 + fs/nfs/nfs4proc.c | 10 +--------- fs/nfs/pagelist.c | 11 ----------- fs/nfs/write.c | 28 +++++++++++++++++++--------- include/linux/nfs_fs.h | 1 + 7 files changed, 43 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 90f292b520d2..404c19c866a7 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -354,6 +354,15 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping, file->f_path.dentry->d_name.name, mapping->host->i_ino, len, (long long) pos); + /* + * Prevent starvation issues if someone is doing a consistency + * sync-to-disk + */ + ret = wait_on_bit(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING, + nfs_wait_bit_killable, TASK_KILLABLE); + if (ret) + return ret; + page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 00f116cdadc6..c40adc5dd609 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -65,6 +65,18 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr) return nfs_fileid_to_ino_t(fattr->fileid); } +/** + * nfs_wait_bit_killable - helper for functions that are sleeping on bit locks + * @word: long word containing the bit lock + */ +int nfs_wait_bit_killable(void *word) +{ + if (fatal_signal_pending(current)) + return -ERESTARTSYS; + schedule(); + return 0; +} + /** * nfs_compat_user_ino64 - returns the user-visible inode number * @fileid: 64-bit fileid diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 340ede8f608f..a55e69aa52e5 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -165,6 +165,7 @@ extern void nfs_clear_inode(struct inode *); extern void nfs4_clear_inode(struct inode *); #endif void nfs_zap_acl_cache(struct inode *inode); +extern int nfs_wait_bit_killable(void *word); /* super.c */ void nfs_parse_ip_address(char *, size_t, struct sockaddr *, size_t *); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 101f5f4c304f..95f171e7e05a 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -193,14 +193,6 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent kunmap_atomic(start, KM_USER0); } -static int nfs4_wait_bit_killable(void *word) -{ - if (fatal_signal_pending(current)) - return -ERESTARTSYS; - schedule(); - return 0; -} - static int nfs4_wait_clnt_recover(struct nfs_client *clp) { int res; @@ -208,7 +200,7 @@ static int nfs4_wait_clnt_recover(struct nfs_client *clp) might_sleep(); res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING, - nfs4_wait_bit_killable, TASK_KILLABLE); + nfs_wait_bit_killable, TASK_KILLABLE); return res; } diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 7f079209d70a..e2975939126a 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -176,17 +176,6 @@ void nfs_release_request(struct nfs_page *req) kref_put(&req->wb_kref, nfs_free_request); } -static int 
nfs_wait_bit_killable(void *word) -{ - int ret = 0; - - if (fatal_signal_pending(current)) - ret = -ERESTARTSYS; - else - schedule(); - return ret; -} - /** * nfs_wait_on_request - Wait for a request to complete. * @req: request to wait upon. diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 1a999939fedf..36fd35e0de83 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -313,19 +313,34 @@ static int nfs_writepages_callback(struct page *page, struct writeback_control * int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct inode *inode = mapping->host; + unsigned long *bitlock = &NFS_I(inode)->flags; struct nfs_pageio_descriptor pgio; int err; + /* Stop dirtying of new pages while we sync */ + err = wait_on_bit_lock(bitlock, NFS_INO_FLUSHING, + nfs_wait_bit_killable, TASK_KILLABLE); + if (err) + goto out_err; + nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); nfs_pageio_init_write(&pgio, inode, wb_priority(wbc)); err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio); nfs_pageio_complete(&pgio); + + clear_bit_unlock(NFS_INO_FLUSHING, bitlock); + smp_mb__after_clear_bit(); + wake_up_bit(bitlock, NFS_INO_FLUSHING); + if (err < 0) - return err; - if (pgio.pg_error < 0) - return pgio.pg_error; + goto out_err; + err = pgio.pg_error; + if (err < 0) + goto out_err; return 0; +out_err: + return err; } /* @@ -1432,18 +1447,13 @@ static int nfs_write_mapping(struct address_space *mapping, int how) { struct writeback_control wbc = { .bdi = mapping->backing_dev_info, - .sync_mode = WB_SYNC_NONE, + .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, .range_start = 0, .range_end = LLONG_MAX, .for_writepages = 1, }; - int ret; - ret = __nfs_write_mapping(mapping, &wbc, how); - if (ret < 0) - return ret; - wbc.sync_mode = WB_SYNC_ALL; return __nfs_write_mapping(mapping, &wbc, how); } diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index c9fecd3e8f0f..933bc261c0df 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -206,6 +206,7 @@ struct nfs_inode { #define NFS_INO_STALE (1) /* possible stale inode */ #define NFS_INO_ACL_LRU_SET (2) /* Inode is on the LRU list */ #define NFS_INO_MOUNTPOINT (3) /* inode is remote mountpoint */ +#define NFS_INO_FLUSHING (4) /* inode is flushing out data */ static inline struct nfs_inode *NFS_I(const struct inode *inode) { -- cgit v1.2.3 From 441e3e242903f9b190d5764bed73edb58f977413 Mon Sep 17 00:00:00 2001 From: Tom Talpey Date: Wed, 11 Mar 2009 14:37:56 -0400 Subject: SUNRPC: dynamically load RPC transport modules on-demand Provide an api to attempt to load any necessary kernel RPC client transport module automatically. By convention, the desired module name is "xprt"+"transport name". For example, when NFS mounting with "-o proto=rdma", attempt to load the "xprtrdma" module. 
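The convention amounts to prefixing the transport name with "xprt" and handing the result to request_module(). A standalone sketch of just the name construction (userspace stand-in for illustration, not the kernel function added below):

#include <stdio.h>

int main(void)
{
	const char *transport_name = "rdma";	/* e.g. from "-o proto=rdma" */
	char module_name[32];

	/* "xprt" + transport name -> "xprtrdma" */
	snprintf(module_name, sizeof(module_name), "xprt%s", transport_name);
	printf("would request module: %s\n", module_name);
	return 0;
}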
Signed-off-by: Tom Talpey Cc: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 1 + net/sunrpc/xprt.c | 31 +++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 11fc71d50c1e..2b0d960603b9 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -235,6 +235,7 @@ static inline __be32 *xprt_skip_transport_header(struct rpc_xprt *xprt, __be32 * */ int xprt_register_transport(struct xprt_class *type); int xprt_unregister_transport(struct xprt_class *type); +int xprt_load_transport(const char *); void xprt_set_retrans_timeout_def(struct rpc_task *task); void xprt_set_retrans_timeout_rtt(struct rpc_task *task); void xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status); diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 62098d101a1f..d1afec640394 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -151,6 +151,37 @@ out: } EXPORT_SYMBOL_GPL(xprt_unregister_transport); +/** + * xprt_load_transport - load a transport implementation + * @transport_name: transport to load + * + * Returns: + * 0: transport successfully loaded + * -ENOENT: transport module not available + */ +int xprt_load_transport(const char *transport_name) +{ + struct xprt_class *t; + char module_name[sizeof t->name + 5]; + int result; + + result = 0; + spin_lock(&xprt_list_lock); + list_for_each_entry(t, &xprt_list, list) { + if (strcmp(t->name, transport_name) == 0) { + spin_unlock(&xprt_list_lock); + goto out; + } + } + spin_unlock(&xprt_list_lock); + strcpy(module_name, "xprt"); + strncat(module_name, transport_name, sizeof t->name); + result = request_module(module_name); +out: + return result; +} +EXPORT_SYMBOL_GPL(xprt_load_transport); + /** * xprt_reserve_xprt - serialize write access to transports * @task: task that is requesting access to the transport -- cgit v1.2.3 From d820ac4c2fa881079e6b689d2098adce337558ae Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 13 Mar 2009 01:30:40 +0100 Subject: locking: rename trace_softirq_[enter|exit] => lockdep_softirq_[enter|exit] Impact: cleanup The naming clashes with upcoming softirq tracepoints, so rename the APIs to lockdep_*(). 
Requested-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/irqflags.h | 8 ++++---- kernel/softirq.c | 4 ++-- lib/locking-selftest.c | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index 74bde13224c9..b02a3f1d46a0 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -24,8 +24,8 @@ # define trace_softirqs_enabled(p) ((p)->softirqs_enabled) # define trace_hardirq_enter() do { current->hardirq_context++; } while (0) # define trace_hardirq_exit() do { current->hardirq_context--; } while (0) -# define trace_softirq_enter() do { current->softirq_context++; } while (0) -# define trace_softirq_exit() do { current->softirq_context--; } while (0) +# define lockdep_softirq_enter() do { current->softirq_context++; } while (0) +# define lockdep_softirq_exit() do { current->softirq_context--; } while (0) # define INIT_TRACE_IRQFLAGS .softirqs_enabled = 1, #else # define trace_hardirqs_on() do { } while (0) @@ -38,8 +38,8 @@ # define trace_softirqs_enabled(p) 0 # define trace_hardirq_enter() do { } while (0) # define trace_hardirq_exit() do { } while (0) -# define trace_softirq_enter() do { } while (0) -# define trace_softirq_exit() do { } while (0) +# define lockdep_softirq_enter() do { } while (0) +# define lockdep_softirq_exit() do { } while (0) # define INIT_TRACE_IRQFLAGS #endif diff --git a/kernel/softirq.c b/kernel/softirq.c index 9041ea7948fe..08a030f85416 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -180,7 +180,7 @@ asmlinkage void __do_softirq(void) account_system_vtime(current); __local_bh_disable((unsigned long)__builtin_return_address(0)); - trace_softirq_enter(); + lockdep_softirq_enter(); cpu = smp_processor_id(); restart: @@ -220,7 +220,7 @@ restart: if (pending) wakeup_softirqd(); - trace_softirq_exit(); + lockdep_softirq_exit(); account_system_vtime(current); _local_bh_enable(); diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c index 280332c1827c..619313ed6c46 100644 --- a/lib/locking-selftest.c +++ b/lib/locking-selftest.c @@ -157,11 +157,11 @@ static void init_shared_classes(void) #define SOFTIRQ_ENTER() \ local_bh_disable(); \ local_irq_disable(); \ - trace_softirq_enter(); \ + lockdep_softirq_enter(); \ WARN_ON(!in_softirq()); #define SOFTIRQ_EXIT() \ - trace_softirq_exit(); \ + lockdep_softirq_exit(); \ local_irq_enable(); \ local_bh_enable(); -- cgit v1.2.3 From 48ead02030f849d011259244bb4ea9b985479006 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 12 Mar 2009 18:24:49 +0100 Subject: tracing/core: bring back raw trace_printk for dynamic formats strings Impact: fix callsites with dynamic format strings Since its new binary implementation, trace_printk() internally uses static containers for the format strings on each callsites. But the value is assigned once at build time, which means that it can't take dynamic formats. So this patch unearthes the raw trace_printk implementation for the callers that will need trace_printk to be able to carry these dynamic format strings. The trace_printk() macro will use the appropriate implementation for each callsite. Most of the time however, the binary implementation will still be used. The other impact of this patch is that mmiotrace_printk() will use the old implementation because it calls the low level trace_vprintk and we can't guess here whether the format passed in it is dynamic or not. 
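The key trick visible in the kernel.h hunk below is deciding at compile time, with __builtin_constant_p(), whether the format is a literal (so its pointer can be placed in the __trace_printk_fmt section and the binary path used) or a dynamic string (which must take the raw path). A standalone GCC/Clang sketch of that dispatch, with invented helper names; build with optimization (e.g. -O2), as the kernel is:

#include <stdio.h>

static void demo_binary_path(const char *fmt)  { printf("binary path: %s\n", fmt); }
static void demo_raw_path(const char *fmt)     { printf("raw path:    %s\n", fmt); }

#define demo_printk(fmt)				\
do {							\
	if (__builtin_constant_p(fmt))			\
		demo_binary_path(fmt);			\
	else						\
		demo_raw_path(fmt);			\
} while (0)

int main(void)
{
	char dyn[32];

	snprintf(dyn, sizeof(dyn), "built at runtime %d", 42);
	demo_printk("a string literal");	/* constant format: binary path */
	demo_printk(dyn);			/* not a constant: raw path */
	return 0;
}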
Some parts of this patch have been written by Steven Rostedt (most notably the part that chooses the appropriate implementation for each callsites). Signed-off-by: Frederic Weisbecker Signed-off-by: Steven Rostedt --- include/linux/kernel.h | 40 +++++++++++------ kernel/trace/trace.c | 85 +++++++++++++++++++++++++++++++++--- kernel/trace/trace.h | 13 +++++- kernel/trace/trace_event_types.h | 11 ++++- kernel/trace/trace_functions_graph.c | 6 +-- kernel/trace/trace_mmiotrace.c | 7 +-- kernel/trace/trace_output.c | 57 +++++++++++++++++++++--- kernel/trace/trace_printk.c | 33 +++++++++++--- 8 files changed, 213 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 7742798c9208..1daca3b062bb 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -452,31 +452,45 @@ do { \ #define trace_printk(fmt, args...) \ do { \ - static const char *trace_printk_fmt \ - __attribute__((section("__trace_printk_fmt"))); \ - \ - if (!trace_printk_fmt) \ - trace_printk_fmt = fmt; \ - \ __trace_printk_check_format(fmt, ##args); \ - __trace_printk(_THIS_IP_, trace_printk_fmt, ##args); \ + if (__builtin_constant_p(fmt)) { \ + static const char *trace_printk_fmt \ + __attribute__((section("__trace_printk_fmt"))) = \ + __builtin_constant_p(fmt) ? fmt : NULL; \ + \ + __trace_bprintk(_THIS_IP_, trace_printk_fmt, ##args); \ + } else \ + __trace_printk(_THIS_IP_, fmt, ##args); \ } while (0) +extern int +__trace_bprintk(unsigned long ip, const char *fmt, ...) + __attribute__ ((format (printf, 2, 3))); + extern int __trace_printk(unsigned long ip, const char *fmt, ...) __attribute__ ((format (printf, 2, 3))); +/* + * The double __builtin_constant_p is because gcc will give us an error + * if we try to allocate the static variable to fmt if it is not a + * constant. Even with the outer if statement. + */ #define ftrace_vprintk(fmt, vargs) \ do { \ - static const char *trace_printk_fmt \ - __attribute__((section("__trace_printk_fmt"))); \ - \ - if (!trace_printk_fmt) \ - trace_printk_fmt = fmt; \ + if (__builtin_constant_p(fmt)) { \ + static const char *trace_printk_fmt \ + __attribute__((section("__trace_printk_fmt"))) = \ + __builtin_constant_p(fmt) ? 
fmt : NULL; \ \ - __ftrace_vprintk(_THIS_IP_, trace_printk_fmt, vargs); \ + __ftrace_vbprintk(_THIS_IP_, trace_printk_fmt, vargs); \ + } else \ + __ftrace_vprintk(_THIS_IP_, fmt, vargs); \ } while (0) +extern int +__ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap); + extern int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 62a63b2b33dd..dbb077d8a172 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1179,10 +1179,10 @@ void trace_graph_return(struct ftrace_graph_ret *trace) /** - * trace_vprintk - write binary msg to tracing buffer + * trace_vbprintk - write binary msg to tracing buffer * */ -int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) +int trace_vbprintk(unsigned long ip, int depth, const char *fmt, va_list args) { static raw_spinlock_t trace_buf_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; @@ -1191,7 +1191,7 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) struct ring_buffer_event *event; struct trace_array *tr = &global_trace; struct trace_array_cpu *data; - struct print_entry *entry; + struct bprint_entry *entry; unsigned long flags; int resched; int cpu, len = 0, size, pc; @@ -1219,7 +1219,7 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) goto out_unlock; size = sizeof(*entry) + sizeof(u32) * len; - event = trace_buffer_lock_reserve(tr, TRACE_PRINT, size, flags, pc); + event = trace_buffer_lock_reserve(tr, TRACE_BPRINT, size, flags, pc); if (!event) goto out_unlock; entry = ring_buffer_event_data(event); @@ -1240,6 +1240,60 @@ out: return len; } +EXPORT_SYMBOL_GPL(trace_vbprintk); + +int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) +{ + static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED; + static char trace_buf[TRACE_BUF_SIZE]; + + struct ring_buffer_event *event; + struct trace_array *tr = &global_trace; + struct trace_array_cpu *data; + int cpu, len = 0, size, pc; + struct print_entry *entry; + unsigned long irq_flags; + + if (tracing_disabled || tracing_selftest_running) + return 0; + + pc = preempt_count(); + preempt_disable_notrace(); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + + if (unlikely(atomic_read(&data->disabled))) + goto out; + + pause_graph_tracing(); + raw_local_irq_save(irq_flags); + __raw_spin_lock(&trace_buf_lock); + len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); + + len = min(len, TRACE_BUF_SIZE-1); + trace_buf[len] = 0; + + size = sizeof(*entry) + len + 1; + event = trace_buffer_lock_reserve(tr, TRACE_PRINT, size, irq_flags, pc); + if (!event) + goto out_unlock; + entry = ring_buffer_event_data(event); + entry->ip = ip; + entry->depth = depth; + + memcpy(&entry->buf, trace_buf, len); + entry->buf[len] = 0; + ring_buffer_unlock_commit(tr->buffer, event); + + out_unlock: + __raw_spin_unlock(&trace_buf_lock); + raw_local_irq_restore(irq_flags); + unpause_graph_tracing(); + out: + preempt_enable_notrace(); + + return len; +} EXPORT_SYMBOL_GPL(trace_vprintk); enum trace_file_type { @@ -1628,6 +1682,22 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) return TRACE_TYPE_HANDLED; } +static enum print_line_t print_bprintk_msg_only(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry = iter->ent; + struct bprint_entry *field; + int ret; + + trace_assign_type(field, entry); + + ret = trace_seq_bprintf(s, field->fmt, field->buf); + if (!ret) 
+ return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + static enum print_line_t print_printk_msg_only(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; @@ -1637,7 +1707,7 @@ static enum print_line_t print_printk_msg_only(struct trace_iterator *iter) trace_assign_type(field, entry); - ret = trace_seq_bprintf(s, field->fmt, field->buf); + ret = trace_seq_printf(s, "%s", field->buf); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -1702,6 +1772,11 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter) return ret; } + if (iter->ent->type == TRACE_BPRINT && + trace_flags & TRACE_ITER_PRINTK && + trace_flags & TRACE_ITER_PRINTK_MSGONLY) + return print_bprintk_msg_only(iter); + if (iter->ent->type == TRACE_PRINT && trace_flags & TRACE_ITER_PRINTK && trace_flags & TRACE_ITER_PRINTK_MSGONLY) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 336324d717f8..cede1ab49d07 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -20,6 +20,7 @@ enum trace_type { TRACE_WAKE, TRACE_STACK, TRACE_PRINT, + TRACE_BPRINT, TRACE_SPECIAL, TRACE_MMIO_RW, TRACE_MMIO_MAP, @@ -117,7 +118,7 @@ struct userstack_entry { /* * trace_printk entry: */ -struct print_entry { +struct bprint_entry { struct trace_entry ent; unsigned long ip; int depth; @@ -125,6 +126,13 @@ struct print_entry { u32 buf[]; }; +struct print_entry { + struct trace_entry ent; + unsigned long ip; + int depth; + char buf[]; +}; + #define TRACE_OLD_SIZE 88 struct trace_field_cont { @@ -286,6 +294,7 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \ IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ + IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ IF_ASSIGN(var, ent, struct special_entry, 0); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ TRACE_MMIO_RW); \ @@ -570,6 +579,8 @@ extern int trace_selftest_startup_branch(struct tracer *trace, extern void *head_page(struct trace_array_cpu *data); extern long ns2usecs(cycle_t nsec); extern int +trace_vbprintk(unsigned long ip, int depth, const char *fmt, va_list args); +extern int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args); extern unsigned long trace_flags; diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h index 5cca4c978bde..d0907d746425 100644 --- a/kernel/trace/trace_event_types.h +++ b/kernel/trace/trace_event_types.h @@ -102,7 +102,7 @@ TRACE_EVENT_FORMAT(user_stack, TRACE_USER_STACK, userstack_entry, ignore, "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n") ); -TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore, +TRACE_EVENT_FORMAT(bprint, TRACE_PRINT, bprint_entry, ignore, TRACE_STRUCT( TRACE_FIELD(unsigned long, ip, ip) TRACE_FIELD(unsigned int, depth, depth) @@ -112,6 +112,15 @@ TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore, TP_RAW_FMT("%08lx (%d) fmt:%p %s") ); +TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore, + TRACE_STRUCT( + TRACE_FIELD(unsigned long, ip, ip) + TRACE_FIELD(unsigned int, depth, depth) + TRACE_FIELD_ZERO_CHAR(buf) + ), + TP_RAW_FMT("%08lx (%d) fmt:%p %s") +); + TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore, TRACE_STRUCT( TRACE_FIELD(unsigned int, line, line) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 8566c14b3e9a..4c388607ed67 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ 
-684,7 +684,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, } static enum print_line_t -print_graph_comment(struct print_entry *trace, struct trace_seq *s, +print_graph_comment(struct bprint_entry *trace, struct trace_seq *s, struct trace_entry *ent, struct trace_iterator *iter) { int i; @@ -781,8 +781,8 @@ print_graph_function(struct trace_iterator *iter) trace_assign_type(field, entry); return print_graph_return(&field->ret, s, entry, iter); } - case TRACE_PRINT: { - struct print_entry *field; + case TRACE_BPRINT: { + struct bprint_entry *field; trace_assign_type(field, entry); return print_graph_comment(field, s, entry, iter); } diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 23e346a734ca..f095916e477f 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -254,6 +254,7 @@ static enum print_line_t mmio_print_mark(struct trace_iterator *iter) { struct trace_entry *entry = iter->ent; struct print_entry *print = (struct print_entry *)entry; + const char *msg = print->buf; struct trace_seq *s = &iter->seq; unsigned long long t = ns2usecs(iter->ts); unsigned long usec_rem = do_div(t, USEC_PER_SEC); @@ -261,11 +262,7 @@ static enum print_line_t mmio_print_mark(struct trace_iterator *iter) int ret; /* The trailing newline must be in the message. */ - ret = trace_seq_printf(s, "MARK %u.%06lu ", secs, usec_rem); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - ret = trace_seq_bprintf(s, print->fmt, print->buf); + ret = trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg); if (!ret) return TRACE_TYPE_PARTIAL_LINE; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 491832af9ba1..ea9d3b410c7a 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -832,13 +832,13 @@ static struct trace_event trace_user_stack_event = { .binary = trace_special_bin, }; -/* TRACE_PRINT */ +/* TRACE_BPRINT */ static enum print_line_t -trace_print_print(struct trace_iterator *iter, int flags) +trace_bprint_print(struct trace_iterator *iter, int flags) { struct trace_entry *entry = iter->ent; struct trace_seq *s = &iter->seq; - struct print_entry *field; + struct bprint_entry *field; trace_assign_type(field, entry); @@ -858,9 +858,10 @@ trace_print_print(struct trace_iterator *iter, int flags) } -static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags) +static enum print_line_t +trace_bprint_raw(struct trace_iterator *iter, int flags) { - struct print_entry *field; + struct bprint_entry *field; struct trace_seq *s = &iter->seq; trace_assign_type(field, iter->ent); @@ -878,12 +879,55 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags) } +static struct trace_event trace_bprint_event = { + .type = TRACE_BPRINT, + .trace = trace_bprint_print, + .raw = trace_bprint_raw, +}; + +/* TRACE_PRINT */ +static enum print_line_t trace_print_print(struct trace_iterator *iter, + int flags) +{ + struct print_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + if (!seq_print_ip_sym(s, field->ip, flags)) + goto partial; + + if (!trace_seq_printf(s, ": %s", field->buf)) + goto partial; + + return TRACE_TYPE_HANDLED; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags) +{ + struct print_entry *field; + + trace_assign_type(field, iter->ent); + + if (!trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf)) + 
goto partial; + + return TRACE_TYPE_HANDLED; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + static struct trace_event trace_print_event = { - .type = TRACE_PRINT, + .type = TRACE_PRINT, .trace = trace_print_print, .raw = trace_print_raw, }; + static struct trace_event *events[] __initdata = { &trace_fn_event, &trace_ctx_event, @@ -891,6 +935,7 @@ static struct trace_event *events[] __initdata = { &trace_special_event, &trace_stack_event, &trace_user_stack_event, + &trace_bprint_event, &trace_print_event, NULL }; diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index a50aea22e929..f307a11e2332 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -99,7 +99,7 @@ struct notifier_block module_trace_bprintk_format_nb = { .notifier_call = module_trace_bprintk_format_notify, }; -int __trace_printk(unsigned long ip, const char *fmt, ...) +int __trace_bprintk(unsigned long ip, const char *fmt, ...) { int ret; va_list ap; @@ -111,13 +111,13 @@ int __trace_printk(unsigned long ip, const char *fmt, ...) return 0; va_start(ap, fmt); - ret = trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap); + ret = trace_vbprintk(ip, task_curr_ret_stack(current), fmt, ap); va_end(ap); return ret; } -EXPORT_SYMBOL_GPL(__trace_printk); +EXPORT_SYMBOL_GPL(__trace_bprintk); -int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) +int __ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap) { if (unlikely(!fmt)) return 0; @@ -125,11 +125,34 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) if (!(trace_flags & TRACE_ITER_PRINTK)) return 0; + return trace_vbprintk(ip, task_curr_ret_stack(current), fmt, ap); +} +EXPORT_SYMBOL_GPL(__ftrace_vbprintk); + +int __trace_printk(unsigned long ip, const char *fmt, ...) +{ + int ret; + va_list ap; + + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + + va_start(ap, fmt); + ret = trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap); + va_end(ap); + return ret; +} +EXPORT_SYMBOL_GPL(__trace_printk); + +int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) +{ + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + return trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap); } EXPORT_SYMBOL_GPL(__ftrace_vprintk); - static __init int init_trace_printk(void) { return register_module_notifier(&module_trace_bprintk_format_nb); -- cgit v1.2.3 From 5d592b44b29a1d73e13d5c9e3426eed843bdc359 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Thu, 12 Mar 2009 14:33:36 -0400 Subject: tracing: tracepoints for softirq entry/exit - add softirq-to-name array Create a 'softirq_to_name' array, which is indexed by softirq #, so that we can easily convert between the softirq index # and its name, in order to get more meaningful output messages. LKML-Reference: <20090312183336.GB3352@redhat.com> Signed-off-by: Jason Baron Signed-off-by: Steven Rostedt --- include/linux/interrupt.h | 5 +++++ kernel/softirq.c | 9 ++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 472f11765f60..9b7e9d743476 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -258,6 +258,11 @@ enum NR_SOFTIRQS }; +/* map softirq index to softirq name. update 'softirq_to_name' in + * kernel/softirq.c when adding a new softirq. + */ +extern char *softirq_to_name[NR_SOFTIRQS]; + /* softirq mask and active fields moved to irq_cpustat_t in * asm/hardirq.h to get better cache usage. 
KAO */ diff --git a/kernel/softirq.c b/kernel/softirq.c index 7571bcb71be4..9f90fdc039f4 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -53,6 +53,12 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); +char *softirq_to_name[NR_SOFTIRQS] = { + "HI_SOFTIRQ", "TIMER_SOFTIRQ", "NET_TX_SOFTIRQ", "NET_RX_SOFTIRQ", + "BLOCK_SOFTIRQ", "TASKLET_SOFTIRQ", "SCHED_SOFTIRQ", "HRTIMER_SOFTIRQ", + "RCU_SOFTIRQ" +}; + /* * we cannot loop indefinitely here to avoid userspace starvation, * but we also don't want to introduce a worst case 1/HZ latency @@ -209,9 +215,10 @@ restart: h->action(h); if (unlikely(prev_count != preempt_count())) { - printk(KERN_ERR "huh, entered softirq %td %p" + printk(KERN_ERR "huh, entered softirq %td %s %p" "with preempt_count %08x," " exited with %08x?\n", h - softirq_vec, + softirq_to_name[h - softirq_vec], h->action, prev_count, preempt_count()); preempt_count() = prev_count; } -- cgit v1.2.3 From a70f730282019f487aa33a84e5ac9a5e89c5abd0 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 13 Mar 2009 14:49:46 +1030 Subject: cpumask: replace node_to_cpumask with cpumask_of_node. Impact: cleanup node_to_cpumask (and the blecherous node_to_cpumask_ptr which contained a declaration) are replaced now everyone implements cpumask_of_node. Signed-off-by: Rusty Russell --- drivers/base/node.c | 2 +- drivers/pci/pci-driver.c | 3 +-- include/linux/topology.h | 6 +----- mm/page_alloc.c | 6 +++--- mm/quicklist.c | 2 +- mm/slab.c | 2 +- mm/vmscan.c | 6 ++++-- net/sunrpc/svc.c | 3 +-- 8 files changed, 13 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/node.c b/drivers/base/node.c index f8f578a71b25..40b809742a1c 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -24,7 +24,7 @@ static struct sysdev_class node_class = { static ssize_t node_read_cpumap(struct sys_device *dev, int type, char *buf) { struct node *node_dev = to_node(dev); - node_to_cpumask_ptr(mask, node_dev->sysdev.id); + const struct cpumask *mask = cpumask_of_node(node_dev->sysdev.id); int len; /* 2008/04/07: buf currently PAGE_SIZE, need 9 chars per 32 bits. 
*/ diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index 93eac1423585..b522f883d674 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -212,10 +212,9 @@ static int pci_call_probe(struct pci_driver *drv, struct pci_dev *dev, node = dev_to_node(&dev->dev); if (node >= 0) { int cpu; - node_to_cpumask_ptr(nodecpumask, node); get_online_cpus(); - cpu = cpumask_any_and(nodecpumask, cpu_online_mask); + cpu = cpumask_any_and(cpumask_of_node(node), cpu_online_mask); if (cpu < nr_cpu_ids) error = work_on_cpu(cpu, local_pci_probe, &ddi); else diff --git a/include/linux/topology.h b/include/linux/topology.h index a16b9e06f2e5..16b7d6896ce9 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -38,11 +38,7 @@ #endif #ifndef nr_cpus_node -#define nr_cpus_node(node) \ - ({ \ - node_to_cpumask_ptr(__tmp__, node); \ - cpus_weight(*__tmp__); \ - }) +#define nr_cpus_node(node) cpumask_weight(cpumask_of_node(node)) #endif #define for_each_node_with_cpus(node) \ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5c44ed49ca93..a92b0975b9a5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2134,7 +2134,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask) int n, val; int min_val = INT_MAX; int best_node = -1; - node_to_cpumask_ptr(tmp, 0); + const struct cpumask *tmp = cpumask_of_node(0); /* Use the local node if we haven't already */ if (!node_isset(node, *used_node_mask)) { @@ -2155,8 +2155,8 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask) val += (n < node); /* Give preference to headless and unused nodes */ - node_to_cpumask_ptr_next(tmp, n); - if (!cpus_empty(*tmp)) + tmp = cpumask_of_node(n); + if (!cpumask_empty(tmp)) val += PENALTY_FOR_NODE_WITH_CPUS; /* Slight preference for less loaded node */ diff --git a/mm/quicklist.c b/mm/quicklist.c index 8dbb6805ef35..e66d07d1b4ff 100644 --- a/mm/quicklist.c +++ b/mm/quicklist.c @@ -29,7 +29,7 @@ static unsigned long max_pages(unsigned long min_pages) int node = numa_node_id(); struct zone *zones = NODE_DATA(node)->node_zones; int num_cpus_on_node; - node_to_cpumask_ptr(cpumask_on_node, node); + const struct cpumask *cpumask_on_node = cpumask_of_node(node); node_free_pages = #ifdef CONFIG_ZONE_DMA diff --git a/mm/slab.c b/mm/slab.c index 4d00855629c4..2daaca0b4541 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1160,7 +1160,7 @@ static void __cpuinit cpuup_canceled(long cpu) struct kmem_cache *cachep; struct kmem_list3 *l3 = NULL; int node = cpu_to_node(cpu); - node_to_cpumask_ptr(mask, node); + const struct cpumask *mask = cpumask_of_node(node); list_for_each_entry(cachep, &cache_chain, next) { struct array_cache *nc; diff --git a/mm/vmscan.c b/mm/vmscan.c index 6177e3bcd66b..cc6135586b44 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1963,7 +1963,7 @@ static int kswapd(void *p) struct reclaim_state reclaim_state = { .reclaimed_slab = 0, }; - node_to_cpumask_ptr(cpumask, pgdat->node_id); + const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); if (!cpumask_empty(cpumask)) set_cpus_allowed_ptr(tsk, cpumask); @@ -2198,7 +2198,9 @@ static int __devinit cpu_callback(struct notifier_block *nfb, if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { for_each_node_state(nid, N_HIGH_MEMORY) { pg_data_t *pgdat = NODE_DATA(nid); - node_to_cpumask_ptr(mask, pgdat->node_id); + const struct cpumask *mask; + + mask = cpumask_of_node(pgdat->node_id); if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) /* One of our CPUs online: restore mask */ diff --git 
a/net/sunrpc/svc.c b/net/sunrpc/svc.c index c51fed4d1af1..3bdd5bffaca8 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -317,8 +317,7 @@ svc_pool_map_set_cpumask(struct task_struct *task, unsigned int pidx) } case SVC_POOL_PERNODE: { - node_to_cpumask_ptr(nodecpumask, node); - set_cpus_allowed_ptr(task, nodecpumask); + set_cpus_allowed_ptr(task, cpumask_of_node(node)); break; } } -- cgit v1.2.3 From ee08c6eccb7d1295516f7cf420fddf7b14e9146f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 7 Mar 2009 05:52:59 +0100 Subject: tracing/ftrace: syscall tracing infrastructure, basics Provide basic callbacks to do syscall tracing. Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Cc: Lai Jiangshan LKML-Reference: <1236401580-5758-2-git-send-email-fweisbec@gmail.com> [ simplified it to a trace_printk() for now. ] Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 21 ++++++++ kernel/trace/Kconfig | 10 ++++ kernel/trace/Makefile | 1 + kernel/trace/trace.h | 2 + kernel/trace/trace_syscalls.c | 113 ++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 147 insertions(+) create mode 100644 kernel/trace/trace_syscalls.c (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index e1583f2639b0..c146c1021a29 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -503,4 +503,25 @@ static inline void trace_hw_branch_oops(void) {} #endif /* CONFIG_HW_BRANCH_TRACER */ +/* + * A syscall entry in the ftrace syscalls array. + * + * @syscall_nr: syscall number + */ +struct syscall_trace_entry { + int syscall_nr; +}; + +#ifdef CONFIG_FTRACE_SYSCALLS +extern void start_ftrace_syscalls(void); +extern void stop_ftrace_syscalls(void); +extern void ftrace_syscall_enter(struct pt_regs *regs); +extern void ftrace_syscall_exit(struct pt_regs *regs); +#else +static inline void start_ftrace_syscalls(void) { } +static inline void stop_ftrace_syscalls(void) { } +static inline void ftrace_syscall_enter(struct pt_regs *regs) { } +static inline void ftrace_syscall_exit(struct pt_regs *regs) { } +#endif + #endif /* _LINUX_FTRACE_H */ diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 8e4a2a61cd75..95a0ad191f19 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -34,6 +34,9 @@ config HAVE_FTRACE_MCOUNT_RECORD config HAVE_HW_BRANCH_TRACER bool +config HAVE_FTRACE_SYSCALLS + bool + config TRACER_MAX_TRACE bool @@ -175,6 +178,13 @@ config EVENT_TRACER allowing the user to pick and choose which trace point they want to trace. +config FTRACE_SYSCALLS + bool "Trace syscalls" + depends on HAVE_FTRACE_SYSCALLS + select TRACING + help + Basic tracer to catch the syscall entry and exit events. 
+ config BOOT_TRACER bool "Trace boot initcalls" select TRACING diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index c7a2943796eb..c3feea01c3e0 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -43,5 +43,6 @@ obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o obj-$(CONFIG_EVENT_TRACER) += trace_events.o obj-$(CONFIG_EVENT_TRACER) += events.o obj-$(CONFIG_EVENT_TRACER) += trace_export.o +obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c5e1d8865fe4..3d49daae47dc 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -30,6 +30,8 @@ enum trace_type { TRACE_GRAPH_ENT, TRACE_USER_STACK, TRACE_HW_BRANCHES, + TRACE_SYSCALL_ENTER, + TRACE_SYSCALL_EXIT, TRACE_KMEM_ALLOC, TRACE_KMEM_FREE, TRACE_POWER, diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c new file mode 100644 index 000000000000..66cf97449af3 --- /dev/null +++ b/kernel/trace/trace_syscalls.c @@ -0,0 +1,113 @@ +#include +#include + +#include + +#include "trace_output.h" +#include "trace.h" + +static atomic_t refcount; + +void start_ftrace_syscalls(void) +{ + unsigned long flags; + struct task_struct *g, *t; + + if (atomic_inc_return(&refcount) != 1) + goto out; + + read_lock_irqsave(&tasklist_lock, flags); + + do_each_thread(g, t) { + set_tsk_thread_flag(t, TIF_SYSCALL_FTRACE); + } while_each_thread(g, t); + + read_unlock_irqrestore(&tasklist_lock, flags); +out: + atomic_dec(&refcount); +} + +void stop_ftrace_syscalls(void) +{ + unsigned long flags; + struct task_struct *g, *t; + + if (atomic_dec_return(&refcount)) + goto out; + + read_lock_irqsave(&tasklist_lock, flags); + + do_each_thread(g, t) { + clear_tsk_thread_flag(t, TIF_SYSCALL_FTRACE); + } while_each_thread(g, t); + + read_unlock_irqrestore(&tasklist_lock, flags); +out: + atomic_inc(&refcount); +} + +void ftrace_syscall_enter(struct pt_regs *regs) +{ + int syscall_nr; + + syscall_nr = syscall_get_nr(current, regs); + + trace_printk("syscall %d enter\n", syscall_nr); +} + +void ftrace_syscall_exit(struct pt_regs *regs) +{ + int syscall_nr; + + syscall_nr = syscall_get_nr(current, regs); + + trace_printk("syscall %d exit\n", syscall_nr); +} + +static int init_syscall_tracer(struct trace_array *tr) +{ + start_ftrace_syscalls(); + + return 0; +} + +static void reset_syscall_tracer(struct trace_array *tr) +{ + stop_ftrace_syscalls(); +} + +static struct trace_event syscall_enter_event = { + .type = TRACE_SYSCALL_ENTER, +}; + +static struct trace_event syscall_exit_event = { + .type = TRACE_SYSCALL_EXIT, +}; + +static struct tracer syscall_tracer __read_mostly = { + .name = "syscall", + .init = init_syscall_tracer, + .reset = reset_syscall_tracer +}; + +__init int register_ftrace_syscalls(void) +{ + int ret; + + ret = register_ftrace_event(&syscall_enter_event); + if (!ret) { + printk(KERN_WARNING "event %d failed to register\n", + syscall_enter_event.type); + WARN_ON_ONCE(1); + } + + ret = register_ftrace_event(&syscall_exit_event); + if (!ret) { + printk(KERN_WARNING "event %d failed to register\n", + syscall_exit_event.type); + WARN_ON_ONCE(1); + } + + return register_tracer(&syscall_tracer); +} +device_initcall(register_ftrace_syscalls); -- cgit v1.2.3 From d1dedb52acd98bd5e13e1ff4c4d045d58bbd16fe Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 13 Mar 2009 11:14:06 +0100 Subject: panic, smp: provide smp_send_stop() wrapper on UP too Impact: cleanup, no code changed Remove an ugly #ifdef CONFIG_SMP from panic(), by providing an 
smp_send_stop() wrapper on UP too. LKML-Reference: <49B91A7E.76E4.0078.0@novell.com> Signed-off-by: Ingo Molnar --- include/linux/smp.h | 4 +++- kernel/panic.c | 2 -- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/smp.h b/include/linux/smp.h index 2d3bcb6b37ff..a69db820eed6 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -38,7 +38,7 @@ int smp_call_function_single(int cpuid, void (*func) (void *info), void *info, /* * main cross-CPU interfaces, handles INIT, TLB flush, STOP, etc. * (defined in asm header): - */ + */ /* * stops all CPUs but the current one: @@ -122,6 +122,8 @@ extern unsigned int setup_max_cpus; #else /* !SMP */ +static inline void smp_send_stop(void) { } + /* * These macros fold the SMP functionality into a single CPU system */ diff --git a/kernel/panic.c b/kernel/panic.c index 57fb005de546..ca75e819d0ea 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -85,14 +85,12 @@ NORET_TYPE void panic(const char * fmt, ...) */ crash_kexec(NULL); -#ifdef CONFIG_SMP /* * Note smp_send_stop is the usual smp shutdown function, which * unfortunately means it may not be hardened to work in a panic * situation. */ smp_send_stop(); -#endif atomic_notifier_call_chain(&panic_notifier_list, 0, buf); -- cgit v1.2.3 From e94142a67f8bad494c593f0a07c9fc2fbec98c0e Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 13 Mar 2009 17:51:27 +0800 Subject: ftrace: remove struct list_head from struct dyn_ftrace Impact: save memory The struct dyn_ftrace table is very large, this patch will save about 50%. Signed-off-by: Lai Jiangshan Cc: Steven Rostedt LKML-Reference: <49BA2C9F.8020009@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 1 - kernel/trace/ftrace.c | 14 ++++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index c146c1021a29..9d598bbf28a6 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -145,7 +145,6 @@ enum { }; struct dyn_ftrace { - struct list_head list; unsigned long ip; /* address of mcount call-site */ unsigned long flags; struct dyn_arch_ftrace arch; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index bf78a4c75c67..90d5729afeff 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -272,7 +272,7 @@ enum { static int ftrace_filtered; -static LIST_HEAD(ftrace_new_addrs); +static struct dyn_ftrace *ftrace_new_addrs; static DEFINE_MUTEX(ftrace_regex_lock); @@ -409,8 +409,8 @@ ftrace_record_ip(unsigned long ip) return NULL; rec->ip = ip; - - list_add(&rec->list, &ftrace_new_addrs); + rec->flags = (unsigned long)ftrace_new_addrs; + ftrace_new_addrs = rec; return rec; } @@ -716,19 +716,21 @@ unsigned long ftrace_update_tot_cnt; static int ftrace_update_code(struct module *mod) { - struct dyn_ftrace *p, *t; + struct dyn_ftrace *p; cycle_t start, stop; start = ftrace_now(raw_smp_processor_id()); ftrace_update_cnt = 0; - list_for_each_entry_safe(p, t, &ftrace_new_addrs, list) { + while (ftrace_new_addrs) { /* If something went wrong, bail without enabling anything */ if (unlikely(ftrace_disabled)) return -1; - list_del_init(&p->list); + p = ftrace_new_addrs; + ftrace_new_addrs = (struct dyn_ftrace *)p->flags; + p->flags = 0L; /* convert record (i.e, patch mcount-call with NOP) */ if (ftrace_code_disable(mod, p)) { -- cgit v1.2.3 From 082edb7bf443eb8eda15b482d16ad9dd8137ad24 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 13 Mar 2009 23:43:37 +1030 
Subject: numa, cpumask: move numa_node_id default implementation to topology.h Impact: cleanup, potential bugfix Not sure what changed to expose this, but clearly that numa_node_id() doesn't belong in mmzone.h (the inline in gfp.h is probably overkill, too). In file included from include/linux/topology.h:34, from arch/x86/mm/numa.c:2: /home/rusty/patches-cpumask/linux-2.6/arch/x86/include/asm/topology.h:64:1: warning: "numa_node_id" redefined In file included from include/linux/topology.h:32, from arch/x86/mm/numa.c:2: include/linux/mmzone.h:770:1: warning: this is the location of the previous definition Signed-off-by: Rusty Russell Cc: Mike Travis LKML-Reference: <200903132343.37661.rusty@rustcorp.com.au> Signed-off-by: Ingo Molnar --- include/linux/gfp.h | 1 + include/linux/mmzone.h | 6 ------ include/linux/topology.h | 5 +++++ 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index dd20cd78faa8..0bbc15f54536 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -4,6 +4,7 @@ #include #include #include +#include struct vm_area_struct; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 1aca6cebbb78..e6aacf77986a 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -764,12 +764,6 @@ extern int numa_zonelist_order_handler(struct ctl_table *, int, extern char numa_zonelist_order[]; #define NUMA_ZONELIST_ORDER_LEN 16 /* string buffer size */ -#include -/* Returns the number of the current Node. */ -#ifndef numa_node_id -#define numa_node_id() (cpu_to_node(raw_smp_processor_id())) -#endif - #ifndef CONFIG_NEED_MULTIPLE_NODES extern struct pglist_data contig_page_data; diff --git a/include/linux/topology.h b/include/linux/topology.h index 16b7d6896ce9..7402c1a27c4f 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -196,4 +196,9 @@ int arch_update_cpu_topology(void); #define topology_core_cpumask(cpu) cpumask_of(cpu) #endif +/* Returns the number of the current Node. */ +#ifndef numa_node_id +#define numa_node_id() (cpu_to_node(raw_smp_processor_id())) +#endif + #endif /* _LINUX_TOPOLOGY_H */ -- cgit v1.2.3 From bed1ffca022cc876fb83161d26670e9b5d3cf36b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 13 Mar 2009 15:42:11 +0100 Subject: tracing/syscalls: core infrastructure for syscalls tracing, enhancements Impact: new feature This adds the generic support for syscalls tracing. This is currently exploited through a devoted tracer but other tracing engines can use it. (They just have to play with {start,stop}_ftrace_syscalls() and use the display callbacks unless they want to override them.) The syscalls prototypes definitions are abused here to steal some metadata informations: - syscall name, param types, param names, number of params The syscall addr is not directly saved during this definition because we don't know if its prototype is available in the namespace. But we don't really need it. The arch has just to build a function able to resolve the syscall number to its metadata struct. The current tracer prints the syscall names, parameters names and values (and their types optionally). Currently the value is a raw hex but higher level values diplaying is on my TODO list. 
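A minimal user-space sketch of the stringification idea described above may help; the macro and identifier names here (demo_meta, DEMO_DEFINE2, frobnicate) are invented purely for illustration, and only the diff below shows the real SYSCALL_DEFINEx/SYSCALL_METADATA macros:

        #include <stdio.h>

        struct demo_meta {
                const char *name;
                int nb_args;
                const char **types;
                const char **args;
        };

        /*
         * Hypothetical 2-argument variant of the trick: each (type, name)
         * pair is stringified into two parallel arrays at compile time and
         * a metadata record points at them, while the macro still expands
         * into the ordinary function definition.
         */
        #define DEMO_DEFINE2(fname, t1, a1, t2, a2)                       \
                static const char *types_##fname[] = { #t1, #t2 };        \
                static const char *args_##fname[]  = { #a1, #a2 };        \
                static const struct demo_meta meta_##fname = {            \
                        .name    = "sys_" #fname,                         \
                        .nb_args = 2,                                     \
                        .types   = types_##fname,                         \
                        .args    = args_##fname,                          \
                };                                                        \
                static long fname(t1 a1, t2 a2)

        DEMO_DEFINE2(frobnicate, unsigned long, addr, int, flags)
        {
                (void)addr;
                (void)flags;
                return 0;
        }

        int main(void)
        {
                int i;

                frobnicate(0, 0);
                /* prints: sys_frobnicate(unsigned long addr, int flags) */
                printf("%s(", meta_frobnicate.name);
                for (i = 0; i < meta_frobnicate.nb_args; i++)
                        printf("%s %s%s", meta_frobnicate.types[i],
                               meta_frobnicate.args[i],
                               i == meta_frobnicate.nb_args - 1 ? ")\n" : ", ");
                return 0;
        }

The real SYSCALL_METADATA additionally places each record in the dedicated __syscalls_metadata section (collected between __start_syscalls_metadata and __stop_syscalls_metadata by the linker-script change in the diff below), which is what lets the arch-provided syscall_nr_to_meta() resolve a syscall number to its record.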
Signed-off-by: Frederic Weisbecker LKML-Reference: <1236955332-10133-2-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- include/asm-generic/vmlinux.lds.h | 11 ++- include/linux/ftrace.h | 14 +++- include/linux/syscalls.h | 60 +++++++++++++++- kernel/trace/trace.h | 17 +++++ kernel/trace/trace_syscalls.c | 146 +++++++++++++++++++++++++++++++++++--- 5 files changed, 234 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 0e0f39be6c8b..d3bc3c86df6a 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -77,6 +77,14 @@ #define TRACE_PRINTKS() #endif +#ifdef CONFIG_FTRACE_SYSCALLS +#define TRACE_SYSCALLS() VMLINUX_SYMBOL(__start_syscalls_metadata) = .; \ + *(__syscalls_metadata) \ + VMLINUX_SYMBOL(__stop_syscalls_metadata) = .; +#else +#define TRACE_SYSCALLS() +#endif + /* .data section */ #define DATA_DATA \ *(.data) \ @@ -99,7 +107,8 @@ LIKELY_PROFILE() \ BRANCH_PROFILE() \ TRACE_PRINTKS() \ - FTRACE_EVENTS() + FTRACE_EVENTS() \ + TRACE_SYSCALLS() #define RO_DATA(align) \ . = ALIGN((align)); \ diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index c146c1021a29..6dc1c652447e 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -506,13 +506,21 @@ static inline void trace_hw_branch_oops(void) {} /* * A syscall entry in the ftrace syscalls array. * - * @syscall_nr: syscall number + * @name: name of the syscall + * @nb_args: number of parameters it takes + * @types: list of types as strings + * @args: list of args as strings (args[i] matches types[i]) */ -struct syscall_trace_entry { - int syscall_nr; +struct syscall_metadata { + const char *name; + int nb_args; + const char **types; + const char **args; }; #ifdef CONFIG_FTRACE_SYSCALLS +extern void arch_init_ftrace_syscalls(void); +extern struct syscall_metadata *syscall_nr_to_meta(int nr); extern void start_ftrace_syscalls(void); extern void stop_ftrace_syscalls(void); extern void ftrace_syscall_enter(struct pt_regs *regs); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index f9f900cfd066..0cff9bb80b02 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -65,6 +65,7 @@ struct old_linux_dirent; #include #include #include +#include #define __SC_DECL1(t1, a1) t1 a1 #define __SC_DECL2(t2, a2, ...) t2 a2, __SC_DECL1(__VA_ARGS__) @@ -95,7 +96,46 @@ struct old_linux_dirent; #define __SC_TEST5(t5, a5, ...) __SC_TEST(t5); __SC_TEST4(__VA_ARGS__) #define __SC_TEST6(t6, a6, ...) __SC_TEST(t6); __SC_TEST5(__VA_ARGS__) +#ifdef CONFIG_FTRACE_SYSCALLS +#define __SC_STR_ADECL1(t, a) #a +#define __SC_STR_ADECL2(t, a, ...) #a, __SC_STR_ADECL1(__VA_ARGS__) +#define __SC_STR_ADECL3(t, a, ...) #a, __SC_STR_ADECL2(__VA_ARGS__) +#define __SC_STR_ADECL4(t, a, ...) #a, __SC_STR_ADECL3(__VA_ARGS__) +#define __SC_STR_ADECL5(t, a, ...) #a, __SC_STR_ADECL4(__VA_ARGS__) +#define __SC_STR_ADECL6(t, a, ...) #a, __SC_STR_ADECL5(__VA_ARGS__) + +#define __SC_STR_TDECL1(t, a) #t +#define __SC_STR_TDECL2(t, a, ...) #t, __SC_STR_TDECL1(__VA_ARGS__) +#define __SC_STR_TDECL3(t, a, ...) #t, __SC_STR_TDECL2(__VA_ARGS__) +#define __SC_STR_TDECL4(t, a, ...) #t, __SC_STR_TDECL3(__VA_ARGS__) +#define __SC_STR_TDECL5(t, a, ...) #t, __SC_STR_TDECL4(__VA_ARGS__) +#define __SC_STR_TDECL6(t, a, ...) 
#t, __SC_STR_TDECL5(__VA_ARGS__) + +#define SYSCALL_METADATA(sname, nb) \ + static const struct syscall_metadata __used \ + __attribute__((__aligned__(4))) \ + __attribute__((section("__syscalls_metadata"))) \ + __syscall_meta_##sname = { \ + .name = "sys"#sname, \ + .nb_args = nb, \ + .types = types_##sname, \ + .args = args_##sname, \ + } + +#define SYSCALL_DEFINE0(sname) \ + static const struct syscall_metadata __used \ + __attribute__((__aligned__(4))) \ + __attribute__((section("__syscalls_metadata"))) \ + __syscall_meta_##sname = { \ + .name = "sys_"#sname, \ + .nb_args = 0, \ + }; \ + asmlinkage long sys_##sname(void) + +#else #define SYSCALL_DEFINE0(name) asmlinkage long sys_##name(void) +#endif + #define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__) #define SYSCALL_DEFINE2(name, ...) SYSCALL_DEFINEx(2, _##name, __VA_ARGS__) #define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__) @@ -117,10 +157,26 @@ struct old_linux_dirent; #endif #endif +#ifdef CONFIG_FTRACE_SYSCALLS +#define SYSCALL_DEFINEx(x, sname, ...) \ + static const char *types_##sname[] = { \ + __SC_STR_TDECL##x(__VA_ARGS__) \ + }; \ + static const char *args_##sname[] = { \ + __SC_STR_ADECL##x(__VA_ARGS__) \ + }; \ + SYSCALL_METADATA(sname, x); \ + __SYSCALL_DEFINEx(x, sname, __VA_ARGS__) +#else +#define SYSCALL_DEFINEx(x, sname, ...) \ + __SYSCALL_DEFINEx(x, sname, __VA_ARGS__) +#endif + #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS #define SYSCALL_DEFINE(name) static inline long SYSC_##name -#define SYSCALL_DEFINEx(x, name, ...) \ + +#define __SYSCALL_DEFINEx(x, name, ...) \ asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__)); \ static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__)); \ asmlinkage long SyS##name(__SC_LONG##x(__VA_ARGS__)) \ @@ -134,7 +190,7 @@ struct old_linux_dirent; #else /* CONFIG_HAVE_SYSCALL_WRAPPERS */ #define SYSCALL_DEFINE(name) asmlinkage long sys_##name -#define SYSCALL_DEFINEx(x, name, ...) \ +#define __SYSCALL_DEFINEx(x, name, ...) \ asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__)) #endif /* CONFIG_HAVE_SYSCALL_WRAPPERS */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3d49daae47dc..d80ca0d464d9 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -194,6 +194,19 @@ struct kmemtrace_free_entry { const void *ptr; }; +struct syscall_trace_enter { + struct trace_entry ent; + int nr; + unsigned long args[]; +}; + +struct syscall_trace_exit { + struct trace_entry ent; + int nr; + unsigned long ret; +}; + + /* * trace_flag_type is an enumeration that holds different * states when a trace occurs. 
These are: @@ -306,6 +319,10 @@ extern void __ftrace_bad_type(void); TRACE_KMEM_ALLOC); \ IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ TRACE_KMEM_FREE); \ + IF_ASSIGN(var, ent, struct syscall_trace_enter, \ + TRACE_SYSCALL_ENTER); \ + IF_ASSIGN(var, ent, struct syscall_trace_exit, \ + TRACE_SYSCALL_EXIT); \ __ftrace_bad_type(); \ } while (0) diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 66cf97449af3..c72e599230ff 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -1,6 +1,5 @@ -#include #include - +#include #include #include "trace_output.h" @@ -8,6 +7,90 @@ static atomic_t refcount; +/* Our two options */ +enum { + TRACE_SYSCALLS_OPT_TYPES = 0x1, +}; + +static struct tracer_opt syscalls_opts[] = { + { TRACER_OPT(syscall_arg_type, TRACE_SYSCALLS_OPT_TYPES) }, + { } +}; + +static struct tracer_flags syscalls_flags = { + .val = 0, /* By default: no args types */ + .opts = syscalls_opts +}; + +enum print_line_t +print_syscall_enter(struct trace_iterator *iter, int flags) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *ent = iter->ent; + struct syscall_trace_enter *trace; + struct syscall_metadata *entry; + int i, ret, syscall; + + trace_assign_type(trace, ent); + + syscall = trace->nr; + + entry = syscall_nr_to_meta(syscall); + if (!entry) + goto end; + + ret = trace_seq_printf(s, "%s(", entry->name); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + for (i = 0; i < entry->nb_args; i++) { + /* parameter types */ + if (syscalls_flags.val & TRACE_SYSCALLS_OPT_TYPES) { + ret = trace_seq_printf(s, "%s ", entry->types[i]); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + /* parameter values */ + ret = trace_seq_printf(s, "%s: %lx%s ", entry->args[i], + trace->args[i], + i == entry->nb_args - 1 ? 
")" : ","); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + +end: + trace_seq_printf(s, "\n"); + return TRACE_TYPE_HANDLED; +} + +enum print_line_t +print_syscall_exit(struct trace_iterator *iter, int flags) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *ent = iter->ent; + struct syscall_trace_exit *trace; + int syscall; + struct syscall_metadata *entry; + int ret; + + trace_assign_type(trace, ent); + + syscall = trace->nr; + + entry = syscall_nr_to_meta(syscall); + if (!entry) { + trace_seq_printf(s, "\n"); + return TRACE_TYPE_HANDLED; + } + + ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name, + trace->ret); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + void start_ftrace_syscalls(void) { unsigned long flags; @@ -16,6 +99,7 @@ void start_ftrace_syscalls(void) if (atomic_inc_return(&refcount) != 1) goto out; + arch_init_ftrace_syscalls(); read_lock_irqsave(&tasklist_lock, flags); do_each_thread(g, t) { @@ -48,20 +132,63 @@ out: void ftrace_syscall_enter(struct pt_regs *regs) { + struct syscall_trace_enter *entry; + struct syscall_metadata *sys_data; + struct ring_buffer_event *event; + int size; int syscall_nr; + int cpu; syscall_nr = syscall_get_nr(current, regs); - trace_printk("syscall %d enter\n", syscall_nr); + cpu = raw_smp_processor_id(); + + sys_data = syscall_nr_to_meta(syscall_nr); + if (!sys_data) + return; + + size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; + + event = trace_current_buffer_lock_reserve(TRACE_SYSCALL_ENTER, size, + 0, 0); + if (!event) + return; + + entry = ring_buffer_event_data(event); + entry->nr = syscall_nr; + syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args); + + trace_current_buffer_unlock_commit(event, 0, 0); + trace_wake_up(); } void ftrace_syscall_exit(struct pt_regs *regs) { + struct syscall_trace_exit *entry; + struct syscall_metadata *sys_data; + struct ring_buffer_event *event; int syscall_nr; + int cpu; syscall_nr = syscall_get_nr(current, regs); - trace_printk("syscall %d exit\n", syscall_nr); + cpu = raw_smp_processor_id(); + + sys_data = syscall_nr_to_meta(syscall_nr); + if (!sys_data) + return; + + event = trace_current_buffer_lock_reserve(TRACE_SYSCALL_EXIT, + sizeof(*entry), 0, 0); + if (!event) + return; + + entry = ring_buffer_event_data(event); + entry->nr = syscall_nr; + entry->ret = syscall_get_return_value(current, regs); + + trace_current_buffer_unlock_commit(event, 0, 0); + trace_wake_up(); } static int init_syscall_tracer(struct trace_array *tr) @@ -77,17 +204,20 @@ static void reset_syscall_tracer(struct trace_array *tr) } static struct trace_event syscall_enter_event = { - .type = TRACE_SYSCALL_ENTER, + .type = TRACE_SYSCALL_ENTER, + .trace = print_syscall_enter, }; static struct trace_event syscall_exit_event = { - .type = TRACE_SYSCALL_EXIT, + .type = TRACE_SYSCALL_EXIT, + .trace = print_syscall_exit, }; static struct tracer syscall_tracer __read_mostly = { - .name = "syscall", + .name = "syscall", .init = init_syscall_tracer, - .reset = reset_syscall_tracer + .reset = reset_syscall_tracer, + .flags = &syscalls_flags, }; __init int register_ftrace_syscalls(void) -- cgit v1.2.3 From 37886f6a9f62d22530ffee8d3f9215c8345b6969 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 17 Mar 2009 17:22:06 -0400 Subject: ring-buffer: add api to allow a tracer to change clock source This patch adds a new function called ring_buffer_set_clock that allows a tracer to assign its own clock source to the buffer. 
Signed-off-by: Steven Rostedt --- include/linux/ring_buffer.h | 7 +++-- kernel/trace/ring_buffer.c | 65 ++++++++++++++++++++++++++------------------- kernel/trace/trace.c | 21 ++++++++++----- 3 files changed, 57 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index b1a0068a5557..9e6052bd1a1c 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -118,8 +118,11 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer); unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu); unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu); -u64 ring_buffer_time_stamp(int cpu); -void ring_buffer_normalize_time_stamp(int cpu, u64 *ts); +u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu); +void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer, + int cpu, u64 *ts); +void ring_buffer_set_clock(struct ring_buffer *buffer, + u64 (*clock)(void)); size_t ring_buffer_page_len(void *page); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 58128ad2fde0..bbf51922a8ca 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -180,29 +180,6 @@ EXPORT_SYMBOL_GPL(tracing_is_on); #include "trace.h" -/* Up this if you want to test the TIME_EXTENTS and normalization */ -#define DEBUG_SHIFT 0 - -u64 ring_buffer_time_stamp(int cpu) -{ - u64 time; - - preempt_disable_notrace(); - /* shift to debug/test normalization and TIME_EXTENTS */ - time = trace_clock_local() << DEBUG_SHIFT; - preempt_enable_no_resched_notrace(); - - return time; -} -EXPORT_SYMBOL_GPL(ring_buffer_time_stamp); - -void ring_buffer_normalize_time_stamp(int cpu, u64 *ts) -{ - /* Just stupid testing the normalize function and deltas */ - *ts >>= DEBUG_SHIFT; -} -EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp); - #define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) #define RB_ALIGNMENT 4U #define RB_MAX_SMALL_DATA 28 @@ -374,6 +351,7 @@ struct ring_buffer { #ifdef CONFIG_HOTPLUG_CPU struct notifier_block cpu_notify; #endif + u64 (*clock)(void); }; struct ring_buffer_iter { @@ -394,6 +372,30 @@ struct ring_buffer_iter { _____ret; \ }) +/* Up this if you want to test the TIME_EXTENTS and normalization */ +#define DEBUG_SHIFT 0 + +u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu) +{ + u64 time; + + preempt_disable_notrace(); + /* shift to debug/test normalization and TIME_EXTENTS */ + time = buffer->clock() << DEBUG_SHIFT; + preempt_enable_no_resched_notrace(); + + return time; +} +EXPORT_SYMBOL_GPL(ring_buffer_time_stamp); + +void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer, + int cpu, u64 *ts) +{ + /* Just stupid testing the normalize function and deltas */ + *ts >>= DEBUG_SHIFT; +} +EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp); + /** * check_pages - integrity check of buffer pages * @cpu_buffer: CPU buffer with pages to test @@ -569,6 +571,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); buffer->flags = flags; + buffer->clock = trace_clock_local; /* need at least two pages */ if (buffer->pages == 1) @@ -645,6 +648,12 @@ ring_buffer_free(struct ring_buffer *buffer) } EXPORT_SYMBOL_GPL(ring_buffer_free); +void ring_buffer_set_clock(struct ring_buffer *buffer, + u64 (*clock)(void)) +{ + buffer->clock = clock; +} + static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); static void @@ -1191,7 +1200,7 
@@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, cpu_buffer->tail_page = next_page; /* reread the time stamp */ - *ts = ring_buffer_time_stamp(cpu_buffer->cpu); + *ts = ring_buffer_time_stamp(buffer, cpu_buffer->cpu); cpu_buffer->tail_page->page->time_stamp = *ts; } @@ -1334,7 +1343,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) return NULL; - ts = ring_buffer_time_stamp(cpu_buffer->cpu); + ts = ring_buffer_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu); /* * Only the first commit can update the timestamp. @@ -2051,7 +2060,8 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) case RINGBUF_TYPE_DATA: if (ts) { *ts = cpu_buffer->read_stamp + event->time_delta; - ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts); + ring_buffer_normalize_time_stamp(buffer, + cpu_buffer->cpu, ts); } return event; @@ -2112,7 +2122,8 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) case RINGBUF_TYPE_DATA: if (ts) { *ts = iter->read_stamp + event->time_delta; - ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts); + ring_buffer_normalize_time_stamp(buffer, + cpu_buffer->cpu, ts); } return event; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8f89690230e6..3be2f788e10d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -155,13 +155,6 @@ ns2usecs(cycle_t nsec) return nsec; } -cycle_t ftrace_now(int cpu) -{ - u64 ts = ring_buffer_time_stamp(cpu); - ring_buffer_normalize_time_stamp(cpu, &ts); - return ts; -} - /* * The global_trace is the descriptor that holds the tracing * buffers for the live tracing. For each CPU, it contains @@ -178,6 +171,20 @@ static struct trace_array global_trace; static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); +cycle_t ftrace_now(int cpu) +{ + u64 ts; + + /* Early boot up does not have a buffer yet */ + if (!global_trace.buffer) + return trace_clock_local(); + + ts = ring_buffer_time_stamp(global_trace.buffer, cpu); + ring_buffer_normalize_time_stamp(global_trace.buffer, cpu, &ts); + + return ts; +} + /* * The max_tr is used to snapshot the global_trace when a maximum * latency is reached. Some tracers will use this to store a maximum -- cgit v1.2.3 From 97e7e4f391cac2b00417b581b432533d245d4fd0 Mon Sep 17 00:00:00 2001 From: Witold Baryluk Date: Tue, 17 Mar 2009 21:15:44 +0100 Subject: tracing: optimization of branch tracer Impact: better performance for if branch tracer Use an array to count the hit and misses of a conditional instead of using another conditional. This cuts down on saturation of branch predictions and increases performance of modern pipelined architectures. Signed-off-by: Witold Baryluk Signed-off-by: Steven Rostedt --- include/linux/compiler.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index d95da1020f1c..6faa7e549de4 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -68,6 +68,7 @@ struct ftrace_branch_data { unsigned long miss; unsigned long hit; }; + unsigned long miss_hit[2]; }; }; @@ -125,10 +126,7 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect); .line = __LINE__, \ }; \ ______r = !!(cond); \ - if (______r) \ - ______f.hit++; \ - else \ - ______f.miss++; \ + ______f.miss_hit[______r]++; \ ______r; \ })) #endif /* CONFIG_PROFILE_ALL_BRANCHES */ -- cgit v1.2.3 From a4773c08f2872626cb923433284488fbe8acb0ae Mon Sep 17 00:00:00 2001 From: "J. 
Bruce Fields" Date: Mon, 2 Feb 2009 17:23:10 -0500 Subject: nfsd4: use helper for copying filehandles for replay Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 17 ++++++----------- include/linux/nfsd/nfsfh.h | 7 +++++++ include/linux/nfsd/state.h | 3 +-- 3 files changed, 14 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 326506272258..af66073ed423 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -123,10 +123,8 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o fh_dup2(current_fh, &resfh); /* set reply cache */ - open->op_stateowner->so_replay.rp_openfh_len = resfh.fh_handle.fh_size; - memcpy(open->op_stateowner->so_replay.rp_openfh, - &resfh.fh_handle.fh_base, resfh.fh_handle.fh_size); - + fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh, + &resfh.fh_handle); if (!created) status = do_open_permission(rqstp, current_fh, open, NFSD_MAY_NOP); @@ -152,10 +150,8 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_ memset(&open->op_cinfo, 0, sizeof(struct nfsd4_change_info)); /* set replay cache */ - open->op_stateowner->so_replay.rp_openfh_len = current_fh->fh_handle.fh_size; - memcpy(open->op_stateowner->so_replay.rp_openfh, - ¤t_fh->fh_handle.fh_base, - current_fh->fh_handle.fh_size); + fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh, + ¤t_fh->fh_handle); open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) && (open->op_iattr.ia_size == 0); @@ -187,9 +183,8 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (status == nfserr_replay_me) { struct nfs4_replay *rp = &open->op_stateowner->so_replay; fh_put(&cstate->current_fh); - cstate->current_fh.fh_handle.fh_size = rp->rp_openfh_len; - memcpy(&cstate->current_fh.fh_handle.fh_base, rp->rp_openfh, - rp->rp_openfh_len); + fh_copy_shallow(&cstate->current_fh.fh_handle, + &rp->rp_openfh); status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP); if (status) dprintk("nfsd4_open: replay failed" diff --git a/include/linux/nfsd/nfsfh.h b/include/linux/nfsd/nfsfh.h index fa317f6c154b..afa19016c4a8 100644 --- a/include/linux/nfsd/nfsfh.h +++ b/include/linux/nfsd/nfsfh.h @@ -269,6 +269,13 @@ fh_copy(struct svc_fh *dst, struct svc_fh *src) return dst; } +static inline void +fh_copy_shallow(struct knfsd_fh *dst, struct knfsd_fh *src) +{ + dst->fh_size = src->fh_size; + memcpy(&dst->fh_base, &src->fh_base, src->fh_size); +} + static __inline__ struct svc_fh * fh_init(struct svc_fh *fhp, int maxsize) { diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h index 128298c0362d..b65b2a6274b0 100644 --- a/include/linux/nfsd/state.h +++ b/include/linux/nfsd/state.h @@ -168,8 +168,7 @@ struct nfs4_replay { unsigned int rp_buflen; char *rp_buf; unsigned intrp_allocated; - int rp_openfh_len; - char rp_openfh[NFS4_FHSIZE]; + struct knfsd_fh rp_openfh; char rp_ibuf[NFSD4_REPLAY_ISIZE]; }; -- cgit v1.2.3 From 6c02eaa1d1e53b9b2cc27d0c6fff3e57da4b611f Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 2 Feb 2009 17:30:51 -0500 Subject: nfsd4: use helper for copying delegation filehandle Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4callback.c | 4 ++-- fs/nfsd/nfs4state.c | 4 +--- include/linux/nfsd/state.h | 6 ++---- 3 files changed, 5 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index c464181b5994..c6804db33c17 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -218,7 +218,7 @@ static int encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec) { __be32 *p; - int len = cb_rec->cbr_fhlen; + int len = cb_rec->cbr_fh.fh_size; RESERVE_SPACE(12+sizeof(cb_rec->cbr_stateid) + len); WRITE32(OP_CB_RECALL); @@ -226,7 +226,7 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec) WRITEMEM(&cb_rec->cbr_stateid.si_opaque, sizeof(stateid_opaque_t)); WRITE32(cb_rec->cbr_trunc); WRITE32(len); - WRITEMEM(cb_rec->cbr_fhval, len); + WRITEMEM(&cb_rec->cbr_fh.fh_base, len); return 0; } diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 41a3590ef2cc..7f616e928a57 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -215,9 +215,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f dp->dl_stateid.si_stateownerid = current_delegid++; dp->dl_stateid.si_fileid = 0; dp->dl_stateid.si_generation = 0; - dp->dl_fhlen = current_fh->fh_handle.fh_size; - memcpy(dp->dl_fhval, ¤t_fh->fh_handle.fh_base, - current_fh->fh_handle.fh_size); + fh_copy_shallow(&dp->dl_fh, ¤t_fh->fh_handle); dp->dl_time = 0; atomic_set(&dp->dl_count, 1); list_add(&dp->dl_perfile, &fp->fi_delegations); diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h index b65b2a6274b0..1130d534bb63 100644 --- a/include/linux/nfsd/state.h +++ b/include/linux/nfsd/state.h @@ -66,8 +66,7 @@ struct nfs4_cb_recall { u32 cbr_ident; int cbr_trunc; stateid_t cbr_stateid; - u32 cbr_fhlen; - char cbr_fhval[NFS4_FHSIZE]; + struct knfsd_fh cbr_fh; struct nfs4_delegation *cbr_dp; }; @@ -86,8 +85,7 @@ struct nfs4_delegation { }; #define dl_stateid dl_recall.cbr_stateid -#define dl_fhlen dl_recall.cbr_fhlen -#define dl_fhval dl_recall.cbr_fhval +#define dl_fh dl_recall.cbr_fh /* client delegation callback info */ struct nfs4_callback { -- cgit v1.2.3 From 77f18f5e4ebdea35ec3d92343b0ed7546dc87637 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Wed, 11 Feb 2009 17:16:58 -0800 Subject: nfs: replace uses of __constant_{endian} The base versions handle constant folding now, none of these headers are exported to userspace, so the __ prefixed versions are not necessary. Signed-off-by: Harvey Harrison Reviewed-by: NeilBrown Signed-off-by: J. 
Bruce Fields --- include/linux/lockd/xdr.h | 12 ++--- include/linux/lockd/xdr4.h | 10 ++-- include/linux/nfsd/nfsd.h | 132 ++++++++++++++++++++++----------------------- include/linux/sunrpc/xdr.h | 42 +++++++-------- 4 files changed, 98 insertions(+), 98 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lockd/xdr.h b/include/linux/lockd/xdr.h index 7dc5b6cb44cd..d39ed1cc5fbf 100644 --- a/include/linux/lockd/xdr.h +++ b/include/linux/lockd/xdr.h @@ -25,13 +25,13 @@ struct svc_rqst; #define NLM_MAXCOOKIELEN 32 #define NLM_MAXSTRLEN 1024 -#define nlm_granted __constant_htonl(NLM_LCK_GRANTED) -#define nlm_lck_denied __constant_htonl(NLM_LCK_DENIED) -#define nlm_lck_denied_nolocks __constant_htonl(NLM_LCK_DENIED_NOLOCKS) -#define nlm_lck_blocked __constant_htonl(NLM_LCK_BLOCKED) -#define nlm_lck_denied_grace_period __constant_htonl(NLM_LCK_DENIED_GRACE_PERIOD) +#define nlm_granted cpu_to_be32(NLM_LCK_GRANTED) +#define nlm_lck_denied cpu_to_be32(NLM_LCK_DENIED) +#define nlm_lck_denied_nolocks cpu_to_be32(NLM_LCK_DENIED_NOLOCKS) +#define nlm_lck_blocked cpu_to_be32(NLM_LCK_BLOCKED) +#define nlm_lck_denied_grace_period cpu_to_be32(NLM_LCK_DENIED_GRACE_PERIOD) -#define nlm_drop_reply __constant_htonl(30000) +#define nlm_drop_reply cpu_to_be32(30000) /* Lock info passed via NLM */ struct nlm_lock { diff --git a/include/linux/lockd/xdr4.h b/include/linux/lockd/xdr4.h index 12bfe09de2b1..7353821341ed 100644 --- a/include/linux/lockd/xdr4.h +++ b/include/linux/lockd/xdr4.h @@ -15,11 +15,11 @@ #include /* error codes new to NLMv4 */ -#define nlm4_deadlock __constant_htonl(NLM_DEADLCK) -#define nlm4_rofs __constant_htonl(NLM_ROFS) -#define nlm4_stale_fh __constant_htonl(NLM_STALE_FH) -#define nlm4_fbig __constant_htonl(NLM_FBIG) -#define nlm4_failed __constant_htonl(NLM_FAILED) +#define nlm4_deadlock cpu_to_be32(NLM_DEADLCK) +#define nlm4_rofs cpu_to_be32(NLM_ROFS) +#define nlm4_stale_fh cpu_to_be32(NLM_STALE_FH) +#define nlm4_fbig cpu_to_be32(NLM_FBIG) +#define nlm4_failed cpu_to_be32(NLM_FAILED) diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h index e19f45991b2e..16f7b403d9c1 100644 --- a/include/linux/nfsd/nfsd.h +++ b/include/linux/nfsd/nfsd.h @@ -186,78 +186,78 @@ void nfsd_lockd_shutdown(void); /* * These macros provide pre-xdr'ed values for faster operation. 
*/ -#define nfs_ok __constant_htonl(NFS_OK) -#define nfserr_perm __constant_htonl(NFSERR_PERM) -#define nfserr_noent __constant_htonl(NFSERR_NOENT) -#define nfserr_io __constant_htonl(NFSERR_IO) -#define nfserr_nxio __constant_htonl(NFSERR_NXIO) -#define nfserr_eagain __constant_htonl(NFSERR_EAGAIN) -#define nfserr_acces __constant_htonl(NFSERR_ACCES) -#define nfserr_exist __constant_htonl(NFSERR_EXIST) -#define nfserr_xdev __constant_htonl(NFSERR_XDEV) -#define nfserr_nodev __constant_htonl(NFSERR_NODEV) -#define nfserr_notdir __constant_htonl(NFSERR_NOTDIR) -#define nfserr_isdir __constant_htonl(NFSERR_ISDIR) -#define nfserr_inval __constant_htonl(NFSERR_INVAL) -#define nfserr_fbig __constant_htonl(NFSERR_FBIG) -#define nfserr_nospc __constant_htonl(NFSERR_NOSPC) -#define nfserr_rofs __constant_htonl(NFSERR_ROFS) -#define nfserr_mlink __constant_htonl(NFSERR_MLINK) -#define nfserr_opnotsupp __constant_htonl(NFSERR_OPNOTSUPP) -#define nfserr_nametoolong __constant_htonl(NFSERR_NAMETOOLONG) -#define nfserr_notempty __constant_htonl(NFSERR_NOTEMPTY) -#define nfserr_dquot __constant_htonl(NFSERR_DQUOT) -#define nfserr_stale __constant_htonl(NFSERR_STALE) -#define nfserr_remote __constant_htonl(NFSERR_REMOTE) -#define nfserr_wflush __constant_htonl(NFSERR_WFLUSH) -#define nfserr_badhandle __constant_htonl(NFSERR_BADHANDLE) -#define nfserr_notsync __constant_htonl(NFSERR_NOT_SYNC) -#define nfserr_badcookie __constant_htonl(NFSERR_BAD_COOKIE) -#define nfserr_notsupp __constant_htonl(NFSERR_NOTSUPP) -#define nfserr_toosmall __constant_htonl(NFSERR_TOOSMALL) -#define nfserr_serverfault __constant_htonl(NFSERR_SERVERFAULT) -#define nfserr_badtype __constant_htonl(NFSERR_BADTYPE) -#define nfserr_jukebox __constant_htonl(NFSERR_JUKEBOX) -#define nfserr_denied __constant_htonl(NFSERR_DENIED) -#define nfserr_deadlock __constant_htonl(NFSERR_DEADLOCK) -#define nfserr_expired __constant_htonl(NFSERR_EXPIRED) -#define nfserr_bad_cookie __constant_htonl(NFSERR_BAD_COOKIE) -#define nfserr_same __constant_htonl(NFSERR_SAME) -#define nfserr_clid_inuse __constant_htonl(NFSERR_CLID_INUSE) -#define nfserr_stale_clientid __constant_htonl(NFSERR_STALE_CLIENTID) -#define nfserr_resource __constant_htonl(NFSERR_RESOURCE) -#define nfserr_moved __constant_htonl(NFSERR_MOVED) -#define nfserr_nofilehandle __constant_htonl(NFSERR_NOFILEHANDLE) -#define nfserr_minor_vers_mismatch __constant_htonl(NFSERR_MINOR_VERS_MISMATCH) -#define nfserr_share_denied __constant_htonl(NFSERR_SHARE_DENIED) -#define nfserr_stale_stateid __constant_htonl(NFSERR_STALE_STATEID) -#define nfserr_old_stateid __constant_htonl(NFSERR_OLD_STATEID) -#define nfserr_bad_stateid __constant_htonl(NFSERR_BAD_STATEID) -#define nfserr_bad_seqid __constant_htonl(NFSERR_BAD_SEQID) -#define nfserr_symlink __constant_htonl(NFSERR_SYMLINK) -#define nfserr_not_same __constant_htonl(NFSERR_NOT_SAME) -#define nfserr_restorefh __constant_htonl(NFSERR_RESTOREFH) -#define nfserr_attrnotsupp __constant_htonl(NFSERR_ATTRNOTSUPP) -#define nfserr_bad_xdr __constant_htonl(NFSERR_BAD_XDR) -#define nfserr_openmode __constant_htonl(NFSERR_OPENMODE) -#define nfserr_locks_held __constant_htonl(NFSERR_LOCKS_HELD) -#define nfserr_op_illegal __constant_htonl(NFSERR_OP_ILLEGAL) -#define nfserr_grace __constant_htonl(NFSERR_GRACE) -#define nfserr_no_grace __constant_htonl(NFSERR_NO_GRACE) -#define nfserr_reclaim_bad __constant_htonl(NFSERR_RECLAIM_BAD) -#define nfserr_badname __constant_htonl(NFSERR_BADNAME) -#define nfserr_cb_path_down __constant_htonl(NFSERR_CB_PATH_DOWN) 
-#define nfserr_locked __constant_htonl(NFSERR_LOCKED) -#define nfserr_wrongsec __constant_htonl(NFSERR_WRONGSEC) -#define nfserr_replay_me __constant_htonl(NFSERR_REPLAY_ME) +#define nfs_ok cpu_to_be32(NFS_OK) +#define nfserr_perm cpu_to_be32(NFSERR_PERM) +#define nfserr_noent cpu_to_be32(NFSERR_NOENT) +#define nfserr_io cpu_to_be32(NFSERR_IO) +#define nfserr_nxio cpu_to_be32(NFSERR_NXIO) +#define nfserr_eagain cpu_to_be32(NFSERR_EAGAIN) +#define nfserr_acces cpu_to_be32(NFSERR_ACCES) +#define nfserr_exist cpu_to_be32(NFSERR_EXIST) +#define nfserr_xdev cpu_to_be32(NFSERR_XDEV) +#define nfserr_nodev cpu_to_be32(NFSERR_NODEV) +#define nfserr_notdir cpu_to_be32(NFSERR_NOTDIR) +#define nfserr_isdir cpu_to_be32(NFSERR_ISDIR) +#define nfserr_inval cpu_to_be32(NFSERR_INVAL) +#define nfserr_fbig cpu_to_be32(NFSERR_FBIG) +#define nfserr_nospc cpu_to_be32(NFSERR_NOSPC) +#define nfserr_rofs cpu_to_be32(NFSERR_ROFS) +#define nfserr_mlink cpu_to_be32(NFSERR_MLINK) +#define nfserr_opnotsupp cpu_to_be32(NFSERR_OPNOTSUPP) +#define nfserr_nametoolong cpu_to_be32(NFSERR_NAMETOOLONG) +#define nfserr_notempty cpu_to_be32(NFSERR_NOTEMPTY) +#define nfserr_dquot cpu_to_be32(NFSERR_DQUOT) +#define nfserr_stale cpu_to_be32(NFSERR_STALE) +#define nfserr_remote cpu_to_be32(NFSERR_REMOTE) +#define nfserr_wflush cpu_to_be32(NFSERR_WFLUSH) +#define nfserr_badhandle cpu_to_be32(NFSERR_BADHANDLE) +#define nfserr_notsync cpu_to_be32(NFSERR_NOT_SYNC) +#define nfserr_badcookie cpu_to_be32(NFSERR_BAD_COOKIE) +#define nfserr_notsupp cpu_to_be32(NFSERR_NOTSUPP) +#define nfserr_toosmall cpu_to_be32(NFSERR_TOOSMALL) +#define nfserr_serverfault cpu_to_be32(NFSERR_SERVERFAULT) +#define nfserr_badtype cpu_to_be32(NFSERR_BADTYPE) +#define nfserr_jukebox cpu_to_be32(NFSERR_JUKEBOX) +#define nfserr_denied cpu_to_be32(NFSERR_DENIED) +#define nfserr_deadlock cpu_to_be32(NFSERR_DEADLOCK) +#define nfserr_expired cpu_to_be32(NFSERR_EXPIRED) +#define nfserr_bad_cookie cpu_to_be32(NFSERR_BAD_COOKIE) +#define nfserr_same cpu_to_be32(NFSERR_SAME) +#define nfserr_clid_inuse cpu_to_be32(NFSERR_CLID_INUSE) +#define nfserr_stale_clientid cpu_to_be32(NFSERR_STALE_CLIENTID) +#define nfserr_resource cpu_to_be32(NFSERR_RESOURCE) +#define nfserr_moved cpu_to_be32(NFSERR_MOVED) +#define nfserr_nofilehandle cpu_to_be32(NFSERR_NOFILEHANDLE) +#define nfserr_minor_vers_mismatch cpu_to_be32(NFSERR_MINOR_VERS_MISMATCH) +#define nfserr_share_denied cpu_to_be32(NFSERR_SHARE_DENIED) +#define nfserr_stale_stateid cpu_to_be32(NFSERR_STALE_STATEID) +#define nfserr_old_stateid cpu_to_be32(NFSERR_OLD_STATEID) +#define nfserr_bad_stateid cpu_to_be32(NFSERR_BAD_STATEID) +#define nfserr_bad_seqid cpu_to_be32(NFSERR_BAD_SEQID) +#define nfserr_symlink cpu_to_be32(NFSERR_SYMLINK) +#define nfserr_not_same cpu_to_be32(NFSERR_NOT_SAME) +#define nfserr_restorefh cpu_to_be32(NFSERR_RESTOREFH) +#define nfserr_attrnotsupp cpu_to_be32(NFSERR_ATTRNOTSUPP) +#define nfserr_bad_xdr cpu_to_be32(NFSERR_BAD_XDR) +#define nfserr_openmode cpu_to_be32(NFSERR_OPENMODE) +#define nfserr_locks_held cpu_to_be32(NFSERR_LOCKS_HELD) +#define nfserr_op_illegal cpu_to_be32(NFSERR_OP_ILLEGAL) +#define nfserr_grace cpu_to_be32(NFSERR_GRACE) +#define nfserr_no_grace cpu_to_be32(NFSERR_NO_GRACE) +#define nfserr_reclaim_bad cpu_to_be32(NFSERR_RECLAIM_BAD) +#define nfserr_badname cpu_to_be32(NFSERR_BADNAME) +#define nfserr_cb_path_down cpu_to_be32(NFSERR_CB_PATH_DOWN) +#define nfserr_locked cpu_to_be32(NFSERR_LOCKED) +#define nfserr_wrongsec cpu_to_be32(NFSERR_WRONGSEC) +#define nfserr_replay_me 
cpu_to_be32(NFSERR_REPLAY_ME) /* error codes for internal use */ /* if a request fails due to kmalloc failure, it gets dropped. * Client should resend eventually */ -#define nfserr_dropit __constant_htonl(30000) +#define nfserr_dropit cpu_to_be32(30000) /* end-of-file indicator in readdir */ -#define nfserr_eof __constant_htonl(30001) +#define nfserr_eof cpu_to_be32(30001) /* Check for dir entries '.' and '..' */ #define isdotent(n, l) (l < 3 && n[0] == '.' && (l == 1 || n[1] == '.')) diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 49e1eb454465..d8910b68e1bd 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -69,27 +69,27 @@ struct xdr_buf { * pre-xdr'ed macros. */ -#define xdr_zero __constant_htonl(0) -#define xdr_one __constant_htonl(1) -#define xdr_two __constant_htonl(2) - -#define rpc_success __constant_htonl(RPC_SUCCESS) -#define rpc_prog_unavail __constant_htonl(RPC_PROG_UNAVAIL) -#define rpc_prog_mismatch __constant_htonl(RPC_PROG_MISMATCH) -#define rpc_proc_unavail __constant_htonl(RPC_PROC_UNAVAIL) -#define rpc_garbage_args __constant_htonl(RPC_GARBAGE_ARGS) -#define rpc_system_err __constant_htonl(RPC_SYSTEM_ERR) -#define rpc_drop_reply __constant_htonl(RPC_DROP_REPLY) - -#define rpc_auth_ok __constant_htonl(RPC_AUTH_OK) -#define rpc_autherr_badcred __constant_htonl(RPC_AUTH_BADCRED) -#define rpc_autherr_rejectedcred __constant_htonl(RPC_AUTH_REJECTEDCRED) -#define rpc_autherr_badverf __constant_htonl(RPC_AUTH_BADVERF) -#define rpc_autherr_rejectedverf __constant_htonl(RPC_AUTH_REJECTEDVERF) -#define rpc_autherr_tooweak __constant_htonl(RPC_AUTH_TOOWEAK) -#define rpcsec_gsserr_credproblem __constant_htonl(RPCSEC_GSS_CREDPROBLEM) -#define rpcsec_gsserr_ctxproblem __constant_htonl(RPCSEC_GSS_CTXPROBLEM) -#define rpc_autherr_oldseqnum __constant_htonl(101) +#define xdr_zero cpu_to_be32(0) +#define xdr_one cpu_to_be32(1) +#define xdr_two cpu_to_be32(2) + +#define rpc_success cpu_to_be32(RPC_SUCCESS) +#define rpc_prog_unavail cpu_to_be32(RPC_PROG_UNAVAIL) +#define rpc_prog_mismatch cpu_to_be32(RPC_PROG_MISMATCH) +#define rpc_proc_unavail cpu_to_be32(RPC_PROC_UNAVAIL) +#define rpc_garbage_args cpu_to_be32(RPC_GARBAGE_ARGS) +#define rpc_system_err cpu_to_be32(RPC_SYSTEM_ERR) +#define rpc_drop_reply cpu_to_be32(RPC_DROP_REPLY) + +#define rpc_auth_ok cpu_to_be32(RPC_AUTH_OK) +#define rpc_autherr_badcred cpu_to_be32(RPC_AUTH_BADCRED) +#define rpc_autherr_rejectedcred cpu_to_be32(RPC_AUTH_REJECTEDCRED) +#define rpc_autherr_badverf cpu_to_be32(RPC_AUTH_BADVERF) +#define rpc_autherr_rejectedverf cpu_to_be32(RPC_AUTH_REJECTEDVERF) +#define rpc_autherr_tooweak cpu_to_be32(RPC_AUTH_TOOWEAK) +#define rpcsec_gsserr_credproblem cpu_to_be32(RPCSEC_GSS_CREDPROBLEM) +#define rpcsec_gsserr_ctxproblem cpu_to_be32(RPCSEC_GSS_CTXPROBLEM) +#define rpc_autherr_oldseqnum cpu_to_be32(101) /* * Miscellaneous XDR helper functions -- cgit v1.2.3 From 203a8c8e66278a5936a230edaac29017e50c88fb Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Sat, 21 Feb 2009 13:29:14 -0800 Subject: nfsd4: separate delegreturn case from preprocess_stateid_op Delegreturn is enough a special case for preprocess_stateid_op to warrant just open-coding it in delegreturn. There should be no change in behavior here; we're just reshuffling code. Thanks to Yang Hongyang for catching a critical typo. Reviewed-by: Yang Hongyang Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4state.c | 40 ++++++++++++++++++++++++++++------------ include/linux/nfsd/state.h | 1 - 2 files changed, 28 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index d5555850cb64..3570a0d1133f 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2000,10 +2000,7 @@ out: static inline __be32 check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags) { - /* Trying to call delegreturn with a special stateid? Yuch: */ - if (!(flags & (RD_STATE | WR_STATE))) - return nfserr_bad_stateid; - else if (ONE_STATEID(stateid) && (flags & RD_STATE)) + if (ONE_STATEID(stateid) && (flags & RD_STATE)) return nfs_ok; else if (locks_in_grace()) { /* Answer in remaining cases depends on existance of @@ -2024,8 +2021,7 @@ check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags) static inline int io_during_grace_disallowed(struct inode *inode, int flags) { - return locks_in_grace() && (flags & (RD_STATE | WR_STATE)) - && mandatory_lock(inode); + return locks_in_grace() && mandatory_lock(inode); } static int check_stateid_generation(stateid_t *in, stateid_t *ref) @@ -2089,8 +2085,6 @@ nfs4_preprocess_stateid_op(struct svc_fh *current_fh, stateid_t *stateid, int fl if (status) goto out; renew_client(dp->dl_client); - if (flags & DELEG_RET) - unhash_delegation(dp); if (filpp) *filpp = dp->dl_vfs_file; } else { /* open or lock stateid */ @@ -2408,16 +2402,38 @@ __be32 nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_delegreturn *dr) { + struct nfs4_delegation *dp; + stateid_t *stateid = &dr->dr_stateid; + struct inode *inode; __be32 status; if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) - goto out; + return status; + inode = cstate->current_fh.fh_dentry->d_inode; nfs4_lock_state(); - status = nfs4_preprocess_stateid_op(&cstate->current_fh, - &dr->dr_stateid, DELEG_RET, NULL); - nfs4_unlock_state(); + status = nfserr_bad_stateid; + if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) + goto out; + status = nfserr_stale_stateid; + if (STALE_STATEID(stateid)) + goto out; + status = nfs_ok; + if (!is_delegation_stateid(stateid)) + goto out; + status = nfserr_bad_stateid; + dp = find_delegation_stateid(inode, stateid); + if (!dp) + goto out; + status = check_stateid_generation(stateid, &dp->dl_stateid); + if (status) + goto out; + renew_client(dp->dl_client); + + unhash_delegation(dp); out: + nfs4_unlock_state(); + return status; } diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h index 1130d534bb63..c9311a1e2e1a 100644 --- a/include/linux/nfsd/state.h +++ b/include/linux/nfsd/state.h @@ -263,7 +263,6 @@ struct nfs4_stateid { #define RD_STATE 0x00000010 #define WR_STATE 0x00000020 #define CLOSE_STATE 0x00000040 -#define DELEG_RET 0x00000080 #define seqid_mutating_err(err) \ (((err) != nfserr_stale_clientid) && \ -- cgit v1.2.3 From 6150ef0dc7f734366d297e2eb5697ae458a1ea19 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Sat, 21 Feb 2009 13:36:16 -0800 Subject: nfsd4: remove unused CHECK_FH flag All users now pass this, so it's meaningless. Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4proc.c | 6 +++--- fs/nfsd/nfs4state.c | 2 +- include/linux/nfsd/state.h | 1 - 3 files changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index af66073ed423..77f584f69dfe 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -519,7 +519,7 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, /* check stateid */ if ((status = nfs4_preprocess_stateid_op(&cstate->current_fh, &read->rd_stateid, - CHECK_FH | RD_STATE, &read->rd_filp))) { + RD_STATE, &read->rd_filp))) { dprintk("NFSD: nfsd4_read: couldn't process stateid!\n"); goto out; } @@ -651,7 +651,7 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (setattr->sa_iattr.ia_valid & ATTR_SIZE) { nfs4_lock_state(); status = nfs4_preprocess_stateid_op(&cstate->current_fh, - &setattr->sa_stateid, CHECK_FH | WR_STATE, NULL); + &setattr->sa_stateid, WR_STATE, NULL); nfs4_unlock_state(); if (status) { dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n"); @@ -690,7 +690,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfs4_lock_state(); status = nfs4_preprocess_stateid_op(&cstate->current_fh, stateid, - CHECK_FH | WR_STATE, &filp); + WR_STATE, &filp); if (filp) get_file(filp); nfs4_unlock_state(); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 6ae28e606afc..5957f7766bdc 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2091,7 +2091,7 @@ nfs4_preprocess_stateid_op(struct svc_fh *current_fh, stateid_t *stateid, int fl stp = find_stateid(stateid, flags); if (!stp) goto out; - if ((flags & CHECK_FH) && nfs4_check_fh(current_fh, stp)) + if (nfs4_check_fh(current_fh, stp)) goto out; if (!stp->st_stateowner->so_confirmed) goto out; diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h index c9311a1e2e1a..503b6bb53a56 100644 --- a/include/linux/nfsd/state.h +++ b/include/linux/nfsd/state.h @@ -256,7 +256,6 @@ struct nfs4_stateid { }; /* flags for preprocess_seqid_op() */ -#define CHECK_FH 0x00000001 #define CONFIRM 0x00000002 #define OPEN_STATE 0x00000004 #define LOCK_STATE 0x00000008 -- cgit v1.2.3 From 8b671b80707e4fc76adfe4387df07b3be1007c1e Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Sun, 22 Feb 2009 14:51:34 -0800 Subject: nfsd4: remove use of mutex for file_hashtable As part of reducing the scope of the client_mutex, and in order to remove the need for mutexes from the callback code (so that callbacks can be done as asynchronous rpc calls), move manipulations of the file_hashtable under the recall_lock. Update the relevant comments while we're here. Signed-off-by: J. Bruce Fields Cc: Alexandros Batsakis Reviewed-by: Benny Halevy --- fs/nfsd/nfs4callback.c | 2 -- fs/nfsd/nfs4state.c | 47 +++++++++++++++++++++++----------------------- include/linux/nfsd/state.h | 2 +- 3 files changed, 24 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 3fd7136321ca..5dcd38e5f138 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -491,8 +491,6 @@ out_put_cred: * or deleg_return. 
*/ put_nfs4_client(clp); - nfs4_lock_state(); nfs4_put_delegation(dp); - nfs4_unlock_state(); return; } diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 89e575e7daea..54651aa45790 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -78,14 +78,18 @@ static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, state static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; static void nfs4_set_recdir(char *recdir); -/* Locking: - * - * client_mutex: - * protects clientid_hashtbl[], clientstr_hashtbl[], - * unconfstr_hashtbl[], uncofid_hashtbl[]. - */ +/* Locking: */ + +/* Currently used for almost all code touching nfsv4 state: */ static DEFINE_MUTEX(client_mutex); +/* + * Currently used for the del_recall_lru and file hash table. In an + * effort to decrease the scope of the client_mutex, this spinlock may + * eventually cover more: + */ +static DEFINE_SPINLOCK(recall_lock); + static struct kmem_cache *stateowner_slab = NULL; static struct kmem_cache *file_slab = NULL; static struct kmem_cache *stateid_slab = NULL; @@ -116,33 +120,23 @@ opaque_hashval(const void *ptr, int nbytes) return x; } -/* - * Delegation state - */ - -/* recall_lock protects the del_recall_lru */ -static DEFINE_SPINLOCK(recall_lock); static struct list_head del_recall_lru; -static void -free_nfs4_file(struct kref *kref) -{ - struct nfs4_file *fp = container_of(kref, struct nfs4_file, fi_ref); - list_del(&fp->fi_hash); - iput(fp->fi_inode); - kmem_cache_free(file_slab, fp); -} - static inline void put_nfs4_file(struct nfs4_file *fi) { - kref_put(&fi->fi_ref, free_nfs4_file); + if (atomic_dec_and_lock(&fi->fi_ref, &recall_lock)) { + list_del(&fi->fi_hash); + spin_unlock(&recall_lock); + iput(fi->fi_inode); + kmem_cache_free(file_slab, fi); + } } static inline void get_nfs4_file(struct nfs4_file *fi) { - kref_get(&fi->fi_ref); + atomic_inc(&fi->fi_ref); } static int num_delegations; @@ -1000,11 +994,13 @@ alloc_init_file(struct inode *ino) fp = kmem_cache_alloc(file_slab, GFP_KERNEL); if (fp) { - kref_init(&fp->fi_ref); + atomic_set(&fp->fi_ref, 1); INIT_LIST_HEAD(&fp->fi_hash); INIT_LIST_HEAD(&fp->fi_stateids); INIT_LIST_HEAD(&fp->fi_delegations); + spin_lock(&recall_lock); list_add(&fp->fi_hash, &file_hashtbl[hashval]); + spin_unlock(&recall_lock); fp->fi_inode = igrab(ino); fp->fi_id = current_fileid++; fp->fi_had_conflict = false; @@ -1177,12 +1173,15 @@ find_file(struct inode *ino) unsigned int hashval = file_hashval(ino); struct nfs4_file *fp; + spin_lock(&recall_lock); list_for_each_entry(fp, &file_hashtbl[hashval], fi_hash) { if (fp->fi_inode == ino) { get_nfs4_file(fp); + spin_unlock(&recall_lock); return fp; } } + spin_unlock(&recall_lock); return NULL; } diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h index 503b6bb53a56..a6e4a00fa392 100644 --- a/include/linux/nfsd/state.h +++ b/include/linux/nfsd/state.h @@ -214,7 +214,7 @@ struct nfs4_stateowner { * share_acces, share_deny on the file. 
*/ struct nfs4_file { - struct kref fi_ref; + atomic_t fi_ref; struct list_head fi_hash; /* hash by "struct inode *" */ struct list_head fi_stateids; struct list_head fi_delegations; -- cgit v1.2.3 From 31dec2538e45e9fff2007ea1f4c6bae9f78db724 Mon Sep 17 00:00:00 2001 From: David Shaw Date: Thu, 5 Mar 2009 20:16:14 -0500 Subject: Short write in nfsd becomes a full write to the client If a filesystem being written to via NFS returns a short write count (as opposed to an error) to nfsd, nfsd treats that as a success for the entire write, rather than the short count that actually succeeded. For example, given a 8192 byte write, if the underlying filesystem only writes 4096 bytes, nfsd will ack back to the nfs client that all 8192 bytes were written. The nfs client does have retry logic for short writes, but this is never called as the client is told the complete write succeeded. There are probably other ways it could happen, but in my case it happened with a fuse (filesystem in userspace) filesystem which can rather easily have a partial write. Here is a patch to properly return the short write count to the client. Signed-off-by: David Shaw Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs3proc.c | 5 +++-- fs/nfsd/nfs4proc.c | 7 +++++-- fs/nfsd/nfsproc.c | 3 ++- fs/nfsd/vfs.c | 13 +++++++------ include/linux/nfsd/nfsd.h | 2 +- 5 files changed, 18 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 579ce8c69daa..7c9fe838f038 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -203,6 +203,7 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp, struct nfsd3_writeres *resp) { __be32 nfserr; + unsigned long cnt = argp->len; dprintk("nfsd: WRITE(3) %s %d bytes at %ld%s\n", SVCFH_fmt(&argp->fh), @@ -215,9 +216,9 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp, nfserr = nfsd_write(rqstp, &resp->fh, NULL, argp->offset, rqstp->rq_vec, argp->vlen, - argp->len, + &cnt, &resp->committed); - resp->count = argp->count; + resp->count = cnt; RETURN_STATUS(nfserr); } diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 77f584f69dfe..283d77a47120 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -682,6 +682,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct file *filp = NULL; u32 *p; __be32 status = nfs_ok; + unsigned long cnt; /* no need to check permission - this will be done in nfsd_write() */ @@ -700,7 +701,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return status; } - write->wr_bytes_written = write->wr_buflen; + cnt = write->wr_buflen; write->wr_how_written = write->wr_stable_how; p = (u32 *)write->wr_verifier.data; *p++ = nfssvc_boot.tv_sec; @@ -708,10 +709,12 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfsd_write(rqstp, &cstate->current_fh, filp, write->wr_offset, rqstp->rq_vec, write->wr_vlen, - write->wr_buflen, &write->wr_how_written); + &cnt, &write->wr_how_written); if (filp) fput(filp); + write->wr_bytes_written = cnt; + if (status == nfserr_symlink) status = nfserr_inval; return status; diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 6f7f26351227..e298e260b5f1 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -180,6 +180,7 @@ nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp, { __be32 nfserr; int stable = 1; + unsigned long cnt = argp->len; dprintk("nfsd: WRITE %s %d bytes at %d\n", SVCFH_fmt(&argp->fh), @@ -188,7 +189,7 @@ 
nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp, nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), NULL, argp->offset, rqstp->rq_vec, argp->vlen, - argp->len, + &cnt, &stable); return nfsd_return_attrs(nfserr, resp); } diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 0c076293155d..54404d730809 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -960,7 +960,7 @@ static void kill_suid(struct dentry *dentry) static __be32 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, struct kvec *vec, int vlen, - unsigned long cnt, int *stablep) + unsigned long *cnt, int *stablep) { struct svc_export *exp; struct dentry *dentry; @@ -974,7 +974,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, err = nfserr_perm; if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) && - (!lock_may_write(file->f_path.dentry->d_inode, offset, cnt))) + (!lock_may_write(file->f_path.dentry->d_inode, offset, *cnt))) goto out; #endif @@ -1006,7 +1006,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset); set_fs(oldfs); if (host_err >= 0) { - nfsdstats.io_write += cnt; + nfsdstats.io_write += host_err; fsnotify_modify(file->f_path.dentry); } @@ -1051,9 +1051,10 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, } dprintk("nfsd: write complete host_err=%d\n", host_err); - if (host_err >= 0) + if (host_err >= 0) { err = 0; - else + *cnt = host_err; + } else err = nfserrno(host_err); out: return err; @@ -1095,7 +1096,7 @@ out: */ __be32 nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, - loff_t offset, struct kvec *vec, int vlen, unsigned long cnt, + loff_t offset, struct kvec *vec, int vlen, unsigned long *cnt, int *stablep) { __be32 err = 0; diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h index 16f7b403d9c1..54beda12d26b 100644 --- a/include/linux/nfsd/nfsd.h +++ b/include/linux/nfsd/nfsd.h @@ -105,7 +105,7 @@ void nfsd_close(struct file *); __be32 nfsd_read(struct svc_rqst *, struct svc_fh *, struct file *, loff_t, struct kvec *, int, unsigned long *); __be32 nfsd_write(struct svc_rqst *, struct svc_fh *,struct file *, - loff_t, struct kvec *,int, unsigned long, int *); + loff_t, struct kvec *,int, unsigned long *, int *); __be32 nfsd_readlink(struct svc_rqst *, struct svc_fh *, char *, int *); __be32 nfsd_symlink(struct svc_rqst *, struct svc_fh *, -- cgit v1.2.3 From 59a252ff8c0f2fa32c896f69d56ae33e641ce7ad Mon Sep 17 00:00:00 2001 From: Greg Banks Date: Tue, 13 Jan 2009 21:26:35 +1100 Subject: knfsd: avoid overloading the CPU scheduler with enormous load averages Avoid overloading the CPU scheduler with enormous load averages when handling high call-rate NFS loads. When the knfsd bottom half is made aware of an incoming call by the socket layer, it tries to choose an nfsd thread and wake it up. As long as there are idle threads, one will be woken up. If there are lot of nfsd threads (a sensible configuration when the server is disk-bound or is running an HSM), there will be many more nfsd threads than CPUs to run them. Under a high call-rate low service-time workload, the result is that almost every nfsd is runnable, but only a handful are actually able to run. This situation causes two significant problems: 1. The CPU scheduler takes over 10% of each CPU, which is robbing the nfsd threads of valuable CPU time. 2. 
At a high enough load, the nfsd threads starve userspace threads of CPU time, to the point where daemons like portmap and rpc.mountd do not schedule for tens of seconds at a time. Clients attempting to mount an NFS filesystem timeout at the very first step (opening a TCP connection to portmap) because portmap cannot wake up from select() and call accept() in time. Disclaimer: these effects were observed on a SLES9 kernel, modern kernels' schedulers may behave more gracefully. The solution is simple: keep in each svc_pool a counter of the number of threads which have been woken but have not yet run, and do not wake any more if that count reaches an arbitrary small threshold. Testing was on a 4 CPU 4 NIC Altix using 4 IRIX clients, each with 16 synthetic client threads simulating an rsync (i.e. recursive directory listing) workload reading from an i386 RH9 install image (161480 regular files in 10841 directories) on the server. That tree is small enough to fill in the server's RAM so no disk traffic was involved. This setup gives a sustained call rate in excess of 60000 calls/sec before being CPU-bound on the server. The server was running 128 nfsds. Profiling showed schedule() taking 6.7% of every CPU, and __wake_up() taking 5.2%. This patch drops those contributions to 3.0% and 2.2%. Load average was over 120 before the patch, and 20.9 after. This patch is a forward-ported version of knfsd-avoid-nfsd-overload which has been shipping in the SGI "Enhanced NFS" product since 2006. It has been posted before: http://article.gmane.org/gmane.linux.nfs/10374 Signed-off-by: Greg Banks Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc.h | 2 ++ net/sunrpc/svc_xprt.c | 25 ++++++++++++++++++------- 2 files changed, 20 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 3435d24bfe55..39ec186a492d 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -41,6 +41,7 @@ struct svc_pool { struct list_head sp_sockets; /* pending sockets */ unsigned int sp_nrthreads; /* # of threads in pool */ struct list_head sp_all_threads; /* all server threads */ + int sp_nwaking; /* number of threads woken but not yet active */ } ____cacheline_aligned_in_smp; /* @@ -264,6 +265,7 @@ struct svc_rqst { * cache pages */ wait_queue_head_t rq_wait; /* synchronization */ struct task_struct *rq_task; /* service thread */ + int rq_waking; /* 1 if thread is being woken */ }; /* diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index e588df5d6b34..0551b6b6cf8c 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -14,6 +14,8 @@ #define RPCDBG_FACILITY RPCDBG_SVCXPRT +#define SVC_MAX_WAKING 5 + static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt); static int svc_deferred_recv(struct svc_rqst *rqstp); static struct cache_deferred_req *svc_defer(struct cache_req *req); @@ -298,6 +300,7 @@ void svc_xprt_enqueue(struct svc_xprt *xprt) struct svc_pool *pool; struct svc_rqst *rqstp; int cpu; + int thread_avail; if (!(xprt->xpt_flags & ((1<sp_lock); - if (!list_empty(&pool->sp_threads) && - !list_empty(&pool->sp_sockets)) - printk(KERN_ERR - "svc_xprt_enqueue: " - "threads and transports both waiting??\n"); - if (test_bit(XPT_DEAD, &xprt->xpt_flags)) { /* Don't enqueue dead transports */ dprintk("svc: transport %p is dead, not enqueued\n", xprt); @@ -353,7 +350,14 @@ void svc_xprt_enqueue(struct svc_xprt *xprt) } process: - if (!list_empty(&pool->sp_threads)) { + /* Work out whether 
threads are available */ + thread_avail = !list_empty(&pool->sp_threads); /* threads are asleep */ + if (pool->sp_nwaking >= SVC_MAX_WAKING) { + /* too many threads are runnable and trying to wake up */ + thread_avail = 0; + } + + if (thread_avail) { rqstp = list_entry(pool->sp_threads.next, struct svc_rqst, rq_list); @@ -368,6 +372,8 @@ void svc_xprt_enqueue(struct svc_xprt *xprt) svc_xprt_get(xprt); rqstp->rq_reserved = serv->sv_max_mesg; atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved); + rqstp->rq_waking = 1; + pool->sp_nwaking++; BUG_ON(xprt->xpt_pool != pool); wake_up(&rqstp->rq_wait); } else { @@ -633,6 +639,11 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) return -EINTR; spin_lock_bh(&pool->sp_lock); + if (rqstp->rq_waking) { + rqstp->rq_waking = 0; + pool->sp_nwaking--; + BUG_ON(pool->sp_nwaking < 0); + } xprt = svc_xprt_dequeue(pool); if (xprt) { rqstp->rq_xprt = xprt; -- cgit v1.2.3 From 03cf6c9f49a8fea953d38648d016e3f46e814991 Mon Sep 17 00:00:00 2001 From: Greg Banks Date: Tue, 13 Jan 2009 21:26:36 +1100 Subject: knfsd: add file to export stats about nfsd pools Add /proc/fs/nfsd/pool_stats to export to userspace various statistics about the operation of rpc server thread pools. This patch is based on a forward-ported version of knfsd-add-pool-thread-stats which has been shipping in the SGI "Enhanced NFS" product since 2006 and which was previously posted: http://article.gmane.org/gmane.linux.nfs/10375 It has also been updated thus: * moved EXPORT_SYMBOL() to near the function it exports * made the new struct struct seq_operations const * used SEQ_START_TOKEN instead of ((void *)1) * merged fix from SGI PV 990526 "sunrpc: use dprintk instead of printk in svc_pool_stats_*()" by Harshula Jayasuriya. * merged fix from SGI PV 964001 "Crash reading pool_stats before nfsds are started". Signed-off-by: Greg Banks Signed-off-by: Harshula Jayasuriya Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfsctl.c | 12 ++++++ fs/nfsd/nfssvc.c | 7 ++++ include/linux/sunrpc/svc.h | 11 +++++ net/sunrpc/svc_xprt.c | 100 ++++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 129 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 3d93b2064ce5..4adebb6312c4 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -60,6 +60,7 @@ enum { NFSD_FO_UnlockFS, NFSD_Threads, NFSD_Pool_Threads, + NFSD_Pool_Stats, NFSD_Versions, NFSD_Ports, NFSD_MaxBlkSize, @@ -172,6 +173,16 @@ static const struct file_operations exports_operations = { .owner = THIS_MODULE, }; +extern int nfsd_pool_stats_open(struct inode *inode, struct file *file); + +static struct file_operations pool_stats_operations = { + .open = nfsd_pool_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .owner = THIS_MODULE, +}; + /*----------------------------------------------------------------------------*/ /* * payload - write methods @@ -1246,6 +1257,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent) [NFSD_Fh] = {"filehandle", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Pool_Threads] = {"pool_threads", &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_Pool_Stats] = {"pool_stats", &pool_stats_operations, S_IRUGO}, [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO}, [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO}, diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index c3eb0759fd57..ef0a3686639d 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -546,3 +546,10 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) nfsd_cache_update(rqstp, proc->pc_cachetype, statp + 1); return 1; } + +int nfsd_pool_stats_open(struct inode *inode, struct file *file) +{ + if (nfsd_serv == NULL) + return -ENODEV; + return svc_pool_stats_open(nfsd_serv, file); +} diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 39ec186a492d..9f9f699dd469 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -24,6 +24,15 @@ */ typedef int (*svc_thread_fn)(void *); +/* statistics for svc_pool structures */ +struct svc_pool_stats { + unsigned long packets; + unsigned long sockets_queued; + unsigned long threads_woken; + unsigned long overloads_avoided; + unsigned long threads_timedout; +}; + /* * * RPC service thread pool. 
@@ -42,6 +51,7 @@ struct svc_pool { unsigned int sp_nrthreads; /* # of threads in pool */ struct list_head sp_all_threads; /* all server threads */ int sp_nwaking; /* number of threads woken but not yet active */ + struct svc_pool_stats sp_stats; /* statistics on pool operation */ } ____cacheline_aligned_in_smp; /* @@ -396,6 +406,7 @@ struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int, sa_family_t, void (*shutdown)(struct svc_serv *), svc_thread_fn, struct module *); int svc_set_num_threads(struct svc_serv *, struct svc_pool *, int); +int svc_pool_stats_open(struct svc_serv *serv, struct file *file); void svc_destroy(struct svc_serv *); int svc_process(struct svc_rqst *); int svc_register(const struct svc_serv *, const unsigned short, diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 0551b6b6cf8c..1e66f2491460 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -318,6 +318,8 @@ void svc_xprt_enqueue(struct svc_xprt *xprt) goto out_unlock; } + pool->sp_stats.packets++; + /* Mark transport as busy. It will remain in this state until * the provider calls svc_xprt_received. We update XPT_BUSY * atomically because it also guards against trying to enqueue @@ -355,6 +357,7 @@ void svc_xprt_enqueue(struct svc_xprt *xprt) if (pool->sp_nwaking >= SVC_MAX_WAKING) { /* too many threads are runnable and trying to wake up */ thread_avail = 0; + pool->sp_stats.overloads_avoided++; } if (thread_avail) { @@ -374,11 +377,13 @@ void svc_xprt_enqueue(struct svc_xprt *xprt) atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved); rqstp->rq_waking = 1; pool->sp_nwaking++; + pool->sp_stats.threads_woken++; BUG_ON(xprt->xpt_pool != pool); wake_up(&rqstp->rq_wait); } else { dprintk("svc: transport %p put into queue\n", xprt); list_add_tail(&xprt->xpt_ready, &pool->sp_sockets); + pool->sp_stats.sockets_queued++; BUG_ON(xprt->xpt_pool != pool); } @@ -591,6 +596,7 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) int pages; struct xdr_buf *arg; DECLARE_WAITQUEUE(wait, current); + long time_left; dprintk("svc: server %p waiting for data (to = %ld)\n", rqstp, timeout); @@ -676,12 +682,14 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) add_wait_queue(&rqstp->rq_wait, &wait); spin_unlock_bh(&pool->sp_lock); - schedule_timeout(timeout); + time_left = schedule_timeout(timeout); try_to_freeze(); spin_lock_bh(&pool->sp_lock); remove_wait_queue(&rqstp->rq_wait, &wait); + if (!time_left) + pool->sp_stats.threads_timedout++; xprt = rqstp->rq_xprt; if (!xprt) { @@ -1114,3 +1122,93 @@ int svc_xprt_names(struct svc_serv *serv, char *buf, int buflen) return totlen; } EXPORT_SYMBOL_GPL(svc_xprt_names); + + +/*----------------------------------------------------------------------------*/ + +static void *svc_pool_stats_start(struct seq_file *m, loff_t *pos) +{ + unsigned int pidx = (unsigned int)*pos; + struct svc_serv *serv = m->private; + + dprintk("svc_pool_stats_start, *pidx=%u\n", pidx); + + lock_kernel(); + /* bump up the pseudo refcount while traversing */ + svc_get(serv); + unlock_kernel(); + + if (!pidx) + return SEQ_START_TOKEN; + return (pidx > serv->sv_nrpools ? 
NULL : &serv->sv_pools[pidx-1]); +} + +static void *svc_pool_stats_next(struct seq_file *m, void *p, loff_t *pos) +{ + struct svc_pool *pool = p; + struct svc_serv *serv = m->private; + + dprintk("svc_pool_stats_next, *pos=%llu\n", *pos); + + if (p == SEQ_START_TOKEN) { + pool = &serv->sv_pools[0]; + } else { + unsigned int pidx = (pool - &serv->sv_pools[0]); + if (pidx < serv->sv_nrpools-1) + pool = &serv->sv_pools[pidx+1]; + else + pool = NULL; + } + ++*pos; + return pool; +} + +static void svc_pool_stats_stop(struct seq_file *m, void *p) +{ + struct svc_serv *serv = m->private; + + lock_kernel(); + /* this function really, really should have been called svc_put() */ + svc_destroy(serv); + unlock_kernel(); +} + +static int svc_pool_stats_show(struct seq_file *m, void *p) +{ + struct svc_pool *pool = p; + + if (p == SEQ_START_TOKEN) { + seq_puts(m, "# pool packets-arrived sockets-enqueued threads-woken overloads-avoided threads-timedout\n"); + return 0; + } + + seq_printf(m, "%u %lu %lu %lu %lu %lu\n", + pool->sp_id, + pool->sp_stats.packets, + pool->sp_stats.sockets_queued, + pool->sp_stats.threads_woken, + pool->sp_stats.overloads_avoided, + pool->sp_stats.threads_timedout); + + return 0; +} + +static const struct seq_operations svc_pool_stats_seq_ops = { + .start = svc_pool_stats_start, + .next = svc_pool_stats_next, + .stop = svc_pool_stats_stop, + .show = svc_pool_stats_show, +}; + +int svc_pool_stats_open(struct svc_serv *serv, struct file *file) +{ + int err; + + err = seq_open(file, &svc_pool_stats_seq_ops); + if (!err) + ((struct seq_file *) file->private_data)->private = serv; + return err; +} +EXPORT_SYMBOL(svc_pool_stats_open); + +/*----------------------------------------------------------------------------*/ -- cgit v1.2.3 From 2795e53b4ed5d1f49d2283f416c922f55ec7d461 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 12 Mar 2009 12:07:14 -0400 Subject: SUNRPC: Clean up static inline functions in svc_xprt.h Clean up: Enable the use of const arguments in higher level svc_ APIs by adding const to the arguments of the helper functions in svc_xprt.h Signed-off-by: Chuck Lever Signed-off-by: J. 
Bruce Fields --- include/linux/sunrpc/svc_xprt.h | 46 +++++++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 0127daca4354..959b931b6053 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -88,29 +88,32 @@ static inline void svc_xprt_get(struct svc_xprt *xprt) kref_get(&xprt->xpt_ref); } static inline void svc_xprt_set_local(struct svc_xprt *xprt, - struct sockaddr *sa, int salen) + const struct sockaddr *sa, + const size_t salen) { memcpy(&xprt->xpt_local, sa, salen); xprt->xpt_locallen = salen; } static inline void svc_xprt_set_remote(struct svc_xprt *xprt, - struct sockaddr *sa, int salen) + const struct sockaddr *sa, + const size_t salen) { memcpy(&xprt->xpt_remote, sa, salen); xprt->xpt_remotelen = salen; } -static inline unsigned short svc_addr_port(struct sockaddr *sa) +static inline unsigned short svc_addr_port(const struct sockaddr *sa) { - unsigned short ret = 0; + const struct sockaddr_in *sin = (const struct sockaddr_in *)sa; + const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)sa; + switch (sa->sa_family) { case AF_INET: - ret = ntohs(((struct sockaddr_in *)sa)->sin_port); - break; + return ntohs(sin->sin_port); case AF_INET6: - ret = ntohs(((struct sockaddr_in6 *)sa)->sin6_port); - break; + return ntohs(sin6->sin6_port); } - return ret; + + return 0; } static inline size_t svc_addr_len(struct sockaddr *sa) @@ -124,36 +127,39 @@ static inline size_t svc_addr_len(struct sockaddr *sa) return -EAFNOSUPPORT; } -static inline unsigned short svc_xprt_local_port(struct svc_xprt *xprt) +static inline unsigned short svc_xprt_local_port(const struct svc_xprt *xprt) { - return svc_addr_port((struct sockaddr *)&xprt->xpt_local); + return svc_addr_port((const struct sockaddr *)&xprt->xpt_local); } -static inline unsigned short svc_xprt_remote_port(struct svc_xprt *xprt) +static inline unsigned short svc_xprt_remote_port(const struct svc_xprt *xprt) { - return svc_addr_port((struct sockaddr *)&xprt->xpt_remote); + return svc_addr_port((const struct sockaddr *)&xprt->xpt_remote); } -static inline char *__svc_print_addr(struct sockaddr *addr, - char *buf, size_t len) +static inline char *__svc_print_addr(const struct sockaddr *addr, + char *buf, const size_t len) { + const struct sockaddr_in *sin = (const struct sockaddr_in *)addr; + const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)addr; + switch (addr->sa_family) { case AF_INET: - snprintf(buf, len, "%pI4, port=%u", - &((struct sockaddr_in *)addr)->sin_addr, - ntohs(((struct sockaddr_in *) addr)->sin_port)); + snprintf(buf, len, "%pI4, port=%u", &sin->sin_addr, + ntohs(sin->sin_port)); break; case AF_INET6: snprintf(buf, len, "%pI6, port=%u", - &((struct sockaddr_in6 *)addr)->sin6_addr, - ntohs(((struct sockaddr_in6 *) addr)->sin6_port)); + &sin6->sin6_addr, + ntohs(sin6->sin6_port)); break; default: snprintf(buf, len, "unknown address type: %d", addr->sa_family); break; } + return buf; } #endif /* SUNRPC_SVC_XPRT_H */ -- cgit v1.2.3 From 7d1e8255cf959fba7ee2317550dfde39f0b936ae Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Mar 2009 14:38:03 -0400 Subject: SUNRPC: Add the equivalent of the linger and linger2 timeouts to RPC sockets This fixes a regression against FreeBSD servers as reported by Tomas Kasparek. 
Apparently when using RPC over a TCP socket, the FreeBSD servers don't ever react to the client closing the socket, and so commit e06799f958bf7f9f8fae15f0c6f519953fb0257c (SUNRPC: Use shutdown() instead of close() when disconnecting a TCP socket) causes the setup to hang forever whenever the client attempts to close and then reconnect. We break the deadlock by adding a 'linger2' style timeout to the socket, after which, the client will abort the connection using a TCP 'RST'. The default timeout is set to 15 seconds. A subsequent patch will put it under user control by means of a systctl. Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 1 + net/sunrpc/xprtsock.c | 98 +++++++++++++++++++++++++++++++++++++-------- 2 files changed, 82 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 2b0d960603b9..1758d9f5b5c3 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -260,6 +260,7 @@ void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie); #define XPRT_BOUND (4) #define XPRT_BINDING (5) #define XPRT_CLOSING (6) +#define XPRT_CONNECTION_ABORT (7) static inline void xprt_set_connected(struct rpc_xprt *xprt) { diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 2e070679ab4a..b51f58b95c39 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -49,6 +49,8 @@ unsigned int xprt_tcp_slot_table_entries = RPC_DEF_SLOT_TABLE; unsigned int xprt_min_resvport = RPC_DEF_MIN_RESVPORT; unsigned int xprt_max_resvport = RPC_DEF_MAX_RESVPORT; +#define XS_TCP_LINGER_TO (15U * HZ) + /* * We can register our own files under /proc/sys/sunrpc by * calling register_sysctl_table() again. The files in that @@ -806,6 +808,7 @@ static void xs_close(struct rpc_xprt *xprt) xs_reset_transport(transport); smp_mb__before_clear_bit(); + clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); clear_bit(XPRT_CLOSE_WAIT, &xprt->state); clear_bit(XPRT_CLOSING, &xprt->state); smp_mb__after_clear_bit(); @@ -1133,6 +1136,47 @@ out: read_unlock(&sk->sk_callback_lock); } +/* + * Do the equivalent of linger/linger2 handling for dealing with + * broken servers that don't close the socket in a timely + * fashion + */ +static void xs_tcp_schedule_linger_timeout(struct rpc_xprt *xprt, + unsigned long timeout) +{ + struct sock_xprt *transport; + + if (xprt_test_and_set_connecting(xprt)) + return; + set_bit(XPRT_CONNECTION_ABORT, &xprt->state); + transport = container_of(xprt, struct sock_xprt, xprt); + queue_delayed_work(rpciod_workqueue, &transport->connect_worker, + timeout); +} + +static void xs_tcp_cancel_linger_timeout(struct rpc_xprt *xprt) +{ + struct sock_xprt *transport; + + transport = container_of(xprt, struct sock_xprt, xprt); + + if (!test_bit(XPRT_CONNECTION_ABORT, &xprt->state) || + !cancel_delayed_work(&transport->connect_worker)) + return; + clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); + xprt_clear_connecting(xprt); +} + +static void xs_sock_mark_closed(struct rpc_xprt *xprt) +{ + smp_mb__before_clear_bit(); + clear_bit(XPRT_CLOSE_WAIT, &xprt->state); + clear_bit(XPRT_CLOSING, &xprt->state); + smp_mb__after_clear_bit(); + /* Mark transport as closed and wake up all pending tasks */ + xprt_disconnect_done(xprt); +} + /** * xs_tcp_state_change - callback to handle TCP socket state changes * @sk: socket whose state has changed @@ -1178,6 +1222,7 @@ static void xs_tcp_state_change(struct sock *sk) clear_bit(XPRT_CONNECTED, &xprt->state); clear_bit(XPRT_CLOSE_WAIT, 
&xprt->state); smp_mb__after_clear_bit(); + xs_tcp_schedule_linger_timeout(xprt, XS_TCP_LINGER_TO); break; case TCP_CLOSE_WAIT: /* The server initiated a shutdown of the socket */ @@ -1194,17 +1239,14 @@ static void xs_tcp_state_change(struct sock *sk) break; case TCP_LAST_ACK: set_bit(XPRT_CLOSING, &xprt->state); + xs_tcp_schedule_linger_timeout(xprt, XS_TCP_LINGER_TO); smp_mb__before_clear_bit(); clear_bit(XPRT_CONNECTED, &xprt->state); smp_mb__after_clear_bit(); break; case TCP_CLOSE: - smp_mb__before_clear_bit(); - clear_bit(XPRT_CLOSE_WAIT, &xprt->state); - clear_bit(XPRT_CLOSING, &xprt->state); - smp_mb__after_clear_bit(); - /* Mark transport as closed and wake up all pending tasks */ - xprt_disconnect_done(xprt); + xs_tcp_cancel_linger_timeout(xprt); + xs_sock_mark_closed(xprt); } out: read_unlock(&sk->sk_callback_lock); @@ -1562,8 +1604,8 @@ static void xs_udp_connect_worker4(struct work_struct *work) xs_udp_finish_connecting(xprt, sock); status = 0; out: - xprt_wake_pending_tasks(xprt, status); xprt_clear_connecting(xprt); + xprt_wake_pending_tasks(xprt, status); } /** @@ -1604,8 +1646,8 @@ static void xs_udp_connect_worker6(struct work_struct *work) xs_udp_finish_connecting(xprt, sock); status = 0; out: - xprt_wake_pending_tasks(xprt, status); xprt_clear_connecting(xprt); + xprt_wake_pending_tasks(xprt, status); } /* @@ -1626,7 +1668,9 @@ static void xs_abort_connection(struct rpc_xprt *xprt, struct sock_xprt *transpo memset(&any, 0, sizeof(any)); any.sa_family = AF_UNSPEC; result = kernel_connect(transport->sock, &any, sizeof(any), 0); - if (result) + if (!result) + xs_sock_mark_closed(xprt); + else dprintk("RPC: AF_UNSPEC connect return code %d\n", result); } @@ -1702,6 +1746,7 @@ static void xs_tcp_connect_worker4(struct work_struct *work) goto out; if (!sock) { + clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); /* start from scratch */ if ((err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) { dprintk("RPC: can't create TCP transport socket (%d).\n", -err); @@ -1713,10 +1758,18 @@ static void xs_tcp_connect_worker4(struct work_struct *work) sock_release(sock); goto out; } - } else + } else { + int abort_and_exit; + + abort_and_exit = test_and_clear_bit(XPRT_CONNECTION_ABORT, + &xprt->state); /* "close" the socket, preserving the local port */ xs_tcp_reuse_connection(xprt, transport); + if (abort_and_exit) + goto out_eagain; + } + dprintk("RPC: worker connecting xprt %p to address: %s\n", xprt, xprt->address_strings[RPC_DISPLAY_ALL]); @@ -1732,17 +1785,18 @@ static void xs_tcp_connect_worker4(struct work_struct *work) case 0: case -EINPROGRESS: case -EALREADY: - goto out_clear; + xprt_clear_connecting(xprt); + return; } /* get rid of existing socket, and retry */ xs_tcp_shutdown(xprt); printk("%s: connect returned unhandled error %d\n", __func__, status); +out_eagain: status = -EAGAIN; out: - xprt_wake_pending_tasks(xprt, status); -out_clear: xprt_clear_connecting(xprt); + xprt_wake_pending_tasks(xprt, status); } /** @@ -1763,6 +1817,7 @@ static void xs_tcp_connect_worker6(struct work_struct *work) goto out; if (!sock) { + clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); /* start from scratch */ if ((err = sock_create_kern(PF_INET6, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) { dprintk("RPC: can't create TCP transport socket (%d).\n", -err); @@ -1774,10 +1829,18 @@ static void xs_tcp_connect_worker6(struct work_struct *work) sock_release(sock); goto out; } - } else + } else { + int abort_and_exit; + + abort_and_exit = test_and_clear_bit(XPRT_CONNECTION_ABORT, + 
&xprt->state); /* "close" the socket, preserving the local port */ xs_tcp_reuse_connection(xprt, transport); + if (abort_and_exit) + goto out_eagain; + } + dprintk("RPC: worker connecting xprt %p to address: %s\n", xprt, xprt->address_strings[RPC_DISPLAY_ALL]); @@ -1792,17 +1855,18 @@ static void xs_tcp_connect_worker6(struct work_struct *work) case 0: case -EINPROGRESS: case -EALREADY: - goto out_clear; + xprt_clear_connecting(xprt); + return; } /* get rid of existing socket, and retry */ xs_tcp_shutdown(xprt); printk("%s: connect returned unhandled error %d\n", __func__, status); +out_eagain: status = -EAGAIN; out: - xprt_wake_pending_tasks(xprt, status); -out_clear: xprt_clear_connecting(xprt); + xprt_wake_pending_tasks(xprt, status); } /** -- cgit v1.2.3 From 7fe5c398fc2186ed586db11106a6692d871d0d58 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 19 Mar 2009 15:35:50 -0400 Subject: NFS: Optimise NFS close() Close-to-open cache consistency rules really only require us to flush out writes on calls to close(), and require us to revalidate attributes on the very last close of the file. Currently we appear to be doing a lot of extra attribute revalidation and cache flushes. Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 11 ++--------- fs/nfs/inode.c | 41 +++++++++++++++++++++++++++++------------ fs/nfs/internal.h | 3 +++ fs/nfs/nfs3proc.c | 1 + fs/nfs/nfs4proc.c | 10 ++++++++++ fs/nfs/proc.c | 1 + include/linux/nfs_xdr.h | 1 + 7 files changed, 47 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 1eab9c9ad242..d451073c4947 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -137,9 +137,6 @@ nfs_file_release(struct inode *inode, struct file *filp) dentry->d_parent->d_name.name, dentry->d_name.name); - /* Ensure that dirty pages are flushed out with the right creds */ - if (filp->f_mode & FMODE_WRITE) - nfs_wb_all(dentry->d_inode); nfs_inc_stats(inode, NFSIOS_VFSRELEASE); return nfs_release(inode, filp); } @@ -231,7 +228,6 @@ nfs_file_flush(struct file *file, fl_owner_t id) struct nfs_open_context *ctx = nfs_file_open_context(file); struct dentry *dentry = file->f_path.dentry; struct inode *inode = dentry->d_inode; - int status; dprintk("NFS: flush(%s/%s)\n", dentry->d_parent->d_name.name, @@ -241,11 +237,8 @@ nfs_file_flush(struct file *file, fl_owner_t id) return 0; nfs_inc_stats(inode, NFSIOS_VFSFLUSH); - /* Ensure that data+attribute caches are up to date after close() */ - status = nfs_do_fsync(ctx, inode); - if (!status) - nfs_revalidate_inode(NFS_SERVER(inode), inode); - return status; + /* Flush writes to the server and return any errors */ + return nfs_do_fsync(ctx, inode); } static ssize_t diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index c40adc5dd609..a834d1d850b7 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -541,6 +541,32 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) return err; } +/** + * nfs_close_context - Common close_context() routine NFSv2/v3 + * @ctx: pointer to context + * @is_sync: is this a synchronous close + * + * always ensure that the attributes are up to date if we're mounted + * with close-to-open semantics + */ +void nfs_close_context(struct nfs_open_context *ctx, int is_sync) +{ + struct inode *inode; + struct nfs_server *server; + + if (!(ctx->mode & FMODE_WRITE)) + return; + if (!is_sync) + return; + inode = ctx->path.dentry->d_inode; + if (!list_empty(&NFS_I(inode)->open_files)) + return; + server = NFS_SERVER(inode); + if (server->flags & 
NFS_MOUNT_NOCTO) + return; + nfs_revalidate_inode(server, inode); +} + static struct nfs_open_context *alloc_nfs_open_context(struct vfsmount *mnt, struct dentry *dentry, struct rpc_cred *cred) { struct nfs_open_context *ctx; @@ -567,24 +593,15 @@ struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx) return ctx; } -static void __put_nfs_open_context(struct nfs_open_context *ctx, int wait) +static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) { - struct inode *inode; + struct inode *inode = ctx->path.dentry->d_inode; - if (ctx == NULL) - return; - - inode = ctx->path.dentry->d_inode; if (!atomic_dec_and_lock(&ctx->count, &inode->i_lock)) return; list_del(&ctx->list); spin_unlock(&inode->i_lock); - if (ctx->state != NULL) { - if (wait) - nfs4_close_sync(&ctx->path, ctx->state, ctx->mode); - else - nfs4_close_state(&ctx->path, ctx->state, ctx->mode); - } + NFS_PROTO(inode)->close_context(ctx, is_sync); if (ctx->cred != NULL) put_rpccred(ctx->cred); path_put(&ctx->path); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index a55e69aa52e5..2041f68ff1cc 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -152,6 +152,9 @@ extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus); extern struct rpc_procinfo nfs4_procedures[]; #endif +/* proc.c */ +void nfs_close_context(struct nfs_open_context *ctx, int is_sync); + /* dir.c */ extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask); diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index c55be7a7679e..b82fe6847f14 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -834,4 +834,5 @@ const struct nfs_rpc_ops nfs_v3_clientops = { .commit_done = nfs3_commit_done, .lock = nfs3_proc_lock, .clear_acl_cache = nfs3_forget_cached_acls, + .close_context = nfs_close_context, }; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 95f171e7e05a..97bacccff579 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1572,6 +1572,15 @@ out_drop: return 0; } +void nfs4_close_context(struct nfs_open_context *ctx, int is_sync) +{ + if (ctx->state == NULL) + return; + if (is_sync) + nfs4_close_sync(&ctx->path, ctx->state, ctx->mode); + else + nfs4_close_state(&ctx->path, ctx->state, ctx->mode); +} static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) { @@ -3776,6 +3785,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .commit_done = nfs4_commit_done, .lock = nfs4_proc_lock, .clear_acl_cache = nfs4_zap_acl_attr, + .close_context = nfs4_close_context, }; /* diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 193465210d7c..7be72d90d49d 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -663,4 +663,5 @@ const struct nfs_rpc_ops nfs_v2_clientops = { .commit_setup = nfs_proc_commit_setup, .lock = nfs_proc_lock, .lock_check_bounds = nfs_lock_check_bounds, + .close_context = nfs_close_context, }; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 0691b9c188d9..9708e78a4d49 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -868,6 +868,7 @@ struct nfs_rpc_ops { int (*lock)(struct file *, int, struct file_lock *); int (*lock_check_bounds)(const struct file_lock *); void (*clear_acl_cache)(struct inode *); + void (*close_context)(struct nfs_open_context *ctx, int); }; /* -- cgit v1.2.3 From 1bf83e558cb29d163f4bc6decbc3800ecf4db195 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Tue, 13 Jan 2009 14:38:34 +0100 Subject: PCI: PCIe portdrv: Use driver data to simplify code PCI Express port driver extension, as defined by struct pcie_port_device_ext in portdrv.h, is allocated and initialized, but never used (it also is never freed). Extend it to hold the PCI Express port type as well as the port interrupt mode, change its name and use it to simplify the code in portdrv_core.c . Additionally, remove the redundant interrupt_mode member of struct pcie_device defined in include/linux/pcieport_if.h . Signed-off-by: Rafael J. Wysocki Signed-off-by: Jesse Barnes --- drivers/pci/pcie/portdrv.h | 5 ++- drivers/pci/pcie/portdrv_core.c | 95 ++++++++++++++++------------------------- include/linux/pcieport_if.h | 1 - 3 files changed, 39 insertions(+), 62 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h index 2529f3f2ea5a..b0dcbc73415e 100644 --- a/drivers/pci/pcie/portdrv.h +++ b/drivers/pci/pcie/portdrv.h @@ -28,8 +28,9 @@ #define get_descriptor_id(type, service) (((type - 4) << 4) | service) -struct pcie_port_device_ext { - int interrupt_mode; /* [0:INTx | 1:MSI | 2:MSI-X] */ +struct pcie_port_data { + int port_type; /* Type of the port */ + int port_irq_mode; /* [0:INTx | 1:MSI | 2:MSI-X] */ }; extern struct bus_type pcie_port_bus_type; diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c index 8b3f8c18032f..273e97619bce 100644 --- a/drivers/pci/pcie/portdrv_core.c +++ b/drivers/pci/pcie/portdrv_core.c @@ -15,10 +15,9 @@ #include #include +#include "../pci.h" #include "portdrv.h" -extern int pcie_mch_quirk; /* MSI-quirk Indicator */ - /** * release_pcie_device - free PCI Express port service device structure * @dev: Port service device to release @@ -31,28 +30,6 @@ static void release_pcie_device(struct device *dev) kfree(to_pcie_device(dev)); } -static int is_msi_quirked(struct pci_dev *dev) -{ - int port_type, quirk = 0; - u16 reg16; - - pci_read_config_word(dev, - pci_find_capability(dev, PCI_CAP_ID_EXP) + - PCIE_CAPABILITIES_REG, ®16); - port_type = (reg16 >> 4) & PORT_TYPE_MASK; - switch(port_type) { - case PCIE_RC_PORT: - if (pcie_mch_quirk == 1) - quirk = 1; - break; - case PCIE_SW_UPSTREAM_PORT: - case PCIE_SW_DOWNSTREAM_PORT: - default: - break; - } - return quirk; -} - /** * assign_interrupt_mode - choose interrupt mode for PCI Express port services * (INTx, MSI-X, MSI) and set up vectors @@ -64,6 +41,7 @@ static int is_msi_quirked(struct pci_dev *dev) */ static int assign_interrupt_mode(struct pci_dev *dev, int *vectors, int mask) { + struct pcie_port_data *port_data = pci_get_drvdata(dev); int i, pos, nvec, status = -EINVAL; int interrupt_mode = PCIE_PORT_INTx_MODE; @@ -75,7 +53,7 @@ static int assign_interrupt_mode(struct pci_dev *dev, int *vectors, int mask) } /* Check MSI quirk */ - if (is_msi_quirked(dev)) + if (port_data->port_type == PCIE_RC_PORT && pcie_mch_quirk) return interrupt_mode; /* Select MSI-X over MSI if supported */ @@ -132,13 +110,11 @@ static int get_port_device_capability(struct pci_dev *dev) pos + PCIE_SLOT_CAPABILITIES_REG, ®32); if (reg32 & SLOT_HP_CAPABLE_MASK) services |= PCIE_PORT_SERVICE_HP; - } - /* PME Capable - root port capability */ - if (((reg16 >> 4) & PORT_TYPE_MASK) == PCIE_RC_PORT) - services |= PCIE_PORT_SERVICE_PME; - + } + /* AER capable */ if (pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR)) services |= PCIE_PORT_SERVICE_AER; + /* VC support */ if (pci_find_ext_capability(dev, PCI_EXT_CAP_ID_VC)) services |= 
PCIE_PORT_SERVICE_VC; @@ -152,15 +128,15 @@ static int get_port_device_capability(struct pci_dev *dev) * @port_type: Type of the port * @service_type: Type of service to associate with the service device * @irq: Interrupt vector to associate with the service device - * @irq_mode: Interrupt mode of the service (INTx, MSI-X, MSI) */ static void pcie_device_init(struct pci_dev *parent, struct pcie_device *dev, - int port_type, int service_type, int irq, int irq_mode) + int service_type, int irq) { + struct pcie_port_data *port_data = pci_get_drvdata(parent); struct device *device; + int port_type = port_data->port_type; dev->port = parent; - dev->interrupt_mode = irq_mode; dev->irq = irq; dev->id.vendor = parent->vendor; dev->id.device = parent->device; @@ -185,10 +161,9 @@ static void pcie_device_init(struct pci_dev *parent, struct pcie_device *dev, * @port_type: Type of the port * @service_type: Type of service to associate with the service device * @irq: Interrupt vector to associate with the service device - * @irq_mode: Interrupt mode of the service (INTx, MSI-X, MSI) */ static struct pcie_device* alloc_pcie_device(struct pci_dev *parent, - int port_type, int service_type, int irq, int irq_mode) + int service_type, int irq) { struct pcie_device *device; @@ -196,7 +171,7 @@ static struct pcie_device* alloc_pcie_device(struct pci_dev *parent, if (!device) return NULL; - pcie_device_init(parent, device, port_type, service_type, irq,irq_mode); + pcie_device_init(parent, device, service_type, irq); return device; } @@ -230,39 +205,36 @@ int pcie_port_device_probe(struct pci_dev *dev) */ int pcie_port_device_register(struct pci_dev *dev) { - struct pcie_port_device_ext *p_ext; - int status, type, capabilities, irq_mode, i; + struct pcie_port_data *port_data; + int status, capabilities, irq_mode, i; int vectors[PCIE_PORT_DEVICE_MAXSERVICES]; u16 reg16; - /* Allocate port device extension */ - if (!(p_ext = kmalloc(sizeof(struct pcie_port_device_ext), GFP_KERNEL))) + port_data = kzalloc(sizeof(*port_data), GFP_KERNEL); + if (!port_data) return -ENOMEM; - - pci_set_drvdata(dev, p_ext); + pci_set_drvdata(dev, port_data); /* Get port type */ pci_read_config_word(dev, pci_find_capability(dev, PCI_CAP_ID_EXP) + PCIE_CAPABILITIES_REG, ®16); - type = (reg16 >> 4) & PORT_TYPE_MASK; + port_data->port_type = (reg16 >> 4) & PORT_TYPE_MASK; - /* Now get port services */ capabilities = get_port_device_capability(dev); + /* Root ports are capable of generating PME too */ + if (port_data->port_type == PCIE_RC_PORT) + capabilities |= PCIE_PORT_SERVICE_PME; + irq_mode = assign_interrupt_mode(dev, vectors, capabilities); - p_ext->interrupt_mode = irq_mode; + port_data->port_irq_mode = irq_mode; /* Allocate child services if any */ for (i = 0; i < PCIE_PORT_DEVICE_MAXSERVICES; i++) { struct pcie_device *child; if (capabilities & (1 << i)) { - child = alloc_pcie_device( - dev, /* parent */ - type, /* port type */ - i, /* service type */ - vectors[i], /* irq */ - irq_mode /* interrupt mode */); + child = alloc_pcie_device(dev, i, vectors[i]); if (child) { status = device_register(&child->device); if (status) { @@ -349,25 +321,30 @@ static int remove_iter(struct device *dev, void *data) */ void pcie_port_device_remove(struct pci_dev *dev) { - struct device *device; - unsigned long device_addr; - int interrupt_mode = PCIE_PORT_INTx_MODE; + struct pcie_port_data *port_data = pci_get_drvdata(dev); int status; do { + unsigned long device_addr; + status = device_for_each_child(&dev->dev, &device_addr, remove_iter); if 
(status) { - device = (struct device*)device_addr; - interrupt_mode = (to_pcie_device(device))->interrupt_mode; + struct device *device = (struct device*)device_addr; put_device(device); device_unregister(device); } } while (status); - /* Switch to INTx by default if MSI enabled */ - if (interrupt_mode == PCIE_PORT_MSIX_MODE) + + switch (port_data->port_irq_mode) { + case PCIE_PORT_MSIX_MODE: pci_disable_msix(dev); - else if (interrupt_mode == PCIE_PORT_MSI_MODE) + break; + case PCIE_PORT_MSI_MODE: pci_disable_msi(dev); + break; + } + + kfree(port_data); } /** diff --git a/include/linux/pcieport_if.h b/include/linux/pcieport_if.h index 6cd91e3f9820..194409af1037 100644 --- a/include/linux/pcieport_if.h +++ b/include/linux/pcieport_if.h @@ -36,7 +36,6 @@ struct pcie_port_service_id { struct pcie_device { int irq; /* Service IRQ/MSI/MSI-X Vector */ - int interrupt_mode; /* [0:INTx | 1:MSI | 2:MSI-X] */ struct pcie_port_service_id id; /* Service ID */ struct pci_dev *port; /* Root/Upstream/Downstream Port */ void *priv_data; /* Service Private Data */ -- cgit v1.2.3 From 90e9cd50f7feeddc911325c8a8c1b7e1fccc6599 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 13 Jan 2009 14:39:39 +0100 Subject: PCI: PCIe portdrv: Avoid using service devices with wrong interrupts The PCI Express port driver should not attempt to register service devices that require the ability to generate interrupts if generating interrupts is not possible. Namely, if the port has no interrupt pin configured and we cannot set up MSI or MSI-X for it, there is no way it can generate interrupts and in such a case the port services that rely on interrupts (PME, PCIe HP, AER) should not be enabled for it. Signed-off-by: Rafael J. Wysocki Signed-off-by: Jesse Barnes --- drivers/pci/pcie/portdrv_core.c | 41 ++++++++++++++++++++++++++++------------- include/linux/pcieport_if.h | 1 + 2 files changed, 29 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c index 273e97619bce..265eba033a4a 100644 --- a/drivers/pci/pcie/portdrv_core.c +++ b/drivers/pci/pcie/portdrv_core.c @@ -43,7 +43,7 @@ static int assign_interrupt_mode(struct pci_dev *dev, int *vectors, int mask) { struct pcie_port_data *port_data = pci_get_drvdata(dev); int i, pos, nvec, status = -EINVAL; - int interrupt_mode = PCIE_PORT_INTx_MODE; + int interrupt_mode = PCIE_PORT_NO_IRQ; /* Set INTx as default */ for (i = 0, nvec = 0; i < PCIE_PORT_DEVICE_MAXSERVICES; i++) { @@ -51,7 +51,9 @@ static int assign_interrupt_mode(struct pci_dev *dev, int *vectors, int mask) nvec++; vectors[i] = dev->irq; } - + if (dev->pin) + interrupt_mode = PCIE_PORT_INTx_MODE; + /* Check MSI quirk */ if (port_data->port_type == PCIE_RC_PORT && pcie_mch_quirk) return interrupt_mode; @@ -141,7 +143,7 @@ static void pcie_device_init(struct pci_dev *parent, struct pcie_device *dev, dev->id.vendor = parent->vendor; dev->id.device = parent->device; dev->id.port_type = port_type; - dev->id.service_type = (1 << service_type); + dev->id.service_type = service_type; /* Initialize generic device interface */ device = &dev->device; @@ -232,19 +234,32 @@ int pcie_port_device_register(struct pci_dev *dev) /* Allocate child services if any */ for (i = 0; i < PCIE_PORT_DEVICE_MAXSERVICES; i++) { struct pcie_device *child; + int service = 1 << i; - if (capabilities & (1 << i)) { - child = alloc_pcie_device(dev, i, vectors[i]); - if (child) { - status = device_register(&child->device); - if (status) { - kfree(child); -
continue; - } - get_device(&child->device); - } + if (!(capabilities & service)) + continue; + + /* + * Don't use service devices that require interrupts if there is + * no way to generate them. + */ + if (irq_mode == PCIE_PORT_NO_IRQ + && service != PCIE_PORT_SERVICE_VC) + continue; + + child = alloc_pcie_device(dev, service, vectors[i]); + if (!child) + continue; + + status = device_register(&child->device); + if (status) { + kfree(child); + continue; } + + get_device(&child->device); } + return 0; } diff --git a/include/linux/pcieport_if.h b/include/linux/pcieport_if.h index 194409af1037..8e1ae1fd92f6 100644 --- a/include/linux/pcieport_if.h +++ b/include/linux/pcieport_if.h @@ -22,6 +22,7 @@ #define PCIE_PORT_SERVICE_VC 8 /* Virtual Channel */ /* Root/Upstream/Downstream Port's Interrupt Mode */ +#define PCIE_PORT_NO_IRQ (-1) #define PCIE_PORT_INTx_MODE 0 #define PCIE_PORT_MSI_MODE 1 #define PCIE_PORT_MSIX_MODE 2 -- cgit v1.2.3 From 0516c8bcd25293f438573101c439ce25a18916ad Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 13 Jan 2009 14:44:19 +0100 Subject: PCI: PCIe portdrv: Simplify probe callback of service drivers The second argument of the ->probe() callback in struct pcie_port_service_driver is unnecessary and never used. Remove it. Signed-off-by: Rafael J. Wysocki Signed-off-by: Jesse Barnes --- drivers/pci/hotplug/pciehp_acpi.c | 3 +-- drivers/pci/hotplug/pciehp_core.c | 2 +- drivers/pci/pcie/aer/aerdrv.c | 6 ++---- drivers/pci/pcie/portdrv_core.c | 2 +- include/linux/pcieport_if.h | 3 +-- 5 files changed, 6 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/hotplug/pciehp_acpi.c b/drivers/pci/hotplug/pciehp_acpi.c index 438d795f9fe3..ad8835758a17 100644 --- a/drivers/pci/hotplug/pciehp_acpi.c +++ b/drivers/pci/hotplug/pciehp_acpi.c @@ -82,8 +82,7 @@ static int __initdata acpi_slot_detected; static struct list_head __initdata dummy_slots = LIST_HEAD_INIT(dummy_slots); /* Dummy driver for dumplicate name detection */ -static int __init dummy_probe(struct pcie_device *dev, - const struct pcie_port_service_id *id) +static int __init dummy_probe(struct pcie_device *dev) { int pos; u32 slot_cap; diff --git a/drivers/pci/hotplug/pciehp_core.c b/drivers/pci/hotplug/pciehp_core.c index 681e3912b821..3429b21dbb53 100644 --- a/drivers/pci/hotplug/pciehp_core.c +++ b/drivers/pci/hotplug/pciehp_core.c @@ -401,7 +401,7 @@ static int get_cur_bus_speed(struct hotplug_slot *hotplug_slot, enum pci_bus_spe return 0; } -static int pciehp_probe(struct pcie_device *dev, const struct pcie_port_service_id *id) +static int pciehp_probe(struct pcie_device *dev) { int rc; struct controller *ctrl; diff --git a/drivers/pci/pcie/aer/aerdrv.c b/drivers/pci/pcie/aer/aerdrv.c index e390707661dd..57c41204c549 100644 --- a/drivers/pci/pcie/aer/aerdrv.c +++ b/drivers/pci/pcie/aer/aerdrv.c @@ -38,8 +38,7 @@ MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL"); -static int __devinit aer_probe (struct pcie_device *dev, - const struct pcie_port_service_id *id ); +static int __devinit aer_probe (struct pcie_device *dev); static void aer_remove(struct pcie_device *dev); static int aer_suspend(struct pcie_device *dev, pm_message_t state) {return 0;} @@ -207,8 +206,7 @@ static void aer_remove(struct pcie_device *dev) * * Invoked when PCI Express bus loads AER service driver. 
**/ -static int __devinit aer_probe (struct pcie_device *dev, - const struct pcie_port_service_id *id ) +static int __devinit aer_probe (struct pcie_device *dev) { int status; struct aer_rpc *rpc; diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c index 91ecbc43155f..682524b0c93a 100644 --- a/drivers/pci/pcie/portdrv_core.c +++ b/drivers/pci/pcie/portdrv_core.c @@ -402,7 +402,7 @@ static int pcie_port_probe_service(struct device *dev) return -ENODEV; pciedev = to_pcie_device(dev); - status = driver->probe(pciedev, driver->id_table); + status = driver->probe(pciedev); if (!status) { dev_printk(KERN_DEBUG, dev, "service driver %s loaded\n", driver->name); diff --git a/include/linux/pcieport_if.h b/include/linux/pcieport_if.h index 8e1ae1fd92f6..59e90b8a7839 100644 --- a/include/linux/pcieport_if.h +++ b/include/linux/pcieport_if.h @@ -56,8 +56,7 @@ static inline void* get_service_data(struct pcie_device *dev) struct pcie_port_service_driver { const char *name; - int (*probe) (struct pcie_device *dev, - const struct pcie_port_service_id *id); + int (*probe) (struct pcie_device *dev); void (*remove) (struct pcie_device *dev); int (*suspend) (struct pcie_device *dev, pm_message_t state); int (*resume) (struct pcie_device *dev); -- cgit v1.2.3 From 22106368c999246c414610dcaacd485e741605b1 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 13 Jan 2009 14:46:46 +0100 Subject: PCI: PCIe portdrv: Remove struct pcie_port_service_id The PCI Express port driver uses 'struct pcie_port_service_id' for matching port service devices and drivers, but this structure contains fields that duplicate information from the port device itself (vendor, device, subvendor, subdevice) and fields that are not used by any existing port service driver (class, class_mask, driver_data). Also, both existing port service drivers (AER and PCIe HP) don't even use the vendor and device fields for device matching. Therefore 'struct pcie_port_service_id' can be removed altogether and the only useful members of it (port_type, service) can be introduced directly into the port service device and port service driver structures. That simplifies the code quite a bit and reduces its size. Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Jesse Barnes --- drivers/pci/hotplug/pciehp_acpi.c | 13 ++----------- drivers/pci/hotplug/pciehp_core.c | 12 ++---------- drivers/pci/pcie/aer/aerdrv.c | 16 ++-------------- drivers/pci/pcie/aer/aerdrv_core.c | 10 +++++----- drivers/pci/pcie/portdrv.h | 5 ----- drivers/pci/pcie/portdrv_bus.c | 18 ++++++++++-------- drivers/pci/pcie/portdrv_core.c | 5 +---- include/linux/pcieport_if.h | 17 ++++++++--------- 8 files changed, 30 insertions(+), 66 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/hotplug/pciehp_acpi.c b/drivers/pci/hotplug/pciehp_acpi.c index ad8835758a17..21734c311529 100644 --- a/drivers/pci/hotplug/pciehp_acpi.c +++ b/drivers/pci/hotplug/pciehp_acpi.c @@ -67,16 +67,6 @@ static int __init parse_detect_mode(void) return PCIEHP_DETECT_DEFAULT; } -static struct pcie_port_service_id __initdata port_pci_ids[] = { - { - .vendor = PCI_ANY_ID, - .device = PCI_ANY_ID, - .port_type = PCIE_ANY_PORT, - .service_type = PCIE_PORT_SERVICE_HP, - .driver_data = 0, - }, { /* end: all zeroes */ } -}; - static int __initdata dup_slot_id; static int __initdata acpi_slot_detected; static struct list_head __initdata dummy_slots = LIST_HEAD_INIT(dummy_slots); @@ -110,7 +100,8 @@ static int __init dummy_probe(struct pcie_device *dev) static struct pcie_port_service_driver __initdata dummy_driver = { .name = "pciehp_dummy", - .id_table = port_pci_ids, + .port_type = PCIE_ANY_PORT, + .service = PCIE_PORT_SERVICE_HP, .probe = dummy_probe, }; diff --git a/drivers/pci/hotplug/pciehp_core.c b/drivers/pci/hotplug/pciehp_core.c index 3429b21dbb53..3d21bbba3308 100644 --- a/drivers/pci/hotplug/pciehp_core.c +++ b/drivers/pci/hotplug/pciehp_core.c @@ -505,18 +505,10 @@ static int pciehp_resume (struct pcie_device *dev) } #endif -static struct pcie_port_service_id port_pci_ids[] = { { - .vendor = PCI_ANY_ID, - .device = PCI_ANY_ID, - .port_type = PCIE_ANY_PORT, - .service_type = PCIE_PORT_SERVICE_HP, - .driver_data = 0, - }, { /* end: all zeroes */ } -}; - static struct pcie_port_service_driver hpdriver_portdrv = { .name = PCIE_MODULE_NAME, - .id_table = &port_pci_ids[0], + .port_type = PCIE_ANY_PORT, + .service = PCIE_PORT_SERVICE_HP, .probe = pciehp_probe, .remove = pciehp_remove, diff --git a/drivers/pci/pcie/aer/aerdrv.c b/drivers/pci/pcie/aer/aerdrv.c index 57c41204c549..e11c03194063 100644 --- a/drivers/pci/pcie/aer/aerdrv.c +++ b/drivers/pci/pcie/aer/aerdrv.c @@ -48,19 +48,6 @@ static pci_ers_result_t aer_error_detected(struct pci_dev *dev, static void aer_error_resume(struct pci_dev *dev); static pci_ers_result_t aer_root_reset(struct pci_dev *dev); -/* - * PCI Express bus's AER Root service driver data structure - */ -static struct pcie_port_service_id aer_id[] = { - { - .vendor = PCI_ANY_ID, - .device = PCI_ANY_ID, - .port_type = PCIE_RC_PORT, - .service_type = PCIE_PORT_SERVICE_AER, - }, - { /* end: all zeroes */ } -}; - static struct pci_error_handlers aer_error_handlers = { .error_detected = aer_error_detected, .resume = aer_error_resume, @@ -68,7 +55,8 @@ static struct pci_error_handlers aer_error_handlers = { static struct pcie_port_service_driver aerdriver = { .name = "aer", - .id_table = &aer_id[0], + .port_type = PCIE_ANY_PORT, + .service = PCIE_PORT_SERVICE_AER, .probe = aer_probe, .remove = aer_remove, diff --git a/drivers/pci/pcie/aer/aerdrv_core.c b/drivers/pci/pcie/aer/aerdrv_core.c index 382575007382..307452f30035 100644 --- a/drivers/pci/pcie/aer/aerdrv_core.c +++ b/drivers/pci/pcie/aer/aerdrv_core.c @@ -351,21 +351,21 @@ static int 
find_aer_service_iter(struct device *device, void *data) { struct device_driver *driver; struct pcie_port_service_driver *service_driver; - struct pcie_device *pcie_dev; struct find_aer_service_data *result; result = (struct find_aer_service_data *) data; if (device->bus == &pcie_port_bus_type) { - pcie_dev = to_pcie_device(device); - if (pcie_dev->id.port_type == PCIE_SW_DOWNSTREAM_PORT) + struct pcie_port_data *port_data; + + port_data = pci_get_drvdata(to_pcie_device(device)->port); + if (port_data->port_type == PCIE_SW_DOWNSTREAM_PORT) result->is_downstream = 1; driver = device->driver; if (driver) { service_driver = to_service_driver(driver); - if (service_driver->id_table->service_type == - PCIE_PORT_SERVICE_AER) { + if (service_driver->service == PCIE_PORT_SERVICE_AER) { result->aer_driver = service_driver; return 1; } diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h index b0dcbc73415e..ad4d082a0344 100644 --- a/drivers/pci/pcie/portdrv.h +++ b/drivers/pci/pcie/portdrv.h @@ -28,11 +28,6 @@ #define get_descriptor_id(type, service) (((type - 4) << 4) | service) -struct pcie_port_data { - int port_type; /* Type of the port */ - int port_irq_mode; /* [0:INTx | 1:MSI | 2:MSI-X] */ -}; - extern struct bus_type pcie_port_bus_type; extern int pcie_port_device_probe(struct pci_dev *dev); extern int pcie_port_device_register(struct pci_dev *dev); diff --git a/drivers/pci/pcie/portdrv_bus.c b/drivers/pci/pcie/portdrv_bus.c index eec89b767f9f..ef3a4eeaebb4 100644 --- a/drivers/pci/pcie/portdrv_bus.c +++ b/drivers/pci/pcie/portdrv_bus.c @@ -26,20 +26,22 @@ EXPORT_SYMBOL_GPL(pcie_port_bus_type); static int pcie_port_bus_match(struct device *dev, struct device_driver *drv) { struct pcie_device *pciedev; + struct pcie_port_data *port_data; struct pcie_port_service_driver *driver; if (drv->bus != &pcie_port_bus_type || dev->bus != &pcie_port_bus_type) return 0; - + pciedev = to_pcie_device(dev); driver = to_service_driver(drv); - if ( (driver->id_table->vendor != PCI_ANY_ID && - driver->id_table->vendor != pciedev->id.vendor) || - (driver->id_table->device != PCI_ANY_ID && - driver->id_table->device != pciedev->id.device) || - (driver->id_table->port_type != PCIE_ANY_PORT && - driver->id_table->port_type != pciedev->id.port_type) || - driver->id_table->service_type != pciedev->id.service_type ) + + if (driver->service != pciedev->service) + return 0; + + port_data = pci_get_drvdata(pciedev->port); + + if (driver->port_type != PCIE_ANY_PORT + && driver->port_type != port_data->port_type) return 0; return 1; diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c index 682524b0c93a..843d9e30dd3b 100644 --- a/drivers/pci/pcie/portdrv_core.c +++ b/drivers/pci/pcie/portdrv_core.c @@ -140,10 +140,7 @@ static void pcie_device_init(struct pci_dev *parent, struct pcie_device *dev, dev->port = parent; dev->irq = irq; - dev->id.vendor = parent->vendor; - dev->id.device = parent->device; - dev->id.port_type = port_type; - dev->id.service_type = service_type; + dev->service = service_type; /* Initialize generic device interface */ device = &dev->device; diff --git a/include/linux/pcieport_if.h b/include/linux/pcieport_if.h index 59e90b8a7839..a3832079508e 100644 --- a/include/linux/pcieport_if.h +++ b/include/linux/pcieport_if.h @@ -27,18 +27,15 @@ #define PCIE_PORT_MSI_MODE 1 #define PCIE_PORT_MSIX_MODE 2 -struct pcie_port_service_id { - __u32 vendor, device; /* Vendor and device ID or PCI_ANY_ID*/ - __u32 subvendor, subdevice; /* Subsystem ID's or PCI_ANY_ID */ - 
__u32 class, class_mask; /* (class,subclass,prog-if) triplet */ - __u32 port_type, service_type; /* Port Entity */ - kernel_ulong_t driver_data; +struct pcie_port_data { + int port_type; /* Type of the port */ + int port_irq_mode; /* [0:INTx | 1:MSI | 2:MSI-X] */ }; struct pcie_device { int irq; /* Service IRQ/MSI/MSI-X Vector */ - struct pcie_port_service_id id; /* Service ID */ - struct pci_dev *port; /* Root/Upstream/Downstream Port */ + struct pci_dev *port; /* Root/Upstream/Downstream Port */ + u32 service; /* Port service this device represents */ void *priv_data; /* Service Private Data */ struct device device; /* Generic Device Interface */ }; @@ -67,7 +64,9 @@ struct pcie_port_service_driver { /* Link Reset Capability - AER service driver specific */ pci_ers_result_t (*reset_link) (struct pci_dev *dev); - const struct pcie_port_service_id *id_table; + int port_type; /* Type of the port this driver can handle */ + u32 service; /* Port service this device represents */ + struct device_driver driver; }; #define to_service_driver(d) \ -- cgit v1.2.3 From a52e2e3513d4beafe8fe8699f1519b021c2d05ba Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 24 Jan 2009 00:21:14 +0100 Subject: PCI/MSI: Introduce pci_msix_table_size() Introduce new function pci_msix_table_size() returning the size of the MSI-X table of given PCI device or 0 if the device doesn't support MSI-X. Signed-off-by: Rafael J. Wysocki Reviewed-by: Hidetoshi Seto Signed-off-by: Jesse Barnes --- drivers/pci/msi.c | 24 +++++++++++++++++++----- include/linux/pci.h | 5 +++++ 2 files changed, 24 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index baba2eb5367d..08aedd5875b0 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -674,6 +674,23 @@ static int msi_free_irqs(struct pci_dev* dev) return 0; } +/** + * pci_msix_table_size - return the number of device's MSI-X table entries + * @dev: pointer to the pci_dev data structure of MSI-X device function + */ +int pci_msix_table_size(struct pci_dev *dev) +{ + int pos; + u16 control; + + pos = pci_find_capability(dev, PCI_CAP_ID_MSIX); + if (!pos) + return 0; + + pci_read_config_word(dev, msi_control_reg(pos), &control); + return multi_msix_capable(control); +} + /** * pci_enable_msix - configure device's MSI-X capability structure * @dev: pointer to the pci_dev data structure of MSI-X device function @@ -691,9 +708,8 @@ static int msi_free_irqs(struct pci_dev* dev) **/ int pci_enable_msix(struct pci_dev* dev, struct msix_entry *entries, int nvec) { - int status, pos, nr_entries; + int status, nr_entries; int i, j; - u16 control; if (!entries) return -EINVAL; @@ -702,9 +718,7 @@ int pci_enable_msix(struct pci_dev* dev, struct msix_entry *entries, int nvec) if (status) return status; - pos = pci_find_capability(dev, PCI_CAP_ID_MSIX); - pci_read_config_word(dev, msi_control_reg(pos), &control); - nr_entries = multi_msix_capable(control); + nr_entries = pci_msix_table_size(dev); if (nvec > nr_entries) return -EINVAL; diff --git a/include/linux/pci.h b/include/linux/pci.h index 7bd624bfdcfd..b5d6d0e0f1cb 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -799,6 +799,10 @@ static inline void pci_msi_shutdown(struct pci_dev *dev) static inline void pci_disable_msi(struct pci_dev *dev) { } +static inline int pci_msix_table_size(struct pci_dev *dev) +{ + return 0; +} static inline int pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int nvec) { @@ -823,6 +827,7 @@ static inline int 
pci_msi_enabled(void) extern int pci_enable_msi(struct pci_dev *dev); extern void pci_msi_shutdown(struct pci_dev *dev); extern void pci_disable_msi(struct pci_dev *dev); +extern int pci_msix_table_size(struct pci_dev *dev); extern int pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int nvec); extern void pci_msix_shutdown(struct pci_dev *dev); -- cgit v1.2.3 From b43d451385ef833e0696032aac2629da04d46c59 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 24 Jan 2009 00:23:22 +0100 Subject: PCI/PCIe portdrv: Fix allocation of interrupts If MSI-X interrupt mode is used by the PCI Express port driver, too many vectors are allocated and it is not ensured that the right vectors will be used for the right services. Namely, the PCI Express specification states that both PCI Express native PME and PCI Express hotplug will always use the same MSI or MSI-X message for signalling interrupts, which implies that the same vector will be used by both of them. Also, the VC service does not use interrupts at all. Moreover, is not clear which of the vectors allocated by pci_enable_msix() in the current code will be used for PME and hotplug and which of them will be used for AER if all of these services are configured. For these reasons, rework the allocation of interrupts for PCI Express ports so that if MSI-X are enabled, the right vectors will be used for the right purposes. Signed-off-by: Rafael J. Wysocki Reviewed-by: Hidetoshi Seto Signed-off-by: Jesse Barnes --- drivers/pci/pcie/portdrv.h | 6 ++ drivers/pci/pcie/portdrv_core.c | 206 ++++++++++++++++++++++++++++++++-------- include/linux/pcieport_if.h | 12 ++- 3 files changed, 181 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h index ad4d082a0344..5b818bd835ef 100644 --- a/drivers/pci/pcie/portdrv.h +++ b/drivers/pci/pcie/portdrv.h @@ -25,6 +25,12 @@ #define PCIE_CAPABILITIES_REG 0x2 #define PCIE_SLOT_CAPABILITIES_REG 0x14 #define PCIE_PORT_DEVICE_MAXSERVICES 4 +#define PCIE_PORT_MSI_VECTOR_MASK 0x1f +/* + * According to the PCI Express Base Specification 2.0, the indices of the MSI-X + * table entires used by port services must not exceed 31 + */ +#define PCIE_PORT_MAX_MSIX_ENTRIES 32 #define get_descriptor_id(type, service) (((type - 4) << 4) | service) diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c index 843d9e30dd3b..3aea92a92928 100644 --- a/drivers/pci/pcie/portdrv_core.c +++ b/drivers/pci/pcie/portdrv_core.c @@ -30,6 +30,152 @@ static void release_pcie_device(struct device *dev) kfree(to_pcie_device(dev)); } +/** + * pcie_port_msix_add_entry - add entry to given array of MSI-X entries + * @entries: Array of MSI-X entries + * @new_entry: Index of the entry to add to the array + * @nr_entries: Number of entries aleady in the array + * + * Return value: Position of the added entry in the array + */ +static int pcie_port_msix_add_entry( + struct msix_entry *entries, int new_entry, int nr_entries) +{ + int j; + + for (j = 0; j < nr_entries; j++) + if (entries[j].entry == new_entry) + return j; + + entries[j].entry = new_entry; + return j; +} + +/** + * pcie_port_enable_msix - try to set up MSI-X as interrupt mode for given port + * @dev: PCI Express port to handle + * @vectors: Array of interrupt vectors to populate + * @mask: Bitmask of port capabilities returned by get_port_device_capability() + * + * Return value: 0 on success, error code on failure + */ +static int pcie_port_enable_msix(struct pci_dev *dev, 
int *vectors, int mask) +{ + struct msix_entry *msix_entries; + int idx[PCIE_PORT_DEVICE_MAXSERVICES]; + int nr_entries, status, pos, i, nvec; + u16 reg16; + u32 reg32; + + nr_entries = pci_msix_table_size(dev); + if (!nr_entries) + return -EINVAL; + if (nr_entries > PCIE_PORT_MAX_MSIX_ENTRIES) + nr_entries = PCIE_PORT_MAX_MSIX_ENTRIES; + + msix_entries = kzalloc(sizeof(*msix_entries) * nr_entries, GFP_KERNEL); + if (!msix_entries) + return -ENOMEM; + + /* + * Allocate as many entries as the port wants, so that we can check + * which of them will be useful. Moreover, if nr_entries is correctly + * equal to the number of entries this port actually uses, we'll happily + * go through without any tricks. + */ + for (i = 0; i < nr_entries; i++) + msix_entries[i].entry = i; + + status = pci_enable_msix(dev, msix_entries, nr_entries); + if (status) + goto Exit; + + for (i = 0; i < PCIE_PORT_DEVICE_MAXSERVICES; i++) + idx[i] = -1; + status = -EIO; + nvec = 0; + + if (mask & (PCIE_PORT_SERVICE_PME | PCIE_PORT_SERVICE_HP)) { + int entry; + + /* + * The code below follows the PCI Express Base Specification 2.0 + * stating in Section 6.1.6 that "PME and Hot-Plug Event + * interrupts (when both are implemented) always share the same + * MSI or MSI-X vector, as indicated by the Interrupt Message + * Number field in the PCI Express Capabilities register", where + * according to Section 7.8.2 of the specification "For MSI-X, + * the value in this field indicates which MSI-X Table entry is + * used to generate the interrupt message." + */ + pos = pci_find_capability(dev, PCI_CAP_ID_EXP); + pci_read_config_word(dev, pos + PCIE_CAPABILITIES_REG, ®16); + entry = (reg16 >> 9) & PCIE_PORT_MSI_VECTOR_MASK; + if (entry >= nr_entries) + goto Error; + + i = pcie_port_msix_add_entry(msix_entries, entry, nvec); + if (i == nvec) + nvec++; + + idx[PCIE_PORT_SERVICE_PME_SHIFT] = i; + idx[PCIE_PORT_SERVICE_HP_SHIFT] = i; + } + + if (mask & PCIE_PORT_SERVICE_AER) { + int entry; + + /* + * The code below follows Section 7.10.10 of the PCI Express + * Base Specification 2.0 stating that bits 31-27 of the Root + * Error Status Register contain a value indicating which of the + * MSI/MSI-X vectors assigned to the port is going to be used + * for AER, where "For MSI-X, the value in this register + * indicates which MSI-X Table entry is used to generate the + * interrupt message." + */ + pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR); + pci_read_config_dword(dev, pos + PCI_ERR_ROOT_STATUS, ®32); + entry = reg32 >> 27; + if (entry >= nr_entries) + goto Error; + + i = pcie_port_msix_add_entry(msix_entries, entry, nvec); + if (i == nvec) + nvec++; + + idx[PCIE_PORT_SERVICE_AER_SHIFT] = i; + } + + /* + * If nvec is equal to the allocated number of entries, we can just use + * what we have. Otherwise, the port has some extra entries not for the + * services we know and we need to work around that. + */ + if (nvec == nr_entries) { + status = 0; + } else { + /* Drop the temporary MSI-X setup */ + pci_disable_msix(dev); + + /* Now allocate the MSI-X vectors for real */ + status = pci_enable_msix(dev, msix_entries, nvec); + if (status) + goto Exit; + } + + for (i = 0; i < PCIE_PORT_DEVICE_MAXSERVICES; i++) + vectors[i] = idx[i] >= 0 ? 
msix_entries[idx[i]].vector : -1; + + Exit: + kfree(msix_entries); + return status; + + Error: + pci_disable_msix(dev); + goto Exit; +} + /** * assign_interrupt_mode - choose interrupt mode for PCI Express port services * (INTx, MSI-X, MSI) and set up vectors @@ -42,49 +188,31 @@ static void release_pcie_device(struct device *dev) static int assign_interrupt_mode(struct pci_dev *dev, int *vectors, int mask) { struct pcie_port_data *port_data = pci_get_drvdata(dev); - int i, pos, nvec, status = -EINVAL; - int interrupt_mode = PCIE_PORT_NO_IRQ; - - /* Set INTx as default */ - for (i = 0, nvec = 0; i < PCIE_PORT_DEVICE_MAXSERVICES; i++) { - if (mask & (1 << i)) - nvec++; - vectors[i] = dev->irq; - } - if (dev->pin) - interrupt_mode = PCIE_PORT_INTx_MODE; + int irq, interrupt_mode = PCIE_PORT_NO_IRQ; + int i; /* Check MSI quirk */ if (port_data->port_type == PCIE_RC_PORT && pcie_mch_quirk) - return interrupt_mode; + goto Fallback; + + /* Try to use MSI-X if supported */ + if (!pcie_port_enable_msix(dev, vectors, mask)) + return PCIE_PORT_MSIX_MODE; + + /* We're not going to use MSI-X, so try MSI and fall back to INTx */ + if (!pci_enable_msi(dev)) + interrupt_mode = PCIE_PORT_MSI_MODE; + + Fallback: + if (interrupt_mode == PCIE_PORT_NO_IRQ && dev->pin) + interrupt_mode = PCIE_PORT_INTx_MODE; + + irq = interrupt_mode != PCIE_PORT_NO_IRQ ? dev->irq : -1; + for (i = 0; i < PCIE_PORT_DEVICE_MAXSERVICES; i++) + vectors[i] = irq; + + vectors[PCIE_PORT_SERVICE_VC_SHIFT] = -1; - /* Select MSI-X over MSI if supported */ - pos = pci_find_capability(dev, PCI_CAP_ID_MSIX); - if (pos) { - struct msix_entry msix_entries[PCIE_PORT_DEVICE_MAXSERVICES] = - {{0, 0}, {0, 1}, {0, 2}, {0, 3}}; - status = pci_enable_msix(dev, msix_entries, nvec); - if (!status) { - int j = 0; - - interrupt_mode = PCIE_PORT_MSIX_MODE; - for (i = 0; i < PCIE_PORT_DEVICE_MAXSERVICES; i++) { - if (mask & (1 << i)) - vectors[i] = msix_entries[j++].vector; - } - } - } - if (status) { - pos = pci_find_capability(dev, PCI_CAP_ID_MSI); - if (pos) { - status = pci_enable_msi(dev); - if (!status) { - interrupt_mode = PCIE_PORT_MSI_MODE; - for (i = 0;i < PCIE_PORT_DEVICE_MAXSERVICES;i++) - vectors[i] = dev->irq; - } - } - } return interrupt_mode; } diff --git a/include/linux/pcieport_if.h b/include/linux/pcieport_if.h index a3832079508e..5d2afcfa6bc1 100644 --- a/include/linux/pcieport_if.h +++ b/include/linux/pcieport_if.h @@ -16,10 +16,14 @@ #define PCIE_ANY_PORT 7 /* Service Type */ -#define PCIE_PORT_SERVICE_PME 1 /* Power Management Event */ -#define PCIE_PORT_SERVICE_AER 2 /* Advanced Error Reporting */ -#define PCIE_PORT_SERVICE_HP 4 /* Native Hotplug */ -#define PCIE_PORT_SERVICE_VC 8 /* Virtual Channel */ +#define PCIE_PORT_SERVICE_PME_SHIFT 0 /* Power Management Event */ +#define PCIE_PORT_SERVICE_PME (1 << PCIE_PORT_SERVICE_PME_SHIFT) +#define PCIE_PORT_SERVICE_AER_SHIFT 1 /* Advanced Error Reporting */ +#define PCIE_PORT_SERVICE_AER (1 << PCIE_PORT_SERVICE_AER_SHIFT) +#define PCIE_PORT_SERVICE_HP_SHIFT 2 /* Native Hotplug */ +#define PCIE_PORT_SERVICE_HP (1 << PCIE_PORT_SERVICE_HP_SHIFT) +#define PCIE_PORT_SERVICE_VC_SHIFT 3 /* Virtual Channel */ +#define PCIE_PORT_SERVICE_VC (1 << PCIE_PORT_SERVICE_VC_SHIFT) /* Root/Upstream/Downstream Port's Interrupt Mode */ #define PCIE_PORT_NO_IRQ (-1) -- cgit v1.2.3 From 63f10f0f6df4e4e860b790d64bebfde85b540b0a Mon Sep 17 00:00:00 2001 From: Kenji Kaneshige Date: Mon, 9 Feb 2009 15:59:29 +0900 Subject: PCI/ACPI: move _OSC code to pci_root.c Move PCI _OSC management code from 
drivers/pci/pci-acpi.c to drivers/acpi/pci_root.c. The benefits are - We no longer need struct osc_data and its management code (contents are moved to struct acpi_pci_root). This simplify the code, and we no longer care about kmalloc() failure. - We can make pci_acpi_osc_support() be a static function, which is called only from drivers/acpi/pci_root.c. Signed-off-by: Kenji Kaneshige Reviewed-by: Andrew Patterson Tested-by: Andrew Patterson Acked-by: Alex Chiang Signed-off-by: Jesse Barnes --- drivers/acpi/pci_root.c | 180 ++++++++++++++++++++++++++++++++++++++- drivers/pci/pci-acpi.c | 215 ----------------------------------------------- include/linux/pci-acpi.h | 1 - 3 files changed, 178 insertions(+), 218 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c index 5b38a026d122..979eccc82c5b 100644 --- a/drivers/acpi/pci_root.c +++ b/drivers/acpi/pci_root.c @@ -66,11 +66,18 @@ struct acpi_pci_root { struct acpi_device * device; struct acpi_pci_id id; struct pci_bus *bus; + + u32 osc_support_set; /* _OSC state of support bits */ + u32 osc_control_set; /* _OSC state of control bits */ + u32 osc_control_qry; /* the latest _OSC query result */ + + u32 osc_queried:1; /* has _OSC control been queried? */ }; static LIST_HEAD(acpi_pci_roots); static struct acpi_pci_driver *sub_driver; +static DEFINE_MUTEX(osc_lock); int acpi_pci_register_driver(struct acpi_pci_driver *driver) { @@ -185,6 +192,175 @@ static void acpi_pci_bridge_scan(struct acpi_device *device) } } +static u8 OSC_UUID[16] = {0x5B, 0x4D, 0xDB, 0x33, 0xF7, 0x1F, 0x1C, 0x40, + 0x96, 0x57, 0x74, 0x41, 0xC0, 0x3D, 0xD7, 0x66}; + +static acpi_status acpi_pci_run_osc(acpi_handle handle, + const u32 *capbuf, u32 *retval) +{ + acpi_status status; + struct acpi_object_list input; + union acpi_object in_params[4]; + struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL}; + union acpi_object *out_obj; + u32 errors; + + /* Setting up input parameters */ + input.count = 4; + input.pointer = in_params; + in_params[0].type = ACPI_TYPE_BUFFER; + in_params[0].buffer.length = 16; + in_params[0].buffer.pointer = OSC_UUID; + in_params[1].type = ACPI_TYPE_INTEGER; + in_params[1].integer.value = 1; + in_params[2].type = ACPI_TYPE_INTEGER; + in_params[2].integer.value = 3; + in_params[3].type = ACPI_TYPE_BUFFER; + in_params[3].buffer.length = 12; + in_params[3].buffer.pointer = (u8 *)capbuf; + + status = acpi_evaluate_object(handle, "_OSC", &input, &output); + if (ACPI_FAILURE(status)) + return status; + + if (!output.length) + return AE_NULL_OBJECT; + + out_obj = output.pointer; + if (out_obj->type != ACPI_TYPE_BUFFER) { + printk(KERN_DEBUG "_OSC evaluation returned wrong type\n"); + status = AE_TYPE; + goto out_kfree; + } + /* Need to ignore the bit0 in result code */ + errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0); + if (errors) { + if (errors & OSC_REQUEST_ERROR) + printk(KERN_DEBUG "_OSC request failed\n"); + if (errors & OSC_INVALID_UUID_ERROR) + printk(KERN_DEBUG "_OSC invalid UUID\n"); + if (errors & OSC_INVALID_REVISION_ERROR) + printk(KERN_DEBUG "_OSC invalid revision\n"); + if (errors & OSC_CAPABILITIES_MASK_ERROR) { + if (capbuf[OSC_QUERY_TYPE] & OSC_QUERY_ENABLE) + goto out_success; + printk(KERN_DEBUG + "Firmware did not grant requested _OSC control\n"); + status = AE_SUPPORT; + goto out_kfree; + } + status = AE_ERROR; + goto out_kfree; + } +out_success: + *retval = *((u32 *)(out_obj->buffer.pointer + 8)); + status = AE_OK; + +out_kfree: + kfree(output.pointer); + return status; +} 
+ +static acpi_status acpi_pci_query_osc(struct acpi_pci_root *root, u32 flags) +{ + acpi_status status; + u32 support_set, result, capbuf[3]; + + /* do _OSC query for all possible controls */ + support_set = root->osc_support_set | (flags & OSC_SUPPORT_MASKS); + capbuf[OSC_QUERY_TYPE] = OSC_QUERY_ENABLE; + capbuf[OSC_SUPPORT_TYPE] = support_set; + capbuf[OSC_CONTROL_TYPE] = OSC_CONTROL_MASKS; + + status = acpi_pci_run_osc(root->device->handle, capbuf, &result); + if (ACPI_SUCCESS(status)) { + root->osc_support_set = support_set; + root->osc_control_qry = result; + root->osc_queried = 1; + } + return status; +} + +static acpi_status acpi_pci_osc_support(struct acpi_pci_root *root, u32 flags) +{ + acpi_status status; + acpi_handle tmp; + + status = acpi_get_handle(root->device->handle, "_OSC", &tmp); + if (ACPI_FAILURE(status)) + return status; + mutex_lock(&osc_lock); + status = acpi_pci_query_osc(root, flags); + mutex_unlock(&osc_lock); + return status; +} + +static struct acpi_pci_root *acpi_pci_find_root(acpi_handle handle) +{ + struct acpi_pci_root *root; + list_for_each_entry(root, &acpi_pci_roots, node) { + if (root->device->handle == handle) + return root; + } + return NULL; +} + +/** + * pci_osc_control_set - commit requested control to Firmware + * @handle: acpi_handle for the target ACPI object + * @flags: driver's requested control bits + * + * Attempt to take control from Firmware on requested control bits. + **/ +acpi_status pci_osc_control_set(acpi_handle handle, u32 flags) +{ + acpi_status status; + u32 control_req, result, capbuf[3]; + acpi_handle tmp; + struct acpi_pci_root *root; + + status = acpi_get_handle(handle, "_OSC", &tmp); + if (ACPI_FAILURE(status)) + return status; + + control_req = (flags & OSC_CONTROL_MASKS); + if (!control_req) + return AE_TYPE; + + root = acpi_pci_find_root(handle); + if (!root) + return AE_NOT_EXIST; + + mutex_lock(&osc_lock); + /* No need to evaluate _OSC if the control was already granted. */ + if ((root->osc_control_set & control_req) == control_req) + goto out; + + /* Need to query controls first before requesting them */ + if (!root->osc_queried) { + status = acpi_pci_query_osc(root, root->osc_support_set); + if (ACPI_FAILURE(status)) + goto out; + } + if ((root->osc_control_qry & control_req) != control_req) { + printk(KERN_DEBUG + "Firmware did not grant requested _OSC control\n"); + status = AE_SUPPORT; + goto out; + } + + capbuf[OSC_QUERY_TYPE] = 0; + capbuf[OSC_SUPPORT_TYPE] = root->osc_support_set; + capbuf[OSC_CONTROL_TYPE] = root->osc_control_set | control_req; + status = acpi_pci_run_osc(handle, capbuf, &result); + if (ACPI_SUCCESS(status)) + root->osc_control_set = result; +out: + mutex_unlock(&osc_lock); + return status; +} +EXPORT_SYMBOL(pci_osc_control_set); + static int __devinit acpi_pci_root_add(struct acpi_device *device) { int result = 0; @@ -217,7 +393,7 @@ static int __devinit acpi_pci_root_add(struct acpi_device *device) * PCI domains, so we indicate this in _OSC support capabilities. 
*/ flags = base_flags = OSC_PCI_SEGMENT_GROUPS_SUPPORT; - pci_acpi_osc_support(device->handle, flags); + acpi_pci_osc_support(root, flags); /* * Segment @@ -353,7 +529,7 @@ static int __devinit acpi_pci_root_add(struct acpi_device *device) if (pci_msi_enabled()) flags |= OSC_MSI_SUPPORT; if (flags != base_flags) - pci_acpi_osc_support(device->handle, flags); + acpi_pci_osc_support(root, flags); end: if (result) { diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c index deea8a187eb8..fac5eddcefd2 100644 --- a/drivers/pci/pci-acpi.c +++ b/drivers/pci/pci-acpi.c @@ -18,221 +18,6 @@ #include #include "pci.h" -struct acpi_osc_data { - acpi_handle handle; - u32 support_set; - u32 control_set; - u32 control_query; - int is_queried; - struct list_head sibiling; -}; -static LIST_HEAD(acpi_osc_data_list); - -struct acpi_osc_args { - u32 capbuf[3]; -}; - -static DEFINE_MUTEX(pci_acpi_lock); - -static struct acpi_osc_data *acpi_get_osc_data(acpi_handle handle) -{ - struct acpi_osc_data *data; - - list_for_each_entry(data, &acpi_osc_data_list, sibiling) { - if (data->handle == handle) - return data; - } - data = kzalloc(sizeof(*data), GFP_KERNEL); - if (!data) - return NULL; - INIT_LIST_HEAD(&data->sibiling); - data->handle = handle; - list_add_tail(&data->sibiling, &acpi_osc_data_list); - return data; -} - -static u8 OSC_UUID[16] = {0x5B, 0x4D, 0xDB, 0x33, 0xF7, 0x1F, 0x1C, 0x40, - 0x96, 0x57, 0x74, 0x41, 0xC0, 0x3D, 0xD7, 0x66}; - -static acpi_status acpi_run_osc(acpi_handle handle, - struct acpi_osc_args *osc_args, u32 *retval) -{ - acpi_status status; - struct acpi_object_list input; - union acpi_object in_params[4]; - struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL}; - union acpi_object *out_obj; - u32 errors, flags = osc_args->capbuf[OSC_QUERY_TYPE]; - - /* Setting up input parameters */ - input.count = 4; - input.pointer = in_params; - in_params[0].type = ACPI_TYPE_BUFFER; - in_params[0].buffer.length = 16; - in_params[0].buffer.pointer = OSC_UUID; - in_params[1].type = ACPI_TYPE_INTEGER; - in_params[1].integer.value = 1; - in_params[2].type = ACPI_TYPE_INTEGER; - in_params[2].integer.value = 3; - in_params[3].type = ACPI_TYPE_BUFFER; - in_params[3].buffer.length = 12; - in_params[3].buffer.pointer = (u8 *)osc_args->capbuf; - - status = acpi_evaluate_object(handle, "_OSC", &input, &output); - if (ACPI_FAILURE(status)) - return status; - - if (!output.length) - return AE_NULL_OBJECT; - - out_obj = output.pointer; - if (out_obj->type != ACPI_TYPE_BUFFER) { - printk(KERN_DEBUG "Evaluate _OSC returns wrong type\n"); - status = AE_TYPE; - goto out_kfree; - } - /* Need to ignore the bit0 in result code */ - errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0); - if (errors) { - if (errors & OSC_REQUEST_ERROR) - printk(KERN_DEBUG "_OSC request fails\n"); - if (errors & OSC_INVALID_UUID_ERROR) - printk(KERN_DEBUG "_OSC invalid UUID\n"); - if (errors & OSC_INVALID_REVISION_ERROR) - printk(KERN_DEBUG "_OSC invalid revision\n"); - if (errors & OSC_CAPABILITIES_MASK_ERROR) { - if (flags & OSC_QUERY_ENABLE) - goto out_success; - printk(KERN_DEBUG "_OSC FW not grant req. 
control\n"); - status = AE_SUPPORT; - goto out_kfree; - } - status = AE_ERROR; - goto out_kfree; - } -out_success: - *retval = *((u32 *)(out_obj->buffer.pointer + 8)); - status = AE_OK; - -out_kfree: - kfree(output.pointer); - return status; -} - -static acpi_status __acpi_query_osc(u32 flags, struct acpi_osc_data *osc_data) -{ - acpi_status status; - u32 support_set, result; - struct acpi_osc_args osc_args; - - /* do _OSC query for all possible controls */ - support_set = osc_data->support_set | (flags & OSC_SUPPORT_MASKS); - osc_args.capbuf[OSC_QUERY_TYPE] = OSC_QUERY_ENABLE; - osc_args.capbuf[OSC_SUPPORT_TYPE] = support_set; - osc_args.capbuf[OSC_CONTROL_TYPE] = OSC_CONTROL_MASKS; - - status = acpi_run_osc(osc_data->handle, &osc_args, &result); - if (ACPI_SUCCESS(status)) { - osc_data->support_set = support_set; - osc_data->control_query = result; - osc_data->is_queried = 1; - } - - return status; -} - -/* - * pci_acpi_osc_support: Invoke _OSC indicating support for the given feature - * @flags: Bitmask of flags to support - * - * See the ACPI spec for the definition of the flags - */ -int pci_acpi_osc_support(acpi_handle handle, u32 flags) -{ - acpi_status status; - acpi_handle tmp; - struct acpi_osc_data *osc_data; - int rc = 0; - - status = acpi_get_handle(handle, "_OSC", &tmp); - if (ACPI_FAILURE(status)) - return -ENOTTY; - - mutex_lock(&pci_acpi_lock); - osc_data = acpi_get_osc_data(handle); - if (!osc_data) { - printk(KERN_ERR "acpi osc data array is full\n"); - rc = -ENOMEM; - goto out; - } - - __acpi_query_osc(flags, osc_data); -out: - mutex_unlock(&pci_acpi_lock); - return rc; -} - -/** - * pci_osc_control_set - commit requested control to Firmware - * @handle: acpi_handle for the target ACPI object - * @flags: driver's requested control bits - * - * Attempt to take control from Firmware on requested control bits. - **/ -acpi_status pci_osc_control_set(acpi_handle handle, u32 flags) -{ - acpi_status status; - u32 control_req, control_set, result; - acpi_handle tmp; - struct acpi_osc_data *osc_data; - struct acpi_osc_args osc_args; - - status = acpi_get_handle(handle, "_OSC", &tmp); - if (ACPI_FAILURE(status)) - return status; - - mutex_lock(&pci_acpi_lock); - osc_data = acpi_get_osc_data(handle); - if (!osc_data) { - printk(KERN_ERR "acpi osc data array is full\n"); - status = AE_ERROR; - goto out; - } - - control_req = (flags & OSC_CONTROL_MASKS); - if (!control_req) { - status = AE_TYPE; - goto out; - } - - /* No need to evaluate _OSC if the control was already granted. */ - if ((osc_data->control_set & control_req) == control_req) - goto out; - - if (!osc_data->is_queried) { - status = __acpi_query_osc(osc_data->support_set, osc_data); - if (ACPI_FAILURE(status)) - goto out; - } - - if ((osc_data->control_query & control_req) != control_req) { - status = AE_SUPPORT; - goto out; - } - - control_set = osc_data->control_set | control_req; - osc_args.capbuf[OSC_QUERY_TYPE] = 0; - osc_args.capbuf[OSC_SUPPORT_TYPE] = osc_data->support_set; - osc_args.capbuf[OSC_CONTROL_TYPE] = control_set; - status = acpi_run_osc(handle, &osc_args, &result); - if (ACPI_SUCCESS(status)) - osc_data->control_set = result; -out: - mutex_unlock(&pci_acpi_lock); - return status; -} -EXPORT_SYMBOL(pci_osc_control_set); - /* * _SxD returns the D-state with the highest power * (lowest D-state number) supported in the S-state "x". 
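[Editorial aside, not part of the patch: for context, the exported interface this patch keeps is the control-request call used by port services such as PCIe hotplug and AER. A minimal caller sketch is shown below; example_request_native_hp is a made-up name, the error mapping is simplified, and the sketch assumes CONFIG_ACPI, while pci_osc_control_set(), acpi_find_root_bridge_handle() and OSC_PCI_EXPRESS_NATIVE_HP_CONTROL are the real symbols appearing in these diffs.]

        #include <linux/acpi.h>
        #include <linux/pci.h>
        #include <linux/pci-acpi.h>

        /* Sketch: ask firmware for native PCIe hotplug control on the root
         * bridge above pdev.  A failure status (e.g. AE_SUPPORT) means the
         * firmware kept control and the OS should not use the feature. */
        static int example_request_native_hp(struct pci_dev *pdev)
        {
                acpi_handle handle = acpi_find_root_bridge_handle(pdev);
                acpi_status status;

                if (!handle)
                        return -ENODEV;

                status = pci_osc_control_set(handle,
                                             OSC_PCI_EXPRESS_NATIVE_HP_CONTROL);

                return ACPI_SUCCESS(status) ? 0 : -EACCES;
        }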
diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h index 042c166f65d5..65cb103b21db 100644 --- a/include/linux/pci-acpi.h +++ b/include/linux/pci-acpi.h @@ -50,7 +50,6 @@ #ifdef CONFIG_ACPI extern acpi_status pci_osc_control_set(acpi_handle handle, u32 flags); -int pci_acpi_osc_support(acpi_handle handle, u32 flags); static inline acpi_handle acpi_find_root_bridge_handle(struct pci_dev *pdev) { /* Find root host bridge */ -- cgit v1.2.3 From 9f5404d8ea90bfa4d58a3936e5a3d0d28cecf60f Mon Sep 17 00:00:00 2001 From: Kenji Kaneshige Date: Mon, 9 Feb 2009 16:00:04 +0900 Subject: PCI/ACPI: rename pci_osc_control_set() - Rename pci_osc_control_set() to acpi_pci_osc_control_set() according to the other API names in drivers/acpi/pci_root.c. - Move _OSC related definitions to include/linux/acpi.h because _OSC related API is implemented in drivers/acpi/pci_root.c now. Signed-off-by: Kenji Kaneshige Reviewed-by: Andrew Patterson Tested-by: Andrew Patterson Signed-off-by: Jesse Barnes --- drivers/acpi/pci_root.c | 6 ++--- drivers/pci/hotplug/acpi_pcihp.c | 5 ++--- drivers/pci/pcie/aer/aerdrv_acpi.c | 2 +- include/linux/acpi.h | 34 ++++++++++++++++++++++++++++ include/linux/pci-acpi.h | 45 -------------------------------------- 5 files changed, 40 insertions(+), 52 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c index 979eccc82c5b..196f97d00956 100644 --- a/drivers/acpi/pci_root.c +++ b/drivers/acpi/pci_root.c @@ -306,13 +306,13 @@ static struct acpi_pci_root *acpi_pci_find_root(acpi_handle handle) } /** - * pci_osc_control_set - commit requested control to Firmware + * acpi_pci_osc_control_set - commit requested control to Firmware * @handle: acpi_handle for the target ACPI object * @flags: driver's requested control bits * * Attempt to take control from Firmware on requested control bits. 
**/ -acpi_status pci_osc_control_set(acpi_handle handle, u32 flags) +acpi_status acpi_pci_osc_control_set(acpi_handle handle, u32 flags) { acpi_status status; u32 control_req, result, capbuf[3]; @@ -359,7 +359,7 @@ out: mutex_unlock(&osc_lock); return status; } -EXPORT_SYMBOL(pci_osc_control_set); +EXPORT_SYMBOL(acpi_pci_osc_control_set); static int __devinit acpi_pci_root_add(struct acpi_device *device) { diff --git a/drivers/pci/hotplug/acpi_pcihp.c b/drivers/pci/hotplug/acpi_pcihp.c index 1c1141801060..f47bc74be567 100644 --- a/drivers/pci/hotplug/acpi_pcihp.c +++ b/drivers/pci/hotplug/acpi_pcihp.c @@ -30,9 +30,8 @@ #include #include #include +#include #include -#include -#include #define MY_NAME "acpi_pcihp" @@ -408,7 +407,7 @@ int acpi_get_hp_hw_control_from_firmware(struct pci_dev *dev, u32 flags) acpi_get_name(handle, ACPI_FULL_PATHNAME, &string); dbg("Trying to get hotplug control for %s\n", (char *)string.pointer); - status = pci_osc_control_set(handle, flags); + status = acpi_pci_osc_control_set(handle, flags); if (ACPI_SUCCESS(status)) goto got_one; kfree(string.pointer); diff --git a/drivers/pci/pcie/aer/aerdrv_acpi.c b/drivers/pci/pcie/aer/aerdrv_acpi.c index ebce26c37049..8edb2f300e8f 100644 --- a/drivers/pci/pcie/aer/aerdrv_acpi.c +++ b/drivers/pci/pcie/aer/aerdrv_acpi.c @@ -38,7 +38,7 @@ int aer_osc_setup(struct pcie_device *pciedev) handle = acpi_find_root_bridge_handle(pdev); if (handle) { - status = pci_osc_control_set(handle, + status = acpi_pci_osc_control_set(handle, OSC_PCI_EXPRESS_AER_CONTROL | OSC_PCI_EXPRESS_CAP_STRUCTURE_CONTROL); } diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 6fce2fc2d124..2a3b189e3e26 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -256,6 +256,40 @@ void __init acpi_no_s4_hw_signature(void); void __init acpi_old_suspend_ordering(void); void __init acpi_s4_no_nvs(void); #endif /* CONFIG_PM_SLEEP */ + +#define OSC_QUERY_TYPE 0 +#define OSC_SUPPORT_TYPE 1 +#define OSC_CONTROL_TYPE 2 +#define OSC_SUPPORT_MASKS 0x1f + +/* _OSC DW0 Definition */ +#define OSC_QUERY_ENABLE 1 +#define OSC_REQUEST_ERROR 2 +#define OSC_INVALID_UUID_ERROR 4 +#define OSC_INVALID_REVISION_ERROR 8 +#define OSC_CAPABILITIES_MASK_ERROR 16 + +/* _OSC DW1 Definition (OS Support Fields) */ +#define OSC_EXT_PCI_CONFIG_SUPPORT 1 +#define OSC_ACTIVE_STATE_PWR_SUPPORT 2 +#define OSC_CLOCK_PWR_CAPABILITY_SUPPORT 4 +#define OSC_PCI_SEGMENT_GROUPS_SUPPORT 8 +#define OSC_MSI_SUPPORT 16 + +/* _OSC DW1 Definition (OS Control Fields) */ +#define OSC_PCI_EXPRESS_NATIVE_HP_CONTROL 1 +#define OSC_SHPC_NATIVE_HP_CONTROL 2 +#define OSC_PCI_EXPRESS_PME_CONTROL 4 +#define OSC_PCI_EXPRESS_AER_CONTROL 8 +#define OSC_PCI_EXPRESS_CAP_STRUCTURE_CONTROL 16 + +#define OSC_CONTROL_MASKS (OSC_PCI_EXPRESS_NATIVE_HP_CONTROL | \ + OSC_SHPC_NATIVE_HP_CONTROL | \ + OSC_PCI_EXPRESS_PME_CONTROL | \ + OSC_PCI_EXPRESS_AER_CONTROL | \ + OSC_PCI_EXPRESS_CAP_STRUCTURE_CONTROL) + +extern acpi_status acpi_pci_osc_control_set(acpi_handle handle, u32 flags); #else /* CONFIG_ACPI */ static inline int early_acpi_boot_init(void) diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h index 65cb103b21db..20480b9f10c8 100644 --- a/include/linux/pci-acpi.h +++ b/include/linux/pci-acpi.h @@ -10,46 +10,7 @@ #include -#define OSC_QUERY_TYPE 0 -#define OSC_SUPPORT_TYPE 1 -#define OSC_CONTROL_TYPE 2 -#define OSC_SUPPORT_MASKS 0x1f - -/* - * _OSC DW0 Definition - */ -#define OSC_QUERY_ENABLE 1 -#define OSC_REQUEST_ERROR 2 -#define OSC_INVALID_UUID_ERROR 4 -#define OSC_INVALID_REVISION_ERROR 8 
-#define OSC_CAPABILITIES_MASK_ERROR 16 - -/* - * _OSC DW1 Definition (OS Support Fields) - */ -#define OSC_EXT_PCI_CONFIG_SUPPORT 1 -#define OSC_ACTIVE_STATE_PWR_SUPPORT 2 -#define OSC_CLOCK_PWR_CAPABILITY_SUPPORT 4 -#define OSC_PCI_SEGMENT_GROUPS_SUPPORT 8 -#define OSC_MSI_SUPPORT 16 - -/* - * _OSC DW1 Definition (OS Control Fields) - */ -#define OSC_PCI_EXPRESS_NATIVE_HP_CONTROL 1 -#define OSC_SHPC_NATIVE_HP_CONTROL 2 -#define OSC_PCI_EXPRESS_PME_CONTROL 4 -#define OSC_PCI_EXPRESS_AER_CONTROL 8 -#define OSC_PCI_EXPRESS_CAP_STRUCTURE_CONTROL 16 - -#define OSC_CONTROL_MASKS (OSC_PCI_EXPRESS_NATIVE_HP_CONTROL | \ - OSC_SHPC_NATIVE_HP_CONTROL | \ - OSC_PCI_EXPRESS_PME_CONTROL | \ - OSC_PCI_EXPRESS_AER_CONTROL | \ - OSC_PCI_EXPRESS_CAP_STRUCTURE_CONTROL) - #ifdef CONFIG_ACPI -extern acpi_status pci_osc_control_set(acpi_handle handle, u32 flags); static inline acpi_handle acpi_find_root_bridge_handle(struct pci_dev *pdev) { /* Find root host bridge */ @@ -69,12 +30,6 @@ static inline acpi_handle acpi_pci_get_bridge_handle(struct pci_bus *pbus) return acpi_get_pci_rootbridge_handle(seg, busnr); } #else -#if !defined(AE_ERROR) -typedef u32 acpi_status; -#define AE_ERROR (acpi_status) (0x0001) -#endif -static inline acpi_status pci_osc_control_set(acpi_handle handle, u32 flags) -{return AE_ERROR;} static inline acpi_handle acpi_find_root_bridge_handle(struct pci_dev *pdev) { return NULL; } #endif -- cgit v1.2.3 From c48f1670f42b71f39f4a3bfba01ffb691cc9206c Mon Sep 17 00:00:00 2001 From: "akpm@linux-foundation.org" Date: Tue, 3 Feb 2009 15:45:26 -0800 Subject: PCI: constify pci_bus_add_devices() drivers/pci/hotplug/fakephp.c:283: warning: passing argument 1 of 'pci_bus_add_devices' discards qualifiers from pointer target type Signed-off-by: Andrew Morton Signed-off-by: Jesse Barnes --- drivers/pci/bus.c | 2 +- include/linux/pci.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c index 52b54f053be0..118c77778d29 100644 --- a/drivers/pci/bus.c +++ b/drivers/pci/bus.c @@ -133,7 +133,7 @@ int pci_bus_add_child(struct pci_bus *bus) * * Call hotplug for each new devices. 
*/ -void pci_bus_add_devices(struct pci_bus *bus) +void pci_bus_add_devices(const struct pci_bus *bus) { struct pci_dev *dev; struct pci_bus *child; diff --git a/include/linux/pci.h b/include/linux/pci.h index b5d6d0e0f1cb..a1af2fe00639 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -528,7 +528,7 @@ void pcibios_update_irq(struct pci_dev *, int irq); /* Generic PCI functions used internally */ extern struct pci_bus *pci_find_bus(int domain, int busnr); -void pci_bus_add_devices(struct pci_bus *bus); +void pci_bus_add_devices(const struct pci_bus *bus); struct pci_bus *pci_scan_bus_parented(struct device *parent, int bus, struct pci_ops *ops, void *sysdata); static inline struct pci_bus * __devinit pci_scan_bus(int bus, struct pci_ops *ops, -- cgit v1.2.3 From ea7415512a07add2b09c070c9a5d1950833cf9b3 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 18 Feb 2009 10:44:29 -0800 Subject: PCI: constify pci_bus_assign_resources() drivers/pci/hotplug/fakephp.c: In function 'pci_rescan_bus': drivers/pci/hotplug/fakephp.c:271: warning: passing argument 1 of 'pci_bus_assign_resources' discards qualifiers from pointer target type Signed-off-by: Andrew Morton Signed-off-by: Jesse Barnes --- drivers/pci/setup-bus.c | 4 ++-- include/linux/pci.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 704608945780..170a3eda9dd3 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -27,7 +27,7 @@ #include -static void pbus_assign_resources_sorted(struct pci_bus *bus) +static void pbus_assign_resources_sorted(const struct pci_bus *bus) { struct pci_dev *dev; struct resource *res; @@ -495,7 +495,7 @@ void __ref pci_bus_size_bridges(struct pci_bus *bus) } EXPORT_SYMBOL(pci_bus_size_bridges); -void __ref pci_bus_assign_resources(struct pci_bus *bus) +void __ref pci_bus_assign_resources(const struct pci_bus *bus) { struct pci_bus *b; struct pci_dev *dev; diff --git a/include/linux/pci.h b/include/linux/pci.h index a1af2fe00639..7baf2a5db12a 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -708,7 +708,7 @@ ssize_t pci_write_vpd(struct pci_dev *dev, loff_t pos, size_t count, const void int pci_vpd_truncate(struct pci_dev *dev, size_t size); /* Helper functions for low-level code (drivers/pci/setup-[bus,res].c) */ -void pci_bus_assign_resources(struct pci_bus *bus); +void pci_bus_assign_resources(const struct pci_bus *bus); void pci_bus_size_bridges(struct pci_bus *bus); int pci_claim_resource(struct pci_dev *, int); void pci_assign_unassigned_resources(void); -- cgit v1.2.3 From a4b6d516a6079c6ba8dc97d185371439035a35d0 Mon Sep 17 00:00:00 2001 From: David Brownell Date: Wed, 4 Mar 2009 12:01:41 -0800 Subject: [MTD] partitioning utility predicates Move mtd_has_partitions() and mtd_has_cmdlinepart() inlines from a DaVinci-specific driver to the header. Use those to eliminate #ifdefs in two drivers which had their own definitions of mtd_has_partitions(). Quite a lot of other MTD drivers could benefit from using one or both of these to remove #ifdeffery. Maybe some Janitors would like to help.
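[Editorial aside, not part of the patch: a driver probe path using the predicates instead of #ifdef blocks might look like the sketch below, where the dead branch is simply discarded by the compiler when the option is off. example_register_mtd, defparts and ndefparts are made-up names, error handling is simplified, and the calls shown (parse_mtd_partitions, add_mtd_partitions, add_mtd_device) follow the same pattern as the m25p80.c hunk in this patch.]

        #include <linux/mtd/mtd.h>
        #include <linux/mtd/partitions.h>

        /* Sketch: register an MTD device, preferring command-line partitions
         * when the cmdlinepart parser is built in -- no #ifdef in the driver. */
        static int example_register_mtd(struct mtd_info *mtd,
                                        struct mtd_partition *defparts,
                                        int ndefparts)
        {
                struct mtd_partition *parts = NULL;
                int nr_parts = 0;

                if (mtd_has_cmdlinepart()) {
                        static const char *probes[] = { "cmdlinepart", NULL };

                        nr_parts = parse_mtd_partitions(mtd, probes, &parts, 0);
                }
                if (nr_parts <= 0 && defparts) {
                        parts = defparts;
                        nr_parts = ndefparts;
                }

                if (mtd_has_partitions() && nr_parts > 0)
                        return add_mtd_partitions(mtd, parts, nr_parts);

                return add_mtd_device(mtd) == 1 ? -ENODEV : 0;
        }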
Signed-off-by: David Brownell Signed-off-by: Andrew Morton Signed-off-by: David Woodhouse --- drivers/mtd/devices/m25p80.c | 17 ++++++----------- drivers/mtd/devices/mtd_dataflash.c | 16 ++++++---------- drivers/mtd/nand/davinci_nand.c | 13 ------------- include/linux/mtd/partitions.h | 12 ++++++++++++ 4 files changed, 24 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/devices/m25p80.c b/drivers/mtd/devices/m25p80.c index 7c3fc766dcf1..98b0faf6696d 100644 --- a/drivers/mtd/devices/m25p80.c +++ b/drivers/mtd/devices/m25p80.c @@ -65,12 +65,6 @@ #define FAST_READ_DUMMY_BYTE 0 #endif -#ifdef CONFIG_MTD_PARTITIONS -#define mtd_has_partitions() (1) -#else -#define mtd_has_partitions() (0) -#endif - /****************************************************************************/ struct m25p { @@ -708,12 +702,13 @@ static int __devinit m25p_probe(struct spi_device *spi) struct mtd_partition *parts = NULL; int nr_parts = 0; -#ifdef CONFIG_MTD_CMDLINE_PARTS - static const char *part_probes[] = { "cmdlinepart", NULL, }; + if (mtd_has_cmdlinepart()) { + static const char *part_probes[] + = { "cmdlinepart", NULL, }; - nr_parts = parse_mtd_partitions(&flash->mtd, - part_probes, &parts, 0); -#endif + nr_parts = parse_mtd_partitions(&flash->mtd, + part_probes, &parts, 0); + } if (nr_parts <= 0 && data && data->parts) { parts = data->parts; diff --git a/drivers/mtd/devices/mtd_dataflash.c b/drivers/mtd/devices/mtd_dataflash.c index 6d9f810565c8..d95f74a93bce 100644 --- a/drivers/mtd/devices/mtd_dataflash.c +++ b/drivers/mtd/devices/mtd_dataflash.c @@ -98,12 +98,6 @@ struct dataflash { struct mtd_info mtd; }; -#ifdef CONFIG_MTD_PARTITIONS -#define mtd_has_partitions() (1) -#else -#define mtd_has_partitions() (0) -#endif - /* ......................................................................... */ /* @@ -682,11 +676,13 @@ add_dataflash_otp(struct spi_device *spi, char *name, struct mtd_partition *parts; int nr_parts = 0; -#ifdef CONFIG_MTD_CMDLINE_PARTS - static const char *part_probes[] = { "cmdlinepart", NULL, }; + if (mtd_has_cmdlinepart()) { + static const char *part_probes[] + = { "cmdlinepart", NULL, }; - nr_parts = parse_mtd_partitions(device, part_probes, &parts, 0); -#endif + nr_parts = parse_mtd_partitions(device, + part_probes, &parts, 0); + } if (nr_parts <= 0 && pdata && pdata->parts) { parts = pdata->parts; diff --git a/drivers/mtd/nand/davinci_nand.c b/drivers/mtd/nand/davinci_nand.c index 4034ac945041..81f7ecd23c60 100644 --- a/drivers/mtd/nand/davinci_nand.c +++ b/drivers/mtd/nand/davinci_nand.c @@ -38,19 +38,6 @@ #include -#ifdef CONFIG_MTD_PARTITIONS -static inline int mtd_has_partitions(void) { return 1; } -#else -static inline int mtd_has_partitions(void) { return 0; } -#endif - -#ifdef CONFIG_MTD_CMDLINE_PARTS -static inline int mtd_has_cmdlinepart(void) { return 1; } -#else -static inline int mtd_has_cmdlinepart(void) { return 0; } -#endif - - /* * This is a device driver for the NAND flash controller found on the * various DaVinci family chips. 
It handles up to four SoC chipselects, diff --git a/include/linux/mtd/partitions.h b/include/linux/mtd/partitions.h index a45dd831b3f8..7535a74083b9 100644 --- a/include/linux/mtd/partitions.h +++ b/include/linux/mtd/partitions.h @@ -76,4 +76,16 @@ int __devinit of_mtd_parse_partitions(struct device *dev, struct device_node *node, struct mtd_partition **pparts); +#ifdef CONFIG_MTD_PARTITIONS +static inline int mtd_has_partitions(void) { return 1; } +#else +static inline int mtd_has_partitions(void) { return 0; } +#endif + +#ifdef CONFIG_MTD_CMDLINE_PARTS +static inline int mtd_has_cmdlinepart(void) { return 1; } +#else +static inline int mtd_has_cmdlinepart(void) { return 0; } +#endif + #endif -- cgit v1.2.3 From 3a3c244c9a355105bc193fde873c73727bf87192 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 15 Feb 2009 22:32:48 +0100 Subject: PCI: PCIe portdrv: Implement pm object Implement pm object for the PCI Express port driver in order to use the new power management framework and reduce the code size. Signed-off-by: Rafael J. Wysocki Signed-off-by: Jesse Barnes --- drivers/pci/hotplug/pciehp_core.c | 4 ++-- drivers/pci/pcie/aer/aerdrv.c | 6 ------ drivers/pci/pcie/portdrv.h | 4 ++-- drivers/pci/pcie/portdrv_core.c | 14 ++++++-------- drivers/pci/pcie/portdrv_pci.c | 31 +++++++++++++++---------------- include/linux/pcieport_if.h | 2 +- 6 files changed, 26 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/hotplug/pciehp_core.c b/drivers/pci/hotplug/pciehp_core.c index 3d21bbba3308..fb254b2454de 100644 --- a/drivers/pci/hotplug/pciehp_core.c +++ b/drivers/pci/hotplug/pciehp_core.c @@ -475,7 +475,7 @@ static void pciehp_remove (struct pcie_device *dev) } #ifdef CONFIG_PM -static int pciehp_suspend (struct pcie_device *dev, pm_message_t state) +static int pciehp_suspend (struct pcie_device *dev) { dev_info(&dev->device, "%s ENTRY\n", __func__); return 0; @@ -503,7 +503,7 @@ static int pciehp_resume (struct pcie_device *dev) } return 0; } -#endif +#endif /* PM */ static struct pcie_port_service_driver hpdriver_portdrv = { .name = PCIE_MODULE_NAME, diff --git a/drivers/pci/pcie/aer/aerdrv.c b/drivers/pci/pcie/aer/aerdrv.c index e11c03194063..32ade5af927e 100644 --- a/drivers/pci/pcie/aer/aerdrv.c +++ b/drivers/pci/pcie/aer/aerdrv.c @@ -40,9 +40,6 @@ MODULE_LICENSE("GPL"); static int __devinit aer_probe (struct pcie_device *dev); static void aer_remove(struct pcie_device *dev); -static int aer_suspend(struct pcie_device *dev, pm_message_t state) -{return 0;} -static int aer_resume(struct pcie_device *dev) {return 0;} static pci_ers_result_t aer_error_detected(struct pci_dev *dev, enum pci_channel_state error); static void aer_error_resume(struct pci_dev *dev); @@ -61,9 +58,6 @@ static struct pcie_port_service_driver aerdriver = { .probe = aer_probe, .remove = aer_remove, - .suspend = aer_suspend, - .resume = aer_resume, - .err_handler = &aer_error_handlers, .reset_link = aer_root_reset, diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h index 5b818bd835ef..17ad53868f9f 100644 --- a/drivers/pci/pcie/portdrv.h +++ b/drivers/pci/pcie/portdrv.h @@ -38,8 +38,8 @@ extern struct bus_type pcie_port_bus_type; extern int pcie_port_device_probe(struct pci_dev *dev); extern int pcie_port_device_register(struct pci_dev *dev); #ifdef CONFIG_PM -extern int pcie_port_device_suspend(struct pci_dev *dev, pm_message_t state); -extern int pcie_port_device_resume(struct pci_dev *dev); +extern int pcie_port_device_suspend(struct device *dev); +extern int 
pcie_port_device_resume(struct device *dev); #endif extern void pcie_port_device_remove(struct pci_dev *dev); extern int __must_check pcie_port_bus_register(void); diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c index 569af0015fce..5a5bfe7cdf5f 100644 --- a/drivers/pci/pcie/portdrv_core.c +++ b/drivers/pci/pcie/portdrv_core.c @@ -410,13 +410,12 @@ int pcie_port_device_register(struct pci_dev *dev) static int suspend_iter(struct device *dev, void *data) { struct pcie_port_service_driver *service_driver; - pm_message_t state = * (pm_message_t *) data; if ((dev->bus == &pcie_port_bus_type) && (dev->driver)) { service_driver = to_service_driver(dev->driver); if (service_driver->suspend) - service_driver->suspend(to_pcie_device(dev), state); + service_driver->suspend(to_pcie_device(dev)); } return 0; } @@ -424,11 +423,10 @@ static int suspend_iter(struct device *dev, void *data) /** * pcie_port_device_suspend - suspend port services associated with a PCIe port * @dev: PCI Express port to handle - * @state: Representation of system power management transition in progress */ -int pcie_port_device_suspend(struct pci_dev *dev, pm_message_t state) +int pcie_port_device_suspend(struct device *dev) { - return device_for_each_child(&dev->dev, &state, suspend_iter); + return device_for_each_child(dev, NULL, suspend_iter); } static int resume_iter(struct device *dev, void *data) @@ -448,11 +446,11 @@ static int resume_iter(struct device *dev, void *data) * pcie_port_device_suspend - resume port services associated with a PCIe port * @dev: PCI Express port to handle */ -int pcie_port_device_resume(struct pci_dev *dev) +int pcie_port_device_resume(struct device *dev) { - return device_for_each_child(&dev->dev, NULL, resume_iter); + return device_for_each_child(dev, NULL, resume_iter); } -#endif +#endif /* PM */ static int remove_iter(struct device *dev, void *data) { diff --git a/drivers/pci/pcie/portdrv_pci.c b/drivers/pci/pcie/portdrv_pci.c index 94d0e2af9bad..a61f4930d676 100644 --- a/drivers/pci/pcie/portdrv_pci.c +++ b/drivers/pci/pcie/portdrv_pci.c @@ -44,21 +44,21 @@ static int pcie_portdrv_restore_config(struct pci_dev *dev) } #ifdef CONFIG_PM -static int pcie_portdrv_suspend(struct pci_dev *dev, pm_message_t state) -{ - return pcie_port_device_suspend(dev, state); +static struct dev_pm_ops pcie_portdrv_pm_ops = { + .suspend = pcie_port_device_suspend, + .resume = pcie_port_device_resume, + .freeze = pcie_port_device_suspend, + .thaw = pcie_port_device_resume, + .poweroff = pcie_port_device_suspend, + .restore = pcie_port_device_resume, +}; -} +#define PCIE_PORTDRV_PM_OPS (&pcie_portdrv_pm_ops) -static int pcie_portdrv_resume(struct pci_dev *dev) -{ - pci_set_master(dev); - return pcie_port_device_resume(dev); -} -#else -#define pcie_portdrv_suspend NULL -#define pcie_portdrv_resume NULL -#endif +#else /* !PM */ + +#define PCIE_PORTDRV_PM_OPS NULL +#endif /* !PM */ /* * pcie_portdrv_probe - Probe PCI-Express port devices @@ -268,10 +268,9 @@ static struct pci_driver pcie_portdriver = { .probe = pcie_portdrv_probe, .remove = pcie_portdrv_remove, - .suspend = pcie_portdrv_suspend, - .resume = pcie_portdrv_resume, - .err_handler = &pcie_portdrv_err_handler, + + .driver.pm = PCIE_PORTDRV_PM_OPS, }; static int __init pcie_portdrv_init(void) diff --git a/include/linux/pcieport_if.h b/include/linux/pcieport_if.h index 5d2afcfa6bc1..b4c79545330b 100644 --- a/include/linux/pcieport_if.h +++ b/include/linux/pcieport_if.h @@ -59,7 +59,7 @@ struct pcie_port_service_driver { const 
char *name; int (*probe) (struct pcie_device *dev); void (*remove) (struct pcie_device *dev); - int (*suspend) (struct pcie_device *dev, pm_message_t state); + int (*suspend) (struct pcie_device *dev); int (*resume) (struct pcie_device *dev); /* Service Error Recovery Handler */ -- cgit v1.2.3 From 0747aaf42d78d26684c6f6b34a4103ff81f571f8 Mon Sep 17 00:00:00 2001 From: Kenji Kaneshige Date: Tue, 17 Feb 2009 14:11:56 +0900 Subject: PCI/ACPI: fix wrong assumption in acpi_pci_get_bridge_handle Current acpi_pci_get_bridge_handle() has an assumption that pci_bus->self is NULL on the root pci bus. But it might not true on some platforms. Because of this wrong assumption, current acpi_pci_get_bridge_handle() might return improper ACPI handle. We must check pci_bus->parent instead. This bug is the root cause of the following kernel panic reported by James Bottomley. This problem was introduced by the commit e8c331e963c58b83db24b7d0e39e8c07f687dbc6. The immediate cause was acpi_pci_get_bridge_handle() returned NULL unexpectedly and it was passed as the second argument of acpi_walk_namespace(). pci_hotplug: PCI Hot Plug PCI Core version: 0.5 acpiphp: ACPI Hot Plug PCI Controller Driver version: 0.5 BUG: unable to handle kernel NULL pointer dereference at 0000000000000010 IP: [] acpi_ns_get_next_node+0xb/0x3c PGD 0 Oops: 0000 [#1] SMP last sysfs file: CPU 0 Modules linked in: Pid: 1, comm: swapper Not tainted 2.6.28 #1 RIP: 0010:[] [] acpi_ns_get_next_node+0xb/0x3c RSP: 0018:ffff88007f87fd30 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 RBP: 0000000000000000 R08: ffffffff8037d260 R09: ffff88007f87fdfc R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000001 R13: 0000000000000000 R14: 0000000000000001 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffffffff80742040(0000) knlGS:0000000000000000 CS: 0010 DS: 0018 ES: 0018 CR0: 000000008005003b CR2: 0000000000000010 CR3: 0000000000201000 CR4: 00000000000006a0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process swapper (pid: 1, threadinfo ffff88007f87e000, task ffff88007f875040) Stack: 0000000000000000 ffffffff803964f5 ffff88007f81b728 0000000000001001 ffff88007f87fdfc ffffffff8037d260 0000000600000001 0000000000000000 ffffffff8037d260 0000000000000000 0000000000000001 ffff88007f87fdfc Call Trace: [] acpi_ns_walk_namespace+0x55/0x138 [] is_pci_dock_device+0x0/0x20 [] is_pci_dock_device+0x0/0x20 [] acpi_walk_namespace+0x5f/0x83 [] detect_ejectable_slots+0x53/0x70 [] add_bridge+0xe8/0x200 [] acpi_walk_namespace+0x6b/0x83 [] acpi_pci_register_driver+0x48/0x61 [] acpiphp_init+0x0/0x58 [] acpiphp_glue_init+0x4c/0x5a [] acpiphp_init+0x37/0x58 [] _stext+0x3b/0x180 [] create_proc_entry+0x58/0xa0 [] register_irq_proc+0xc1/0xe0 [] kernel_init+0x152/0x1ac [] finish_task_switch+0x0/0x110 [] child_rip+0xa/0x20 [] restore_args+0x0/0x30 [] kernel_init+0x0/0x1ac [] child_rip+0x0/0x20 Code: 89 c2 48 8b 00 48 85 c0 75 f5 48 8b 45 00 48 89 02 44 88 65 09 48 89 5d 00 31 c0 5b 5d 41 5c c3 53 48 85 d2 89 fb 48 89 d7 75 06 <48> 8b 56 10 eb 08 e8 73 f1 ff ff 48 89 c2 85 db 74 1a eb 13 0f RIP [] acpi_ns_get_next_node+0xb/0x3c RSP CR2: 0000000000000010 ---[ end trace a7919e7f17c0a725 ]--- Signed-off-by: Kenji Kaneshige Signed-off-by: Jesse Barnes --- include/linux/pci-acpi.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git 
a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h index 20480b9f10c8..3cee2367459f 100644 --- a/include/linux/pci-acpi.h +++ b/include/linux/pci-acpi.h @@ -23,11 +23,10 @@ static inline acpi_handle acpi_find_root_bridge_handle(struct pci_dev *pdev) static inline acpi_handle acpi_pci_get_bridge_handle(struct pci_bus *pbus) { - int seg = pci_domain_nr(pbus), busnr = pbus->number; - struct pci_dev *bridge = pbus->self; - if (bridge) - return DEVICE_ACPI_HANDLE(&(bridge->dev)); - return acpi_get_pci_rootbridge_handle(seg, busnr); + if (pbus->parent) + return DEVICE_ACPI_HANDLE(&(pbus->self->dev)); + return acpi_get_pci_rootbridge_handle(pci_domain_nr(pbus), + pbus->number); } #else static inline acpi_handle acpi_find_root_bridge_handle(struct pci_dev *pdev) -- cgit v1.2.3 From d18690af626b83fef1d1953b9f70e09497060586 Mon Sep 17 00:00:00 2001 From: Kenji Kaneshige Date: Tue, 17 Feb 2009 14:12:36 +0900 Subject: PCI/ACPI: fix wrong assumption in acpi_find_root_bridge_handle Current acpi_find_root_bridge_handle() has a assumption that pci_bus->self is NULL on the root pci bus. But it might not be true on some platforms. Because of this wrong assumption, current acpi_find_root_bridge_handle() might cause endless loop. We must check pci_bus->parent instead. Signed-off-by: Kenji Kaneshige Signed-off-by: Jesse Barnes --- include/linux/pci-acpi.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h index 3cee2367459f..092e82e0048c 100644 --- a/include/linux/pci-acpi.h +++ b/include/linux/pci-acpi.h @@ -13,12 +13,12 @@ #ifdef CONFIG_ACPI static inline acpi_handle acpi_find_root_bridge_handle(struct pci_dev *pdev) { - /* Find root host bridge */ - while (pdev->bus->self) - pdev = pdev->bus->self; - - return acpi_get_pci_rootbridge_handle(pci_domain_nr(pdev->bus), - pdev->bus->number); + struct pci_bus *pbus = pdev->bus; + /* Find a PCI root bus */ + while (pbus->parent) + pbus = pbus->parent; + return acpi_get_pci_rootbridge_handle(pci_domain_nr(pbus), + pbus->number); } static inline acpi_handle acpi_pci_get_bridge_handle(struct pci_bus *pbus) -- cgit v1.2.3 From 998dd7c719f62dcfa91d7bf7f4eb9c160e03d817 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 25 Feb 2009 13:15:52 +0800 Subject: PCI: fix incorrect mask of PM No_Soft_Reset bit Reviewed-by: Matthew Wilcox Signed-off-by: Yu Zhao Signed-off-by: Jesse Barnes --- include/linux/pci_regs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pci_regs.h b/include/linux/pci_regs.h index 027815b4635e..b647a4df59fc 100644 --- a/include/linux/pci_regs.h +++ b/include/linux/pci_regs.h @@ -235,7 +235,7 @@ #define PCI_PM_CAP_PME_SHIFT 11 /* Start of the PME Mask in PMC */ #define PCI_PM_CTRL 4 /* PM control and status register */ #define PCI_PM_CTRL_STATE_MASK 0x0003 /* Current power state (D0 to D3) */ -#define PCI_PM_CTRL_NO_SOFT_RESET 0x0004 /* No reset for D3hot->D0 */ +#define PCI_PM_CTRL_NO_SOFT_RESET 0x0008 /* No reset for D3hot->D0 */ #define PCI_PM_CTRL_PME_ENABLE 0x0100 /* PME pin enable */ #define PCI_PM_CTRL_DATA_SEL_MASK 0x1e00 /* Data select (??) */ #define PCI_PM_CTRL_DATA_SCALE_MASK 0x6000 /* Data scale (??) 
*/ -- cgit v1.2.3 From 24d27553390c69d11cdbd930d635193956fc295f Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 17 Mar 2009 08:54:06 -0400 Subject: PCI MSI: Replace 'type' with 'is_msix' By changing from a 5-bit field to a 1-bit field, we free up some bits that can be used by a later patch. Also rearrange the fields for better packing on 64-bit platforms (reducing the size of msi_desc from 72 bytes to 64 bytes). Signed-off-by: Matthew Wilcox Signed-off-by: Jesse Barnes --- drivers/pci/msi.c | 115 ++++++++++++++++++---------------------------------- include/linux/msi.h | 4 +- 2 files changed, 41 insertions(+), 78 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index dceea56f7342..b3db4388f974 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -111,20 +111,10 @@ static void msix_flush_writes(struct irq_desc *desc) entry = get_irq_desc_msi(desc); BUG_ON(!entry || !entry->dev); - switch (entry->msi_attrib.type) { - case PCI_CAP_ID_MSI: - /* nothing to do */ - break; - case PCI_CAP_ID_MSIX: - { + if (entry->msi_attrib.is_msix) { int offset = entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET; readl(entry->mask_base + offset); - break; - } - default: - BUG(); - break; } } @@ -143,32 +133,23 @@ static int msi_set_mask_bits(struct irq_desc *desc, u32 mask, u32 flag) entry = get_irq_desc_msi(desc); BUG_ON(!entry || !entry->dev); - switch (entry->msi_attrib.type) { - case PCI_CAP_ID_MSI: - if (entry->msi_attrib.maskbit) { - int pos; - u32 mask_bits; - - pos = (long)entry->mask_base; - pci_read_config_dword(entry->dev, pos, &mask_bits); - mask_bits &= ~(mask); - mask_bits |= flag & mask; - pci_write_config_dword(entry->dev, pos, mask_bits); - } else { - return 0; - } - break; - case PCI_CAP_ID_MSIX: - { + if (entry->msi_attrib.is_msix) { int offset = entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET; writel(flag, entry->mask_base + offset); readl(entry->mask_base + offset); - break; - } - default: - BUG(); - break; + } else { + int pos; + u32 mask_bits; + + if (!entry->msi_attrib.maskbit) + return 0; + + pos = (long)entry->mask_base; + pci_read_config_dword(entry->dev, pos, &mask_bits); + mask_bits &= ~mask; + mask_bits |= flag & mask; + pci_write_config_dword(entry->dev, pos, mask_bits); } entry->msi_attrib.masked = !!flag; return 1; @@ -177,9 +158,14 @@ static int msi_set_mask_bits(struct irq_desc *desc, u32 mask, u32 flag) void read_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg) { struct msi_desc *entry = get_irq_desc_msi(desc); - switch(entry->msi_attrib.type) { - case PCI_CAP_ID_MSI: - { + if (entry->msi_attrib.is_msix) { + void __iomem *base = entry->mask_base + + entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE; + + msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET); + msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET); + msg->data = readl(base + PCI_MSIX_ENTRY_DATA_OFFSET); + } else { struct pci_dev *dev = entry->dev; int pos = entry->msi_attrib.pos; u16 data; @@ -195,21 +181,6 @@ void read_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg) pci_read_config_word(dev, msi_data_reg(pos, 0), &data); } msg->data = data; - break; - } - case PCI_CAP_ID_MSIX: - { - void __iomem *base; - base = entry->mask_base + - entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE; - - msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET); - msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET); - msg->data = readl(base + 
PCI_MSIX_ENTRY_DATA_OFFSET); - break; - } - default: - BUG(); } } @@ -223,9 +194,17 @@ void read_msi_msg(unsigned int irq, struct msi_msg *msg) void write_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg) { struct msi_desc *entry = get_irq_desc_msi(desc); - switch (entry->msi_attrib.type) { - case PCI_CAP_ID_MSI: - { + if (entry->msi_attrib.is_msix) { + void __iomem *base; + base = entry->mask_base + + entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE; + + writel(msg->address_lo, + base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET); + writel(msg->address_hi, + base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET); + writel(msg->data, base + PCI_MSIX_ENTRY_DATA_OFFSET); + } else { struct pci_dev *dev = entry->dev; int pos = entry->msi_attrib.pos; @@ -240,23 +219,6 @@ void write_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg) pci_write_config_word(dev, msi_data_reg(pos, 0), msg->data); } - break; - } - case PCI_CAP_ID_MSIX: - { - void __iomem *base; - base = entry->mask_base + - entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE; - - writel(msg->address_lo, - base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET); - writel(msg->address_hi, - base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET); - writel(msg->data, base + PCI_MSIX_ENTRY_DATA_OFFSET); - break; - } - default: - BUG(); } entry->msg = *msg; } @@ -393,7 +355,7 @@ static int msi_capability_init(struct pci_dev *dev) if (!entry) return -ENOMEM; - entry->msi_attrib.type = PCI_CAP_ID_MSI; + entry->msi_attrib.is_msix = 0; entry->msi_attrib.is_64 = is_64bit_address(control); entry->msi_attrib.entry_nr = 0; entry->msi_attrib.maskbit = is_mask_bit_support(control); @@ -475,7 +437,7 @@ static int msix_capability_init(struct pci_dev *dev, break; j = entries[i].entry; - entry->msi_attrib.type = PCI_CAP_ID_MSIX; + entry->msi_attrib.is_msix = 1; entry->msi_attrib.is_64 = 1; entry->msi_attrib.entry_nr = j; entry->msi_attrib.maskbit = 1; @@ -619,12 +581,13 @@ void pci_msi_shutdown(struct pci_dev* dev) struct irq_desc *desc = irq_to_desc(dev->irq); msi_set_mask_bits(desc, mask, ~mask); } - if (!entry->dev || entry->msi_attrib.type != PCI_CAP_ID_MSI) + if (!entry->dev || entry->msi_attrib.is_msix) return; /* Restore dev->irq to its default pin-assertion irq */ dev->irq = entry->msi_attrib.default_irq; } + void pci_disable_msi(struct pci_dev* dev) { struct msi_desc *entry; @@ -635,7 +598,7 @@ void pci_disable_msi(struct pci_dev* dev) pci_msi_shutdown(dev); entry = list_entry(dev->msi_list.next, struct msi_desc, list); - if (!entry->dev || entry->msi_attrib.type != PCI_CAP_ID_MSI) + if (!entry->dev || entry->msi_attrib.is_msix) return; msi_free_irqs(dev); @@ -654,7 +617,7 @@ static int msi_free_irqs(struct pci_dev* dev) arch_teardown_msi_irqs(dev); list_for_each_entry_safe(entry, tmp, &dev->msi_list, list) { - if (entry->msi_attrib.type == PCI_CAP_ID_MSIX) { + if (entry->msi_attrib.is_msix) { writel(1, entry->mask_base + entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET); diff --git a/include/linux/msi.h b/include/linux/msi.h index d2b8a1e8ca11..9c5ce214fbf4 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -20,13 +20,13 @@ extern void write_msi_msg(unsigned int irq, struct msi_msg *msg); struct msi_desc { struct { - __u8 type : 5; /* {0: unused, 5h:MSI, 11h:MSI-X} */ + __u8 is_msix : 1; __u8 maskbit : 1; /* mask-pending bit supported ? 
*/ __u8 masked : 1; __u8 is_64 : 1; /* Address size: 0=32bit 1=64bit */ __u8 pos; /* Location of the msi capability */ - __u32 maskbits_mask; /* mask bits mask */ __u16 entry_nr; /* specific enabled entry */ + __u32 maskbits_mask; /* mask bits mask */ unsigned default_irq; /* default pre-assigned irq */ }msi_attrib; -- cgit v1.2.3 From 264d9caaa1c574c0274b019a810abfe957391005 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 17 Mar 2009 08:54:08 -0400 Subject: PCI MSI: Use mask_pos instead of mask_base when appropriate MSI interrupts have a mask_pos where MSI-X have a mask_base. Use a transparent union to get rid of some ugly casts. Signed-off-by: Matthew Wilcox Signed-off-by: Jesse Barnes --- drivers/pci/msi.c | 5 ++--- include/linux/msi.h | 5 ++++- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index a658c0f34e16..fcde04df6dfe 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -145,7 +145,7 @@ static int msi_set_mask_bits(struct irq_desc *desc, u32 mask, u32 flag) if (!entry->msi_attrib.maskbit) return 0; - pos = (long)entry->mask_base; + pos = entry->mask_pos; pci_read_config_dword(entry->dev, pos, &mask_bits); mask_bits &= ~mask; mask_bits |= flag & mask; @@ -363,8 +363,7 @@ static int msi_capability_init(struct pci_dev *dev) unsigned int base, maskbits, temp; base = msi_mask_bits_reg(pos, entry->msi_attrib.is_64); - entry->mask_base = (void __iomem *)(long)base; - + entry->mask_pos = base; /* All MSIs are unmasked by default, Mask them all */ pci_read_config_dword(dev, base, &maskbits); temp = msi_mask((control & PCI_MSI_FLAGS_QMASK) >> 1); diff --git a/include/linux/msi.h b/include/linux/msi.h index 9c5ce214fbf4..5025ca4d91e4 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -33,7 +33,10 @@ struct msi_desc { unsigned int irq; struct list_head list; - void __iomem *mask_base; + union { + void __iomem *mask_base; + u8 mask_pos; + }; struct pci_dev *dev; /* Last set MSI message */ -- cgit v1.2.3 From f2440d9acbe866b917b16cc0f927366341ce9215 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 17 Mar 2009 08:54:09 -0400 Subject: PCI MSI: Refactor interrupt masking code Since most of the callers already know whether they have an MSI or an MSI-X capability, split msi_set_mask_bits() into msi_mask_irq() and msix_mask_irq(). The only callers which don't (mask_msi_irq() and unmask_msi_irq()) can share code in msi_set_mask_bit(). This then becomes the only caller of msix_flush_writes(), so we can inline it. The flushing read can be to any address that belongs to the device, so we can eliminate the calculation too. We can also get rid of maskbits_mask from struct msi_desc and simply recalculate it on the rare occasion that we need it. The single-bit 'masked' element is replaced by a copy of the 32-bit 'masked' register, so this patch does not affect the size of msi_desc. 
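For illustration, the cached-mask bookkeeping described here boils down to the following standalone sketch (a toy model only: cfg_write() and the local names stand in for pci_write_config_dword() and the msi_desc.masked cache; the real helpers in the patch are msi_mask_irq()/msix_mask_irq()):

	#include <stdint.h>
	#include <stdio.h>

	/* Stand-in for the single config-space write to the MSI mask register. */
	static uint32_t device_mask_reg;

	static void cfg_write(uint32_t val)
	{
		device_mask_reg = val;
	}

	/*
	 * Same shape as msi_mask_irq(): update only the bits selected by
	 * 'mask' using the cached copy, then perform one write -- no
	 * config-space read is needed to learn the current state.
	 */
	static uint32_t msi_mask_update(uint32_t cached, uint32_t mask, uint32_t flag)
	{
		cached &= ~mask;
		cached |= flag;
		cfg_write(cached);
		return cached;		/* caller stores this back in its cache */
	}

	int main(void)
	{
		uint32_t cached = 0;	/* all vectors start unmasked */

		cached = msi_mask_update(cached, 1u << 2, 1u << 2); /* mask vector 2 */
		cached = msi_mask_update(cached, 1u << 0, 0);       /* vector 0 stays unmasked */
		printf("cache=%#x device=%#x\n", cached, device_mask_reg);
		return 0;
	}
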
Signed-off-by: Matthew Wilcox Signed-off-by: Jesse Barnes --- drivers/pci/msi.c | 155 +++++++++++++++++++++++++--------------------------- include/linux/msi.h | 5 +- 2 files changed, 77 insertions(+), 83 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index fcde04df6dfe..adcc78242571 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -105,17 +105,14 @@ static inline __attribute_const__ u32 msi_mask(unsigned x) return (1 << (1 << x)) - 1; } -static void msix_flush_writes(struct irq_desc *desc) +static inline __attribute_const__ u32 msi_capable_mask(u16 control) { - struct msi_desc *entry; + return msi_mask((control >> 1) & 7); +} - entry = get_irq_desc_msi(desc); - BUG_ON(!entry); - if (entry->msi_attrib.is_msix) { - int offset = entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE + - PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET; - readl(entry->mask_base + offset); - } +static inline __attribute_const__ u32 msi_enabled_mask(u16 control) +{ + return msi_mask((control >> 4) & 7); } /* @@ -127,32 +124,57 @@ static void msix_flush_writes(struct irq_desc *desc) * Returns 1 if it succeeded in masking the interrupt and 0 if the device * doesn't support MSI masking. */ -static int msi_set_mask_bits(struct irq_desc *desc, u32 mask, u32 flag) +static void msi_mask_irq(struct msi_desc *desc, u32 mask, u32 flag) { - struct msi_desc *entry; + u32 mask_bits = desc->masked; - entry = get_irq_desc_msi(desc); - BUG_ON(!entry); - if (entry->msi_attrib.is_msix) { - int offset = entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE + - PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET; - writel(flag, entry->mask_base + offset); - readl(entry->mask_base + offset); - } else { - int pos; - u32 mask_bits; + if (!desc->msi_attrib.maskbit) + return; + + mask_bits &= ~mask; + mask_bits |= flag; + pci_write_config_dword(desc->dev, desc->mask_pos, mask_bits); + desc->masked = mask_bits; +} + +/* + * This internal function does not flush PCI writes to the device. + * All users must ensure that they read from the device before either + * assuming that the device state is up to date, or returning out of this + * file. This saves a few milliseconds when initialising devices with lots + * of MSI-X interrupts. 
+ */ +static void msix_mask_irq(struct msi_desc *desc, u32 flag) +{ + u32 mask_bits = desc->masked; + unsigned offset = desc->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE + + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET; + mask_bits &= ~1; + mask_bits |= flag; + writel(mask_bits, desc->mask_base + offset); + desc->masked = mask_bits; +} - if (!entry->msi_attrib.maskbit) - return 0; +static void msi_set_mask_bit(unsigned irq, u32 flag) +{ + struct msi_desc *desc = get_irq_msi(irq); - pos = entry->mask_pos; - pci_read_config_dword(entry->dev, pos, &mask_bits); - mask_bits &= ~mask; - mask_bits |= flag & mask; - pci_write_config_dword(entry->dev, pos, mask_bits); + if (desc->msi_attrib.is_msix) { + msix_mask_irq(desc, flag); + readl(desc->mask_base); /* Flush write to device */ + } else { + msi_mask_irq(desc, 1, flag); } - entry->msi_attrib.masked = !!flag; - return 1; +} + +void mask_msi_irq(unsigned int irq) +{ + msi_set_mask_bit(irq, 1); +} + +void unmask_msi_irq(unsigned int irq) +{ + msi_set_mask_bit(irq, 0); } void read_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg) @@ -230,22 +252,6 @@ void write_msi_msg(unsigned int irq, struct msi_msg *msg) write_msi_msg_desc(desc, msg); } -void mask_msi_irq(unsigned int irq) -{ - struct irq_desc *desc = irq_to_desc(irq); - - msi_set_mask_bits(desc, 1, 1); - msix_flush_writes(desc); -} - -void unmask_msi_irq(unsigned int irq) -{ - struct irq_desc *desc = irq_to_desc(irq); - - msi_set_mask_bits(desc, 1, 0); - msix_flush_writes(desc); -} - static int msi_free_irqs(struct pci_dev* dev); static struct msi_desc *alloc_msi_entry(struct pci_dev *dev) @@ -281,13 +287,9 @@ static void __pci_restore_msi_state(struct pci_dev *dev) pci_intx_for_msi(dev, 0); msi_set_enable(dev, 0); write_msi_msg(dev->irq, &entry->msg); - if (entry->msi_attrib.maskbit) { - struct irq_desc *desc = irq_to_desc(dev->irq); - msi_set_mask_bits(desc, entry->msi_attrib.maskbits_mask, - entry->msi_attrib.masked); - } pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &control); + msi_mask_irq(entry, msi_capable_mask(control), entry->masked); control &= ~PCI_MSI_FLAGS_QSIZE; control |= PCI_MSI_FLAGS_ENABLE; pci_write_config_word(dev, pos + PCI_MSI_FLAGS, control); @@ -307,9 +309,8 @@ static void __pci_restore_msix_state(struct pci_dev *dev) msix_set_enable(dev, 0); list_for_each_entry(entry, &dev->msi_list, list) { - struct irq_desc *desc = irq_to_desc(entry->irq); write_msi_msg(entry->irq, &entry->msg); - msi_set_mask_bits(desc, 1, entry->msi_attrib.masked); + msix_mask_irq(entry, entry->masked); } BUG_ON(list_empty(&dev->msi_list)); @@ -342,6 +343,7 @@ static int msi_capability_init(struct pci_dev *dev) struct msi_desc *entry; int pos, ret; u16 control; + unsigned mask; msi_set_enable(dev, 0); /* Ensure msi is disabled as I set it up */ @@ -356,21 +358,16 @@ static int msi_capability_init(struct pci_dev *dev) entry->msi_attrib.is_64 = is_64bit_address(control); entry->msi_attrib.entry_nr = 0; entry->msi_attrib.maskbit = is_mask_bit_support(control); - entry->msi_attrib.masked = 1; entry->msi_attrib.default_irq = dev->irq; /* Save IOAPIC IRQ */ entry->msi_attrib.pos = pos; - if (entry->msi_attrib.maskbit) { - unsigned int base, maskbits, temp; - - base = msi_mask_bits_reg(pos, entry->msi_attrib.is_64); - entry->mask_pos = base; - /* All MSIs are unmasked by default, Mask them all */ - pci_read_config_dword(dev, base, &maskbits); - temp = msi_mask((control & PCI_MSI_FLAGS_QMASK) >> 1); - maskbits |= temp; - pci_write_config_dword(dev, base, maskbits); - entry->msi_attrib.maskbits_mask = temp; - } 
+ + entry->mask_pos = msi_mask_bits_reg(pos, entry->msi_attrib.is_64); + /* All MSIs are unmasked by default, Mask them all */ + if (entry->msi_attrib.maskbit) + pci_read_config_dword(dev, entry->mask_pos, &entry->masked); + mask = msi_capable_mask(control); + msi_mask_irq(entry, mask, mask); + list_add_tail(&entry->list, &dev->msi_list); /* Configure MSI capability structure */ @@ -435,11 +432,12 @@ static int msix_capability_init(struct pci_dev *dev, entry->msi_attrib.is_msix = 1; entry->msi_attrib.is_64 = 1; entry->msi_attrib.entry_nr = j; - entry->msi_attrib.maskbit = 1; - entry->msi_attrib.masked = 1; entry->msi_attrib.default_irq = dev->irq; entry->msi_attrib.pos = pos; entry->mask_base = base; + entry->masked = readl(base + j * PCI_MSIX_ENTRY_SIZE + + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET); + msix_mask_irq(entry, 1); list_add_tail(&entry->list, &dev->msi_list); } @@ -556,9 +554,11 @@ int pci_enable_msi(struct pci_dev* dev) } EXPORT_SYMBOL(pci_enable_msi); -void pci_msi_shutdown(struct pci_dev* dev) +void pci_msi_shutdown(struct pci_dev *dev) { - struct msi_desc *entry; + struct msi_desc *desc; + u32 mask; + u16 ctrl; if (!pci_msi_enable || !dev || !dev->msi_enabled) return; @@ -568,18 +568,13 @@ void pci_msi_shutdown(struct pci_dev* dev) dev->msi_enabled = 0; BUG_ON(list_empty(&dev->msi_list)); - entry = list_entry(dev->msi_list.next, struct msi_desc, list); - /* Return the the pci reset with msi irqs unmasked */ - if (entry->msi_attrib.maskbit) { - u32 mask = entry->msi_attrib.maskbits_mask; - struct irq_desc *desc = irq_to_desc(dev->irq); - msi_set_mask_bits(desc, mask, ~mask); - } - if (entry->msi_attrib.is_msix) - return; + desc = list_first_entry(&dev->msi_list, struct msi_desc, list); + pci_read_config_word(dev, desc->msi_attrib.pos + PCI_MSI_FLAGS, &ctrl); + mask = msi_capable_mask(ctrl); + msi_mask_irq(desc, mask, ~mask); /* Restore dev->irq to its default pin-assertion irq */ - dev->irq = entry->msi_attrib.default_irq; + dev->irq = desc->msi_attrib.default_irq; } void pci_disable_msi(struct pci_dev* dev) diff --git a/include/linux/msi.h b/include/linux/msi.h index 5025ca4d91e4..37c1bbe546e5 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -22,14 +22,13 @@ struct msi_desc { struct { __u8 is_msix : 1; __u8 maskbit : 1; /* mask-pending bit supported ? */ - __u8 masked : 1; __u8 is_64 : 1; /* Address size: 0=32bit 1=64bit */ __u8 pos; /* Location of the msi capability */ __u16 entry_nr; /* specific enabled entry */ - __u32 maskbits_mask; /* mask bits mask */ unsigned default_irq; /* default pre-assigned irq */ - }msi_attrib; + } msi_attrib; + u32 masked; /* mask bits */ unsigned int irq; struct list_head list; -- cgit v1.2.3 From 1c8d7b0a562da06d3ebe83f01b1ed553205d1ae4 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 17 Mar 2009 08:54:10 -0400 Subject: PCI MSI: Add support for multiple MSI Add the new API pci_enable_msi_block() to allow drivers to request multiple MSI and reimplement pci_enable_msi in terms of pci_enable_msi_block. Ensure that the architecture back ends don't have to know about multiple MSI. 
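For illustration, a minimal sketch of how a driver might consume the new call, following the retry semantics spelled out in the MSI-HOWTO text added below. The surrounding driver (pdev, my_handler, the "mydev" name) is hypothetical; only pci_enable_msi_block(), pci_disable_msi(), request_irq() and free_irq() are real kernel APIs.

	#include <linux/pci.h>
	#include <linux/interrupt.h>

	/* Assumed driver interrupt handler. */
	static irqreturn_t my_handler(int irq, void *data);

	static int my_enable_msi(struct pci_dev *pdev, int nvec)
	{
		int rc, i;

		/*
		 * A positive return value is a smaller vector count that might
		 * be available; retry with it, as the HOWTO suggests.  It is
		 * not guaranteed to succeed even then.
		 */
		do {
			rc = pci_enable_msi_block(pdev, nvec);
			if (rc > 0)
				nvec = rc;
		} while (rc > 0);

		if (rc < 0)
			return rc;	/* fall back to a single MSI or INTx */

		/* On success the vectors are pdev->irq .. pdev->irq + nvec - 1. */
		for (i = 0; i < nvec; i++) {
			rc = request_irq(pdev->irq + i, my_handler, 0,
					 "mydev", pdev);
			if (rc)
				goto undo;
		}
		return nvec;

	undo:
		while (--i >= 0)
			free_irq(pdev->irq + i, pdev);
		pci_disable_msi(pdev);
		return rc;
	}
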
Signed-off-by: Matthew Wilcox Signed-off-by: Jesse Barnes --- Documentation/PCI/MSI-HOWTO.txt | 45 +++++++++++++++++--- arch/powerpc/kernel/msi.c | 4 ++ arch/x86/kernel/io_apic.c | 4 ++ drivers/pci/msi.c | 91 +++++++++++++++++++++++++++++------------ drivers/pci/msi.h | 6 --- include/linux/msi.h | 1 + include/linux/pci.h | 6 ++- 7 files changed, 116 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/Documentation/PCI/MSI-HOWTO.txt b/Documentation/PCI/MSI-HOWTO.txt index 1c02431f1d1a..9494f6dc38eb 100644 --- a/Documentation/PCI/MSI-HOWTO.txt +++ b/Documentation/PCI/MSI-HOWTO.txt @@ -94,15 +94,48 @@ This function should be called before the driver calls request_irq() since enabling MSIs disables the pin-based IRQ and the driver will not receive interrupts on the old interrupt. -4.2.2 pci_disable_msi +4.2.2 pci_enable_msi_block + +int pci_enable_msi_block(struct pci_dev *dev, int count) + +This variation on the above call allows a device driver to request multiple +MSIs. The MSI specification only allows interrupts to be allocated in +powers of two, up to a maximum of 2^5 (32). + +If this function returns 0, it has succeeded in allocating at least as many +interrupts as the driver requested (it may have allocated more in order +to satisfy the power-of-two requirement). In this case, the function +enables MSI on this device and updates dev->irq to be the lowest of +the new interrupts assigned to it. The other interrupts assigned to +the device are in the range dev->irq to dev->irq + count - 1. + +If this function returns a negative number, it indicates an error and +the driver should not attempt to request any more MSI interrupts for +this device. If this function returns a positive number, it will be +less than 'count' and indicate the number of interrupts that could have +been allocated. In neither case will the irq value have been +updated, nor will the device have been switched into MSI mode. + +The device driver must decide what action to take if +pci_enable_msi_block() returns a value less than the number asked for. +Some devices can make use of fewer interrupts than the maximum they +request; in this case the driver should call pci_enable_msi_block() +again. Note that it is not guaranteed to succeed, even when the +'count' has been reduced to the value returned from a previous call to +pci_enable_msi_block(). This is because there are multiple constraints +on the number of vectors that can be allocated; pci_enable_msi_block() +will return as soon as it finds any constraint that doesn't allow the +call to succeed. + +4.2.3 pci_disable_msi void pci_disable_msi(struct pci_dev *dev) -This function should be used to undo the effect of pci_enable_msi(). -Calling it restores dev->irq to the pin-based interrupt number and frees -the previously allocated message signaled interrupt(s). The interrupt -may subsequently be assigned to another device, so drivers should not -cache the value of dev->irq. +This function should be used to undo the effect of pci_enable_msi() or +pci_enable_msi_block(). Calling it restores dev->irq to the pin-based +interrupt number and frees the previously allocated message signaled +interrupt(s). The interrupt may subsequently be assigned to another +device, so drivers should not cache the value of dev->irq. A device driver must always call free_irq() on the interrupt(s) for which it has called request_irq() before calling this function. 
diff --git a/arch/powerpc/kernel/msi.c b/arch/powerpc/kernel/msi.c index 3bb7d3dd28be..0c16e2a854e5 100644 --- a/arch/powerpc/kernel/msi.c +++ b/arch/powerpc/kernel/msi.c @@ -19,6 +19,10 @@ int arch_msi_check_device(struct pci_dev* dev, int nvec, int type) return -ENOSYS; } + /* PowerPC doesn't support multiple MSI yet */ + if (type == PCI_CAP_ID_MSI && nvec > 1) + return 1; + if (ppc_md.msi_check_device) { pr_debug("msi: Using platform check routine.\n"); return ppc_md.msi_check_device(dev, nvec, type); diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index bc7ac4da90d7..a09549a6321b 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -3510,6 +3510,10 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) int index = 0; #endif + /* x86 doesn't support multiple MSI yet */ + if (type == PCI_CAP_ID_MSI && nvec > 1) + return 1; + irq_want = nr_irqs_gsi; sub_handle = 0; list_for_each_entry(msidesc, &dev->msi_list, list) { diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index adcc78242571..6f2e6295e773 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -40,6 +40,13 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) struct msi_desc *entry; int ret; + /* + * If an architecture wants to support multiple MSI, it needs to + * override arch_setup_msi_irqs() + */ + if (type == PCI_CAP_ID_MSI && nvec > 1) + return 1; + list_for_each_entry(entry, &dev->msi_list, list) { ret = arch_setup_msi_irq(dev, entry); if (ret < 0) @@ -58,8 +65,12 @@ void arch_teardown_msi_irqs(struct pci_dev *dev) struct msi_desc *entry; list_for_each_entry(entry, &dev->msi_list, list) { - if (entry->irq != 0) - arch_teardown_msi_irq(entry->irq); + int i, nvec; + if (entry->irq == 0) + continue; + nvec = 1 << entry->msi_attrib.multiple; + for (i = 0; i < nvec; i++) + arch_teardown_msi_irq(entry->irq + i); } } #endif @@ -163,7 +174,8 @@ static void msi_set_mask_bit(unsigned irq, u32 flag) msix_mask_irq(desc, flag); readl(desc->mask_base); /* Flush write to device */ } else { - msi_mask_irq(desc, 1, flag); + unsigned offset = irq - desc->dev->irq; + msi_mask_irq(desc, 1 << offset, flag << offset); } } @@ -229,6 +241,12 @@ void write_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg) } else { struct pci_dev *dev = entry->dev; int pos = entry->msi_attrib.pos; + u16 msgctl; + + pci_read_config_word(dev, msi_control_reg(pos), &msgctl); + msgctl &= ~PCI_MSI_FLAGS_QSIZE; + msgctl |= entry->msi_attrib.multiple << 4; + pci_write_config_word(dev, msi_control_reg(pos), msgctl); pci_write_config_dword(dev, msi_lower_address_reg(pos), msg->address_lo); @@ -291,7 +309,7 @@ static void __pci_restore_msi_state(struct pci_dev *dev) pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &control); msi_mask_irq(entry, msi_capable_mask(control), entry->masked); control &= ~PCI_MSI_FLAGS_QSIZE; - control |= PCI_MSI_FLAGS_ENABLE; + control |= (entry->msi_attrib.multiple << 4) | PCI_MSI_FLAGS_ENABLE; pci_write_config_word(dev, pos + PCI_MSI_FLAGS, control); } @@ -332,13 +350,15 @@ EXPORT_SYMBOL_GPL(pci_restore_msi_state); /** * msi_capability_init - configure device's MSI capability structure * @dev: pointer to the pci_dev data structure of MSI device function + * @nvec: number of interrupts to allocate * - * Setup the MSI capability structure of device function with a single - * MSI irq, regardless of device function is capable of handling - * multiple messages. A return of zero indicates the successful setup - * of an entry zero with the new MSI irq or non-zero for otherwise. 
- **/ -static int msi_capability_init(struct pci_dev *dev) + * Setup the MSI capability structure of the device with the requested + * number of interrupts. A return value of zero indicates the successful + * setup of an entry with the new MSI irq. A negative return value indicates + * an error, and a positive return value indicates the number of interrupts + * which could have been allocated. + */ +static int msi_capability_init(struct pci_dev *dev, int nvec) { struct msi_desc *entry; int pos, ret; @@ -371,7 +391,7 @@ static int msi_capability_init(struct pci_dev *dev) list_add_tail(&entry->list, &dev->msi_list); /* Configure MSI capability structure */ - ret = arch_setup_msi_irqs(dev, 1, PCI_CAP_ID_MSI); + ret = arch_setup_msi_irqs(dev, nvec, PCI_CAP_ID_MSI); if (ret) { msi_free_irqs(dev); return ret; @@ -524,35 +544,48 @@ static int pci_msi_check_device(struct pci_dev* dev, int nvec, int type) } /** - * pci_enable_msi - configure device's MSI capability structure - * @dev: pointer to the pci_dev data structure of MSI device function + * pci_enable_msi_block - configure device's MSI capability structure + * @dev: device to configure + * @nvec: number of interrupts to configure * - * Setup the MSI capability structure of device function with - * a single MSI irq upon its software driver call to request for - * MSI mode enabled on its hardware device function. A return of zero - * indicates the successful setup of an entry zero with the new MSI - * irq or non-zero for otherwise. - **/ -int pci_enable_msi(struct pci_dev* dev) + * Allocate IRQs for a device with the MSI capability. + * This function returns a negative errno if an error occurs. If it + * is unable to allocate the number of interrupts requested, it returns + * the number of interrupts it might be able to allocate. If it successfully + * allocates at least the number of interrupts requested, it returns 0 and + * updates the @dev's irq member to the lowest new interrupt number; the + * other interrupt numbers allocated to this device are consecutive. 
+ */ +int pci_enable_msi_block(struct pci_dev *dev, unsigned int nvec) { - int status; + int status, pos, maxvec; + u16 msgctl; + + pos = pci_find_capability(dev, PCI_CAP_ID_MSI); + if (!pos) + return -EINVAL; + pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &msgctl); + maxvec = 1 << ((msgctl & PCI_MSI_FLAGS_QMASK) >> 1); + if (nvec > maxvec) + return maxvec; - status = pci_msi_check_device(dev, 1, PCI_CAP_ID_MSI); + status = pci_msi_check_device(dev, nvec, PCI_CAP_ID_MSI); if (status) return status; WARN_ON(!!dev->msi_enabled); - /* Check whether driver already requested for MSI-X irqs */ + /* Check whether driver already requested MSI-X irqs */ if (dev->msix_enabled) { dev_info(&dev->dev, "can't enable MSI " "(MSI-X already enabled)\n"); return -EINVAL; } - status = msi_capability_init(dev); + + status = msi_capability_init(dev, nvec); return status; } -EXPORT_SYMBOL(pci_enable_msi); +EXPORT_SYMBOL(pci_enable_msi_block); void pci_msi_shutdown(struct pci_dev *dev) { @@ -599,8 +632,12 @@ static int msi_free_irqs(struct pci_dev* dev) struct msi_desc *entry, *tmp; list_for_each_entry(entry, &dev->msi_list, list) { - if (entry->irq) - BUG_ON(irq_has_action(entry->irq)); + int i, nvec; + if (!entry->irq) + continue; + nvec = 1 << entry->msi_attrib.multiple; + for (i = 0; i < nvec; i++) + BUG_ON(irq_has_action(entry->irq + i)); } arch_teardown_msi_irqs(dev); diff --git a/drivers/pci/msi.h b/drivers/pci/msi.h index 3898f5237144..71f4df2ef654 100644 --- a/drivers/pci/msi.h +++ b/drivers/pci/msi.h @@ -20,14 +20,8 @@ #define msi_mask_bits_reg(base, is64bit) \ ( (is64bit == 1) ? base+PCI_MSI_MASK_BIT : base+PCI_MSI_MASK_BIT-4) #define msi_disable(control) control &= ~PCI_MSI_FLAGS_ENABLE -#define multi_msi_capable(control) \ - (1 << ((control & PCI_MSI_FLAGS_QMASK) >> 1)) -#define multi_msi_enable(control, num) \ - control |= (((num >> 1) << 4) & PCI_MSI_FLAGS_QSIZE); #define is_64bit_address(control) (!!(control & PCI_MSI_FLAGS_64BIT)) #define is_mask_bit_support(control) (!!(control & PCI_MSI_FLAGS_MASKBIT)) -#define msi_enable(control, num) multi_msi_enable(control, num); \ - control |= PCI_MSI_FLAGS_ENABLE #define msix_table_offset_reg(base) (base + 0x04) #define msix_pba_offset_reg(base) (base + 0x08) diff --git a/include/linux/msi.h b/include/linux/msi.h index 37c1bbe546e5..6991ab5b24d1 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -21,6 +21,7 @@ extern void write_msi_msg(unsigned int irq, struct msi_msg *msg); struct msi_desc { struct { __u8 is_msix : 1; + __u8 multiple: 3; /* log2 number of messages */ __u8 maskbit : 1; /* mask-pending bit supported ? 
*/ __u8 is_64 : 1; /* Address size: 0=32bit 1=64bit */ __u8 pos; /* Location of the msi capability */ diff --git a/include/linux/pci.h b/include/linux/pci.h index 7baf2a5db12a..1f6c5ddaae36 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -789,7 +789,7 @@ struct msix_entry { #ifndef CONFIG_PCI_MSI -static inline int pci_enable_msi(struct pci_dev *dev) +static inline int pci_enable_msi_block(struct pci_dev *dev, unsigned int nvec) { return -1; } @@ -824,7 +824,7 @@ static inline int pci_msi_enabled(void) return 0; } #else -extern int pci_enable_msi(struct pci_dev *dev); +extern int pci_enable_msi_block(struct pci_dev *dev, unsigned int nvec); extern void pci_msi_shutdown(struct pci_dev *dev); extern void pci_disable_msi(struct pci_dev *dev); extern int pci_msix_table_size(struct pci_dev *dev); @@ -846,6 +846,8 @@ static inline int pcie_aspm_enabled(void) extern int pcie_aspm_enabled(void); #endif +#define pci_enable_msi(pdev) pci_enable_msi_block(pdev, 1) + #ifdef CONFIG_HT_IRQ /* The functions a driver should call */ int ht_create_irq(struct pci_dev *dev, int idx); -- cgit v1.2.3 From 8293b0f629095efbe7c7e3f9b437f8c040c19eb5 Mon Sep 17 00:00:00 2001 From: David O'Shea Date: Mon, 2 Mar 2009 09:51:13 +0100 Subject: PCI: Compaq Evo D510 SMBus quirk using USB instead of VGA On the Compaq Evo D510 SFF/CMT, a PCI quirk activated the SMBus device based on detection of the on-board VGA controller, but the on-board VGA is disabled if an AGP card is inserted, so look for one of the USB controllers instead. Signed-off-by: David O'Shea Signed-off-by: Jean Delvare Signed-off-by: Jesse Barnes --- drivers/pci/quirks.c | 9 +++++++-- include/linux/pci_ids.h | 1 + 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 50233818a763..7ddcfc65e790 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -1186,10 +1186,15 @@ static void __init asus_hides_smbus_hostbridge(struct pci_dev *dev) * its on-board VGA controller */ asus_hides_smbus = 1; } - else if (dev->device == PCI_DEVICE_ID_INTEL_82845G_IG) + else if (dev->device == PCI_DEVICE_ID_INTEL_82801DB_2) switch(dev->subsystem_device) { case 0x00b8: /* Compaq Evo D510 CMT */ case 0x00b9: /* Compaq Evo D510 SFF */ + /* Motherboard doesn't have Host bridge + * subvendor/subdevice IDs and on-board VGA + * controller is disabled if an AGP card is + * inserted, therefore checking USB UHCI + * Controller #1 */ asus_hides_smbus = 1; } else if (dev->device == PCI_DEVICE_ID_INTEL_82815_CGC) @@ -1214,7 +1219,7 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82855GM_HB, as DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82915GM_HB, asus_hides_smbus_hostbridge); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82810_IG3, asus_hides_smbus_hostbridge); -DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82845G_IG, asus_hides_smbus_hostbridge); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801DB_2, asus_hides_smbus_hostbridge); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82815_CGC, asus_hides_smbus_hostbridge); static void asus_hides_smbus_lpc(struct pci_dev *dev) diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index aca8c458aa8a..3ddf8beabdf8 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2373,6 +2373,7 @@ #define PCI_DEVICE_ID_INTEL_82801CA_12 0x248c #define PCI_DEVICE_ID_INTEL_82801DB_0 0x24c0 #define 
PCI_DEVICE_ID_INTEL_82801DB_1 0x24c1 +#define PCI_DEVICE_ID_INTEL_82801DB_2 0x24c2 #define PCI_DEVICE_ID_INTEL_82801DB_3 0x24c3 #define PCI_DEVICE_ID_INTEL_82801DB_5 0x24c5 #define PCI_DEVICE_ID_INTEL_82801DB_6 0x24c6 -- cgit v1.2.3 From d1b054da8f599905f3c18a218961dcf17f9d5f13 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Fri, 20 Mar 2009 11:25:11 +0800 Subject: PCI: initialize and release SR-IOV capability If a device has the SR-IOV capability, initialize it (set the ARI Capable Hierarchy in the lowest numbered PF if necessary; calculate the System Page Size for the VF MMIO, probe the VF Offset, Stride and BARs). A lock for the VF bus allocation is also initialized if a PF is the lowest numbered PF. Reviewed-by: Matthew Wilcox Signed-off-by: Yu Zhao Signed-off-by: Jesse Barnes --- drivers/pci/Kconfig | 10 +++ drivers/pci/Makefile | 2 + drivers/pci/iov.c | 182 +++++++++++++++++++++++++++++++++++++++++++++++ drivers/pci/pci.c | 7 ++ drivers/pci/pci.h | 37 ++++++++++ drivers/pci/probe.c | 4 ++ include/linux/pci.h | 11 +++ include/linux/pci_regs.h | 33 +++++++++ 8 files changed, 286 insertions(+) create mode 100644 drivers/pci/iov.c (limited to 'include/linux') diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig index 2a4501dd2515..fdc864f9cf23 100644 --- a/drivers/pci/Kconfig +++ b/drivers/pci/Kconfig @@ -59,3 +59,13 @@ config HT_IRQ This allows native hypertransport devices to use interrupts. If unsure say Y. + +config PCI_IOV + bool "PCI IOV support" + depends on PCI + help + I/O Virtualization is a PCI feature supported by some devices + which allows them to create virtual devices which share their + physical resources. + + If unsure, say N. diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile index 3d07ce24f6a8..ba6af162fd39 100644 --- a/drivers/pci/Makefile +++ b/drivers/pci/Makefile @@ -29,6 +29,8 @@ obj-$(CONFIG_DMAR) += dmar.o iova.o intel-iommu.o obj-$(CONFIG_INTR_REMAP) += dmar.o intr_remapping.o +obj-$(CONFIG_PCI_IOV) += iov.o + # # Some architectures use the generic PCI setup functions # diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c new file mode 100644 index 000000000000..66cc414ed15f --- /dev/null +++ b/drivers/pci/iov.c @@ -0,0 +1,182 @@ +/* + * drivers/pci/iov.c + * + * Copyright (C) 2009 Intel Corporation, Yu Zhao + * + * PCI Express I/O Virtualization (IOV) support. 
+ * Single Root IOV 1.0 + */ + +#include +#include +#include +#include +#include "pci.h" + + +static int sriov_init(struct pci_dev *dev, int pos) +{ + int i; + int rc; + int nres; + u32 pgsz; + u16 ctrl, total, offset, stride; + struct pci_sriov *iov; + struct resource *res; + struct pci_dev *pdev; + + if (dev->pcie_type != PCI_EXP_TYPE_RC_END && + dev->pcie_type != PCI_EXP_TYPE_ENDPOINT) + return -ENODEV; + + pci_read_config_word(dev, pos + PCI_SRIOV_CTRL, &ctrl); + if (ctrl & PCI_SRIOV_CTRL_VFE) { + pci_write_config_word(dev, pos + PCI_SRIOV_CTRL, 0); + ssleep(1); + } + + pci_read_config_word(dev, pos + PCI_SRIOV_TOTAL_VF, &total); + if (!total) + return 0; + + ctrl = 0; + list_for_each_entry(pdev, &dev->bus->devices, bus_list) + if (pdev->is_physfn) + goto found; + + pdev = NULL; + if (pci_ari_enabled(dev->bus)) + ctrl |= PCI_SRIOV_CTRL_ARI; + +found: + pci_write_config_word(dev, pos + PCI_SRIOV_CTRL, ctrl); + pci_write_config_word(dev, pos + PCI_SRIOV_NUM_VF, total); + pci_read_config_word(dev, pos + PCI_SRIOV_VF_OFFSET, &offset); + pci_read_config_word(dev, pos + PCI_SRIOV_VF_STRIDE, &stride); + if (!offset || (total > 1 && !stride)) + return -EIO; + + pci_read_config_dword(dev, pos + PCI_SRIOV_SUP_PGSIZE, &pgsz); + i = PAGE_SHIFT > 12 ? PAGE_SHIFT - 12 : 0; + pgsz &= ~((1 << i) - 1); + if (!pgsz) + return -EIO; + + pgsz &= ~(pgsz - 1); + pci_write_config_dword(dev, pos + PCI_SRIOV_SYS_PGSIZE, pgsz); + + nres = 0; + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { + res = dev->resource + PCI_IOV_RESOURCES + i; + i += __pci_read_base(dev, pci_bar_unknown, res, + pos + PCI_SRIOV_BAR + i * 4); + if (!res->flags) + continue; + if (resource_size(res) & (PAGE_SIZE - 1)) { + rc = -EIO; + goto failed; + } + res->end = res->start + resource_size(res) * total - 1; + nres++; + } + + iov = kzalloc(sizeof(*iov), GFP_KERNEL); + if (!iov) { + rc = -ENOMEM; + goto failed; + } + + iov->pos = pos; + iov->nres = nres; + iov->ctrl = ctrl; + iov->total = total; + iov->offset = offset; + iov->stride = stride; + iov->pgsz = pgsz; + iov->self = dev; + pci_read_config_dword(dev, pos + PCI_SRIOV_CAP, &iov->cap); + pci_read_config_byte(dev, pos + PCI_SRIOV_FUNC_LINK, &iov->link); + + if (pdev) + iov->dev = pci_dev_get(pdev); + else { + iov->dev = dev; + mutex_init(&iov->lock); + } + + dev->sriov = iov; + dev->is_physfn = 1; + + return 0; + +failed: + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { + res = dev->resource + PCI_IOV_RESOURCES + i; + res->flags = 0; + } + + return rc; +} + +static void sriov_release(struct pci_dev *dev) +{ + if (dev == dev->sriov->dev) + mutex_destroy(&dev->sriov->lock); + else + pci_dev_put(dev->sriov->dev); + + kfree(dev->sriov); + dev->sriov = NULL; +} + +/** + * pci_iov_init - initialize the IOV capability + * @dev: the PCI device + * + * Returns 0 on success, or negative on failure. + */ +int pci_iov_init(struct pci_dev *dev) +{ + int pos; + + if (!dev->is_pcie) + return -ENODEV; + + pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_SRIOV); + if (pos) + return sriov_init(dev, pos); + + return -ENODEV; +} + +/** + * pci_iov_release - release resources used by the IOV capability + * @dev: the PCI device + */ +void pci_iov_release(struct pci_dev *dev) +{ + if (dev->is_physfn) + sriov_release(dev); +} + +/** + * pci_iov_resource_bar - get position of the SR-IOV BAR + * @dev: the PCI device + * @resno: the resource number + * @type: the BAR type to be filled in + * + * Returns position of the BAR encapsulated in the SR-IOV capability. 
+ */ +int pci_iov_resource_bar(struct pci_dev *dev, int resno, + enum pci_bar_type *type) +{ + if (resno < PCI_IOV_RESOURCES || resno > PCI_IOV_RESOURCE_END) + return 0; + + BUG_ON(!dev->is_physfn); + + *type = pci_bar_unknown; + + return dev->sriov->pos + PCI_SRIOV_BAR + + 4 * (resno - PCI_IOV_RESOURCES); +} diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index a35a8b2ba631..2b3201ec2b05 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -2360,12 +2360,19 @@ int pci_select_bars(struct pci_dev *dev, unsigned long flags) */ int pci_resource_bar(struct pci_dev *dev, int resno, enum pci_bar_type *type) { + int reg; + if (resno < PCI_ROM_RESOURCE) { *type = pci_bar_unknown; return PCI_BASE_ADDRESS_0 + 4 * resno; } else if (resno == PCI_ROM_RESOURCE) { *type = pci_bar_mem32; return dev->rom_base_reg; + } else if (resno < PCI_BRIDGE_RESOURCES) { + /* device specific resource */ + reg = pci_iov_resource_bar(dev, resno, type); + if (reg) + return reg; } dev_err(&dev->dev, "BAR: invalid resource #%d\n", resno); diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 2cd1cba7236f..7d5327c986f5 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -201,4 +201,41 @@ resource_size_t pci_specified_resource_alignment(struct pci_dev *dev); extern void pci_disable_bridge_window(struct pci_dev *dev); #endif +/* Single Root I/O Virtualization */ +struct pci_sriov { + int pos; /* capability position */ + int nres; /* number of resources */ + u32 cap; /* SR-IOV Capabilities */ + u16 ctrl; /* SR-IOV Control */ + u16 total; /* total VFs associated with the PF */ + u16 offset; /* first VF Routing ID offset */ + u16 stride; /* following VF stride */ + u32 pgsz; /* page size for BAR alignment */ + u8 link; /* Function Dependency Link */ + struct pci_dev *dev; /* lowest numbered PF */ + struct pci_dev *self; /* this PF */ + struct mutex lock; /* lock for VF bus */ +}; + +#ifdef CONFIG_PCI_IOV +extern int pci_iov_init(struct pci_dev *dev); +extern void pci_iov_release(struct pci_dev *dev); +extern int pci_iov_resource_bar(struct pci_dev *dev, int resno, + enum pci_bar_type *type); +#else +static inline int pci_iov_init(struct pci_dev *dev) +{ + return -ENODEV; +} +static inline void pci_iov_release(struct pci_dev *dev) + +{ +} +static inline int pci_iov_resource_bar(struct pci_dev *dev, int resno, + enum pci_bar_type *type) +{ + return 0; +} +#endif /* CONFIG_PCI_IOV */ + #endif /* DRIVERS_PCI_H */ diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 579a56c8181f..0471f6ea1466 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -785,6 +785,7 @@ static int pci_setup_device(struct pci_dev * dev) static void pci_release_capabilities(struct pci_dev *dev) { pci_vpd_release(dev); + pci_iov_release(dev); } /** @@ -979,6 +980,9 @@ static void pci_init_capabilities(struct pci_dev *dev) /* Alternative Routing-ID Forwarding */ pci_enable_ari(dev); + + /* Single Root I/O Virtualization */ + pci_iov_init(dev); } void pci_device_add(struct pci_dev *dev, struct pci_bus *bus) diff --git a/include/linux/pci.h b/include/linux/pci.h index 1f6c5ddaae36..8ce2f2d9ab63 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -93,6 +93,12 @@ enum { /* #6: expansion ROM resource */ PCI_ROM_RESOURCE, + /* device specific resources */ +#ifdef CONFIG_PCI_IOV + PCI_IOV_RESOURCES, + PCI_IOV_RESOURCE_END = PCI_IOV_RESOURCES + PCI_SRIOV_NUM_BARS - 1, +#endif + /* resources assigned to buses behind the bridge */ #define PCI_BRIDGE_RESOURCE_NUM 4 @@ -180,6 +186,7 @@ struct pci_cap_saved_state { struct 
pcie_link_state; struct pci_vpd; +struct pci_sriov; /* * The pci_dev structure is used to describe PCI devices. @@ -257,6 +264,7 @@ struct pci_dev { unsigned int is_managed:1; unsigned int is_pcie:1; unsigned int state_saved:1; + unsigned int is_physfn:1; pci_dev_flags_t dev_flags; atomic_t enable_cnt; /* pci_enable_device has been called */ @@ -270,6 +278,9 @@ struct pci_dev { struct list_head msi_list; #endif struct pci_vpd *vpd; +#ifdef CONFIG_PCI_IOV + struct pci_sriov *sriov; /* SR-IOV capability related */ +#endif }; extern struct pci_dev *alloc_pci_dev(void); diff --git a/include/linux/pci_regs.h b/include/linux/pci_regs.h index b647a4df59fc..d4e663877f45 100644 --- a/include/linux/pci_regs.h +++ b/include/linux/pci_regs.h @@ -375,6 +375,7 @@ #define PCI_EXP_TYPE_UPSTREAM 0x5 /* Upstream Port */ #define PCI_EXP_TYPE_DOWNSTREAM 0x6 /* Downstream Port */ #define PCI_EXP_TYPE_PCI_BRIDGE 0x7 /* PCI/PCI-X Bridge */ +#define PCI_EXP_TYPE_RC_END 0x9 /* Root Complex Integrated Endpoint */ #define PCI_EXP_FLAGS_SLOT 0x0100 /* Slot implemented */ #define PCI_EXP_FLAGS_IRQ 0x3e00 /* Interrupt message number */ #define PCI_EXP_DEVCAP 4 /* Device capabilities */ @@ -498,6 +499,7 @@ #define PCI_EXT_CAP_ID_DSN 3 #define PCI_EXT_CAP_ID_PWR 4 #define PCI_EXT_CAP_ID_ARI 14 +#define PCI_EXT_CAP_ID_SRIOV 16 /* Advanced Error Reporting */ #define PCI_ERR_UNCOR_STATUS 4 /* Uncorrectable Error Status */ @@ -615,4 +617,35 @@ #define PCI_ARI_CTRL_ACS 0x0002 /* ACS Function Groups Enable */ #define PCI_ARI_CTRL_FG(x) (((x) >> 4) & 7) /* Function Group */ +/* Single Root I/O Virtualization */ +#define PCI_SRIOV_CAP 0x04 /* SR-IOV Capabilities */ +#define PCI_SRIOV_CAP_VFM 0x01 /* VF Migration Capable */ +#define PCI_SRIOV_CAP_INTR(x) ((x) >> 21) /* Interrupt Message Number */ +#define PCI_SRIOV_CTRL 0x08 /* SR-IOV Control */ +#define PCI_SRIOV_CTRL_VFE 0x01 /* VF Enable */ +#define PCI_SRIOV_CTRL_VFM 0x02 /* VF Migration Enable */ +#define PCI_SRIOV_CTRL_INTR 0x04 /* VF Migration Interrupt Enable */ +#define PCI_SRIOV_CTRL_MSE 0x08 /* VF Memory Space Enable */ +#define PCI_SRIOV_CTRL_ARI 0x10 /* ARI Capable Hierarchy */ +#define PCI_SRIOV_STATUS 0x0a /* SR-IOV Status */ +#define PCI_SRIOV_STATUS_VFM 0x01 /* VF Migration Status */ +#define PCI_SRIOV_INITIAL_VF 0x0c /* Initial VFs */ +#define PCI_SRIOV_TOTAL_VF 0x0e /* Total VFs */ +#define PCI_SRIOV_NUM_VF 0x10 /* Number of VFs */ +#define PCI_SRIOV_FUNC_LINK 0x12 /* Function Dependency Link */ +#define PCI_SRIOV_VF_OFFSET 0x14 /* First VF Offset */ +#define PCI_SRIOV_VF_STRIDE 0x16 /* Following VF Stride */ +#define PCI_SRIOV_VF_DID 0x1a /* VF Device ID */ +#define PCI_SRIOV_SUP_PGSIZE 0x1c /* Supported Page Sizes */ +#define PCI_SRIOV_SYS_PGSIZE 0x20 /* System Page Size */ +#define PCI_SRIOV_BAR 0x24 /* VF BAR0 */ +#define PCI_SRIOV_NUM_BARS 6 /* Number of VF BARs */ +#define PCI_SRIOV_VFM 0x3c /* VF Migration State Array Offset*/ +#define PCI_SRIOV_VFM_BIR(x) ((x) & 7) /* State BIR */ +#define PCI_SRIOV_VFM_OFFSET(x) ((x) & ~7) /* State Offset */ +#define PCI_SRIOV_VFM_UA 0x0 /* Inactive.Unavailable */ +#define PCI_SRIOV_VFM_MI 0x1 /* Dormant.MigrateIn */ +#define PCI_SRIOV_VFM_MO 0x2 /* Active.MigrateOut */ +#define PCI_SRIOV_VFM_AV 0x3 /* Active.Available */ + #endif /* LINUX_PCI_REGS_H */ -- cgit v1.2.3 From dd7cc44d0bcec5e9c42fe52e88dc254ae62eac8d Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Fri, 20 Mar 2009 11:25:15 +0800 Subject: PCI: add SR-IOV API for Physical Function driver Add or remove the Virtual Function when the SR-IOV is enabled or disabled 
by the device driver. This can happen anytime rather than only at the device probe stage. Reviewed-by: Matthew Wilcox Signed-off-by: Yu Zhao Signed-off-by: Jesse Barnes --- drivers/pci/iov.c | 314 ++++++++++++++++++++++++++++++++++++++++++++++++++++ drivers/pci/pci.h | 2 + include/linux/pci.h | 19 +++- 3 files changed, 334 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c index 5ddfc09a8d3f..d0ff8ad8f7ba 100644 --- a/drivers/pci/iov.c +++ b/drivers/pci/iov.c @@ -13,6 +13,7 @@ #include #include "pci.h" +#define VIRTFN_ID_LEN 16 static inline u8 virtfn_bus(struct pci_dev *dev, int id) { @@ -26,6 +27,284 @@ static inline u8 virtfn_devfn(struct pci_dev *dev, int id) dev->sriov->stride * id) & 0xff; } +static struct pci_bus *virtfn_add_bus(struct pci_bus *bus, int busnr) +{ + int rc; + struct pci_bus *child; + + if (bus->number == busnr) + return bus; + + child = pci_find_bus(pci_domain_nr(bus), busnr); + if (child) + return child; + + child = pci_add_new_bus(bus, NULL, busnr); + if (!child) + return NULL; + + child->subordinate = busnr; + child->dev.parent = bus->bridge; + rc = pci_bus_add_child(child); + if (rc) { + pci_remove_bus(child); + return NULL; + } + + return child; +} + +static void virtfn_remove_bus(struct pci_bus *bus, int busnr) +{ + struct pci_bus *child; + + if (bus->number == busnr) + return; + + child = pci_find_bus(pci_domain_nr(bus), busnr); + BUG_ON(!child); + + if (list_empty(&child->devices)) + pci_remove_bus(child); +} + +static int virtfn_add(struct pci_dev *dev, int id, int reset) +{ + int i; + int rc; + u64 size; + char buf[VIRTFN_ID_LEN]; + struct pci_dev *virtfn; + struct resource *res; + struct pci_sriov *iov = dev->sriov; + + virtfn = alloc_pci_dev(); + if (!virtfn) + return -ENOMEM; + + mutex_lock(&iov->dev->sriov->lock); + virtfn->bus = virtfn_add_bus(dev->bus, virtfn_bus(dev, id)); + if (!virtfn->bus) { + kfree(virtfn); + mutex_unlock(&iov->dev->sriov->lock); + return -ENOMEM; + } + virtfn->devfn = virtfn_devfn(dev, id); + virtfn->vendor = dev->vendor; + pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_DID, &virtfn->device); + pci_setup_device(virtfn); + virtfn->dev.parent = dev->dev.parent; + + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { + res = dev->resource + PCI_IOV_RESOURCES + i; + if (!res->parent) + continue; + virtfn->resource[i].name = pci_name(virtfn); + virtfn->resource[i].flags = res->flags; + size = resource_size(res); + do_div(size, iov->total); + virtfn->resource[i].start = res->start + size * id; + virtfn->resource[i].end = virtfn->resource[i].start + size - 1; + rc = request_resource(res, &virtfn->resource[i]); + BUG_ON(rc); + } + + if (reset) + pci_execute_reset_function(virtfn); + + pci_device_add(virtfn, virtfn->bus); + mutex_unlock(&iov->dev->sriov->lock); + + virtfn->physfn = pci_dev_get(dev); + virtfn->is_virtfn = 1; + + rc = pci_bus_add_device(virtfn); + if (rc) + goto failed1; + sprintf(buf, "virtfn%u", id); + rc = sysfs_create_link(&dev->dev.kobj, &virtfn->dev.kobj, buf); + if (rc) + goto failed1; + rc = sysfs_create_link(&virtfn->dev.kobj, &dev->dev.kobj, "physfn"); + if (rc) + goto failed2; + + kobject_uevent(&virtfn->dev.kobj, KOBJ_CHANGE); + + return 0; + +failed2: + sysfs_remove_link(&dev->dev.kobj, buf); +failed1: + pci_dev_put(dev); + mutex_lock(&iov->dev->sriov->lock); + pci_remove_bus_device(virtfn); + virtfn_remove_bus(dev->bus, virtfn_bus(dev, id)); + mutex_unlock(&iov->dev->sriov->lock); + + return rc; +} + +static void virtfn_remove(struct pci_dev *dev, int id, 
int reset) +{ + char buf[VIRTFN_ID_LEN]; + struct pci_bus *bus; + struct pci_dev *virtfn; + struct pci_sriov *iov = dev->sriov; + + bus = pci_find_bus(pci_domain_nr(dev->bus), virtfn_bus(dev, id)); + if (!bus) + return; + + virtfn = pci_get_slot(bus, virtfn_devfn(dev, id)); + if (!virtfn) + return; + + pci_dev_put(virtfn); + + if (reset) { + device_release_driver(&virtfn->dev); + pci_execute_reset_function(virtfn); + } + + sprintf(buf, "virtfn%u", id); + sysfs_remove_link(&dev->dev.kobj, buf); + sysfs_remove_link(&virtfn->dev.kobj, "physfn"); + + mutex_lock(&iov->dev->sriov->lock); + pci_remove_bus_device(virtfn); + virtfn_remove_bus(dev->bus, virtfn_bus(dev, id)); + mutex_unlock(&iov->dev->sriov->lock); + + pci_dev_put(dev); +} + +static int sriov_enable(struct pci_dev *dev, int nr_virtfn) +{ + int rc; + int i, j; + int nres; + u16 offset, stride, initial; + struct resource *res; + struct pci_dev *pdev; + struct pci_sriov *iov = dev->sriov; + + if (!nr_virtfn) + return 0; + + if (iov->nr_virtfn) + return -EINVAL; + + pci_read_config_word(dev, iov->pos + PCI_SRIOV_INITIAL_VF, &initial); + if (initial > iov->total || + (!(iov->cap & PCI_SRIOV_CAP_VFM) && (initial != iov->total))) + return -EIO; + + if (nr_virtfn < 0 || nr_virtfn > iov->total || + (!(iov->cap & PCI_SRIOV_CAP_VFM) && (nr_virtfn > initial))) + return -EINVAL; + + pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, nr_virtfn); + pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_OFFSET, &offset); + pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_STRIDE, &stride); + if (!offset || (nr_virtfn > 1 && !stride)) + return -EIO; + + nres = 0; + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { + res = dev->resource + PCI_IOV_RESOURCES + i; + if (res->parent) + nres++; + } + if (nres != iov->nres) { + dev_err(&dev->dev, "not enough MMIO resources for SR-IOV\n"); + return -ENOMEM; + } + + iov->offset = offset; + iov->stride = stride; + + if (virtfn_bus(dev, nr_virtfn - 1) > dev->bus->subordinate) { + dev_err(&dev->dev, "SR-IOV: bus number out of range\n"); + return -ENOMEM; + } + + if (iov->link != dev->devfn) { + pdev = pci_get_slot(dev->bus, iov->link); + if (!pdev) + return -ENODEV; + + pci_dev_put(pdev); + + if (!pdev->is_physfn) + return -ENODEV; + + rc = sysfs_create_link(&dev->dev.kobj, + &pdev->dev.kobj, "dep_link"); + if (rc) + return rc; + } + + iov->ctrl |= PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE; + pci_block_user_cfg_access(dev); + pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl); + msleep(100); + pci_unblock_user_cfg_access(dev); + + iov->initial = initial; + if (nr_virtfn < initial) + initial = nr_virtfn; + + for (i = 0; i < initial; i++) { + rc = virtfn_add(dev, i, 0); + if (rc) + goto failed; + } + + kobject_uevent(&dev->dev.kobj, KOBJ_CHANGE); + iov->nr_virtfn = nr_virtfn; + + return 0; + +failed: + for (j = 0; j < i; j++) + virtfn_remove(dev, j, 0); + + iov->ctrl &= ~(PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE); + pci_block_user_cfg_access(dev); + pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl); + ssleep(1); + pci_unblock_user_cfg_access(dev); + + if (iov->link != dev->devfn) + sysfs_remove_link(&dev->dev.kobj, "dep_link"); + + return rc; +} + +static void sriov_disable(struct pci_dev *dev) +{ + int i; + struct pci_sriov *iov = dev->sriov; + + if (!iov->nr_virtfn) + return; + + for (i = 0; i < iov->nr_virtfn; i++) + virtfn_remove(dev, i, 0); + + iov->ctrl &= ~(PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE); + pci_block_user_cfg_access(dev); + pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, 
iov->ctrl); + ssleep(1); + pci_unblock_user_cfg_access(dev); + + if (iov->link != dev->devfn) + sysfs_remove_link(&dev->dev.kobj, "dep_link"); + + iov->nr_virtfn = 0; +} + static int sriov_init(struct pci_dev *dev, int pos) { int i; @@ -132,6 +411,8 @@ failed: static void sriov_release(struct pci_dev *dev) { + BUG_ON(dev->sriov->nr_virtfn); + if (dev == dev->sriov->dev) mutex_destroy(&dev->sriov->lock); else @@ -155,6 +436,7 @@ static void sriov_restore_state(struct pci_dev *dev) pci_update_resource(dev, i); pci_write_config_dword(dev, iov->pos + PCI_SRIOV_SYS_PGSIZE, iov->pgsz); + pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, iov->nr_virtfn); pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl); if (iov->ctrl & PCI_SRIOV_CTRL_VFE) msleep(100); @@ -245,3 +527,35 @@ int pci_iov_bus_range(struct pci_bus *bus) return max ? max - bus->number : 0; } + +/** + * pci_enable_sriov - enable the SR-IOV capability + * @dev: the PCI device + * + * Returns 0 on success, or negative on failure. + */ +int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn) +{ + might_sleep(); + + if (!dev->is_physfn) + return -ENODEV; + + return sriov_enable(dev, nr_virtfn); +} +EXPORT_SYMBOL_GPL(pci_enable_sriov); + +/** + * pci_disable_sriov - disable the SR-IOV capability + * @dev: the PCI device + */ +void pci_disable_sriov(struct pci_dev *dev) +{ + might_sleep(); + + if (!dev->is_physfn) + return; + + sriov_disable(dev); +} +EXPORT_SYMBOL_GPL(pci_disable_sriov); diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index f4fc10fc5872..0f1c7d103509 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -209,6 +209,8 @@ struct pci_sriov { u32 cap; /* SR-IOV Capabilities */ u16 ctrl; /* SR-IOV Control */ u16 total; /* total VFs associated with the PF */ + u16 initial; /* initial VFs associated with the PF */ + u16 nr_virtfn; /* number of VFs available */ u16 offset; /* first VF Routing ID offset */ u16 stride; /* following VF stride */ u32 pgsz; /* page size for BAR alignment */ diff --git a/include/linux/pci.h b/include/linux/pci.h index 8ce2f2d9ab63..c2e491e04063 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -265,6 +265,7 @@ struct pci_dev { unsigned int is_pcie:1; unsigned int state_saved:1; unsigned int is_physfn:1; + unsigned int is_virtfn:1; pci_dev_flags_t dev_flags; atomic_t enable_cnt; /* pci_enable_device has been called */ @@ -279,7 +280,10 @@ struct pci_dev { #endif struct pci_vpd *vpd; #ifdef CONFIG_PCI_IOV - struct pci_sriov *sriov; /* SR-IOV capability related */ + union { + struct pci_sriov *sriov; /* SR-IOV capability related */ + struct pci_dev *physfn; /* the PF this VF is associated with */ + }; #endif }; @@ -1212,5 +1216,18 @@ int pci_ext_cfg_avail(struct pci_dev *dev); void __iomem *pci_ioremap_bar(struct pci_dev *pdev, int bar); +#ifdef CONFIG_PCI_IOV +extern int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn); +extern void pci_disable_sriov(struct pci_dev *dev); +#else +static inline int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn) +{ + return -ENODEV; +} +static inline void pci_disable_sriov(struct pci_dev *dev) +{ +} +#endif + #endif /* __KERNEL__ */ #endif /* LINUX_PCI_H */ -- cgit v1.2.3 From 74bb1bcc7dbbc9ddef773bf3395d7ff92aaaad2e Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Fri, 20 Mar 2009 11:25:16 +0800 Subject: PCI: handle SR-IOV Virtual Function Migration Add or remove a Virtual Function after receiving a Migrate In or Out Request. 
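For illustration only, here is a minimal sketch of how a Physical Function driver might consume this interface together with pci_enable_sriov()/pci_disable_sriov() from the previous patch. The my_pf_* names are hypothetical, and the mapping of the VF Migration Interrupt Message Number (PCI_SRIOV_CAP_INTR() of the SR-IOV capability) to a concrete MSI/MSI-X vector is assumed to have been done elsewhere by the driver:

#include <linux/pci.h>
#include <linux/interrupt.h>

/* Hypothetical PF driver state; only the fields used below. */
struct my_pf {
        struct pci_dev *pdev;
        int migration_irq;      /* assumed: vector already matched to the
                                 * VF Migration Interrupt Message Number */
};

/* IRQ handler for the VF Migration interrupt: hand the event to the
 * SR-IOV core, which then adds or removes the affected VFs. */
static irqreturn_t my_pf_migration_irq(int irq, void *data)
{
        struct my_pf *pf = data;

        return pci_sriov_migration(pf->pdev);
}

/* Enable SR-IOV and hook up the migration interrupt
 * (error handling trimmed for brevity). */
static int my_pf_setup_sriov(struct my_pf *pf, int nr_virtfn)
{
        int rc;

        rc = pci_enable_sriov(pf->pdev, nr_virtfn);
        if (rc)
                return rc;

        rc = request_irq(pf->migration_irq, my_pf_migration_irq, 0,
                         "my_pf-vf-migration", pf);
        if (rc)
                pci_disable_sriov(pf->pdev);

        return rc;
}

Note that pci_sriov_migration(), added by this patch, only schedules the migration work when VF Migration is supported (PCI_SRIOV_CAP_VFM) and VFs are currently enabled; in all other cases it returns IRQ_NONE.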
Reviewed-by: Matthew Wilcox Signed-off-by: Yu Zhao Signed-off-by: Jesse Barnes --- drivers/pci/iov.c | 119 ++++++++++++++++++++++++++++++++++++++++++++++++++++ drivers/pci/pci.h | 4 ++ include/linux/pci.h | 6 +++ 3 files changed, 129 insertions(+) (limited to 'include/linux') diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c index d0ff8ad8f7ba..7227efc760db 100644 --- a/drivers/pci/iov.c +++ b/drivers/pci/iov.c @@ -179,6 +179,97 @@ static void virtfn_remove(struct pci_dev *dev, int id, int reset) pci_dev_put(dev); } +static int sriov_migration(struct pci_dev *dev) +{ + u16 status; + struct pci_sriov *iov = dev->sriov; + + if (!iov->nr_virtfn) + return 0; + + if (!(iov->cap & PCI_SRIOV_CAP_VFM)) + return 0; + + pci_read_config_word(dev, iov->pos + PCI_SRIOV_STATUS, &status); + if (!(status & PCI_SRIOV_STATUS_VFM)) + return 0; + + schedule_work(&iov->mtask); + + return 1; +} + +static void sriov_migration_task(struct work_struct *work) +{ + int i; + u8 state; + u16 status; + struct pci_sriov *iov = container_of(work, struct pci_sriov, mtask); + + for (i = iov->initial; i < iov->nr_virtfn; i++) { + state = readb(iov->mstate + i); + if (state == PCI_SRIOV_VFM_MI) { + writeb(PCI_SRIOV_VFM_AV, iov->mstate + i); + state = readb(iov->mstate + i); + if (state == PCI_SRIOV_VFM_AV) + virtfn_add(iov->self, i, 1); + } else if (state == PCI_SRIOV_VFM_MO) { + virtfn_remove(iov->self, i, 1); + writeb(PCI_SRIOV_VFM_UA, iov->mstate + i); + state = readb(iov->mstate + i); + if (state == PCI_SRIOV_VFM_AV) + virtfn_add(iov->self, i, 0); + } + } + + pci_read_config_word(iov->self, iov->pos + PCI_SRIOV_STATUS, &status); + status &= ~PCI_SRIOV_STATUS_VFM; + pci_write_config_word(iov->self, iov->pos + PCI_SRIOV_STATUS, status); +} + +static int sriov_enable_migration(struct pci_dev *dev, int nr_virtfn) +{ + int bir; + u32 table; + resource_size_t pa; + struct pci_sriov *iov = dev->sriov; + + if (nr_virtfn <= iov->initial) + return 0; + + pci_read_config_dword(dev, iov->pos + PCI_SRIOV_VFM, &table); + bir = PCI_SRIOV_VFM_BIR(table); + if (bir > PCI_STD_RESOURCE_END) + return -EIO; + + table = PCI_SRIOV_VFM_OFFSET(table); + if (table + nr_virtfn > pci_resource_len(dev, bir)) + return -EIO; + + pa = pci_resource_start(dev, bir) + table; + iov->mstate = ioremap(pa, nr_virtfn); + if (!iov->mstate) + return -ENOMEM; + + INIT_WORK(&iov->mtask, sriov_migration_task); + + iov->ctrl |= PCI_SRIOV_CTRL_VFM | PCI_SRIOV_CTRL_INTR; + pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl); + + return 0; +} + +static void sriov_disable_migration(struct pci_dev *dev) +{ + struct pci_sriov *iov = dev->sriov; + + iov->ctrl &= ~(PCI_SRIOV_CTRL_VFM | PCI_SRIOV_CTRL_INTR); + pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl); + + cancel_work_sync(&iov->mtask); + iounmap(iov->mstate); +} + static int sriov_enable(struct pci_dev *dev, int nr_virtfn) { int rc; @@ -261,6 +352,12 @@ static int sriov_enable(struct pci_dev *dev, int nr_virtfn) goto failed; } + if (iov->cap & PCI_SRIOV_CAP_VFM) { + rc = sriov_enable_migration(dev, nr_virtfn); + if (rc) + goto failed; + } + kobject_uevent(&dev->dev.kobj, KOBJ_CHANGE); iov->nr_virtfn = nr_virtfn; @@ -290,6 +387,9 @@ static void sriov_disable(struct pci_dev *dev) if (!iov->nr_virtfn) return; + if (iov->cap & PCI_SRIOV_CAP_VFM) + sriov_disable_migration(dev); + for (i = 0; i < iov->nr_virtfn; i++) virtfn_remove(dev, i, 0); @@ -559,3 +659,22 @@ void pci_disable_sriov(struct pci_dev *dev) sriov_disable(dev); } EXPORT_SYMBOL_GPL(pci_disable_sriov); + +/** + * 
pci_sriov_migration - notify SR-IOV core of Virtual Function Migration + * @dev: the PCI device + * + * Returns IRQ_HANDLED if the IRQ is handled, or IRQ_NONE if not. + * + * Physical Function driver is responsible to register IRQ handler using + * VF Migration Interrupt Message Number, and call this function when the + * interrupt is generated by the hardware. + */ +irqreturn_t pci_sriov_migration(struct pci_dev *dev) +{ + if (!dev->is_physfn) + return IRQ_NONE; + + return sriov_migration(dev) ? IRQ_HANDLED : IRQ_NONE; +} +EXPORT_SYMBOL_GPL(pci_sriov_migration); diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 0f1c7d103509..22dcfdb75d91 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -1,6 +1,8 @@ #ifndef DRIVERS_PCI_H #define DRIVERS_PCI_H +#include + #define PCI_CFG_SPACE_SIZE 256 #define PCI_CFG_SPACE_EXP_SIZE 4096 @@ -218,6 +220,8 @@ struct pci_sriov { struct pci_dev *dev; /* lowest numbered PF */ struct pci_dev *self; /* this PF */ struct mutex lock; /* lock for VF bus */ + struct work_struct mtask; /* VF Migration task */ + u8 __iomem *mstate; /* VF Migration State Array */ }; #ifdef CONFIG_PCI_IOV diff --git a/include/linux/pci.h b/include/linux/pci.h index c2e491e04063..1216843412da 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -52,6 +52,7 @@ #include #include #include +#include /* Include the ID list */ #include @@ -1219,6 +1220,7 @@ void __iomem *pci_ioremap_bar(struct pci_dev *pdev, int bar); #ifdef CONFIG_PCI_IOV extern int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn); extern void pci_disable_sriov(struct pci_dev *dev); +extern irqreturn_t pci_sriov_migration(struct pci_dev *dev); #else static inline int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn) { @@ -1227,6 +1229,10 @@ static inline int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn) static inline void pci_disable_sriov(struct pci_dev *dev) { } +static inline irqreturn_t pci_sriov_migration(struct pci_dev *dev) +{ + return IRQ_NONE; +} #endif #endif /* __KERNEL__ */ -- cgit v1.2.3 From 79af72d716cf1bb13b175429cf181a6c4d063ee8 Mon Sep 17 00:00:00 2001 From: Kenji Kaneshige Date: Fri, 20 Mar 2009 14:55:55 -0600 Subject: PCI: pci_is_root_bus helper Introduce pci_is_root_bus helper function. This will help make code more consistent, as well as prevent incorrect assumptions (such as pci_bus->self == NULL on a root bus, which is not always true). Signed-off-by: Kenji Kaneshige Signed-off-by: Alex Chiang Signed-off-by: Jesse Barnes --- include/linux/pci.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 1216843412da..50d94388e87c 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -357,6 +357,15 @@ struct pci_bus { #define pci_bus_b(n) list_entry(n, struct pci_bus, node) #define to_pci_bus(n) container_of(n, struct pci_bus, dev) +/* + * Returns true if the pci bus is root (behind host-pci bridge), + * false otherwise + */ +static inline bool pci_is_root_bus(struct pci_bus *pbus) +{ + return !(pbus->parent); +} + #ifdef CONFIG_PCI_MSI static inline bool pci_dev_msi_enabled(struct pci_dev *pci_dev) { -- cgit v1.2.3 From 3ed4fd96b3188406ac5357d9290bcffa08c65cf6 Mon Sep 17 00:00:00 2001 From: Alex Chiang Date: Fri, 20 Mar 2009 14:56:25 -0600 Subject: PCI: Introduce pci_rescan_bus() This API is used by the PCI core to rescan a bus and rediscover newly added devices. 
Over time, it is expected that the various PCI hotplug drivers will migrate to this interface and away from the old pci_do_scan_bus() interface. Signed-off-by: Alex Chiang Signed-off-by: Jesse Barnes --- drivers/pci/hotplug/fakephp.c | 6 +++--- drivers/pci/probe.c | 32 ++++++++++++++++++++++++++++++++ include/linux/pci.h | 3 +++ 3 files changed, 38 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/hotplug/fakephp.c b/drivers/pci/hotplug/fakephp.c index d8649e127298..16063745766e 100644 --- a/drivers/pci/hotplug/fakephp.c +++ b/drivers/pci/hotplug/fakephp.c @@ -245,12 +245,12 @@ static int pci_rescan_slot(struct pci_dev *temp) /** - * pci_rescan_bus - Rescan PCI bus + * pci_rescan_bus_local - fakephp version of rescan PCI bus * @bus: the PCI bus to rescan * * Call pci_rescan_slot for each possible function of the bus. */ -static void pci_rescan_bus(const struct pci_bus *bus) +static void pci_rescan_bus_local(const struct pci_bus *bus) { unsigned int devfn; struct pci_dev *dev; @@ -291,7 +291,7 @@ static void pci_rescan_buses(const struct list_head *list) const struct list_head *l; list_for_each(l,list) { const struct pci_bus *b = pci_bus_b(l); - pci_rescan_bus(b); + pci_rescan_bus_local(b); pci_rescan_buses(&b->children); } } diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index f69256c63b2b..60a8e5fec6c5 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -1212,6 +1212,38 @@ struct pci_bus * __devinit pci_scan_bus_parented(struct device *parent, EXPORT_SYMBOL(pci_scan_bus_parented); #ifdef CONFIG_HOTPLUG +/** + * pci_rescan_bus - scan a PCI bus for devices. + * @bus: PCI bus to scan + * + * Scan a PCI bus and child buses for new devices, adds them, + * and enables them. + * + * Returns the max number of subordinate bus discovered. + */ +unsigned int __devinit pci_rescan_bus(struct pci_bus *bus) +{ + unsigned int max; + struct pci_dev *dev; + + max = pci_scan_child_bus(bus); + + up_read(&pci_bus_sem); + list_for_each_entry(dev, &bus->devices, bus_list) + if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE || + dev->hdr_type == PCI_HEADER_TYPE_CARDBUS) + if (dev->subordinate) + pci_bus_size_bridges(dev->subordinate); + down_read(&pci_bus_sem); + + pci_bus_assign_resources(bus); + pci_enable_bridges(bus); + pci_bus_add_devices(bus); + + return max; +} +EXPORT_SYMBOL_GPL(pci_rescan_bus); + EXPORT_SYMBOL(pci_add_new_bus); EXPORT_SYMBOL(pci_scan_slot); EXPORT_SYMBOL(pci_scan_bridge); diff --git a/include/linux/pci.h b/include/linux/pci.h index 50d94388e87c..6fb335b0d74f 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -726,6 +726,9 @@ int pci_back_from_sleep(struct pci_dev *dev); /* Functions for PCI Hotplug drivers to use */ int pci_bus_find_capability(struct pci_bus *bus, unsigned int devfn, int cap); +#ifdef CONFIG_HOTPLUG +unsigned int pci_rescan_bus(struct pci_bus *bus); +#endif /* Vital product data routines */ ssize_t pci_read_vpd(struct pci_dev *dev, loff_t pos, size_t count, void *buf); -- cgit v1.2.3 From 2d622719f1572ef31e0616444a515eba3094d050 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 22 Mar 2009 03:30:49 -0500 Subject: tracing: add ring_buffer_event_discard() to ring buffer This patch overloads RINGBUF_TYPE_PADDING to provide a way to discard events from the ring buffer, for the event-filtering mechanism introduced in a subsequent patch. I did the initial version but thanks to Steven Rostedt for adding the parts that actually made it work. 
;-) Signed-off-by: Tom Zanussi Acked-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- include/linux/ring_buffer.h | 11 +++-- kernel/trace/ring_buffer.c | 117 ++++++++++++++++++++++++++++++++++++-------- 2 files changed, 105 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 9e6052bd1a1c..e1b7b2173885 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -18,10 +18,13 @@ struct ring_buffer_event { /** * enum ring_buffer_type - internal ring buffer types * - * @RINGBUF_TYPE_PADDING: Left over page padding - * array is ignored - * size is variable depending on how much + * @RINGBUF_TYPE_PADDING: Left over page padding or discarded event + * If time_delta is 0: + * array is ignored + * size is variable depending on how much * padding is needed + * If time_delta is non zero: + * everything else same as RINGBUF_TYPE_DATA * * @RINGBUF_TYPE_TIME_EXTEND: Extend the time delta * array[0] = time delta (28 .. 59) @@ -65,6 +68,8 @@ ring_buffer_event_time_delta(struct ring_buffer_event *event) return event->time_delta; } +void ring_buffer_event_discard(struct ring_buffer_event *event); + /* * size is in bytes for each per CPU buffer. */ diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 384ca5d9d729..a09027ec1714 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -189,16 +189,65 @@ enum { RB_LEN_TIME_STAMP = 16, }; -/* inline for ring buffer fast paths */ +static inline int rb_null_event(struct ring_buffer_event *event) +{ + return event->type == RINGBUF_TYPE_PADDING && event->time_delta == 0; +} + +static inline int rb_discarded_event(struct ring_buffer_event *event) +{ + return event->type == RINGBUF_TYPE_PADDING && event->time_delta; +} + +static void rb_event_set_padding(struct ring_buffer_event *event) +{ + event->type = RINGBUF_TYPE_PADDING; + event->time_delta = 0; +} + +/** + * ring_buffer_event_discard - discard an event in the ring buffer + * @buffer: the ring buffer + * @event: the event to discard + * + * Sometimes a event that is in the ring buffer needs to be ignored. + * This function lets the user discard an event in the ring buffer + * and then that event will not be read later. + * + * Note, it is up to the user to be careful with this, and protect + * against races. If the user discards an event that has been consumed + * it is possible that it could corrupt the ring buffer. 
+ */ +void ring_buffer_event_discard(struct ring_buffer_event *event) +{ + event->type = RINGBUF_TYPE_PADDING; + /* time delta must be non zero */ + if (!event->time_delta) + event->time_delta = 1; +} + static unsigned -rb_event_length(struct ring_buffer_event *event) +rb_event_data_length(struct ring_buffer_event *event) { unsigned length; + if (event->len) + length = event->len * RB_ALIGNMENT; + else + length = event->array[0]; + return length + RB_EVNT_HDR_SIZE; +} + +/* inline for ring buffer fast paths */ +static unsigned +rb_event_length(struct ring_buffer_event *event) +{ switch (event->type) { case RINGBUF_TYPE_PADDING: - /* undefined */ - return -1; + if (rb_null_event(event)) + /* undefined */ + return -1; + return rb_event_data_length(event); case RINGBUF_TYPE_TIME_EXTEND: return RB_LEN_TIME_EXTEND; @@ -207,11 +256,7 @@ rb_event_length(struct ring_buffer_event *event) return RB_LEN_TIME_STAMP; case RINGBUF_TYPE_DATA: - if (event->len) - length = event->len * RB_ALIGNMENT; - else - length = event->array[0]; - return length + RB_EVNT_HDR_SIZE; + return rb_event_data_length(event); default: BUG(); } @@ -845,11 +890,6 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) } EXPORT_SYMBOL_GPL(ring_buffer_resize); -static inline int rb_null_event(struct ring_buffer_event *event) -{ - return event->type == RINGBUF_TYPE_PADDING; -} - static inline void * __rb_data_page_index(struct buffer_data_page *bpage, unsigned index) { @@ -1219,7 +1259,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, if (tail < BUF_PAGE_SIZE) { /* Mark the rest of the page with padding */ event = __rb_page_index(tail_page, tail); - event->type = RINGBUF_TYPE_PADDING; + rb_event_set_padding(event); } if (tail <= BUF_PAGE_SIZE) @@ -1969,7 +2009,7 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) event = rb_reader_event(cpu_buffer); - if (event->type == RINGBUF_TYPE_DATA) + if (event->type == RINGBUF_TYPE_DATA || rb_discarded_event(event)) cpu_buffer->entries--; rb_update_read_stamp(cpu_buffer, event); @@ -2052,9 +2092,18 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) switch (event->type) { case RINGBUF_TYPE_PADDING: - RB_WARN_ON(cpu_buffer, 1); + if (rb_null_event(event)) + RB_WARN_ON(cpu_buffer, 1); + /* + * Because the writer could be discarding every + * event it creates (which would probably be bad) + * if we were to go back to "again" then we may never + * catch up, and will trigger the warn on, or lock + * the box. Return the padding, and we will release + * the current locks, and try again. 
+ */ rb_advance_reader(cpu_buffer); - return NULL; + return event; case RINGBUF_TYPE_TIME_EXTEND: /* Internal data, OK to advance */ @@ -2115,8 +2164,12 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) switch (event->type) { case RINGBUF_TYPE_PADDING: - rb_inc_iter(iter); - goto again; + if (rb_null_event(event)) { + rb_inc_iter(iter); + goto again; + } + rb_advance_iter(iter); + return event; case RINGBUF_TYPE_TIME_EXTEND: /* Internal data, OK to advance */ @@ -2163,10 +2216,16 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) if (!cpumask_test_cpu(cpu, buffer->cpumask)) return NULL; + again: spin_lock_irqsave(&cpu_buffer->reader_lock, flags); event = rb_buffer_peek(buffer, cpu, ts); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + if (event && event->type == RINGBUF_TYPE_PADDING) { + cpu_relax(); + goto again; + } + return event; } @@ -2185,10 +2244,16 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) struct ring_buffer_event *event; unsigned long flags; + again: spin_lock_irqsave(&cpu_buffer->reader_lock, flags); event = rb_iter_peek(iter, ts); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + if (event && event->type == RINGBUF_TYPE_PADDING) { + cpu_relax(); + goto again; + } + return event; } @@ -2207,6 +2272,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) struct ring_buffer_event *event = NULL; unsigned long flags; + again: /* might be called in atomic */ preempt_disable(); @@ -2228,6 +2294,11 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) out: preempt_enable(); + if (event && event->type == RINGBUF_TYPE_PADDING) { + cpu_relax(); + goto again; + } + return event; } EXPORT_SYMBOL_GPL(ring_buffer_consume); @@ -2306,6 +2377,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; unsigned long flags; + again: spin_lock_irqsave(&cpu_buffer->reader_lock, flags); event = rb_iter_peek(iter, ts); if (!event) @@ -2315,6 +2387,11 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) out: spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + if (event && event->type == RINGBUF_TYPE_PADDING) { + cpu_relax(); + goto again; + } + return event; } EXPORT_SYMBOL_GPL(ring_buffer_read); -- cgit v1.2.3 From c0f92ba99bdeaf35f9c580291b4e1a657c67fbd4 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 22 Mar 2009 23:10:44 +0100 Subject: debugfs: function to know if debugfs is initialized Impact: add new debugfs API With ftrace, some tracers are registered in early initcalls and attempt to create files on the debugfs filesystem. Depending on when they are activated, they can try to create their file at any time. Some checks can be done on the tracing area but providing a helper to know if debugfs is registered make it really more easy. 
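As a minimal usage sketch (the "my_tracer" directory is made up for illustration and is not part of this patch), an early-registered tracer can now check whether debugfs is available before trying to create its files, and defer otherwise:

#include <linux/debugfs.h>
#include <linux/errno.h>

static struct dentry *my_tracer_dir;    /* hypothetical tracer directory */

/* Called from an early initcall; debugfs itself may not be registered yet. */
static int my_tracer_init_debugfs(void)
{
        if (!debugfs_initialized())
                return -ENODEV;         /* caller retries from a later initcall */

        my_tracer_dir = debugfs_create_dir("my_tracer", NULL);
        if (!my_tracer_dir)
                return -ENOMEM;

        return 0;
}

debugfs_initialized() simply reports the debugfs_registered flag set at the end of debugfs_init(), so it is safe to call at any point during boot, and the !CONFIG_DEBUG_FS stub returns false.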
Signed-off-by: Frederic Weisbecker Acked-by: Greg Kroah-Hartman Cc: Steven Rostedt LKML-Reference: <1237759847-21025-2-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- fs/debugfs/inode.c | 16 ++++++++++++++++ include/linux/debugfs.h | 8 ++++++++ 2 files changed, 24 insertions(+) (limited to 'include/linux') diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 81ae9ea3c6e1..0662ba6de85a 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -30,6 +30,7 @@ static struct vfsmount *debugfs_mount; static int debugfs_mount_count; +static bool debugfs_registered; static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t dev) { @@ -496,6 +497,16 @@ exit: } EXPORT_SYMBOL_GPL(debugfs_rename); +/** + * debugfs_initialized - Tells whether debugfs has been registered + */ +bool debugfs_initialized(void) +{ + return debugfs_registered; +} +EXPORT_SYMBOL_GPL(debugfs_initialized); + + static struct kobject *debug_kobj; static int __init debugfs_init(void) @@ -509,11 +520,16 @@ static int __init debugfs_init(void) retval = register_filesystem(&debug_fs_type); if (retval) kobject_put(debug_kobj); + else + debugfs_registered = true; + return retval; } static void __exit debugfs_exit(void) { + debugfs_registered = false; + simple_release_fs(&debugfs_mount, &debugfs_mount_count); unregister_filesystem(&debug_fs_type); kobject_put(debug_kobj); diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index af0e01d4c663..eb5c2ba2f81a 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -71,6 +71,9 @@ struct dentry *debugfs_create_bool(const char *name, mode_t mode, struct dentry *debugfs_create_blob(const char *name, mode_t mode, struct dentry *parent, struct debugfs_blob_wrapper *blob); + +bool debugfs_initialized(void); + #else #include @@ -183,6 +186,11 @@ static inline struct dentry *debugfs_create_blob(const char *name, mode_t mode, return ERR_PTR(-ENODEV); } +static inline bool debugfs_initialized(void) +{ + return false; +} + #endif #endif -- cgit v1.2.3 From 402d326519c1a4859c527702383f4e60f606ef52 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 12 Feb 2009 10:40:00 +0000 Subject: NOMMU: Present backing device capabilities for MTD chardevs Present backing device capabilities for MTD character device files to allow NOMMU mmap to do direct mapping where possible. Signed-off-by: David Howells Tested-by: Bernd Schmidt Signed-off-by: David Woodhouse --- drivers/mtd/Makefile | 2 +- drivers/mtd/chips/map_ram.c | 17 ++++++++++++ drivers/mtd/chips/map_rom.c | 17 ++++++++++++ drivers/mtd/devices/mtdram.c | 14 ++++++++++ drivers/mtd/internal.h | 17 ++++++++++++ drivers/mtd/mtdbdi.c | 43 ++++++++++++++++++++++++++++++ drivers/mtd/mtdchar.c | 63 +++++++++++++++++++++++++++++++++++++++++++- drivers/mtd/mtdcore.c | 15 +++++++++++ drivers/mtd/mtdpart.c | 15 +++++++++++ include/linux/mtd/mtd.h | 14 ++++++++++ 10 files changed, 215 insertions(+), 2 deletions(-) create mode 100644 drivers/mtd/internal.h create mode 100644 drivers/mtd/mtdbdi.c (limited to 'include/linux') diff --git a/drivers/mtd/Makefile b/drivers/mtd/Makefile index 4521b1ecce45..82d1e4de475b 100644 --- a/drivers/mtd/Makefile +++ b/drivers/mtd/Makefile @@ -4,7 +4,7 @@ # Core functionality. 
obj-$(CONFIG_MTD) += mtd.o -mtd-y := mtdcore.o mtdsuper.o +mtd-y := mtdcore.o mtdsuper.o mtdbdi.o mtd-$(CONFIG_MTD_PARTITIONS) += mtdpart.o obj-$(CONFIG_MTD_CONCAT) += mtdconcat.o diff --git a/drivers/mtd/chips/map_ram.c b/drivers/mtd/chips/map_ram.c index 072dd8abf33a..6bdc50c727e7 100644 --- a/drivers/mtd/chips/map_ram.c +++ b/drivers/mtd/chips/map_ram.c @@ -21,6 +21,8 @@ static int mapram_write (struct mtd_info *, loff_t, size_t, size_t *, const u_ch static int mapram_erase (struct mtd_info *, struct erase_info *); static void mapram_nop (struct mtd_info *); static struct mtd_info *map_ram_probe(struct map_info *map); +static unsigned long mapram_unmapped_area(struct mtd_info *, unsigned long, + unsigned long, unsigned long); static struct mtd_chip_driver mapram_chipdrv = { @@ -64,6 +66,7 @@ static struct mtd_info *map_ram_probe(struct map_info *map) mtd->type = MTD_RAM; mtd->size = map->size; mtd->erase = mapram_erase; + mtd->get_unmapped_area = mapram_unmapped_area; mtd->read = mapram_read; mtd->write = mapram_write; mtd->sync = mapram_nop; @@ -79,6 +82,20 @@ static struct mtd_info *map_ram_probe(struct map_info *map) } +/* + * Allow NOMMU mmap() to directly map the device (if not NULL) + * - return the address to which the offset maps + * - return -ENOSYS to indicate refusal to do the mapping + */ +static unsigned long mapram_unmapped_area(struct mtd_info *mtd, + unsigned long len, + unsigned long offset, + unsigned long flags) +{ + struct map_info *map = mtd->priv; + return (unsigned long) map->virt + offset; +} + static int mapram_read (struct mtd_info *mtd, loff_t from, size_t len, size_t *retlen, u_char *buf) { struct map_info *map = mtd->priv; diff --git a/drivers/mtd/chips/map_rom.c b/drivers/mtd/chips/map_rom.c index c76d6e5f47ee..076090a67b90 100644 --- a/drivers/mtd/chips/map_rom.c +++ b/drivers/mtd/chips/map_rom.c @@ -20,6 +20,8 @@ static int maprom_write (struct mtd_info *, loff_t, size_t, size_t *, const u_ch static void maprom_nop (struct mtd_info *); static struct mtd_info *map_rom_probe(struct map_info *map); static int maprom_erase (struct mtd_info *mtd, struct erase_info *info); +static unsigned long maprom_unmapped_area(struct mtd_info *, unsigned long, + unsigned long, unsigned long); static struct mtd_chip_driver maprom_chipdrv = { .probe = map_rom_probe, @@ -40,6 +42,7 @@ static struct mtd_info *map_rom_probe(struct map_info *map) mtd->name = map->name; mtd->type = MTD_ROM; mtd->size = map->size; + mtd->get_unmapped_area = maprom_unmapped_area; mtd->read = maprom_read; mtd->write = maprom_write; mtd->sync = maprom_nop; @@ -53,6 +56,20 @@ static struct mtd_info *map_rom_probe(struct map_info *map) } +/* + * Allow NOMMU mmap() to directly map the device (if not NULL) + * - return the address to which the offset maps + * - return -ENOSYS to indicate refusal to do the mapping + */ +static unsigned long maprom_unmapped_area(struct mtd_info *mtd, + unsigned long len, + unsigned long offset, + unsigned long flags) +{ + struct map_info *map = mtd->priv; + return (unsigned long) map->virt + offset; +} + static int maprom_read (struct mtd_info *mtd, loff_t from, size_t len, size_t *retlen, u_char *buf) { struct map_info *map = mtd->priv; diff --git a/drivers/mtd/devices/mtdram.c b/drivers/mtd/devices/mtdram.c index 3aaca88847d3..fce5ff7589aa 100644 --- a/drivers/mtd/devices/mtdram.c +++ b/drivers/mtd/devices/mtdram.c @@ -65,6 +65,19 @@ static void ram_unpoint(struct mtd_info *mtd, loff_t from, size_t len) { } +/* + * Allow NOMMU mmap() to directly map the device (if not 
NULL) + * - return the address to which the offset maps + * - return -ENOSYS to indicate refusal to do the mapping + */ +static unsigned long ram_get_unmapped_area(struct mtd_info *mtd, + unsigned long len, + unsigned long offset, + unsigned long flags) +{ + return (unsigned long) mtd->priv + offset; +} + static int ram_read(struct mtd_info *mtd, loff_t from, size_t len, size_t *retlen, u_char *buf) { @@ -116,6 +129,7 @@ int mtdram_init_device(struct mtd_info *mtd, void *mapped_address, mtd->erase = ram_erase; mtd->point = ram_point; mtd->unpoint = ram_unpoint; + mtd->get_unmapped_area = ram_get_unmapped_area; mtd->read = ram_read; mtd->write = ram_write; diff --git a/drivers/mtd/internal.h b/drivers/mtd/internal.h new file mode 100644 index 000000000000..c658fe7216b5 --- /dev/null +++ b/drivers/mtd/internal.h @@ -0,0 +1,17 @@ +/* Internal MTD definitions + * + * Copyright © 2006 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* + * mtdbdi.c + */ +extern struct backing_dev_info mtd_bdi_unmappable; +extern struct backing_dev_info mtd_bdi_ro_mappable; +extern struct backing_dev_info mtd_bdi_rw_mappable; diff --git a/drivers/mtd/mtdbdi.c b/drivers/mtd/mtdbdi.c new file mode 100644 index 000000000000..5ca5aed0b225 --- /dev/null +++ b/drivers/mtd/mtdbdi.c @@ -0,0 +1,43 @@ +/* MTD backing device capabilities + * + * Copyright © 2006 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include "internal.h" + +/* + * backing device capabilities for non-mappable devices (such as NAND flash) + * - permits private mappings, copies are taken of the data + */ +struct backing_dev_info mtd_bdi_unmappable = { + .capabilities = BDI_CAP_MAP_COPY, +}; + +/* + * backing device capabilities for R/O mappable devices (such as ROM) + * - permits private mappings, copies are taken of the data + * - permits non-writable shared mappings + */ +struct backing_dev_info mtd_bdi_ro_mappable = { + .capabilities = (BDI_CAP_MAP_COPY | BDI_CAP_MAP_DIRECT | + BDI_CAP_EXEC_MAP | BDI_CAP_READ_MAP), +}; + +/* + * backing device capabilities for writable mappable devices (such as RAM) + * - permits private mappings, copies are taken of the data + * - permits non-writable shared mappings + */ +struct backing_dev_info mtd_bdi_rw_mappable = { + .capabilities = (BDI_CAP_MAP_COPY | BDI_CAP_MAP_DIRECT | + BDI_CAP_EXEC_MAP | BDI_CAP_READ_MAP | + BDI_CAP_WRITE_MAP), +}; diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c index 2c8a16f49ca2..f478f1fc3949 100644 --- a/drivers/mtd/mtdchar.c +++ b/drivers/mtd/mtdchar.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -107,12 +108,15 @@ static int mtd_open(struct inode *inode, struct file *file) goto out; } - if (MTD_ABSENT == mtd->type) { + if (mtd->type == MTD_ABSENT) { put_mtd_device(mtd); ret = -ENODEV; goto out; } + if (mtd->backing_dev_info) + file->f_mapping->backing_dev_info = mtd->backing_dev_info; + /* You can't open it RW if it's not a writeable device */ if ((file->f_mode & FMODE_WRITE) && !(mtd->flags & MTD_WRITEABLE)) { put_mtd_device(mtd); @@ -781,6 +785,59 @@ static int mtd_ioctl(struct inode *inode, struct file *file, return ret; } /* memory_ioctl */ +/* + * try to determine where a shared mapping can be made + * - only supported for NOMMU at the moment (MMU can't doesn't copy private + * mappings) + */ +#ifndef CONFIG_MMU +static unsigned long mtd_get_unmapped_area(struct file *file, + unsigned long addr, + unsigned long len, + unsigned long pgoff, + unsigned long flags) +{ + struct mtd_file_info *mfi = file->private_data; + struct mtd_info *mtd = mfi->mtd; + + if (mtd->get_unmapped_area) { + unsigned long offset; + + if (addr != 0) + return (unsigned long) -EINVAL; + + if (len > mtd->size || pgoff >= (mtd->size >> PAGE_SHIFT)) + return (unsigned long) -EINVAL; + + offset = pgoff << PAGE_SHIFT; + if (offset > mtd->size - len) + return (unsigned long) -EINVAL; + + return mtd->get_unmapped_area(mtd, len, offset, flags); + } + + /* can't map directly */ + return (unsigned long) -ENOSYS; +} +#endif + +/* + * set up a mapping for shared memory segments + */ +static int mtd_mmap(struct file *file, struct vm_area_struct *vma) +{ +#ifdef CONFIG_MMU + struct mtd_file_info *mfi = file->private_data; + struct mtd_info *mtd = mfi->mtd; + + if (mtd->type == MTD_RAM || mtd->type == MTD_ROM) + return 0; + return -ENOSYS; +#else + return vma->vm_flags & VM_SHARED ? 
0 : -ENOSYS; +#endif +} + static const struct file_operations mtd_fops = { .owner = THIS_MODULE, .llseek = mtd_lseek, @@ -789,6 +846,10 @@ static const struct file_operations mtd_fops = { .ioctl = mtd_ioctl, .open = mtd_open, .release = mtd_close, + .mmap = mtd_mmap, +#ifndef CONFIG_MMU + .get_unmapped_area = mtd_get_unmapped_area, +#endif }; static int __init init_mtdchar(void) diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c index 76fe0a1e7a5e..65a7f3703881 100644 --- a/drivers/mtd/mtdcore.c +++ b/drivers/mtd/mtdcore.c @@ -19,6 +19,7 @@ #include #include +#include "internal.h" #include "mtdcore.h" @@ -46,6 +47,20 @@ int add_mtd_device(struct mtd_info *mtd) { int i; + if (!mtd->backing_dev_info) { + switch (mtd->type) { + case MTD_RAM: + mtd->backing_dev_info = &mtd_bdi_rw_mappable; + break; + case MTD_ROM: + mtd->backing_dev_info = &mtd_bdi_ro_mappable; + break; + default: + mtd->backing_dev_info = &mtd_bdi_unmappable; + break; + } + } + BUG_ON(mtd->writesize == 0); mutex_lock(&mtd_table_mutex); diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c index 144e6b613a77..06d5b9d8853a 100644 --- a/drivers/mtd/mtdpart.c +++ b/drivers/mtd/mtdpart.c @@ -84,6 +84,18 @@ static void part_unpoint(struct mtd_info *mtd, loff_t from, size_t len) part->master->unpoint(part->master, from + part->offset, len); } +static unsigned long part_get_unmapped_area(struct mtd_info *mtd, + unsigned long len, + unsigned long offset, + unsigned long flags) +{ + struct mtd_part *part = PART(mtd); + + offset += part->offset; + return part->master->get_unmapped_area(part->master, len, offset, + flags); +} + static int part_read_oob(struct mtd_info *mtd, loff_t from, struct mtd_oob_ops *ops) { @@ -342,6 +354,7 @@ static struct mtd_part *add_one_partition(struct mtd_info *master, slave->mtd.name = part->name; slave->mtd.owner = master->owner; + slave->mtd.backing_dev_info = master->backing_dev_info; slave->mtd.read = part_read; slave->mtd.write = part_write; @@ -354,6 +367,8 @@ static struct mtd_part *add_one_partition(struct mtd_info *master, slave->mtd.unpoint = part_unpoint; } + if (master->get_unmapped_area) + slave->mtd.get_unmapped_area = part_get_unmapped_area; if (master->read_oob) slave->mtd.read_oob = part_read_oob; if (master->write_oob) diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index 3aa5d77c2cdb..0c079fd307a5 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -162,6 +162,20 @@ struct mtd_info { /* We probably shouldn't allow XIP if the unpoint isn't a NULL */ void (*unpoint) (struct mtd_info *mtd, loff_t from, size_t len); + /* Allow NOMMU mmap() to directly map the device (if not NULL) + * - return the address to which the offset maps + * - return -ENOSYS to indicate refusal to do the mapping + */ + unsigned long (*get_unmapped_area) (struct mtd_info *mtd, + unsigned long len, + unsigned long offset, + unsigned long flags); + + /* Backing device capabilities for this device + * - provides mmap capabilities + */ + struct backing_dev_info *backing_dev_info; + int (*read) (struct mtd_info *mtd, loff_t from, size_t len, size_t *retlen, u_char *buf); int (*write) (struct mtd_info *mtd, loff_t to, size_t len, size_t *retlen, const u_char *buf); -- cgit v1.2.3 From 58c610bd1a3f50820e45a7c09ec0e44d2cda15dd Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Wed, 18 Mar 2009 15:33:05 +0800 Subject: intel-iommu: Snooping control support Snooping control enabled IOMMU to guarantee DMA cache coherency and thus reduce software effort (VMM) in maintaining effective 
memory type. Signed-off-by: Sheng Yang Signed-off-by: David Woodhouse --- drivers/pci/intel-iommu.c | 38 +++++++++++++++++++++++++++++++++----- include/linux/intel-iommu.h | 2 +- 2 files changed, 34 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index f3f686581a90..be999ff025af 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -231,6 +231,7 @@ struct dmar_domain { int flags; /* flags to find out type of domain */ int iommu_coherency;/* indicate coherency of iommu access */ + int iommu_snooping; /* indicate snooping control feature*/ int iommu_count; /* reference count of iommu */ spinlock_t iommu_lock; /* protect iommu set in domain */ u64 max_addr; /* maximum mapped address */ @@ -421,7 +422,6 @@ static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain) return g_iommus[iommu_id]; } -/* "Coherency" capability may be different across iommus */ static void domain_update_iommu_coherency(struct dmar_domain *domain) { int i; @@ -438,6 +438,29 @@ static void domain_update_iommu_coherency(struct dmar_domain *domain) } } +static void domain_update_iommu_snooping(struct dmar_domain *domain) +{ + int i; + + domain->iommu_snooping = 1; + + i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus); + for (; i < g_num_of_iommus; ) { + if (!ecap_sc_support(g_iommus[i]->ecap)) { + domain->iommu_snooping = 0; + break; + } + i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1); + } +} + +/* Some capabilities may be different across iommus */ +static void domain_update_iommu_cap(struct dmar_domain *domain) +{ + domain_update_iommu_coherency(domain); + domain_update_iommu_snooping(domain); +} + static struct intel_iommu *device_to_iommu(u8 bus, u8 devfn) { struct dmar_drhd_unit *drhd = NULL; @@ -1429,6 +1452,11 @@ static int domain_init(struct dmar_domain *domain, int guest_width) else domain->iommu_coherency = 0; + if (ecap_sc_support(iommu->ecap)) + domain->iommu_snooping = 1; + else + domain->iommu_snooping = 0; + domain->iommu_count = 1; /* always allocate the top pgd */ @@ -1557,7 +1585,7 @@ static int domain_context_mapping_one(struct dmar_domain *domain, spin_lock_irqsave(&domain->iommu_lock, flags); if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) { domain->iommu_count++; - domain_update_iommu_coherency(domain); + domain_update_iommu_cap(domain); } spin_unlock_irqrestore(&domain->iommu_lock, flags); return 0; @@ -2820,7 +2848,7 @@ static void vm_domain_remove_one_dev_info(struct dmar_domain *domain, spin_lock_irqsave(&domain->iommu_lock, tmp_flags); clear_bit(iommu->seq_id, &domain->iommu_bmp); domain->iommu_count--; - domain_update_iommu_coherency(domain); + domain_update_iommu_cap(domain); spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags); } @@ -2848,13 +2876,13 @@ static void vm_domain_remove_all_dev_info(struct dmar_domain *domain) iommu_detach_dev(iommu, info->bus, info->devfn); /* clear this iommu in iommu_bmp, update iommu count - * and coherency + * and capabilities */ spin_lock_irqsave(&domain->iommu_lock, flags2); if (test_and_clear_bit(iommu->seq_id, &domain->iommu_bmp)) { domain->iommu_count--; - domain_update_iommu_coherency(domain); + domain_update_iommu_cap(domain); } spin_unlock_irqrestore(&domain->iommu_lock, flags2); diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index d2e3cbfba14f..3ad894004938 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -123,7 +123,7 @@ static inline void dmar_writeq(void 
__iomem *addr, u64 val) #define ecap_eim_support(e) ((e >> 4) & 0x1) #define ecap_ir_support(e) ((e >> 3) & 0x1) #define ecap_max_handle_mask(e) ((e >> 20) & 0xf) - +#define ecap_sc_support(e) ((e >> 7) & 0x1) /* Snooping Control */ /* IOTLB_REG */ #define DMA_TLB_FLUSH_GRANU_OFFSET 60 -- cgit v1.2.3 From dbb9fd8630e95b6155aff658a2b5f80e95ca2bc6 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Wed, 18 Mar 2009 15:33:06 +0800 Subject: iommu: Add domain_has_cap iommu_ops This iommu_op can tell if domain have a specific capability, like snooping control for Intel IOMMU, which can be used by other components of kernel to adjust the behaviour. Signed-off-by: Sheng Yang Signed-off-by: David Woodhouse --- arch/x86/kernel/amd_iommu.c | 7 +++++++ drivers/base/iommu.c | 7 +++++++ drivers/pci/intel-iommu.c | 12 ++++++++++++ include/linux/iommu.h | 12 ++++++++++++ 4 files changed, 38 insertions(+) (limited to 'include/linux') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 5113c080f0c4..65c9b58655ff 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1924,6 +1924,12 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, return paddr; } +static int amd_iommu_domain_has_cap(struct iommu_domain *domain, + unsigned long cap) +{ + return 0; +} + static struct iommu_ops amd_iommu_ops = { .domain_init = amd_iommu_domain_init, .domain_destroy = amd_iommu_domain_destroy, @@ -1932,5 +1938,6 @@ static struct iommu_ops amd_iommu_ops = { .map = amd_iommu_map_range, .unmap = amd_iommu_unmap_range, .iova_to_phys = amd_iommu_iova_to_phys, + .domain_has_cap = amd_iommu_domain_has_cap, }; diff --git a/drivers/base/iommu.c b/drivers/base/iommu.c index 5e039d4f877c..c314f144825f 100644 --- a/drivers/base/iommu.c +++ b/drivers/base/iommu.c @@ -98,3 +98,10 @@ phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, return iommu_ops->iova_to_phys(domain, iova); } EXPORT_SYMBOL_GPL(iommu_iova_to_phys); + +int iommu_domain_has_cap(struct iommu_domain *domain, + unsigned long cap) +{ + return iommu_ops->domain_has_cap(domain, cap); +} +EXPORT_SYMBOL_GPL(iommu_domain_has_cap); diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index be999ff025af..3778ab149baf 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -3158,6 +3158,17 @@ static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, return phys; } +static int intel_iommu_domain_has_cap(struct iommu_domain *domain, + unsigned long cap) +{ + struct dmar_domain *dmar_domain = domain->priv; + + if (cap == IOMMU_CAP_CACHE_COHERENCY) + return dmar_domain->iommu_snooping; + + return 0; +} + static struct iommu_ops intel_iommu_ops = { .domain_init = intel_iommu_domain_init, .domain_destroy = intel_iommu_domain_destroy, @@ -3166,6 +3177,7 @@ static struct iommu_ops intel_iommu_ops = { .map = intel_iommu_map_range, .unmap = intel_iommu_unmap_range, .iova_to_phys = intel_iommu_iova_to_phys, + .domain_has_cap = intel_iommu_domain_has_cap, }; static void __devinit quirk_iommu_rwbf(struct pci_dev *dev) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 8a7bfb1b6ca0..0cf3a4e43f23 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -28,6 +28,8 @@ struct iommu_domain { void *priv; }; +#define IOMMU_CAP_CACHE_COHERENCY 0x1 + struct iommu_ops { int (*domain_init)(struct iommu_domain *domain); void (*domain_destroy)(struct iommu_domain *domain); @@ -39,6 +41,8 @@ struct iommu_ops { size_t size); phys_addr_t (*iova_to_phys)(struct 
iommu_domain *domain, unsigned long iova); + int (*domain_has_cap)(struct iommu_domain *domain, + unsigned long cap); }; #ifdef CONFIG_IOMMU_API @@ -57,6 +61,8 @@ extern void iommu_unmap_range(struct iommu_domain *domain, unsigned long iova, size_t size); extern phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, unsigned long iova); +extern int iommu_domain_has_cap(struct iommu_domain *domain, + unsigned long cap); #else /* CONFIG_IOMMU_API */ @@ -107,6 +113,12 @@ static inline phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, return 0; } +static inline int domain_has_cap(struct iommu_domain *domain, + unsigned long cap) +{ + return 0; +} + #endif /* CONFIG_IOMMU_API */ #endif /* __LINUX_IOMMU_H */ -- cgit v1.2.3 From 9cf0669746be19a4906a6c48920060bcf54c708b Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Wed, 18 Mar 2009 15:33:07 +0800 Subject: intel-iommu: VT-d page table to support snooping control bit The user can request to enable snooping control through VT-d page table. Signed-off-by: Sheng Yang Signed-off-by: David Woodhouse --- drivers/pci/intel-iommu.c | 12 +++++++++++- include/linux/dma_remapping.h | 1 + include/linux/iommu.h | 1 + 3 files changed, 13 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 3778ab149baf..a0ba568b831c 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -164,7 +164,8 @@ static inline void context_clear_entry(struct context_entry *context) * 1: writable * 2-6: reserved * 7: super page - * 8-11: available + * 8-10: available + * 11: snoop behavior * 12-63: Host physcial address */ struct dma_pte { @@ -186,6 +187,11 @@ static inline void dma_set_pte_writable(struct dma_pte *pte) pte->val |= DMA_PTE_WRITE; } +static inline void dma_set_pte_snp(struct dma_pte *pte) +{ + pte->val |= DMA_PTE_SNP; +} + static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot) { pte->val = (pte->val & ~3) | (prot & 3); @@ -1685,6 +1691,8 @@ domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova, BUG_ON(dma_pte_addr(pte)); dma_set_pte_addr(pte, start_pfn << VTD_PAGE_SHIFT); dma_set_pte_prot(pte, prot); + if (prot & DMA_PTE_SNP) + dma_set_pte_snp(pte); domain_flush_cache(domain, pte, sizeof(*pte)); start_pfn++; index++; @@ -3105,6 +3113,8 @@ static int intel_iommu_map_range(struct iommu_domain *domain, prot |= DMA_PTE_READ; if (iommu_prot & IOMMU_WRITE) prot |= DMA_PTE_WRITE; + if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping) + prot |= DMA_PTE_SNP; max_addr = (iova & VTD_PAGE_MASK) + VTD_PAGE_ALIGN(size); if (dmar_domain->max_addr < max_addr) { diff --git a/include/linux/dma_remapping.h b/include/linux/dma_remapping.h index af1dab41674b..1a455f1f86d7 100644 --- a/include/linux/dma_remapping.h +++ b/include/linux/dma_remapping.h @@ -11,6 +11,7 @@ #define DMA_PTE_READ (1) #define DMA_PTE_WRITE (2) +#define DMA_PTE_SNP (1 << 11) struct intel_iommu; struct dmar_domain; diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 0cf3a4e43f23..3af4ffd591b9 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -21,6 +21,7 @@ #define IOMMU_READ (1) #define IOMMU_WRITE (2) +#define IOMMU_CACHE (4) /* DMA cache coherency */ struct device; -- cgit v1.2.3 From 3aa551c9b4c40018f0e261a178e3d25478dc04a9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 23 Mar 2009 18:28:15 +0100 Subject: genirq: add threaded interrupt handler support Add support for threaded interrupt handlers: A device driver can request that 
its main interrupt handler runs in a thread. To achive this the device driver requests the interrupt with request_threaded_irq() and provides additionally to the handler a thread function. The handler function is called in hard interrupt context and needs to check whether the interrupt originated from the device. If the interrupt originated from the device then the handler can either return IRQ_HANDLED or IRQ_WAKE_THREAD. IRQ_HANDLED is returned when no further action is required. IRQ_WAKE_THREAD causes the genirq code to invoke the threaded (main) handler. When IRQ_WAKE_THREAD is returned handler must have disabled the interrupt on the device level. This is mandatory for shared interrupt handlers, but we need to do it as well for obscure x86 hardware where disabling an interrupt on the IO_APIC level redirects the interrupt to the legacy PIC interrupt lines. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- include/linux/hardirq.h | 2 +- include/linux/interrupt.h | 37 ++++++++- include/linux/irq.h | 5 ++ include/linux/irqreturn.h | 2 + include/linux/sched.h | 5 ++ kernel/exit.c | 2 + kernel/irq/handle.c | 31 +++++++- kernel/irq/manage.c | 192 ++++++++++++++++++++++++++++++++++++++++++---- 8 files changed, 259 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index f83288347dda..2dfaadbdb2ac 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -105,7 +105,7 @@ # define IRQ_EXIT_OFFSET HARDIRQ_OFFSET #endif -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) || defined(CONFIG_GENERIC_HARDIRQS) extern void synchronize_irq(unsigned int irq); #else # define synchronize_irq(irq) barrier() diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 0c9cb63e6895..6fc2b720c231 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -59,6 +59,16 @@ #define IRQF_NOBALANCING 0x00000800 #define IRQF_IRQPOLL 0x00001000 +/* + * Bits used by threaded handlers: + * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run + * IRQTF_DIED - handler thread died + */ +enum { + IRQTF_RUNTHREAD, + IRQTF_DIED, +}; + typedef irqreturn_t (*irq_handler_t)(int, void *); /** @@ -71,6 +81,9 @@ typedef irqreturn_t (*irq_handler_t)(int, void *); * @next: pointer to the next irqaction for shared interrupts * @irq: interrupt number * @dir: pointer to the proc/irq/NN/name entry + * @thread_fn: interupt handler function for threaded interrupts + * @thread: thread pointer for threaded interrupts + * @thread_flags: flags related to @thread */ struct irqaction { irq_handler_t handler; @@ -81,11 +94,31 @@ struct irqaction { struct irqaction *next; int irq; struct proc_dir_entry *dir; + irq_handler_t thread_fn; + struct task_struct *thread; + unsigned long thread_flags; }; extern irqreturn_t no_action(int cpl, void *dev_id); -extern int __must_check request_irq(unsigned int, irq_handler_t handler, - unsigned long, const char *, void *); + +extern int __must_check +request_threaded_irq(unsigned int irq, irq_handler_t handler, + irq_handler_t thread_fn, + unsigned long flags, const char *name, void *dev); + +static inline int __must_check +request_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, + const char *name, void *dev) +{ + return request_threaded_irq(irq, handler, NULL, flags, name, dev); +} + +#ifdef CONFIG_GENERIC_HARDIRQS +extern void exit_irq_thread(void); +#else +static inline void exit_irq_thread(void) { } +#endif + extern void free_irq(unsigned int, void *); struct 
device; diff --git a/include/linux/irq.h b/include/linux/irq.h index 873e4ac11b81..8b1cf0630210 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -155,6 +156,8 @@ struct irq_2_iommu; * @affinity: IRQ affinity on SMP * @cpu: cpu index useful for balancing * @pending_mask: pending rebalanced interrupts + * @threads_active: number of irqaction threads currently running + * @wait_for_threads: wait queue for sync_irq to wait for threaded handlers * @dir: /proc/irq/ procfs entry * @name: flow handler name for /proc/interrupts output */ @@ -186,6 +189,8 @@ struct irq_desc { cpumask_var_t pending_mask; #endif #endif + atomic_t threads_active; + wait_queue_head_t wait_for_threads; #ifdef CONFIG_PROC_FS struct proc_dir_entry *dir; #endif diff --git a/include/linux/irqreturn.h b/include/linux/irqreturn.h index c5584ca5b8c9..819acaaac3f5 100644 --- a/include/linux/irqreturn.h +++ b/include/linux/irqreturn.h @@ -5,10 +5,12 @@ * enum irqreturn * @IRQ_NONE interrupt was not from this device * @IRQ_HANDLED interrupt was handled by this device + * @IRQ_WAKE_THREAD handler requests to wake the handler thread */ enum irqreturn { IRQ_NONE, IRQ_HANDLED, + IRQ_WAKE_THREAD, }; typedef enum irqreturn irqreturn_t; diff --git a/include/linux/sched.h b/include/linux/sched.h index 46d680643f89..38b77b0f56e5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1292,6 +1292,11 @@ struct task_struct { /* Protection of (de-)allocation: mm, files, fs, tty, keyrings */ spinlock_t alloc_lock; +#ifdef CONFIG_GENERIC_HARDIRQS + /* IRQ handler threads */ + struct irqaction *irqaction; +#endif + /* Protection of the PI data structures: */ spinlock_t pi_lock; diff --git a/kernel/exit.c b/kernel/exit.c index 167e1e3ad7c6..ca0b3488c4a9 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1037,6 +1037,8 @@ NORET_TYPE void do_exit(long code) schedule(); } + exit_irq_thread(); + exit_signals(tsk); /* sets PF_EXITING */ /* * tsk->flags are checked in the futex code to protect against diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 9ebf77968871..fe8f45374e86 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -357,8 +357,37 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) do { ret = action->handler(irq, action->dev_id); - if (ret == IRQ_HANDLED) + + switch (ret) { + case IRQ_WAKE_THREAD: + /* + * Wake up the handler thread for this + * action. In case the thread crashed and was + * killed we just pretend that we handled the + * interrupt. The hardirq handler above has + * disabled the device interrupt, so no irq + * storm is lurking. + */ + if (likely(!test_bit(IRQTF_DIED, + &action->thread_flags))) { + set_bit(IRQTF_RUNTHREAD, &action->thread_flags); + wake_up_process(action->thread); + } + + /* + * Set it to handled so the spurious check + * does not trigger. 
+ */ + ret = IRQ_HANDLED; + /* Fall through to add to randomness */ + case IRQ_HANDLED: status |= action->flags; + break; + + default: + break; + } + retval |= ret; action = action->next; } while (action); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 6458e99984c0..a4c1ab86cd25 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -8,16 +8,15 @@ */ #include +#include #include #include #include #include +#include #include "internals.h" -#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS) -cpumask_var_t irq_default_affinity; - /** * synchronize_irq - wait for pending IRQ handlers (on other CPUs) * @irq: interrupt number to wait for @@ -53,9 +52,18 @@ void synchronize_irq(unsigned int irq) /* Oops, that failed? */ } while (status & IRQ_INPROGRESS); + + /* + * We made sure that no hardirq handler is running. Now verify + * that no threaded handlers are active. + */ + wait_event(desc->wait_for_threads, !atomic_read(&desc->threads_active)); } EXPORT_SYMBOL(synchronize_irq); +#ifdef CONFIG_SMP +cpumask_var_t irq_default_affinity; + /** * irq_can_set_affinity - Check if the affinity of a given irq can be set * @irq: Interrupt to check @@ -72,6 +80,18 @@ int irq_can_set_affinity(unsigned int irq) return 1; } +static void +irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask) +{ + struct irqaction *action = desc->action; + + while (action) { + if (action->thread) + set_cpus_allowed_ptr(action->thread, cpumask); + action = action->next; + } +} + /** * irq_set_affinity - Set the irq affinity of a given irq * @irq: Interrupt to set affinity @@ -100,6 +120,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) cpumask_copy(desc->affinity, cpumask); desc->chip->set_affinity(irq, cpumask); #endif + irq_set_thread_affinity(desc, cpumask); desc->status |= IRQ_AFFINITY_SET; spin_unlock_irqrestore(&desc->lock, flags); return 0; @@ -150,6 +171,8 @@ int irq_select_affinity_usr(unsigned int irq) spin_lock_irqsave(&desc->lock, flags); ret = setup_affinity(irq, desc); + if (!ret) + irq_set_thread_affinity(desc, desc->affinity); spin_unlock_irqrestore(&desc->lock, flags); return ret; @@ -384,6 +407,93 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, return ret; } +static inline int irq_thread_should_run(struct irqaction *action) +{ + return test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags); +} + +static int irq_wait_for_interrupt(struct irqaction *action) +{ + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + if (irq_thread_should_run(action)) { + __set_current_state(TASK_RUNNING); + return 0; + } else + schedule(); + } + return -1; +} + +/* + * Interrupt handler thread + */ +static int irq_thread(void *data) +{ + struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, }; + struct irqaction *action = data; + struct irq_desc *desc = irq_to_desc(action->irq); + int wake; + + sched_setscheduler(current, SCHED_FIFO, ¶m); + current->irqaction = action; + + while (!irq_wait_for_interrupt(action)) { + + atomic_inc(&desc->threads_active); + + spin_lock_irq(&desc->lock); + if (unlikely(desc->status & IRQ_DISABLED)) { + /* + * CHECKME: We might need a dedicated + * IRQ_THREAD_PENDING flag here, which + * retriggers the thread in check_irq_resend() + * but AFAICT IRQ_PENDING should be fine as it + * retriggers the interrupt itself --- tglx + */ + desc->status |= IRQ_PENDING; + spin_unlock_irq(&desc->lock); + } else { + spin_unlock_irq(&desc->lock); + + action->thread_fn(action->irq, 
action->dev_id); + } + + wake = atomic_dec_and_test(&desc->threads_active); + + if (wake && waitqueue_active(&desc->wait_for_threads)) + wake_up(&desc->wait_for_threads); + } + + /* + * Clear irqaction. Otherwise exit_irq_thread() would make + * fuzz about an active irq thread going into nirvana. + */ + current->irqaction = NULL; + return 0; +} + +/* + * Called from do_exit() + */ +void exit_irq_thread(void) +{ + struct task_struct *tsk = current; + + if (!tsk->irqaction) + return; + + printk(KERN_ERR + "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", + tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq); + + /* + * Set the THREAD DIED flag to prevent further wakeups of the + * soon to be gone threaded handler. + */ + set_bit(IRQTF_DIED, &tsk->irqaction->flags); +} + /* * Internal function to register an irqaction - typically used to * allocate special interrupts that are part of the architecture. @@ -419,6 +529,26 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) rand_initialize_irq(irq); } + /* + * Threaded handler ? + */ + if (new->thread_fn) { + struct task_struct *t; + + t = kthread_create(irq_thread, new, "irq/%d-%s", irq, + new->name); + if (IS_ERR(t)) + return PTR_ERR(t); + /* + * We keep the reference to the task struct even if + * the thread dies to avoid that the interrupt code + * references an already freed task_struct. + */ + get_task_struct(t); + new->thread = t; + wake_up_process(t); + } + /* * The following block of code has to be executed atomically */ @@ -456,15 +586,15 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (!shared) { irq_chip_set_defaults(desc->chip); + init_waitqueue_head(&desc->wait_for_threads); + /* Setup the type (level, edge polarity) if configured: */ if (new->flags & IRQF_TRIGGER_MASK) { ret = __irq_set_trigger(desc, irq, new->flags & IRQF_TRIGGER_MASK); - if (ret) { - spin_unlock_irqrestore(&desc->lock, flags); - return ret; - } + if (ret) + goto out_thread; } else compat_irq_chip_set_default_handler(desc); #if defined(CONFIG_IRQ_PER_CPU) @@ -532,8 +662,19 @@ mismatch: dump_stack(); } #endif + ret = -EBUSY; + +out_thread: spin_unlock_irqrestore(&desc->lock, flags); - return -EBUSY; + if (new->thread) { + struct task_struct *t = new->thread; + + new->thread = NULL; + if (likely(!test_bit(IRQTF_DIED, &new->thread_flags))) + kthread_stop(t); + put_task_struct(t); + } + return ret; } /** @@ -559,6 +700,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) { struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action, **action_ptr; + struct task_struct *irqthread; unsigned long flags; WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); @@ -605,6 +747,10 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) else desc->chip->disable(irq); } + + irqthread = action->thread; + action->thread = NULL; + spin_unlock_irqrestore(&desc->lock, flags); unregister_handler_proc(irq, action); @@ -612,6 +758,12 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) /* Make sure it's not being used on another CPU: */ synchronize_irq(irq); + if (irqthread) { + if (!test_bit(IRQTF_DIED, &action->thread_flags)) + kthread_stop(irqthread); + put_task_struct(irqthread); + } + #ifdef CONFIG_DEBUG_SHIRQ /* * It's a shared IRQ -- the driver ought to be prepared for an IRQ @@ -664,9 +816,12 @@ void free_irq(unsigned int irq, void *dev_id) EXPORT_SYMBOL(free_irq); /** - * request_irq - allocate an interrupt line + * 
request_threaded_irq - allocate an interrupt line * @irq: Interrupt line to allocate - * @handler: Function to be called when the IRQ occurs + * @handler: Function to be called when the IRQ occurs. + * Primary handler for threaded interrupts + * @thread_fn: Function called from the irq handler thread + * If NULL, no irq thread is created * @irqflags: Interrupt type flags * @devname: An ascii name for the claiming device * @dev_id: A cookie passed back to the handler function @@ -678,6 +833,15 @@ EXPORT_SYMBOL(free_irq); * raises, you must take care both to initialise your hardware * and to set up the interrupt handler in the right order. * + * If you want to set up a threaded irq handler for your device + * then you need to supply @handler and @thread_fn. @handler ist + * still called in hard interrupt context and has to check + * whether the interrupt originates from the device. If yes it + * needs to disable the interrupt on the device and return + * IRQ_THREAD_WAKE which will wake up the handler thread and run + * @thread_fn. This split handler design is necessary to support + * shared interrupts. + * * Dev_id must be globally unique. Normally the address of the * device data structure is used as the cookie. Since the handler * receives this value it makes sense to use it. @@ -693,8 +857,9 @@ EXPORT_SYMBOL(free_irq); * IRQF_TRIGGER_* Specify active edge(s) or level * */ -int request_irq(unsigned int irq, irq_handler_t handler, - unsigned long irqflags, const char *devname, void *dev_id) +int request_threaded_irq(unsigned int irq, irq_handler_t handler, + irq_handler_t thread_fn, unsigned long irqflags, + const char *devname, void *dev_id) { struct irqaction *action; struct irq_desc *desc; @@ -742,6 +907,7 @@ int request_irq(unsigned int irq, irq_handler_t handler, return -ENOMEM; action->handler = handler; + action->thread_fn = thread_fn; action->flags = irqflags; action->name = devname; action->dev_id = dev_id; @@ -771,4 +937,4 @@ int request_irq(unsigned int irq, irq_handler_t handler, #endif return retval; } -EXPORT_SYMBOL(request_irq); +EXPORT_SYMBOL(request_threaded_irq); -- cgit v1.2.3 From 935bd5b971f0df7c06d214d022cf8392e2f37952 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Mon, 23 Mar 2009 18:28:16 +0100 Subject: genirq: add support for threaded interrupts to devres Some devices use devres_request_irq() for to install their interrupt handler. Add support for threaded interrupts to devres as well. 
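
As a quick illustration of the interface added by this patch (and by the preceding request_threaded_irq() patch), here is a minimal, hypothetical driver sketch -- the "foo" device, its register offsets and the IRQF_SHARED choice are invented purely for illustration. The hard-IRQ handler checks and masks the device, then returns IRQ_WAKE_THREAD so the heavy lifting runs in the thread function; the managed variant ties the irq's lifetime to the struct device:

#include <linux/interrupt.h>
#include <linux/device.h>
#include <linux/io.h>

struct foo_dev {
        void __iomem *regs;     /* hypothetical register window */
        struct device *dev;
        int irq;
};

static irqreturn_t foo_hardirq(int irq, void *data)
{
        struct foo_dev *foo = data;

        /* Hard interrupt context: is this interrupt really ours? */
        if (!(readl(foo->regs + 0x00) & 0x1))
                return IRQ_NONE;

        /*
         * Disable the interrupt at the device level before waking the
         * thread (mandatory for shared interrupts).
         */
        writel(0x0, foo->regs + 0x04);
        return IRQ_WAKE_THREAD;
}

static irqreturn_t foo_thread_fn(int irq, void *data)
{
        struct foo_dev *foo = data;

        /* Process context: may sleep and talk to the device at length. */

        /* Re-enable the device interrupt when done. */
        writel(0x1, foo->regs + 0x04);
        return IRQ_HANDLED;
}

static int foo_setup_irq(struct foo_dev *foo)
{
        return devm_request_threaded_irq(foo->dev, foo->irq,
                                         foo_hardirq, foo_thread_fn,
                                         IRQF_SHARED, "foo", foo);
}

With the devres variant the interrupt is released automatically when the device is unbound, so no explicit free_irq() is needed in the error or remove paths.
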
[tglx - simplified and adapted to latest threadirq version] Signed-off-by: Arjan van de Ven Signed-off-by: Thomas Gleixner --- include/linux/interrupt.h | 17 ++++++++++++++--- kernel/irq/devres.c | 16 ++++++++++------ 2 files changed, 24 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 6fc2b720c231..dbf6a6fd116c 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -123,9 +123,20 @@ extern void free_irq(unsigned int, void *); struct device; -extern int __must_check devm_request_irq(struct device *dev, unsigned int irq, - irq_handler_t handler, unsigned long irqflags, - const char *devname, void *dev_id); +extern int __must_check +devm_request_threaded_irq(struct device *dev, unsigned int irq, + irq_handler_t handler, irq_handler_t thread_fn, + unsigned long irqflags, const char *devname, + void *dev_id); + +static inline int __must_check +devm_request_irq(struct device *dev, unsigned int irq, irq_handler_t handler, + unsigned long irqflags, const char *devname, void *dev_id) +{ + return devm_request_threaded_irq(dev, irq, handler, NULL, irqflags, + devname, dev_id); +} + extern void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id); /* diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index 38a25b8d8bff..d06df9c41cba 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c @@ -26,10 +26,12 @@ static int devm_irq_match(struct device *dev, void *res, void *data) } /** - * devm_request_irq - allocate an interrupt line for a managed device + * devm_request_threaded_irq - allocate an interrupt line for a managed device * @dev: device to request interrupt for * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs + * @thread_fn: function to be called in a threaded interrupt context. NULL + * for devices which handle everything in @handler * @irqflags: Interrupt type flags * @devname: An ascii name for the claiming device * @dev_id: A cookie passed back to the handler function @@ -42,9 +44,10 @@ static int devm_irq_match(struct device *dev, void *res, void *data) * If an IRQ allocated with this function needs to be freed * separately, dev_free_irq() must be used. */ -int devm_request_irq(struct device *dev, unsigned int irq, - irq_handler_t handler, unsigned long irqflags, - const char *devname, void *dev_id) +int devm_request_threaded_irq(struct device *dev, unsigned int irq, + irq_handler_t handler, irq_handler_t thread_fn, + unsigned long irqflags, const char *devname, + void *dev_id) { struct irq_devres *dr; int rc; @@ -54,7 +57,8 @@ int devm_request_irq(struct device *dev, unsigned int irq, if (!dr) return -ENOMEM; - rc = request_irq(irq, handler, irqflags, devname, dev_id); + rc = request_threaded_irq(irq, handler, thread_fn, irqflags, devname, + dev_id); if (rc) { devres_free(dr); return rc; @@ -66,7 +70,7 @@ int devm_request_irq(struct device *dev, unsigned int irq, return 0; } -EXPORT_SYMBOL(devm_request_irq); +EXPORT_SYMBOL(devm_request_threaded_irq); /** * devm_free_irq - free an interrupt -- cgit v1.2.3 From f48fe81e5b032914183e9a17052313720c2cac56 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Mar 2009 11:46:22 +0100 Subject: genirq: threaded irq handlers review fixups Delta patch to address the review comments. 
- Implement warning when IRQ_WAKE_THREAD is requested and no thread handler installed - coding style fixes Pointed-out-by: Christoph Hellwig Signed-off-by: Thomas Gleixner --- include/linux/interrupt.h | 2 ++ kernel/irq/handle.c | 29 ++++++++++++++++++++++++----- kernel/irq/manage.c | 17 +++++++---------- 3 files changed, 33 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index dbf6a6fd116c..266a5f5f57cc 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -63,10 +63,12 @@ * Bits used by threaded handlers: * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run * IRQTF_DIED - handler thread died + * IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed */ enum { IRQTF_RUNTHREAD, IRQTF_DIED, + IRQTF_WARNED, }; typedef irqreturn_t (*irq_handler_t)(int, void *); diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index fe8f45374e86..38b49a9e508a 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -338,6 +338,15 @@ irqreturn_t no_action(int cpl, void *dev_id) return IRQ_NONE; } +static void warn_no_thread(unsigned int irq, struct irqaction *action) +{ + if (test_and_set_bit(IRQTF_WARNED, &action->thread_flags)) + return; + + printk(KERN_WARNING "IRQ %d device %s returned IRQ_WAKE_THREAD " + "but no thread function available.", irq, action->name); +} + /** * handle_IRQ_event - irq action chain handler * @irq: the interrupt number @@ -360,6 +369,21 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) switch (ret) { case IRQ_WAKE_THREAD: + /* + * Set result to handled so the spurious check + * does not trigger. + */ + ret = IRQ_HANDLED; + + /* + * Catch drivers which return WAKE_THREAD but + * did not set up a thread function + */ + if (unlikely(!action->thread_fn)) { + warn_no_thread(irq, action); + break; + } + /* * Wake up the handler thread for this * action. In case the thread crashed and was @@ -374,11 +398,6 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) wake_up_process(action->thread); } - /* - * Set it to handled so the spurious check - * does not trigger. - */ - ret = IRQ_HANDLED; /* Fall through to add to randomness */ case IRQ_HANDLED: status |= action->flags; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index a4c1ab86cd25..a3eb7baf1e46 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -407,20 +407,17 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, return ret; } -static inline int irq_thread_should_run(struct irqaction *action) -{ - return test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags); -} - static int irq_wait_for_interrupt(struct irqaction *action) { while (!kthread_should_stop()) { set_current_state(TASK_INTERRUPTIBLE); - if (irq_thread_should_run(action)) { + + if (test_and_clear_bit(IRQTF_RUNTHREAD, + &action->thread_flags)) { __set_current_state(TASK_RUNNING); return 0; - } else - schedule(); + } + schedule(); } return -1; } @@ -820,8 +817,8 @@ EXPORT_SYMBOL(free_irq); * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs. 
* Primary handler for threaded interrupts - * @thread_fn: Function called from the irq handler thread - * If NULL, no irq thread is created + * @thread_fn: Function called from the irq handler thread + * If NULL, no irq thread is created * @irqflags: Interrupt type flags * @devname: An ascii name for the claiming device * @dev_id: A cookie passed back to the handler function -- cgit v1.2.3 From 5d1a03dc541dc6672e60e57249ed22f40654ca47 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 23 Mar 2009 23:38:49 -0400 Subject: function-graph: moved the timestamp from arch to generic code This patch move the timestamp from happening in the arch specific code into the general code. This allows for better control by the tracer to time manipulation. Signed-off-by: Steven Rostedt --- arch/x86/kernel/ftrace.c | 6 +----- include/linux/ftrace.h | 3 +-- kernel/trace/trace_functions_graph.c | 8 +++++--- 3 files changed, 7 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 57b33edb7ce3..61df77532120 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -410,7 +410,6 @@ int ftrace_disable_ftrace_graph_caller(void) void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) { unsigned long old; - unsigned long long calltime; int faulted; struct ftrace_graph_ent trace; unsigned long return_hooker = (unsigned long) @@ -453,10 +452,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) return; } - calltime = trace_clock_local(); - - if (ftrace_push_return_trace(old, calltime, - self_addr, &trace.depth) == -EBUSY) { + if (ftrace_push_return_trace(old, self_addr, &trace.depth) == -EBUSY) { *parent = old; return; } diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index db3fed630db3..1141248c84ee 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -369,8 +369,7 @@ struct ftrace_ret_stack { extern void return_to_handler(void); extern int -ftrace_push_return_trace(unsigned long ret, unsigned long long time, - unsigned long func, int *depth); +ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth); extern void ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index e876816fa8e7..d28687e7b3a7 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -57,9 +57,9 @@ static struct tracer_flags tracer_flags = { /* Add a function return address to the trace stack on thread info.*/ int -ftrace_push_return_trace(unsigned long ret, unsigned long long time, - unsigned long func, int *depth) +ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth) { + unsigned long long calltime; int index; if (!current->ret_stack) @@ -71,11 +71,13 @@ ftrace_push_return_trace(unsigned long ret, unsigned long long time, return -EBUSY; } + calltime = trace_clock_local(); + index = ++current->curr_ret_stack; barrier(); current->ret_stack[index].ret = ret; current->ret_stack[index].func = func; - current->ret_stack[index].calltime = time; + current->ret_stack[index].calltime = calltime; *depth = index; return 0; -- cgit v1.2.3 From 8aef2d2856158a36c295a8d1288281e4839bff13 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 24 Mar 2009 01:10:15 -0400 Subject: function-graph: ignore times across schedule Impact: more accurate timings The current method of function graph tracing does not 
take into account the time spent when a task is not running. This shows functions that call schedule have increased costs: 3) + 18.664 us | } ------------------------------------------ 3) -0 => kblockd-123 ------------------------------------------ 3) | finish_task_switch() { 3) 1.441 us | _spin_unlock_irq(); 3) 3.966 us | } 3) ! 2959.433 us | } 3) ! 2961.465 us | } This patch uses the tracepoint in the scheduling context switch to account for time that has elapsed while a task is scheduled out. Now we see: ------------------------------------------ 3) -0 => edac-po-1067 ------------------------------------------ 3) | finish_task_switch() { 3) 0.685 us | _spin_unlock_irq(); 3) 2.331 us | } 3) + 41.439 us | } 3) + 42.663 us | } Signed-off-by: Steven Rostedt --- include/linux/sched.h | 2 ++ kernel/trace/ftrace.c | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 89cd308cc7a5..471e36d30123 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1409,6 +1409,8 @@ struct task_struct { int curr_ret_stack; /* Stack of return addresses for return function tracing */ struct ftrace_ret_stack *ret_stack; + /* time stamp for last schedule */ + unsigned long long ftrace_timestamp; /* * Number of functions that haven't been traced * because of depth overrun. diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index c81a759fbf76..0b90364d1a2c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -29,6 +29,8 @@ #include #include +#include + #include #include "trace.h" @@ -2590,6 +2592,31 @@ free: return ret; } +static void +ftrace_graph_probe_sched_switch(struct rq *__rq, struct task_struct *prev, + struct task_struct *next) +{ + unsigned long long timestamp; + int index; + + timestamp = trace_clock_local(); + + prev->ftrace_timestamp = timestamp; + + /* only process tasks that we timestamped */ + if (!next->ftrace_timestamp) + return; + + /* + * Update all the counters in next to make up for the + * time next was sleeping. + */ + timestamp -= next->ftrace_timestamp; + + for (index = next->curr_ret_stack; index >= 0; index--) + next->ret_stack[index].calltime += timestamp; +} + /* Allocate a return stack for each task */ static int start_graph_tracing(void) { @@ -2611,6 +2638,13 @@ static int start_graph_tracing(void) ret = alloc_retstack_tasklist(ret_stack_list); } while (ret == -EAGAIN); + if (!ret) { + ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch); + if (ret) + pr_info("ftrace_graph: Couldn't activate tracepoint" + " probe to kernel_sched_switch\n"); + } + kfree(ret_stack_list); return ret; } @@ -2674,6 +2708,7 @@ void unregister_ftrace_graph(void) mutex_lock(&ftrace_lock); atomic_dec(&ftrace_graph_active); + unregister_trace_sched_switch(ftrace_graph_probe_sched_switch); ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; ftrace_graph_entry = ftrace_graph_entry_stub; ftrace_shutdown(FTRACE_STOP_FUNC_RET); @@ -2694,6 +2729,7 @@ void ftrace_graph_init_task(struct task_struct *t) t->curr_ret_stack = -1; atomic_set(&t->tracing_graph_pause, 0); atomic_set(&t->trace_overrun, 0); + t->ftrace_timestamp = 0; } else t->ret_stack = NULL; } -- cgit v1.2.3 From ee000b7f9fe429d2470c674ccec8d344f6789e0d Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 24 Mar 2009 13:38:06 +0800 Subject: tracing: use union for multi-usages field Impact: cleanup struct dyn_ftrace::ip has different usages in his lifecycle, we use union for it. 
And also for struct dyn_ftrace::flags. Signed-off-by: Lai Jiangshan Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <49C871BE.3080405@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 12 +++++++++--- kernel/trace/ftrace.c | 8 ++++---- 2 files changed, 13 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 1141248c84ee..015a3d22cf74 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -145,9 +145,15 @@ enum { }; struct dyn_ftrace { - unsigned long ip; /* address of mcount call-site */ - unsigned long flags; - struct dyn_arch_ftrace arch; + union { + unsigned long ip; /* address of mcount call-site */ + struct dyn_ftrace *freelist; + }; + union { + unsigned long flags; + struct dyn_ftrace *newlist; + }; + struct dyn_arch_ftrace arch; }; int ftrace_force_update(void); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index bb377112b1bb..7b8722baf153 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -341,7 +341,7 @@ static inline int record_frozen(struct dyn_ftrace *rec) static void ftrace_free_rec(struct dyn_ftrace *rec) { - rec->ip = (unsigned long)ftrace_free_records; + rec->freelist = ftrace_free_records; ftrace_free_records = rec; rec->flags |= FTRACE_FL_FREE; } @@ -379,7 +379,7 @@ static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) return NULL; } - ftrace_free_records = (void *)rec->ip; + ftrace_free_records = rec->freelist; memset(rec, 0, sizeof(*rec)); return rec; } @@ -411,7 +411,7 @@ ftrace_record_ip(unsigned long ip) return NULL; rec->ip = ip; - rec->flags = (unsigned long)ftrace_new_addrs; + rec->newlist = ftrace_new_addrs; ftrace_new_addrs = rec; return rec; @@ -731,7 +731,7 @@ static int ftrace_update_code(struct module *mod) return -1; p = ftrace_new_addrs; - ftrace_new_addrs = (struct dyn_ftrace *)p->flags; + ftrace_new_addrs = p->newlist; p->flags = 0L; /* convert record (i.e, patch mcount-call with NOP) */ -- cgit v1.2.3 From 3a38148f0488069cadb75c4a6909954072d648bf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Mar 2009 20:27:39 +0100 Subject: genirq: provide old request_irq() for CONFIG_GENERIC_HARDIRQ=n Impact: Undo compile breakage for archs with CONFIG_GENERIC_HARDIRQ=n The threaded interrupt handler patches changed request_irq from extern to inline. Architectures which do not use the generic irq code still have request_irq() as a global function and therefor fail to compile. 
Keep the extern declaration for CONFIG_GENERIC_HARDIRQ=n Signed-off-by: Thomas Gleixner --- include/linux/interrupt.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 266a5f5f57cc..7e63b824833f 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -103,6 +103,7 @@ struct irqaction { extern irqreturn_t no_action(int cpl, void *dev_id); +#ifdef CONFIG_GENERIC_HARDIRQS extern int __must_check request_threaded_irq(unsigned int irq, irq_handler_t handler, irq_handler_t thread_fn, @@ -115,9 +116,13 @@ request_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, return request_threaded_irq(irq, handler, NULL, flags, name, dev); } -#ifdef CONFIG_GENERIC_HARDIRQS extern void exit_irq_thread(void); #else + +extern int __must_check +request_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, + const char *name, void *dev); + static inline void exit_irq_thread(void) { } #endif -- cgit v1.2.3 From 54aee6a5f560d0e1bf3f39987c6ebe06daeb0ce1 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 25 Mar 2009 09:13:24 -0700 Subject: dmaengine: kill some unused headers The dmaengine redux left some unneeded headers in include/linux/dmaengine.h, clean them up. Signed-off-by: Dan Williams --- include/linux/dmaengine.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 1956c8d46d32..96e676e5bf9b 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -23,9 +23,6 @@ #include #include -#include -#include -#include #include /** -- cgit v1.2.3 From 06164f3194e01ea4c76941ac60f541d656c8975f Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 25 Mar 2009 09:13:25 -0700 Subject: async_tx: provide __async_inline for HAS_DMA=n archs To allow an async_tx routine to be compiled away on HAS_DMA=n arch it needs to be declared __always_inline otherwise the compiler may emit code and cause a link error. Signed-off-by: Dan Williams --- crypto/async_tx/async_xor.c | 7 ++----- include/linux/async_tx.h | 9 +++++++++ 2 files changed, 11 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/crypto/async_tx/async_xor.c b/crypto/async_tx/async_xor.c index 595b78672b36..95fe2c8d6c51 100644 --- a/crypto/async_tx/async_xor.c +++ b/crypto/async_tx/async_xor.c @@ -30,11 +30,8 @@ #include #include -/* do_async_xor - dma map the pages and perform the xor with an engine. - * This routine is marked __always_inline so it can be compiled away - * when CONFIG_DMA_ENGINE=n - */ -static __always_inline struct dma_async_tx_descriptor * +/* do_async_xor - dma map the pages and perform the xor with an engine */ +static __async_inline struct dma_async_tx_descriptor * do_async_xor(struct dma_chan *chan, struct page *dest, struct page **src_list, unsigned int offset, int src_cnt, size_t len, enum async_tx_flags flags, diff --git a/include/linux/async_tx.h b/include/linux/async_tx.h index 45f6297821bd..5fc2ef8d97fa 100644 --- a/include/linux/async_tx.h +++ b/include/linux/async_tx.h @@ -21,6 +21,15 @@ #include #include +/* on architectures without dma-mapping capabilities we need to ensure + * that the asynchronous path compiles away + */ +#ifdef CONFIG_HAS_DMA +#define __async_inline +#else +#define __async_inline __always_inline +#endif + /** * dma_chan_ref - object used to manage dma channels received from the * dmaengine core. 
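
Restated outside the patch context, the idiom works as follows (the do_async_copy() helper below is hypothetical and merely mirrors the existing async_memcpy pattern; it is not part of this change). Callers pass a NULL channel whenever no DMA engine backend is available, so on HAS_DMA=n architectures, where __async_inline expands to __always_inline, the offload branch folds away at compile time and the object file ends up with no references to dmaengine or dma-mapping symbols those architectures do not provide:

#include <linux/async_tx.h>
#include <linux/dma-mapping.h>

static __async_inline struct dma_async_tx_descriptor *
do_async_copy(struct dma_chan *chan, struct page *dest, struct page *src,
              unsigned int offset, size_t len)
{
        if (chan) {
                struct dma_device *dma = chan->device;
                dma_addr_t dma_dest, dma_src;

                /* Map the pages and hand the copy to the engine. */
                dma_dest = dma_map_page(dma->dev, dest, offset, len,
                                        DMA_FROM_DEVICE);
                dma_src = dma_map_page(dma->dev, src, offset, len,
                                       DMA_TO_DEVICE);
                return dma->device_prep_dma_memcpy(chan, dma_dest, dma_src,
                                                   len, DMA_PREP_INTERRUPT);
        }

        /* No channel: the caller falls back to a synchronous memcpy. */
        return NULL;
}
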
-- cgit v1.2.3 From 729b5d1b8ec72c28e99840b3f300ba67726e3ab9 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 25 Mar 2009 09:13:25 -0700 Subject: dmaengine: allow dma support for async_tx to be toggled Provide a config option for blocking the allocation of dma channels to the async_tx api. Signed-off-by: Dan Williams --- crypto/async_tx/async_tx.c | 6 +++--- drivers/dma/Kconfig | 11 +++++++++++ include/linux/dmaengine.h | 18 ++++++++++++++++++ 3 files changed, 32 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/crypto/async_tx/async_tx.c b/crypto/async_tx/async_tx.c index f21147f3626a..06eb6cc09fef 100644 --- a/crypto/async_tx/async_tx.c +++ b/crypto/async_tx/async_tx.c @@ -30,7 +30,7 @@ #ifdef CONFIG_DMA_ENGINE static int __init async_tx_init(void) { - dmaengine_get(); + async_dmaengine_get(); printk(KERN_INFO "async_tx: api initialized (async)\n"); @@ -39,7 +39,7 @@ static int __init async_tx_init(void) static void __exit async_tx_exit(void) { - dmaengine_put(); + async_dmaengine_put(); } /** @@ -56,7 +56,7 @@ __async_tx_find_channel(struct dma_async_tx_descriptor *depend_tx, if (depend_tx && dma_has_cap(tx_type, depend_tx->chan->device->cap_mask)) return depend_tx->chan; - return dma_find_channel(tx_type); + return async_dma_find_channel(tx_type); } EXPORT_SYMBOL_GPL(__async_tx_find_channel); #else diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index 48ea59e79672..3b3c01b6f1ee 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -98,6 +98,17 @@ config NET_DMA Say Y here if you enabled INTEL_IOATDMA or FSL_DMA, otherwise say N. +config ASYNC_TX_DMA + bool "Async_tx: Offload support for the async_tx api" + depends on DMA_ENGINE + help + This allows the async_tx api to take advantage of offload engines for + memcpy, memset, xor, and raid6 p+q operations. If your platform has + a dma engine that can perform raid operations and you have enabled + MD_RAID456 say Y. + + If unsure, say N. + config DMATEST tristate "DMA Test client" depends on DMA_ENGINE diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 96e676e5bf9b..2afc2c95e42d 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -288,6 +288,24 @@ static inline void net_dmaengine_put(void) } #endif +#ifdef CONFIG_ASYNC_TX_DMA +#define async_dmaengine_get() dmaengine_get() +#define async_dmaengine_put() dmaengine_put() +#define async_dma_find_channel(type) dma_find_channel(type) +#else +static inline void async_dmaengine_get(void) +{ +} +static inline void async_dmaengine_put(void) +{ +} +static inline struct dma_chan * +async_dma_find_channel(enum dma_transaction_type type) +{ + return NULL; +} +#endif + dma_cookie_t dma_async_memcpy_buf_to_buf(struct dma_chan *chan, void *dest, void *src, size_t len); dma_cookie_t dma_async_memcpy_buf_to_pg(struct dma_chan *chan, -- cgit v1.2.3 From de18836e447c2dc30120c0919b8db8ddc0401cc4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 25 Mar 2009 17:33:38 +0100 Subject: genirq: fix devres.o build for GENERIC_HARDIRQS=n kernel/irq/devres.c is built by sparc (32bit) and m68k via the obscure ../../../kernel/irq/devres.o reference in arch/[sparc/m68k]/kernel/Makefile To avoid ifdeffery in devres.c provide request_threaded_irq as an inline for these users. 
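
The net effect for a caller can be sketched as follows (the "bar" handler names are hypothetical). With the static inline added by this patch, architectures without GENERIC_HARDIRQS simply drop the thread function, so the primary handler must be able to do the complete work on its own; everywhere else the call goes through the real threaded-irq setup:

static irqreturn_t bar_handler(int irq, void *dev_id)
{
        /* With the stub, this is the only handler that will ever run. */
        return IRQ_HANDLED;
}

static irqreturn_t bar_thread_fn(int irq, void *dev_id)
{
        /* Ignored on GENERIC_HARDIRQS=n architectures. */
        return IRQ_HANDLED;
}

static int bar_request(unsigned int irq, void *dev_id)
{
        /*
         * Resolves to request_irq(irq, bar_handler, IRQF_SHARED,
         * "bar", dev_id) on sparc32/m68k, and to the real
         * request_threaded_irq() everywhere else.
         */
        return request_threaded_irq(irq, bar_handler, bar_thread_fn,
                                    IRQF_SHARED, "bar", dev_id);
}
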
Signed-off-by: Thomas Gleixner --- include/linux/interrupt.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 7e63b824833f..143192f48bf3 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -123,6 +123,20 @@ extern int __must_check request_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *name, void *dev); +/* + * Special function to avoid ifdeffery in kernel/irq/devres.c which + * gets magically built by GENERIC_HARDIRQS=n architectures (sparc, + * m68k). I really love these $@%#!* obvious Makefile references: + * ../../../kernel/irq/devres.o + */ +static inline int __must_check +request_threaded_irq(unsigned int irq, irq_handler_t handler, + irq_handler_t thread_fn, + unsigned long flags, const char *name, void *dev) +{ + return request_irq(irq, handler, flags, name, dev); +} + static inline void exit_irq_thread(void) { } #endif -- cgit v1.2.3 From 0361a28d3f9a4315a100c7b37ba0b55cfe15fe07 Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Wed, 17 Dec 2008 15:38:03 +0100 Subject: HID: autosuspend support for USB HID This uses the USB busy mechanism for aggessive autosuspend of USB HID devices. It autosuspends all opened devices supporting remote wakeup after a timeout unless - output is being done to the device - a key is being held down (remote wakeup isn't triggered upon key release) - LED(s) are lit - hiddev is opened As in the current driver closed devices will be autosuspended even if they don't support remote wakeup. The patch is quite large because output to devices is done in hard interrupt context meaning a lot a queuing and locking had to be touched. The LED stuff has been solved by means of a simple counter. Additions to the generic HID code could be avoided. In addition it now covers hidraw. It contains an embryonic version of an API to let the generic HID code tell the lower levels which capabilities with respect to power management are needed. 
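
A rough sketch of the embryonic power-management interface mentioned above (the "foohid" transport is hypothetical; only the ll_driver ->power hook and the PM_HINT_FULLON/PM_HINT_NORMAL hints come from this patch): hidraw and hiddev call ->power(hid, PM_HINT_FULLON) while they need the device fully responsive and ->power(hid, PM_HINT_NORMAL) once normal runtime power management may resume, and a transport maps that onto whatever keeps its link awake -- for USB HID that is the autopm interface shown in the diff below:

#include <linux/hid.h>

struct foohid_device {
        struct hid_device *hid;
        int busy_count;         /* outstanding PM_HINT_FULLON requests */
};

static int foohid_power(struct hid_device *hid, int lvl)
{
        struct foohid_device *fd = hid->driver_data;

        switch (lvl) {
        case PM_HINT_FULLON:
                /* e.g. take a runtime-PM reference on the transport here */
                fd->busy_count++;
                break;
        case PM_HINT_NORMAL:
                /* drop it again so autosuspend can kick in */
                fd->busy_count--;
                break;
        }
        return 0;
}

static struct hid_ll_driver foohid_ll_driver = {
        /* .parse, .start, .stop, .open, .close omitted from this sketch */
        .power  = foohid_power,
};
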
Signed-off-by: Oliver Neukum Signed-off-by: Jiri Kosina --- drivers/hid/hid-core.c | 16 ++ drivers/hid/hidraw.c | 17 +- drivers/hid/usbhid/hid-core.c | 435 +++++++++++++++++++++++++++++++++--------- drivers/hid/usbhid/hiddev.c | 17 +- drivers/hid/usbhid/usbhid.h | 14 +- include/linux/hid.h | 6 + 6 files changed, 402 insertions(+), 103 deletions(-) (limited to 'include/linux') diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index 1cc967448f4d..feaeb6167ea4 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -1822,6 +1822,22 @@ static DECLARE_WORK(hid_compat_work, hid_compat_load); static struct workqueue_struct *hid_compat_wq; #endif +int hid_check_keys_pressed(struct hid_device *hid) +{ + struct hid_input *hidinput; + int i; + + list_for_each_entry(hidinput, &hid->inputs, list) { + for (i = 0; i < BITS_TO_LONGS(KEY_MAX); i++) + if (hidinput->input->key[i]) + return 1; + } + + return 0; +} + +EXPORT_SYMBOL_GPL(hid_check_keys_pressed); + static int __init hid_init(void) { int ret; diff --git a/drivers/hid/hidraw.c b/drivers/hid/hidraw.c index 02b19db5442e..e263d4731179 100644 --- a/drivers/hid/hidraw.c +++ b/drivers/hid/hidraw.c @@ -181,9 +181,17 @@ static int hidraw_open(struct inode *inode, struct file *file) dev = hidraw_table[minor]; if (!dev->open++) { + if (dev->hid->ll_driver->power) { + err = dev->hid->ll_driver->power(dev->hid, PM_HINT_FULLON); + if (err < 0) + goto out_unlock; + } err = dev->hid->ll_driver->open(dev->hid); - if (err < 0) + if (err < 0) { + if (dev->hid->ll_driver->power) + dev->hid->ll_driver->power(dev->hid, PM_HINT_NORMAL); dev->open--; + } } out_unlock: @@ -209,10 +217,13 @@ static int hidraw_release(struct inode * inode, struct file * file) list_del(&list->node); dev = hidraw_table[minor]; if (!--dev->open) { - if (list->hidraw->exist) + if (list->hidraw->exist) { + if (dev->hid->ll_driver->power) + dev->hid->ll_driver->power(dev->hid, PM_HINT_NORMAL); dev->hid->ll_driver->close(dev->hid); - else + } else { kfree(list->hidraw); + } } kfree(list); diff --git a/drivers/hid/usbhid/hid-core.c b/drivers/hid/usbhid/hid-core.c index f0a0f72238ab..625e7e8eb373 100644 --- a/drivers/hid/usbhid/hid-core.c +++ b/drivers/hid/usbhid/hid-core.c @@ -5,6 +5,7 @@ * Copyright (c) 2000-2005 Vojtech Pavlik * Copyright (c) 2005 Michael Haboustak for Concept2, Inc * Copyright (c) 2006-2008 Jiri Kosina + * Copyright (c) 2007-2008 Oliver Neukum */ /* @@ -27,6 +28,7 @@ #include #include #include +#include #include @@ -53,6 +55,10 @@ static unsigned int hid_mousepoll_interval; module_param_named(mousepoll, hid_mousepoll_interval, uint, 0644); MODULE_PARM_DESC(mousepoll, "Polling interval of mice"); +static unsigned int ignoreled; +module_param_named(ignoreled, ignoreled, uint, 0644); +MODULE_PARM_DESC(ignoreled, "Autosuspend with active leds"); + /* Quirks specified at module load time */ static char *quirks_param[MAX_USBHID_BOOT_QUIRKS] = { [ 0 ... (MAX_USBHID_BOOT_QUIRKS - 1) ] = NULL }; module_param_array_named(quirks, quirks_param, charp, NULL, 0444); @@ -63,8 +69,13 @@ MODULE_PARM_DESC(quirks, "Add/modify USB HID quirks by specifying " /* * Input submission and I/O error handler. 
*/ +static DEFINE_MUTEX(hid_open_mut); +static struct workqueue_struct *resumption_waker; static void hid_io_error(struct hid_device *hid); +static int hid_submit_out(struct hid_device *hid); +static int hid_submit_ctrl(struct hid_device *hid); +static void hid_cancel_delayed_stuff(struct usbhid_device *usbhid); /* Start up the input URB */ static int hid_start_in(struct hid_device *hid) @@ -73,15 +84,16 @@ static int hid_start_in(struct hid_device *hid) int rc = 0; struct usbhid_device *usbhid = hid->driver_data; - spin_lock_irqsave(&usbhid->inlock, flags); - if (hid->open > 0 && !test_bit(HID_SUSPENDED, &usbhid->iofl) && + spin_lock_irqsave(&usbhid->lock, flags); + if (hid->open > 0 && !test_bit(HID_DISCONNECTED, &usbhid->iofl) && + !test_bit(HID_REPORTED_IDLE, &usbhid->iofl) && !test_and_set_bit(HID_IN_RUNNING, &usbhid->iofl)) { rc = usb_submit_urb(usbhid->urbin, GFP_ATOMIC); if (rc != 0) clear_bit(HID_IN_RUNNING, &usbhid->iofl); } - spin_unlock_irqrestore(&usbhid->inlock, flags); + spin_unlock_irqrestore(&usbhid->lock, flags); return rc; } @@ -145,7 +157,7 @@ static void hid_io_error(struct hid_device *hid) unsigned long flags; struct usbhid_device *usbhid = hid->driver_data; - spin_lock_irqsave(&usbhid->inlock, flags); + spin_lock_irqsave(&usbhid->lock, flags); /* Stop when disconnected */ if (test_bit(HID_DISCONNECTED, &usbhid->iofl)) @@ -175,7 +187,51 @@ static void hid_io_error(struct hid_device *hid) mod_timer(&usbhid->io_retry, jiffies + msecs_to_jiffies(usbhid->retry_delay)); done: - spin_unlock_irqrestore(&usbhid->inlock, flags); + spin_unlock_irqrestore(&usbhid->lock, flags); +} + +static void usbhid_mark_busy(struct usbhid_device *usbhid) +{ + struct usb_interface *intf = usbhid->intf; + + usb_mark_last_busy(interface_to_usbdev(intf)); +} + +static int usbhid_restart_out_queue(struct usbhid_device *usbhid) +{ + struct hid_device *hid = usb_get_intfdata(usbhid->intf); + int kicked; + + if (!hid) + return 0; + + if ((kicked = (usbhid->outhead != usbhid->outtail))) { + dbg("Kicking head %d tail %d", usbhid->outhead, usbhid->outtail); + if (hid_submit_out(hid)) { + clear_bit(HID_OUT_RUNNING, &usbhid->iofl); + wake_up(&usbhid->wait); + } + } + return kicked; +} + +static int usbhid_restart_ctrl_queue(struct usbhid_device *usbhid) +{ + struct hid_device *hid = usb_get_intfdata(usbhid->intf); + int kicked; + + WARN_ON(hid == NULL); + if (!hid) + return 0; + + if ((kicked = (usbhid->ctrlhead != usbhid->ctrltail))) { + dbg("Kicking head %d tail %d", usbhid->ctrlhead, usbhid->ctrltail); + if (hid_submit_ctrl(hid)) { + clear_bit(HID_CTRL_RUNNING, &usbhid->iofl); + wake_up(&usbhid->wait); + } + } + return kicked; } /* @@ -190,12 +246,23 @@ static void hid_irq_in(struct urb *urb) switch (urb->status) { case 0: /* success */ + usbhid_mark_busy(usbhid); usbhid->retry_delay = 0; hid_input_report(urb->context, HID_INPUT_REPORT, urb->transfer_buffer, urb->actual_length, 1); + /* + * autosuspend refused while keys are pressed + * because most keyboards don't wake up when + * a key is released + */ + if (hid_check_keys_pressed(hid)) + set_bit(HID_KEYS_PRESSED, &usbhid->iofl); + else + clear_bit(HID_KEYS_PRESSED, &usbhid->iofl); break; case -EPIPE: /* stall */ + usbhid_mark_busy(usbhid); clear_bit(HID_IN_RUNNING, &usbhid->iofl); set_bit(HID_CLEAR_HALT, &usbhid->iofl); schedule_work(&usbhid->reset_work); @@ -209,6 +276,7 @@ static void hid_irq_in(struct urb *urb) case -EPROTO: /* protocol error or unplug */ case -ETIME: /* protocol error or unplug */ case -ETIMEDOUT: /* Should never happen, but... 
*/ + usbhid_mark_busy(usbhid); clear_bit(HID_IN_RUNNING, &usbhid->iofl); hid_io_error(hid); return; @@ -239,16 +307,25 @@ static int hid_submit_out(struct hid_device *hid) report = usbhid->out[usbhid->outtail].report; raw_report = usbhid->out[usbhid->outtail].raw_report; - usbhid->urbout->transfer_buffer_length = ((report->size - 1) >> 3) + 1 + (report->id > 0); - usbhid->urbout->dev = hid_to_usb_dev(hid); - memcpy(usbhid->outbuf, raw_report, usbhid->urbout->transfer_buffer_length); - kfree(raw_report); + if (!test_bit(HID_REPORTED_IDLE, &usbhid->iofl)) { + usbhid->urbout->transfer_buffer_length = ((report->size - 1) >> 3) + 1 + (report->id > 0); + usbhid->urbout->dev = hid_to_usb_dev(hid); + memcpy(usbhid->outbuf, raw_report, usbhid->urbout->transfer_buffer_length); + kfree(raw_report); - dbg_hid("submitting out urb\n"); + dbg_hid("submitting out urb\n"); - if (usb_submit_urb(usbhid->urbout, GFP_ATOMIC)) { - err_hid("usb_submit_urb(out) failed"); - return -1; + if (usb_submit_urb(usbhid->urbout, GFP_ATOMIC)) { + err_hid("usb_submit_urb(out) failed"); + return -1; + } + } else { + /* + * queue work to wake up the device. + * as the work queue is freezeable, this is safe + * with respect to STD and STR + */ + queue_work(resumption_waker, &usbhid->restart_work); } return 0; @@ -266,41 +343,50 @@ static int hid_submit_ctrl(struct hid_device *hid) raw_report = usbhid->ctrl[usbhid->ctrltail].raw_report; dir = usbhid->ctrl[usbhid->ctrltail].dir; - len = ((report->size - 1) >> 3) + 1 + (report->id > 0); - if (dir == USB_DIR_OUT) { - usbhid->urbctrl->pipe = usb_sndctrlpipe(hid_to_usb_dev(hid), 0); - usbhid->urbctrl->transfer_buffer_length = len; - memcpy(usbhid->ctrlbuf, raw_report, len); - kfree(raw_report); - } else { - int maxpacket, padlen; - - usbhid->urbctrl->pipe = usb_rcvctrlpipe(hid_to_usb_dev(hid), 0); - maxpacket = usb_maxpacket(hid_to_usb_dev(hid), usbhid->urbctrl->pipe, 0); - if (maxpacket > 0) { - padlen = DIV_ROUND_UP(len, maxpacket); - padlen *= maxpacket; - if (padlen > usbhid->bufsize) - padlen = usbhid->bufsize; - } else - padlen = 0; - usbhid->urbctrl->transfer_buffer_length = padlen; - } - usbhid->urbctrl->dev = hid_to_usb_dev(hid); + if (!test_bit(HID_REPORTED_IDLE, &usbhid->iofl)) { + len = ((report->size - 1) >> 3) + 1 + (report->id > 0); + if (dir == USB_DIR_OUT) { + usbhid->urbctrl->pipe = usb_sndctrlpipe(hid_to_usb_dev(hid), 0); + usbhid->urbctrl->transfer_buffer_length = len; + memcpy(usbhid->ctrlbuf, raw_report, len); + kfree(raw_report); + } else { + int maxpacket, padlen; + + usbhid->urbctrl->pipe = usb_rcvctrlpipe(hid_to_usb_dev(hid), 0); + maxpacket = usb_maxpacket(hid_to_usb_dev(hid), usbhid->urbctrl->pipe, 0); + if (maxpacket > 0) { + padlen = DIV_ROUND_UP(len, maxpacket); + padlen *= maxpacket; + if (padlen > usbhid->bufsize) + padlen = usbhid->bufsize; + } else + padlen = 0; + usbhid->urbctrl->transfer_buffer_length = padlen; + } + usbhid->urbctrl->dev = hid_to_usb_dev(hid); - usbhid->cr->bRequestType = USB_TYPE_CLASS | USB_RECIP_INTERFACE | dir; - usbhid->cr->bRequest = (dir == USB_DIR_OUT) ? HID_REQ_SET_REPORT : HID_REQ_GET_REPORT; - usbhid->cr->wValue = cpu_to_le16(((report->type + 1) << 8) | report->id); - usbhid->cr->wIndex = cpu_to_le16(usbhid->ifnum); - usbhid->cr->wLength = cpu_to_le16(len); + usbhid->cr->bRequestType = USB_TYPE_CLASS | USB_RECIP_INTERFACE | dir; + usbhid->cr->bRequest = (dir == USB_DIR_OUT) ? 
HID_REQ_SET_REPORT : HID_REQ_GET_REPORT; + usbhid->cr->wValue = cpu_to_le16(((report->type + 1) << 8) | report->id); + usbhid->cr->wIndex = cpu_to_le16(usbhid->ifnum); + usbhid->cr->wLength = cpu_to_le16(len); - dbg_hid("submitting ctrl urb: %s wValue=0x%04x wIndex=0x%04x wLength=%u\n", - usbhid->cr->bRequest == HID_REQ_SET_REPORT ? "Set_Report" : "Get_Report", - usbhid->cr->wValue, usbhid->cr->wIndex, usbhid->cr->wLength); + dbg_hid("submitting ctrl urb: %s wValue=0x%04x wIndex=0x%04x wLength=%u\n", + usbhid->cr->bRequest == HID_REQ_SET_REPORT ? "Set_Report" : "Get_Report", + usbhid->cr->wValue, usbhid->cr->wIndex, usbhid->cr->wLength); - if (usb_submit_urb(usbhid->urbctrl, GFP_ATOMIC)) { - err_hid("usb_submit_urb(ctrl) failed"); - return -1; + if (usb_submit_urb(usbhid->urbctrl, GFP_ATOMIC)) { + err_hid("usb_submit_urb(ctrl) failed"); + return -1; + } + } else { + /* + * queue work to wake up the device. + * as the work queue is freezeable, this is safe + * with respect to STD and STR + */ + queue_work(resumption_waker, &usbhid->restart_work); } return 0; @@ -332,7 +418,7 @@ static void hid_irq_out(struct urb *urb) "received\n", urb->status); } - spin_lock_irqsave(&usbhid->outlock, flags); + spin_lock_irqsave(&usbhid->lock, flags); if (unplug) usbhid->outtail = usbhid->outhead; @@ -344,12 +430,12 @@ static void hid_irq_out(struct urb *urb) clear_bit(HID_OUT_RUNNING, &usbhid->iofl); wake_up(&usbhid->wait); } - spin_unlock_irqrestore(&usbhid->outlock, flags); + spin_unlock_irqrestore(&usbhid->lock, flags); return; } clear_bit(HID_OUT_RUNNING, &usbhid->iofl); - spin_unlock_irqrestore(&usbhid->outlock, flags); + spin_unlock_irqrestore(&usbhid->lock, flags); wake_up(&usbhid->wait); } @@ -361,12 +447,11 @@ static void hid_ctrl(struct urb *urb) { struct hid_device *hid = urb->context; struct usbhid_device *usbhid = hid->driver_data; - unsigned long flags; - int unplug = 0; + int unplug = 0, status = urb->status; - spin_lock_irqsave(&usbhid->ctrllock, flags); + spin_lock(&usbhid->lock); - switch (urb->status) { + switch (status) { case 0: /* success */ if (usbhid->ctrl[usbhid->ctrltail].dir == USB_DIR_IN) hid_input_report(urb->context, @@ -383,7 +468,7 @@ static void hid_ctrl(struct urb *urb) break; default: /* error */ dev_warn(&urb->dev->dev, "ctrl urb status %d " - "received\n", urb->status); + "received\n", status); } if (unplug) @@ -396,19 +481,18 @@ static void hid_ctrl(struct urb *urb) clear_bit(HID_CTRL_RUNNING, &usbhid->iofl); wake_up(&usbhid->wait); } - spin_unlock_irqrestore(&usbhid->ctrllock, flags); + spin_unlock(&usbhid->lock); return; } clear_bit(HID_CTRL_RUNNING, &usbhid->iofl); - spin_unlock_irqrestore(&usbhid->ctrllock, flags); + spin_unlock(&usbhid->lock); wake_up(&usbhid->wait); } -void usbhid_submit_report(struct hid_device *hid, struct hid_report *report, unsigned char dir) +void __usbhid_submit_report(struct hid_device *hid, struct hid_report *report, unsigned char dir) { int head; - unsigned long flags; struct usbhid_device *usbhid = hid->driver_data; int len = ((report->size - 1) >> 3) + 1 + (report->id > 0); @@ -416,18 +500,13 @@ void usbhid_submit_report(struct hid_device *hid, struct hid_report *report, uns return; if (usbhid->urbout && dir == USB_DIR_OUT && report->type == HID_OUTPUT_REPORT) { - - spin_lock_irqsave(&usbhid->outlock, flags); - if ((head = (usbhid->outhead + 1) & (HID_OUTPUT_FIFO_SIZE - 1)) == usbhid->outtail) { - spin_unlock_irqrestore(&usbhid->outlock, flags); dev_warn(&hid->dev, "output queue full\n"); return; } 
usbhid->out[usbhid->outhead].raw_report = kmalloc(len, GFP_ATOMIC); if (!usbhid->out[usbhid->outhead].raw_report) { - spin_unlock_irqrestore(&usbhid->outlock, flags); dev_warn(&hid->dev, "output queueing failed\n"); return; } @@ -438,15 +517,10 @@ void usbhid_submit_report(struct hid_device *hid, struct hid_report *report, uns if (!test_and_set_bit(HID_OUT_RUNNING, &usbhid->iofl)) if (hid_submit_out(hid)) clear_bit(HID_OUT_RUNNING, &usbhid->iofl); - - spin_unlock_irqrestore(&usbhid->outlock, flags); return; } - spin_lock_irqsave(&usbhid->ctrllock, flags); - if ((head = (usbhid->ctrlhead + 1) & (HID_CONTROL_FIFO_SIZE - 1)) == usbhid->ctrltail) { - spin_unlock_irqrestore(&usbhid->ctrllock, flags); dev_warn(&hid->dev, "control queue full\n"); return; } @@ -454,7 +528,6 @@ void usbhid_submit_report(struct hid_device *hid, struct hid_report *report, uns if (dir == USB_DIR_OUT) { usbhid->ctrl[usbhid->ctrlhead].raw_report = kmalloc(len, GFP_ATOMIC); if (!usbhid->ctrl[usbhid->ctrlhead].raw_report) { - spin_unlock_irqrestore(&usbhid->ctrllock, flags); dev_warn(&hid->dev, "control queueing failed\n"); return; } @@ -467,15 +540,25 @@ void usbhid_submit_report(struct hid_device *hid, struct hid_report *report, uns if (!test_and_set_bit(HID_CTRL_RUNNING, &usbhid->iofl)) if (hid_submit_ctrl(hid)) clear_bit(HID_CTRL_RUNNING, &usbhid->iofl); +} - spin_unlock_irqrestore(&usbhid->ctrllock, flags); +void usbhid_submit_report(struct hid_device *hid, struct hid_report *report, unsigned char dir) +{ + struct usbhid_device *usbhid = hid->driver_data; + unsigned long flags; + + spin_lock_irqsave(&usbhid->lock, flags); + __usbhid_submit_report(hid, report, dir); + spin_unlock_irqrestore(&usbhid->lock, flags); } EXPORT_SYMBOL_GPL(usbhid_submit_report); static int usb_hidinput_input_event(struct input_dev *dev, unsigned int type, unsigned int code, int value) { struct hid_device *hid = input_get_drvdata(dev); + struct usbhid_device *usbhid = hid->driver_data; struct hid_field *field; + unsigned long flags; int offset; if (type == EV_FF) @@ -490,6 +573,15 @@ static int usb_hidinput_input_event(struct input_dev *dev, unsigned int type, un } hid_set_field(field, offset, value); + if (value) { + spin_lock_irqsave(&usbhid->lock, flags); + usbhid->ledcount++; + spin_unlock_irqrestore(&usbhid->lock, flags); + } else { + spin_lock_irqsave(&usbhid->lock, flags); + usbhid->ledcount--; + spin_unlock_irqrestore(&usbhid->lock, flags); + } usbhid_submit_report(hid, field->report, USB_DIR_OUT); return 0; @@ -538,15 +630,22 @@ int usbhid_open(struct hid_device *hid) struct usbhid_device *usbhid = hid->driver_data; int res; + mutex_lock(&hid_open_mut); if (!hid->open++) { res = usb_autopm_get_interface(usbhid->intf); + /* the device must be awake to reliable request remote wakeup */ if (res < 0) { hid->open--; + mutex_unlock(&hid_open_mut); return -EIO; } + usbhid->intf->needs_remote_wakeup = 1; + if (hid_start_in(hid)) + hid_io_error(hid); + + usb_autopm_put_interface(usbhid->intf); } - if (hid_start_in(hid)) - hid_io_error(hid); + mutex_unlock(&hid_open_mut); return 0; } @@ -554,10 +653,22 @@ void usbhid_close(struct hid_device *hid) { struct usbhid_device *usbhid = hid->driver_data; + mutex_lock(&hid_open_mut); + + /* protecting hid->open to make sure we don't restart + * data acquistion due to a resumption we no longer + * care about + */ + spin_lock_irq(&usbhid->lock); if (!--hid->open) { + spin_unlock_irq(&usbhid->lock); usb_kill_urb(usbhid->urbin); - usb_autopm_put_interface(usbhid->intf); + flush_scheduled_work(); + 
usbhid->intf->needs_remote_wakeup = 0; + } else { + spin_unlock_irq(&usbhid->lock); } + mutex_unlock(&hid_open_mut); } /* @@ -687,6 +798,25 @@ static int usbhid_output_raw_report(struct hid_device *hid, __u8 *buf, size_t co return ret; } +static void usbhid_restart_queues(struct usbhid_device *usbhid) +{ + if (usbhid->urbout) + usbhid_restart_out_queue(usbhid); + usbhid_restart_ctrl_queue(usbhid); +} + +static void __usbhid_restart_queues(struct work_struct *work) +{ + struct usbhid_device *usbhid = + container_of(work, struct usbhid_device, restart_work); + int r; + + r = usb_autopm_get_interface(usbhid->intf); + if (r < 0) + return; + usb_autopm_put_interface(usbhid->intf); +} + static void hid_free_buffers(struct usb_device *dev, struct hid_device *hid) { struct usbhid_device *usbhid = hid->driver_data; @@ -850,11 +980,11 @@ static int usbhid_start(struct hid_device *hid) init_waitqueue_head(&usbhid->wait); INIT_WORK(&usbhid->reset_work, hid_reset); + INIT_WORK(&usbhid->restart_work, __usbhid_restart_queues); setup_timer(&usbhid->io_retry, hid_retry_timeout, (unsigned long) hid); - spin_lock_init(&usbhid->inlock); - spin_lock_init(&usbhid->outlock); - spin_lock_init(&usbhid->ctrllock); + spin_lock_init(&usbhid->lock); + spin_lock_init(&usbhid->lock); usbhid->intf = intf; usbhid->ifnum = interface->desc.bInterfaceNumber; @@ -906,15 +1036,14 @@ static void usbhid_stop(struct hid_device *hid) return; clear_bit(HID_STARTED, &usbhid->iofl); - spin_lock_irq(&usbhid->inlock); /* Sync with error handler */ + spin_lock_irq(&usbhid->lock); /* Sync with error handler */ set_bit(HID_DISCONNECTED, &usbhid->iofl); - spin_unlock_irq(&usbhid->inlock); + spin_unlock_irq(&usbhid->lock); usb_kill_urb(usbhid->urbin); usb_kill_urb(usbhid->urbout); usb_kill_urb(usbhid->urbctrl); - del_timer_sync(&usbhid->io_retry); - cancel_work_sync(&usbhid->reset_work); + hid_cancel_delayed_stuff(usbhid); if (hid->claimed & HID_CLAIMED_INPUT) hidinput_disconnect(hid); @@ -935,12 +1064,28 @@ static void usbhid_stop(struct hid_device *hid) hid_free_buffers(hid_to_usb_dev(hid), hid); } +static int usbhid_power(struct hid_device *hid, int lvl) +{ + int r = 0; + + switch (lvl) { + case PM_HINT_FULLON: + r = usbhid_get_power(hid); + break; + case PM_HINT_NORMAL: + usbhid_put_power(hid); + break; + } + return r; +} + static struct hid_ll_driver usb_hid_driver = { .parse = usbhid_parse, .start = usbhid_start, .stop = usbhid_stop, .open = usbhid_open, .close = usbhid_close, + .power = usbhid_power, .hidinput_input_event = usb_hidinput_input_event, }; @@ -1049,19 +1194,75 @@ static void hid_disconnect(struct usb_interface *intf) kfree(usbhid); } +static void hid_cancel_delayed_stuff(struct usbhid_device *usbhid) +{ + del_timer_sync(&usbhid->io_retry); + cancel_work_sync(&usbhid->restart_work); + cancel_work_sync(&usbhid->reset_work); +} + +static void hid_cease_io(struct usbhid_device *usbhid) +{ + del_timer(&usbhid->io_retry); + usb_kill_urb(usbhid->urbin); + usb_kill_urb(usbhid->urbctrl); + usb_kill_urb(usbhid->urbout); + flush_scheduled_work(); +} + static int hid_suspend(struct usb_interface *intf, pm_message_t message) { - struct hid_device *hid = usb_get_intfdata (intf); + struct hid_device *hid = usb_get_intfdata(intf); struct usbhid_device *usbhid = hid->driver_data; + struct usb_device *udev = interface_to_usbdev(intf); + int status; - if (!test_bit(HID_STARTED, &usbhid->iofl)) - return 0; + if (udev->auto_pm) { + spin_lock_irq(&usbhid->lock); /* Sync with error handler */ + if (!test_bit(HID_RESET_PENDING, &usbhid->iofl) + 
&& !test_bit(HID_CLEAR_HALT, &usbhid->iofl) + && !test_bit(HID_OUT_RUNNING, &usbhid->iofl) + && !test_bit(HID_CTRL_RUNNING, &usbhid->iofl) + && !test_bit(HID_KEYS_PRESSED, &usbhid->iofl) + && (!usbhid->ledcount || ignoreled)) + { + set_bit(HID_REPORTED_IDLE, &usbhid->iofl); + spin_unlock_irq(&usbhid->lock); + } else { + usbhid_mark_busy(usbhid); + spin_unlock_irq(&usbhid->lock); + return -EBUSY; + } - spin_lock_irq(&usbhid->inlock); /* Sync with error handler */ - set_bit(HID_SUSPENDED, &usbhid->iofl); - spin_unlock_irq(&usbhid->inlock); - del_timer_sync(&usbhid->io_retry); - usb_kill_urb(usbhid->urbin); + } else { + spin_lock_irq(&usbhid->lock); + set_bit(HID_REPORTED_IDLE, &usbhid->iofl); + spin_unlock_irq(&usbhid->lock); + if (usbhid_wait_io(hid) < 0) + return -EIO; + } + + if (!ignoreled && udev->auto_pm) { + spin_lock_irq(&usbhid->lock); + if (test_bit(HID_LED_ON, &usbhid->iofl)) { + spin_unlock_irq(&usbhid->lock); + usbhid_mark_busy(usbhid); + return -EBUSY; + } + spin_unlock_irq(&usbhid->lock); + } + + hid_cancel_delayed_stuff(usbhid); + hid_cease_io(usbhid); + + if (udev->auto_pm && test_bit(HID_KEYS_PRESSED, &usbhid->iofl)) { + /* lost race against keypresses */ + status = hid_start_in(hid); + if (status < 0) + hid_io_error(hid); + usbhid_mark_busy(usbhid); + return -EBUSY; + } dev_dbg(&intf->dev, "suspend\n"); return 0; } @@ -1075,18 +1276,33 @@ static int hid_resume(struct usb_interface *intf) if (!test_bit(HID_STARTED, &usbhid->iofl)) return 0; - clear_bit(HID_SUSPENDED, &usbhid->iofl); + clear_bit(HID_REPORTED_IDLE, &usbhid->iofl); + usbhid_mark_busy(usbhid); + + if (test_bit(HID_CLEAR_HALT, &usbhid->iofl) || + test_bit(HID_RESET_PENDING, &usbhid->iofl)) + schedule_work(&usbhid->reset_work); usbhid->retry_delay = 0; status = hid_start_in(hid); + if (status < 0) + hid_io_error(hid); + usbhid_restart_queues(usbhid); + dev_dbg(&intf->dev, "resume status %d\n", status); - return status; + return 0; } /* Treat USB reset pretty much the same as suspend/resume */ static int hid_pre_reset(struct usb_interface *intf) { - /* FIXME: What if the interface is already suspended? */ - hid_suspend(intf, PMSG_ON); + struct hid_device *hid = usb_get_intfdata(intf); + struct usbhid_device *usbhid = hid->driver_data; + + spin_lock_irq(&usbhid->lock); + set_bit(HID_RESET_PENDING, &usbhid->iofl); + spin_unlock_irq(&usbhid->lock); + hid_cease_io(usbhid); + return 0; } @@ -1094,11 +1310,35 @@ static int hid_pre_reset(struct usb_interface *intf) static int hid_post_reset(struct usb_interface *intf) { struct usb_device *dev = interface_to_usbdev (intf); - + struct hid_device *hid = usb_get_intfdata(intf); + struct usbhid_device *usbhid = hid->driver_data; + int status; + + spin_lock_irq(&usbhid->lock); + clear_bit(HID_RESET_PENDING, &usbhid->iofl); + spin_unlock_irq(&usbhid->lock); hid_set_idle(dev, intf->cur_altsetting->desc.bInterfaceNumber, 0, 0); /* FIXME: Any more reinitialization needed? 
*/ + status = hid_start_in(hid); + if (status < 0) + hid_io_error(hid); + usbhid_restart_queues(usbhid); - return hid_resume(intf); + return 0; +} + +int usbhid_get_power(struct hid_device *hid) +{ + struct usbhid_device *usbhid = hid->driver_data; + + return usb_autopm_get_interface(usbhid->intf); +} + +void usbhid_put_power(struct hid_device *hid) +{ + struct usbhid_device *usbhid = hid->driver_data; + + usb_autopm_put_interface(usbhid->intf); } static struct usb_device_id hid_usb_ids [] = { @@ -1134,7 +1374,11 @@ static struct hid_driver hid_usb_driver = { static int __init hid_init(void) { - int retval; + int retval = -ENOMEM; + + resumption_waker = create_freezeable_workqueue("usbhid_resumer"); + if (!resumption_waker) + goto no_queue; retval = hid_register_driver(&hid_usb_driver); if (retval) goto hid_register_fail; @@ -1158,6 +1402,8 @@ hiddev_init_fail: usbhid_quirks_init_fail: hid_unregister_driver(&hid_usb_driver); hid_register_fail: + destroy_workqueue(resumption_waker); +no_queue: return retval; } @@ -1167,6 +1413,7 @@ static void __exit hid_exit(void) hiddev_exit(); usbhid_quirks_exit(); hid_unregister_driver(&hid_usb_driver); + destroy_workqueue(resumption_waker); } module_init(hid_init); diff --git a/drivers/hid/usbhid/hiddev.c b/drivers/hid/usbhid/hiddev.c index 1f5b5d4c3c34..fd7375627e5d 100644 --- a/drivers/hid/usbhid/hiddev.c +++ b/drivers/hid/usbhid/hiddev.c @@ -249,10 +249,12 @@ static int hiddev_release(struct inode * inode, struct file * file) spin_unlock_irqrestore(&list->hiddev->list_lock, flags); if (!--list->hiddev->open) { - if (list->hiddev->exist) + if (list->hiddev->exist) { usbhid_close(list->hiddev->hid); - else + usbhid_put_power(list->hiddev->hid); + } else { kfree(list->hiddev); + } } kfree(list); @@ -303,6 +305,17 @@ static int hiddev_open(struct inode *inode, struct file *file) list_add_tail(&list->node, &hiddev_table[i]->list); spin_unlock_irq(&list->hiddev->list_lock); + if (!list->hiddev->open++) + if (list->hiddev->exist) { + struct hid_device *hid = hiddev_table[i]->hid; + res = usbhid_get_power(hid); + if (res < 0) { + res = -EIO; + goto bail; + } + usbhid_open(hid); + } + return 0; bail: file->private_data = NULL; diff --git a/drivers/hid/usbhid/usbhid.h b/drivers/hid/usbhid/usbhid.h index 9eb30564be9c..08f505ca2e3d 100644 --- a/drivers/hid/usbhid/usbhid.h +++ b/drivers/hid/usbhid/usbhid.h @@ -38,7 +38,10 @@ int usbhid_wait_io(struct hid_device* hid); void usbhid_close(struct hid_device *hid); int usbhid_open(struct hid_device *hid); void usbhid_init_reports(struct hid_device *hid); -void usbhid_submit_report(struct hid_device *hid, struct hid_report *report, unsigned char dir); +void usbhid_submit_report +(struct hid_device *hid, struct hid_report *report, unsigned char dir); +int usbhid_get_power(struct hid_device *hid); +void usbhid_put_power(struct hid_device *hid); /* iofl flags */ #define HID_CTRL_RUNNING 1 @@ -49,6 +52,9 @@ void usbhid_submit_report(struct hid_device *hid, struct hid_report *report, uns #define HID_CLEAR_HALT 6 #define HID_DISCONNECTED 7 #define HID_STARTED 8 +#define HID_REPORTED_IDLE 9 +#define HID_KEYS_PRESSED 10 +#define HID_LED_ON 11 /* * USB-specific HID struct, to be pointed to @@ -66,7 +72,6 @@ struct usbhid_device { struct urb *urbin; /* Input URB */ char *inbuf; /* Input buffer */ dma_addr_t inbuf_dma; /* Input buffer dma */ - spinlock_t inlock; /* Input fifo spinlock */ struct urb *urbctrl; /* Control URB */ struct usb_ctrlrequest *cr; /* Control request struct */ @@ -75,21 +80,22 @@ struct usbhid_device { 
unsigned char ctrlhead, ctrltail; /* Control fifo head & tail */ char *ctrlbuf; /* Control buffer */ dma_addr_t ctrlbuf_dma; /* Control buffer dma */ - spinlock_t ctrllock; /* Control fifo spinlock */ struct urb *urbout; /* Output URB */ struct hid_output_fifo out[HID_CONTROL_FIFO_SIZE]; /* Output pipe fifo */ unsigned char outhead, outtail; /* Output pipe fifo head & tail */ char *outbuf; /* Output buffer */ dma_addr_t outbuf_dma; /* Output buffer dma */ - spinlock_t outlock; /* Output fifo spinlock */ + spinlock_t lock; /* fifo spinlock */ unsigned long iofl; /* I/O flags (CTRL_RUNNING, OUT_RUNNING) */ struct timer_list io_retry; /* Retry timer */ unsigned long stop_retry; /* Time to give up, in jiffies */ unsigned int retry_delay; /* Delay length in ms */ struct work_struct reset_work; /* Task context for resets */ + struct work_struct restart_work; /* waking up for output to be done in a task */ wait_queue_head_t wait; /* For sleeping */ + int ledcount; /* counting the number of active leds */ }; #define hid_to_usb_dev(hid_dev) \ diff --git a/include/linux/hid.h b/include/linux/hid.h index fa8ee9cef7be..6ac7795a8acc 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -603,12 +603,17 @@ struct hid_ll_driver { int (*open)(struct hid_device *hdev); void (*close)(struct hid_device *hdev); + int (*power)(struct hid_device *hdev, int level); + int (*hidinput_input_event) (struct input_dev *idev, unsigned int type, unsigned int code, int value); int (*parse)(struct hid_device *hdev); }; +#define PM_HINT_FULLON 1<<5 +#define PM_HINT_NORMAL 1<<1 + /* Applications from HID Usage Tables 4/8/99 Version 1.1 */ /* We ignore a few input applications that are not widely used */ #define IS_INPUT_APPLICATION(a) (((a >= 0x00010000) && (a <= 0x00010008)) || (a == 0x00010080) || (a == 0x000c0001) || (a == 0x000d0002)) @@ -641,6 +646,7 @@ int hidinput_find_field(struct hid_device *hid, unsigned int type, unsigned int void hid_output_report(struct hid_report *report, __u8 *data); struct hid_device *hid_allocate_device(void); int hid_parse_report(struct hid_device *hid, __u8 *start, unsigned size); +int hid_check_keys_pressed(struct hid_device *hid); int hid_connect(struct hid_device *hid, unsigned int connect_mask); /** -- cgit v1.2.3 From 7058548cd50e5bda8db086bb2e5c1d82f746d047 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 25 Mar 2009 23:35:46 -0400 Subject: ext4: Use WRITE_SYNC for commits which are caused by fsync() If a commit is triggered by fsync(), set a flag indicating the journal blocks associated with the transaction should be flushed out using WRITE_SYNC. 
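A minimal sketch of how a caller reaches the new WRITE_SYNC path (editorial illustration, not from the patch itself: the jbd2 calls and the h_sync flag are real, example_fsync_metadata() is a made-up stand-in for a filesystem's fsync() implementation):

#include <linux/err.h>
#include <linux/jbd2.h>

static int example_fsync_metadata(journal_t *journal)
{
	handle_t *handle;

	handle = jbd2_journal_start(journal, 1);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	/* ... get write access and dirty some metadata buffers ... */

	/* fsync() semantics: ask for a synchronous commit */
	handle->h_sync = 1;

	/* sets t_synchronous_commit, so the commit code picks WRITE_SYNC */
	return jbd2_journal_stop(handle);
}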
Signed-off-by: "Theodore Ts'o" --- fs/jbd2/commit.c | 5 ++++- fs/jbd2/transaction.c | 2 ++ include/linux/jbd2.h | 6 ++++++ 3 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 62804e57a44c..4ea72377c7a2 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -367,6 +367,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) int tag_bytes = journal_tag_bytes(journal); struct buffer_head *cbh = NULL; /* For transactional checksums */ __u32 crc32_sum = ~0; + int write_op = WRITE; /* * First job: lock down the current transaction and wait for @@ -401,6 +402,8 @@ void jbd2_journal_commit_transaction(journal_t *journal) spin_lock(&journal->j_state_lock); commit_transaction->t_state = T_LOCKED; + if (commit_transaction->t_synchronous_commit) + write_op = WRITE_SYNC; stats.u.run.rs_wait = commit_transaction->t_max_wait; stats.u.run.rs_locked = jiffies; stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start, @@ -680,7 +683,7 @@ start_journal_io: clear_buffer_dirty(bh); set_buffer_uptodate(bh); bh->b_end_io = journal_end_buffer_io_sync; - submit_bh(WRITE, bh); + submit_bh(write_op, bh); } cond_resched(); stats.u.run.rs_blocks_logged += bufs; diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 28ce21d8598e..996ffda06bf3 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1315,6 +1315,8 @@ int jbd2_journal_stop(handle_t *handle) } } + if (handle->h_sync) + transaction->t_synchronous_commit = 1; current->journal_info = NULL; spin_lock(&journal->j_state_lock); spin_lock(&transaction->t_handle_lock); diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 4d248b3f1323..8815a3456b3b 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -648,6 +648,12 @@ struct transaction_s */ int t_handle_count; + /* + * This transaction is being forced and some process is + * waiting for it to finish. + */ + int t_synchronous_commit:1; + /* * For use by the filesystem to store fs-specific data * structures associated with the transaction -- cgit v1.2.3 From 0f571515c332e00b3515dbe0859ceaa30ab66e00 Mon Sep 17 00:00:00 2001 From: Atsushi Nemoto Date: Fri, 6 Mar 2009 20:07:14 +0900 Subject: dmaengine: Add privatecnt to revert DMA_PRIVATE property Currently dma_request_channel() set DMA_PRIVATE capability but never clear it. So if a public channel was once grabbed by dma_request_channel(), the device stay PRIVATE forever. Add privatecnt member to dma_device to correctly revert it. 
[lg@denx.de: fix bad usage of 'chan' in dma_async_device_register] Signed-off-by: Atsushi Nemoto Acked-by: Maciej Sosnowski Signed-off-by: Dan Williams --- drivers/dma/dmaengine.c | 8 ++++++++ include/linux/dmaengine.h | 9 +++++++++ 2 files changed, 17 insertions(+) (limited to 'include/linux') diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index a41d1ea10fa3..92438e9dacc3 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -507,6 +507,7 @@ struct dma_chan *__dma_request_channel(dma_cap_mask_t *mask, dma_filter_fn fn, v * published in the general-purpose allocator */ dma_cap_set(DMA_PRIVATE, device->cap_mask); + device->privatecnt++; err = dma_chan_get(chan); if (err == -ENODEV) { @@ -518,6 +519,8 @@ struct dma_chan *__dma_request_channel(dma_cap_mask_t *mask, dma_filter_fn fn, v dma_chan_name(chan), err); else break; + if (--device->privatecnt == 0) + dma_cap_clear(DMA_PRIVATE, device->cap_mask); chan->private = NULL; chan = NULL; } @@ -537,6 +540,9 @@ void dma_release_channel(struct dma_chan *chan) WARN_ONCE(chan->client_count != 1, "chan reference count %d != 1\n", chan->client_count); dma_chan_put(chan); + /* drop PRIVATE cap enabled by __dma_request_channel() */ + if (--chan->device->privatecnt == 0) + dma_cap_clear(DMA_PRIVATE, chan->device->cap_mask); chan->private = NULL; mutex_unlock(&dma_list_mutex); } @@ -719,6 +725,8 @@ int dma_async_device_register(struct dma_device *device) } } list_add_tail_rcu(&device->global_node, &dma_device_list); + if (dma_has_cap(DMA_PRIVATE, device->cap_mask)) + device->privatecnt++; /* Always private */ dma_channel_rebalance(); mutex_unlock(&dma_list_mutex); diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 2afc2c95e42d..2e2aa3df170c 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -202,6 +202,7 @@ struct dma_async_tx_descriptor { /** * struct dma_device - info on the entity supplying DMA services * @chancnt: how many DMA channels are supported + * @privatecnt: how many DMA channels are requested by dma_request_channel * @channels: the list of struct dma_chan * @global_node: list_head for global dma_device_list * @cap_mask: one or more dma_capability flags @@ -224,6 +225,7 @@ struct dma_async_tx_descriptor { struct dma_device { unsigned int chancnt; + unsigned int privatecnt; struct list_head channels; struct list_head global_node; dma_cap_mask_t cap_mask; @@ -352,6 +354,13 @@ __dma_cap_set(enum dma_transaction_type tx_type, dma_cap_mask_t *dstp) set_bit(tx_type, dstp->bits); } +#define dma_cap_clear(tx, mask) __dma_cap_clear((tx), &(mask)) +static inline void +__dma_cap_clear(enum dma_transaction_type tx_type, dma_cap_mask_t *dstp) +{ + clear_bit(tx_type, dstp->bits); +} + #define dma_cap_zero(mask) __dma_cap_zero(&(mask)) static inline void __dma_cap_zero(dma_cap_mask_t *dstp) { -- cgit v1.2.3 From 898585172fa729513d8636257b44bd1cfd279096 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Mon, 16 Feb 2009 02:55:47 +0800 Subject: PCI: save and restore PCIe 2.0 registers PCIe 2.0 defines several new registers (Device Control 2, Link Control 2, and Slot Control 2). Save and retore them in pci_save_pcie_state() and pci_restore_pcie_state(). 
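The extra registers are captured as a side effect of the usual save/restore calls; a hedged sketch of a driver's legacy suspend/resume hooks (the pci_* calls are real, example_suspend()/example_resume() are invented callback names):

#include <linux/pci.h>

static int example_suspend(struct pci_dev *pdev, pm_message_t state)
{
	pci_save_state(pdev);	/* now also captures DEVCTL2/LNKCTL2/SLTCTL2 */
	pci_disable_device(pdev);
	pci_set_power_state(pdev, pci_choose_state(pdev, state));
	return 0;
}

static int example_resume(struct pci_dev *pdev)
{
	pci_set_power_state(pdev, PCI_D0);
	pci_restore_state(pdev);	/* writes the saved registers back */
	return pci_enable_device(pdev);
}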
Signed-off-by: Yu Zhao Signed-off-by: Jesse Barnes --- drivers/pci/pci.c | 11 ++++++++++- include/linux/pci_regs.h | 2 ++ 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 676bbcbc272b..59569b8cf1d5 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -647,6 +647,8 @@ pci_power_t pci_choose_state(struct pci_dev *dev, pm_message_t state) EXPORT_SYMBOL(pci_choose_state); +#define PCI_EXP_SAVE_REGS 7 + static int pci_save_pcie_state(struct pci_dev *dev) { int pos, i = 0; @@ -668,6 +670,9 @@ static int pci_save_pcie_state(struct pci_dev *dev) pci_read_config_word(dev, pos + PCI_EXP_LNKCTL, &cap[i++]); pci_read_config_word(dev, pos + PCI_EXP_SLTCTL, &cap[i++]); pci_read_config_word(dev, pos + PCI_EXP_RTCTL, &cap[i++]); + pci_read_config_word(dev, pos + PCI_EXP_DEVCTL2, &cap[i++]); + pci_read_config_word(dev, pos + PCI_EXP_LNKCTL2, &cap[i++]); + pci_read_config_word(dev, pos + PCI_EXP_SLTCTL2, &cap[i++]); return 0; } @@ -688,6 +693,9 @@ static void pci_restore_pcie_state(struct pci_dev *dev) pci_write_config_word(dev, pos + PCI_EXP_LNKCTL, cap[i++]); pci_write_config_word(dev, pos + PCI_EXP_SLTCTL, cap[i++]); pci_write_config_word(dev, pos + PCI_EXP_RTCTL, cap[i++]); + pci_write_config_word(dev, pos + PCI_EXP_DEVCTL2, cap[i++]); + pci_write_config_word(dev, pos + PCI_EXP_LNKCTL2, cap[i++]); + pci_write_config_word(dev, pos + PCI_EXP_SLTCTL2, cap[i++]); } @@ -1372,7 +1380,8 @@ void pci_allocate_cap_save_buffers(struct pci_dev *dev) { int error; - error = pci_add_cap_save_buffer(dev, PCI_CAP_ID_EXP, 4 * sizeof(u16)); + error = pci_add_cap_save_buffer(dev, PCI_CAP_ID_EXP, + PCI_EXP_SAVE_REGS * sizeof(u16)); if (error) dev_err(&dev->dev, "unable to preallocate PCI Express save buffer\n"); diff --git a/include/linux/pci_regs.h b/include/linux/pci_regs.h index d4e663877f45..e4d08c1b2e0b 100644 --- a/include/linux/pci_regs.h +++ b/include/linux/pci_regs.h @@ -488,6 +488,8 @@ #define PCI_EXP_DEVCAP2_ARI 0x20 /* Alternative Routing-ID */ #define PCI_EXP_DEVCTL2 40 /* Device Control 2 */ #define PCI_EXP_DEVCTL2_ARI 0x20 /* Alternative Routing-ID */ +#define PCI_EXP_LNKCTL2 48 /* Link Control 2 */ +#define PCI_EXP_SLTCTL2 56 /* Slot Control 2 */ /* Extended Capabilities (PCI-X 2.0 and Express) */ #define PCI_EXT_CAP_ID(header) (header & 0x0000ffff) -- cgit v1.2.3 From 3341323bb4c198f704cffbfdda37bcec1226ef7d Mon Sep 17 00:00:00 2001 From: Alexander Clouter Date: Fri, 27 Mar 2009 12:59:54 +0800 Subject: hwrng: timeriomem - Use phys address rather than virt There is no ioremap'ing or anything in timeriomem-rng.c as I foolishly used already remapped virtual addresses instead of passing the physical address to be polled. This patch fixes this flaw and lets developers do the Right Thing(tm). 
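A hedged sketch of the board-side registration this change expects, handing the driver a physical MEM resource instead of a pre-mapped pointer (the device name, address and period below are assumptions made for the example, not taken from the patch):

#include <linux/ioport.h>
#include <linux/platform_device.h>
#include <linux/timeriomem-rng.h>

static struct timeriomem_rng_data example_rng_data = {
	.period	= 1000,			/* microseconds between reads */
};

static struct resource example_rng_resource = {
	.start	= 0x10004000,		/* physical address of the entropy register */
	.end	= 0x10004003,
	.flags	= IORESOURCE_MEM,
};

static struct platform_device example_rng_device = {
	.name		= "timeriomem_rng",	/* assumed to match the platform driver */
	.id		= -1,
	.resource	= &example_rng_resource,
	.num_resources	= 1,
	.dev		= {
		.platform_data	= &example_rng_data,
	},
};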
Signed-off-by: Alexander Clouter Signed-off-by: Herbert Xu --- drivers/char/hw_random/timeriomem-rng.c | 39 +++++++++++++++++++++++++++++---- include/linux/timeriomem-rng.h | 2 +- 2 files changed, 36 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/hw_random/timeriomem-rng.c b/drivers/char/hw_random/timeriomem-rng.c index 10ad41be5897..dcd352ad0e7f 100644 --- a/drivers/char/hw_random/timeriomem-rng.c +++ b/drivers/char/hw_random/timeriomem-rng.c @@ -90,10 +90,30 @@ static struct hwrng timeriomem_rng_ops = { static int __init timeriomem_rng_probe(struct platform_device *pdev) { + struct resource *res, *mem; int ret; + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + + if (!res) + return -ENOENT; + + mem = request_mem_region(res->start, res->end - res->start + 1, + pdev->name); + if (mem == NULL) + return -EBUSY; + + dev_set_drvdata(&pdev->dev, mem); + timeriomem_rng_data = pdev->dev.platform_data; + timeriomem_rng_data->address = ioremap(res->start, + res->end - res->start + 1); + if (!timeriomem_rng_data->address) { + ret = -ENOMEM; + goto err_ioremap; + } + if (timeriomem_rng_data->period != 0 && usecs_to_jiffies(timeriomem_rng_data->period) > 0) { timeriomem_rng_timer.expires = jiffies; @@ -104,23 +124,34 @@ static int __init timeriomem_rng_probe(struct platform_device *pdev) timeriomem_rng_data->present = 1; ret = hwrng_register(&timeriomem_rng_ops); - if (ret) { - dev_err(&pdev->dev, "problem registering\n"); - return ret; - } + if (ret) + goto err_register; dev_info(&pdev->dev, "32bits from 0x%p @ %dus\n", timeriomem_rng_data->address, timeriomem_rng_data->period); return 0; + +err_register: + dev_err(&pdev->dev, "problem registering\n"); + iounmap(timeriomem_rng_data->address); +err_ioremap: + release_resource(mem); + + return ret; } static int __devexit timeriomem_rng_remove(struct platform_device *pdev) { + struct resource *mem = dev_get_drvdata(&pdev->dev); + del_timer_sync(&timeriomem_rng_timer); hwrng_unregister(&timeriomem_rng_ops); + iounmap(timeriomem_rng_data->address); + release_resource(mem); + return 0; } diff --git a/include/linux/timeriomem-rng.h b/include/linux/timeriomem-rng.h index dd253177f65f..3e08a1c86830 100644 --- a/include/linux/timeriomem-rng.h +++ b/include/linux/timeriomem-rng.h @@ -14,7 +14,7 @@ struct timeriomem_rng_data { struct completion completion; unsigned int present:1; - u32 __iomem *address; + void __iomem *address; /* measures in usecs */ unsigned int period; -- cgit v1.2.3 From 9b57896e62bfa752ee7435e6cfe57fb210c0db8c Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Thu, 26 Mar 2009 21:58:14 +0900 Subject: sony-laptop: Add support for extra keyboard events The current sony-laptop code assumes that the keyboard event method is always located at slot 2 in the platform code. Remove this assumption and add support for some additional hotkeys. 
Signed-off-by: Matthew Garrett Signed-off-by: Mattia Dongili Signed-off-by: Len Brown --- drivers/platform/x86/sony-laptop.c | 59 +++++++++++++++++++++----------------- include/linux/sonypi.h | 1 + 2 files changed, 33 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/drivers/platform/x86/sony-laptop.c b/drivers/platform/x86/sony-laptop.c index 04deed826180..f6cdc8929fd7 100644 --- a/drivers/platform/x86/sony-laptop.c +++ b/drivers/platform/x86/sony-laptop.c @@ -211,6 +211,7 @@ static int sony_laptop_input_index[] = { 48, /* 61 SONYPI_EVENT_WIRELESS_OFF */ 49, /* 62 SONYPI_EVENT_ZOOM_IN_PRESSED */ 50, /* 63 SONYPI_EVENT_ZOOM_OUT_PRESSED */ + 51, /* 64 SONYPI_EVENT_CD_EJECT_PRESSED */ }; static int sony_laptop_input_keycode_map[] = { @@ -264,7 +265,8 @@ static int sony_laptop_input_keycode_map[] = { KEY_WLAN, /* 47 SONYPI_EVENT_WIRELESS_ON */ KEY_WLAN, /* 48 SONYPI_EVENT_WIRELESS_OFF */ KEY_ZOOMIN, /* 49 SONYPI_EVENT_ZOOM_IN_PRESSED */ - KEY_ZOOMOUT /* 50 SONYPI_EVENT_ZOOM_OUT_PRESSED */ + KEY_ZOOMOUT, /* 50 SONYPI_EVENT_ZOOM_OUT_PRESSED */ + KEY_EJECTCD /* 51 SONYPI_EVENT_CD_EJECT_PRESSED */ }; /* release buttons after a short delay if pressed */ @@ -834,7 +836,11 @@ struct sony_nc_event { u8 event; }; -static struct sony_nc_event sony_C_events[] = { +static struct sony_nc_event sony_nc_events[] = { + { 0x90, SONYPI_EVENT_PKEY_P1 }, + { 0x10, SONYPI_EVENT_ANYBUTTON_RELEASED }, + { 0x91, SONYPI_EVENT_PKEY_P1 }, + { 0x11, SONYPI_EVENT_ANYBUTTON_RELEASED }, { 0x81, SONYPI_EVENT_FNKEY_F1 }, { 0x01, SONYPI_EVENT_FNKEY_RELEASED }, { 0x85, SONYPI_EVENT_FNKEY_F5 }, @@ -843,10 +849,14 @@ static struct sony_nc_event sony_C_events[] = { { 0x06, SONYPI_EVENT_FNKEY_RELEASED }, { 0x87, SONYPI_EVENT_FNKEY_F7 }, { 0x07, SONYPI_EVENT_FNKEY_RELEASED }, + { 0x89, SONYPI_EVENT_FNKEY_F9 }, + { 0x09, SONYPI_EVENT_FNKEY_RELEASED }, { 0x8A, SONYPI_EVENT_FNKEY_F10 }, { 0x0A, SONYPI_EVENT_FNKEY_RELEASED }, { 0x8C, SONYPI_EVENT_FNKEY_F12 }, { 0x0C, SONYPI_EVENT_FNKEY_RELEASED }, + { 0x9f, SONYPI_EVENT_CD_EJECT_PRESSED }, + { 0x1f, SONYPI_EVENT_ANYBUTTON_RELEASED }, { 0, 0 }, }; @@ -855,38 +865,33 @@ static struct sony_nc_event sony_C_events[] = { */ static void sony_acpi_notify(acpi_handle handle, u32 event, void *data) { - int i; u32 ev = event; int result; - if (ev == 0x92 || ev == 0x90) { + if (ev >= 0x90) { + /* New-style event */ int origev = ev; - /* read the key pressed from EC.GECR - * A call to SN07 with 0x0202 will do it as well respecting - * the current protocol on different OSes - * - * Note: the path for GECR may be - * \_SB.PCI0.LPCB.EC (C, FE, AR, N and friends) - * \_SB.PCI0.PIB.EC0 (VGN-FR notifications are sent directly, no GECR) - * - * TODO: we may want to do the same for the older GHKE -need - * dmi list- so this snippet may become one more callback. 
- */ - if (sony_call_snc_handle(0x100, 0x200, &result)) - dprintk("sony_acpi_notify, unable to decode event 0x%.2x\n", ev); - else - ev = result & 0xFF; + ev -= 0x90; - for (i = 0; sony_C_events[i].data; i++) { - if (sony_C_events[i].data == ev) { - ev = sony_C_events[i].event; - break; + if (sony_find_snc_handle(0x100) == ev) { + int i; + + if (sony_call_snc_handle(0x100, 0x200, &result)) + dprintk("sony_acpi_notify, unable to decode event 0x%.2x\n", ev); + else + ev = result & 0xFF; + + for (i = 0; sony_nc_events[i].event; i++) { + if (sony_nc_events[i].data == ev) { + ev = sony_nc_events[i].event; + break; + } } - } - if (!sony_C_events[i].data) - printk(KERN_INFO DRV_PFX "Unknown event: %x %x\n", - origev, ev); + if (!sony_nc_events[i].data) + printk(KERN_INFO DRV_PFX + "Unknown event: %x %x\n", origev, ev); + } } dprintk("sony_acpi_notify, event: 0x%.2x\n", ev); diff --git a/include/linux/sonypi.h b/include/linux/sonypi.h index f41ffd7c2dd9..8458dbe95862 100644 --- a/include/linux/sonypi.h +++ b/include/linux/sonypi.h @@ -103,6 +103,7 @@ #define SONYPI_EVENT_WIRELESS_OFF 61 #define SONYPI_EVENT_ZOOM_IN_PRESSED 62 #define SONYPI_EVENT_ZOOM_OUT_PRESSED 63 +#define SONYPI_EVENT_CD_EJECT_PRESSED 64 /* get/set brightness */ #define SONYPI_IOCGBRT _IOR('v', 0, __u8) -- cgit v1.2.3 From 45c7942ba8f6b7d5d1147c10f84f0cbf5fa3a2b8 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Thu, 26 Mar 2009 21:58:16 +0900 Subject: sony-laptop: Add support for extended hotkeys Recent Sony SR-series machines have an additional set of buttons accessed via the 0x127 method rather than the 0x100 method. Add support for these. Signed-off-by: Matthew Garrett Signed-off-by: Mattia Dongili Signed-off-by: Len Brown --- drivers/platform/x86/sony-laptop.c | 65 +++++++++++++++++++++++++++++++------- include/linux/sonypi.h | 4 +++ 2 files changed, 57 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/platform/x86/sony-laptop.c b/drivers/platform/x86/sony-laptop.c index f458870c30b6..e000c9f6cdf5 100644 --- a/drivers/platform/x86/sony-laptop.c +++ b/drivers/platform/x86/sony-laptop.c @@ -226,6 +226,10 @@ static int sony_laptop_input_index[] = { 49, /* 62 SONYPI_EVENT_ZOOM_IN_PRESSED */ 50, /* 63 SONYPI_EVENT_ZOOM_OUT_PRESSED */ 51, /* 64 SONYPI_EVENT_CD_EJECT_PRESSED */ + 52, /* 65 SONYPI_EVENT_MODEKEY_PRESSED */ + 53, /* 66 SONYPI_EVENT_PKEY_P4 */ + 54, /* 67 SONYPI_EVENT_PKEY_P5 */ + 55, /* 68 SONYPI_EVENT_SETTINGKEY_PRESSED */ }; static int sony_laptop_input_keycode_map[] = { @@ -280,7 +284,11 @@ static int sony_laptop_input_keycode_map[] = { KEY_WLAN, /* 48 SONYPI_EVENT_WIRELESS_OFF */ KEY_ZOOMIN, /* 49 SONYPI_EVENT_ZOOM_IN_PRESSED */ KEY_ZOOMOUT, /* 50 SONYPI_EVENT_ZOOM_OUT_PRESSED */ - KEY_EJECTCD /* 51 SONYPI_EVENT_CD_EJECT_PRESSED */ + KEY_EJECTCD, /* 51 SONYPI_EVENT_CD_EJECT_PRESSED */ + KEY_F13, /* 52 SONYPI_EVENT_MODEKEY_PRESSED */ + KEY_PROG4, /* 53 SONYPI_EVENT_PKEY_P4 */ + KEY_F14, /* 54 SONYPI_EVENT_PKEY_P5 */ + KEY_F15, /* 55 SONYPI_EVENT_SETTINGKEY_PRESSED */ }; /* release buttons after a short delay if pressed */ @@ -850,7 +858,7 @@ struct sony_nc_event { u8 event; }; -static struct sony_nc_event sony_nc_events[] = { +static struct sony_nc_event sony_100_events[] = { { 0x90, SONYPI_EVENT_PKEY_P1 }, { 0x10, SONYPI_EVENT_ANYBUTTON_RELEASED }, { 0x91, SONYPI_EVENT_PKEY_P1 }, @@ -874,6 +882,25 @@ static struct sony_nc_event sony_nc_events[] = { { 0, 0 }, }; +static struct sony_nc_event sony_127_events[] = { + { 0x81, SONYPI_EVENT_MODEKEY_PRESSED }, + { 0x01, 
SONYPI_EVENT_ANYBUTTON_RELEASED }, + { 0x82, SONYPI_EVENT_PKEY_P1 }, + { 0x02, SONYPI_EVENT_ANYBUTTON_RELEASED }, + { 0x83, SONYPI_EVENT_PKEY_P2 }, + { 0x03, SONYPI_EVENT_ANYBUTTON_RELEASED }, + { 0x84, SONYPI_EVENT_PKEY_P3 }, + { 0x04, SONYPI_EVENT_ANYBUTTON_RELEASED }, + { 0x85, SONYPI_EVENT_PKEY_P4 }, + { 0x05, SONYPI_EVENT_ANYBUTTON_RELEASED }, + { 0x86, SONYPI_EVENT_PKEY_P5 }, + { 0x06, SONYPI_EVENT_ANYBUTTON_RELEASED }, + { 0x06, SONYPI_EVENT_ANYBUTTON_RELEASED }, + { 0x87, SONYPI_EVENT_SETTINGKEY_PRESSED }, + { 0x07, SONYPI_EVENT_ANYBUTTON_RELEASED }, + { 0, 0 }, +}; + /* * ACPI callbacks */ @@ -884,27 +911,41 @@ static void sony_acpi_notify(acpi_handle handle, u32 event, void *data) if (ev >= 0x90) { /* New-style event */ - int origev = ev; + int key_handle = 0; ev -= 0x90; - if (sony_find_snc_handle(0x100) == ev) { - int i; + if (sony_find_snc_handle(0x100) == ev) + key_handle = 0x100; + if (sony_find_snc_handle(0x127) == ev) + key_handle = 0x127; + + if (handle) { + struct sony_nc_event *key_event; - if (sony_call_snc_handle(0x100, 0x200, &result)) - dprintk("sony_acpi_notify, unable to decode event 0x%.2x\n", ev); + if (sony_call_snc_handle(key_handle, 0x200, &result)) + dprintk("sony_acpi_notify, unable to decode" + " event 0x%.2x 0x%.2x\n", key_handle, + ev); else ev = result & 0xFF; - for (i = 0; sony_nc_events[i].event; i++) { - if (sony_nc_events[i].data == ev) { - ev = sony_nc_events[i].event; + if (key_handle == 0x100) + key_event = sony_100_events; + else + key_event = sony_127_events; + + for (; key_event->data; key_event++) { + if (key_event->data == ev) { + ev = key_event->event; break; } } - if (!sony_nc_events[i].data) + if (!key_event->data) { printk(KERN_INFO DRV_PFX - "Unknown event: %x %x\n", origev, ev); + "Unknown event: 0x%x 0x%x\n", key_handle, + ev); + } } else if (sony_find_snc_handle(0x124) == ev) { sony_nc_rfkill_update(); return; diff --git a/include/linux/sonypi.h b/include/linux/sonypi.h index 8458dbe95862..bb835019ac7f 100644 --- a/include/linux/sonypi.h +++ b/include/linux/sonypi.h @@ -104,6 +104,10 @@ #define SONYPI_EVENT_ZOOM_IN_PRESSED 62 #define SONYPI_EVENT_ZOOM_OUT_PRESSED 63 #define SONYPI_EVENT_CD_EJECT_PRESSED 64 +#define SONYPI_EVENT_MODEKEY_PRESSED 65 +#define SONYPI_EVENT_PKEY_P4 66 +#define SONYPI_EVENT_PKEY_P5 67 +#define SONYPI_EVENT_SETTINGKEY_PRESSED 68 /* get/set brightness */ #define SONYPI_IOCGBRT _IOR('v', 0, __u8) -- cgit v1.2.3 From 1cae71032183776e833036fe828315dcd3444df1 Mon Sep 17 00:00:00 2001 From: Harald Jenny Date: Thu, 26 Mar 2009 21:58:18 +0900 Subject: sony-laptop: VGN-A317M hotkey support This laptop has 5 SPIC managed buttons above the keyboard: sound + and - as well as brightness, zoom and S1. Possibly the entire VGN-A serie behaves the same. 
Signed-off-by: Harald Jenny Signed-off-by: Mattia Dongili Signed-off-by: Len Brown --- drivers/platform/x86/sony-laptop.c | 22 ++++++++++++++++++++++ include/linux/sonypi.h | 3 +++ 2 files changed, 25 insertions(+) (limited to 'include/linux') diff --git a/drivers/platform/x86/sony-laptop.c b/drivers/platform/x86/sony-laptop.c index 3e45c65b8f81..4f932889569b 100644 --- a/drivers/platform/x86/sony-laptop.c +++ b/drivers/platform/x86/sony-laptop.c @@ -230,6 +230,9 @@ static int sony_laptop_input_index[] = { 53, /* 66 SONYPI_EVENT_PKEY_P4 */ 54, /* 67 SONYPI_EVENT_PKEY_P5 */ 55, /* 68 SONYPI_EVENT_SETTINGKEY_PRESSED */ + 56, /* 69 SONYPI_EVENT_VOLUME_INC_PRESSED */ + 57, /* 70 SONYPI_EVENT_VOLUME_DEC_PRESSED */ + -1, /* 71 SONYPI_EVENT_BRIGHTNESS_PRESSED */ }; static int sony_laptop_input_keycode_map[] = { @@ -289,6 +292,8 @@ static int sony_laptop_input_keycode_map[] = { KEY_PROG4, /* 53 SONYPI_EVENT_PKEY_P4 */ KEY_F14, /* 54 SONYPI_EVENT_PKEY_P5 */ KEY_F15, /* 55 SONYPI_EVENT_SETTINGKEY_PRESSED */ + KEY_VOLUMEUP, /* 56 SONYPI_EVENT_VOLUME_INC_PRESSED */ + KEY_VOLUMEDOWN, /* 57 SONYPI_EVENT_VOLUME_DEC_PRESSED */ }; /* release buttons after a short delay if pressed */ @@ -1555,6 +1560,7 @@ static struct sonypi_event sonypi_pkeyev[] = { { 0x01, SONYPI_EVENT_PKEY_P1 }, { 0x02, SONYPI_EVENT_PKEY_P2 }, { 0x04, SONYPI_EVENT_PKEY_P3 }, + { 0x20, SONYPI_EVENT_PKEY_P1 }, { 0, 0 } }; @@ -1598,6 +1604,7 @@ static struct sonypi_event sonypi_zoomev[] = { { 0x39, SONYPI_EVENT_ZOOM_PRESSED }, { 0x10, SONYPI_EVENT_ZOOM_IN_PRESSED }, { 0x20, SONYPI_EVENT_ZOOM_OUT_PRESSED }, + { 0x04, SONYPI_EVENT_ZOOM_PRESSED }, { 0, 0 } }; @@ -1628,6 +1635,19 @@ static struct sonypi_event sonypi_batteryev[] = { { 0, 0 } }; +/* The set of possible volume events */ +static struct sonypi_event sonypi_volumeev[] = { + { 0x01, SONYPI_EVENT_VOLUME_INC_PRESSED }, + { 0x02, SONYPI_EVENT_VOLUME_DEC_PRESSED }, + { 0, 0 } +}; + +/* The set of possible brightness events */ +static struct sonypi_event sonypi_brightnessev[] = { + { 0x80, SONYPI_EVENT_BRIGHTNESS_PRESSED }, + { 0, 0 } +}; + static struct sonypi_eventtypes type1_events[] = { { 0, 0xffffffff, sonypi_releaseev }, { 0x70, SONYPI_MEYE_MASK, sonypi_meyeev }, @@ -1668,6 +1688,8 @@ static struct sonypi_eventtypes type3_events[] = { { 0x05, SONYPI_PKEY_MASK, sonypi_pkeyev }, { 0x05, SONYPI_ZOOM_MASK, sonypi_zoomev }, { 0x05, SONYPI_CAPTURE_MASK, sonypi_captureev }, + { 0x05, SONYPI_PKEY_MASK, sonypi_volumeev }, + { 0x05, SONYPI_PKEY_MASK, sonypi_brightnessev }, { 0 }, }; diff --git a/include/linux/sonypi.h b/include/linux/sonypi.h index bb835019ac7f..34c4475ac4a2 100644 --- a/include/linux/sonypi.h +++ b/include/linux/sonypi.h @@ -108,6 +108,9 @@ #define SONYPI_EVENT_PKEY_P4 66 #define SONYPI_EVENT_PKEY_P5 67 #define SONYPI_EVENT_SETTINGKEY_PRESSED 68 +#define SONYPI_EVENT_VOLUME_INC_PRESSED 69 +#define SONYPI_EVENT_VOLUME_DEC_PRESSED 70 +#define SONYPI_EVENT_BRIGHTNESS_PRESSED 71 /* get/set brightness */ #define SONYPI_IOCGBRT _IOR('v', 0, __u8) -- cgit v1.2.3 From 03a971a2899886006f19f3495973bbd646d8bdae Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Wed, 3 Dec 2008 18:00:38 +0000 Subject: thermal: support forcing support for passive cooling Due to poor thermal design or Linux driving hardware outside its thermal envelope, some systems will reach critical temperature and shut down under high load. This patch adds support for forcing a polling-based passive trip point if the firmware doesn't provide one. 
The assumption is made that the processor is the most practical means to reduce the dynamic heat generation, so hitting the passive thermal limit will cause the CPU to be throttled until the temperature stabalises around the defined value. UI is provided via a "passive" sysfs entry in the thermal zone directory. It accepts a decimal value in millidegrees celsius, or "0" to disable the functionality. Default behaviour is for this functionality to be disabled. Signed-off-by: Matthew Garrett Signed-off-by: Len Brown --- drivers/thermal/thermal_sys.c | 77 +++++++++++++++++++++++++++++++++++++++++++ include/linux/thermal.h | 1 + 2 files changed, 78 insertions(+) (limited to 'include/linux') diff --git a/drivers/thermal/thermal_sys.c b/drivers/thermal/thermal_sys.c index 6378741882f3..d0b093b66adc 100644 --- a/drivers/thermal/thermal_sys.c +++ b/drivers/thermal/thermal_sys.c @@ -214,9 +214,69 @@ trip_point_temp_show(struct device *dev, struct device_attribute *attr, return sprintf(buf, "%ld\n", temperature); } +static ssize_t +passive_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct thermal_zone_device *tz = to_thermal_zone(dev); + struct thermal_cooling_device *cdev = NULL; + int state; + + if (!sscanf(buf, "%d\n", &state)) + return -EINVAL; + + if (state && !tz->forced_passive) { + mutex_lock(&thermal_list_lock); + list_for_each_entry(cdev, &thermal_cdev_list, node) { + if (!strncmp("Processor", cdev->type, + sizeof("Processor"))) + thermal_zone_bind_cooling_device(tz, + THERMAL_TRIPS_NONE, + cdev); + } + mutex_unlock(&thermal_list_lock); + } else if (!state && tz->forced_passive) { + mutex_lock(&thermal_list_lock); + list_for_each_entry(cdev, &thermal_cdev_list, node) { + if (!strncmp("Processor", cdev->type, + sizeof("Processor"))) + thermal_zone_unbind_cooling_device(tz, + THERMAL_TRIPS_NONE, + cdev); + } + mutex_unlock(&thermal_list_lock); + } + + tz->tc1 = 1; + tz->tc2 = 1; + + if (!tz->passive_delay) + tz->passive_delay = 1000; + + if (!tz->polling_delay) + tz->polling_delay = 10000; + + tz->forced_passive = state; + + thermal_zone_device_update(tz); + + return count; +} + +static ssize_t +passive_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct thermal_zone_device *tz = to_thermal_zone(dev); + + return sprintf(buf, "%d\n", tz->forced_passive); +} + static DEVICE_ATTR(type, 0444, type_show, NULL); static DEVICE_ATTR(temp, 0444, temp_show, NULL); static DEVICE_ATTR(mode, 0644, mode_show, mode_store); +static DEVICE_ATTR(passive, S_IRUGO | S_IWUSR, passive_show, \ + passive_store); static struct device_attribute trip_point_attrs[] = { __ATTR(trip_point_0_type, 0444, trip_point_type_show, NULL), @@ -939,6 +999,11 @@ void thermal_zone_device_update(struct thermal_zone_device *tz) break; } } + + if (tz->forced_passive) + thermal_zone_device_passive(tz, temp, tz->forced_passive, + THERMAL_TRIPS_NONE); + tz->last_temperature = temp; if (tz->passive) thermal_zone_device_set_polling(tz, tz->passive_delay); @@ -977,8 +1042,10 @@ struct thermal_zone_device *thermal_zone_device_register(char *type, { struct thermal_zone_device *tz; struct thermal_cooling_device *pos; + enum thermal_trip_type trip_type; int result; int count; + int passive = 0; if (strlen(type) >= THERMAL_NAME_LENGTH) return ERR_PTR(-EINVAL); @@ -1041,8 +1108,18 @@ struct thermal_zone_device *thermal_zone_device_register(char *type, TRIP_POINT_ATTR_ADD(&tz->device, count, result); if (result) goto unregister; + tz->ops->get_trip_type(tz, count, 
&trip_type); + if (trip_type == THERMAL_TRIP_PASSIVE) + passive = 1; } + if (!passive) + result = device_create_file(&tz->device, + &dev_attr_passive); + + if (result) + goto unregister; + result = thermal_add_hwmon_sysfs(tz); if (result) goto unregister; diff --git a/include/linux/thermal.h b/include/linux/thermal.h index a81c61521ba4..1de8b9eb841b 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -113,6 +113,7 @@ struct thermal_zone_device { int polling_delay; int last_temperature; bool passive; + unsigned int forced_passive; struct thermal_zone_device_ops *ops; struct list_head cooling_devices; struct idr idr; -- cgit v1.2.3 From 42d671c78f6486c932b68a50f88768c7b4e57ebf Mon Sep 17 00:00:00 2001 From: Greg Banks Date: Fri, 27 Mar 2009 02:32:47 +1100 Subject: Fix a build warning about leaking CONFIG_NFSD to userspace. Fix a build warning about leaking CONFIG_NFSD to userspace. The nfsd_stats data structure does not need to be available to userspace; no kernel interface uses it. So move it inside #ifdef __KERNEL__ and the warning goes away. Signed-off-by: Greg Banks Signed-off-by: J. Bruce Fields --- include/linux/nfsd/stats.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfsd/stats.h b/include/linux/nfsd/stats.h index 7678cfbe9960..2693ef647df6 100644 --- a/include/linux/nfsd/stats.h +++ b/include/linux/nfsd/stats.h @@ -11,6 +11,11 @@ #include +/* thread usage wraps very million seconds (approx one fortnight) */ +#define NFSD_USAGE_WRAP (HZ*1000000) + +#ifdef __KERNEL__ + struct nfsd_stats { unsigned int rchits; /* repcache hits */ unsigned int rcmisses; /* repcache hits */ @@ -35,10 +40,6 @@ struct nfsd_stats { }; -/* thread usage wraps very million seconds (approx one fortnight) */ -#define NFSD_USAGE_WRAP (HZ*1000000) - -#ifdef __KERNEL__ extern struct nfsd_stats nfsdstats; extern struct svc_stat nfsd_svcstats; -- cgit v1.2.3 From 512a004382f2c60d5c4f855476ba965adc00250c Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 27 Mar 2009 22:14:27 -0400 Subject: ext3: Use WRITE_SYNC for commits which are caused by fsync() If a commit is triggered by fsync(), set a flag indicating the journal blocks associated with the transaction should be flushed out using WRITE_SYNC. Signed-off-by: "Theodore Ts'o" Acked-by: Jan Kara --- fs/jbd/commit.c | 23 +++++++++++++++-------- fs/jbd/transaction.c | 2 ++ include/linux/jbd.h | 5 +++++ 3 files changed, 22 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 3fbffb1ea714..f8077b9c8981 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -20,6 +20,7 @@ #include #include #include +#include /* * Default IO end handler for temporary BJ_IO buffer_heads. 
@@ -171,14 +172,15 @@ static int journal_write_commit_record(journal_t *journal, return (ret == -EIO); } -static void journal_do_submit_data(struct buffer_head **wbuf, int bufs) +static void journal_do_submit_data(struct buffer_head **wbuf, int bufs, + int write_op) { int i; for (i = 0; i < bufs; i++) { wbuf[i]->b_end_io = end_buffer_write_sync; /* We use-up our safety reference in submit_bh() */ - submit_bh(WRITE, wbuf[i]); + submit_bh(write_op, wbuf[i]); } } @@ -186,7 +188,8 @@ static void journal_do_submit_data(struct buffer_head **wbuf, int bufs) * Submit all the data buffers to disk */ static int journal_submit_data_buffers(journal_t *journal, - transaction_t *commit_transaction) + transaction_t *commit_transaction, + int write_op) { struct journal_head *jh; struct buffer_head *bh; @@ -225,7 +228,7 @@ write_out_data: BUFFER_TRACE(bh, "needs blocking lock"); spin_unlock(&journal->j_list_lock); /* Write out all data to prevent deadlocks */ - journal_do_submit_data(wbuf, bufs); + journal_do_submit_data(wbuf, bufs, write_op); bufs = 0; lock_buffer(bh); spin_lock(&journal->j_list_lock); @@ -256,7 +259,7 @@ write_out_data: jbd_unlock_bh_state(bh); if (bufs == journal->j_wbufsize) { spin_unlock(&journal->j_list_lock); - journal_do_submit_data(wbuf, bufs); + journal_do_submit_data(wbuf, bufs, write_op); bufs = 0; goto write_out_data; } @@ -286,7 +289,7 @@ write_out_data: } } spin_unlock(&journal->j_list_lock); - journal_do_submit_data(wbuf, bufs); + journal_do_submit_data(wbuf, bufs, write_op); return err; } @@ -315,6 +318,7 @@ void journal_commit_transaction(journal_t *journal) int first_tag = 0; int tag_flag; int i; + int write_op = WRITE; /* * First job: lock down the current transaction and wait for @@ -347,6 +351,8 @@ void journal_commit_transaction(journal_t *journal) spin_lock(&journal->j_state_lock); commit_transaction->t_state = T_LOCKED; + if (commit_transaction->t_synchronous_commit) + write_op = WRITE_SYNC; spin_lock(&commit_transaction->t_handle_lock); while (commit_transaction->t_updates) { DEFINE_WAIT(wait); @@ -431,7 +437,8 @@ void journal_commit_transaction(journal_t *journal) * Now start flushing things to disk, in the order they appear * on the transaction lists. Data blocks go first. */ - err = journal_submit_data_buffers(journal, commit_transaction); + err = journal_submit_data_buffers(journal, commit_transaction, + write_op); /* * Wait for all previously submitted IO to complete. @@ -660,7 +667,7 @@ start_journal_io: clear_buffer_dirty(bh); set_buffer_uptodate(bh); bh->b_end_io = journal_end_buffer_io_sync; - submit_bh(WRITE, bh); + submit_bh(write_op, bh); } cond_resched(); diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index e6a117431277..ed886e6db399 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -1440,6 +1440,8 @@ int journal_stop(handle_t *handle) } } + if (handle->h_sync) + transaction->t_synchronous_commit = 1; current->journal_info = NULL; spin_lock(&journal->j_state_lock); spin_lock(&transaction->t_handle_lock); diff --git a/include/linux/jbd.h b/include/linux/jbd.h index 64246dce5663..2c6943152c21 100644 --- a/include/linux/jbd.h +++ b/include/linux/jbd.h @@ -552,6 +552,11 @@ struct transaction_s */ int t_handle_count; + /* + * This transaction is being forced and some process is + * waiting for it to finish. 
+ */ + int t_synchronous_commit:1; }; /** -- cgit v1.2.3 From 7faa144a518c456e2057918f030f50100144ccc6 Mon Sep 17 00:00:00 2001 From: Alexey Starikovskiy Date: Fri, 27 Mar 2009 22:23:52 -0400 Subject: ACPI: battery: add power_{now,avg} properties to power_class ACPI has smart batteries, which work in units of energy and measure rate of (dis)charge as power, thus it is not appropriate to export it as a current_now. Current_now will still be exported to allow for userland applications to match. Signed-off-by: Alexey Starikovskiy Signed-off-by: Len Brown --- drivers/acpi/battery.c | 12 +++++++----- drivers/acpi/sbs.c | 27 ++++++++++++++++----------- drivers/power/power_supply_sysfs.c | 2 ++ include/linux/power_supply.h | 2 ++ 4 files changed, 27 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/battery.c b/drivers/acpi/battery.c index 69cbc57c2d1c..09a2240d5605 100644 --- a/drivers/acpi/battery.c +++ b/drivers/acpi/battery.c @@ -92,7 +92,7 @@ struct acpi_battery { #endif struct acpi_device *device; unsigned long update_time; - int current_now; + int rate_now; int capacity_now; int voltage_now; int design_capacity; @@ -196,7 +196,8 @@ static int acpi_battery_get_property(struct power_supply *psy, val->intval = battery->voltage_now * 1000; break; case POWER_SUPPLY_PROP_CURRENT_NOW: - val->intval = battery->current_now * 1000; + case POWER_SUPPLY_PROP_POWER_NOW: + val->intval = battery->rate_now * 1000; break; case POWER_SUPPLY_PROP_CHARGE_FULL_DESIGN: case POWER_SUPPLY_PROP_ENERGY_FULL_DESIGN: @@ -247,6 +248,7 @@ static enum power_supply_property energy_battery_props[] = { POWER_SUPPLY_PROP_VOLTAGE_MIN_DESIGN, POWER_SUPPLY_PROP_VOLTAGE_NOW, POWER_SUPPLY_PROP_CURRENT_NOW, + POWER_SUPPLY_PROP_POWER_NOW, POWER_SUPPLY_PROP_ENERGY_FULL_DESIGN, POWER_SUPPLY_PROP_ENERGY_FULL, POWER_SUPPLY_PROP_ENERGY_NOW, @@ -273,7 +275,7 @@ struct acpi_offsets { static struct acpi_offsets state_offsets[] = { {offsetof(struct acpi_battery, state), 0}, - {offsetof(struct acpi_battery, current_now), 0}, + {offsetof(struct acpi_battery, rate_now), 0}, {offsetof(struct acpi_battery, capacity_now), 0}, {offsetof(struct acpi_battery, voltage_now), 0}, }; @@ -605,11 +607,11 @@ static int acpi_battery_print_state(struct seq_file *seq, int result) else seq_printf(seq, "charging state: charged\n"); - if (battery->current_now == ACPI_BATTERY_VALUE_UNKNOWN) + if (battery->rate_now == ACPI_BATTERY_VALUE_UNKNOWN) seq_printf(seq, "present rate: unknown\n"); else seq_printf(seq, "present rate: %d %s\n", - battery->current_now, acpi_battery_units(battery)); + battery->rate_now, acpi_battery_units(battery)); if (battery->capacity_now == ACPI_BATTERY_VALUE_UNKNOWN) seq_printf(seq, "remaining capacity: unknown\n"); diff --git a/drivers/acpi/sbs.c b/drivers/acpi/sbs.c index 6050ce481873..3963cb6e0f19 100644 --- a/drivers/acpi/sbs.c +++ b/drivers/acpi/sbs.c @@ -102,8 +102,8 @@ struct acpi_battery { u16 cycle_count; u16 temp_now; u16 voltage_now; - s16 current_now; - s16 current_avg; + s16 rate_now; + s16 rate_avg; u16 capacity_now; u16 state_of_charge; u16 state; @@ -202,9 +202,9 @@ static int acpi_sbs_battery_get_property(struct power_supply *psy, return -ENODEV; switch (psp) { case POWER_SUPPLY_PROP_STATUS: - if (battery->current_now < 0) + if (battery->rate_now < 0) val->intval = POWER_SUPPLY_STATUS_DISCHARGING; - else if (battery->current_now > 0) + else if (battery->rate_now > 0) val->intval = POWER_SUPPLY_STATUS_CHARGING; else val->intval = POWER_SUPPLY_STATUS_FULL; @@ -224,11 +224,13 @@ static int 
acpi_sbs_battery_get_property(struct power_supply *psy, acpi_battery_vscale(battery) * 1000; break; case POWER_SUPPLY_PROP_CURRENT_NOW: - val->intval = abs(battery->current_now) * + case POWER_SUPPLY_PROP_POWER_NOW: + val->intval = abs(battery->rate_now) * acpi_battery_ipscale(battery) * 1000; break; case POWER_SUPPLY_PROP_CURRENT_AVG: - val->intval = abs(battery->current_avg) * + case POWER_SUPPLY_PROP_POWER_AVG: + val->intval = abs(battery->rate_avg) * acpi_battery_ipscale(battery) * 1000; break; case POWER_SUPPLY_PROP_CAPACITY: @@ -293,6 +295,8 @@ static enum power_supply_property sbs_energy_battery_props[] = { POWER_SUPPLY_PROP_VOLTAGE_NOW, POWER_SUPPLY_PROP_CURRENT_NOW, POWER_SUPPLY_PROP_CURRENT_AVG, + POWER_SUPPLY_PROP_POWER_NOW, + POWER_SUPPLY_PROP_POWER_AVG, POWER_SUPPLY_PROP_CAPACITY, POWER_SUPPLY_PROP_ENERGY_FULL_DESIGN, POWER_SUPPLY_PROP_ENERGY_FULL, @@ -301,6 +305,7 @@ static enum power_supply_property sbs_energy_battery_props[] = { POWER_SUPPLY_PROP_MODEL_NAME, POWER_SUPPLY_PROP_MANUFACTURER, }; + #endif /* -------------------------------------------------------------------------- @@ -330,8 +335,8 @@ static struct acpi_battery_reader info_readers[] = { static struct acpi_battery_reader state_readers[] = { {0x08, SMBUS_READ_WORD, offsetof(struct acpi_battery, temp_now)}, {0x09, SMBUS_READ_WORD, offsetof(struct acpi_battery, voltage_now)}, - {0x0a, SMBUS_READ_WORD, offsetof(struct acpi_battery, current_now)}, - {0x0b, SMBUS_READ_WORD, offsetof(struct acpi_battery, current_avg)}, + {0x0a, SMBUS_READ_WORD, offsetof(struct acpi_battery, rate_now)}, + {0x0b, SMBUS_READ_WORD, offsetof(struct acpi_battery, rate_avg)}, {0x0f, SMBUS_READ_WORD, offsetof(struct acpi_battery, capacity_now)}, {0x0e, SMBUS_READ_WORD, offsetof(struct acpi_battery, state_of_charge)}, {0x16, SMBUS_READ_WORD, offsetof(struct acpi_battery, state)}, @@ -589,9 +594,9 @@ static int acpi_battery_read_state(struct seq_file *seq, void *offset) seq_printf(seq, "capacity state: %s\n", (battery->state & 0x0010) ? "critical" : "ok"); seq_printf(seq, "charging state: %s\n", - (battery->current_now < 0) ? "discharging" : - ((battery->current_now > 0) ? "charging" : "charged")); - rate = abs(battery->current_now) * acpi_battery_ipscale(battery); + (battery->rate_now < 0) ? "discharging" : + ((battery->rate_now > 0) ? 
"charging" : "charged")); + rate = abs(battery->rate_now) * acpi_battery_ipscale(battery); rate *= (acpi_battery_mode(battery))?(battery->voltage_now * acpi_battery_vscale(battery)/1000):1; seq_printf(seq, "present rate: %d%s\n", rate, diff --git a/drivers/power/power_supply_sysfs.c b/drivers/power/power_supply_sysfs.c index ac01e06817fb..da73591017f9 100644 --- a/drivers/power/power_supply_sysfs.c +++ b/drivers/power/power_supply_sysfs.c @@ -93,6 +93,8 @@ static struct device_attribute power_supply_attrs[] = { POWER_SUPPLY_ATTR(voltage_avg), POWER_SUPPLY_ATTR(current_now), POWER_SUPPLY_ATTR(current_avg), + POWER_SUPPLY_ATTR(power_now), + POWER_SUPPLY_ATTR(power_avg), POWER_SUPPLY_ATTR(charge_full_design), POWER_SUPPLY_ATTR(charge_empty_design), POWER_SUPPLY_ATTR(charge_full), diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index 8ff25e0e7f7a..594c494ac3f0 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -73,6 +73,8 @@ enum power_supply_property { POWER_SUPPLY_PROP_VOLTAGE_AVG, POWER_SUPPLY_PROP_CURRENT_NOW, POWER_SUPPLY_PROP_CURRENT_AVG, + POWER_SUPPLY_PROP_POWER_NOW, + POWER_SUPPLY_PROP_POWER_AVG, POWER_SUPPLY_PROP_CHARGE_FULL_DESIGN, POWER_SUPPLY_PROP_CHARGE_EMPTY_DESIGN, POWER_SUPPLY_PROP_CHARGE_FULL, -- cgit v1.2.3 From efb3288b423d7e3533a68dccecaa05a56a281a4e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 18 Mar 2009 20:45:43 -0400 Subject: SUNRPC: Clean up static inline functions in svc_xprt.h Clean up: Enable the use of const arguments in higher level svc_ APIs by adding const to the arguments of the helper functions in svc_xprt.h Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/svc_xprt.h | 46 +++++++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 0127daca4354..959b931b6053 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -88,29 +88,32 @@ static inline void svc_xprt_get(struct svc_xprt *xprt) kref_get(&xprt->xpt_ref); } static inline void svc_xprt_set_local(struct svc_xprt *xprt, - struct sockaddr *sa, int salen) + const struct sockaddr *sa, + const size_t salen) { memcpy(&xprt->xpt_local, sa, salen); xprt->xpt_locallen = salen; } static inline void svc_xprt_set_remote(struct svc_xprt *xprt, - struct sockaddr *sa, int salen) + const struct sockaddr *sa, + const size_t salen) { memcpy(&xprt->xpt_remote, sa, salen); xprt->xpt_remotelen = salen; } -static inline unsigned short svc_addr_port(struct sockaddr *sa) +static inline unsigned short svc_addr_port(const struct sockaddr *sa) { - unsigned short ret = 0; + const struct sockaddr_in *sin = (const struct sockaddr_in *)sa; + const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)sa; + switch (sa->sa_family) { case AF_INET: - ret = ntohs(((struct sockaddr_in *)sa)->sin_port); - break; + return ntohs(sin->sin_port); case AF_INET6: - ret = ntohs(((struct sockaddr_in6 *)sa)->sin6_port); - break; + return ntohs(sin6->sin6_port); } - return ret; + + return 0; } static inline size_t svc_addr_len(struct sockaddr *sa) @@ -124,36 +127,39 @@ static inline size_t svc_addr_len(struct sockaddr *sa) return -EAFNOSUPPORT; } -static inline unsigned short svc_xprt_local_port(struct svc_xprt *xprt) +static inline unsigned short svc_xprt_local_port(const struct svc_xprt *xprt) { - return svc_addr_port((struct sockaddr *)&xprt->xpt_local); + return 
svc_addr_port((const struct sockaddr *)&xprt->xpt_local); } -static inline unsigned short svc_xprt_remote_port(struct svc_xprt *xprt) +static inline unsigned short svc_xprt_remote_port(const struct svc_xprt *xprt) { - return svc_addr_port((struct sockaddr *)&xprt->xpt_remote); + return svc_addr_port((const struct sockaddr *)&xprt->xpt_remote); } -static inline char *__svc_print_addr(struct sockaddr *addr, - char *buf, size_t len) +static inline char *__svc_print_addr(const struct sockaddr *addr, + char *buf, const size_t len) { + const struct sockaddr_in *sin = (const struct sockaddr_in *)addr; + const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)addr; + switch (addr->sa_family) { case AF_INET: - snprintf(buf, len, "%pI4, port=%u", - &((struct sockaddr_in *)addr)->sin_addr, - ntohs(((struct sockaddr_in *) addr)->sin_port)); + snprintf(buf, len, "%pI4, port=%u", &sin->sin_addr, + ntohs(sin->sin_port)); break; case AF_INET6: snprintf(buf, len, "%pI6, port=%u", - &((struct sockaddr_in6 *)addr)->sin6_addr, - ntohs(((struct sockaddr_in6 *) addr)->sin6_port)); + &sin6->sin6_addr, + ntohs(sin6->sin6_port)); break; default: snprintf(buf, len, "unknown address type: %d", addr->sa_family); break; } + return buf; } #endif /* SUNRPC_SVC_XPRT_H */ -- cgit v1.2.3 From 156e62094a74cf43f02f56ef96b6cda567501357 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 18 Mar 2009 20:45:58 -0400 Subject: SUNRPC: Clean up svc_find_xprt() calling sequence Clean up: add documentating comment and use appropriate data types for svc_find_xprt()'s arguments. This also eliminates a mixed sign comparison: @port was an int, while the return value of svc_xprt_local_port() is an unsigned short. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/svc_xprt.h | 3 ++- net/sunrpc/svc_xprt.c | 16 +++++++++++----- 2 files changed, 13 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 959b931b6053..55b68582c5d9 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -80,7 +80,8 @@ void svc_close_xprt(struct svc_xprt *xprt); void svc_delete_xprt(struct svc_xprt *xprt); int svc_port_is_privileged(struct sockaddr *sin); int svc_print_xprts(char *buf, int maxlen); -struct svc_xprt *svc_find_xprt(struct svc_serv *, char *, int, int); +struct svc_xprt *svc_find_xprt(struct svc_serv *serv, const char *xcl_name, + const sa_family_t af, const unsigned short port); int svc_xprt_names(struct svc_serv *serv, char *buf, int buflen); static inline void svc_xprt_get(struct svc_xprt *xprt) diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index e588df5d6b34..c947c93dbc24 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -1033,7 +1033,13 @@ static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt) return dr; } -/* +/** + * svc_find_xprt - find an RPC transport instance + * @serv: pointer to svc_serv to search + * @xcl_name: C string containing transport's class name + * @af: Address family of transport's local address + * @port: transport's IP port number + * * Return the transport instance pointer for the endpoint accepting * connections/peer traffic from the specified transport class, * address family and port. @@ -1042,14 +1048,14 @@ static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt) * wild-card, and will result in matching the first transport in the * service's list that has a matching class name. 
*/ -struct svc_xprt *svc_find_xprt(struct svc_serv *serv, char *xcl_name, - int af, int port) +struct svc_xprt *svc_find_xprt(struct svc_serv *serv, const char *xcl_name, + const sa_family_t af, const unsigned short port) { struct svc_xprt *xprt; struct svc_xprt *found = NULL; /* Sanity check the args */ - if (!serv || !xcl_name) + if (serv == NULL || xcl_name == NULL) return found; spin_lock_bh(&serv->sv_lock); @@ -1058,7 +1064,7 @@ struct svc_xprt *svc_find_xprt(struct svc_serv *serv, char *xcl_name, continue; if (af != AF_UNSPEC && af != xprt->xpt_local.ss_family) continue; - if (port && port != svc_xprt_local_port(xprt)) + if (port != 0 && port != svc_xprt_local_port(xprt)) continue; found = xprt; svc_xprt_get(xprt); -- cgit v1.2.3 From 4b62e58cccff9c5e7ffc7023f7ec24c75fbd549b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 18 Mar 2009 20:46:06 -0400 Subject: SUNRPC: Pass a family argument to svc_register() The sv_family field is going away. Instead of using sv_family, have the svc_register() function take a protocol family argument. Since this argument represents a protocol family, and not an address family, this argument takes an int, as this is what is passed to sock_create_kern(). Also make sure svc_register's helpers are checking for PF_FOO instead of AF_FOO. The value of [AP]F_FOO are equivalent; this is simply a symbolic change to reflect the semantics of the value stored in that variable. sock_create_kern() should return EPFNOSUPPORT if the passed-in protocol family isn't supported, but it uses EAFNOSUPPORT for this case. We will stick with that tradition here, as svc_register() is called by the RPC server in the same path as sock_create_kern(). Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/svc.h | 4 ++-- net/sunrpc/svc.c | 21 +++++++++++---------- net/sunrpc/svcsock.c | 2 +- 3 files changed, 14 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 3435d24bfe55..1f18fc728cba 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -396,8 +396,8 @@ struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int, int svc_set_num_threads(struct svc_serv *, struct svc_pool *, int); void svc_destroy(struct svc_serv *); int svc_process(struct svc_rqst *); -int svc_register(const struct svc_serv *, const unsigned short, - const unsigned short); +int svc_register(const struct svc_serv *, const int, + const unsigned short, const unsigned short); void svc_wake_up(struct svc_serv *); void svc_reserve(struct svc_rqst *rqstp, int space); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index c51fed4d1af1..41bc36ea2224 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -800,17 +800,17 @@ static int __svc_rpcb_register6(const u32 program, const u32 version, * if any error occurs. */ static int __svc_register(const u32 program, const u32 version, - const sa_family_t family, + const int family, const unsigned short protocol, const unsigned short port) { int error; switch (family) { - case AF_INET: + case PF_INET: return __svc_rpcb_register4(program, version, protocol, port); - case AF_INET6: + case PF_INET6: error = __svc_rpcb_register6(program, version, protocol, port); if (error < 0) @@ -840,11 +840,11 @@ static int __svc_register(const u32 program, const u32 version, * if any error occurs. 
*/ static int __svc_register(const u32 program, const u32 version, - sa_family_t family, + const int family, const unsigned short protocol, const unsigned short port) { - if (family != AF_INET) + if (family != PF_INET) return -EAFNOSUPPORT; return rpcb_register(program, version, protocol, port); @@ -855,13 +855,14 @@ static int __svc_register(const u32 program, const u32 version, /** * svc_register - register an RPC service with the local portmapper * @serv: svc_serv struct for the service to register + * @family: protocol family of service's listener socket * @proto: transport protocol number to advertise * @port: port to advertise * - * Service is registered for any address in serv's address family + * Service is registered for any address in the passed-in protocol family */ -int svc_register(const struct svc_serv *serv, const unsigned short proto, - const unsigned short port) +int svc_register(const struct svc_serv *serv, const int family, + const unsigned short proto, const unsigned short port) { struct svc_program *progp; unsigned int i; @@ -879,7 +880,7 @@ int svc_register(const struct svc_serv *serv, const unsigned short proto, i, proto == IPPROTO_UDP? "udp" : "tcp", port, - serv->sv_family, + family, progp->pg_vers[i]->vs_hidden? " (but not telling portmap)" : ""); @@ -887,7 +888,7 @@ int svc_register(const struct svc_serv *serv, const unsigned short proto, continue; error = __svc_register(progp->pg_prog, i, - serv->sv_family, proto, port); + family, proto, port); if (error < 0) break; } diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 5763e6460fea..d00583c1cd04 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1122,7 +1122,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, /* Register socket with portmapper */ if (*errp >= 0 && pmap_register) - *errp = svc_register(serv, inet->sk_protocol, + *errp = svc_register(serv, serv->sv_family, inet->sk_protocol, ntohs(inet_sk(inet)->sport)); if (*errp < 0) { -- cgit v1.2.3 From 9652ada3fb5914a67d8422114e8a76388330fa79 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 18 Mar 2009 20:46:21 -0400 Subject: SUNRPC: Change svc_create_xprt() to take a @family argument The sv_family field is going away. Pass a protocol family argument to svc_create_xprt() instead of extracting the family from the passed-in svc_serv struct. Again, as this is a listener socket and not an address, we make this new argument an "int" protocol family, instead of an "sa_family_t." 
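For reference, a minimal sketch of what a converted caller looks like once svc_create_xprt() takes the protocol family directly -- the helper below is hypothetical and purely illustrative; the real conversions appear in the diff that follows:

#include <linux/sunrpc/svc.h>
#include <linux/sunrpc/svcsock.h>
#include <linux/sunrpc/svc_xprt.h>

/* Hypothetical example: open a TCP listener for @serv on @port, naming
 * the protocol family explicitly instead of reading it from
 * serv->sv_family.  PF_INET is used rather than AF_INET to reflect that
 * a listener socket, not an address, is being described. */
static int example_create_tcp_listener(struct svc_serv *serv,
					unsigned short port)
{
	int error;

	/* svc_create_xprt() returns the bound port on success or a
	 * negative errno on failure; the port is discarded here. */
	error = svc_create_xprt(serv, "tcp", PF_INET, port,
				SVC_SOCK_DEFAULTS);
	return error < 0 ? error : 0;
}
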
Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/lockd/svc.c | 3 ++- fs/nfs/callback.c | 4 ++-- fs/nfsd/nfsctl.c | 2 +- fs/nfsd/nfssvc.c | 4 ++-- include/linux/sunrpc/svc_xprt.h | 3 ++- net/sunrpc/svc_xprt.c | 15 +++++++++------ 6 files changed, 18 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 64f1c31b5853..390c5593655c 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -211,7 +211,8 @@ static int create_lockd_listener(struct svc_serv *serv, char *name, xprt = svc_find_xprt(serv, name, 0, 0); if (xprt == NULL) - return svc_create_xprt(serv, name, port, SVC_SOCK_DEFAULTS); + return svc_create_xprt(serv, name, nlmsvc_family, + port, SVC_SOCK_DEFAULTS); svc_xprt_put(xprt); return 0; diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 3e634f2a1083..fb35cab63c8a 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -122,8 +122,8 @@ int nfs_callback_up(void) if (!serv) goto out_err; - ret = svc_create_xprt(serv, "tcp", nfs_callback_set_tcpport, - SVC_SOCK_ANONYMOUS); + ret = svc_create_xprt(serv, "tcp", nfs_callback_family, + nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); if (ret <= 0) goto out_err; nfs_callback_tcpport = ret; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 5a936c14f6ff..a4ed8644d69c 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -943,7 +943,7 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size) err = nfsd_create_serv(); if (!err) { err = svc_create_xprt(nfsd_serv, - transport, port, + transport, PF_INET, port, SVC_SOCK_ANONYMOUS); if (err == -ENOENT) /* Give a reasonable perror msg for diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 07e4f5d7baa8..ab7f249055b5 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -244,7 +244,7 @@ static int nfsd_init_socks(int port) if (!list_empty(&nfsd_serv->sv_permsocks)) return 0; - error = svc_create_xprt(nfsd_serv, "udp", port, + error = svc_create_xprt(nfsd_serv, "udp", PF_INET, port, SVC_SOCK_DEFAULTS); if (error < 0) return error; @@ -253,7 +253,7 @@ static int nfsd_init_socks(int port) if (error < 0) return error; - error = svc_create_xprt(nfsd_serv, "tcp", port, + error = svc_create_xprt(nfsd_serv, "tcp", PF_INET, port, SVC_SOCK_DEFAULTS); if (error < 0) return error; diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 55b68582c5d9..0d9cb6ef28b0 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -71,7 +71,8 @@ int svc_reg_xprt_class(struct svc_xprt_class *); void svc_unreg_xprt_class(struct svc_xprt_class *); void svc_xprt_init(struct svc_xprt_class *, struct svc_xprt *, struct svc_serv *); -int svc_create_xprt(struct svc_serv *, char *, unsigned short, int); +int svc_create_xprt(struct svc_serv *, const char *, const int, + const unsigned short, int); void svc_xprt_enqueue(struct svc_xprt *xprt); void svc_xprt_received(struct svc_xprt *); void svc_xprt_put(struct svc_xprt *xprt); diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index c947c93dbc24..2819ee093f36 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -161,7 +161,9 @@ EXPORT_SYMBOL_GPL(svc_xprt_init); static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl, struct svc_serv *serv, - unsigned short port, int flags) + const int family, + const unsigned short port, + int flags) { struct sockaddr_in sin = { .sin_family = AF_INET, @@ -176,12 +178,12 @@ static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl, struct sockaddr *sap; 
size_t len; - switch (serv->sv_family) { - case AF_INET: + switch (family) { + case PF_INET: sap = (struct sockaddr *)&sin; len = sizeof(sin); break; - case AF_INET6: + case PF_INET6: sap = (struct sockaddr *)&sin6; len = sizeof(sin6); break; @@ -192,7 +194,8 @@ static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl, return xcl->xcl_ops->xpo_create(serv, sap, len, flags); } -int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port, +int svc_create_xprt(struct svc_serv *serv, const char *xprt_name, + const int family, const unsigned short port, int flags) { struct svc_xprt_class *xcl; @@ -209,7 +212,7 @@ int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port, goto err; spin_unlock(&svc_xprt_class_lock); - newxprt = __svc_xpo_create(xcl, serv, port, flags); + newxprt = __svc_xpo_create(xcl, serv, family, port, flags); if (IS_ERR(newxprt)) { module_put(xcl->xcl_owner); return PTR_ERR(newxprt); -- cgit v1.2.3 From 49a9072f29a1039f142ec98b44a72d7173651c02 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 18 Mar 2009 20:46:29 -0400 Subject: SUNRPC: Remove @family argument from svc_create() and svc_create_pooled() Since an RPC service listener's protocol family is specified now via svc_create_xprt(), it no longer needs to be passed to svc_create() or svc_create_pooled(). Remove that argument from the synopsis of those functions, and remove the sv_family field from the svc_serv struct. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/lockd/svc.c | 2 +- fs/nfs/callback.c | 3 +-- fs/nfsd/nfssvc.c | 1 - include/linux/sunrpc/svc.h | 5 ++--- net/sunrpc/svc.c | 11 +++++------ 5 files changed, 9 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 390c5593655c..d30920038cb6 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -275,7 +275,7 @@ int lockd_up(void) "lockd_up: no pid, %d users??\n", nlmsvc_users); error = -ENOMEM; - serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, nlmsvc_family, NULL); + serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, NULL); if (!serv) { printk(KERN_WARNING "lockd_up: create service failed\n"); goto out; diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index fb35cab63c8a..ddf4b4ae6967 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -116,8 +116,7 @@ int nfs_callback_up(void) mutex_lock(&nfs_callback_mutex); if (nfs_callback_info.users++ || nfs_callback_info.task != NULL) goto out; - serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, - nfs_callback_family, NULL); + serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL); ret = -ENOMEM; if (!serv) goto out_err; diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index ab7f249055b5..bc3567bab8c4 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -229,7 +229,6 @@ int nfsd_create_serv(void) atomic_set(&nfsd_busy, 0); nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, - AF_INET, nfsd_last_thread, nfsd, THIS_MODULE); if (nfsd_serv == NULL) err = -ENOMEM; diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 1f18fc728cba..d3a4c0231933 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -69,7 +69,6 @@ struct svc_serv { struct list_head sv_tempsocks; /* all temporary sockets */ int sv_tmpcnt; /* count of temporary sockets */ struct timer_list sv_temptimer; /* timer for aging temporary sockets */ - sa_family_t sv_family; /* listener's address family */ char * sv_name; /* service name */ @@ -385,13 
+384,13 @@ struct svc_procedure { /* * Function prototypes. */ -struct svc_serv *svc_create(struct svc_program *, unsigned int, sa_family_t, +struct svc_serv *svc_create(struct svc_program *, unsigned int, void (*shutdown)(struct svc_serv *)); struct svc_rqst *svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool); void svc_exit_thread(struct svc_rqst *); struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int, - sa_family_t, void (*shutdown)(struct svc_serv *), + void (*shutdown)(struct svc_serv *), svc_thread_fn, struct module *); int svc_set_num_threads(struct svc_serv *, struct svc_pool *, int); void svc_destroy(struct svc_serv *); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 41bc36ea2224..d72ff44826d8 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -359,7 +359,7 @@ svc_pool_for_cpu(struct svc_serv *serv, int cpu) */ static struct svc_serv * __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, - sa_family_t family, void (*shutdown)(struct svc_serv *serv)) + void (*shutdown)(struct svc_serv *serv)) { struct svc_serv *serv; unsigned int vers; @@ -368,7 +368,6 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, if (!(serv = kzalloc(sizeof(*serv), GFP_KERNEL))) return NULL; - serv->sv_family = family; serv->sv_name = prog->pg_name; serv->sv_program = prog; serv->sv_nrthreads = 1; @@ -427,21 +426,21 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, struct svc_serv * svc_create(struct svc_program *prog, unsigned int bufsize, - sa_family_t family, void (*shutdown)(struct svc_serv *serv)) + void (*shutdown)(struct svc_serv *serv)) { - return __svc_create(prog, bufsize, /*npools*/1, family, shutdown); + return __svc_create(prog, bufsize, /*npools*/1, shutdown); } EXPORT_SYMBOL_GPL(svc_create); struct svc_serv * svc_create_pooled(struct svc_program *prog, unsigned int bufsize, - sa_family_t family, void (*shutdown)(struct svc_serv *serv), + void (*shutdown)(struct svc_serv *serv), svc_thread_fn func, struct module *mod) { struct svc_serv *serv; unsigned int npools = svc_pool_map_get(); - serv = __svc_create(prog, bufsize, npools, family, shutdown); + serv = __svc_create(prog, bufsize, npools, shutdown); if (serv != NULL) { serv->sv_function = func; -- cgit v1.2.3 From e354d571bb481f1d71f2c3004b9ff570b32e83bd Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Sat, 28 Mar 2009 11:30:52 +0300 Subject: nfsd: embed nfsd4_current_state in nfsd4_compoundres Remove the allocation of struct nfsd4_compound_state. Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4proc.c | 40 +++++++++------------------------------- include/linux/nfsd/xdr4.h | 9 +++++---- 2 files changed, 14 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index f156b85b4129..7bb8cb321dfe 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -809,29 +809,6 @@ static inline void nfsd4_increment_op_stats(u32 opnum) nfsdstats.nfs4_opcount[opnum]++; } -static void cstate_free(struct nfsd4_compound_state *cstate) -{ - if (cstate == NULL) - return; - fh_put(&cstate->current_fh); - fh_put(&cstate->save_fh); - BUG_ON(cstate->replay_owner); - kfree(cstate); -} - -static struct nfsd4_compound_state *cstate_alloc(void) -{ - struct nfsd4_compound_state *cstate; - - cstate = kmalloc(sizeof(struct nfsd4_compound_state), GFP_KERNEL); - if (cstate == NULL) - return NULL; - fh_init(&cstate->current_fh, NFS4_FHSIZE); - fh_init(&cstate->save_fh, NFS4_FHSIZE); - cstate->replay_owner = NULL; - return cstate; -} - typedef __be32(*nfsd4op_func)(struct svc_rqst *, struct nfsd4_compound_state *, void *); @@ -859,12 +836,13 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, { struct nfsd4_op *op; struct nfsd4_operation *opdesc; - struct nfsd4_compound_state *cstate = NULL; + struct nfsd4_compound_state *cstate = &resp->cstate; int slack_bytes; __be32 status; resp->xbuf = &rqstp->rq_res; - resp->p = rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len; + resp->p = rqstp->rq_res.head[0].iov_base + + rqstp->rq_res.head[0].iov_len; resp->tagp = resp->p; /* reserve space for: taglen, tag, and opcnt */ resp->p += 2 + XDR_QUADLEN(args->taglen); @@ -873,6 +851,9 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, resp->tag = args->tag; resp->opcnt = 0; resp->rqstp = rqstp; + resp->cstate.replay_owner = NULL; + fh_init(&resp->cstate.current_fh, NFS4_FHSIZE); + fh_init(&resp->cstate.save_fh, NFS4_FHSIZE); /* * According to RFC3010, this takes precedence over all other errors. 
@@ -881,11 +862,6 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, if (args->minorversion > NFSD_SUPPORTED_MINOR_VERSION) goto out; - status = nfserr_resource; - cstate = cstate_alloc(); - if (cstate == NULL) - goto out; - status = nfs_ok; while (!status && resp->opcnt < args->opcnt) { op = &args->ops[resp->opcnt++]; @@ -958,7 +934,9 @@ encode_op: nfsd4_increment_op_stats(op->opnum); } - cstate_free(cstate); + fh_put(&resp->cstate.current_fh); + fh_put(&resp->cstate.save_fh); + BUG_ON(resp->cstate.replay_owner); out: nfsd4_release_compoundargs(args); dprintk("nfsv4 compound returned %d\n", ntohl(status)); diff --git a/include/linux/nfsd/xdr4.h b/include/linux/nfsd/xdr4.h index 27bd3e38ec5a..fd15ddc3359d 100644 --- a/include/linux/nfsd/xdr4.h +++ b/include/linux/nfsd/xdr4.h @@ -45,9 +45,9 @@ #define XDR_LEN(n) (((n) + 3) & ~3) struct nfsd4_compound_state { - struct svc_fh current_fh; - struct svc_fh save_fh; - struct nfs4_stateowner *replay_owner; + struct svc_fh current_fh; + struct svc_fh save_fh; + struct nfs4_stateowner *replay_owner; }; struct nfsd4_change_info { @@ -416,7 +416,8 @@ struct nfsd4_compoundres { u32 taglen; char * tag; u32 opcnt; - __be32 * tagp; /* where to encode tag and opcount */ + __be32 * tagp; /* tag, opcount encode location */ + struct nfsd4_compound_state cstate; }; #define NFS4_SVC_XDRSIZE sizeof(struct nfsd4_compoundargs) -- cgit v1.2.3 From 6f4303fb2ec68055e793b84887a7ae0f9ea7cc2d Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Thu, 29 Jan 2009 00:15:51 +0100 Subject: HID: bring back possibility to specify vid/pid ignore on module load When hid quirks were converted to specialized driver, the HID_QUIRK_IGNORE has been moved completely, as the hid_ignore_list[] has been moved into the generic code. However userspace already got used to the possibility that modprobing usbhid with 'quirks=vid:pid:0x4' makes the device ignored by usbhid driver. So keep this quirk flag in place for backwards compatibility. Signed-off-by: Jiri Kosina --- drivers/hid/usbhid/hid-core.c | 3 +++ include/linux/hid.h | 1 + 2 files changed, 4 insertions(+) (limited to 'include/linux') diff --git a/drivers/hid/usbhid/hid-core.c b/drivers/hid/usbhid/hid-core.c index f0a0f72238ab..eed05a3017e5 100644 --- a/drivers/hid/usbhid/hid-core.c +++ b/drivers/hid/usbhid/hid-core.c @@ -711,6 +711,9 @@ static int usbhid_parse(struct hid_device *hid) quirks = usbhid_lookup_quirk(le16_to_cpu(dev->descriptor.idVendor), le16_to_cpu(dev->descriptor.idProduct)); + if (quirks & HID_QUIRK_IGNORE) + return -ENODEV; + /* Many keyboards and mice don't like to be polled for reports, * so we will always set the HID_QUIRK_NOGET flag for them. */ if (interface->desc.bInterfaceSubClass == USB_INTERFACE_SUBCLASS_BOOT) { diff --git a/include/linux/hid.h b/include/linux/hid.h index fa8ee9cef7be..a46cda488695 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -270,6 +270,7 @@ struct hid_item { #define HID_QUIRK_INVERT 0x00000001 #define HID_QUIRK_NOTOUCH 0x00000002 +#define HID_QUIRK_IGNORE 0x00000004 #define HID_QUIRK_NOGET 0x00000008 #define HID_QUIRK_BADPAD 0x00000020 #define HID_QUIRK_MULTI_INPUT 0x00000040 -- cgit v1.2.3 From afa5eb7c68689ced4284f01c96feed44a2d0a127 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Wed, 18 Mar 2009 09:13:37 +0100 Subject: HID: remove compat stuff This removal was scheduled and there is no problem with later distros to adapt for the new bus, thanks to aliases. 
module-init-tools map files are deprecated nowadays, so that the patch which introduced hid ones into the m-i-t won't be accepted and hence there is no reason for leaving compat stuff in. Signed-off-by: Jiri Slaby Cc: Jiri Kosina Signed-off-by: Jiri Kosina --- Documentation/feature-removal-schedule.txt | 7 --- drivers/hid/Kconfig | 12 ----- drivers/hid/Makefile | 4 -- drivers/hid/hid-a4tech.c | 2 - drivers/hid/hid-apple.c | 2 - drivers/hid/hid-belkin.c | 2 - drivers/hid/hid-cherry.c | 2 - drivers/hid/hid-chicony.c | 2 - drivers/hid/hid-core.c | 21 -------- drivers/hid/hid-cypress.c | 2 - drivers/hid/hid-drff.c | 2 - drivers/hid/hid-dummy.c | 87 ------------------------------ drivers/hid/hid-ezkey.c | 2 - drivers/hid/hid-gaff.c | 2 - drivers/hid/hid-gyration.c | 2 - drivers/hid/hid-kensington.c | 2 - drivers/hid/hid-kye.c | 2 - drivers/hid/hid-lg.c | 2 - drivers/hid/hid-microsoft.c | 2 - drivers/hid/hid-monterey.c | 2 - drivers/hid/hid-ntrig.c | 2 - drivers/hid/hid-petalynx.c | 2 - drivers/hid/hid-pl.c | 2 - drivers/hid/hid-samsung.c | 2 - drivers/hid/hid-sony.c | 2 - drivers/hid/hid-sunplus.c | 2 - drivers/hid/hid-tmff.c | 2 - drivers/hid/hid-topseed.c | 2 - drivers/hid/hid-zpff.c | 2 - include/linux/hid.h | 16 ------ 30 files changed, 195 deletions(-) delete mode 100644 drivers/hid/hid-dummy.c (limited to 'include/linux') diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 1135996bec8b..fc5e85a5901c 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -273,13 +273,6 @@ Who: Glauber Costa --------------------------- -What: remove HID compat support -When: 2.6.29 -Why: needed only as a temporary solution until distros fix themselves up -Who: Jiri Slaby - ---------------------------- - What: print_fn_descriptor_symbol() When: October 2009 Why: The %pF vsprintf format provides the same functionality in a diff --git a/drivers/hid/Kconfig b/drivers/hid/Kconfig index 88e16ef93247..63a2564f0f81 100644 --- a/drivers/hid/Kconfig +++ b/drivers/hid/Kconfig @@ -70,18 +70,6 @@ source "drivers/hid/usbhid/Kconfig" menu "Special HID drivers" depends on HID -config HID_COMPAT - bool "Load all HID drivers on hid core load" - default y - ---help--- - Compatible option for older userspace. If you have system without udev - support of module loading through aliases and also old - module-init-tools which can't handle hid bus, choose Y here. Otherwise - say N. If you say N and your userspace is old enough, the only - functionality you lose is modules autoloading. - - If unsure, say Y. 
- config HID_A4TECH tristate "A4 tech" if EMBEDDED depends on USB_HID diff --git a/drivers/hid/Makefile b/drivers/hid/Makefile index e6b72ed0d70a..1f7cb0fd4505 100644 --- a/drivers/hid/Makefile +++ b/drivers/hid/Makefile @@ -8,10 +8,6 @@ obj-$(CONFIG_HID) += hid.o hid-$(CONFIG_HID_DEBUG) += hid-debug.o hid-$(CONFIG_HIDRAW) += hidraw.o -ifdef CONFIG_HID_COMPAT -obj-m += hid-dummy.o -endif - hid-logitech-objs := hid-lg.o ifdef CONFIG_LOGITECH_FF hid-logitech-objs += hid-lgff.o diff --git a/drivers/hid/hid-a4tech.c b/drivers/hid/hid-a4tech.c index ebca00e6c103..42ea359e94cf 100644 --- a/drivers/hid/hid-a4tech.c +++ b/drivers/hid/hid-a4tech.c @@ -158,5 +158,3 @@ static void a4_exit(void) module_init(a4_init); module_exit(a4_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(a4tech); diff --git a/drivers/hid/hid-apple.c b/drivers/hid/hid-apple.c index cab3be7ef0ab..7359d9d88e46 100644 --- a/drivers/hid/hid-apple.c +++ b/drivers/hid/hid-apple.c @@ -474,5 +474,3 @@ static void apple_exit(void) module_init(apple_init); module_exit(apple_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(apple); diff --git a/drivers/hid/hid-belkin.c b/drivers/hid/hid-belkin.c index 12c8a9ba6ed6..2f6723133a4b 100644 --- a/drivers/hid/hid-belkin.c +++ b/drivers/hid/hid-belkin.c @@ -101,5 +101,3 @@ static void belkin_exit(void) module_init(belkin_init); module_exit(belkin_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(belkin); diff --git a/drivers/hid/hid-cherry.c b/drivers/hid/hid-cherry.c index b833b9769aba..ab8209e7e45c 100644 --- a/drivers/hid/hid-cherry.c +++ b/drivers/hid/hid-cherry.c @@ -83,5 +83,3 @@ static void ch_exit(void) module_init(ch_init); module_exit(ch_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(cherry); diff --git a/drivers/hid/hid-chicony.c b/drivers/hid/hid-chicony.c index a54d4096e0f7..7f91076d8493 100644 --- a/drivers/hid/hid-chicony.c +++ b/drivers/hid/hid-chicony.c @@ -76,5 +76,3 @@ static void ch_exit(void) module_init(ch_init); module_exit(ch_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(chicony); diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index b96fbd5dab55..e56f8d5d3a50 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -1819,15 +1819,6 @@ void hid_unregister_driver(struct hid_driver *hdrv) } EXPORT_SYMBOL_GPL(hid_unregister_driver); -#ifdef CONFIG_HID_COMPAT -static void hid_compat_load(struct work_struct *ws) -{ - request_module("hid-dummy"); -} -static DECLARE_WORK(hid_compat_work, hid_compat_load); -static struct workqueue_struct *hid_compat_wq; -#endif - static int __init hid_init(void) { int ret; @@ -1842,15 +1833,6 @@ static int __init hid_init(void) if (ret) goto err_bus; -#ifdef CONFIG_HID_COMPAT - hid_compat_wq = create_singlethread_workqueue("hid_compat"); - if (!hid_compat_wq) { - hidraw_exit(); - goto err; - } - queue_work(hid_compat_wq, &hid_compat_work); -#endif - return 0; err_bus: bus_unregister(&hid_bus_type); @@ -1860,9 +1842,6 @@ err: static void __exit hid_exit(void) { -#ifdef CONFIG_HID_COMPAT - destroy_workqueue(hid_compat_wq); -#endif hidraw_exit(); bus_unregister(&hid_bus_type); } diff --git a/drivers/hid/hid-cypress.c b/drivers/hid/hid-cypress.c index 5d69d27b935d..9d6d3b91773b 100644 --- a/drivers/hid/hid-cypress.c +++ b/drivers/hid/hid-cypress.c @@ -154,5 +154,3 @@ static void cp_exit(void) module_init(cp_init); module_exit(cp_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(cypress); diff --git a/drivers/hid/hid-drff.c b/drivers/hid/hid-drff.c index 785d2492b5ef..34f3eb65100b 
100644 --- a/drivers/hid/hid-drff.c +++ b/drivers/hid/hid-drff.c @@ -186,5 +186,3 @@ static void __exit dr_exit(void) module_init(dr_init); module_exit(dr_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(dragonrise); diff --git a/drivers/hid/hid-dummy.c b/drivers/hid/hid-dummy.c deleted file mode 100644 index 74d765f38624..000000000000 --- a/drivers/hid/hid-dummy.c +++ /dev/null @@ -1,87 +0,0 @@ -#include -#include -#include - -static int __init hid_dummy_init(void) -{ -#ifdef CONFIG_HID_A4TECH_MODULE - HID_COMPAT_CALL_DRIVER(a4tech); -#endif -#ifdef CONFIG_HID_APPLE_MODULE - HID_COMPAT_CALL_DRIVER(apple); -#endif -#ifdef CONFIG_HID_BELKIN_MODULE - HID_COMPAT_CALL_DRIVER(belkin); -#endif -#ifdef CONFIG_HID_BRIGHT_MODULE - HID_COMPAT_CALL_DRIVER(bright); -#endif -#ifdef CONFIG_HID_CHERRY_MODULE - HID_COMPAT_CALL_DRIVER(cherry); -#endif -#ifdef CONFIG_HID_CHICONY_MODULE - HID_COMPAT_CALL_DRIVER(chicony); -#endif -#ifdef CONFIG_HID_CYPRESS_MODULE - HID_COMPAT_CALL_DRIVER(cypress); -#endif -#ifdef CONFIG_HID_DELL_MODULE - HID_COMPAT_CALL_DRIVER(dell); -#endif -#ifdef CONFIG_DRAGONRISE_FF_MODULE - HID_COMPAT_CALL_DRIVER(dragonrise); -#endif -#ifdef CONFIG_HID_EZKEY_MODULE - HID_COMPAT_CALL_DRIVER(ezkey); -#endif -#ifdef CONFIG_HID_KYE_MODULE - HID_COMPAT_CALL_DRIVER(kye); -#endif -#ifdef CONFIG_HID_GYRATION_MODULE - HID_COMPAT_CALL_DRIVER(gyration); -#endif -#ifdef CONFIG_HID_KENSINGTON_MODULE - HID_COMPAT_CALL_DRIVER(kensington); -#endif -#ifdef CONFIG_HID_LOGITECH_MODULE - HID_COMPAT_CALL_DRIVER(logitech); -#endif -#ifdef CONFIG_HID_MICROSOFT_MODULE - HID_COMPAT_CALL_DRIVER(microsoft); -#endif -#ifdef CONFIG_HID_MONTEREY_MODULE - HID_COMPAT_CALL_DRIVER(monterey); -#endif -#ifdef CONFIG_HID_NTRIG_MODULE - HID_COMPAT_CALL_DRIVER(ntrig); -#endif -#ifdef CONFIG_HID_PANTHERLORD_MODULE - HID_COMPAT_CALL_DRIVER(pantherlord); -#endif -#ifdef CONFIG_HID_PETALYNX_MODULE - HID_COMPAT_CALL_DRIVER(petalynx); -#endif -#ifdef CONFIG_HID_SAMSUNG_MODULE - HID_COMPAT_CALL_DRIVER(samsung); -#endif -#ifdef CONFIG_HID_SONY_MODULE - HID_COMPAT_CALL_DRIVER(sony); -#endif -#ifdef CONFIG_HID_SUNPLUS_MODULE - HID_COMPAT_CALL_DRIVER(sunplus); -#endif -#ifdef CONFIG_GREENASIA_FF_MODULE - HID_COMPAT_CALL_DRIVER(greenasia); -#endif -#ifdef CONFIG_THRUSTMASTER_FF_MODULE - HID_COMPAT_CALL_DRIVER(thrustmaster); -#endif -#ifdef CONFIG_ZEROPLUS_FF_MODULE - HID_COMPAT_CALL_DRIVER(zeroplus); -#endif - - return -EIO; -} -module_init(hid_dummy_init); - -MODULE_LICENSE("GPL"); diff --git a/drivers/hid/hid-ezkey.c b/drivers/hid/hid-ezkey.c index deb42f931b7e..0a1fe054799b 100644 --- a/drivers/hid/hid-ezkey.c +++ b/drivers/hid/hid-ezkey.c @@ -91,5 +91,3 @@ static void ez_exit(void) module_init(ez_init); module_exit(ez_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(ezkey); diff --git a/drivers/hid/hid-gaff.c b/drivers/hid/hid-gaff.c index 71211f6a4f02..510ad3ab8d33 100644 --- a/drivers/hid/hid-gaff.c +++ b/drivers/hid/hid-gaff.c @@ -181,5 +181,3 @@ static void __exit ga_exit(void) module_init(ga_init); module_exit(ga_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(greenasia); diff --git a/drivers/hid/hid-gyration.c b/drivers/hid/hid-gyration.c index 04a0afec52ac..d42d222097a8 100644 --- a/drivers/hid/hid-gyration.c +++ b/drivers/hid/hid-gyration.c @@ -94,5 +94,3 @@ static void gyration_exit(void) module_init(gyration_init); module_exit(gyration_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(gyration); diff --git a/drivers/hid/hid-kensington.c b/drivers/hid/hid-kensington.c index 
747fee5b2a73..7353bd79cbe9 100644 --- a/drivers/hid/hid-kensington.c +++ b/drivers/hid/hid-kensington.c @@ -61,5 +61,3 @@ static void ks_exit(void) module_init(ks_init); module_exit(ks_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(kensington); diff --git a/drivers/hid/hid-kye.c b/drivers/hid/hid-kye.c index ea7f412e31a9..72ee3fec56d9 100644 --- a/drivers/hid/hid-kye.c +++ b/drivers/hid/hid-kye.c @@ -67,5 +67,3 @@ static void kye_exit(void) module_init(kye_init); module_exit(kye_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(kye); diff --git a/drivers/hid/hid-lg.c b/drivers/hid/hid-lg.c index 83e07c9f4144..7b80cb694982 100644 --- a/drivers/hid/hid-lg.c +++ b/drivers/hid/hid-lg.c @@ -326,5 +326,3 @@ static void lg_exit(void) module_init(lg_init); module_exit(lg_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(logitech); diff --git a/drivers/hid/hid-microsoft.c b/drivers/hid/hid-microsoft.c index 25b10dcad90d..5e9e37a0506d 100644 --- a/drivers/hid/hid-microsoft.c +++ b/drivers/hid/hid-microsoft.c @@ -210,5 +210,3 @@ static void ms_exit(void) module_init(ms_init); module_exit(ms_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(microsoft); diff --git a/drivers/hid/hid-monterey.c b/drivers/hid/hid-monterey.c index f3a85a065f18..240f87618be6 100644 --- a/drivers/hid/hid-monterey.c +++ b/drivers/hid/hid-monterey.c @@ -78,5 +78,3 @@ static void mr_exit(void) module_init(mr_init); module_exit(mr_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(monterey); diff --git a/drivers/hid/hid-ntrig.c b/drivers/hid/hid-ntrig.c index db44fbd7bdf6..c5b252be9c21 100644 --- a/drivers/hid/hid-ntrig.c +++ b/drivers/hid/hid-ntrig.c @@ -78,5 +78,3 @@ static void ntrig_exit(void) module_init(ntrig_init); module_exit(ntrig_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(ntrig); diff --git a/drivers/hid/hid-petalynx.c b/drivers/hid/hid-petalynx.c index 10945fe12d50..2e83e8ff891a 100644 --- a/drivers/hid/hid-petalynx.c +++ b/drivers/hid/hid-petalynx.c @@ -118,5 +118,3 @@ static void pl_exit(void) module_init(pl_init); module_exit(pl_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(petalynx); diff --git a/drivers/hid/hid-pl.c b/drivers/hid/hid-pl.c index 9ad76bf71186..4db9a3483760 100644 --- a/drivers/hid/hid-pl.c +++ b/drivers/hid/hid-pl.c @@ -230,5 +230,3 @@ static void pl_exit(void) module_init(pl_init); module_exit(pl_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(pantherlord); diff --git a/drivers/hid/hid-samsung.c b/drivers/hid/hid-samsung.c index 15f3c0492450..07083aa6c19a 100644 --- a/drivers/hid/hid-samsung.c +++ b/drivers/hid/hid-samsung.c @@ -96,5 +96,3 @@ static void samsung_exit(void) module_init(samsung_init); module_exit(samsung_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(samsung); diff --git a/drivers/hid/hid-sony.c b/drivers/hid/hid-sony.c index dd5a3979a4de..c2599388a350 100644 --- a/drivers/hid/hid-sony.c +++ b/drivers/hid/hid-sony.c @@ -148,5 +148,3 @@ static void sony_exit(void) module_init(sony_init); module_exit(sony_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(sony); diff --git a/drivers/hid/hid-sunplus.c b/drivers/hid/hid-sunplus.c index 5ba68f7dbb78..e0a8fd36a85b 100644 --- a/drivers/hid/hid-sunplus.c +++ b/drivers/hid/hid-sunplus.c @@ -78,5 +78,3 @@ static void sp_exit(void) module_init(sp_init); module_exit(sp_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(sunplus); diff --git a/drivers/hid/hid-tmff.c b/drivers/hid/hid-tmff.c index 1b7cba0f7e1f..7c1f7b50330c 100644 --- a/drivers/hid/hid-tmff.c +++ 
b/drivers/hid/hid-tmff.c @@ -265,5 +265,3 @@ static void tm_exit(void) module_init(tm_init); module_exit(tm_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(thrustmaster); diff --git a/drivers/hid/hid-topseed.c b/drivers/hid/hid-topseed.c index cca64a0564a9..152ccfabeba5 100644 --- a/drivers/hid/hid-topseed.c +++ b/drivers/hid/hid-topseed.c @@ -73,5 +73,3 @@ static void ts_exit(void) module_init(ts_init); module_exit(ts_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(topseed); diff --git a/drivers/hid/hid-zpff.c b/drivers/hid/hid-zpff.c index ea82f3718b21..85a198a18537 100644 --- a/drivers/hid/hid-zpff.c +++ b/drivers/hid/hid-zpff.c @@ -158,5 +158,3 @@ static void zp_exit(void) module_init(zp_init); module_exit(zp_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(zeroplus); diff --git a/include/linux/hid.h b/include/linux/hid.h index a46cda488695..a46cbea71d65 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -792,21 +792,5 @@ dbg_hid(const char *fmt, ...) __FILE__ , ## arg) #endif /* HID_FF */ -#ifdef __KERNEL__ -#ifdef CONFIG_HID_COMPAT -#define HID_COMPAT_LOAD_DRIVER(name) \ -/* prototype to avoid sparse warning */ \ -extern void hid_compat_##name(void); \ -void hid_compat_##name(void) { } \ -EXPORT_SYMBOL(hid_compat_##name) -#else -#define HID_COMPAT_LOAD_DRIVER(name) -#endif /* HID_COMPAT */ -#define HID_COMPAT_CALL_DRIVER(name) do { \ - extern void hid_compat_##name(void); \ - hid_compat_##name(); \ -} while (0) -#endif /* __KERNEL__ */ - #endif -- cgit v1.2.3 From 877d03105d04b2c13e241130277fa69c8d2564f0 Mon Sep 17 00:00:00 2001 From: Nick Andrew Date: Mon, 26 Jan 2009 11:06:57 +0100 Subject: trivial: Fix misspelling of firmware Fix misspelling of firmware. Signed-off-by: Nick Andrew Signed-off-by: Jiri Kosina --- Documentation/ia64/kvm.txt | 2 +- Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe/firmware.txt | 2 +- arch/mips/sgi-ip27/ip27-smp.c | 2 +- arch/sparc/kernel/head_64.S | 2 +- drivers/net/sb1250-mac.c | 2 +- drivers/net/tg3.c | 2 +- drivers/net/wireless/ipw2x00/ipw2100.c | 2 +- drivers/net/wireless/ipw2x00/ipw2200.c | 2 +- drivers/net/wireless/iwlwifi/iwl-agn.c | 2 +- drivers/net/wireless/iwlwifi/iwl3945-base.c | 2 +- drivers/net/wireless/libertas/cmd.c | 2 +- drivers/pci/pci.c | 2 +- drivers/platform/x86/thinkpad_acpi.c | 2 +- drivers/staging/otus/hal/hpmain.c | 2 +- drivers/usb/atm/ueagle-atm.c | 2 +- drivers/usb/serial/ChangeLog.history | 2 +- include/linux/libata.h | 2 +- kernel/power/disk.c | 4 ++-- sound/oss/pss.c | 2 +- sound/sh/aica.c | 2 +- 20 files changed, 21 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/Documentation/ia64/kvm.txt b/Documentation/ia64/kvm.txt index 84f7cb3d5bec..ffb5c80bec3e 100644 --- a/Documentation/ia64/kvm.txt +++ b/Documentation/ia64/kvm.txt @@ -42,7 +42,7 @@ Note: For step 2, please make sure that host page size == TARGET_PAGE_SIZE of qe hg clone http://xenbits.xensource.com/ext/efi-vfirmware.hg you can get the firmware's binary in the directory of efi-vfirmware.hg/binaries. - (3) Rename the firware you owned to Flash.fd, and copy it to /usr/local/share/qemu + (3) Rename the firmware you owned to Flash.fd, and copy it to /usr/local/share/qemu 4. Boot up Linux or Windows guests: 4.1 Create or install a image for guest boot. If you have xen experience, it should be easy. 
diff --git a/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe/firmware.txt b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe/firmware.txt index 6c238f59b2a9..249db3a15d15 100644 --- a/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe/firmware.txt +++ b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe/firmware.txt @@ -1,6 +1,6 @@ * Uploaded QE firmware - If a new firwmare has been uploaded to the QE (usually by the + If a new firmware has been uploaded to the QE (usually by the boot loader), then a 'firmware' child node should be added to the QE node. This node provides information on the uploaded firmware that device drivers may need. diff --git a/arch/mips/sgi-ip27/ip27-smp.c b/arch/mips/sgi-ip27/ip27-smp.c index 5b47d6b65275..cbcd7eb83bd1 100644 --- a/arch/mips/sgi-ip27/ip27-smp.c +++ b/arch/mips/sgi-ip27/ip27-smp.c @@ -221,7 +221,7 @@ static void __init ip27_smp_setup(void) * Assumption to be fixed: we're always booted on logical / physical * processor 0. While we're always running on logical processor 0 * this still means this is physical processor zero; it might for - * example be disabled in the firwware. + * example be disabled in the firmware. */ alloc_cpupda(0, 0); } diff --git a/arch/sparc/kernel/head_64.S b/arch/sparc/kernel/head_64.S index a46c3a21e26d..3a1b7bf03cff 100644 --- a/arch/sparc/kernel/head_64.S +++ b/arch/sparc/kernel/head_64.S @@ -686,7 +686,7 @@ tlb_fixup_done: * point. * * There used to be enormous complexity wrt. transferring - * over from the firwmare's trap table to the Linux kernel's. + * over from the firmware's trap table to the Linux kernel's. * For example, there was a chicken & egg problem wrt. building * the OBP page tables, yet needing to be on the Linux kernel * trap table (to translate PAGE_OFFSET addresses) in order to diff --git a/drivers/net/sb1250-mac.c b/drivers/net/sb1250-mac.c index 88dd2e09832f..ce7551e17ba7 100644 --- a/drivers/net/sb1250-mac.c +++ b/drivers/net/sb1250-mac.c @@ -2299,7 +2299,7 @@ static int sbmac_init(struct platform_device *pldev, long long base) eaddr = sc->sbm_hwaddr; /* - * Read the ethernet address. The firwmare left this programmed + * Read the ethernet address. The firmware left this programmed * for us in the ethernet address register for each mac. */ diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c index f7efcecc4108..ed60b18addac 100644 --- a/drivers/net/tg3.c +++ b/drivers/net/tg3.c @@ -11225,7 +11225,7 @@ static int __devinit tg3_phy_probe(struct tg3 *tp) return tg3_phy_init(tp); /* Reading the PHY ID register can conflict with ASF - * firwmare access to the PHY hardware. + * firmware access to the PHY hardware. 
*/ err = 0; if ((tp->tg3_flags & TG3_FLAG_ENABLE_ASF) || diff --git a/drivers/net/wireless/ipw2x00/ipw2100.c b/drivers/net/wireless/ipw2x00/ipw2100.c index 115b70487502..f4e963ba768b 100644 --- a/drivers/net/wireless/ipw2x00/ipw2100.c +++ b/drivers/net/wireless/ipw2x00/ipw2100.c @@ -2362,7 +2362,7 @@ static void ipw2100_corruption_detected(struct ipw2100_priv *priv, int i) i * sizeof(struct ipw2100_status)); #ifdef IPW2100_DEBUG_C3 - /* Halt the fimrware so we can get a good image */ + /* Halt the firmware so we can get a good image */ write_register(priv->net_dev, IPW_REG_RESET_REG, IPW_AUX_HOST_RESET_REG_STOP_MASTER); j = 5; diff --git a/drivers/net/wireless/ipw2x00/ipw2200.c b/drivers/net/wireless/ipw2x00/ipw2200.c index b3449948a25a..f6174fdc12bf 100644 --- a/drivers/net/wireless/ipw2x00/ipw2200.c +++ b/drivers/net/wireless/ipw2x00/ipw2200.c @@ -8844,7 +8844,7 @@ static int ipw_wx_set_mode(struct net_device *dev, #endif /* CONFIG_IPW2200_MONITOR */ /* Free the existing firmware and reset the fw_loaded - * flag so ipw_load() will bring in the new firmawre */ + * flag so ipw_load() will bring in the new firmware */ free_firmware(); priv->ieee->iw_mode = wrqu->mode; diff --git a/drivers/net/wireless/iwlwifi/iwl-agn.c b/drivers/net/wireless/iwlwifi/iwl-agn.c index 663dc83be501..3889158b359c 100644 --- a/drivers/net/wireless/iwlwifi/iwl-agn.c +++ b/drivers/net/wireless/iwlwifi/iwl-agn.c @@ -1337,7 +1337,7 @@ static int iwl_read_ucode(struct iwl_priv *priv) /* api_ver should match the api version forming part of the * firmware filename ... but we don't check for that and only rely - * on the API version read from firware header from here on forward */ + * on the API version read from firmware header from here on forward */ if (api_ver < api_min || api_ver > api_max) { IWL_ERR(priv, "Driver unable to support your firmware API. " diff --git a/drivers/net/wireless/iwlwifi/iwl3945-base.c b/drivers/net/wireless/iwlwifi/iwl3945-base.c index a71b08ca7c71..9d5f97dd7c73 100644 --- a/drivers/net/wireless/iwlwifi/iwl3945-base.c +++ b/drivers/net/wireless/iwlwifi/iwl3945-base.c @@ -2562,7 +2562,7 @@ static int iwl3945_read_ucode(struct iwl_priv *priv) /* api_ver should match the api version forming part of the * firmware filename ... but we don't check for that and only rely - * on the API version read from firware header from here on forward */ + * on the API version read from firmware header from here on forward */ if (api_ver < api_min || api_ver > api_max) { IWL_ERR(priv, "Driver unable to support your firmware API. " diff --git a/drivers/net/wireless/libertas/cmd.c b/drivers/net/wireless/libertas/cmd.c index 639dd02d3d31..8c3605cdc64c 100644 --- a/drivers/net/wireless/libertas/cmd.c +++ b/drivers/net/wireless/libertas/cmd.c @@ -1649,7 +1649,7 @@ static struct cmd_ctrl_node *lbs_get_cmd_ctrl_node(struct lbs_private *priv) /** * @brief This function executes next command in command - * pending queue. It will put fimware back to PS mode + * pending queue. It will put firmware back to PS mode * if applicable. * * @param priv A pointer to struct lbs_private structure diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 6d6120007af4..dab33a21d49a 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -550,7 +550,7 @@ void pci_update_current_state(struct pci_dev *dev, pci_power_t state) * @dev: PCI device to handle. * @state: PCI power state (D0, D1, D2, D3hot) to put the device into. 
* - * Transition a device to a new power state, using the platform formware and/or + * Transition a device to a new power state, using the platform firmware and/or * the device's PCI PM registers. * * RETURN VALUE: diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index d2433204a40c..814cb6520673 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -5811,7 +5811,7 @@ static struct ibm_struct volume_driver_data = { * ThinkPads from this same time period (and earlier) probably lack the * tachometer as well. * - * Unfortunately a lot of ThinkPads with new-style ECs but whose firwmare + * Unfortunately a lot of ThinkPads with new-style ECs but whose firmware * was never fixed by IBM to report the EC firmware version string * probably support the tachometer (like the early X models), so * detecting it is quite hard. We need more data to know for sure. diff --git a/drivers/staging/otus/hal/hpmain.c b/drivers/staging/otus/hal/hpmain.c index 2e65c466aae8..dab278326931 100644 --- a/drivers/staging/otus/hal/hpmain.c +++ b/drivers/staging/otus/hal/hpmain.c @@ -152,7 +152,7 @@ u16_t zfHpInit(zdev_t* dev, u32_t frequency) else { #ifndef ZM_OTUS_LINUX_PHASE_2 - /* donwload the normal frimware */ + /* download the normal firmware */ if ((ret = zfFirmwareDownload(dev, (u32_t*)zcFwImage, (u32_t)zcFwImageSize, ZM_FIRMWARE_WLAN_ADDR)) != ZM_SUCCESS) { diff --git a/drivers/usb/atm/ueagle-atm.c b/drivers/usb/atm/ueagle-atm.c index b6483dd98acc..9cf9ff69e3e3 100644 --- a/drivers/usb/atm/ueagle-atm.c +++ b/drivers/usb/atm/ueagle-atm.c @@ -626,7 +626,7 @@ static void uea_upload_pre_firmware(const struct firmware *fw_entry, void *conte goto err_fw_corrupted; /* - * Start to upload formware : send reset + * Start to upload firmware : send reset */ value = 1; ret = uea_send_modem_cmd(usb, F8051_USBCS, sizeof(value), &value); diff --git a/drivers/usb/serial/ChangeLog.history b/drivers/usb/serial/ChangeLog.history index c1b279939bbf..f13fd488ebec 100644 --- a/drivers/usb/serial/ChangeLog.history +++ b/drivers/usb/serial/ChangeLog.history @@ -715,7 +715,7 @@ io_edgeport.c Change Log comments: 0.2 (01/30/2000) greg kroah-hartman Milestone 1 release. - Device is found by USB subsystem, enumerated, fimware is downloaded + Device is found by USB subsystem, enumerated, firmware is downloaded and the descriptors are printed to the debug log, config is set, and green light starts to blink. Open port works, and data can be sent and received at the default settings of the UART. Loopback connector diff --git a/include/linux/libata.h b/include/linux/libata.h index 76262d83656b..b450a2628855 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -379,7 +379,7 @@ enum { ATA_HORKAGE_BRIDGE_OK = (1 << 10), /* no bridge limits */ ATA_HORKAGE_ATAPI_MOD16_DMA = (1 << 11), /* use ATAPI DMA for commands not multiple of 16 bytes */ - ATA_HORKAGE_FIRMWARE_WARN = (1 << 12), /* firwmare update warning */ + ATA_HORKAGE_FIRMWARE_WARN = (1 << 12), /* firmware update warning */ ATA_HORKAGE_1_5_GBPS = (1 << 13), /* force 1.5 Gbps */ /* DMA mask for user DMA control: User visible values; DO NOT diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 4a4a206b1979..9d1c1a0de350 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -265,7 +265,7 @@ static int create_image(int platform_mode) * hibernation_snapshot - quiesce devices and create the hibernation * snapshot image. 
* @platform_mode - if set, use the platform driver, if available, to - * prepare the platform frimware for the power transition. + * prepare the platform firmware for the power transition. * * Must be called with pm_mutex held */ @@ -378,7 +378,7 @@ static int resume_target_kernel(void) * hibernation_restore - quiesce devices and restore the hibernation * snapshot image. If successful, control returns in hibernation_snaphot() * @platform_mode - if set, use the platform driver, if available, to - * prepare the platform frimware for the transition. + * prepare the platform firmware for the transition. * * Must be called with pm_mutex held */ diff --git a/sound/oss/pss.c b/sound/oss/pss.c index 16517a5a1301..83f5ee236b12 100644 --- a/sound/oss/pss.c +++ b/sound/oss/pss.c @@ -46,7 +46,7 @@ * load the driver as it did in previous versions. * 04-07-1999: Anthony Barbachan * Added module parameter pss_firmware to allow the user to tell - * the driver where the fireware file is located. The default + * the driver where the firmware file is located. The default * setting is the previous hardcoded setting "/etc/sound/pss_synth". * 00-03-03: Christoph Hellwig * Adapted to module_init/module_exit diff --git a/sound/sh/aica.c b/sound/sh/aica.c index f551233c5a08..583a3693df75 100644 --- a/sound/sh/aica.c +++ b/sound/sh/aica.c @@ -565,7 +565,7 @@ static int load_aica_firmware(void) err = request_firmware(&fw_entry, "aica_firmware.bin", &pd->dev); if (unlikely(err)) return err; - /* write firware into memory */ + /* write firmware into memory */ spu_disable(); spu_memload(0, fw_entry->data, fw_entry->size); spu_enable(); -- cgit v1.2.3 From 5243ef8b54a927cae23216253e4e3f03af6f1446 Mon Sep 17 00:00:00 2001 From: Mark Vels Date: Sun, 18 Jan 2009 18:42:45 +0100 Subject: trivial: PWM: fix of #endif comment Signed-off-by: Mark Vels Signed-off-by: Jiri Kosina --- include/linux/pwm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pwm.h b/include/linux/pwm.h index 3945f803d514..7c775751392c 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -28,4 +28,4 @@ int pwm_enable(struct pwm_device *pwm); */ void pwm_disable(struct pwm_device *pwm); -#endif /* __ASM_ARCH_PWM_H */ +#endif /* __LINUX_PWM_H */ -- cgit v1.2.3 From 21acb9caa2e30b100e9a1943d995bb99d40f4035 Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Wed, 4 Feb 2009 10:12:08 +0100 Subject: trivial: fix where cgroup documentation is not correctly referred to cgroup documentation was moved to Documentation/cgroups/. There are some places that still refer to Documentation/controllers/, Documentation/cgroups.txt and Documentation/cpusets.txt. Fix those. 
Signed-off-by: Thadeu Lima de Souza Cascardo Reviewed-by: Li Zefan Acked-by: Paul Menage Signed-off-by: Jiri Kosina --- Documentation/00-INDEX | 4 ++-- Documentation/cgroups/00-INDEX | 18 ++++++++++++++++++ Documentation/kernel-parameters.txt | 4 ++-- Documentation/scheduler/sched-rt-group.txt | 2 +- Documentation/vm/numa_memory_policy.txt | 3 ++- Documentation/vm/page_migration | 3 ++- Documentation/x86/x86_64/fake-numa-for-cpusets | 5 +++-- include/linux/cgroup.h | 5 ++++- init/Kconfig | 2 +- 9 files changed, 35 insertions(+), 11 deletions(-) create mode 100644 Documentation/cgroups/00-INDEX (limited to 'include/linux') diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX index 2a39aeba1464..d05737aaa84b 100644 --- a/Documentation/00-INDEX +++ b/Documentation/00-INDEX @@ -86,6 +86,8 @@ cachetlb.txt - describes the cache/TLB flushing interfaces Linux uses. cdrom/ - directory with information on the CD-ROM drivers that Linux has. +cgroups/ + - cgroups features, including cpusets and memory controller. connector/ - docs on the netlink based userspace<->kernel space communication mod. console/ @@ -98,8 +100,6 @@ cpu-load.txt - document describing how CPU load statistics are collected. cpuidle/ - info on CPU_IDLE, CPU idle state management subsystem. -cpusets.txt - - documents the cpusets feature; assign CPUs and Mem to a set of tasks. cputopology.txt - documentation on how CPU topology info is exported via sysfs. cris/ diff --git a/Documentation/cgroups/00-INDEX b/Documentation/cgroups/00-INDEX new file mode 100644 index 000000000000..3f58fa3d6d00 --- /dev/null +++ b/Documentation/cgroups/00-INDEX @@ -0,0 +1,18 @@ +00-INDEX + - this file +cgroups.txt + - Control Groups definition, implementation details, examples and API. +cpuacct.txt + - CPU Accounting Controller; account CPU usage for groups of tasks. +cpusets.txt + - documents the cpusets feature; assign CPUs and Mem to a set of tasks. +devices.txt + - Device Whitelist Controller; description, interface and security. +freezer-subsystem.txt + - checkpointing; rationale to not use signals, interface. +memcg_test.txt + - Memory Resource Controller; implementation details. +memory.txt + - Memory Resource Controller; design, accounting, interface, testing. +resource_counter.txt + - Resource Counter API. diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index be3bde51b564..755def2cb071 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1593,7 +1593,7 @@ and is between 256 and 4096 characters. It is defined in the file nosoftlockup [KNL] Disable the soft-lockup detector. noswapaccount [KNL] Disable accounting of swap in memory resource - controller. (See Documentation/controllers/memory.txt) + controller. (See Documentation/cgroups/memory.txt) nosync [HW,M68K] Disables sync negotiation for all devices. @@ -1932,7 +1932,7 @@ and is between 256 and 4096 characters. It is defined in the file relax_domain_level= [KNL, SMP] Set scheduler's default relax_domain_level. - See Documentation/cpusets.txt. + See Documentation/cgroups/cpusets.txt. 
reserve= [KNL,BUGS] Force the kernel to ignore some iomem area diff --git a/Documentation/scheduler/sched-rt-group.txt b/Documentation/scheduler/sched-rt-group.txt index 3ef339f491e0..5ba4d3fc625a 100644 --- a/Documentation/scheduler/sched-rt-group.txt +++ b/Documentation/scheduler/sched-rt-group.txt @@ -126,7 +126,7 @@ This uses the /cgroup virtual file system and "/cgroup//cpu.rt_runtime_u to control the CPU time reserved for each control group instead. For more information on working with control groups, you should read -Documentation/cgroups.txt as well. +Documentation/cgroups/cgroups.txt as well. Group settings are checked against the following limits in order to keep the configuration schedulable: diff --git a/Documentation/vm/numa_memory_policy.txt b/Documentation/vm/numa_memory_policy.txt index 6aaaeb38730c..be45dbb9d7f2 100644 --- a/Documentation/vm/numa_memory_policy.txt +++ b/Documentation/vm/numa_memory_policy.txt @@ -8,7 +8,8 @@ The current memory policy support was added to Linux 2.6 around May 2004. This document attempts to describe the concepts and APIs of the 2.6 memory policy support. -Memory policies should not be confused with cpusets (Documentation/cpusets.txt) +Memory policies should not be confused with cpusets +(Documentation/cgroups/cpusets.txt) which is an administrative mechanism for restricting the nodes from which memory may be allocated by a set of processes. Memory policies are a programming interface that a NUMA-aware application can take advantage of. When diff --git a/Documentation/vm/page_migration b/Documentation/vm/page_migration index d5fdfd34bbaf..6513fe2d90b8 100644 --- a/Documentation/vm/page_migration +++ b/Documentation/vm/page_migration @@ -37,7 +37,8 @@ locations. Larger installations usually partition the system using cpusets into sections of nodes. Paul Jackson has equipped cpusets with the ability to -move pages when a task is moved to another cpuset (See ../cpusets.txt). +move pages when a task is moved to another cpuset (See +Documentation/cgroups/cpusets.txt). Cpusets allows the automation of process locality. If a task is moved to a new cpuset then also all its pages are moved with it so that the performance of the process does not sink dramatically. Also the pages diff --git a/Documentation/x86/x86_64/fake-numa-for-cpusets b/Documentation/x86/x86_64/fake-numa-for-cpusets index 33bb56655991..0f11d9becb0b 100644 --- a/Documentation/x86/x86_64/fake-numa-for-cpusets +++ b/Documentation/x86/x86_64/fake-numa-for-cpusets @@ -7,7 +7,8 @@ you can create fake NUMA nodes that represent contiguous chunks of memory and assign them to cpusets and their attached tasks. This is a way of limiting the amount of system memory that are available to a certain class of tasks. -For more information on the features of cpusets, see Documentation/cpusets.txt. +For more information on the features of cpusets, see +Documentation/cgroups/cpusets.txt. There are a number of different configurations you can use for your needs. For more information on the numa=fake command line option and its various ways of configuring fake nodes, see Documentation/x86/x86_64/boot-options.txt. @@ -32,7 +33,7 @@ A machine may be split as follows with "numa=fake=4*512," as reported by dmesg: On node 3 totalpages: 131072 Now following the instructions for mounting the cpusets filesystem from -Documentation/cpusets.txt, you can assign fake nodes (i.e. contiguous memory +Documentation/cgroups/cpusets.txt, you can assign fake nodes (i.e. 
contiguous memory address spaces) to individual cpusets: [root@xroads /]# mkdir exampleset diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 499900d0cee7..b837631fe499 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -342,7 +342,10 @@ int cgroup_task_count(const struct cgroup *cgrp); /* Return true if the cgroup is a descendant of the current cgroup */ int cgroup_is_descendant(const struct cgroup *cgrp); -/* Control Group subsystem type. See Documentation/cgroups.txt for details */ +/* + * Control Group subsystem type. + * See Documentation/cgroups/cgroups.txt for details + */ struct cgroup_subsys { struct cgroup_subsys_state *(*create)(struct cgroup_subsys *ss, diff --git a/init/Kconfig b/init/Kconfig index bcffc0e47647..99eb4196bd0a 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -565,7 +565,7 @@ config CGROUP_MEM_RES_CTLR select MM_OWNER help Provides a memory resource controller that manages both anonymous - memory and page cache. (See Documentation/controllers/memory.txt) + memory and page cache. (See Documentation/cgroups/memory.txt) Note that setting this option increases fixed memory overhead associated with each page of memory in the system. By this, -- cgit v1.2.3 From 39db4b8dd16b6d3b56fd3155f309e0eec8481c9a Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Mon, 9 Feb 2009 23:07:35 +0100 Subject: trivial: wusb: Storage class should be before const qualifier The C99 specification states in section 6.11.5: The placement of a storage-class specifier other than at the beginning of the declaration specifiers in a declaration is an obsolescent feature. Acked-by: David Vrabel Signed-off-by: Tobias Klauser Signed-off-by: Jiri Kosina --- include/linux/usb/wusb.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb/wusb.h b/include/linux/usb/wusb.h index 5f401b644ed5..429c631d2aad 100644 --- a/include/linux/usb/wusb.h +++ b/include/linux/usb/wusb.h @@ -80,8 +80,7 @@ struct wusb_ckhdid { u8 data[16]; } __attribute__((packed)); -const static -struct wusb_ckhdid wusb_ckhdid_zero = { .data = { 0 } }; +static const struct wusb_ckhdid wusb_ckhdid_zero = { .data = { 0 } }; #define WUSB_CKHDID_STRSIZE (3 * sizeof(struct wusb_ckhdid) + 1) -- cgit v1.2.3 From 0cbfdc8648115b2e8451ae9122311d01d2722005 Mon Sep 17 00:00:00 2001 From: Kazuo Moriwaka Date: Tue, 3 Mar 2009 13:01:23 +0100 Subject: trivial: jbd header comment typo fix for JBD_PARANOID_IOFAIL jbd header comment typo fix. Signed-off-by: Kazuo Moriwaka Signed-off-by: Jiri Kosina --- include/linux/jbd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/jbd.h b/include/linux/jbd.h index 64246dce5663..e8ca681d8acd 100644 --- a/include/linux/jbd.h +++ b/include/linux/jbd.h @@ -35,7 +35,7 @@ #define journal_oom_retry 1 /* - * Define JBD_PARANIOD_IOFAIL to cause a kernel BUG() if ext3 finds + * Define JBD_PARANOID_IOFAIL to cause a kernel BUG() if ext3 finds * certain classes of error which can occur due to failed IOs. 
Under * normal use we want ext3 to continue after such errors, because * hardware _can_ fail, but for debugging purposes when running tests on -- cgit v1.2.3 From e180a6b7759a99a28cbcce3547c4c80822cb6c2a Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 31 Mar 2009 13:05:29 -0600 Subject: param: fix charp parameters set via sysfs Impact: fix crash on reading from /sys/module/.../ieee80211_default_rc_algo The module_param type "charp" simply sets a char * pointer in the module to the parameter in the commandline string: this is why we keep the (mangled) module command line around. But when set via sysfs (as about 11 charp parameters can be) this memory is freed on the way out of the write(). Future reads hit random mem. So we kstrdup instead: we have to check we're not in early commandline parsing, and we have to note when we've used it so we can reliably kfree the parameter when it's next overwritten, and also on module unload. (Thanks to Randy Dunlap for CONFIG_SYSFS=n fixes) Reported-by: Sitsofe Wheeler Diagnosed-by: Frederic Weisbecker Tested-by: Frederic Weisbecker Tested-by: Christof Schmitt Signed-off-by: Rusty Russell --- include/linux/module.h | 4 ++++ include/linux/moduleparam.h | 10 ++++++++++ kernel/module.c | 14 ++++++++------ kernel/params.c | 26 +++++++++++++++++++++++++- 4 files changed, 47 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index 145a75528cc1..08e5e75d6122 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -248,6 +248,10 @@ struct module const unsigned long *crcs; unsigned int num_syms; + /* Kernel parameters. */ + struct kernel_param *kp; + unsigned int num_kp; + /* GPL-only exported symbols. */ unsigned int num_gpl_syms; const struct kernel_symbol *gpl_syms; diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index e4af3399ef48..a4f0b931846c 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -138,6 +138,16 @@ extern int parse_args(const char *name, unsigned num, int (*unknown)(char *param, char *val)); +/* Called by module remove. */ +#ifdef CONFIG_SYSFS +extern void destroy_params(const struct kernel_param *params, unsigned num); +#else +static inline void destroy_params(const struct kernel_param *params, + unsigned num) +{ +} +#endif /* !CONFIG_SYSFS */ + /* All the helper functions */ /* The macros to do compile-time type checking stolen from Jakub Jelinek, who IIRC came up with this idea for the 2.4 module init code. */ diff --git a/kernel/module.c b/kernel/module.c index f77ac320d0b5..b862fdb6a372 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1491,6 +1491,9 @@ static void free_module(struct module *mod) /* Module unload stuff */ module_unload_free(mod); + /* Free any allocated parameters. */ + destroy_params(mod->kp, mod->num_kp); + /* release any pointers to mcount in this module */ ftrace_release(mod->module_core, mod->core_size); @@ -1898,8 +1901,7 @@ static noinline struct module *load_module(void __user *umod, unsigned int symindex = 0; unsigned int strindex = 0; unsigned int modindex, versindex, infoindex, pcpuindex; - unsigned int num_kp, num_mcount; - struct kernel_param *kp; + unsigned int num_mcount; struct module *mod; long err = 0; void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ @@ -2144,8 +2146,8 @@ static noinline struct module *load_module(void __user *umod, /* Now we've got everything in the final locations, we can * find optional sections. 
*/ - kp = section_objs(hdr, sechdrs, secstrings, "__param", sizeof(*kp), - &num_kp); + mod->kp = section_objs(hdr, sechdrs, secstrings, "__param", + sizeof(*mod->kp), &mod->num_kp); mod->syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab", sizeof(*mod->syms), &mod->num_syms); mod->crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab"); @@ -2291,11 +2293,11 @@ static noinline struct module *load_module(void __user *umod, */ list_add_rcu(&mod->list, &modules); - err = parse_args(mod->name, mod->args, kp, num_kp, NULL); + err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL); if (err < 0) goto unlink; - err = mod_sysfs_setup(mod, kp, num_kp); + err = mod_sysfs_setup(mod, mod->kp, mod->num_kp); if (err < 0) goto unlink; add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); diff --git a/kernel/params.c b/kernel/params.c index a1e3025b19a9..de273ec85bd2 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -24,6 +24,9 @@ #include #include +/* We abuse the high bits of "perm" to record whether we kmalloc'ed. */ +#define KPARAM_KMALLOCED 0x80000000 + #if 0 #define DEBUGP printk #else @@ -217,7 +220,19 @@ int param_set_charp(const char *val, struct kernel_param *kp) return -ENOSPC; } - *(char **)kp->arg = (char *)val; + if (kp->perm & KPARAM_KMALLOCED) + kfree(*(char **)kp->arg); + + /* This is a hack. We can't need to strdup in early boot, and we + * don't need to; this mangled commandline is preserved. */ + if (slab_is_available()) { + kp->perm |= KPARAM_KMALLOCED; + *(char **)kp->arg = kstrdup(val, GFP_KERNEL); + if (!kp->arg) + return -ENOMEM; + } else + *(const char **)kp->arg = val; + return 0; } @@ -571,6 +586,15 @@ void module_param_sysfs_remove(struct module *mod) } #endif +void destroy_params(const struct kernel_param *params, unsigned num) +{ + unsigned int i; + + for (i = 0; i < num; i++) + if (params[i].perm & KPARAM_KMALLOCED) + kfree(*(char **)params[i].arg); +} + static void __init kernel_add_sysfs_param(const char *name, struct kernel_param *kparam, unsigned int name_skip) -- cgit v1.2.3 From e610499e2656e61975affd0af56b26eb73964c84 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 31 Mar 2009 13:05:31 -0600 Subject: module: __module_address Impact: New API, cleanup ksplice wants to know the bounds of a module, not just the module text. It makes sense to have __module_address. We then implement is_module_address and __module_text_address in terms of this (and change is_module_text_address() to bool while we're at it). Also, add proper kerneldoc for them all. Cc: Anders Kaseorg Cc: Jeff Arnold Cc: Tim Abbott Signed-off-by: Rusty Russell --- include/linux/module.h | 20 +++++++++---- kernel/module.c | 76 ++++++++++++++++++++++++++++++++++++++------------ 2 files changed, 73 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index 08e5e75d6122..fd1241e1416f 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -365,7 +365,9 @@ static inline int module_is_live(struct module *mod) /* Is this address in a module? 
(second is with no locks, for oops) */ struct module *module_text_address(unsigned long addr); struct module *__module_text_address(unsigned long addr); -int is_module_address(unsigned long addr); +struct module *__module_address(unsigned long addr); +bool is_module_address(unsigned long addr); +bool is_module_text_address(unsigned long addr); static inline int within_module_core(unsigned long addr, struct module *mod) { @@ -494,21 +496,29 @@ search_module_extables(unsigned long addr) return NULL; } -/* Is this address in a module? */ static inline struct module *module_text_address(unsigned long addr) { return NULL; } -/* Is this address in a module? (don't take a lock, we're oopsing) */ +static inline struct module *__module_address(unsigned long addr) +{ + return NULL; +} + static inline struct module *__module_text_address(unsigned long addr) { return NULL; } -static inline int is_module_address(unsigned long addr) +static inline bool is_module_address(unsigned long addr) { - return 0; + return false; +} + +static inline bool is_module_text_address(unsigned long addr) +{ + return false; } /* Get/put a kernel symbol (calls should be symmetric) */ diff --git a/kernel/module.c b/kernel/module.c index 2f0fddf3c114..bd15a94f91c1 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -76,7 +76,7 @@ static DECLARE_WAIT_QUEUE_HEAD(module_wq); static BLOCKING_NOTIFIER_HEAD(module_notify_list); -/* Bounds of module allocation, for speeding __module_text_address */ +/* Bounds of module allocation, for speeding __module_address */ static unsigned long module_addr_min = -1UL, module_addr_max = 0; int register_module_notifier(struct notifier_block * nb) @@ -2745,29 +2745,31 @@ const struct exception_table_entry *search_module_extables(unsigned long addr) } /* - * Is this a valid module address? + * is_module_address - is this address inside a module? + * @addr: the address to check. + * + * See is_module_text_address() if you simply want to see if the address + * is code (not data). */ -int is_module_address(unsigned long addr) +bool is_module_address(unsigned long addr) { - struct module *mod; + bool ret; preempt_disable(); - - list_for_each_entry_rcu(mod, &modules, list) { - if (within_module_core(addr, mod)) { - preempt_enable(); - return 1; - } - } - + ret = __module_address(addr) != NULL; preempt_enable(); - return 0; + return ret; } - -/* Is this a valid kernel address? */ -__notrace_funcgraph struct module *__module_text_address(unsigned long addr) +/* + * __module_address - get the module which contains an address. + * @addr: the address. + * + * Must be called with preempt disabled or module mutex held so that + * module doesn't get freed during this. + */ +__notrace_funcgraph struct module *__module_address(unsigned long addr) { struct module *mod; @@ -2775,12 +2777,50 @@ __notrace_funcgraph struct module *__module_text_address(unsigned long addr) return NULL; list_for_each_entry_rcu(mod, &modules, list) - if (within(addr, mod->module_init, mod->init_text_size) - || within(addr, mod->module_core, mod->core_text_size)) + if (within_module_core(addr, mod) + || within_module_init(addr, mod)) return mod; return NULL; } +/* + * is_module_text_address - is this address inside module code? + * @addr: the address to check. + * + * See is_module_address() if you simply want to see if the address is + * anywhere in a module. See kernel_text_address() for testing if an + * address corresponds to kernel or module code. 
+ */ +bool is_module_text_address(unsigned long addr) +{ + bool ret; + + preempt_disable(); + ret = __module_text_address(addr) != NULL; + preempt_enable(); + + return ret; +} + +/* + * __module_text_address - get the module whose code contains an address. + * @addr: the address. + * + * Must be called with preempt disabled or module mutex held so that + * module doesn't get freed during this. + */ +struct module *__module_text_address(unsigned long addr) +{ + struct module *mod = __module_address(addr); + if (mod) { + /* Make sure it's within the text section. */ + if (!within(addr, mod->module_init, mod->init_text_size) + && !within(addr, mod->module_core, mod->core_text_size)) + mod = NULL; + } + return mod; +} + struct module *module_text_address(unsigned long addr) { struct module *mod; -- cgit v1.2.3 From a6e6abd575fcbe6572ebc7a70ad616406d206fa8 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 31 Mar 2009 13:05:31 -0600 Subject: module: remove module_text_address() Impact: Replace and remove risky (non-EXPORTed) API module_text_address() returns a pointer to the module, which given locking improvements in module.c, is useless except to test for NULL: 1) If the module can't go away, use __module_text_address. 2) Otherwise, just use is_module_text_address(). Cc: linux-mtd@lists.infradead.org Signed-off-by: Rusty Russell --- drivers/mtd/nand/nand_base.c | 4 ++-- include/linux/module.h | 7 ------- kernel/extable.c | 6 +++--- kernel/module.c | 17 ++++------------- 4 files changed, 9 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index 0c3afccde8a2..5f71371eb1b0 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -2720,14 +2720,14 @@ int nand_scan_tail(struct mtd_info *mtd) return chip->scan_bbt(mtd); } -/* module_text_address() isn't exported, and it's mostly a pointless +/* is_module_text_address() isn't exported, and it's mostly a pointless test if this is a module _anyway_ -- they'd have to try _really_ hard to call us from in-kernel code if the core NAND support is modular. */ #ifdef MODULE #define caller_is_module() (1) #else #define caller_is_module() \ - module_text_address((unsigned long)__builtin_return_address(0)) + is_module_text_address((unsigned long)__builtin_return_address(0)) #endif /** diff --git a/include/linux/module.h b/include/linux/module.h index fd1241e1416f..69761ce0dbf0 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -362,8 +362,6 @@ static inline int module_is_live(struct module *mod) return mod->state != MODULE_STATE_GOING; } -/* Is this address in a module? 
(second is with no locks, for oops) */ -struct module *module_text_address(unsigned long addr); struct module *__module_text_address(unsigned long addr); struct module *__module_address(unsigned long addr); bool is_module_address(unsigned long addr); @@ -496,11 +494,6 @@ search_module_extables(unsigned long addr) return NULL; } -static inline struct module *module_text_address(unsigned long addr) -{ - return NULL; -} - static inline struct module *__module_address(unsigned long addr) { return NULL; diff --git a/kernel/extable.c b/kernel/extable.c index e136ed8d82ba..384f0da8a03e 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -58,14 +58,14 @@ __notrace_funcgraph int __kernel_text_address(unsigned long addr) { if (core_kernel_text(addr)) return 1; - return __module_text_address(addr) != NULL; + return is_module_text_address(addr); } int kernel_text_address(unsigned long addr) { if (core_kernel_text(addr)) return 1; - return module_text_address(addr) != NULL; + return is_module_text_address(addr); } /* @@ -81,5 +81,5 @@ int func_ptr_is_kernel_text(void *ptr) addr = (unsigned long) dereference_function_descriptor(ptr); if (core_kernel_text(addr)) return 1; - return module_text_address(addr) != NULL; + return is_module_text_address(addr); } diff --git a/kernel/module.c b/kernel/module.c index bd15a94f91c1..8ddca629e079 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -908,8 +908,10 @@ void symbol_put_addr(void *addr) if (core_kernel_text((unsigned long)addr)) return; - if (!(modaddr = module_text_address((unsigned long)addr))) - BUG(); + /* module_text_address is safe here: we're supposed to have reference + * to module from symbol_get, so it can't go away. */ + modaddr = __module_text_address((unsigned long)addr); + BUG_ON(!modaddr); module_put(modaddr); } EXPORT_SYMBOL_GPL(symbol_put_addr); @@ -2821,17 +2823,6 @@ struct module *__module_text_address(unsigned long addr) return mod; } -struct module *module_text_address(unsigned long addr) -{ - struct module *mod; - - preempt_disable(); - mod = __module_text_address(addr); - preempt_enable(); - - return mod; -} - /* Don't grab lock, we're oopsing. */ void print_modules(void) { -- cgit v1.2.3 From 75a66614db21007bcc8c37f9c5d5b922981387b9 Mon Sep 17 00:00:00 2001 From: Anders Kaseorg Date: Fri, 5 Dec 2008 19:03:58 -0500 Subject: Ksplice: Add functions for walking kallsyms symbols Impact: New API kallsyms_lookup_name only returns the first match that it finds. Ksplice needs information about all symbols with a given name in order to correctly resolve local symbols. kallsyms_on_each_symbol provides a generic mechanism for iterating over the kallsyms table. Cc: Jeff Arnold Cc: Tim Abbott Signed-off-by: Anders Kaseorg Signed-off-by: Rusty Russell --- include/linux/kallsyms.h | 15 +++++++++++++++ include/linux/module.h | 12 ++++++++++++ kernel/kallsyms.c | 19 +++++++++++++++++++ kernel/module.c | 19 +++++++++++++++++++ 4 files changed, 65 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h index f3fe34391d8e..792274269f2b 100644 --- a/include/linux/kallsyms.h +++ b/include/linux/kallsyms.h @@ -13,10 +13,17 @@ #define KSYM_SYMBOL_LEN (sizeof("%s+%#lx/%#lx [%s]") + (KSYM_NAME_LEN - 1) + \ 2*(BITS_PER_LONG*3/10) + (MODULE_NAME_LEN - 1) + 1) +struct module; + #ifdef CONFIG_KALLSYMS /* Lookup the address for a symbol. Returns 0 if not found. 
*/ unsigned long kallsyms_lookup_name(const char *name); +/* Call a function on each kallsyms symbol in the core kernel */ +int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, + unsigned long), + void *data); + extern int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, unsigned long *offset); @@ -43,6 +50,14 @@ static inline unsigned long kallsyms_lookup_name(const char *name) return 0; } +static inline int kallsyms_on_each_symbol(int (*fn)(void *, const char *, + struct module *, + unsigned long), + void *data) +{ + return 0; +} + static inline int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, unsigned long *offset) diff --git a/include/linux/module.h b/include/linux/module.h index 69761ce0dbf0..c3d3fc4ffb18 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -387,6 +387,10 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, /* Look for this name: can be of form module:name. */ unsigned long module_kallsyms_lookup_name(const char *name); +int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, + struct module *, unsigned long), + void *data); + extern void __module_put_and_exit(struct module *mod, long code) __attribute__((noreturn)); #define module_put_and_exit(code) __module_put_and_exit(THIS_MODULE, code); @@ -566,6 +570,14 @@ static inline unsigned long module_kallsyms_lookup_name(const char *name) return 0; } +static inline int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, + struct module *, + unsigned long), + void *data) +{ + return 0; +} + static inline int register_module_notifier(struct notifier_block * nb) { /* no events will happen anyway, so this can always succeed */ diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 7b8b0f21a5b1..374faf9bfdc7 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -161,6 +161,25 @@ unsigned long kallsyms_lookup_name(const char *name) return module_kallsyms_lookup_name(name); } +int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, + unsigned long), + void *data) +{ + char namebuf[KSYM_NAME_LEN]; + unsigned long i; + unsigned int off; + int ret; + + for (i = 0, off = 0; i < kallsyms_num_syms; i++) { + off = kallsyms_expand_symbol(off, namebuf); + ret = fn(data, namebuf, NULL, kallsyms_addresses[i]); + if (ret != 0) + return ret; + } + return module_kallsyms_on_each_symbol(fn, data); +} +EXPORT_SYMBOL_GPL(kallsyms_on_each_symbol); + static unsigned long get_symbol_pos(unsigned long addr, unsigned long *symbolsize, unsigned long *offset) diff --git a/kernel/module.c b/kernel/module.c index 8ddca629e079..dd4389be9152 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2612,6 +2612,25 @@ unsigned long module_kallsyms_lookup_name(const char *name) preempt_enable(); return ret; } + +int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, + struct module *, unsigned long), + void *data) +{ + struct module *mod; + unsigned int i; + int ret; + + list_for_each_entry(mod, &modules, list) { + for (i = 0; i < mod->num_symtab; i++) { + ret = fn(data, mod->strtab + mod->symtab[i].st_name, + mod, mod->symtab[i].st_value); + if (ret != 0) + return ret; + } + } + return 0; +} #endif /* CONFIG_KALLSYMS */ static char *module_flags(struct module *mod, char *buf) -- cgit v1.2.3 From c6b37801911d7f4663c99cad8aa230bc934cea82 Mon Sep 17 00:00:00 2001 From: Tim Abbott Date: Fri, 5 Dec 2008 19:03:59 -0500 Subject: module: Export symbols needed for Ksplice Impact: Expose 
some module.c symbols Ksplice uses several functions from module.c in order to resolve symbols and implement dependency handling. Calling these functions requires holding module_mutex, so it is exported. (This is just the module part of a bigger add-exports patch from Tim). Cc: Anders Kaseorg Cc: Jeff Arnold Signed-off-by: Tim Abbott Signed-off-by: Rusty Russell --- include/linux/module.h | 28 ++++++++++++++++++++++++++++ kernel/module.c | 43 +++++++++++++++++++------------------------ 2 files changed, 47 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index c3d3fc4ffb18..d246da0b0f8c 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -354,6 +354,8 @@ struct module #define MODULE_ARCH_INIT {} #endif +extern struct mutex module_mutex; + /* FIXME: It'd be nice to isolate modules during init, too, so they aren't used before they (may) fail. But presently too much code (IDE & SCSI) require entry into the module during init.*/ @@ -379,6 +381,31 @@ static inline int within_module_init(unsigned long addr, struct module *mod) addr < (unsigned long)mod->module_init + mod->init_size; } +/* Search for module by name: must hold module_mutex. */ +struct module *find_module(const char *name); + +struct symsearch { + const struct kernel_symbol *start, *stop; + const unsigned long *crcs; + enum { + NOT_GPL_ONLY, + GPL_ONLY, + WILL_BE_GPL_ONLY, + } licence; + bool unused; +}; + +/* Search for an exported symbol by name. */ +const struct kernel_symbol *find_symbol(const char *name, + struct module **owner, + const unsigned long **crc, + bool gplok, + bool warn); + +/* Walk the exported symbol table */ +bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner, + unsigned int symnum, void *data), void *data); + /* Returns 0 and fills in value, defined and namebuf, or -ERANGE if symnum out of range. */ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, @@ -452,6 +479,7 @@ static inline void __module_get(struct module *module) #define symbol_put_addr(p) do { } while(0) #endif /* CONFIG_MODULE_UNLOAD */ +int use_module(struct module *a, struct module *b); /* This is a #define so the string doesn't get put in every .o file */ #define module_name(mod) \ diff --git a/kernel/module.c b/kernel/module.c index dd4389be9152..5fd00766a4dc 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -68,7 +68,8 @@ /* List of modules, protected by module_mutex or preempt_disable * (delete uses stop_machine/add uses RCU list operations). */ -static DEFINE_MUTEX(module_mutex); +DEFINE_MUTEX(module_mutex); +EXPORT_SYMBOL_GPL(module_mutex); static LIST_HEAD(modules); /* Waiting for a module to finish initializing? */ @@ -186,17 +187,6 @@ extern const unsigned long __start___kcrctab_unused_gpl[]; #define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL) #endif -struct symsearch { - const struct kernel_symbol *start, *stop; - const unsigned long *crcs; - enum { - NOT_GPL_ONLY, - GPL_ONLY, - WILL_BE_GPL_ONLY, - } licence; - bool unused; -}; - static bool each_symbol_in_section(const struct symsearch *arr, unsigned int arrsize, struct module *owner, @@ -217,10 +207,8 @@ static bool each_symbol_in_section(const struct symsearch *arr, } /* Returns true as soon as fn returns true, otherwise false. 
*/ -static bool each_symbol(bool (*fn)(const struct symsearch *arr, - struct module *owner, - unsigned int symnum, void *data), - void *data) +bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner, + unsigned int symnum, void *data), void *data) { struct module *mod; const struct symsearch arr[] = { @@ -273,6 +261,7 @@ static bool each_symbol(bool (*fn)(const struct symsearch *arr, } return false; } +EXPORT_SYMBOL_GPL(each_symbol); struct find_symbol_arg { /* Input */ @@ -330,11 +319,11 @@ static bool find_symbol_in_section(const struct symsearch *syms, /* Find a symbol and return it, along with, (optional) crc and * (optional) module which owns it */ -static const struct kernel_symbol *find_symbol(const char *name, - struct module **owner, - const unsigned long **crc, - bool gplok, - bool warn) +const struct kernel_symbol *find_symbol(const char *name, + struct module **owner, + const unsigned long **crc, + bool gplok, + bool warn) { struct find_symbol_arg fsa; @@ -353,9 +342,10 @@ static const struct kernel_symbol *find_symbol(const char *name, DEBUGP("Failed to find symbol %s\n", name); return NULL; } +EXPORT_SYMBOL_GPL(find_symbol); /* Search for module by name: must hold module_mutex. */ -static struct module *find_module(const char *name) +struct module *find_module(const char *name) { struct module *mod; @@ -365,6 +355,7 @@ static struct module *find_module(const char *name) } return NULL; } +EXPORT_SYMBOL_GPL(find_module); #ifdef CONFIG_SMP @@ -641,7 +632,7 @@ static int already_uses(struct module *a, struct module *b) } /* Module a uses b */ -static int use_module(struct module *a, struct module *b) +int use_module(struct module *a, struct module *b) { struct module_use *use; int no_warn, err; @@ -674,6 +665,7 @@ static int use_module(struct module *a, struct module *b) no_warn = sysfs_create_link(b->holders_dir, &a->mkobj.kobj, a->name); return 1; } +EXPORT_SYMBOL_GPL(use_module); /* Clear the unload stuff of the module. */ static void module_unload_free(struct module *mod) @@ -951,10 +943,11 @@ static inline void module_unload_free(struct module *mod) { } -static inline int use_module(struct module *a, struct module *b) +int use_module(struct module *a, struct module *b) { return strong_try_module_get(b) == 0; } +EXPORT_SYMBOL_GPL(use_module); static inline void module_unload_init(struct module *mod) { @@ -2803,6 +2796,7 @@ __notrace_funcgraph struct module *__module_address(unsigned long addr) return mod; return NULL; } +EXPORT_SYMBOL_GPL(__module_address); /* * is_module_text_address - is this address inside module code? @@ -2841,6 +2835,7 @@ struct module *__module_text_address(unsigned long addr) } return mod; } +EXPORT_SYMBOL_GPL(__module_text_address); /* Don't grab lock, we're oopsing. */ void print_modules(void) -- cgit v1.2.3 From acae05156551fd7528fbb616271e672789388e3c Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 8 Feb 2009 10:42:01 -0800 Subject: module: create a request_module_nowait() There seems to be a common pattern in the kernel where drivers want to call request_module() from inside a module_init() function. Currently this would deadlock. As a result, several drivers go through hoops like scheduling things via kevent, or creating custom work queues (because kevent can deadlock on them). This patch changes this to use a request_module_nowait() function macro instead, which just fires the modprobe off but doesn't wait for it, and thus avoids the original deadlock entirely. 
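For illustration only (not part of this patch; the driver name, helper module name and init function below are made up), the call pattern a driver init can now use:

	#include <linux/init.h>
	#include <linux/kmod.h>
	#include <linux/module.h>

	static int __init foo_init(void)
	{
		/*
		 * Kick off modprobe for an optional helper module and return
		 * immediately.  Waiting for it here (plain request_module())
		 * could deadlock, because modprobe may in turn block on this
		 * module's own init completing.
		 */
		request_module_nowait("foo-helper");
		return 0;
	}
	module_init(foo_init);
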
On my laptop this already results in one less kernel thread running.. (Includes Jiri's patch to use enum umh_wait) Signed-off-by: Arjan van de Ven Signed-off-by: Rusty Russell (bool-ified) Cc: Jiri Slaby --- include/linux/kmod.h | 11 ++++++++--- kernel/kmod.c | 10 ++++++---- 2 files changed, 14 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kmod.h b/include/linux/kmod.h index 92213a9194e1..d5fa565086d1 100644 --- a/include/linux/kmod.h +++ b/include/linux/kmod.h @@ -29,10 +29,15 @@ #ifdef CONFIG_MODULES /* modprobe exit status on success, -ve on error. Return value * usually useless though. */ -extern int request_module(const char * name, ...) __attribute__ ((format (printf, 1, 2))); -#define try_then_request_module(x, mod...) ((x) ?: (request_module(mod), (x))) +extern int __request_module(bool wait, const char *name, ...) \ + __attribute__((format(printf, 2, 3))); +#define request_module(mod...) __request_module(true, mod) +#define request_module_nowait(mod...) __request_module(false, mod) +#define try_then_request_module(x, mod...) \ + ((x) ?: (__request_module(false, mod), (x))) #else -static inline int request_module(const char * name, ...) { return -ENOSYS; } +static inline int request_module(const char *name, ...) { return -ENOSYS; } +static inline int request_module_nowait(const char *name, ...) { return -ENOSYS; } #define try_then_request_module(x, mod...) (x) #endif diff --git a/kernel/kmod.c b/kernel/kmod.c index f0c8f545180d..b750675251e5 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -50,7 +50,8 @@ static struct workqueue_struct *khelper_wq; char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe"; /** - * request_module - try to load a kernel module + * __request_module - try to load a kernel module + * @wait: wait (or not) for the operation to complete * @fmt: printf style format string for the name of the module * @...: arguments as specified in the format string * @@ -63,7 +64,7 @@ char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe"; * If module auto-loading support is disabled then this function * becomes a no-operation. */ -int request_module(const char *fmt, ...) +int __request_module(bool wait, const char *fmt, ...) { va_list args; char module_name[MODULE_NAME_LEN]; @@ -108,11 +109,12 @@ int request_module(const char *fmt, ...) return -ENOMEM; } - ret = call_usermodehelper(modprobe_path, argv, envp, 1); + ret = call_usermodehelper(modprobe_path, argv, envp, + wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC); atomic_dec(&kmod_concurrent); return ret; } -EXPORT_SYMBOL(request_module); +EXPORT_SYMBOL(__request_module); #endif /* CONFIG_MODULES */ struct subprocess_info { -- cgit v1.2.3 From 66f92cf9d415e96a5bdd6c64de8dd8418595d2fc Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 31 Mar 2009 13:05:36 -0600 Subject: strstarts: helper function for !strncmp(str, prefix, strlen(prefix)) Impact: minor new API ksplice added a "starts_with" function, which seems like a common need. When people open-code it they seem to use fixed numbers rather than strlen, so it's quite a readability win (also, strncmp() almost always wants != 0 on it). So here's strstarts(). 
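As a usage sketch (the option name and helper function are hypothetical, not taken from this patch), the open-coded test and its strstarts() form:

	#include <linux/string.h>
	#include <linux/types.h>

	static bool is_size_option(const char *arg)
	{
		/* Open-coded equivalent: strncmp(arg, "size=", 5) == 0, with
		 * the prefix length counted by hand; strstarts() derives it
		 * from strlen(), so string and length cannot drift apart. */
		return strstarts(arg, "size=");
	}
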
Cc: Anders Kaseorg Cc: Jeff Arnold Cc: Tim Abbott Signed-off-by: Rusty Russell --- include/linux/string.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/string.h b/include/linux/string.h index d18fc198aa2f..76ec218bb30f 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -114,5 +114,14 @@ extern bool sysfs_streq(const char *s1, const char *s2); extern ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, const void *from, size_t available); +/** + * strstarts - does @str start with @prefix? + * @str: string to examine + * @prefix: prefix to look for. + */ +static inline bool strstarts(const char *str, const char *prefix) +{ + return strncmp(str, prefix, strlen(prefix)) == 0; +} #endif #endif /* _LINUX_STRING_H_ */ -- cgit v1.2.3 From eea1bf384e05b5ab747f8530c4fba9e9e6907fff Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:27:02 +1100 Subject: md: Fix is_mddev_idle test (again). There are two problems with is_mddev_idle. 1/ sync_io is 'atomic_t' and hence 'int'. curr_events and all the rest are 'long'. So if sync_io were to wrap on a 64bit host, the value of curr_events would go very negative suddenly, and take a very long time to return to positive. So do all calculations as 'int'. That gives us plenty of precision for what we need. 2/ To initialise rdev->last_events we simply call is_mddev_idle, on the assumption that it will make sure that last_events is in a suitable range. It used to do this, but now it does not. So now we need to be more explicit about initialisation. Signed-off-by: NeilBrown --- drivers/md/md.c | 16 ++++++++-------- include/linux/raid/md_k.h | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/md.c b/drivers/md/md.c index 03b4cd0a6344..a99c50e217c0 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5716,19 +5716,19 @@ int unregister_md_personality(struct mdk_personality *p) return 0; } -static int is_mddev_idle(mddev_t *mddev) +static int is_mddev_idle(mddev_t *mddev, int init) { mdk_rdev_t * rdev; int idle; - long curr_events; + int curr_events; idle = 1; rcu_read_lock(); rdev_for_each_rcu(rdev, mddev) { struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; - curr_events = part_stat_read(&disk->part0, sectors[0]) + - part_stat_read(&disk->part0, sectors[1]) - - atomic_read(&disk->sync_io); + curr_events = (int)part_stat_read(&disk->part0, sectors[0]) + + (int)part_stat_read(&disk->part0, sectors[1]) - + atomic_read(&disk->sync_io); /* sync IO will cause sync_io to increase before the disk_stats * as sync_io is counted when a request starts, and * disk_stats is counted when it completes. @@ -5751,7 +5751,7 @@ static int is_mddev_idle(mddev_t *mddev) * always make curr_events less than last_events. 
* */ - if (curr_events - rdev->last_events > 4096) { + if (init || curr_events - rdev->last_events > 64) { rdev->last_events = curr_events; idle = 0; } @@ -5994,7 +5994,7 @@ void md_do_sync(mddev_t *mddev) "(but not more than %d KB/sec) for %s.\n", speed_max(mddev), desc); - is_mddev_idle(mddev); /* this also initializes IO event counters */ + is_mddev_idle(mddev, 1); /* this initializes IO event counters */ io_sectors = 0; for (m = 0; m < SYNC_MARKS; m++) { @@ -6096,7 +6096,7 @@ void md_do_sync(mddev_t *mddev) if (currspeed > speed_min(mddev)) { if ((currspeed > speed_max(mddev)) || - !is_mddev_idle(mddev)) { + !is_mddev_idle(mddev, 0)) { msleep(500); goto repeat; } diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 9743e4dbc918..4aedb9fe2bd8 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -51,7 +51,7 @@ struct mdk_rdev_s sector_t size; /* Device size (in blocks) */ mddev_t *mddev; /* RAID array if running */ - long last_events; /* IO event timestamp */ + int last_events; /* IO event timestamp */ struct block_device *bdev; /* block device handle */ -- cgit v1.2.3 From ef740c372dfd80e706dbf955d4e4aedda6c0c148 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 31 Mar 2009 14:27:03 +1100 Subject: md: move headers out of include/linux/raid/ Move the headers with the local structures for the disciplines and bitmap.h into drivers/md/ so that they are more easily grepable for hacking and not far away. md.h is left where it is for now as there are some uses from the outside. Signed-off-by: Christoph Hellwig Signed-off-by: NeilBrown --- drivers/md/bitmap.c | 2 +- drivers/md/bitmap.h | 288 +++++++++++++++++++++++++++++ drivers/md/linear.c | 2 +- drivers/md/linear.h | 31 ++++ drivers/md/md.c | 2 +- drivers/md/multipath.c | 2 +- drivers/md/multipath.h | 42 +++++ drivers/md/raid0.c | 2 +- drivers/md/raid0.h | 30 +++ drivers/md/raid1.c | 4 +- drivers/md/raid1.h | 134 ++++++++++++++ drivers/md/raid10.c | 4 +- drivers/md/raid10.h | 123 +++++++++++++ drivers/md/raid5.c | 5 +- drivers/md/raid5.h | 402 +++++++++++++++++++++++++++++++++++++++++ drivers/md/raid6.h | 2 +- include/linux/raid/bitmap.h | 288 ----------------------------- include/linux/raid/linear.h | 31 ---- include/linux/raid/multipath.h | 42 ----- include/linux/raid/raid0.h | 30 --- include/linux/raid/raid1.h | 134 -------------- include/linux/raid/raid10.h | 123 ------------- include/linux/raid/raid5.h | 402 ----------------------------------------- 23 files changed, 1062 insertions(+), 1063 deletions(-) create mode 100644 drivers/md/bitmap.h create mode 100644 drivers/md/linear.h create mode 100644 drivers/md/multipath.h create mode 100644 drivers/md/raid0.h create mode 100644 drivers/md/raid1.h create mode 100644 drivers/md/raid10.h create mode 100644 drivers/md/raid5.h delete mode 100644 include/linux/raid/bitmap.h delete mode 100644 include/linux/raid/linear.h delete mode 100644 include/linux/raid/multipath.h delete mode 100644 include/linux/raid/raid0.h delete mode 100644 include/linux/raid/raid1.h delete mode 100644 include/linux/raid/raid10.h delete mode 100644 include/linux/raid/raid5.h (limited to 'include/linux') diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 27f978dfe6a3..7666117738c7 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -27,7 +27,7 @@ #include #include #include -#include +#include "bitmap.h" /* debug macros */ diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h new file mode 100644 index 000000000000..e98900671ca9 --- /dev/null +++ 
b/drivers/md/bitmap.h @@ -0,0 +1,288 @@ +/* + * bitmap.h: Copyright (C) Peter T. Breuer (ptb@ot.uc3m.es) 2003 + * + * additions: Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. + */ +#ifndef BITMAP_H +#define BITMAP_H 1 + +#define BITMAP_MAJOR_LO 3 +/* version 4 insists the bitmap is in little-endian order + * with version 3, it is host-endian which is non-portable + */ +#define BITMAP_MAJOR_HI 4 +#define BITMAP_MAJOR_HOSTENDIAN 3 + +#define BITMAP_MINOR 39 + +/* + * in-memory bitmap: + * + * Use 16 bit block counters to track pending writes to each "chunk". + * The 2 high order bits are special-purpose, the first is a flag indicating + * whether a resync is needed. The second is a flag indicating whether a + * resync is active. + * This means that the counter is actually 14 bits: + * + * +--------+--------+------------------------------------------------+ + * | resync | resync | counter | + * | needed | active | | + * | (0-1) | (0-1) | (0-16383) | + * +--------+--------+------------------------------------------------+ + * + * The "resync needed" bit is set when: + * a '1' bit is read from storage at startup. + * a write request fails on some drives + * a resync is aborted on a chunk with 'resync active' set + * It is cleared (and resync-active set) when a resync starts across all drives + * of the chunk. + * + * + * The "resync active" bit is set when: + * a resync is started on all drives, and resync_needed is set. + * resync_needed will be cleared (as long as resync_active wasn't already set). + * It is cleared when a resync completes. + * + * The counter counts pending write requests, plus the on-disk bit. + * When the counter is '1' and the resync bits are clear, the on-disk + * bit can be cleared aswell, thus setting the counter to 0. + * When we set a bit, or in the counter (to start a write), if the fields is + * 0, we first set the disk bit and set the counter to 1. + * + * If the counter is 0, the on-disk bit is clear and the stipe is clean + * Anything that dirties the stipe pushes the counter to 2 (at least) + * and sets the on-disk bit (lazily). + * If a periodic sweep find the counter at 2, it is decremented to 1. + * If the sweep find the counter at 1, the on-disk bit is cleared and the + * counter goes to zero. + * + * Also, we'll hijack the "map" pointer itself and use it as two 16 bit block + * counters as a fallback when "page" memory cannot be allocated: + * + * Normal case (page memory allocated): + * + * page pointer (32-bit) + * + * [ ] ------+ + * | + * +-------> [ ][ ]..[ ] (4096 byte page == 2048 counters) + * c1 c2 c2048 + * + * Hijacked case (page memory allocation failed): + * + * hijacked page pointer (32-bit) + * + * [ ][ ] (no page memory allocated) + * counter #1 (16-bit) counter #2 (16-bit) + * + */ + +#ifdef __KERNEL__ + +#define PAGE_BITS (PAGE_SIZE << 3) +#define PAGE_BIT_SHIFT (PAGE_SHIFT + 3) + +typedef __u16 bitmap_counter_t; +#define COUNTER_BITS 16 +#define COUNTER_BIT_SHIFT 4 +#define COUNTER_BYTE_RATIO (COUNTER_BITS / 8) +#define COUNTER_BYTE_SHIFT (COUNTER_BIT_SHIFT - 3) + +#define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1))) +#define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2))) +#define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1) +#define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK) +#define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK) +#define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX) + +/* how many counters per page? 
*/ +#define PAGE_COUNTER_RATIO (PAGE_BITS / COUNTER_BITS) +/* same, except a shift value for more efficient bitops */ +#define PAGE_COUNTER_SHIFT (PAGE_BIT_SHIFT - COUNTER_BIT_SHIFT) +/* same, except a mask value for more efficient bitops */ +#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1) + +#define BITMAP_BLOCK_SIZE 512 +#define BITMAP_BLOCK_SHIFT 9 + +/* how many blocks per chunk? (this is variable) */ +#define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->chunksize >> BITMAP_BLOCK_SHIFT) +#define CHUNK_BLOCK_SHIFT(bitmap) ((bitmap)->chunkshift - BITMAP_BLOCK_SHIFT) +#define CHUNK_BLOCK_MASK(bitmap) (CHUNK_BLOCK_RATIO(bitmap) - 1) + +/* when hijacked, the counters and bits represent even larger "chunks" */ +/* there will be 1024 chunks represented by each counter in the page pointers */ +#define PAGEPTR_BLOCK_RATIO(bitmap) \ + (CHUNK_BLOCK_RATIO(bitmap) << PAGE_COUNTER_SHIFT >> 1) +#define PAGEPTR_BLOCK_SHIFT(bitmap) \ + (CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1) +#define PAGEPTR_BLOCK_MASK(bitmap) (PAGEPTR_BLOCK_RATIO(bitmap) - 1) + +/* + * on-disk bitmap: + * + * Use one bit per "chunk" (block set). We do the disk I/O on the bitmap + * file a page at a time. There's a superblock at the start of the file. + */ + +/* map chunks (bits) to file pages - offset by the size of the superblock */ +#define CHUNK_BIT_OFFSET(chunk) ((chunk) + (sizeof(bitmap_super_t) << 3)) + +#endif + +/* + * bitmap structures: + */ + +#define BITMAP_MAGIC 0x6d746962 + +/* use these for bitmap->flags and bitmap->sb->state bit-fields */ +enum bitmap_state { + BITMAP_STALE = 0x002, /* the bitmap file is out of date or had -EIO */ + BITMAP_WRITE_ERROR = 0x004, /* A write error has occurred */ + BITMAP_HOSTENDIAN = 0x8000, +}; + +/* the superblock at the front of the bitmap file -- little endian */ +typedef struct bitmap_super_s { + __le32 magic; /* 0 BITMAP_MAGIC */ + __le32 version; /* 4 the bitmap major for now, could change... */ + __u8 uuid[16]; /* 8 128 bit uuid - must match md device uuid */ + __le64 events; /* 24 event counter for the bitmap (1)*/ + __le64 events_cleared;/*32 event counter when last bit cleared (2) */ + __le64 sync_size; /* 40 the size of the md device's sync range(3) */ + __le32 state; /* 48 bitmap state information */ + __le32 chunksize; /* 52 the bitmap chunk size in bytes */ + __le32 daemon_sleep; /* 56 seconds between disk flushes */ + __le32 write_behind; /* 60 number of outstanding write-behind writes */ + + __u8 pad[256 - 64]; /* set to zero */ +} bitmap_super_t; + +/* notes: + * (1) This event counter is updated before the eventcounter in the md superblock + * When a bitmap is loaded, it is only accepted if this event counter is equal + * to, or one greater than, the event counter in the superblock. + * (2) This event counter is updated when the other one is *if*and*only*if* the + * array is not degraded. As bits are not cleared when the array is degraded, + * this represents the last time that any bits were cleared. + * If a device is being added that has an event count with this value or + * higher, it is accepted as conforming to the bitmap. + * (3)This is the number of sectors represented by the bitmap, and is the range that + * resync happens across. For raid1 and raid5/6 it is the size of individual + * devices. For raid10 it is the size of the array. 
+ */ + +#ifdef __KERNEL__ + +/* the in-memory bitmap is represented by bitmap_pages */ +struct bitmap_page { + /* + * map points to the actual memory page + */ + char *map; + /* + * in emergencies (when map cannot be alloced), hijack the map + * pointer and use it as two counters itself + */ + unsigned int hijacked:1; + /* + * count of dirty bits on the page + */ + unsigned int count:31; +}; + +/* keep track of bitmap file pages that have pending writes on them */ +struct page_list { + struct list_head list; + struct page *page; +}; + +/* the main bitmap structure - one per mddev */ +struct bitmap { + struct bitmap_page *bp; + unsigned long pages; /* total number of pages in the bitmap */ + unsigned long missing_pages; /* number of pages not yet allocated */ + + mddev_t *mddev; /* the md device that the bitmap is for */ + + int counter_bits; /* how many bits per block counter */ + + /* bitmap chunksize -- how much data does each bit represent? */ + unsigned long chunksize; + unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */ + unsigned long chunks; /* total number of data chunks for the array */ + + /* We hold a count on the chunk currently being synced, and drop + * it when the last block is started. If the resync is aborted + * midway, we need to be able to drop that count, so we remember + * the counted chunk.. + */ + unsigned long syncchunk; + + __u64 events_cleared; + int need_sync; + + /* bitmap spinlock */ + spinlock_t lock; + + long offset; /* offset from superblock if file is NULL */ + struct file *file; /* backing disk file */ + struct page *sb_page; /* cached copy of the bitmap file superblock */ + struct page **filemap; /* list of cache pages for the file */ + unsigned long *filemap_attr; /* attributes associated w/ filemap pages */ + unsigned long file_pages; /* number of pages in the file */ + int last_page_size; /* bytes in the last page */ + + unsigned long flags; + + int allclean; + + unsigned long max_write_behind; /* write-behind mode */ + atomic_t behind_writes; + + /* + * the bitmap daemon - periodically wakes up and sweeps the bitmap + * file, cleaning up bits and flushing out pages to disk as necessary + */ + unsigned long daemon_lastrun; /* jiffies of last run */ + unsigned long daemon_sleep; /* how many seconds between updates? 
*/ + unsigned long last_end_sync; /* when we lasted called end_sync to + * update bitmap with resync progress */ + + atomic_t pending_writes; /* pending writes to the bitmap file */ + wait_queue_head_t write_wait; + wait_queue_head_t overflow_wait; + +}; + +/* the bitmap API */ + +/* these are used only by md/bitmap */ +int bitmap_create(mddev_t *mddev); +void bitmap_flush(mddev_t *mddev); +void bitmap_destroy(mddev_t *mddev); + +void bitmap_print_sb(struct bitmap *bitmap); +void bitmap_update_sb(struct bitmap *bitmap); + +int bitmap_setallbits(struct bitmap *bitmap); +void bitmap_write_all(struct bitmap *bitmap); + +void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e); + +/* these are exported */ +int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, + unsigned long sectors, int behind); +void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, + unsigned long sectors, int success, int behind); +int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int degraded); +void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted); +void bitmap_close_sync(struct bitmap *bitmap); +void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector); + +void bitmap_unplug(struct bitmap *bitmap); +void bitmap_daemon_work(struct bitmap *bitmap); +#endif + +#endif diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 09658b218474..3603ffa9edc5 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -16,7 +16,7 @@ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#include +#include "linear.h" /* * find which device holds a particular offset diff --git a/drivers/md/linear.h b/drivers/md/linear.h new file mode 100644 index 000000000000..f38b9c586afb --- /dev/null +++ b/drivers/md/linear.h @@ -0,0 +1,31 @@ +#ifndef _LINEAR_H +#define _LINEAR_H + +#include + +struct dev_info { + mdk_rdev_t *rdev; + sector_t num_sectors; + sector_t start_sector; +}; + +typedef struct dev_info dev_info_t; + +struct linear_private_data +{ + struct linear_private_data *prev; /* earlier version */ + dev_info_t **hash_table; + sector_t spacing; + sector_t array_sectors; + int sector_shift; /* shift before dividing + * by spacing + */ + dev_info_t disks[0]; +}; + + +typedef struct linear_private_data linear_conf_t; + +#define mddev_to_conf(mddev) ((linear_conf_t *) mddev->private) + +#endif diff --git a/drivers/md/md.c b/drivers/md/md.c index 3efc0bceada2..9a3214c8585f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -34,7 +34,6 @@ #include #include -#include #include #include /* for invalidate_bdev */ #include @@ -45,6 +44,7 @@ #include #include #include +#include "bitmap.h" /* 63 partitions with the alternate major number (mdp) */ #define MdpMinorShift 6 diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index f6d08f241671..547df09a7af3 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -19,7 +19,7 @@ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ -#include +#include "multipath.h" #define MAX_WORK_PER_DISK 128 diff --git a/drivers/md/multipath.h b/drivers/md/multipath.h new file mode 100644 index 000000000000..6f53fc177a47 --- /dev/null +++ b/drivers/md/multipath.h @@ -0,0 +1,42 @@ +#ifndef _MULTIPATH_H +#define _MULTIPATH_H + +#include + +struct multipath_info { + mdk_rdev_t *rdev; +}; + +struct multipath_private_data { + mddev_t *mddev; + struct multipath_info *multipaths; + int raid_disks; + int working_disks; + spinlock_t device_lock; + struct list_head retry_list; + + mempool_t *pool; +}; + +typedef struct multipath_private_data multipath_conf_t; + +/* + * this is the only point in the RAID code where we violate + * C type safety. mddev->private is an 'opaque' pointer. + */ +#define mddev_to_conf(mddev) ((multipath_conf_t *) mddev->private) + +/* + * this is our 'private' 'collective' MULTIPATH buffer head. + * it contains information about what kind of IO operations were started + * for this MULTIPATH operation, and about their status: + */ + +struct multipath_bh { + mddev_t *mddev; + struct bio *master_bio; + struct bio bio; + int path; + struct list_head retry_list; +}; +#endif diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index c605ba805586..ef09ed04864e 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -18,7 +18,7 @@ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#include +#include "raid0.h" static void raid0_unplug(struct request_queue *q) { diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h new file mode 100644 index 000000000000..fd42aa87c391 --- /dev/null +++ b/drivers/md/raid0.h @@ -0,0 +1,30 @@ +#ifndef _RAID0_H +#define _RAID0_H + +#include + +struct strip_zone +{ + sector_t zone_start; /* Zone offset in md_dev (in sectors) */ + sector_t dev_start; /* Zone offset in real dev (in sectors) */ + sector_t sectors; /* Zone size in sectors */ + int nb_dev; /* # of devices attached to the zone */ + mdk_rdev_t **dev; /* Devices attached to the zone */ +}; + +struct raid0_private_data +{ + struct strip_zone **hash_table; /* Table of indexes into strip_zone */ + struct strip_zone *strip_zone; + mdk_rdev_t **devlist; /* lists of rdevs, pointed to by strip_zone->dev */ + int nr_strip_zones; + + sector_t spacing; + int sector_shift; /* shift this before divide by spacing */ +}; + +typedef struct raid0_private_data raid0_conf_t; + +#define mddev_to_conf(mddev) ((raid0_conf_t *) mddev->private) + +#endif diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index e2466425d9ca..bff32285f8bb 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -33,8 +33,8 @@ #include "dm-bio-list.h" #include -#include -#include +#include "raid1.h" +#include "bitmap.h" #define DEBUG 0 #if DEBUG diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h new file mode 100644 index 000000000000..0a9ba7c3302e --- /dev/null +++ b/drivers/md/raid1.h @@ -0,0 +1,134 @@ +#ifndef _RAID1_H +#define _RAID1_H + +#include + +typedef struct mirror_info mirror_info_t; + +struct mirror_info { + mdk_rdev_t *rdev; + sector_t head_position; +}; + +/* + * memory pools need a pointer to the mddev, so they can force an unplug + * when memory is tight, and a count of the number of drives that the + * pool was allocated for, so they know how much to allocate and free. + * mddev->raid_disks cannot be used, as it can change while a pool is active + * These two datums are stored in a kmalloced struct. 
+ */ + +struct pool_info { + mddev_t *mddev; + int raid_disks; +}; + + +typedef struct r1bio_s r1bio_t; + +struct r1_private_data_s { + mddev_t *mddev; + mirror_info_t *mirrors; + int raid_disks; + int last_used; + sector_t next_seq_sect; + spinlock_t device_lock; + + struct list_head retry_list; + /* queue pending writes and submit them on unplug */ + struct bio_list pending_bio_list; + /* queue of writes that have been unplugged */ + struct bio_list flushing_bio_list; + + /* for use when syncing mirrors: */ + + spinlock_t resync_lock; + int nr_pending; + int nr_waiting; + int nr_queued; + int barrier; + sector_t next_resync; + int fullsync; /* set to 1 if a full sync is needed, + * (fresh device added). + * Cleared when a sync completes. + */ + + wait_queue_head_t wait_barrier; + + struct pool_info *poolinfo; + + struct page *tmppage; + + mempool_t *r1bio_pool; + mempool_t *r1buf_pool; +}; + +typedef struct r1_private_data_s conf_t; + +/* + * this is the only point in the RAID code where we violate + * C type safety. mddev->private is an 'opaque' pointer. + */ +#define mddev_to_conf(mddev) ((conf_t *) mddev->private) + +/* + * this is our 'private' RAID1 bio. + * + * it contains information about what kind of IO operations were started + * for this RAID1 operation, and about their status: + */ + +struct r1bio_s { + atomic_t remaining; /* 'have we finished' count, + * used from IRQ handlers + */ + atomic_t behind_remaining; /* number of write-behind ios remaining + * in this BehindIO request + */ + sector_t sector; + int sectors; + unsigned long state; + mddev_t *mddev; + /* + * original bio going to /dev/mdx + */ + struct bio *master_bio; + /* + * if the IO is in READ direction, then this is where we read + */ + int read_disk; + + struct list_head retry_list; + struct bitmap_update *bitmap_update; + /* + * if the IO is in WRITE direction, then multiple bios are used. + * We choose the number when they are allocated. + */ + struct bio *bios[0]; + /* DO NOT PUT ANY NEW FIELDS HERE - bios array is contiguously alloced*/ +}; + +/* when we get a read error on a read-only array, we redirect to another + * device without failing the first device, or trying to over-write to + * correct the read error. To keep track of bad blocks on a per-bio + * level, we store IO_BLOCKED in the appropriate 'bios' pointer + */ +#define IO_BLOCKED ((struct bio*)1) + +/* bits for r1bio.state */ +#define R1BIO_Uptodate 0 +#define R1BIO_IsSync 1 +#define R1BIO_Degraded 2 +#define R1BIO_BehindIO 3 +#define R1BIO_Barrier 4 +#define R1BIO_BarrierRetry 5 +/* For write-behind requests, we call bi_end_io when + * the last non-write-behind device completes, providing + * any write was successful. Otherwise we call when + * any write-behind write succeeds, otherwise we call + * with failure when last write completes (and all failed). + * Record that bi_end_io was called with this flag... + */ +#define R1BIO_Returned 6 + +#endif diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 7301631abe04..f03dd70d12a5 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -20,8 +20,8 @@ #include "dm-bio-list.h" #include -#include -#include +#include "raid10.h" +#include "bitmap.h" /* * RAID10 provides a combination of RAID0 and RAID1 functionality. 
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h new file mode 100644 index 000000000000..e9091cfeb286 --- /dev/null +++ b/drivers/md/raid10.h @@ -0,0 +1,123 @@ +#ifndef _RAID10_H +#define _RAID10_H + +#include + +typedef struct mirror_info mirror_info_t; + +struct mirror_info { + mdk_rdev_t *rdev; + sector_t head_position; +}; + +typedef struct r10bio_s r10bio_t; + +struct r10_private_data_s { + mddev_t *mddev; + mirror_info_t *mirrors; + int raid_disks; + spinlock_t device_lock; + + /* geometry */ + int near_copies; /* number of copies layed out raid0 style */ + int far_copies; /* number of copies layed out + * at large strides across drives + */ + int far_offset; /* far_copies are offset by 1 stripe + * instead of many + */ + int copies; /* near_copies * far_copies. + * must be <= raid_disks + */ + sector_t stride; /* distance between far copies. + * This is size / far_copies unless + * far_offset, in which case it is + * 1 stripe. + */ + + int chunk_shift; /* shift from chunks to sectors */ + sector_t chunk_mask; + + struct list_head retry_list; + /* queue pending writes and submit them on unplug */ + struct bio_list pending_bio_list; + + + spinlock_t resync_lock; + int nr_pending; + int nr_waiting; + int nr_queued; + int barrier; + sector_t next_resync; + int fullsync; /* set to 1 if a full sync is needed, + * (fresh device added). + * Cleared when a sync completes. + */ + + wait_queue_head_t wait_barrier; + + mempool_t *r10bio_pool; + mempool_t *r10buf_pool; + struct page *tmppage; +}; + +typedef struct r10_private_data_s conf_t; + +/* + * this is the only point in the RAID code where we violate + * C type safety. mddev->private is an 'opaque' pointer. + */ +#define mddev_to_conf(mddev) ((conf_t *) mddev->private) + +/* + * this is our 'private' RAID10 bio. + * + * it contains information about what kind of IO operations were started + * for this RAID10 operation, and about their status: + */ + +struct r10bio_s { + atomic_t remaining; /* 'have we finished' count, + * used from IRQ handlers + */ + sector_t sector; /* virtual sector number */ + int sectors; + unsigned long state; + mddev_t *mddev; + /* + * original bio going to /dev/mdx + */ + struct bio *master_bio; + /* + * if the IO is in READ direction, then this is where we read + */ + int read_slot; + + struct list_head retry_list; + /* + * if the IO is in WRITE direction, then multiple bios are used, + * one for each copy. + * When resyncing we also use one for each copy. + * When reconstructing, we use 2 bios, one for read, one for write. + * We choose the number when they are allocated. + */ + struct { + struct bio *bio; + sector_t addr; + int devnum; + } devs[0]; +}; + +/* when we get a read error on a read-only array, we redirect to another + * device without failing the first device, or trying to over-write to + * correct the read error. 
To keep track of bad blocks on a per-bio + * level, we store IO_BLOCKED in the appropriate 'bios' pointer + */ +#define IO_BLOCKED ((struct bio*)1) + +/* bits for r10bio.state */ +#define R10BIO_Uptodate 0 +#define R10BIO_IsSync 1 +#define R10BIO_IsRecover 2 +#define R10BIO_Degraded 3 +#endif diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index a5ba080d303b..f75698b1f63d 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -44,10 +44,9 @@ */ #include -#include "raid6.h" - -#include #include +#include "raid6.h" +#include "bitmap.h" /* * Stripe cache diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h new file mode 100644 index 000000000000..40f1d0335c74 --- /dev/null +++ b/drivers/md/raid5.h @@ -0,0 +1,402 @@ +#ifndef _RAID5_H +#define _RAID5_H + +#include +#include + +/* + * + * Each stripe contains one buffer per disc. Each buffer can be in + * one of a number of states stored in "flags". Changes between + * these states happen *almost* exclusively under a per-stripe + * spinlock. Some very specific changes can happen in bi_end_io, and + * these are not protected by the spin lock. + * + * The flag bits that are used to represent these states are: + * R5_UPTODATE and R5_LOCKED + * + * State Empty == !UPTODATE, !LOCK + * We have no data, and there is no active request + * State Want == !UPTODATE, LOCK + * A read request is being submitted for this block + * State Dirty == UPTODATE, LOCK + * Some new data is in this buffer, and it is being written out + * State Clean == UPTODATE, !LOCK + * We have valid data which is the same as on disc + * + * The possible state transitions are: + * + * Empty -> Want - on read or write to get old data for parity calc + * Empty -> Dirty - on compute_parity to satisfy write/sync request.(RECONSTRUCT_WRITE) + * Empty -> Clean - on compute_block when computing a block for failed drive + * Want -> Empty - on failed read + * Want -> Clean - on successful completion of read request + * Dirty -> Clean - on successful completion of write request + * Dirty -> Clean - on failed write + * Clean -> Dirty - on compute_parity to satisfy write/sync (RECONSTRUCT or RMW) + * + * The Want->Empty, Want->Clean, Dirty->Clean, transitions + * all happen in b_end_io at interrupt time. + * Each sets the Uptodate bit before releasing the Lock bit. + * This leaves one multi-stage transition: + * Want->Dirty->Clean + * This is safe because thinking that a Clean buffer is actually dirty + * will at worst delay some action, and the stripe will be scheduled + * for attention after the transition is complete. + * + * There is one possibility that is not covered by these states. That + * is if one drive has failed and there is a spare being rebuilt. We + * can't distinguish between a clean block that has been generated + * from parity calculations, and a clean block that has been + * successfully written to the spare ( or to parity when resyncing). + * To distingush these states we have a stripe bit STRIPE_INSYNC that + * is set whenever a write is scheduled to the spare, or to the parity + * disc if there is no spare. A sync request clears this bit, and + * when we find it set with no buffers locked, we know the sync is + * complete. + * + * Buffers for the md device that arrive via make_request are attached + * to the appropriate stripe in one of two lists linked on b_reqnext. + * One list (bh_read) for read requests, one (bh_write) for write. 
+ * There should never be more than one buffer on the two lists + * together, but we are not guaranteed of that so we allow for more. + * + * If a buffer is on the read list when the associated cache buffer is + * Uptodate, the data is copied into the read buffer and it's b_end_io + * routine is called. This may happen in the end_request routine only + * if the buffer has just successfully been read. end_request should + * remove the buffers from the list and then set the Uptodate bit on + * the buffer. Other threads may do this only if they first check + * that the Uptodate bit is set. Once they have checked that they may + * take buffers off the read queue. + * + * When a buffer on the write list is committed for write it is copied + * into the cache buffer, which is then marked dirty, and moved onto a + * third list, the written list (bh_written). Once both the parity + * block and the cached buffer are successfully written, any buffer on + * a written list can be returned with b_end_io. + * + * The write list and read list both act as fifos. The read list is + * protected by the device_lock. The write and written lists are + * protected by the stripe lock. The device_lock, which can be + * claimed while the stipe lock is held, is only for list + * manipulations and will only be held for a very short time. It can + * be claimed from interrupts. + * + * + * Stripes in the stripe cache can be on one of two lists (or on + * neither). The "inactive_list" contains stripes which are not + * currently being used for any request. They can freely be reused + * for another stripe. The "handle_list" contains stripes that need + * to be handled in some way. Both of these are fifo queues. Each + * stripe is also (potentially) linked to a hash bucket in the hash + * table so that it can be found by sector number. Stripes that are + * not hashed must be on the inactive_list, and will normally be at + * the front. All stripes start life this way. + * + * The inactive_list, handle_list and hash bucket lists are all protected by the + * device_lock. + * - stripes on the inactive_list never have their stripe_lock held. + * - stripes have a reference counter. If count==0, they are on a list. + * - If a stripe might need handling, STRIPE_HANDLE is set. + * - When refcount reaches zero, then if STRIPE_HANDLE it is put on + * handle_list else inactive_list + * + * This, combined with the fact that STRIPE_HANDLE is only ever + * cleared while a stripe has a non-zero count means that if the + * refcount is 0 and STRIPE_HANDLE is set, then it is on the + * handle_list and if recount is 0 and STRIPE_HANDLE is not set, then + * the stripe is on inactive_list. + * + * The possible transitions are: + * activate an unhashed/inactive stripe (get_active_stripe()) + * lockdev check-hash unlink-stripe cnt++ clean-stripe hash-stripe unlockdev + * activate a hashed, possibly active stripe (get_active_stripe()) + * lockdev check-hash if(!cnt++)unlink-stripe unlockdev + * attach a request to an active stripe (add_stripe_bh()) + * lockdev attach-buffer unlockdev + * handle a stripe (handle_stripe()) + * lockstripe clrSTRIPE_HANDLE ... + * (lockdev check-buffers unlockdev) .. + * change-state .. 
+ * record io/ops needed unlockstripe schedule io/ops + * release an active stripe (release_stripe()) + * lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev + * + * The refcount counts each thread that have activated the stripe, + * plus raid5d if it is handling it, plus one for each active request + * on a cached buffer, and plus one if the stripe is undergoing stripe + * operations. + * + * Stripe operations are performed outside the stripe lock, + * the stripe operations are: + * -copying data between the stripe cache and user application buffers + * -computing blocks to save a disk access, or to recover a missing block + * -updating the parity on a write operation (reconstruct write and + * read-modify-write) + * -checking parity correctness + * -running i/o to disk + * These operations are carried out by raid5_run_ops which uses the async_tx + * api to (optionally) offload operations to dedicated hardware engines. + * When requesting an operation handle_stripe sets the pending bit for the + * operation and increments the count. raid5_run_ops is then run whenever + * the count is non-zero. + * There are some critical dependencies between the operations that prevent some + * from being requested while another is in flight. + * 1/ Parity check operations destroy the in cache version of the parity block, + * so we prevent parity dependent operations like writes and compute_blocks + * from starting while a check is in progress. Some dma engines can perform + * the check without damaging the parity block, in these cases the parity + * block is re-marked up to date (assuming the check was successful) and is + * not re-read from disk. + * 2/ When a write operation is requested we immediately lock the affected + * blocks, and mark them as not up to date. This causes new read requests + * to be held off, as well as parity checks and compute block operations. + * 3/ Once a compute block operation has been requested handle_stripe treats + * that block as if it is up to date. raid5_run_ops guaruntees that any + * operation that is dependent on the compute block result is initiated after + * the compute block completes. + */ + +/* + * Operations state - intermediate states that are visible outside of sh->lock + * In general _idle indicates nothing is running, _run indicates a data + * processing operation is active, and _result means the data processing result + * is stable and can be acted upon. 
For simple operations like biofill and + * compute that only have an _idle and _run state they are indicated with + * sh->state flags (STRIPE_BIOFILL_RUN and STRIPE_COMPUTE_RUN) + */ +/** + * enum check_states - handles syncing / repairing a stripe + * @check_state_idle - check operations are quiesced + * @check_state_run - check operation is running + * @check_state_result - set outside lock when check result is valid + * @check_state_compute_run - check failed and we are repairing + * @check_state_compute_result - set outside lock when compute result is valid + */ +enum check_states { + check_state_idle = 0, + check_state_run, /* parity check */ + check_state_check_result, + check_state_compute_run, /* parity repair */ + check_state_compute_result, +}; + +/** + * enum reconstruct_states - handles writing or expanding a stripe + */ +enum reconstruct_states { + reconstruct_state_idle = 0, + reconstruct_state_prexor_drain_run, /* prexor-write */ + reconstruct_state_drain_run, /* write */ + reconstruct_state_run, /* expand */ + reconstruct_state_prexor_drain_result, + reconstruct_state_drain_result, + reconstruct_state_result, +}; + +struct stripe_head { + struct hlist_node hash; + struct list_head lru; /* inactive_list or handle_list */ + struct raid5_private_data *raid_conf; + sector_t sector; /* sector of this row */ + int pd_idx; /* parity disk index */ + unsigned long state; /* state flags */ + atomic_t count; /* nr of active thread/requests */ + spinlock_t lock; + int bm_seq; /* sequence number for bitmap flushes */ + int disks; /* disks in stripe */ + enum check_states check_state; + enum reconstruct_states reconstruct_state; + /* stripe_operations + * @target - STRIPE_OP_COMPUTE_BLK target + */ + struct stripe_operations { + int target; + u32 zero_sum_result; + } ops; + struct r5dev { + struct bio req; + struct bio_vec vec; + struct page *page; + struct bio *toread, *read, *towrite, *written; + sector_t sector; /* sector of this page */ + unsigned long flags; + } dev[1]; /* allocated with extra space depending of RAID geometry */ +}; + +/* stripe_head_state - collects and tracks the dynamic state of a stripe_head + * for handle_stripe. 
It is only valid under spin_lock(sh->lock); + */ +struct stripe_head_state { + int syncing, expanding, expanded; + int locked, uptodate, to_read, to_write, failed, written; + int to_fill, compute, req_compute, non_overwrite; + int failed_num; + unsigned long ops_request; +}; + +/* r6_state - extra state data only relevant to r6 */ +struct r6_state { + int p_failed, q_failed, qd_idx, failed_num[2]; +}; + +/* Flags */ +#define R5_UPTODATE 0 /* page contains current data */ +#define R5_LOCKED 1 /* IO has been submitted on "req" */ +#define R5_OVERWRITE 2 /* towrite covers whole page */ +/* and some that are internal to handle_stripe */ +#define R5_Insync 3 /* rdev && rdev->in_sync at start */ +#define R5_Wantread 4 /* want to schedule a read */ +#define R5_Wantwrite 5 +#define R5_Overlap 7 /* There is a pending overlapping request on this block */ +#define R5_ReadError 8 /* seen a read error here recently */ +#define R5_ReWrite 9 /* have tried to over-write the readerror */ + +#define R5_Expanded 10 /* This block now has post-expand data */ +#define R5_Wantcompute 11 /* compute_block in progress treat as + * uptodate + */ +#define R5_Wantfill 12 /* dev->toread contains a bio that needs + * filling + */ +#define R5_Wantdrain 13 /* dev->towrite needs to be drained */ +/* + * Write method + */ +#define RECONSTRUCT_WRITE 1 +#define READ_MODIFY_WRITE 2 +/* not a write method, but a compute_parity mode */ +#define CHECK_PARITY 3 + +/* + * Stripe state + */ +#define STRIPE_HANDLE 2 +#define STRIPE_SYNCING 3 +#define STRIPE_INSYNC 4 +#define STRIPE_PREREAD_ACTIVE 5 +#define STRIPE_DELAYED 6 +#define STRIPE_DEGRADED 7 +#define STRIPE_BIT_DELAY 8 +#define STRIPE_EXPANDING 9 +#define STRIPE_EXPAND_SOURCE 10 +#define STRIPE_EXPAND_READY 11 +#define STRIPE_IO_STARTED 12 /* do not count towards 'bypass_count' */ +#define STRIPE_FULL_WRITE 13 /* all blocks are set to be overwritten */ +#define STRIPE_BIOFILL_RUN 14 +#define STRIPE_COMPUTE_RUN 15 +/* + * Operation request flags + */ +#define STRIPE_OP_BIOFILL 0 +#define STRIPE_OP_COMPUTE_BLK 1 +#define STRIPE_OP_PREXOR 2 +#define STRIPE_OP_BIODRAIN 3 +#define STRIPE_OP_POSTXOR 4 +#define STRIPE_OP_CHECK 5 + +/* + * Plugging: + * + * To improve write throughput, we need to delay the handling of some + * stripes until there has been a chance that several write requests + * for the one stripe have all been collected. + * In particular, any write request that would require pre-reading + * is put on a "delayed" queue until there are no stripes currently + * in a pre-read phase. Further, if the "delayed" queue is empty when + * a stripe is put on it then we "plug" the queue and do not process it + * until an unplug call is made. (the unplug_io_fn() is called). + * + * When preread is initiated on a stripe, we set PREREAD_ACTIVE and add + * it to the count of prereading stripes. + * When write is initiated, or the stripe refcnt == 0 (just in case) we + * clear the PREREAD_ACTIVE flag and decrement the count + * Whenever the 'handle' queue is empty and the device is not plugged, we + * move any strips from delayed to handle and clear the DELAYED flag and set + * PREREAD_ACTIVE. + * In stripe_handle, if we find pre-reading is necessary, we do it if + * PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue. + * HANDLE gets cleared if stripe_handle leave nothing locked. 
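[Editorial aside, not part of the patch: a standalone sketch of the pre-read decision just described, namely pre-read immediately when STRIPE_PREREAD_ACTIVE is set, otherwise mark the stripe DELAYED (and HANDLE) so it ends up on the delayed queue. The bit numbers are taken from the definitions above; the function is hypothetical and omits the real list and counter handling in raid5.c.]

/* build: cc -std=c99 preread.c */
#include <stdio.h>

#define STRIPE_HANDLE          2
#define STRIPE_PREREAD_ACTIVE  5
#define STRIPE_DELAYED         6

static void handle_needs_preread(unsigned long *state)
{
	if (*state & (1UL << STRIPE_PREREAD_ACTIVE)) {
		printf("pre-read now\n");
	} else {
		/* defer until no stripes are in a pre-read phase */
		*state |= (1UL << STRIPE_DELAYED) | (1UL << STRIPE_HANDLE);
		printf("queued on delayed list\n");
	}
}

int main(void)
{
	unsigned long state = 0;

	handle_needs_preread(&state);		/* queued on delayed list */
	state = 1UL << STRIPE_PREREAD_ACTIVE;
	handle_needs_preread(&state);		/* pre-read now */
	return 0;
}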
+ */ + + +struct disk_info { + mdk_rdev_t *rdev; +}; + +struct raid5_private_data { + struct hlist_head *stripe_hashtbl; + mddev_t *mddev; + struct disk_info *spare; + int chunk_size, level, algorithm; + int max_degraded; + int raid_disks; + int max_nr_stripes; + + /* used during an expand */ + sector_t expand_progress; /* MaxSector when no expand happening */ + sector_t expand_lo; /* from here up to expand_progress it out-of-bounds + * as we haven't flushed the metadata yet + */ + int previous_raid_disks; + + struct list_head handle_list; /* stripes needing handling */ + struct list_head hold_list; /* preread ready stripes */ + struct list_head delayed_list; /* stripes that have plugged requests */ + struct list_head bitmap_list; /* stripes delaying awaiting bitmap update */ + struct bio *retry_read_aligned; /* currently retrying aligned bios */ + struct bio *retry_read_aligned_list; /* aligned bios retry list */ + atomic_t preread_active_stripes; /* stripes with scheduled io */ + atomic_t active_aligned_reads; + atomic_t pending_full_writes; /* full write backlog */ + int bypass_count; /* bypassed prereads */ + int bypass_threshold; /* preread nice */ + struct list_head *last_hold; /* detect hold_list promotions */ + + atomic_t reshape_stripes; /* stripes with pending writes for reshape */ + /* unfortunately we need two cache names as we temporarily have + * two caches. + */ + int active_name; + char cache_name[2][20]; + struct kmem_cache *slab_cache; /* for allocating stripes */ + + int seq_flush, seq_write; + int quiesce; + + int fullsync; /* set to 1 if a full sync is needed, + * (fresh device added). + * Cleared when a sync completes. + */ + + struct page *spare_page; /* Used when checking P/Q in raid6 */ + + /* + * Free stripes pool + */ + atomic_t active_stripes; + struct list_head inactive_list; + wait_queue_head_t wait_for_stripe; + wait_queue_head_t wait_for_overlap; + int inactive_blocked; /* release of inactive stripes blocked, + * waiting for 25% to be free + */ + int pool_size; /* number of disks in stripeheads in pool */ + spinlock_t device_lock; + struct disk_info *disks; +}; + +typedef struct raid5_private_data raid5_conf_t; + +#define mddev_to_conf(mddev) ((raid5_conf_t *) mddev->private) + +/* + * Our supported algorithms + */ +#define ALGORITHM_LEFT_ASYMMETRIC 0 +#define ALGORITHM_RIGHT_ASYMMETRIC 1 +#define ALGORITHM_LEFT_SYMMETRIC 2 +#define ALGORITHM_RIGHT_SYMMETRIC 3 + +#endif diff --git a/drivers/md/raid6.h b/drivers/md/raid6.h index 98dcde88470e..f6c13af65002 100644 --- a/drivers/md/raid6.h +++ b/drivers/md/raid6.h @@ -19,7 +19,7 @@ #define RAID6_USE_EMPTY_ZERO_PAGE 0 #include -#include +#include "raid5.h" typedef raid5_conf_t raid6_conf_t; /* Same configuration */ diff --git a/include/linux/raid/bitmap.h b/include/linux/raid/bitmap.h deleted file mode 100644 index e98900671ca9..000000000000 --- a/include/linux/raid/bitmap.h +++ /dev/null @@ -1,288 +0,0 @@ -/* - * bitmap.h: Copyright (C) Peter T. Breuer (ptb@ot.uc3m.es) 2003 - * - * additions: Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. - */ -#ifndef BITMAP_H -#define BITMAP_H 1 - -#define BITMAP_MAJOR_LO 3 -/* version 4 insists the bitmap is in little-endian order - * with version 3, it is host-endian which is non-portable - */ -#define BITMAP_MAJOR_HI 4 -#define BITMAP_MAJOR_HOSTENDIAN 3 - -#define BITMAP_MINOR 39 - -/* - * in-memory bitmap: - * - * Use 16 bit block counters to track pending writes to each "chunk". 
- * The 2 high order bits are special-purpose, the first is a flag indicating - * whether a resync is needed. The second is a flag indicating whether a - * resync is active. - * This means that the counter is actually 14 bits: - * - * +--------+--------+------------------------------------------------+ - * | resync | resync | counter | - * | needed | active | | - * | (0-1) | (0-1) | (0-16383) | - * +--------+--------+------------------------------------------------+ - * - * The "resync needed" bit is set when: - * a '1' bit is read from storage at startup. - * a write request fails on some drives - * a resync is aborted on a chunk with 'resync active' set - * It is cleared (and resync-active set) when a resync starts across all drives - * of the chunk. - * - * - * The "resync active" bit is set when: - * a resync is started on all drives, and resync_needed is set. - * resync_needed will be cleared (as long as resync_active wasn't already set). - * It is cleared when a resync completes. - * - * The counter counts pending write requests, plus the on-disk bit. - * When the counter is '1' and the resync bits are clear, the on-disk - * bit can be cleared aswell, thus setting the counter to 0. - * When we set a bit, or in the counter (to start a write), if the fields is - * 0, we first set the disk bit and set the counter to 1. - * - * If the counter is 0, the on-disk bit is clear and the stipe is clean - * Anything that dirties the stipe pushes the counter to 2 (at least) - * and sets the on-disk bit (lazily). - * If a periodic sweep find the counter at 2, it is decremented to 1. - * If the sweep find the counter at 1, the on-disk bit is cleared and the - * counter goes to zero. - * - * Also, we'll hijack the "map" pointer itself and use it as two 16 bit block - * counters as a fallback when "page" memory cannot be allocated: - * - * Normal case (page memory allocated): - * - * page pointer (32-bit) - * - * [ ] ------+ - * | - * +-------> [ ][ ]..[ ] (4096 byte page == 2048 counters) - * c1 c2 c2048 - * - * Hijacked case (page memory allocation failed): - * - * hijacked page pointer (32-bit) - * - * [ ][ ] (no page memory allocated) - * counter #1 (16-bit) counter #2 (16-bit) - * - */ - -#ifdef __KERNEL__ - -#define PAGE_BITS (PAGE_SIZE << 3) -#define PAGE_BIT_SHIFT (PAGE_SHIFT + 3) - -typedef __u16 bitmap_counter_t; -#define COUNTER_BITS 16 -#define COUNTER_BIT_SHIFT 4 -#define COUNTER_BYTE_RATIO (COUNTER_BITS / 8) -#define COUNTER_BYTE_SHIFT (COUNTER_BIT_SHIFT - 3) - -#define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1))) -#define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2))) -#define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1) -#define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK) -#define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK) -#define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX) - -/* how many counters per page? */ -#define PAGE_COUNTER_RATIO (PAGE_BITS / COUNTER_BITS) -/* same, except a shift value for more efficient bitops */ -#define PAGE_COUNTER_SHIFT (PAGE_BIT_SHIFT - COUNTER_BIT_SHIFT) -/* same, except a mask value for more efficient bitops */ -#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1) - -#define BITMAP_BLOCK_SIZE 512 -#define BITMAP_BLOCK_SHIFT 9 - -/* how many blocks per chunk? 
(this is variable) */ -#define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->chunksize >> BITMAP_BLOCK_SHIFT) -#define CHUNK_BLOCK_SHIFT(bitmap) ((bitmap)->chunkshift - BITMAP_BLOCK_SHIFT) -#define CHUNK_BLOCK_MASK(bitmap) (CHUNK_BLOCK_RATIO(bitmap) - 1) - -/* when hijacked, the counters and bits represent even larger "chunks" */ -/* there will be 1024 chunks represented by each counter in the page pointers */ -#define PAGEPTR_BLOCK_RATIO(bitmap) \ - (CHUNK_BLOCK_RATIO(bitmap) << PAGE_COUNTER_SHIFT >> 1) -#define PAGEPTR_BLOCK_SHIFT(bitmap) \ - (CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1) -#define PAGEPTR_BLOCK_MASK(bitmap) (PAGEPTR_BLOCK_RATIO(bitmap) - 1) - -/* - * on-disk bitmap: - * - * Use one bit per "chunk" (block set). We do the disk I/O on the bitmap - * file a page at a time. There's a superblock at the start of the file. - */ - -/* map chunks (bits) to file pages - offset by the size of the superblock */ -#define CHUNK_BIT_OFFSET(chunk) ((chunk) + (sizeof(bitmap_super_t) << 3)) - -#endif - -/* - * bitmap structures: - */ - -#define BITMAP_MAGIC 0x6d746962 - -/* use these for bitmap->flags and bitmap->sb->state bit-fields */ -enum bitmap_state { - BITMAP_STALE = 0x002, /* the bitmap file is out of date or had -EIO */ - BITMAP_WRITE_ERROR = 0x004, /* A write error has occurred */ - BITMAP_HOSTENDIAN = 0x8000, -}; - -/* the superblock at the front of the bitmap file -- little endian */ -typedef struct bitmap_super_s { - __le32 magic; /* 0 BITMAP_MAGIC */ - __le32 version; /* 4 the bitmap major for now, could change... */ - __u8 uuid[16]; /* 8 128 bit uuid - must match md device uuid */ - __le64 events; /* 24 event counter for the bitmap (1)*/ - __le64 events_cleared;/*32 event counter when last bit cleared (2) */ - __le64 sync_size; /* 40 the size of the md device's sync range(3) */ - __le32 state; /* 48 bitmap state information */ - __le32 chunksize; /* 52 the bitmap chunk size in bytes */ - __le32 daemon_sleep; /* 56 seconds between disk flushes */ - __le32 write_behind; /* 60 number of outstanding write-behind writes */ - - __u8 pad[256 - 64]; /* set to zero */ -} bitmap_super_t; - -/* notes: - * (1) This event counter is updated before the eventcounter in the md superblock - * When a bitmap is loaded, it is only accepted if this event counter is equal - * to, or one greater than, the event counter in the superblock. - * (2) This event counter is updated when the other one is *if*and*only*if* the - * array is not degraded. As bits are not cleared when the array is degraded, - * this represents the last time that any bits were cleared. - * If a device is being added that has an event count with this value or - * higher, it is accepted as conforming to the bitmap. - * (3)This is the number of sectors represented by the bitmap, and is the range that - * resync happens across. For raid1 and raid5/6 it is the size of individual - * devices. For raid10 it is the size of the array. 
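[Editorial aside, not part of the patch: a standalone sketch of the 14-bit block counter layout described above, reusing the NEEDED/RESYNC/COUNTER mask definitions from this (removed) header with __u16 replaced by unsigned short. The sample value is hypothetical.]

/* build: cc -std=c99 bitmap_counter.c */
#include <stdio.h>

typedef unsigned short bitmap_counter_t;
#define COUNTER_BITS 16
#define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1)))
#define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2)))
#define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1)
#define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK)
#define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK)
#define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX)

int main(void)
{
	/* resync-needed bit set; counter value 3, e.g. two pending writes
	 * plus the on-disk bit */
	bitmap_counter_t c = NEEDED_MASK | 3;

	printf("needed=%d resync=%d counter=%d\n",
	       !!NEEDED(c), !!RESYNC(c), (int)COUNTER(c));	/* needed=1 resync=0 counter=3 */
	return 0;
}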
- */ - -#ifdef __KERNEL__ - -/* the in-memory bitmap is represented by bitmap_pages */ -struct bitmap_page { - /* - * map points to the actual memory page - */ - char *map; - /* - * in emergencies (when map cannot be alloced), hijack the map - * pointer and use it as two counters itself - */ - unsigned int hijacked:1; - /* - * count of dirty bits on the page - */ - unsigned int count:31; -}; - -/* keep track of bitmap file pages that have pending writes on them */ -struct page_list { - struct list_head list; - struct page *page; -}; - -/* the main bitmap structure - one per mddev */ -struct bitmap { - struct bitmap_page *bp; - unsigned long pages; /* total number of pages in the bitmap */ - unsigned long missing_pages; /* number of pages not yet allocated */ - - mddev_t *mddev; /* the md device that the bitmap is for */ - - int counter_bits; /* how many bits per block counter */ - - /* bitmap chunksize -- how much data does each bit represent? */ - unsigned long chunksize; - unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */ - unsigned long chunks; /* total number of data chunks for the array */ - - /* We hold a count on the chunk currently being synced, and drop - * it when the last block is started. If the resync is aborted - * midway, we need to be able to drop that count, so we remember - * the counted chunk.. - */ - unsigned long syncchunk; - - __u64 events_cleared; - int need_sync; - - /* bitmap spinlock */ - spinlock_t lock; - - long offset; /* offset from superblock if file is NULL */ - struct file *file; /* backing disk file */ - struct page *sb_page; /* cached copy of the bitmap file superblock */ - struct page **filemap; /* list of cache pages for the file */ - unsigned long *filemap_attr; /* attributes associated w/ filemap pages */ - unsigned long file_pages; /* number of pages in the file */ - int last_page_size; /* bytes in the last page */ - - unsigned long flags; - - int allclean; - - unsigned long max_write_behind; /* write-behind mode */ - atomic_t behind_writes; - - /* - * the bitmap daemon - periodically wakes up and sweeps the bitmap - * file, cleaning up bits and flushing out pages to disk as necessary - */ - unsigned long daemon_lastrun; /* jiffies of last run */ - unsigned long daemon_sleep; /* how many seconds between updates? 
*/ - unsigned long last_end_sync; /* when we lasted called end_sync to - * update bitmap with resync progress */ - - atomic_t pending_writes; /* pending writes to the bitmap file */ - wait_queue_head_t write_wait; - wait_queue_head_t overflow_wait; - -}; - -/* the bitmap API */ - -/* these are used only by md/bitmap */ -int bitmap_create(mddev_t *mddev); -void bitmap_flush(mddev_t *mddev); -void bitmap_destroy(mddev_t *mddev); - -void bitmap_print_sb(struct bitmap *bitmap); -void bitmap_update_sb(struct bitmap *bitmap); - -int bitmap_setallbits(struct bitmap *bitmap); -void bitmap_write_all(struct bitmap *bitmap); - -void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e); - -/* these are exported */ -int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, - unsigned long sectors, int behind); -void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, - unsigned long sectors, int success, int behind); -int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int degraded); -void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted); -void bitmap_close_sync(struct bitmap *bitmap); -void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector); - -void bitmap_unplug(struct bitmap *bitmap); -void bitmap_daemon_work(struct bitmap *bitmap); -#endif - -#endif diff --git a/include/linux/raid/linear.h b/include/linux/raid/linear.h deleted file mode 100644 index f38b9c586afb..000000000000 --- a/include/linux/raid/linear.h +++ /dev/null @@ -1,31 +0,0 @@ -#ifndef _LINEAR_H -#define _LINEAR_H - -#include - -struct dev_info { - mdk_rdev_t *rdev; - sector_t num_sectors; - sector_t start_sector; -}; - -typedef struct dev_info dev_info_t; - -struct linear_private_data -{ - struct linear_private_data *prev; /* earlier version */ - dev_info_t **hash_table; - sector_t spacing; - sector_t array_sectors; - int sector_shift; /* shift before dividing - * by spacing - */ - dev_info_t disks[0]; -}; - - -typedef struct linear_private_data linear_conf_t; - -#define mddev_to_conf(mddev) ((linear_conf_t *) mddev->private) - -#endif diff --git a/include/linux/raid/multipath.h b/include/linux/raid/multipath.h deleted file mode 100644 index 6f53fc177a47..000000000000 --- a/include/linux/raid/multipath.h +++ /dev/null @@ -1,42 +0,0 @@ -#ifndef _MULTIPATH_H -#define _MULTIPATH_H - -#include - -struct multipath_info { - mdk_rdev_t *rdev; -}; - -struct multipath_private_data { - mddev_t *mddev; - struct multipath_info *multipaths; - int raid_disks; - int working_disks; - spinlock_t device_lock; - struct list_head retry_list; - - mempool_t *pool; -}; - -typedef struct multipath_private_data multipath_conf_t; - -/* - * this is the only point in the RAID code where we violate - * C type safety. mddev->private is an 'opaque' pointer. - */ -#define mddev_to_conf(mddev) ((multipath_conf_t *) mddev->private) - -/* - * this is our 'private' 'collective' MULTIPATH buffer head. 
- * it contains information about what kind of IO operations were started - * for this MULTIPATH operation, and about their status: - */ - -struct multipath_bh { - mddev_t *mddev; - struct bio *master_bio; - struct bio bio; - int path; - struct list_head retry_list; -}; -#endif diff --git a/include/linux/raid/raid0.h b/include/linux/raid/raid0.h deleted file mode 100644 index fd42aa87c391..000000000000 --- a/include/linux/raid/raid0.h +++ /dev/null @@ -1,30 +0,0 @@ -#ifndef _RAID0_H -#define _RAID0_H - -#include - -struct strip_zone -{ - sector_t zone_start; /* Zone offset in md_dev (in sectors) */ - sector_t dev_start; /* Zone offset in real dev (in sectors) */ - sector_t sectors; /* Zone size in sectors */ - int nb_dev; /* # of devices attached to the zone */ - mdk_rdev_t **dev; /* Devices attached to the zone */ -}; - -struct raid0_private_data -{ - struct strip_zone **hash_table; /* Table of indexes into strip_zone */ - struct strip_zone *strip_zone; - mdk_rdev_t **devlist; /* lists of rdevs, pointed to by strip_zone->dev */ - int nr_strip_zones; - - sector_t spacing; - int sector_shift; /* shift this before divide by spacing */ -}; - -typedef struct raid0_private_data raid0_conf_t; - -#define mddev_to_conf(mddev) ((raid0_conf_t *) mddev->private) - -#endif diff --git a/include/linux/raid/raid1.h b/include/linux/raid/raid1.h deleted file mode 100644 index 0a9ba7c3302e..000000000000 --- a/include/linux/raid/raid1.h +++ /dev/null @@ -1,134 +0,0 @@ -#ifndef _RAID1_H -#define _RAID1_H - -#include - -typedef struct mirror_info mirror_info_t; - -struct mirror_info { - mdk_rdev_t *rdev; - sector_t head_position; -}; - -/* - * memory pools need a pointer to the mddev, so they can force an unplug - * when memory is tight, and a count of the number of drives that the - * pool was allocated for, so they know how much to allocate and free. - * mddev->raid_disks cannot be used, as it can change while a pool is active - * These two datums are stored in a kmalloced struct. - */ - -struct pool_info { - mddev_t *mddev; - int raid_disks; -}; - - -typedef struct r1bio_s r1bio_t; - -struct r1_private_data_s { - mddev_t *mddev; - mirror_info_t *mirrors; - int raid_disks; - int last_used; - sector_t next_seq_sect; - spinlock_t device_lock; - - struct list_head retry_list; - /* queue pending writes and submit them on unplug */ - struct bio_list pending_bio_list; - /* queue of writes that have been unplugged */ - struct bio_list flushing_bio_list; - - /* for use when syncing mirrors: */ - - spinlock_t resync_lock; - int nr_pending; - int nr_waiting; - int nr_queued; - int barrier; - sector_t next_resync; - int fullsync; /* set to 1 if a full sync is needed, - * (fresh device added). - * Cleared when a sync completes. - */ - - wait_queue_head_t wait_barrier; - - struct pool_info *poolinfo; - - struct page *tmppage; - - mempool_t *r1bio_pool; - mempool_t *r1buf_pool; -}; - -typedef struct r1_private_data_s conf_t; - -/* - * this is the only point in the RAID code where we violate - * C type safety. mddev->private is an 'opaque' pointer. - */ -#define mddev_to_conf(mddev) ((conf_t *) mddev->private) - -/* - * this is our 'private' RAID1 bio. 
- * - * it contains information about what kind of IO operations were started - * for this RAID1 operation, and about their status: - */ - -struct r1bio_s { - atomic_t remaining; /* 'have we finished' count, - * used from IRQ handlers - */ - atomic_t behind_remaining; /* number of write-behind ios remaining - * in this BehindIO request - */ - sector_t sector; - int sectors; - unsigned long state; - mddev_t *mddev; - /* - * original bio going to /dev/mdx - */ - struct bio *master_bio; - /* - * if the IO is in READ direction, then this is where we read - */ - int read_disk; - - struct list_head retry_list; - struct bitmap_update *bitmap_update; - /* - * if the IO is in WRITE direction, then multiple bios are used. - * We choose the number when they are allocated. - */ - struct bio *bios[0]; - /* DO NOT PUT ANY NEW FIELDS HERE - bios array is contiguously alloced*/ -}; - -/* when we get a read error on a read-only array, we redirect to another - * device without failing the first device, or trying to over-write to - * correct the read error. To keep track of bad blocks on a per-bio - * level, we store IO_BLOCKED in the appropriate 'bios' pointer - */ -#define IO_BLOCKED ((struct bio*)1) - -/* bits for r1bio.state */ -#define R1BIO_Uptodate 0 -#define R1BIO_IsSync 1 -#define R1BIO_Degraded 2 -#define R1BIO_BehindIO 3 -#define R1BIO_Barrier 4 -#define R1BIO_BarrierRetry 5 -/* For write-behind requests, we call bi_end_io when - * the last non-write-behind device completes, providing - * any write was successful. Otherwise we call when - * any write-behind write succeeds, otherwise we call - * with failure when last write completes (and all failed). - * Record that bi_end_io was called with this flag... - */ -#define R1BIO_Returned 6 - -#endif diff --git a/include/linux/raid/raid10.h b/include/linux/raid/raid10.h deleted file mode 100644 index e9091cfeb286..000000000000 --- a/include/linux/raid/raid10.h +++ /dev/null @@ -1,123 +0,0 @@ -#ifndef _RAID10_H -#define _RAID10_H - -#include - -typedef struct mirror_info mirror_info_t; - -struct mirror_info { - mdk_rdev_t *rdev; - sector_t head_position; -}; - -typedef struct r10bio_s r10bio_t; - -struct r10_private_data_s { - mddev_t *mddev; - mirror_info_t *mirrors; - int raid_disks; - spinlock_t device_lock; - - /* geometry */ - int near_copies; /* number of copies layed out raid0 style */ - int far_copies; /* number of copies layed out - * at large strides across drives - */ - int far_offset; /* far_copies are offset by 1 stripe - * instead of many - */ - int copies; /* near_copies * far_copies. - * must be <= raid_disks - */ - sector_t stride; /* distance between far copies. - * This is size / far_copies unless - * far_offset, in which case it is - * 1 stripe. - */ - - int chunk_shift; /* shift from chunks to sectors */ - sector_t chunk_mask; - - struct list_head retry_list; - /* queue pending writes and submit them on unplug */ - struct bio_list pending_bio_list; - - - spinlock_t resync_lock; - int nr_pending; - int nr_waiting; - int nr_queued; - int barrier; - sector_t next_resync; - int fullsync; /* set to 1 if a full sync is needed, - * (fresh device added). - * Cleared when a sync completes. - */ - - wait_queue_head_t wait_barrier; - - mempool_t *r10bio_pool; - mempool_t *r10buf_pool; - struct page *tmppage; -}; - -typedef struct r10_private_data_s conf_t; - -/* - * this is the only point in the RAID code where we violate - * C type safety. mddev->private is an 'opaque' pointer. 
- */ -#define mddev_to_conf(mddev) ((conf_t *) mddev->private) - -/* - * this is our 'private' RAID10 bio. - * - * it contains information about what kind of IO operations were started - * for this RAID10 operation, and about their status: - */ - -struct r10bio_s { - atomic_t remaining; /* 'have we finished' count, - * used from IRQ handlers - */ - sector_t sector; /* virtual sector number */ - int sectors; - unsigned long state; - mddev_t *mddev; - /* - * original bio going to /dev/mdx - */ - struct bio *master_bio; - /* - * if the IO is in READ direction, then this is where we read - */ - int read_slot; - - struct list_head retry_list; - /* - * if the IO is in WRITE direction, then multiple bios are used, - * one for each copy. - * When resyncing we also use one for each copy. - * When reconstructing, we use 2 bios, one for read, one for write. - * We choose the number when they are allocated. - */ - struct { - struct bio *bio; - sector_t addr; - int devnum; - } devs[0]; -}; - -/* when we get a read error on a read-only array, we redirect to another - * device without failing the first device, or trying to over-write to - * correct the read error. To keep track of bad blocks on a per-bio - * level, we store IO_BLOCKED in the appropriate 'bios' pointer - */ -#define IO_BLOCKED ((struct bio*)1) - -/* bits for r10bio.state */ -#define R10BIO_Uptodate 0 -#define R10BIO_IsSync 1 -#define R10BIO_IsRecover 2 -#define R10BIO_Degraded 3 -#endif diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h deleted file mode 100644 index 3b2672792457..000000000000 --- a/include/linux/raid/raid5.h +++ /dev/null @@ -1,402 +0,0 @@ -#ifndef _RAID5_H -#define _RAID5_H - -#include -#include - -/* - * - * Each stripe contains one buffer per disc. Each buffer can be in - * one of a number of states stored in "flags". Changes between - * these states happen *almost* exclusively under a per-stripe - * spinlock. Some very specific changes can happen in bi_end_io, and - * these are not protected by the spin lock. - * - * The flag bits that are used to represent these states are: - * R5_UPTODATE and R5_LOCKED - * - * State Empty == !UPTODATE, !LOCK - * We have no data, and there is no active request - * State Want == !UPTODATE, LOCK - * A read request is being submitted for this block - * State Dirty == UPTODATE, LOCK - * Some new data is in this buffer, and it is being written out - * State Clean == UPTODATE, !LOCK - * We have valid data which is the same as on disc - * - * The possible state transitions are: - * - * Empty -> Want - on read or write to get old data for parity calc - * Empty -> Dirty - on compute_parity to satisfy write/sync request.(RECONSTRUCT_WRITE) - * Empty -> Clean - on compute_block when computing a block for failed drive - * Want -> Empty - on failed read - * Want -> Clean - on successful completion of read request - * Dirty -> Clean - on successful completion of write request - * Dirty -> Clean - on failed write - * Clean -> Dirty - on compute_parity to satisfy write/sync (RECONSTRUCT or RMW) - * - * The Want->Empty, Want->Clean, Dirty->Clean, transitions - * all happen in b_end_io at interrupt time. - * Each sets the Uptodate bit before releasing the Lock bit. - * This leaves one multi-stage transition: - * Want->Dirty->Clean - * This is safe because thinking that a Clean buffer is actually dirty - * will at worst delay some action, and the stripe will be scheduled - * for attention after the transition is complete. 
- * - * There is one possibility that is not covered by these states. That - * is if one drive has failed and there is a spare being rebuilt. We - * can't distinguish between a clean block that has been generated - * from parity calculations, and a clean block that has been - * successfully written to the spare ( or to parity when resyncing). - * To distingush these states we have a stripe bit STRIPE_INSYNC that - * is set whenever a write is scheduled to the spare, or to the parity - * disc if there is no spare. A sync request clears this bit, and - * when we find it set with no buffers locked, we know the sync is - * complete. - * - * Buffers for the md device that arrive via make_request are attached - * to the appropriate stripe in one of two lists linked on b_reqnext. - * One list (bh_read) for read requests, one (bh_write) for write. - * There should never be more than one buffer on the two lists - * together, but we are not guaranteed of that so we allow for more. - * - * If a buffer is on the read list when the associated cache buffer is - * Uptodate, the data is copied into the read buffer and it's b_end_io - * routine is called. This may happen in the end_request routine only - * if the buffer has just successfully been read. end_request should - * remove the buffers from the list and then set the Uptodate bit on - * the buffer. Other threads may do this only if they first check - * that the Uptodate bit is set. Once they have checked that they may - * take buffers off the read queue. - * - * When a buffer on the write list is committed for write it is copied - * into the cache buffer, which is then marked dirty, and moved onto a - * third list, the written list (bh_written). Once both the parity - * block and the cached buffer are successfully written, any buffer on - * a written list can be returned with b_end_io. - * - * The write list and read list both act as fifos. The read list is - * protected by the device_lock. The write and written lists are - * protected by the stripe lock. The device_lock, which can be - * claimed while the stipe lock is held, is only for list - * manipulations and will only be held for a very short time. It can - * be claimed from interrupts. - * - * - * Stripes in the stripe cache can be on one of two lists (or on - * neither). The "inactive_list" contains stripes which are not - * currently being used for any request. They can freely be reused - * for another stripe. The "handle_list" contains stripes that need - * to be handled in some way. Both of these are fifo queues. Each - * stripe is also (potentially) linked to a hash bucket in the hash - * table so that it can be found by sector number. Stripes that are - * not hashed must be on the inactive_list, and will normally be at - * the front. All stripes start life this way. - * - * The inactive_list, handle_list and hash bucket lists are all protected by the - * device_lock. - * - stripes on the inactive_list never have their stripe_lock held. - * - stripes have a reference counter. If count==0, they are on a list. - * - If a stripe might need handling, STRIPE_HANDLE is set. - * - When refcount reaches zero, then if STRIPE_HANDLE it is put on - * handle_list else inactive_list - * - * This, combined with the fact that STRIPE_HANDLE is only ever - * cleared while a stripe has a non-zero count means that if the - * refcount is 0 and STRIPE_HANDLE is set, then it is on the - * handle_list and if recount is 0 and STRIPE_HANDLE is not set, then - * the stripe is on inactive_list. 
- * - * The possible transitions are: - * activate an unhashed/inactive stripe (get_active_stripe()) - * lockdev check-hash unlink-stripe cnt++ clean-stripe hash-stripe unlockdev - * activate a hashed, possibly active stripe (get_active_stripe()) - * lockdev check-hash if(!cnt++)unlink-stripe unlockdev - * attach a request to an active stripe (add_stripe_bh()) - * lockdev attach-buffer unlockdev - * handle a stripe (handle_stripe()) - * lockstripe clrSTRIPE_HANDLE ... - * (lockdev check-buffers unlockdev) .. - * change-state .. - * record io/ops needed unlockstripe schedule io/ops - * release an active stripe (release_stripe()) - * lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev - * - * The refcount counts each thread that have activated the stripe, - * plus raid5d if it is handling it, plus one for each active request - * on a cached buffer, and plus one if the stripe is undergoing stripe - * operations. - * - * Stripe operations are performed outside the stripe lock, - * the stripe operations are: - * -copying data between the stripe cache and user application buffers - * -computing blocks to save a disk access, or to recover a missing block - * -updating the parity on a write operation (reconstruct write and - * read-modify-write) - * -checking parity correctness - * -running i/o to disk - * These operations are carried out by raid5_run_ops which uses the async_tx - * api to (optionally) offload operations to dedicated hardware engines. - * When requesting an operation handle_stripe sets the pending bit for the - * operation and increments the count. raid5_run_ops is then run whenever - * the count is non-zero. - * There are some critical dependencies between the operations that prevent some - * from being requested while another is in flight. - * 1/ Parity check operations destroy the in cache version of the parity block, - * so we prevent parity dependent operations like writes and compute_blocks - * from starting while a check is in progress. Some dma engines can perform - * the check without damaging the parity block, in these cases the parity - * block is re-marked up to date (assuming the check was successful) and is - * not re-read from disk. - * 2/ When a write operation is requested we immediately lock the affected - * blocks, and mark them as not up to date. This causes new read requests - * to be held off, as well as parity checks and compute block operations. - * 3/ Once a compute block operation has been requested handle_stripe treats - * that block as if it is up to date. raid5_run_ops guaruntees that any - * operation that is dependent on the compute block result is initiated after - * the compute block completes. - */ - -/* - * Operations state - intermediate states that are visible outside of sh->lock - * In general _idle indicates nothing is running, _run indicates a data - * processing operation is active, and _result means the data processing result - * is stable and can be acted upon. 
For simple operations like biofill and - * compute that only have an _idle and _run state they are indicated with - * sh->state flags (STRIPE_BIOFILL_RUN and STRIPE_COMPUTE_RUN) - */ -/** - * enum check_states - handles syncing / repairing a stripe - * @check_state_idle - check operations are quiesced - * @check_state_run - check operation is running - * @check_state_result - set outside lock when check result is valid - * @check_state_compute_run - check failed and we are repairing - * @check_state_compute_result - set outside lock when compute result is valid - */ -enum check_states { - check_state_idle = 0, - check_state_run, /* parity check */ - check_state_check_result, - check_state_compute_run, /* parity repair */ - check_state_compute_result, -}; - -/** - * enum reconstruct_states - handles writing or expanding a stripe - */ -enum reconstruct_states { - reconstruct_state_idle = 0, - reconstruct_state_prexor_drain_run, /* prexor-write */ - reconstruct_state_drain_run, /* write */ - reconstruct_state_run, /* expand */ - reconstruct_state_prexor_drain_result, - reconstruct_state_drain_result, - reconstruct_state_result, -}; - -struct stripe_head { - struct hlist_node hash; - struct list_head lru; /* inactive_list or handle_list */ - struct raid5_private_data *raid_conf; - sector_t sector; /* sector of this row */ - int pd_idx; /* parity disk index */ - unsigned long state; /* state flags */ - atomic_t count; /* nr of active thread/requests */ - spinlock_t lock; - int bm_seq; /* sequence number for bitmap flushes */ - int disks; /* disks in stripe */ - enum check_states check_state; - enum reconstruct_states reconstruct_state; - /* stripe_operations - * @target - STRIPE_OP_COMPUTE_BLK target - */ - struct stripe_operations { - int target; - u32 zero_sum_result; - } ops; - struct r5dev { - struct bio req; - struct bio_vec vec; - struct page *page; - struct bio *toread, *read, *towrite, *written; - sector_t sector; /* sector of this page */ - unsigned long flags; - } dev[1]; /* allocated with extra space depending of RAID geometry */ -}; - -/* stripe_head_state - collects and tracks the dynamic state of a stripe_head - * for handle_stripe. 
It is only valid under spin_lock(sh->lock); - */ -struct stripe_head_state { - int syncing, expanding, expanded; - int locked, uptodate, to_read, to_write, failed, written; - int to_fill, compute, req_compute, non_overwrite; - int failed_num; - unsigned long ops_request; -}; - -/* r6_state - extra state data only relevant to r6 */ -struct r6_state { - int p_failed, q_failed, qd_idx, failed_num[2]; -}; - -/* Flags */ -#define R5_UPTODATE 0 /* page contains current data */ -#define R5_LOCKED 1 /* IO has been submitted on "req" */ -#define R5_OVERWRITE 2 /* towrite covers whole page */ -/* and some that are internal to handle_stripe */ -#define R5_Insync 3 /* rdev && rdev->in_sync at start */ -#define R5_Wantread 4 /* want to schedule a read */ -#define R5_Wantwrite 5 -#define R5_Overlap 7 /* There is a pending overlapping request on this block */ -#define R5_ReadError 8 /* seen a read error here recently */ -#define R5_ReWrite 9 /* have tried to over-write the readerror */ - -#define R5_Expanded 10 /* This block now has post-expand data */ -#define R5_Wantcompute 11 /* compute_block in progress treat as - * uptodate - */ -#define R5_Wantfill 12 /* dev->toread contains a bio that needs - * filling - */ -#define R5_Wantdrain 13 /* dev->towrite needs to be drained */ -/* - * Write method - */ -#define RECONSTRUCT_WRITE 1 -#define READ_MODIFY_WRITE 2 -/* not a write method, but a compute_parity mode */ -#define CHECK_PARITY 3 - -/* - * Stripe state - */ -#define STRIPE_HANDLE 2 -#define STRIPE_SYNCING 3 -#define STRIPE_INSYNC 4 -#define STRIPE_PREREAD_ACTIVE 5 -#define STRIPE_DELAYED 6 -#define STRIPE_DEGRADED 7 -#define STRIPE_BIT_DELAY 8 -#define STRIPE_EXPANDING 9 -#define STRIPE_EXPAND_SOURCE 10 -#define STRIPE_EXPAND_READY 11 -#define STRIPE_IO_STARTED 12 /* do not count towards 'bypass_count' */ -#define STRIPE_FULL_WRITE 13 /* all blocks are set to be overwritten */ -#define STRIPE_BIOFILL_RUN 14 -#define STRIPE_COMPUTE_RUN 15 -/* - * Operation request flags - */ -#define STRIPE_OP_BIOFILL 0 -#define STRIPE_OP_COMPUTE_BLK 1 -#define STRIPE_OP_PREXOR 2 -#define STRIPE_OP_BIODRAIN 3 -#define STRIPE_OP_POSTXOR 4 -#define STRIPE_OP_CHECK 5 - -/* - * Plugging: - * - * To improve write throughput, we need to delay the handling of some - * stripes until there has been a chance that several write requests - * for the one stripe have all been collected. - * In particular, any write request that would require pre-reading - * is put on a "delayed" queue until there are no stripes currently - * in a pre-read phase. Further, if the "delayed" queue is empty when - * a stripe is put on it then we "plug" the queue and do not process it - * until an unplug call is made. (the unplug_io_fn() is called). - * - * When preread is initiated on a stripe, we set PREREAD_ACTIVE and add - * it to the count of prereading stripes. - * When write is initiated, or the stripe refcnt == 0 (just in case) we - * clear the PREREAD_ACTIVE flag and decrement the count - * Whenever the 'handle' queue is empty and the device is not plugged, we - * move any strips from delayed to handle and clear the DELAYED flag and set - * PREREAD_ACTIVE. - * In stripe_handle, if we find pre-reading is necessary, we do it if - * PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue. - * HANDLE gets cleared if stripe_handle leave nothing locked. 
- */ - - -struct disk_info { - mdk_rdev_t *rdev; -}; - -struct raid5_private_data { - struct hlist_head *stripe_hashtbl; - mddev_t *mddev; - struct disk_info *spare; - int chunk_size, level, algorithm; - int max_degraded; - int raid_disks; - int max_nr_stripes; - - /* used during an expand */ - sector_t expand_progress; /* MaxSector when no expand happening */ - sector_t expand_lo; /* from here up to expand_progress it out-of-bounds - * as we haven't flushed the metadata yet - */ - int previous_raid_disks; - - struct list_head handle_list; /* stripes needing handling */ - struct list_head hold_list; /* preread ready stripes */ - struct list_head delayed_list; /* stripes that have plugged requests */ - struct list_head bitmap_list; /* stripes delaying awaiting bitmap update */ - struct bio *retry_read_aligned; /* currently retrying aligned bios */ - struct bio *retry_read_aligned_list; /* aligned bios retry list */ - atomic_t preread_active_stripes; /* stripes with scheduled io */ - atomic_t active_aligned_reads; - atomic_t pending_full_writes; /* full write backlog */ - int bypass_count; /* bypassed prereads */ - int bypass_threshold; /* preread nice */ - struct list_head *last_hold; /* detect hold_list promotions */ - - atomic_t reshape_stripes; /* stripes with pending writes for reshape */ - /* unfortunately we need two cache names as we temporarily have - * two caches. - */ - int active_name; - char cache_name[2][20]; - struct kmem_cache *slab_cache; /* for allocating stripes */ - - int seq_flush, seq_write; - int quiesce; - - int fullsync; /* set to 1 if a full sync is needed, - * (fresh device added). - * Cleared when a sync completes. - */ - - struct page *spare_page; /* Used when checking P/Q in raid6 */ - - /* - * Free stripes pool - */ - atomic_t active_stripes; - struct list_head inactive_list; - wait_queue_head_t wait_for_stripe; - wait_queue_head_t wait_for_overlap; - int inactive_blocked; /* release of inactive stripes blocked, - * waiting for 25% to be free - */ - int pool_size; /* number of disks in stripeheads in pool */ - spinlock_t device_lock; - struct disk_info *disks; -}; - -typedef struct raid5_private_data raid5_conf_t; - -#define mddev_to_conf(mddev) ((raid5_conf_t *) mddev->private) - -/* - * Our supported algorithms - */ -#define ALGORITHM_LEFT_ASYMMETRIC 0 -#define ALGORITHM_RIGHT_ASYMMETRIC 1 -#define ALGORITHM_LEFT_SYMMETRIC 2 -#define ALGORITHM_RIGHT_SYMMETRIC 3 - -#endif -- cgit v1.2.3 From 8b2b5c217c20b5460218ab8731295f2e46c7dd29 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:27:03 +1100 Subject: md: move LEVEL_* definition from md_k.h to md_u.h .. as they are part of the user-space interface. Also move MdpMinorShift into there so we can remove duplication. Lastly move mdp_major in. It is less obviously part of the user-space interface, but do_mounts_md.c uses it, and it is acting a bit like user-space. Signed-off-by: NeilBrown --- drivers/md/md.c | 3 --- include/linux/raid/md.h | 2 -- include/linux/raid/md_k.h | 10 ---------- include/linux/raid/md_u.h | 17 +++++++++++++++++ init/do_mounts_md.c | 2 -- 5 files changed, 17 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/md.c b/drivers/md/md.c index 9a3214c8585f..96336b050b59 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -46,9 +46,6 @@ #include #include "bitmap.h" -/* 63 partitions with the alternate major number (mdp) */ -#define MdpMinorShift 6 - #define DEBUG 0 #define dprintk(x...) 
((void)(DEBUG && printk(x))) diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h index 82bea14cae1a..8bfaf6b1d309 100644 --- a/include/linux/raid/md.h +++ b/include/linux/raid/md.h @@ -52,8 +52,6 @@ */ #define MD_PATCHLEVEL_VERSION 3 -extern int mdp_major; - extern int register_md_personality(struct mdk_personality *p); extern int unregister_md_personality(struct mdk_personality *p); extern mdk_thread_t * md_register_thread(void (*run) (mddev_t *mddev), diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 4aedb9fe2bd8..758ec2842d9a 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -20,16 +20,6 @@ #ifdef CONFIG_BLOCK -#define LEVEL_MULTIPATH (-4) -#define LEVEL_LINEAR (-1) -#define LEVEL_FAULTY (-5) - -/* we need a value for 'no level specified' and 0 - * means 'raid0', so we need something else. This is - * for internal use only - */ -#define LEVEL_NONE (-1000000) - #define MaxSector (~(sector_t)0) typedef struct mddev_s mddev_t; diff --git a/include/linux/raid/md_u.h b/include/linux/raid/md_u.h index 7192035fc4b0..2f824aa889f3 100644 --- a/include/linux/raid/md_u.h +++ b/include/linux/raid/md_u.h @@ -46,6 +46,12 @@ #define STOP_ARRAY_RO _IO (MD_MAJOR, 0x33) #define RESTART_ARRAY_RW _IO (MD_MAJOR, 0x34) +/* 63 partitions with the alternate major number (mdp) */ +#define MdpMinorShift 6 +#ifdef __KERNEL__ +extern int mdp_major; +#endif + typedef struct mdu_version_s { int major; int minor; @@ -85,6 +91,17 @@ typedef struct mdu_array_info_s { } mdu_array_info_t; +/* non-obvious values for 'level' */ +#define LEVEL_MULTIPATH (-4) +#define LEVEL_LINEAR (-1) +#define LEVEL_FAULTY (-5) + +/* we need a value for 'no level specified' and 0 + * means 'raid0', so we need something else. This is + * for internal use only + */ +#define LEVEL_NONE (-1000000) + typedef struct mdu_disk_info_s { /* * configuration/status of one particular disk diff --git a/init/do_mounts_md.c b/init/do_mounts_md.c index 9bdddbcb3d6a..23a15fb57e15 100644 --- a/init/do_mounts_md.c +++ b/init/do_mounts_md.c @@ -112,8 +112,6 @@ static int __init md_setup(char *str) return 1; } -#define MdpMinorShift 6 - static void __init md_setup_drive(void) { int minor, i, ent, partitioned; -- cgit v1.2.3 From 92022950c6b1bb3da90b2976b20271cdfd98b8a3 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:33:13 +1100 Subject: md: move most content from md.h to md_k.h The extern function definitions are kernel-internal definitions, so they belong in md_k.h The MD_*_VERSION values could reasonably go in a number of places, but md_u.h seems most reasonable. This leaves almost nothing in md.h. It will go soon. Signed-off-by: NeilBrown --- include/linux/raid/md.h | 40 ---------------------------------------- include/linux/raid/md_k.h | 22 ++++++++++++++++++++++ include/linux/raid/md_u.h | 18 ++++++++++++++++++ 3 files changed, 40 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h index 8bfaf6b1d309..71c4fd19c317 100644 --- a/include/linux/raid/md.h +++ b/include/linux/raid/md.h @@ -34,46 +34,6 @@ #ifdef CONFIG_MD -/* - * Different major versions are not compatible. - * Different minor versions are only downward compatible. - * Different patchlevel versions are downward and upward compatible. - */ -#define MD_MAJOR_VERSION 0 -#define MD_MINOR_VERSION 90 -/* - * MD_PATCHLEVEL_VERSION indicates kernel functionality. 
- * >=1 means different superblock formats are selectable using SET_ARRAY_INFO - * and major_version/minor_version accordingly - * >=2 means that Internal bitmaps are supported by setting MD_SB_BITMAP_PRESENT - * in the super status byte - * >=3 means that bitmap superblock version 4 is supported, which uses - * little-ending representation rather than host-endian - */ -#define MD_PATCHLEVEL_VERSION 3 - -extern int register_md_personality(struct mdk_personality *p); -extern int unregister_md_personality(struct mdk_personality *p); -extern mdk_thread_t * md_register_thread(void (*run) (mddev_t *mddev), - mddev_t *mddev, const char *name); -extern void md_unregister_thread(mdk_thread_t *thread); -extern void md_wakeup_thread(mdk_thread_t *thread); -extern void md_check_recovery(mddev_t *mddev); -extern void md_write_start(mddev_t *mddev, struct bio *bi); -extern void md_write_end(mddev_t *mddev); -extern void md_done_sync(mddev_t *mddev, int blocks, int ok); -extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev); - -extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, - sector_t sector, int size, struct page *page); -extern void md_super_wait(mddev_t *mddev); -extern int sync_page_io(struct block_device *bdev, sector_t sector, int size, - struct page *page, int rw); -extern void md_do_sync(mddev_t *mddev); -extern void md_new_event(mddev_t *mddev); -extern int md_allow_write(mddev_t *mddev); -extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev); - #endif /* CONFIG_MD */ #endif diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 758ec2842d9a..4c5e2d00ff5e 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -390,3 +390,25 @@ static inline void safe_put_page(struct page *p) #endif /* CONFIG_BLOCK */ #endif + +extern int register_md_personality(struct mdk_personality *p); +extern int unregister_md_personality(struct mdk_personality *p); +extern mdk_thread_t * md_register_thread(void (*run) (mddev_t *mddev), + mddev_t *mddev, const char *name); +extern void md_unregister_thread(mdk_thread_t *thread); +extern void md_wakeup_thread(mdk_thread_t *thread); +extern void md_check_recovery(mddev_t *mddev); +extern void md_write_start(mddev_t *mddev, struct bio *bi); +extern void md_write_end(mddev_t *mddev); +extern void md_done_sync(mddev_t *mddev, int blocks, int ok); +extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev); + +extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, + sector_t sector, int size, struct page *page); +extern void md_super_wait(mddev_t *mddev); +extern int sync_page_io(struct block_device *bdev, sector_t sector, int size, + struct page *page, int rw); +extern void md_do_sync(mddev_t *mddev); +extern void md_new_event(mddev_t *mddev); +extern int md_allow_write(mddev_t *mddev); +extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev); diff --git a/include/linux/raid/md_u.h b/include/linux/raid/md_u.h index 2f824aa889f3..fb1abb3367e9 100644 --- a/include/linux/raid/md_u.h +++ b/include/linux/raid/md_u.h @@ -15,6 +15,24 @@ #ifndef _MD_U_H #define _MD_U_H +/* + * Different major versions are not compatible. + * Different minor versions are only downward compatible. + * Different patchlevel versions are downward and upward compatible. + */ +#define MD_MAJOR_VERSION 0 +#define MD_MINOR_VERSION 90 +/* + * MD_PATCHLEVEL_VERSION indicates kernel functionality. 
+ * >=1 means different superblock formats are selectable using SET_ARRAY_INFO + * and major_version/minor_version accordingly + * >=2 means that Internal bitmaps are supported by setting MD_SB_BITMAP_PRESENT + * in the super status byte + * >=3 means that bitmap superblock version 4 is supported, which uses + * little-ending representation rather than host-endian + */ +#define MD_PATCHLEVEL_VERSION 3 + /* ioctls */ /* status */ -- cgit v1.2.3 From bff61975b3d6c18ee31457cc5b4d73042f44915f Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:33:13 +1100 Subject: md: move lots of #include lines out of .h files and into .c This makes the includes more explicit, and is preparation for moving md_k.h to drivers/md/md.h Remove include/raid/md.h as its only remaining use was to #include other files. Signed-off-by: NeilBrown --- crypto/xor.c | 2 +- drivers/md/bitmap.c | 3 ++- drivers/md/faulty.c | 5 ++++- drivers/md/linear.c | 4 ++++ drivers/md/linear.h | 2 -- drivers/md/md.c | 6 +++++- drivers/md/multipath.c | 4 ++++ drivers/md/multipath.h | 2 -- drivers/md/raid0.c | 3 +++ drivers/md/raid0.h | 2 -- drivers/md/raid1.c | 5 ++++- drivers/md/raid1.h | 2 -- drivers/md/raid10.c | 5 ++++- drivers/md/raid10.h | 2 -- drivers/md/raid5.c | 8 ++++++-- drivers/md/raid5.h | 1 - drivers/md/raid6.h | 6 +----- fs/compat_ioctl.c | 2 +- include/linux/raid/md.h | 39 --------------------------------------- include/linux/raid/md_k.h | 3 --- include/linux/raid/xor.h | 2 -- init/do_mounts.h | 1 + init/do_mounts_md.c | 3 ++- 23 files changed, 42 insertions(+), 70 deletions(-) delete mode 100644 include/linux/raid/md.h (limited to 'include/linux') diff --git a/crypto/xor.c b/crypto/xor.c index b2e6db075e49..996b6ee57d9e 100644 --- a/crypto/xor.c +++ b/crypto/xor.c @@ -18,8 +18,8 @@ #define BH_TRACE 0 #include -#include #include +#include #include /* The xor routines to use. */ diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 7666117738c7..1df012e9d73d 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -16,6 +16,7 @@ * wait if count gets too high, wake when it drops to half. */ +#include #include #include #include @@ -26,7 +27,7 @@ #include #include #include -#include +#include #include "bitmap.h" /* debug macros */ diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index 86d9adf90e79..cc5d2cf08dfc 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -62,7 +62,10 @@ #define ModeShift 5 #define MaxFault 50 -#include +#include +#include +#include +#include static void faulty_fail(struct bio *bio, int error) diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 3603ffa9edc5..c43c3b60ef09 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -16,6 +16,10 @@ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ +#include +#include +#include +#include #include "linear.h" /* diff --git a/drivers/md/linear.h b/drivers/md/linear.h index f38b9c586afb..bf8179587f95 100644 --- a/drivers/md/linear.h +++ b/drivers/md/linear.h @@ -1,8 +1,6 @@ #ifndef _LINEAR_H #define _LINEAR_H -#include - struct dev_info { mdk_rdev_t *rdev; sector_t num_sectors; diff --git a/drivers/md/md.c b/drivers/md/md.c index 96336b050b59..11d6e0e1045a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -33,8 +33,9 @@ */ #include -#include +#include #include +#include #include /* for invalidate_bdev */ #include #include @@ -44,6 +45,9 @@ #include #include #include +#include +#include +#include #include "bitmap.h" #define DEBUG 0 diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 547df09a7af3..148b3cd058bf 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -19,6 +19,10 @@ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ +#include +#include +#include +#include #include "multipath.h" #define MAX_WORK_PER_DISK 128 diff --git a/drivers/md/multipath.h b/drivers/md/multipath.h index 6f53fc177a47..6fa70b400cda 100644 --- a/drivers/md/multipath.h +++ b/drivers/md/multipath.h @@ -1,8 +1,6 @@ #ifndef _MULTIPATH_H #define _MULTIPATH_H -#include - struct multipath_info { mdk_rdev_t *rdev; }; diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index ef09ed04864e..64e4c77a1568 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -18,6 +18,9 @@ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ +#include +#include +#include #include "raid0.h" static void raid0_unplug(struct request_queue *q) diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h index fd42aa87c391..824b12eb1d4f 100644 --- a/drivers/md/raid0.h +++ b/drivers/md/raid0.h @@ -1,8 +1,6 @@ #ifndef _RAID0_H #define _RAID0_H -#include - struct strip_zone { sector_t zone_start; /* Zone offset in md_dev (in sectors) */ diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index bff32285f8bb..253b09c86eca 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -31,8 +31,11 @@ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#include "dm-bio-list.h" #include +#include +#include +#include +#include "dm-bio-list.h" #include "raid1.h" #include "bitmap.h" diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index 0a9ba7c3302e..1620eea3d57c 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -1,8 +1,6 @@ #ifndef _RAID1_H #define _RAID1_H -#include - typedef struct mirror_info mirror_info_t; struct mirror_info { diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index f03dd70d12a5..186e1b199d46 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -18,8 +18,11 @@ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#include "dm-bio-list.h" #include +#include +#include +#include +#include "dm-bio-list.h" #include "raid10.h" #include "bitmap.h" diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index e9091cfeb286..244dbe507a54 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -1,8 +1,6 @@ #ifndef _RAID10_H #define _RAID10_H -#include - typedef struct mirror_info mirror_info_t; struct mirror_info { diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index f75698b1f63d..816157e7d8e0 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -43,8 +43,12 @@ * miss any bits. 
*/ +#include +#include #include #include +#include +#include "raid5.h" #include "raid6.h" #include "bitmap.h" @@ -1467,7 +1471,7 @@ static void copy_data(int frombio, struct bio *bio, static void compute_parity6(struct stripe_head *sh, int method) { - raid6_conf_t *conf = sh->raid_conf; + raid5_conf_t *conf = sh->raid_conf; int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = sh->disks, count; struct bio *chosen; /**** FIX THIS: This could be very bad if disks is close to 256 ****/ @@ -2795,7 +2799,7 @@ static bool handle_stripe5(struct stripe_head *sh) static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) { - raid6_conf_t *conf = sh->raid_conf; + raid5_conf_t *conf = sh->raid_conf; int disks = sh->disks; struct bio *return_bi = NULL; int i, pd_idx = sh->pd_idx; diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 40f1d0335c74..0ed22dff56e0 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -1,7 +1,6 @@ #ifndef _RAID5_H #define _RAID5_H -#include #include /* diff --git a/drivers/md/raid6.h b/drivers/md/raid6.h index f6c13af65002..66e6b0c6734f 100644 --- a/drivers/md/raid6.h +++ b/drivers/md/raid6.h @@ -17,11 +17,7 @@ /* Set to 1 to use kernel-wide empty_zero_page */ #define RAID6_USE_EMPTY_ZERO_PAGE 0 - -#include -#include "raid5.h" - -typedef raid5_conf_t raid6_conf_t; /* Same configuration */ +#include /* Additional compute_parity mode -- updates the parity w/o LOCKING */ #define UPDATE_PARITY 4 diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 45e59d3c7f1f..141c03829153 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h deleted file mode 100644 index 71c4fd19c317..000000000000 --- a/include/linux/raid/md.h +++ /dev/null @@ -1,39 +0,0 @@ -/* - md.h : Multiple Devices driver for Linux - Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman - Copyright (C) 1994-96 Marc ZYNGIER - or - - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2, or (at your option) - any later version. - - You should have received a copy of the GNU General Public License - (for example /usr/src/linux/COPYING); if not, write to the Free - Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. -*/ - -#ifndef _MD_H -#define _MD_H - -#include -#include - -/* - * 'md_p.h' holds the 'physical' layout of RAID devices - * 'md_u.h' holds the user <=> kernel API - * - * 'md_k.h' holds kernel internal definitions - */ - -#include -#include -#include - -#ifdef CONFIG_MD - -#endif /* CONFIG_MD */ -#endif - diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 4c5e2d00ff5e..e78b3c1d55fd 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -15,9 +15,6 @@ #ifndef _MD_K_H #define _MD_K_H -/* and dm-bio-list.h is not under include/linux because.... ??? 
*/ -#include "../../../drivers/md/dm-bio-list.h" - #ifdef CONFIG_BLOCK #define MaxSector (~(sector_t)0) diff --git a/include/linux/raid/xor.h b/include/linux/raid/xor.h index 3e120587eada..5a210959e3f8 100644 --- a/include/linux/raid/xor.h +++ b/include/linux/raid/xor.h @@ -1,8 +1,6 @@ #ifndef _XOR_H #define _XOR_H -#include - #define MAX_XOR_BLOCKS 4 extern void xor_blocks(unsigned int count, unsigned int bytes, diff --git a/init/do_mounts.h b/init/do_mounts.h index 9aa968d54329..f5b978a9bb92 100644 --- a/init/do_mounts.h +++ b/init/do_mounts.h @@ -1,4 +1,5 @@ #include +#include #include #include #include diff --git a/init/do_mounts_md.c b/init/do_mounts_md.c index 23a15fb57e15..69aebbf8fd2d 100644 --- a/init/do_mounts_md.c +++ b/init/do_mounts_md.c @@ -1,5 +1,6 @@ #include -#include +#include +#include #include "do_mounts.h" -- cgit v1.2.3 From 43b2e5d86d8bdd77386226db0bc961529492c043 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:33:13 +1100 Subject: md: move md_k.h from include/linux/raid/ to drivers/md/ It really is nicer to keep related code together.. Signed-off-by: NeilBrown --- drivers/md/bitmap.c | 2 +- drivers/md/faulty.c | 2 +- drivers/md/linear.c | 2 +- drivers/md/md.c | 2 +- drivers/md/md.h | 411 ++++++++++++++++++++++++++++++++++++++++++++++ drivers/md/multipath.c | 2 +- drivers/md/raid0.c | 2 +- drivers/md/raid1.c | 2 +- drivers/md/raid10.c | 2 +- drivers/md/raid5.c | 2 +- include/linux/raid/md_k.h | 411 ---------------------------------------------- 11 files changed, 420 insertions(+), 420 deletions(-) create mode 100644 drivers/md/md.h delete mode 100644 include/linux/raid/md_k.h (limited to 'include/linux') diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 1df012e9d73d..623292a5473e 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -27,7 +27,7 @@ #include #include #include -#include +#include "md.h" #include "bitmap.h" /* debug macros */ diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index cc5d2cf08dfc..7b66b9fca29d 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -64,7 +64,7 @@ #define MaxFault 50 #include #include -#include +#include "md.h" #include diff --git a/drivers/md/linear.c b/drivers/md/linear.c index c43c3b60ef09..f2488343ed4a 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -18,8 +18,8 @@ #include #include -#include #include +#include "md.h" #include "linear.h" /* diff --git a/drivers/md/md.c b/drivers/md/md.c index 11d6e0e1045a..aad0ac54bf90 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -45,9 +45,9 @@ #include #include #include -#include #include #include +#include "md.h" #include "bitmap.h" #define DEBUG 0 diff --git a/drivers/md/md.h b/drivers/md/md.h new file mode 100644 index 000000000000..e78b3c1d55fd --- /dev/null +++ b/drivers/md/md.h @@ -0,0 +1,411 @@ +/* + md_k.h : kernel internal structure of the Linux MD driver + Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+*/ + +#ifndef _MD_K_H +#define _MD_K_H + +#ifdef CONFIG_BLOCK + +#define MaxSector (~(sector_t)0) + +typedef struct mddev_s mddev_t; +typedef struct mdk_rdev_s mdk_rdev_t; + +/* + * options passed in raidrun: + */ + +/* Currently this must fit in an 'int' */ +#define MAX_CHUNK_SIZE (1<<30) + +/* + * MD's 'extended' device + */ +struct mdk_rdev_s +{ + struct list_head same_set; /* RAID devices within the same set */ + + sector_t size; /* Device size (in blocks) */ + mddev_t *mddev; /* RAID array if running */ + int last_events; /* IO event timestamp */ + + struct block_device *bdev; /* block device handle */ + + struct page *sb_page; + int sb_loaded; + __u64 sb_events; + sector_t data_offset; /* start of data in array */ + sector_t sb_start; /* offset of the super block (in 512byte sectors) */ + int sb_size; /* bytes in the superblock */ + int preferred_minor; /* autorun support */ + + struct kobject kobj; + + /* A device can be in one of three states based on two flags: + * Not working: faulty==1 in_sync==0 + * Fully working: faulty==0 in_sync==1 + * Working, but not + * in sync with array + * faulty==0 in_sync==0 + * + * It can never have faulty==1, in_sync==1 + * This reduces the burden of testing multiple flags in many cases + */ + + unsigned long flags; +#define Faulty 1 /* device is known to have a fault */ +#define In_sync 2 /* device is in_sync with rest of array */ +#define WriteMostly 4 /* Avoid reading if at all possible */ +#define BarriersNotsupp 5 /* BIO_RW_BARRIER is not supported */ +#define AllReserved 6 /* If whole device is reserved for + * one array */ +#define AutoDetected 7 /* added by auto-detect */ +#define Blocked 8 /* An error occured on an externally + * managed array, don't allow writes + * until it is cleared */ +#define StateChanged 9 /* Faulty or Blocked has changed during + * interrupt, so it needs to be + * notified by the thread */ + wait_queue_head_t blocked_wait; + + int desc_nr; /* descriptor index in the superblock */ + int raid_disk; /* role of device in array */ + int saved_raid_disk; /* role that device used to have in the + * array and could again if we did a partial + * resync from the bitmap + */ + sector_t recovery_offset;/* If this device has been partially + * recovered, this is where we were + * up to. + */ + + atomic_t nr_pending; /* number of pending requests. + * only maintained for arrays that + * support hot removal + */ + atomic_t read_errors; /* number of consecutive read errors that + * we have tried to ignore. + */ + atomic_t corrected_errors; /* number of corrected read errors, + * for reporting to userspace and storing + * in superblock. 
+ */ + struct work_struct del_work; /* used for delayed sysfs removal */ + + struct sysfs_dirent *sysfs_state; /* handle for 'state' + * sysfs entry */ +}; + +struct mddev_s +{ + void *private; + struct mdk_personality *pers; + dev_t unit; + int md_minor; + struct list_head disks; + unsigned long flags; +#define MD_CHANGE_DEVS 0 /* Some device status has changed */ +#define MD_CHANGE_CLEAN 1 /* transition to or from 'clean' */ +#define MD_CHANGE_PENDING 2 /* superblock update in progress */ + + int ro; + + struct gendisk *gendisk; + + struct kobject kobj; + int hold_active; +#define UNTIL_IOCTL 1 +#define UNTIL_STOP 2 + + /* Superblock information */ + int major_version, + minor_version, + patch_version; + int persistent; + int external; /* metadata is + * managed externally */ + char metadata_type[17]; /* externally set*/ + int chunk_size; + time_t ctime, utime; + int level, layout; + char clevel[16]; + int raid_disks; + int max_disks; + sector_t size; /* used size of component devices */ + sector_t array_sectors; /* exported array size */ + __u64 events; + + char uuid[16]; + + /* If the array is being reshaped, we need to record the + * new shape and an indication of where we are up to. + * This is written to the superblock. + * If reshape_position is MaxSector, then no reshape is happening (yet). + */ + sector_t reshape_position; + int delta_disks, new_level, new_layout, new_chunk; + + struct mdk_thread_s *thread; /* management thread */ + struct mdk_thread_s *sync_thread; /* doing resync or reconstruct */ + sector_t curr_resync; /* last block scheduled */ + unsigned long resync_mark; /* a recent timestamp */ + sector_t resync_mark_cnt;/* blocks written at resync_mark */ + sector_t curr_mark_cnt; /* blocks scheduled now */ + + sector_t resync_max_sectors; /* may be set by personality */ + + sector_t resync_mismatches; /* count of sectors where + * parity/replica mismatch found + */ + + /* allow user-space to request suspension of IO to regions of the array */ + sector_t suspend_lo; + sector_t suspend_hi; + /* if zero, use the system-wide default */ + int sync_speed_min; + int sync_speed_max; + + /* resync even though the same disks are shared among md-devices */ + int parallel_resync; + + int ok_start_degraded; + /* recovery/resync flags + * NEEDED: we might need to start a resync/recover + * RUNNING: a thread is running, or about to be started + * SYNC: actually doing a resync, not a recovery + * RECOVER: doing recovery, or need to try it. + * INTR: resync needs to be aborted for some reason + * DONE: thread is done and is waiting to be reaped + * REQUEST: user-space has requested a sync (used with SYNC) + * CHECK: user-space request for for check-only, no repair + * RESHAPE: A reshape is happening + * + * If neither SYNC or RESHAPE are set, then it is a recovery. 
+ */ +#define MD_RECOVERY_RUNNING 0 +#define MD_RECOVERY_SYNC 1 +#define MD_RECOVERY_RECOVER 2 +#define MD_RECOVERY_INTR 3 +#define MD_RECOVERY_DONE 4 +#define MD_RECOVERY_NEEDED 5 +#define MD_RECOVERY_REQUESTED 6 +#define MD_RECOVERY_CHECK 7 +#define MD_RECOVERY_RESHAPE 8 +#define MD_RECOVERY_FROZEN 9 + + unsigned long recovery; + int recovery_disabled; /* if we detect that recovery + * will always fail, set this + * so we don't loop trying */ + + int in_sync; /* know to not need resync */ + struct mutex reconfig_mutex; + atomic_t active; /* general refcount */ + atomic_t openers; /* number of active opens */ + + int changed; /* true if we might need to reread partition info */ + int degraded; /* whether md should consider + * adding a spare + */ + int barriers_work; /* initialised to true, cleared as soon + * as a barrier request to slave + * fails. Only supported + */ + struct bio *biolist; /* bios that need to be retried + * because BIO_RW_BARRIER is not supported + */ + + atomic_t recovery_active; /* blocks scheduled, but not written */ + wait_queue_head_t recovery_wait; + sector_t recovery_cp; + sector_t resync_min; /* user requested sync + * starts here */ + sector_t resync_max; /* resync should pause + * when it gets here */ + + struct sysfs_dirent *sysfs_state; /* handle for 'array_state' + * file in sysfs. + */ + struct sysfs_dirent *sysfs_action; /* handle for 'sync_action' */ + + struct work_struct del_work; /* used for delayed sysfs removal */ + + spinlock_t write_lock; + wait_queue_head_t sb_wait; /* for waiting on superblock updates */ + atomic_t pending_writes; /* number of active superblock writes */ + + unsigned int safemode; /* if set, update "clean" superblock + * when no writes pending. + */ + unsigned int safemode_delay; + struct timer_list safemode_timer; + atomic_t writes_pending; + struct request_queue *queue; /* for plugging ... */ + + atomic_t write_behind; /* outstanding async IO */ + unsigned int max_write_behind; /* 0 = sync */ + + struct bitmap *bitmap; /* the bitmap for the device */ + struct file *bitmap_file; /* the bitmap file */ + long bitmap_offset; /* offset from superblock of + * start of bitmap. May be + * negative, but not '0' + */ + long default_bitmap_offset; /* this is the offset to use when + * hot-adding a bitmap. It should + * eventually be settable by sysfs. 
+ */ + + struct list_head all_mddevs; +}; + + +static inline void rdev_dec_pending(mdk_rdev_t *rdev, mddev_t *mddev) +{ + int faulty = test_bit(Faulty, &rdev->flags); + if (atomic_dec_and_test(&rdev->nr_pending) && faulty) + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); +} + +static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors) +{ + atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io); +} + +struct mdk_personality +{ + char *name; + int level; + struct list_head list; + struct module *owner; + int (*make_request)(struct request_queue *q, struct bio *bio); + int (*run)(mddev_t *mddev); + int (*stop)(mddev_t *mddev); + void (*status)(struct seq_file *seq, mddev_t *mddev); + /* error_handler must set ->faulty and clear ->in_sync + * if appropriate, and should abort recovery if needed + */ + void (*error_handler)(mddev_t *mddev, mdk_rdev_t *rdev); + int (*hot_add_disk) (mddev_t *mddev, mdk_rdev_t *rdev); + int (*hot_remove_disk) (mddev_t *mddev, int number); + int (*spare_active) (mddev_t *mddev); + sector_t (*sync_request)(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster); + int (*resize) (mddev_t *mddev, sector_t sectors); + int (*check_reshape) (mddev_t *mddev); + int (*start_reshape) (mddev_t *mddev); + int (*reconfig) (mddev_t *mddev, int layout, int chunk_size); + /* quiesce moves between quiescence states + * 0 - fully active + * 1 - no new requests allowed + * others - reserved + */ + void (*quiesce) (mddev_t *mddev, int state); +}; + + +struct md_sysfs_entry { + struct attribute attr; + ssize_t (*show)(mddev_t *, char *); + ssize_t (*store)(mddev_t *, const char *, size_t); +}; + + +static inline char * mdname (mddev_t * mddev) +{ + return mddev->gendisk ? mddev->gendisk->disk_name : "mdX"; +} + +/* + * iterates through some rdev ringlist. It's safe to remove the + * current 'rdev'. Dont touch 'tmp' though. 
+ */ +#define rdev_for_each_list(rdev, tmp, head) \ + list_for_each_entry_safe(rdev, tmp, head, same_set) + +/* + * iterates through the 'same array disks' ringlist + */ +#define rdev_for_each(rdev, tmp, mddev) \ + list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set) + +#define rdev_for_each_rcu(rdev, mddev) \ + list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set) + +typedef struct mdk_thread_s { + void (*run) (mddev_t *mddev); + mddev_t *mddev; + wait_queue_head_t wqueue; + unsigned long flags; + struct task_struct *tsk; + unsigned long timeout; +} mdk_thread_t; + +#define THREAD_WAKEUP 0 + +#define __wait_event_lock_irq(wq, condition, lock, cmd) \ +do { \ + wait_queue_t __wait; \ + init_waitqueue_entry(&__wait, current); \ + \ + add_wait_queue(&wq, &__wait); \ + for (;;) { \ + set_current_state(TASK_UNINTERRUPTIBLE); \ + if (condition) \ + break; \ + spin_unlock_irq(&lock); \ + cmd; \ + schedule(); \ + spin_lock_irq(&lock); \ + } \ + current->state = TASK_RUNNING; \ + remove_wait_queue(&wq, &__wait); \ +} while (0) + +#define wait_event_lock_irq(wq, condition, lock, cmd) \ +do { \ + if (condition) \ + break; \ + __wait_event_lock_irq(wq, condition, lock, cmd); \ +} while (0) + +static inline void safe_put_page(struct page *p) +{ + if (p) put_page(p); +} + +#endif /* CONFIG_BLOCK */ +#endif + + +extern int register_md_personality(struct mdk_personality *p); +extern int unregister_md_personality(struct mdk_personality *p); +extern mdk_thread_t * md_register_thread(void (*run) (mddev_t *mddev), + mddev_t *mddev, const char *name); +extern void md_unregister_thread(mdk_thread_t *thread); +extern void md_wakeup_thread(mdk_thread_t *thread); +extern void md_check_recovery(mddev_t *mddev); +extern void md_write_start(mddev_t *mddev, struct bio *bi); +extern void md_write_end(mddev_t *mddev); +extern void md_done_sync(mddev_t *mddev, int blocks, int ok); +extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev); + +extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, + sector_t sector, int size, struct page *page); +extern void md_super_wait(mddev_t *mddev); +extern int sync_page_io(struct block_device *bdev, sector_t sector, int size, + struct page *page, int rw); +extern void md_do_sync(mddev_t *mddev); +extern void md_new_event(mddev_t *mddev); +extern int md_allow_write(mddev_t *mddev); +extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev); diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 148b3cd058bf..0ed1005afb58 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -21,8 +21,8 @@ #include #include -#include #include +#include "md.h" #include "multipath.h" #define MAX_WORK_PER_DISK 128 diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 64e4c77a1568..3d06df86da87 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -19,8 +19,8 @@ */ #include -#include #include +#include "md.h" #include "raid0.h" static void raid0_unplug(struct request_queue *q) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 253b09c86eca..051ecfa61514 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -33,8 +33,8 @@ #include #include -#include #include +#include "md.h" #include "dm-bio-list.h" #include "raid1.h" #include "bitmap.h" diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 186e1b199d46..fea61e3dcd95 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -20,8 +20,8 @@ #include #include -#include #include +#include "md.h" #include "dm-bio-list.h" #include "raid10.h" #include "bitmap.h" diff 
--git a/drivers/md/raid5.c b/drivers/md/raid5.c index 816157e7d8e0..849478e9afdc 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -44,10 +44,10 @@ */ #include -#include #include #include #include +#include "md.h" #include "raid5.h" #include "raid6.h" #include "bitmap.h" diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h deleted file mode 100644 index e78b3c1d55fd..000000000000 --- a/include/linux/raid/md_k.h +++ /dev/null @@ -1,411 +0,0 @@ -/* - md_k.h : kernel internal structure of the Linux MD driver - Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2, or (at your option) - any later version. - - You should have received a copy of the GNU General Public License - (for example /usr/src/linux/COPYING); if not, write to the Free - Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. -*/ - -#ifndef _MD_K_H -#define _MD_K_H - -#ifdef CONFIG_BLOCK - -#define MaxSector (~(sector_t)0) - -typedef struct mddev_s mddev_t; -typedef struct mdk_rdev_s mdk_rdev_t; - -/* - * options passed in raidrun: - */ - -/* Currently this must fit in an 'int' */ -#define MAX_CHUNK_SIZE (1<<30) - -/* - * MD's 'extended' device - */ -struct mdk_rdev_s -{ - struct list_head same_set; /* RAID devices within the same set */ - - sector_t size; /* Device size (in blocks) */ - mddev_t *mddev; /* RAID array if running */ - int last_events; /* IO event timestamp */ - - struct block_device *bdev; /* block device handle */ - - struct page *sb_page; - int sb_loaded; - __u64 sb_events; - sector_t data_offset; /* start of data in array */ - sector_t sb_start; /* offset of the super block (in 512byte sectors) */ - int sb_size; /* bytes in the superblock */ - int preferred_minor; /* autorun support */ - - struct kobject kobj; - - /* A device can be in one of three states based on two flags: - * Not working: faulty==1 in_sync==0 - * Fully working: faulty==0 in_sync==1 - * Working, but not - * in sync with array - * faulty==0 in_sync==0 - * - * It can never have faulty==1, in_sync==1 - * This reduces the burden of testing multiple flags in many cases - */ - - unsigned long flags; -#define Faulty 1 /* device is known to have a fault */ -#define In_sync 2 /* device is in_sync with rest of array */ -#define WriteMostly 4 /* Avoid reading if at all possible */ -#define BarriersNotsupp 5 /* BIO_RW_BARRIER is not supported */ -#define AllReserved 6 /* If whole device is reserved for - * one array */ -#define AutoDetected 7 /* added by auto-detect */ -#define Blocked 8 /* An error occured on an externally - * managed array, don't allow writes - * until it is cleared */ -#define StateChanged 9 /* Faulty or Blocked has changed during - * interrupt, so it needs to be - * notified by the thread */ - wait_queue_head_t blocked_wait; - - int desc_nr; /* descriptor index in the superblock */ - int raid_disk; /* role of device in array */ - int saved_raid_disk; /* role that device used to have in the - * array and could again if we did a partial - * resync from the bitmap - */ - sector_t recovery_offset;/* If this device has been partially - * recovered, this is where we were - * up to. - */ - - atomic_t nr_pending; /* number of pending requests. - * only maintained for arrays that - * support hot removal - */ - atomic_t read_errors; /* number of consecutive read errors that - * we have tried to ignore. 
- */ - atomic_t corrected_errors; /* number of corrected read errors, - * for reporting to userspace and storing - * in superblock. - */ - struct work_struct del_work; /* used for delayed sysfs removal */ - - struct sysfs_dirent *sysfs_state; /* handle for 'state' - * sysfs entry */ -}; - -struct mddev_s -{ - void *private; - struct mdk_personality *pers; - dev_t unit; - int md_minor; - struct list_head disks; - unsigned long flags; -#define MD_CHANGE_DEVS 0 /* Some device status has changed */ -#define MD_CHANGE_CLEAN 1 /* transition to or from 'clean' */ -#define MD_CHANGE_PENDING 2 /* superblock update in progress */ - - int ro; - - struct gendisk *gendisk; - - struct kobject kobj; - int hold_active; -#define UNTIL_IOCTL 1 -#define UNTIL_STOP 2 - - /* Superblock information */ - int major_version, - minor_version, - patch_version; - int persistent; - int external; /* metadata is - * managed externally */ - char metadata_type[17]; /* externally set*/ - int chunk_size; - time_t ctime, utime; - int level, layout; - char clevel[16]; - int raid_disks; - int max_disks; - sector_t size; /* used size of component devices */ - sector_t array_sectors; /* exported array size */ - __u64 events; - - char uuid[16]; - - /* If the array is being reshaped, we need to record the - * new shape and an indication of where we are up to. - * This is written to the superblock. - * If reshape_position is MaxSector, then no reshape is happening (yet). - */ - sector_t reshape_position; - int delta_disks, new_level, new_layout, new_chunk; - - struct mdk_thread_s *thread; /* management thread */ - struct mdk_thread_s *sync_thread; /* doing resync or reconstruct */ - sector_t curr_resync; /* last block scheduled */ - unsigned long resync_mark; /* a recent timestamp */ - sector_t resync_mark_cnt;/* blocks written at resync_mark */ - sector_t curr_mark_cnt; /* blocks scheduled now */ - - sector_t resync_max_sectors; /* may be set by personality */ - - sector_t resync_mismatches; /* count of sectors where - * parity/replica mismatch found - */ - - /* allow user-space to request suspension of IO to regions of the array */ - sector_t suspend_lo; - sector_t suspend_hi; - /* if zero, use the system-wide default */ - int sync_speed_min; - int sync_speed_max; - - /* resync even though the same disks are shared among md-devices */ - int parallel_resync; - - int ok_start_degraded; - /* recovery/resync flags - * NEEDED: we might need to start a resync/recover - * RUNNING: a thread is running, or about to be started - * SYNC: actually doing a resync, not a recovery - * RECOVER: doing recovery, or need to try it. - * INTR: resync needs to be aborted for some reason - * DONE: thread is done and is waiting to be reaped - * REQUEST: user-space has requested a sync (used with SYNC) - * CHECK: user-space request for for check-only, no repair - * RESHAPE: A reshape is happening - * - * If neither SYNC or RESHAPE are set, then it is a recovery. 
- */ -#define MD_RECOVERY_RUNNING 0 -#define MD_RECOVERY_SYNC 1 -#define MD_RECOVERY_RECOVER 2 -#define MD_RECOVERY_INTR 3 -#define MD_RECOVERY_DONE 4 -#define MD_RECOVERY_NEEDED 5 -#define MD_RECOVERY_REQUESTED 6 -#define MD_RECOVERY_CHECK 7 -#define MD_RECOVERY_RESHAPE 8 -#define MD_RECOVERY_FROZEN 9 - - unsigned long recovery; - int recovery_disabled; /* if we detect that recovery - * will always fail, set this - * so we don't loop trying */ - - int in_sync; /* know to not need resync */ - struct mutex reconfig_mutex; - atomic_t active; /* general refcount */ - atomic_t openers; /* number of active opens */ - - int changed; /* true if we might need to reread partition info */ - int degraded; /* whether md should consider - * adding a spare - */ - int barriers_work; /* initialised to true, cleared as soon - * as a barrier request to slave - * fails. Only supported - */ - struct bio *biolist; /* bios that need to be retried - * because BIO_RW_BARRIER is not supported - */ - - atomic_t recovery_active; /* blocks scheduled, but not written */ - wait_queue_head_t recovery_wait; - sector_t recovery_cp; - sector_t resync_min; /* user requested sync - * starts here */ - sector_t resync_max; /* resync should pause - * when it gets here */ - - struct sysfs_dirent *sysfs_state; /* handle for 'array_state' - * file in sysfs. - */ - struct sysfs_dirent *sysfs_action; /* handle for 'sync_action' */ - - struct work_struct del_work; /* used for delayed sysfs removal */ - - spinlock_t write_lock; - wait_queue_head_t sb_wait; /* for waiting on superblock updates */ - atomic_t pending_writes; /* number of active superblock writes */ - - unsigned int safemode; /* if set, update "clean" superblock - * when no writes pending. - */ - unsigned int safemode_delay; - struct timer_list safemode_timer; - atomic_t writes_pending; - struct request_queue *queue; /* for plugging ... */ - - atomic_t write_behind; /* outstanding async IO */ - unsigned int max_write_behind; /* 0 = sync */ - - struct bitmap *bitmap; /* the bitmap for the device */ - struct file *bitmap_file; /* the bitmap file */ - long bitmap_offset; /* offset from superblock of - * start of bitmap. May be - * negative, but not '0' - */ - long default_bitmap_offset; /* this is the offset to use when - * hot-adding a bitmap. It should - * eventually be settable by sysfs. 
- */ - - struct list_head all_mddevs; -}; - - -static inline void rdev_dec_pending(mdk_rdev_t *rdev, mddev_t *mddev) -{ - int faulty = test_bit(Faulty, &rdev->flags); - if (atomic_dec_and_test(&rdev->nr_pending) && faulty) - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); -} - -static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors) -{ - atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io); -} - -struct mdk_personality -{ - char *name; - int level; - struct list_head list; - struct module *owner; - int (*make_request)(struct request_queue *q, struct bio *bio); - int (*run)(mddev_t *mddev); - int (*stop)(mddev_t *mddev); - void (*status)(struct seq_file *seq, mddev_t *mddev); - /* error_handler must set ->faulty and clear ->in_sync - * if appropriate, and should abort recovery if needed - */ - void (*error_handler)(mddev_t *mddev, mdk_rdev_t *rdev); - int (*hot_add_disk) (mddev_t *mddev, mdk_rdev_t *rdev); - int (*hot_remove_disk) (mddev_t *mddev, int number); - int (*spare_active) (mddev_t *mddev); - sector_t (*sync_request)(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster); - int (*resize) (mddev_t *mddev, sector_t sectors); - int (*check_reshape) (mddev_t *mddev); - int (*start_reshape) (mddev_t *mddev); - int (*reconfig) (mddev_t *mddev, int layout, int chunk_size); - /* quiesce moves between quiescence states - * 0 - fully active - * 1 - no new requests allowed - * others - reserved - */ - void (*quiesce) (mddev_t *mddev, int state); -}; - - -struct md_sysfs_entry { - struct attribute attr; - ssize_t (*show)(mddev_t *, char *); - ssize_t (*store)(mddev_t *, const char *, size_t); -}; - - -static inline char * mdname (mddev_t * mddev) -{ - return mddev->gendisk ? mddev->gendisk->disk_name : "mdX"; -} - -/* - * iterates through some rdev ringlist. It's safe to remove the - * current 'rdev'. Dont touch 'tmp' though. 
- */ -#define rdev_for_each_list(rdev, tmp, head) \ - list_for_each_entry_safe(rdev, tmp, head, same_set) - -/* - * iterates through the 'same array disks' ringlist - */ -#define rdev_for_each(rdev, tmp, mddev) \ - list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set) - -#define rdev_for_each_rcu(rdev, mddev) \ - list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set) - -typedef struct mdk_thread_s { - void (*run) (mddev_t *mddev); - mddev_t *mddev; - wait_queue_head_t wqueue; - unsigned long flags; - struct task_struct *tsk; - unsigned long timeout; -} mdk_thread_t; - -#define THREAD_WAKEUP 0 - -#define __wait_event_lock_irq(wq, condition, lock, cmd) \ -do { \ - wait_queue_t __wait; \ - init_waitqueue_entry(&__wait, current); \ - \ - add_wait_queue(&wq, &__wait); \ - for (;;) { \ - set_current_state(TASK_UNINTERRUPTIBLE); \ - if (condition) \ - break; \ - spin_unlock_irq(&lock); \ - cmd; \ - schedule(); \ - spin_lock_irq(&lock); \ - } \ - current->state = TASK_RUNNING; \ - remove_wait_queue(&wq, &__wait); \ -} while (0) - -#define wait_event_lock_irq(wq, condition, lock, cmd) \ -do { \ - if (condition) \ - break; \ - __wait_event_lock_irq(wq, condition, lock, cmd); \ -} while (0) - -static inline void safe_put_page(struct page *p) -{ - if (p) put_page(p); -} - -#endif /* CONFIG_BLOCK */ -#endif - - -extern int register_md_personality(struct mdk_personality *p); -extern int unregister_md_personality(struct mdk_personality *p); -extern mdk_thread_t * md_register_thread(void (*run) (mddev_t *mddev), - mddev_t *mddev, const char *name); -extern void md_unregister_thread(mdk_thread_t *thread); -extern void md_wakeup_thread(mdk_thread_t *thread); -extern void md_check_recovery(mddev_t *mddev); -extern void md_write_start(mddev_t *mddev, struct bio *bi); -extern void md_write_end(mddev_t *mddev); -extern void md_done_sync(mddev_t *mddev, int blocks, int ok); -extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev); - -extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, - sector_t sector, int size, struct page *page); -extern void md_super_wait(mddev_t *mddev); -extern int sync_page_io(struct block_device *bdev, sector_t sector, int size, - struct page *page, int rw); -extern void md_do_sync(mddev_t *mddev); -extern void md_new_event(mddev_t *mddev); -extern int md_allow_write(mddev_t *mddev); -extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev); -- cgit v1.2.3 From f701d589aa34d7531183c9ac6f7713ba14212b02 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 31 Mar 2009 15:09:39 +1100 Subject: md/raid6: move raid6 data processing to raid6_pq.ko Move the raid6 data processing routines into a standalone module (raid6_pq) to prepare them to be called from async_tx wrappers and other non-md drivers/modules. This precludes a circular dependency of raid456 needing the async modules for data processing while those modules in turn depend on raid456 for the base level synchronous raid6 routines. 
To support this move: 1/ The exportable definitions in raid6.h move to include/linux/raid/pq.h 2/ The raid6_call, recovery calls, and table symbols are exported 3/ Extra #ifdef __KERNEL__ statements to enable the userspace raid6test to compile Signed-off-by: Dan Williams Signed-off-by: NeilBrown --- drivers/md/Kconfig | 4 ++ drivers/md/Makefile | 4 +- drivers/md/mktables.c | 14 ++++- drivers/md/raid5.c | 12 +--- drivers/md/raid5.h | 2 + drivers/md/raid6.h | 126 ---------------------------------------- drivers/md/raid6algos.c | 19 +++++- drivers/md/raid6altivec.uc | 2 +- drivers/md/raid6int.uc | 2 +- drivers/md/raid6mmx.c | 2 +- drivers/md/raid6recov.c | 11 ++-- drivers/md/raid6sse1.c | 2 +- drivers/md/raid6sse2.c | 2 +- drivers/md/raid6test/Makefile | 2 +- drivers/md/raid6test/test.c | 2 +- include/linux/raid/pq.h | 132 ++++++++++++++++++++++++++++++++++++++++++ 16 files changed, 185 insertions(+), 153 deletions(-) delete mode 100644 drivers/md/raid6.h create mode 100644 include/linux/raid/pq.h (limited to 'include/linux') diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 2281b5098e95..449d0b9cac14 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -121,6 +121,7 @@ config MD_RAID10 config MD_RAID456 tristate "RAID-4/RAID-5/RAID-6 mode" depends on BLK_DEV_MD + select MD_RAID6_PQ select ASYNC_MEMCPY select ASYNC_XOR ---help--- @@ -180,6 +181,9 @@ config MD_RAID5_RESHAPE If unsure, say Y. +config MD_RAID6_PQ + tristate + config MD_MULTIPATH tristate "Multipath I/O support" depends on BLK_DEV_MD diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 3b118da575ee..45cc5951d928 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -9,7 +9,8 @@ dm-snapshot-y += dm-snap.o dm-exception-store.o dm-snap-transient.o \ dm-snap-persistent.o dm-mirror-y += dm-raid1.o md-mod-y += md.o bitmap.o -raid456-y += raid5.o raid6algos.o raid6recov.o raid6tables.o \ +raid456-y += raid5.o +raid6_pq-y += raid6algos.o raid6recov.o raid6tables.o \ raid6int1.o raid6int2.o raid6int4.o \ raid6int8.o raid6int16.o raid6int32.o \ raid6altivec1.o raid6altivec2.o raid6altivec4.o \ @@ -26,6 +27,7 @@ obj-$(CONFIG_MD_LINEAR) += linear.o obj-$(CONFIG_MD_RAID0) += raid0.o obj-$(CONFIG_MD_RAID1) += raid1.o obj-$(CONFIG_MD_RAID10) += raid10.o +obj-$(CONFIG_MD_RAID6_PQ) += raid6_pq.o obj-$(CONFIG_MD_RAID456) += raid456.o obj-$(CONFIG_MD_MULTIPATH) += multipath.o obj-$(CONFIG_MD_FAULTY) += faulty.o diff --git a/drivers/md/mktables.c b/drivers/md/mktables.c index b61d5767aae7..3b1500843bba 100644 --- a/drivers/md/mktables.c +++ b/drivers/md/mktables.c @@ -59,7 +59,7 @@ int main(int argc, char *argv[]) uint8_t v; uint8_t exptbl[256], invtbl[256]; - printf("#include \"raid6.h\"\n"); + printf("#include \n"); /* Compute multiplication table */ printf("\nconst u8 __attribute__((aligned(256)))\n" @@ -76,6 +76,9 @@ int main(int argc, char *argv[]) printf("\t},\n"); } printf("};\n"); + printf("#ifdef __KERNEL__\n"); + printf("EXPORT_SYMBOL(raid6_gfmul);\n"); + printf("#endif\n"); /* Compute power-of-2 table (exponent) */ v = 1; @@ -92,6 +95,9 @@ int main(int argc, char *argv[]) } } printf("};\n"); + printf("#ifdef __KERNEL__\n"); + printf("EXPORT_SYMBOL(raid6_gfexp);\n"); + printf("#endif\n"); /* Compute inverse table x^-1 == x^254 */ printf("\nconst u8 __attribute__((aligned(256)))\n" @@ -104,6 +110,9 @@ int main(int argc, char *argv[]) } } printf("};\n"); + printf("#ifdef __KERNEL__\n"); + printf("EXPORT_SYMBOL(raid6_gfinv);\n"); + printf("#endif\n"); /* Compute inv(2^x + 1) (exponent-xor-inverse) table */ 
printf("\nconst u8 __attribute__((aligned(256)))\n" @@ -115,6 +124,9 @@ int main(int argc, char *argv[]) (j == 7) ? '\n' : ' '); } printf("};\n"); + printf("#ifdef __KERNEL__\n"); + printf("EXPORT_SYMBOL(raid6_gfexi);\n"); + printf("#endif\n"); return 0; } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index e1ee181b79bb..1f1b054ff0b6 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -45,11 +45,11 @@ #include #include +#include #include #include #include "md.h" #include "raid5.h" -#include "raid6.h" #include "bitmap.h" /* @@ -94,11 +94,6 @@ #define printk_rl(args...) ((void) (printk_ratelimit() && printk(args))) -#if !RAID6_USE_EMPTY_ZERO_PAGE -/* In .bss so it's zeroed */ -const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256))); -#endif - /* * We maintain a biased count of active stripes in the bottom 16 bits of * bi_phys_segments, and a count of processed stripes in the upper 16 bits @@ -5153,11 +5148,6 @@ static struct mdk_personality raid4_personality = static int __init raid5_init(void) { - int e; - - e = raid6_select_algo(); - if ( e ) - return e; register_md_personality(&raid6_personality); register_md_personality(&raid5_personality); register_md_personality(&raid4_personality); diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index c172371481c7..2934ee0a39c6 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -269,6 +269,8 @@ struct r6_state { #define READ_MODIFY_WRITE 2 /* not a write method, but a compute_parity mode */ #define CHECK_PARITY 3 +/* Additional compute_parity mode -- updates the parity w/o LOCKING */ +#define UPDATE_PARITY 4 /* * Stripe state diff --git a/drivers/md/raid6.h b/drivers/md/raid6.h deleted file mode 100644 index 8a9c823bab9e..000000000000 --- a/drivers/md/raid6.h +++ /dev/null @@ -1,126 +0,0 @@ -/* -*- linux-c -*- ------------------------------------------------------- * - * - * Copyright 2003 H. Peter Anvin - All Rights Reserved - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, Inc., 53 Temple Place Ste 330, - * Boston MA 02111-1307, USA; either version 2 of the License, or - * (at your option) any later version; incorporated herein by reference. - * - * ----------------------------------------------------------------------- */ - -#ifndef LINUX_RAID_RAID6_H -#define LINUX_RAID_RAID6_H - -#ifdef __KERNEL__ - -/* Set to 1 to use kernel-wide empty_zero_page */ -#define RAID6_USE_EMPTY_ZERO_PAGE 0 -#include - -/* Additional compute_parity mode -- updates the parity w/o LOCKING */ -#define UPDATE_PARITY 4 - -/* We need a pre-zeroed page... if we don't want to use the kernel-provided - one define it here */ -#if RAID6_USE_EMPTY_ZERO_PAGE -# define raid6_empty_zero_page empty_zero_page -#else -extern const char raid6_empty_zero_page[PAGE_SIZE]; -#endif - -#else /* ! 
__KERNEL__ */ -/* Used for testing in user space */ - -#include -#include -#include -#include -#include -#include - -/* Not standard, but glibc defines it */ -#define BITS_PER_LONG __WORDSIZE - -typedef uint8_t u8; -typedef uint16_t u16; -typedef uint32_t u32; -typedef uint64_t u64; - -#ifndef PAGE_SIZE -# define PAGE_SIZE 4096 -#endif -extern const char raid6_empty_zero_page[PAGE_SIZE]; - -#define __init -#define __exit -#define __attribute_const__ __attribute__((const)) -#define noinline __attribute__((noinline)) - -#define preempt_enable() -#define preempt_disable() -#define cpu_has_feature(x) 1 -#define enable_kernel_altivec() -#define disable_kernel_altivec() - -#endif /* __KERNEL__ */ - -/* Routine choices */ -struct raid6_calls { - void (*gen_syndrome)(int, size_t, void **); - int (*valid)(void); /* Returns 1 if this routine set is usable */ - const char *name; /* Name of this routine set */ - int prefer; /* Has special performance attribute */ -}; - -/* Selected algorithm */ -extern struct raid6_calls raid6_call; - -/* Algorithm list */ -extern const struct raid6_calls * const raid6_algos[]; -int raid6_select_algo(void); - -/* Return values from chk_syndrome */ -#define RAID6_OK 0 -#define RAID6_P_BAD 1 -#define RAID6_Q_BAD 2 -#define RAID6_PQ_BAD 3 - -/* Galois field tables */ -extern const u8 raid6_gfmul[256][256] __attribute__((aligned(256))); -extern const u8 raid6_gfexp[256] __attribute__((aligned(256))); -extern const u8 raid6_gfinv[256] __attribute__((aligned(256))); -extern const u8 raid6_gfexi[256] __attribute__((aligned(256))); - -/* Recovery routines */ -void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, void **ptrs); -void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs); -void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, void **ptrs); - -/* Some definitions to allow code to be compiled for testing in userspace */ -#ifndef __KERNEL__ - -# define jiffies raid6_jiffies() -# define printk printf -# define GFP_KERNEL 0 -# define __get_free_pages(x,y) ((unsigned long)mmap(NULL, PAGE_SIZE << (y), PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0)) -# define free_pages(x,y) munmap((void *)(x), (y)*PAGE_SIZE) - -static inline void cpu_relax(void) -{ - /* Nothing */ -} - -#undef HZ -#define HZ 1000 -static inline uint32_t raid6_jiffies(void) -{ - struct timeval tv; - gettimeofday(&tv, NULL); - return tv.tv_sec*1000 + tv.tv_usec/1000; -} - -#endif /* ! __KERNEL__ */ - -#endif /* LINUX_RAID_RAID6_H */ diff --git a/drivers/md/raid6algos.c b/drivers/md/raid6algos.c index 1f6a3c82ee0c..866215ac7f25 100644 --- a/drivers/md/raid6algos.c +++ b/drivers/md/raid6algos.c @@ -16,13 +16,20 @@ * Algorithm list and algorithm selection for RAID-6 */ -#include "raid6.h" +#include #ifndef __KERNEL__ #include #include +#else +#if !RAID6_USE_EMPTY_ZERO_PAGE +/* In .bss so it's zeroed */ +const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256))); +EXPORT_SYMBOL(raid6_empty_zero_page); +#endif #endif struct raid6_calls raid6_call; +EXPORT_SYMBOL_GPL(raid6_call); /* Various routine sets */ extern const struct raid6_calls raid6_intx1; @@ -79,6 +86,7 @@ const struct raid6_calls * const raid6_algos[] = { #else /* Need more time to be stable in userspace */ #define RAID6_TIME_JIFFIES_LG2 9 +#define time_before(x, y) ((x) < (y)) #endif /* Try to pick the best algorithm */ @@ -152,3 +160,12 @@ int __init raid6_select_algo(void) return best ? 
0 : -EINVAL; } + +static void raid6_exit(void) +{ + do { } while (0); +} + +subsys_initcall(raid6_select_algo); +module_exit(raid6_exit); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/raid6altivec.uc b/drivers/md/raid6altivec.uc index 217580667e0c..699dfeee4944 100644 --- a/drivers/md/raid6altivec.uc +++ b/drivers/md/raid6altivec.uc @@ -22,7 +22,7 @@ * bracked this with preempt_disable/enable or in a lock) */ -#include "raid6.h" +#include #ifdef CONFIG_ALTIVEC diff --git a/drivers/md/raid6int.uc b/drivers/md/raid6int.uc index 32a0bac3eb3d..f9bf9cba357f 100644 --- a/drivers/md/raid6int.uc +++ b/drivers/md/raid6int.uc @@ -18,7 +18,7 @@ * This file is postprocessed using unroll.pl */ -#include "raid6.h" +#include /* * This is the C data type to use diff --git a/drivers/md/raid6mmx.c b/drivers/md/raid6mmx.c index 804cb50ecc19..e7f6c13132bf 100644 --- a/drivers/md/raid6mmx.c +++ b/drivers/md/raid6mmx.c @@ -18,7 +18,7 @@ #if defined(__i386__) && !defined(__arch_um__) -#include "raid6.h" +#include #include "raid6x86.h" /* Shared with raid6sse1.c */ diff --git a/drivers/md/raid6recov.c b/drivers/md/raid6recov.c index 7a98b8652582..2609f00e0d61 100644 --- a/drivers/md/raid6recov.c +++ b/drivers/md/raid6recov.c @@ -18,7 +18,7 @@ * the syndrome.) */ -#include "raid6.h" +#include /* Recover two failed data blocks. */ void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, @@ -63,9 +63,7 @@ void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, p++; q++; } } - - - +EXPORT_SYMBOL_GPL(raid6_2data_recov); /* Recover failure of one data block plus the P block */ void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs) @@ -97,9 +95,10 @@ void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs) q++; dq++; } } +EXPORT_SYMBOL_GPL(raid6_datap_recov); - -#ifndef __KERNEL__ /* Testing only */ +#ifndef __KERNEL__ +/* Testing only */ /* Recover two failed blocks. */ void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, void **ptrs) diff --git a/drivers/md/raid6sse1.c b/drivers/md/raid6sse1.c index 15c588905225..b274dd5eab8f 100644 --- a/drivers/md/raid6sse1.c +++ b/drivers/md/raid6sse1.c @@ -23,7 +23,7 @@ #if defined(__i386__) && !defined(__arch_um__) -#include "raid6.h" +#include #include "raid6x86.h" /* Defined in raid6mmx.c */ diff --git a/drivers/md/raid6sse2.c b/drivers/md/raid6sse2.c index 2e92e96275be..6ed6c6c0389f 100644 --- a/drivers/md/raid6sse2.c +++ b/drivers/md/raid6sse2.c @@ -19,7 +19,7 @@ #if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__) -#include "raid6.h" +#include #include "raid6x86.h" static const struct raid6_sse_constants { diff --git a/drivers/md/raid6test/Makefile b/drivers/md/raid6test/Makefile index 78e0396adf2a..58ffdf4f5161 100644 --- a/drivers/md/raid6test/Makefile +++ b/drivers/md/raid6test/Makefile @@ -5,7 +5,7 @@ CC = gcc OPTFLAGS = -O2 # Adjust as desired -CFLAGS = -I.. -g $(OPTFLAGS) +CFLAGS = -I.. 
-I ../../../include -g $(OPTFLAGS) LD = ld PERL = perl AR = ar diff --git a/drivers/md/raid6test/test.c b/drivers/md/raid6test/test.c index 559cc41b2585..7a930318b17d 100644 --- a/drivers/md/raid6test/test.c +++ b/drivers/md/raid6test/test.c @@ -17,7 +17,7 @@ #include #include #include -#include "raid6.h" +#include #define NDISKS 16 /* Including P and Q */ diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h new file mode 100644 index 000000000000..d92480f8285c --- /dev/null +++ b/include/linux/raid/pq.h @@ -0,0 +1,132 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright 2003 H. Peter Anvin - All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 53 Temple Place Ste 330, + * Boston MA 02111-1307, USA; either version 2 of the License, or + * (at your option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +#ifndef LINUX_RAID_RAID6_H +#define LINUX_RAID_RAID6_H + +#ifdef __KERNEL__ + +/* Set to 1 to use kernel-wide empty_zero_page */ +#define RAID6_USE_EMPTY_ZERO_PAGE 0 +#include + +/* We need a pre-zeroed page... if we don't want to use the kernel-provided + one define it here */ +#if RAID6_USE_EMPTY_ZERO_PAGE +# define raid6_empty_zero_page empty_zero_page +#else +extern const char raid6_empty_zero_page[PAGE_SIZE]; +#endif + +#else /* ! __KERNEL__ */ +/* Used for testing in user space */ + +#include +#include +#include +#include +#include +#include + +/* Not standard, but glibc defines it */ +#define BITS_PER_LONG __WORDSIZE + +typedef uint8_t u8; +typedef uint16_t u16; +typedef uint32_t u32; +typedef uint64_t u64; + +#ifndef PAGE_SIZE +# define PAGE_SIZE 4096 +#endif +extern const char raid6_empty_zero_page[PAGE_SIZE]; + +#define __init +#define __exit +#define __attribute_const__ __attribute__((const)) +#define noinline __attribute__((noinline)) + +#define preempt_enable() +#define preempt_disable() +#define cpu_has_feature(x) 1 +#define enable_kernel_altivec() +#define disable_kernel_altivec() + +#define EXPORT_SYMBOL(sym) +#define MODULE_LICENSE(licence) +#define subsys_initcall(x) +#define module_exit(x) +#endif /* __KERNEL__ */ + +/* Routine choices */ +struct raid6_calls { + void (*gen_syndrome)(int, size_t, void **); + int (*valid)(void); /* Returns 1 if this routine set is usable */ + const char *name; /* Name of this routine set */ + int prefer; /* Has special performance attribute */ +}; + +/* Selected algorithm */ +extern struct raid6_calls raid6_call; + +/* Algorithm list */ +extern const struct raid6_calls * const raid6_algos[]; +int raid6_select_algo(void); + +/* Return values from chk_syndrome */ +#define RAID6_OK 0 +#define RAID6_P_BAD 1 +#define RAID6_Q_BAD 2 +#define RAID6_PQ_BAD 3 + +/* Galois field tables */ +extern const u8 raid6_gfmul[256][256] __attribute__((aligned(256))); +extern const u8 raid6_gfexp[256] __attribute__((aligned(256))); +extern const u8 raid6_gfinv[256] __attribute__((aligned(256))); +extern const u8 raid6_gfexi[256] __attribute__((aligned(256))); + +/* Recovery routines */ +void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, + void **ptrs); +void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs); +void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, + void **ptrs); + +/* Some definitions to allow code 
to be compiled for testing in userspace */ +#ifndef __KERNEL__ + +# define jiffies raid6_jiffies() +# define printk printf +# define GFP_KERNEL 0 +# define __get_free_pages(x, y) ((unsigned long)mmap(NULL, PAGE_SIZE << (y), \ + PROT_READ|PROT_WRITE, \ + MAP_PRIVATE|MAP_ANONYMOUS,\ + 0, 0)) +# define free_pages(x, y) munmap((void *)(x), (y)*PAGE_SIZE) + +static inline void cpu_relax(void) +{ + /* Nothing */ +} + +#undef HZ +#define HZ 1000 +static inline uint32_t raid6_jiffies(void) +{ + struct timeval tv; + gettimeofday(&tv, NULL); + return tv.tv_sec*1000 + tv.tv_usec/1000; +} + +#endif /* ! __KERNEL__ */ + +#endif /* LINUX_RAID_RAID6_H */ -- cgit v1.2.3 From 853116a10544206b6b2cf42ebc9d78fba2668888 Mon Sep 17 00:00:00 2001 From: David Brownell Date: Wed, 14 Jan 2009 23:03:17 -0800 Subject: regulator: add get_status() Based on previous LKML discussions: * Update docs for regulator sysfs class attributes to highlight the fact that all current attributes are intended to be control inputs, including notably "state" and "opmode" which previously implied otherwise. * Define a new regulator driver get_status() method, which is the first method reporting regulator outputs instead of inputs. It can report on/off and error status; or instead of simply "on", report the actual operating mode. For the moment, this is a sysfs-only interface, not accessible to regulator clients. Such clients can use the current notification interfaces to detect errors, if the regulator reports them. Signed-off-by: David Brownell Signed-off-by: Liam Girdwood --- Documentation/ABI/testing/sysfs-class-regulator | 57 +++++++++++++++++++++---- drivers/regulator/core.c | 46 ++++++++++++++++++++ include/linux/regulator/driver.h | 17 ++++++++ 3 files changed, 111 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-class-regulator b/Documentation/ABI/testing/sysfs-class-regulator index 873ef1fc1569..e091fa873792 100644 --- a/Documentation/ABI/testing/sysfs-class-regulator +++ b/Documentation/ABI/testing/sysfs-class-regulator @@ -4,8 +4,8 @@ KernelVersion: 2.6.26 Contact: Liam Girdwood Description: Some regulator directories will contain a field called - state. This reports the regulator enable status, for - regulators which can report that value. + state. This reports the regulator enable control, for + regulators which can report that input value. This will be one of the following strings: @@ -14,16 +14,54 @@ Description: 'unknown' 'enabled' means the regulator output is ON and is supplying - power to the system. + power to the system (assuming no error prevents it). 'disabled' means the regulator output is OFF and is not - supplying power to the system.. + supplying power to the system (unless some non-Linux + control has enabled it). 'unknown' means software cannot determine the state, or the reported state is invalid. NOTE: this field can be used in conjunction with microvolts - and microamps to determine regulator output levels. + or microamps to determine configured regulator output levels. + + +What: /sys/class/regulator/.../status +Description: + Some regulator directories will contain a field called + "status". This reports the current regulator status, for + regulators which can report that output value. + + This will be one of the following strings: + + off + on + error + fast + normal + idle + standby + + "off" means the regulator is not supplying power to the + system. 
+ + "on" means the regulator is supplying power to the system, + and the regulator can't report a detailed operation mode. + + "error" indicates an out-of-regulation status such as being + disabled due to thermal shutdown, or voltage being unstable + because of problems with the input power supply. + + "fast", "normal", "idle", and "standby" are all detailed + regulator operation modes (described elsewhere). They + imply "on", but provide more detail. + + Note that regulator status is a function of many inputs, + not limited to control inputs from Linux. For example, + the actual load presented may trigger "error" status; or + a regulator may be enabled by another user, even though + Linux did not enable it. What: /sys/class/regulator/.../type @@ -58,7 +96,7 @@ Description: Some regulator directories will contain a field called microvolts. This holds the regulator output voltage setting measured in microvolts (i.e. E-6 Volts), for regulators - which can report that voltage. + which can report the control input for voltage. NOTE: This value should not be used to determine the regulator output voltage level as this value is the same regardless of @@ -73,7 +111,7 @@ Description: Some regulator directories will contain a field called microamps. This holds the regulator output current limit setting measured in microamps (i.e. E-6 Amps), for regulators - which can report that current. + which can report the control input for a current limit. NOTE: This value should not be used to determine the regulator output current level as this value is the same regardless of @@ -87,7 +125,7 @@ Contact: Liam Girdwood Description: Some regulator directories will contain a field called opmode. This holds the current regulator operating mode, - for regulators which can report it. + for regulators which can report that control input value. The opmode value can be one of the following strings: @@ -101,7 +139,8 @@ Description: NOTE: This value should not be used to determine the regulator output operating mode as this value is the same regardless of - whether the regulator is enabled or disabled. + whether the regulator is enabled or disabled. A "status" + attribute may be available to determine the actual mode. 
What: /sys/class/regulator/.../min_microvolts diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index f511a406fcaa..0ff95c3ccf5b 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -312,6 +312,47 @@ static ssize_t regulator_state_show(struct device *dev, } static DEVICE_ATTR(state, 0444, regulator_state_show, NULL); +static ssize_t regulator_status_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct regulator_dev *rdev = dev_get_drvdata(dev); + int status; + char *label; + + status = rdev->desc->ops->get_status(rdev); + if (status < 0) + return status; + + switch (status) { + case REGULATOR_STATUS_OFF: + label = "off"; + break; + case REGULATOR_STATUS_ON: + label = "on"; + break; + case REGULATOR_STATUS_ERROR: + label = "error"; + break; + case REGULATOR_STATUS_FAST: + label = "fast"; + break; + case REGULATOR_STATUS_NORMAL: + label = "normal"; + break; + case REGULATOR_STATUS_IDLE: + label = "idle"; + break; + case REGULATOR_STATUS_STANDBY: + label = "standby"; + break; + default: + return -ERANGE; + } + + return sprintf(buf, "%s\n", label); +} +static DEVICE_ATTR(status, 0444, regulator_status_show, NULL); + static ssize_t regulator_min_uA_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -1744,6 +1785,11 @@ static int add_regulator_attributes(struct regulator_dev *rdev) if (status < 0) return status; } + if (ops->get_status) { + status = device_create_file(dev, &dev_attr_status); + if (status < 0) + return status; + } /* some attributes are type-specific */ if (rdev->desc->type == REGULATOR_CURRENT) { diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index 2dae05705f13..6e957aae7629 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -21,6 +21,17 @@ struct regulator_dev; struct regulator_init_data; +enum regulator_status { + REGULATOR_STATUS_OFF, + REGULATOR_STATUS_ON, + REGULATOR_STATUS_ERROR, + /* fast/normal/idle/standby are flavors of "on" */ + REGULATOR_STATUS_FAST, + REGULATOR_STATUS_NORMAL, + REGULATOR_STATUS_IDLE, + REGULATOR_STATUS_STANDBY, +}; + /** * struct regulator_ops - regulator operations. * @@ -72,6 +83,12 @@ struct regulator_ops { int (*set_mode) (struct regulator_dev *, unsigned int mode); unsigned int (*get_mode) (struct regulator_dev *); + /* report regulator status ... most other accessors report + * control inputs, this reports results of combining inputs + * from Linux (and other sources) with the actual load. + */ + int (*get_status)(struct regulator_dev *); + /* get most efficient regulator operating mode for load */ unsigned int (*get_optimum_mode) (struct regulator_dev *, int input_uV, int output_uV, int load_uA); -- cgit v1.2.3 From b136fb4463d13eea129bf090a8a465bba6bf0003 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Mon, 19 Jan 2009 18:20:58 +0000 Subject: Regulator: Push lock out of _notifier_call_chain + add voltage change event. Regulator: Push lock out of _notifier_call_chain and into caller functions (side effect of fixing deadlock in regulator_force_disable) + Add a voltage changed event. 
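As an illustration (not part of this patch), a consumer interested in the new event could register a notifier block as sketched below; the handler name and log message are hypothetical, and "reg" is assumed to come from regulator_get():

static int example_volt_notify(struct notifier_block *nb,
			       unsigned long event, void *data)
{
	/* events can be OR'ed together, so test the bit */
	if (event & REGULATOR_EVENT_VOLTAGE_CHANGE)
		pr_info("example: supply voltage changed\n");
	return NOTIFY_OK;
}

static struct notifier_block example_nb = {
	.notifier_call = example_volt_notify,
};

/* in the consumer's probe path */
regulator_register_notifier(reg, &example_nb);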
Signed-off-by: Jonathan Cameron Signed-off-by: Liam Girdwood --- drivers/regulator/core.c | 15 ++++++++++----- drivers/regulator/wm8350-regulator.c | 2 ++ include/linux/regulator/consumer.h | 2 ++ 3 files changed, 14 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index 0ff95c3ccf5b..96c877dd9daf 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -1284,6 +1284,7 @@ int regulator_set_voltage(struct regulator *regulator, int min_uV, int max_uV) ret = rdev->desc->ops->set_voltage(rdev, min_uV, max_uV); out: + _notifier_call_chain(rdev, REGULATOR_EVENT_VOLTAGE_CHANGE, NULL); mutex_unlock(&rdev->mutex); return ret; } @@ -1584,20 +1585,23 @@ int regulator_unregister_notifier(struct regulator *regulator, } EXPORT_SYMBOL_GPL(regulator_unregister_notifier); -/* notify regulator consumers and downstream regulator consumers */ +/* notify regulator consumers and downstream regulator consumers. + * Note mutex must be held by caller. + */ static void _notifier_call_chain(struct regulator_dev *rdev, unsigned long event, void *data) { struct regulator_dev *_rdev; /* call rdev chain first */ - mutex_lock(&rdev->mutex); blocking_notifier_call_chain(&rdev->notifier, event, NULL); - mutex_unlock(&rdev->mutex); /* now notify regulator we supply */ - list_for_each_entry(_rdev, &rdev->supply_list, slist) - _notifier_call_chain(_rdev, event, data); + list_for_each_entry(_rdev, &rdev->supply_list, slist) { + mutex_lock(&_rdev->mutex); + _notifier_call_chain(_rdev, event, data); + mutex_unlock(&_rdev->mutex); + } } /** @@ -1744,6 +1748,7 @@ EXPORT_SYMBOL_GPL(regulator_bulk_free); * * Called by regulator drivers to notify clients a regulator event has * occurred. We also notify regulator clients downstream. + * Note lock must be held by caller. */ int regulator_notifier_call_chain(struct regulator_dev *rdev, unsigned long event, void *data) diff --git a/drivers/regulator/wm8350-regulator.c b/drivers/regulator/wm8350-regulator.c index 5056e23e4414..afad611fbb80 100644 --- a/drivers/regulator/wm8350-regulator.c +++ b/drivers/regulator/wm8350-regulator.c @@ -1293,6 +1293,7 @@ static void pmic_uv_handler(struct wm8350 *wm8350, int irq, void *data) { struct regulator_dev *rdev = (struct regulator_dev *)data; + mutex_lock(&rdev->mutex); if (irq == WM8350_IRQ_CS1 || irq == WM8350_IRQ_CS2) regulator_notifier_call_chain(rdev, REGULATOR_EVENT_REGULATION_OUT, @@ -1301,6 +1302,7 @@ static void pmic_uv_handler(struct wm8350 *wm8350, int irq, void *data) regulator_notifier_call_chain(rdev, REGULATOR_EVENT_UNDER_VOLTAGE, wm8350); + mutex_unlock(&rdev->mutex); } static int wm8350_regulator_probe(struct platform_device *pdev) diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h index 801bf77ff4e2..533f4e26db96 100644 --- a/include/linux/regulator/consumer.h +++ b/include/linux/regulator/consumer.h @@ -88,6 +88,7 @@ * FAIL Regulator output has failed. * OVER_TEMP Regulator over temp. * FORCE_DISABLE Regulator shut down by software. + * VOLTAGE_CHANGE Regulator voltage changed. * * NOTE: These events can be OR'ed together when passed into handler. 
*/ @@ -98,6 +99,7 @@ #define REGULATOR_EVENT_FAIL 0x08 #define REGULATOR_EVENT_OVER_TEMP 0x10 #define REGULATOR_EVENT_FORCE_DISABLE 0x20 +#define REGULATOR_EVENT_VOLTAGE_CHANGE 0x40 struct regulator; -- cgit v1.2.3 From 0527100fd11d9710c7e153d791da78824b7b46fa Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 19 Jan 2009 13:37:02 +0000 Subject: regulator: Pass regulator init data as explict argument when registering Rather than having the regulator init data read from the platform_data member of the struct device that is registered for the regulator make the init data an explict argument passed in when registering. This allows drivers to use the platform data for their own purposes if they wish. Signed-off-by: Mark Brown Signed-off-by: Liam Girdwood --- drivers/regulator/bq24022.c | 2 +- drivers/regulator/core.c | 5 +++-- drivers/regulator/da903x.c | 3 ++- drivers/regulator/pcf50633-regulator.c | 3 ++- drivers/regulator/wm8350-regulator.c | 2 +- drivers/regulator/wm8400-regulator.c | 2 +- include/linux/regulator/driver.h | 3 ++- 7 files changed, 12 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/regulator/bq24022.c b/drivers/regulator/bq24022.c index c175e38a4cd5..6804333492eb 100644 --- a/drivers/regulator/bq24022.c +++ b/drivers/regulator/bq24022.c @@ -105,7 +105,7 @@ static int __init bq24022_probe(struct platform_device *pdev) ret = gpio_direction_output(pdata->gpio_iset2, 0); ret = gpio_direction_output(pdata->gpio_nce, 1); - bq24022 = regulator_register(&bq24022_desc, &pdev->dev, pdata); + bq24022 = regulator_register(&bq24022_desc, &pdev->dev, NULL, pdata); if (IS_ERR(bq24022)) { dev_dbg(&pdev->dev, "couldn't register regulator\n"); ret = PTR_ERR(bq24022); diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index 96c877dd9daf..f17362ac9c61 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -1879,17 +1879,18 @@ static int add_regulator_attributes(struct regulator_dev *rdev) * regulator_register - register regulator * @regulator_desc: regulator to register * @dev: struct device for the regulator + * @init_data: platform provided init data, passed through by driver * @driver_data: private regulator data * * Called by regulator drivers to register a regulator. * Returns 0 on success. 
*/ struct regulator_dev *regulator_register(struct regulator_desc *regulator_desc, - struct device *dev, void *driver_data) + struct device *dev, struct regulator_init_data *init_data, + void *driver_data) { static atomic_t regulator_no = ATOMIC_INIT(0); struct regulator_dev *rdev; - struct regulator_init_data *init_data = dev->platform_data; int ret, i; if (regulator_desc == NULL) diff --git a/drivers/regulator/da903x.c b/drivers/regulator/da903x.c index fe77730a7edb..72b15495183c 100644 --- a/drivers/regulator/da903x.c +++ b/drivers/regulator/da903x.c @@ -471,7 +471,8 @@ static int __devinit da903x_regulator_probe(struct platform_device *pdev) if (ri->desc.id == DA9030_ID_LDO1 || ri->desc.id == DA9030_ID_LDO15) ri->desc.ops = &da9030_regulator_ldo1_15_ops; - rdev = regulator_register(&ri->desc, &pdev->dev, ri); + rdev = regulator_register(&ri->desc, &pdev->dev, + pdev->dev.platform_data, ri); if (IS_ERR(rdev)) { dev_err(&pdev->dev, "failed to register regulator %s\n", ri->desc.name); diff --git a/drivers/regulator/pcf50633-regulator.c b/drivers/regulator/pcf50633-regulator.c index 4cc85ec6e120..cd761d85c8fd 100644 --- a/drivers/regulator/pcf50633-regulator.c +++ b/drivers/regulator/pcf50633-regulator.c @@ -284,7 +284,8 @@ static int __devinit pcf50633_regulator_probe(struct platform_device *pdev) /* Already set by core driver */ pcf = platform_get_drvdata(pdev); - rdev = regulator_register(®ulators[pdev->id], &pdev->dev, pcf); + rdev = regulator_register(®ulators[pdev->id], &pdev->dev, + pdev->dev.platform_data, pcf); if (IS_ERR(rdev)) return PTR_ERR(rdev); diff --git a/drivers/regulator/wm8350-regulator.c b/drivers/regulator/wm8350-regulator.c index afad611fbb80..93e0ce5a5c23 100644 --- a/drivers/regulator/wm8350-regulator.c +++ b/drivers/regulator/wm8350-regulator.c @@ -1335,9 +1335,9 @@ static int wm8350_regulator_probe(struct platform_device *pdev) break; } - /* register regulator */ rdev = regulator_register(&wm8350_reg[pdev->id], &pdev->dev, + pdev->dev.platform_data, dev_get_drvdata(&pdev->dev)); if (IS_ERR(rdev)) { dev_err(&pdev->dev, "failed to register %s\n", diff --git a/drivers/regulator/wm8400-regulator.c b/drivers/regulator/wm8400-regulator.c index 56e23d44ba59..6ed43b0dbdfc 100644 --- a/drivers/regulator/wm8400-regulator.c +++ b/drivers/regulator/wm8400-regulator.c @@ -294,7 +294,7 @@ static int __devinit wm8400_regulator_probe(struct platform_device *pdev) struct regulator_dev *rdev; rdev = regulator_register(®ulators[pdev->id], &pdev->dev, - pdev->dev.driver_data); + pdev->dev.platform_data, pdev->dev.driver_data); if (IS_ERR(rdev)) return PTR_ERR(rdev); diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index 6e957aae7629..2254ad93b784 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -138,7 +138,8 @@ struct regulator_desc { }; struct regulator_dev *regulator_register(struct regulator_desc *regulator_desc, - struct device *dev, void *driver_data); + struct device *dev, struct regulator_init_data *init_data, + void *driver_data); void regulator_unregister(struct regulator_dev *rdev); int regulator_notifier_call_chain(struct regulator_dev *rdev, -- cgit v1.2.3 From 93c62da23a717f59933ec799688da42f71d8c6c4 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 19 Jan 2009 13:37:03 +0000 Subject: regulator: Allow init data to be supplied for bq24022 Previously it was not possible to do so, making it impossible for machines to configure the driver. 
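A minimal board-file sketch (not from this patch) of how a machine could now hand the charger driver its init data; the GPIO numbers and constraint values are hypothetical, loosely following the 100 mA / 500 mA limits this charger selects between:

static struct regulator_init_data example_bq24022_init = {
	.constraints = {
		.min_uA         = 100000,
		.max_uA         = 500000,
		.valid_ops_mask = REGULATOR_CHANGE_CURRENT |
				  REGULATOR_CHANGE_STATUS,
	},
};

static struct bq24022_mach_info example_bq24022_pdata = {
	.gpio_nce   = 42,	/* hypothetical GPIO numbers */
	.gpio_iset2 = 43,
	.init_data  = &example_bq24022_init,
};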
Signed-off-by: Mark Brown Signed-off-by: Liam Girdwood --- drivers/regulator/bq24022.c | 3 ++- include/linux/regulator/bq24022.h | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/regulator/bq24022.c b/drivers/regulator/bq24022.c index 6804333492eb..7ecb820ceebc 100644 --- a/drivers/regulator/bq24022.c +++ b/drivers/regulator/bq24022.c @@ -105,7 +105,8 @@ static int __init bq24022_probe(struct platform_device *pdev) ret = gpio_direction_output(pdata->gpio_iset2, 0); ret = gpio_direction_output(pdata->gpio_nce, 1); - bq24022 = regulator_register(&bq24022_desc, &pdev->dev, NULL, pdata); + bq24022 = regulator_register(&bq24022_desc, &pdev->dev, + pdata->init_data, pdata); if (IS_ERR(bq24022)) { dev_dbg(&pdev->dev, "couldn't register regulator\n"); ret = PTR_ERR(bq24022); diff --git a/include/linux/regulator/bq24022.h b/include/linux/regulator/bq24022.h index e84b0a9feda5..a6d014005d49 100644 --- a/include/linux/regulator/bq24022.h +++ b/include/linux/regulator/bq24022.h @@ -10,6 +10,8 @@ * */ +struct regulator_init_data; + /** * bq24022_mach_info - platform data for bq24022 * @gpio_nce: GPIO line connected to the nCE pin, used to enable / disable charging @@ -18,4 +20,5 @@ struct bq24022_mach_info { int gpio_nce; int gpio_iset2; + struct regulator_init_data *init_data; }; -- cgit v1.2.3 From bcf3402c50a48d51462f37f72129d9c4369702b4 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 19 Jan 2009 13:37:04 +0000 Subject: regulator: Allow init_data to be passed to fixed voltage regulators Signed-off-by: Mark Brown Signed-off-by: Liam Girdwood --- drivers/regulator/fixed.c | 3 ++- include/linux/regulator/fixed.h | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/regulator/fixed.c b/drivers/regulator/fixed.c index d31db3e14913..23d554628a76 100644 --- a/drivers/regulator/fixed.c +++ b/drivers/regulator/fixed.c @@ -73,7 +73,8 @@ static int regulator_fixed_voltage_probe(struct platform_device *pdev) drvdata->microvolts = config->microvolts; - drvdata->dev = regulator_register(&drvdata->desc, drvdata); + drvdata->dev = regulator_register(&drvdata->desc, &pdev->dev, + config->init_data, drvdata); if (IS_ERR(drvdata->dev)) { ret = PTR_ERR(drvdata->dev); goto err_name; diff --git a/include/linux/regulator/fixed.h b/include/linux/regulator/fixed.h index 1387a5d2190e..91b4da31f1b5 100644 --- a/include/linux/regulator/fixed.h +++ b/include/linux/regulator/fixed.h @@ -14,9 +14,12 @@ #ifndef __REGULATOR_FIXED_H #define __REGULATOR_FIXED_H +struct regulator_init_data; + struct fixed_voltage_config { const char *supply_name; int microvolts; + struct regulator_init_data *init_data; }; #endif -- cgit v1.2.3 From 1fa9ad52b07811ebf258f3f6907de8dbf020ec2d Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 21 Jan 2009 14:08:40 +0000 Subject: regulator: Hoist struct regulator_dev out of core to fix notifiers Commit 872ed3fe176833f7d43748eb88010da4bbd2f983 caused regulator drivers to take the struct regulator_dev lock themselves which requires that the struct be visible to them. Band aid this by making the struct visible. 
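With the struct visible, a driver's fault handler can take the lock itself before injecting a notification, as the earlier _notifier_call_chain change requires. A sketch of that driver-side pattern (not from this patch; the handler and event choice are hypothetical):

static void example_fault_handler(struct regulator_dev *rdev)
{
	/* the caller, not the core, now holds the consumer lock */
	mutex_lock(&rdev->mutex);
	regulator_notifier_call_chain(rdev,
				      REGULATOR_EVENT_OVER_TEMP, NULL);
	mutex_unlock(&rdev->mutex);
}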
Signed-off-by: Mark Brown Signed-off-by: Liam Girdwood --- drivers/regulator/core.c | 27 --------------------------- include/linux/regulator/driver.h | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index f17362ac9c61..0ed13c2a8c3c 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -29,33 +29,6 @@ static DEFINE_MUTEX(regulator_list_mutex); static LIST_HEAD(regulator_list); static LIST_HEAD(regulator_map_list); -/* - * struct regulator_dev - * - * Voltage / Current regulator class device. One for each regulator. - */ -struct regulator_dev { - struct regulator_desc *desc; - int use_count; - - /* lists we belong to */ - struct list_head list; /* list of all regulators */ - struct list_head slist; /* list of supplied regulators */ - - /* lists we own */ - struct list_head consumer_list; /* consumers we supply */ - struct list_head supply_list; /* regulators we supply */ - - struct blocking_notifier_head notifier; - struct mutex mutex; /* consumer lock */ - struct module *owner; - struct device dev; - struct regulation_constraints *constraints; - struct regulator_dev *supply; /* for tree */ - - void *reg_data; /* regulator_dev data */ -}; - /* * struct regulator_map * diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index 2254ad93b784..c263e36e564e 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -137,6 +137,38 @@ struct regulator_desc { struct module *owner; }; +/* + * struct regulator_dev + * + * Voltage / Current regulator class device. One for each + * regulator. + * + * This should *not* be used directly by anything except the regulator + * core and notification injection (which should take the mutex and do + * no other direct access). + */ +struct regulator_dev { + struct regulator_desc *desc; + int use_count; + + /* lists we belong to */ + struct list_head list; /* list of all regulators */ + struct list_head slist; /* list of supplied regulators */ + + /* lists we own */ + struct list_head consumer_list; /* consumers we supply */ + struct list_head supply_list; /* regulators we supply */ + + struct blocking_notifier_head notifier; + struct mutex mutex; /* consumer lock */ + struct module *owner; + struct device dev; + struct regulation_constraints *constraints; + struct regulator_dev *supply; /* for tree */ + + void *reg_data; /* regulator_dev data */ +}; + struct regulator_dev *regulator_register(struct regulator_desc *regulator_desc, struct device *dev, struct regulator_init_data *init_data, void *driver_data); -- cgit v1.2.3 From 90ca563b1030bece8a4f15a910e39a46f059ff48 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 20 Jan 2009 16:29:05 -0800 Subject: regulator: fix header file missing kernel-doc Fix regulator/driver.h missing kernel-doc: Warning(linux-next-20090120//include/linux/regulator/driver.h:108): No description found for parameter 'get_status' Signed-off-by: Randy Dunlap cc: Liam Girdwood cc: Mark Brown Signed-off-by: Liam Girdwood --- include/linux/regulator/driver.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index c263e36e564e..eb8773b05ac3 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -51,6 +51,7 @@ enum regulator_status { * * @set_mode: Set the operating mode for the regulator. 
* @get_mode: Get the current operating mode for the regulator. + * @get_status: Report the regulator status. * @get_optimum_mode: Get the most efficient operating mode for the regulator * when running with the specified parameters. * -- cgit v1.2.3 From 1dd68f01886a2d5cabbbe90b86e82f70917de89c Mon Sep 17 00:00:00 2001 From: Liam Girdwood Date: Mon, 2 Feb 2009 21:43:31 +0000 Subject: regulator: email - update email address and regulator webpage. Remove deceased email address and update to new address. Also update website details in MAINTAINERS with correct page. Signed-off-by: Liam Girdwood --- MAINTAINERS | 2 +- include/linux/regulator/consumer.h | 2 +- include/linux/regulator/driver.h | 2 +- include/linux/regulator/machine.h | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index c5f4e9d27b64..15e1b73bb8d5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4850,7 +4850,7 @@ M: lrg@slimlogic.co.uk P: Mark Brown M: broonie@opensource.wolfsonmicro.com W: http://opensource.wolfsonmicro.com/node/15 -W: http://www.slimlogic.co.uk/?page_id=5 +W: http://www.slimlogic.co.uk/?p=48 T: git kernel.org/pub/scm/linux/kernel/git/lrg/voltage-2.6.git S: Supported diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h index 533f4e26db96..df6c4bcf38f8 100644 --- a/include/linux/regulator/consumer.h +++ b/include/linux/regulator/consumer.h @@ -3,7 +3,7 @@ * * Copyright (C) 2007, 2008 Wolfson Microelectronics PLC. * - * Author: Liam Girdwood + * Author: Liam Girdwood * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index eb8773b05ac3..0cf37bc85c41 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -3,7 +3,7 @@ * * Copyright (C) 2007, 2008 Wolfson Microelectronics PLC. * - * Author: Liam Girdwood + * Author: Liam Girdwood * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/include/linux/regulator/machine.h b/include/linux/regulator/machine.h index 3794773b23d2..5aa00ee36a3d 100644 --- a/include/linux/regulator/machine.h +++ b/include/linux/regulator/machine.h @@ -3,7 +3,7 @@ * * Copyright (C) 2007, 2008 Wolfson Microelectronics PLC. * - * Author: Liam Girdwood + * Author: Liam Girdwood * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as -- cgit v1.2.3 From a308466c24b4f42bab6945026e938874d22cde50 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 26 Feb 2009 19:24:19 +0000 Subject: regulator: Allow regulators to set the initial operating mode This is useful when wishing to run in a fixed operating mode that isn't the default. 
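An illustrative constraints fragment (not part of this patch) requesting a non-default startup mode; the names and voltage values are hypothetical, and the regulator driver must implement set_mode() for the core to apply it:

static struct regulator_init_data example_codec_ldo = {
	.constraints = {
		.min_uV           = 1800000,
		.max_uV           = 1800000,
		.valid_modes_mask = REGULATOR_MODE_NORMAL |
				    REGULATOR_MODE_STANDBY,
		.valid_ops_mask   = REGULATOR_CHANGE_MODE,
		/* start in low-power mode rather than the default */
		.initial_mode     = REGULATOR_MODE_STANDBY,
	},
};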
Signed-off-by: Mark Brown Signed-off-by: Liam Girdwood --- drivers/regulator/core.c | 17 +++++++++++++++++ include/linux/regulator/machine.h | 4 ++++ 2 files changed, 21 insertions(+) (limited to 'include/linux') diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index d55a25a6fab2..75abcd85e51b 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -724,6 +724,23 @@ static int set_machine_constraints(struct regulator_dev *rdev, } } + if (constraints->initial_mode) { + if (!ops->set_mode) { + printk(KERN_ERR "%s: no set_mode operation for %s\n", + __func__, name); + ret = -EINVAL; + goto out; + } + + ret = ops->set_mode(rdev, constraints->initial_mode); + if (ret < 0) { + printk(KERN_ERR + "%s: failed to set initial mode for %s: %d\n", + __func__, name, ret); + goto out; + } + } + /* if always_on is set then turn the regulator on if it's not * already on. */ if (constraints->always_on && ops->enable && diff --git a/include/linux/regulator/machine.h b/include/linux/regulator/machine.h index 5aa00ee36a3d..1eb861cf4b2c 100644 --- a/include/linux/regulator/machine.h +++ b/include/linux/regulator/machine.h @@ -83,6 +83,7 @@ struct regulator_state { * @state_standby: State for regulator when system is suspended in standby * mode. * @initial_state: Suspend state to set by default. + * @initial_mode: Mode to set at startup. */ struct regulation_constraints { @@ -111,6 +112,9 @@ struct regulation_constraints { struct regulator_state state_standby; suspend_state_t initial_state; /* suspend state to set at init */ + /* mode to set on startup */ + unsigned int initial_mode; + /* constriant flags */ unsigned always_on:1; /* regulator never off when system is on */ unsigned boot_on:1; /* bootloader/firmware enabled regulator */ -- cgit v1.2.3 From 4367cfdc7c657ad8a797f51b9ffd3c64b31910e7 Mon Sep 17 00:00:00 2001 From: David Brownell Date: Thu, 26 Feb 2009 11:48:36 -0800 Subject: regulator: enumerate voltages (v2) Add a basic mechanism for regulators to report the discrete voltages they support: list_voltage() enumerates them using selectors numbered from 0 to an upper bound. Use those methods to force machine-level constraints into bounds. (Example: regulator supports 1.8V, 2.4V, 2.6V, 3.3V, and board constraints for that rail are 2.0V to 3.6V ... so the range of voltages is then 2.4V to 3.3V on this board.) Export those voltages to the regulator consumer interface, so for example regulator hooked up to an MMC/SD/SDIO slot can report the actual voltage options available to cards connected there. Signed-off-by: David Brownell Acked-by: Mark Brown Signed-off-by: Liam Girdwood --- drivers/regulator/core.c | 113 +++++++++++++++++++++++++++++++++++++ include/linux/regulator/consumer.h | 2 + include/linux/regulator/driver.h | 9 +++ 3 files changed, 124 insertions(+) (limited to 'include/linux') diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index 75abcd85e51b..da357a07c98e 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -692,6 +692,69 @@ static int set_machine_constraints(struct regulator_dev *rdev, else name = "regulator"; + /* constrain machine-level voltage specs to fit + * the actual range supported by this regulator. 
+ */ + if (ops->list_voltage && rdev->desc->n_voltages) { + int count = rdev->desc->n_voltages; + int i; + int min_uV = INT_MAX; + int max_uV = INT_MIN; + int cmin = constraints->min_uV; + int cmax = constraints->max_uV; + + /* it's safe to autoconfigure fixed-voltage supplies */ + if (count == 1 && !cmin) { + cmin = INT_MIN; + cmax = INT_MAX; + } + + /* else require explicit machine-level constraints */ + else if (cmin <= 0 || cmax <= 0 || cmax < cmin) { + pr_err("%s: %s '%s' voltage constraints\n", + __func__, "invalid", name); + ret = -EINVAL; + goto out; + } + + /* initial: [cmin..cmax] valid, [min_uV..max_uV] not */ + for (i = 0; i < count; i++) { + int value; + + value = ops->list_voltage(rdev, i); + if (value <= 0) + continue; + + /* maybe adjust [min_uV..max_uV] */ + if (value >= cmin && value < min_uV) + min_uV = value; + if (value <= cmax && value > max_uV) + max_uV = value; + } + + /* final: [min_uV..max_uV] valid iff constraints valid */ + if (max_uV < min_uV) { + pr_err("%s: %s '%s' voltage constraints\n", + __func__, "unsupportable", name); + ret = -EINVAL; + goto out; + } + + /* use regulator's subset of machine constraints */ + if (constraints->min_uV < min_uV) { + pr_debug("%s: override '%s' %s, %d -> %d\n", + __func__, name, "min_uV", + constraints->min_uV, min_uV); + constraints->min_uV = min_uV; + } + if (constraints->max_uV > max_uV) { + pr_debug("%s: override '%s' %s, %d -> %d\n", + __func__, name, "max_uV", + constraints->max_uV, max_uV); + constraints->max_uV = max_uV; + } + } + rdev->constraints = constraints; /* do we need to apply the constraint voltage */ @@ -1250,6 +1313,56 @@ int regulator_is_enabled(struct regulator *regulator) } EXPORT_SYMBOL_GPL(regulator_is_enabled); +/** + * regulator_count_voltages - count regulator_list_voltage() selectors + * @regulator: regulator source + * + * Returns number of selectors, or negative errno. Selectors are + * numbered starting at zero, and typically correspond to bitfields + * in hardware registers. + */ +int regulator_count_voltages(struct regulator *regulator) +{ + struct regulator_dev *rdev = regulator->rdev; + + return rdev->desc->n_voltages ? : -EINVAL; +} +EXPORT_SYMBOL_GPL(regulator_count_voltages); + +/** + * regulator_list_voltage - enumerate supported voltages + * @regulator: regulator source + * @selector: identify voltage to list + * Context: can sleep + * + * Returns a voltage that can be passed to @regulator_set_voltage(), + * zero if this selector code can't be used on this sytem, or a + * negative errno. 
+ */ +int regulator_list_voltage(struct regulator *regulator, unsigned selector) +{ + struct regulator_dev *rdev = regulator->rdev; + struct regulator_ops *ops = rdev->desc->ops; + int ret; + + if (!ops->list_voltage || selector >= rdev->desc->n_voltages) + return -EINVAL; + + mutex_lock(&rdev->mutex); + ret = ops->list_voltage(rdev, selector); + mutex_unlock(&rdev->mutex); + + if (ret > 0) { + if (ret < rdev->constraints->min_uV) + ret = 0; + else if (ret > rdev->constraints->max_uV) + ret = 0; + } + + return ret; +} +EXPORT_SYMBOL_GPL(regulator_list_voltage); + /** * regulator_set_voltage - set regulator output voltage * @regulator: regulator source diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h index df6c4bcf38f8..277f4b964df5 100644 --- a/include/linux/regulator/consumer.h +++ b/include/linux/regulator/consumer.h @@ -142,6 +142,8 @@ int regulator_bulk_disable(int num_consumers, void regulator_bulk_free(int num_consumers, struct regulator_bulk_data *consumers); +int regulator_count_voltages(struct regulator *regulator); +int regulator_list_voltage(struct regulator *regulator, unsigned selector); int regulator_set_voltage(struct regulator *regulator, int min_uV, int max_uV); int regulator_get_voltage(struct regulator *regulator); int regulator_set_current_limit(struct regulator *regulator, diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index 0cf37bc85c41..2255468d456f 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -45,6 +45,10 @@ enum regulator_status { * @set_voltage: Set the voltage for the regulator within the range specified. * The driver should select the voltage closest to min_uV. * @get_voltage: Return the currently configured voltage for the regulator. + * @list_voltage: Return one of the supported voltages, in microvolts; zero + * if the selector indicates a voltage that is unusable on this system; + * or negative errno. Selectors range from zero to one less than + * regulator_desc.n_voltages. Voltages may be reported in any order. * * @set_current_limit: Configure a limit for a current-limited regulator. * @get_current_limit: Get the limit for a current-limited regulator. @@ -66,6 +70,9 @@ enum regulator_status { */ struct regulator_ops { + /* enumerate supported voltages */ + int (*list_voltage) (struct regulator_dev *, unsigned selector); + /* get/set regulator voltage */ int (*set_voltage) (struct regulator_dev *, int min_uV, int max_uV); int (*get_voltage) (struct regulator_dev *); @@ -124,6 +131,7 @@ enum regulator_type { * * @name: Identifying name for the regulator. * @id: Numerical identifier for the regulator. + * @n_voltages: Number of selectors available for ops.list_voltage(). * @ops: Regulator operations table. * @irq: Interrupt number for the regulator. * @type: Indicates if the regulator is a voltage or current regulator. @@ -132,6 +140,7 @@ enum regulator_type { struct regulator_desc { const char *name; int id; + unsigned n_voltages; struct regulator_ops *ops; int irq; enum regulator_type type; -- cgit v1.2.3 From 3b2a6061afe6fcc44437cd5ec641b0aeb2825ee3 Mon Sep 17 00:00:00 2001 From: David Brownell Date: Thu, 26 Feb 2009 13:28:41 -0800 Subject: regulator: get_status() grows kerneldoc Add kerneldoc for the new get_status() message. 
Fix the existing kerneldoc for that struct in two ways: (a) Syntax, making sure parameter descriptions immediately follow the one-line struct description and that the first blank lines is before any more expansive description; (b) Presentation for a few points, to highlight the fact that the previous "get" methods exist only to report the current configuration, not to display actual status. Signed-off-by: David Brownell Signed-off-by: Liam Girdwood --- include/linux/regulator/driver.h | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index 2255468d456f..4848d8dacd90 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -35,11 +35,8 @@ enum regulator_status { /** * struct regulator_ops - regulator operations. * - * This struct describes regulator operations which can be implemented by - * regulator chip drivers. - * - * @enable: Enable the regulator. - * @disable: Disable the regulator. + * @enable: Configure the regulator as enabled. + * @disable: Configure the regulator as disabled. * @is_enabled: Return 1 if the regulator is enabled, 0 otherwise. * * @set_voltage: Set the voltage for the regulator within the range specified. @@ -51,11 +48,11 @@ enum regulator_status { * regulator_desc.n_voltages. Voltages may be reported in any order. * * @set_current_limit: Configure a limit for a current-limited regulator. - * @get_current_limit: Get the limit for a current-limited regulator. + * @get_current_limit: Get the configured limit for a current-limited regulator. * - * @set_mode: Set the operating mode for the regulator. - * @get_mode: Get the current operating mode for the regulator. - * @get_status: Report the regulator status. + * @get_mode: Get the configured operating mode for the regulator. + * @get_status: Return actual (not as-configured) status of regulator, as a + * REGULATOR_STATUS value (or negative errno) * @get_optimum_mode: Get the most efficient operating mode for the regulator * when running with the specified parameters. * @@ -67,6 +64,9 @@ enum regulator_status { * suspended. * @set_suspend_mode: Set the operating mode for the regulator when the * system is suspended. + * + * This struct describes regulator operations which can be implemented by + * regulator chip drivers. */ struct regulator_ops { @@ -94,6 +94,7 @@ struct regulator_ops { /* report regulator status ... most other accessors report * control inputs, this reports results of combining inputs * from Linux (and other sources) with the actual load. + * returns REGULATOR_STATUS_* or negative errno. */ int (*get_status)(struct regulator_dev *); -- cgit v1.2.3 From fa16a5c13a2fc1433cfff38a083b4f8c5138d022 Mon Sep 17 00:00:00 2001 From: David Brownell Date: Sun, 8 Feb 2009 10:37:06 -0800 Subject: regulator: twl4030 regulators Support most of the LDO regulators in the twl4030 family chips. In the case of LDOs supporting MMC/SD, the voltage controls are used; but in most other cases, the regulator framework is only used to enable/disable a supplies, conserving power when a given voltage rail is not needed. The drivers/mfd/twl4030-core.c code already sets up the various regulators according to board-specific configuration, and knows that some chips don't provide the full set of voltage rails. The omitted regulators are intended to be under hardware control, such as during the hardware-mediated system powerup, powerdown, and suspend states. 
Unless/until software hooks are known to be safe, they won't be exported here. These regulators implement the new get_status() operation, but can't realistically implement get_mode(); the status output is effectively the result of a vote, with the relevant hardware inputs not exposed. Signed-off-by: David Brownell Acked-by: Mark Brown Signed-off-by: Liam Girdwood --- drivers/regulator/Kconfig | 7 + drivers/regulator/Makefile | 1 + drivers/regulator/twl4030-regulator.c | 511 ++++++++++++++++++++++++++++++++++ include/linux/i2c/twl4030.h | 47 ++++ 4 files changed, 566 insertions(+) create mode 100644 drivers/regulator/twl4030-regulator.c (limited to 'include/linux') diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig index 85a1f407e755..e58c0ce65aa6 100644 --- a/drivers/regulator/Kconfig +++ b/drivers/regulator/Kconfig @@ -56,6 +56,13 @@ config REGULATOR_BQ24022 charging select between 100 mA and 500 mA charging current limit. +config REGULATOR_TWL4030 + bool "TI TWL4030/TWL5030/TPS695x0 PMIC" + depends on TWL4030_CORE + help + This driver supports the voltage regulators provided by + this family of companion chips. + config REGULATOR_WM8350 tristate "Wolfson Microelectroncis WM8350 AudioPlus PMIC" depends on MFD_WM8350 diff --git a/drivers/regulator/Makefile b/drivers/regulator/Makefile index 61b30c6ddecc..bac133afc061 100644 --- a/drivers/regulator/Makefile +++ b/drivers/regulator/Makefile @@ -8,6 +8,7 @@ obj-$(CONFIG_REGULATOR_FIXED_VOLTAGE) += fixed.o obj-$(CONFIG_REGULATOR_VIRTUAL_CONSUMER) += virtual.o obj-$(CONFIG_REGULATOR_BQ24022) += bq24022.o +obj-$(CONFIG_REGULATOR_TWL4030) += twl4030-regulator.o obj-$(CONFIG_REGULATOR_WM8350) += wm8350-regulator.o obj-$(CONFIG_REGULATOR_WM8400) += wm8400-regulator.o obj-$(CONFIG_REGULATOR_DA903X) += da903x.o diff --git a/drivers/regulator/twl4030-regulator.c b/drivers/regulator/twl4030-regulator.c new file mode 100644 index 000000000000..23f282670db5 --- /dev/null +++ b/drivers/regulator/twl4030-regulator.c @@ -0,0 +1,511 @@ +/* + * twl4030-regulator.c -- support regulators in twl4030 family chips + * + * Copyright (C) 2008 David Brownell + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include + + +/* + * The TWL4030/TW5030/TPS659x0 family chips include power management, a + * USB OTG transceiver, an RTC, ADC, PWM, and lots more. Some versions + * include an audio codec, battery charger, and more voltage regulators. + * These chips are often used in OMAP-based systems. + * + * This driver implements software-based resource control for various + * voltage regulators. This is usually augmented with state machine + * based control. + */ + +struct twlreg_info { + /* start of regulator's PM_RECEIVER control register bank */ + u8 base; + + /* twl4030 resource ID, for resource control state machine */ + u8 id; + + /* voltage in mV = table[VSEL]; table_len must be a power-of-two */ + u8 table_len; + const u16 *table; + + /* chip constraints on regulator behavior */ + u16 min_mV; + u16 max_mV; + + /* used by regulator core */ + struct regulator_desc desc; +}; + + +/* LDO control registers ... offset is from the base of its register bank. + * The first three registers of all power resource banks help hardware to + * manage the various resource groups. 
+ */ +#define VREG_GRP 0 +#define VREG_TYPE 1 +#define VREG_REMAP 2 +#define VREG_DEDICATED 3 /* LDO control */ + + +static inline int +twl4030reg_read(struct twlreg_info *info, unsigned offset) +{ + u8 value; + int status; + + status = twl4030_i2c_read_u8(TWL4030_MODULE_PM_RECEIVER, + &value, info->base + offset); + return (status < 0) ? status : value; +} + +static inline int +twl4030reg_write(struct twlreg_info *info, unsigned offset, u8 value) +{ + return twl4030_i2c_write_u8(TWL4030_MODULE_PM_RECEIVER, + value, info->base + offset); +} + +/*----------------------------------------------------------------------*/ + +/* generic power resource operations, which work on all regulators */ + +static int twl4030reg_grp(struct regulator_dev *rdev) +{ + return twl4030reg_read(rdev_get_drvdata(rdev), VREG_GRP); +} + +/* + * Enable/disable regulators by joining/leaving the P1 (processor) group. + * We assume nobody else is updating the DEV_GRP registers. + */ + +#define P3_GRP BIT(7) /* "peripherals" */ +#define P2_GRP BIT(6) /* secondary processor, modem, etc */ +#define P1_GRP BIT(5) /* CPU/Linux */ + +static int twl4030reg_is_enabled(struct regulator_dev *rdev) +{ + int state = twl4030reg_grp(rdev); + + if (state < 0) + return state; + + return (state & P1_GRP) != 0; +} + +static int twl4030reg_enable(struct regulator_dev *rdev) +{ + struct twlreg_info *info = rdev_get_drvdata(rdev); + int grp; + + grp = twl4030reg_read(info, VREG_GRP); + if (grp < 0) + return grp; + + grp |= P1_GRP; + return twl4030reg_write(info, VREG_GRP, grp); +} + +static int twl4030reg_disable(struct regulator_dev *rdev) +{ + struct twlreg_info *info = rdev_get_drvdata(rdev); + int grp; + + grp = twl4030reg_read(info, VREG_GRP); + if (grp < 0) + return grp; + + grp &= ~P1_GRP; + return twl4030reg_write(info, VREG_GRP, grp); +} + +static int twl4030reg_get_status(struct regulator_dev *rdev) +{ + int state = twl4030reg_grp(rdev); + + if (state < 0) + return state; + state &= 0x0f; + + /* assume state != WARM_RESET; we'd not be running... */ + if (!state) + return REGULATOR_STATUS_OFF; + return (state & BIT(3)) + ? REGULATOR_STATUS_NORMAL + : REGULATOR_STATUS_STANDBY; +} + +static int twl4030reg_set_mode(struct regulator_dev *rdev, unsigned mode) +{ + struct twlreg_info *info = rdev_get_drvdata(rdev); + unsigned message; + int status; + + /* We can only set the mode through state machine commands... */ + switch (mode) { + case REGULATOR_MODE_NORMAL: + message = MSG_SINGULAR(DEV_GRP_P1, info->id, RES_STATE_ACTIVE); + break; + case REGULATOR_MODE_STANDBY: + message = MSG_SINGULAR(DEV_GRP_P1, info->id, RES_STATE_SLEEP); + break; + default: + return -EINVAL; + } + + /* Ensure the resource is associated with some group */ + status = twl4030reg_grp(rdev); + if (status < 0) + return status; + if (!(status & (P3_GRP | P2_GRP | P1_GRP))) + return -EACCES; + + status = twl4030_i2c_write_u8(TWL4030_MODULE_PM_MASTER, + message >> 8, 0x15 /* PB_WORD_MSB */ ); + if (status >= 0) + return status; + + return twl4030_i2c_write_u8(TWL4030_MODULE_PM_MASTER, + message, 0x16 /* PB_WORD_LSB */ ); +} + +/*----------------------------------------------------------------------*/ + +/* + * Support for adjustable-voltage LDOs uses a four bit (or less) voltage + * select field in its control register. We use tables indexed by VSEL + * to record voltages in milliVolts. (Accuracy is about three percent.) 
+ * + * Note that VSEL values for VAUX2 changed in twl5030 and newer silicon; + * currently handled by listing two slightly different VAUX2 regulators, + * only one of which will be configured. + * + * VSEL values documented as "TI cannot support these values" are flagged + * in these tables as UNSUP() values; we normally won't assign them. + */ +#ifdef CONFIG_TWL4030_ALLOW_UNSUPPORTED +#define UNSUP_MASK 0x0000 +#else +#define UNSUP_MASK 0x8000 +#endif + +#define UNSUP(x) (UNSUP_MASK | (x)) +#define IS_UNSUP(x) (UNSUP_MASK & (x)) +#define LDO_MV(x) (~UNSUP_MASK & (x)) + + +static const u16 VAUX1_VSEL_table[] = { + UNSUP(1500), UNSUP(1800), 2500, 2800, + 3000, 3000, 3000, 3000, +}; +static const u16 VAUX2_4030_VSEL_table[] = { + UNSUP(1000), UNSUP(1000), UNSUP(1200), 1300, + 1500, 1800, UNSUP(1850), 2500, + UNSUP(2600), 2800, UNSUP(2850), UNSUP(3000), + UNSUP(3150), UNSUP(3150), UNSUP(3150), UNSUP(3150), +}; +static const u16 VAUX2_VSEL_table[] = { + 1700, 1700, 1900, 1300, + 1500, 1800, 2000, 2500, + 2100, 2800, 2200, 2300, + 2400, 2400, 2400, 2400, +}; +static const u16 VAUX3_VSEL_table[] = { + 1500, 1800, 2500, 2800, + UNSUP(3000), UNSUP(3000), UNSUP(3000), UNSUP(3000), +}; +static const u16 VAUX4_VSEL_table[] = { + 700, 1000, 1200, UNSUP(1300), + 1500, 1800, UNSUP(1850), 2500, +}; +static const u16 VMMC1_VSEL_table[] = { + 1850, 2850, 3000, 3150, +}; +static const u16 VMMC2_VSEL_table[] = { + UNSUP(1000), UNSUP(1000), UNSUP(1200), UNSUP(1300), + UNSUP(1500), UNSUP(1800), 1850, UNSUP(2500), + 2600, 2800, 2850, 3000, + 3150, 3150, 3150, 3150, +}; +static const u16 VPLL1_VSEL_table[] = { + 1000, 1200, 1300, 1800, + UNSUP(2800), UNSUP(3000), UNSUP(3000), UNSUP(3000), +}; +static const u16 VPLL2_VSEL_table[] = { + 700, 1000, 1200, 1300, + UNSUP(1500), 1800, UNSUP(1850), UNSUP(2500), + UNSUP(2600), UNSUP(2800), UNSUP(2850), UNSUP(3000), + UNSUP(3150), UNSUP(3150), UNSUP(3150), UNSUP(3150), +}; +static const u16 VSIM_VSEL_table[] = { + UNSUP(1000), UNSUP(1200), UNSUP(1300), 1800, + 2800, 3000, 3000, 3000, +}; +static const u16 VDAC_VSEL_table[] = { + 1200, 1300, 1800, 1800, +}; + + +static int +twl4030ldo_set_voltage(struct regulator_dev *rdev, int min_uV, int max_uV) +{ + struct twlreg_info *info = rdev_get_drvdata(rdev); + int vsel; + + for (vsel = 0; vsel < info->table_len; vsel++) { + int mV = info->table[vsel]; + int uV; + + if (IS_UNSUP(mV)) + continue; + uV = LDO_MV(mV) * 1000; + + /* use the first in-range value */ + if (min_uV <= uV && uV <= max_uV) + return twl4030reg_write(info, VREG_DEDICATED, vsel); + } + + return -EDOM; +} + +static int twl4030ldo_get_voltage(struct regulator_dev *rdev) +{ + struct twlreg_info *info = rdev_get_drvdata(rdev); + int vsel = twl4030reg_read(info, VREG_DEDICATED); + + if (vsel < 0) + return vsel; + + vsel &= info->table_len - 1; + return LDO_MV(info->table[vsel]) * 1000; +} + +static struct regulator_ops twl4030ldo_ops = { + .set_voltage = twl4030ldo_set_voltage, + .get_voltage = twl4030ldo_get_voltage, + + .enable = twl4030reg_enable, + .disable = twl4030reg_disable, + .is_enabled = twl4030reg_is_enabled, + + .set_mode = twl4030reg_set_mode, + + .get_status = twl4030reg_get_status, +}; + +/*----------------------------------------------------------------------*/ + +/* + * Fixed voltage LDOs don't have a VSEL field to update. 
+ */ +static int twl4030fixed_get_voltage(struct regulator_dev *rdev) +{ + struct twlreg_info *info = rdev_get_drvdata(rdev); + + return info->min_mV * 1000; +} + +static struct regulator_ops twl4030fixed_ops = { + .get_voltage = twl4030fixed_get_voltage, + + .enable = twl4030reg_enable, + .disable = twl4030reg_disable, + .is_enabled = twl4030reg_is_enabled, + + .set_mode = twl4030reg_set_mode, + + .get_status = twl4030reg_get_status, +}; + +/*----------------------------------------------------------------------*/ + +#define TWL_ADJUSTABLE_LDO(label, offset, num) { \ + .base = offset, \ + .id = num, \ + .table_len = ARRAY_SIZE(label##_VSEL_table), \ + .table = label##_VSEL_table, \ + .desc = { \ + .name = #label, \ + .id = TWL4030_REG_##label, \ + .ops = &twl4030ldo_ops, \ + .type = REGULATOR_VOLTAGE, \ + .owner = THIS_MODULE, \ + }, \ + } + +#define TWL_FIXED_LDO(label, offset, mVolts, num) { \ + .base = offset, \ + .id = num, \ + .min_mV = mVolts, \ + .max_mV = mVolts, \ + .desc = { \ + .name = #label, \ + .id = TWL4030_REG_##label, \ + .ops = &twl4030fixed_ops, \ + .type = REGULATOR_VOLTAGE, \ + .owner = THIS_MODULE, \ + }, \ + } + +/* + * We list regulators here if systems need some level of + * software control over them after boot. + */ +static struct twlreg_info twl4030_regs[] = { + TWL_ADJUSTABLE_LDO(VAUX1, 0x17, 1), + TWL_ADJUSTABLE_LDO(VAUX2_4030, 0x1b, 2), + TWL_ADJUSTABLE_LDO(VAUX2, 0x1b, 2), + TWL_ADJUSTABLE_LDO(VAUX3, 0x1f, 3), + TWL_ADJUSTABLE_LDO(VAUX4, 0x23, 4), + TWL_ADJUSTABLE_LDO(VMMC1, 0x27, 5), + TWL_ADJUSTABLE_LDO(VMMC2, 0x2b, 6), + /* + TWL_ADJUSTABLE_LDO(VPLL1, 0x2f, 7), + TWL_ADJUSTABLE_LDO(VPLL2, 0x33, 8), + */ + TWL_ADJUSTABLE_LDO(VSIM, 0x37, 9), + TWL_ADJUSTABLE_LDO(VDAC, 0x3b, 10), + /* + TWL_ADJUSTABLE_LDO(VINTANA1, 0x3f, 11), + TWL_ADJUSTABLE_LDO(VINTANA2, 0x43, 12), + TWL_ADJUSTABLE_LDO(VINTDIG, 0x47, 13), + TWL_SMPS(VIO, 0x4b, 14), + TWL_SMPS(VDD1, 0x55, 15), + TWL_SMPS(VDD2, 0x63, 16), + */ + TWL_FIXED_LDO(VUSB1V5, 0x71, 1500, 17), + TWL_FIXED_LDO(VUSB1V8, 0x74, 1800, 18), + TWL_FIXED_LDO(VUSB3V1, 0x77, 3100, 19), + /* VUSBCP is managed *only* by the USB subchip */ +}; + +static int twl4030reg_probe(struct platform_device *pdev) +{ + int i; + struct twlreg_info *info; + struct regulator_init_data *initdata; + struct regulation_constraints *c; + struct regulator_dev *rdev; + int min_uV, max_uV; + + for (i = 0, info = NULL; i < ARRAY_SIZE(twl4030_regs); i++) { + if (twl4030_regs[i].desc.id != pdev->id) + continue; + info = twl4030_regs + i; + min_uV = info->min_mV * 1000; + max_uV = info->max_mV * 1000; + break; + } + if (!info) + return -ENODEV; + + initdata = pdev->dev.platform_data; + if (!initdata) + return -EINVAL; + + /* Constrain board-specific capabilities according to what + * this driver and the chip itself can actually do. + */ + c = &initdata->constraints; + if (!c->min_uV || c->min_uV < min_uV) + c->min_uV = min_uV; + if (!c->max_uV || c->max_uV > max_uV) + c->max_uV = max_uV; + c->valid_modes_mask &= REGULATOR_MODE_NORMAL | REGULATOR_MODE_STANDBY; + c->valid_ops_mask &= REGULATOR_CHANGE_VOLTAGE + | REGULATOR_CHANGE_MODE + | REGULATOR_CHANGE_STATUS; + + rdev = regulator_register(&info->desc, &pdev->dev, initdata, info); + if (IS_ERR(rdev)) { + dev_err(&pdev->dev, "can't register %s, %ld\n", + info->desc.name, PTR_ERR(rdev)); + return PTR_ERR(rdev); + } + platform_set_drvdata(pdev, rdev); + + /* NOTE: many regulators support short-circuit IRQs (presentable + * as REGULATOR_OVER_CURRENT notifications?) 
configured via: + * - SC_CONFIG + * - SC_DETECT1 (vintana2, vmmc1/2, vaux1/2/3/4) + * - SC_DETECT2 (vusb, vdac, vio, vdd1/2, vpll2) + * - IT_CONFIG + */ + + return 0; +} + +static int __devexit twl4030reg_remove(struct platform_device *pdev) +{ + regulator_unregister(platform_get_drvdata(pdev)); + return 0; +} + +MODULE_ALIAS("platform:twl4030_reg"); + +static struct platform_driver twl4030reg_driver = { + .probe = twl4030reg_probe, + .remove = __devexit_p(twl4030reg_remove), + /* NOTE: short name, to work around driver model truncation of + * "twl4030_regulator.12" (and friends) to "twl4030_regulator.1". + */ + .driver.name = "twl4030_reg", + .driver.owner = THIS_MODULE, +}; + +static int __init twl4030reg_init(void) +{ + unsigned i, j; + + /* determine min/max voltage constraints, taking into account + * whether set_voltage() will use the "unsupported" settings + */ + for (i = 0; i < ARRAY_SIZE(twl4030_regs); i++) { + struct twlreg_info *info = twl4030_regs + i; + const u16 *table; + + /* fixed-voltage regulators */ + if (!info->table_len) + continue; + + /* LDO regulators: */ + for (j = 0, table = info->table; + j < info->table_len; + j++, table++) { + u16 mV = *table; + + if (IS_UNSUP(mV)) + continue; + mV = LDO_MV(mV); + + if (info->min_mV == 0 || info->min_mV > mV) + info->min_mV = mV; + if (info->max_mV < mV) + info->max_mV = mV; + } + } + + return platform_driver_register(&twl4030reg_driver); +} +subsys_initcall(twl4030reg_init); + +static void __exit twl4030reg_exit(void) +{ + platform_driver_unregister(&twl4030reg_driver); +} +module_exit(twl4030reg_exit) + +MODULE_DESCRIPTION("TWL4030 regulator driver"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/i2c/twl4030.h b/include/linux/i2c/twl4030.h index 8137f660a5cc..0dc80ef24975 100644 --- a/include/linux/i2c/twl4030.h +++ b/include/linux/i2c/twl4030.h @@ -218,6 +218,53 @@ int twl4030_i2c_read(u8 mod_no, u8 *value, u8 reg, unsigned num_bytes); /*----------------------------------------------------------------------*/ +/* Power bus message definitions */ + +#define DEV_GRP_NULL 0x0 +#define DEV_GRP_P1 0x1 +#define DEV_GRP_P2 0x2 +#define DEV_GRP_P3 0x4 + +#define RES_GRP_RES 0x0 +#define RES_GRP_PP 0x1 +#define RES_GRP_RC 0x2 +#define RES_GRP_PP_RC 0x3 +#define RES_GRP_PR 0x4 +#define RES_GRP_PP_PR 0x5 +#define RES_GRP_RC_PR 0x6 +#define RES_GRP_ALL 0x7 + +#define RES_TYPE2_R0 0x0 + +#define RES_TYPE_ALL 0x7 + +#define RES_STATE_WRST 0xF +#define RES_STATE_ACTIVE 0xE +#define RES_STATE_SLEEP 0x8 +#define RES_STATE_OFF 0x0 + +/* + * Power Bus Message Format ... these can be sent individually by Linux, + * but are usually part of downloaded scripts that are run when various + * power events are triggered. + * + * Broadcast Message (16 Bits): + * DEV_GRP[15:13] MT[12] RES_GRP[11:9] RES_TYPE2[8:7] RES_TYPE[6:4] + * RES_STATE[3:0] + * + * Singular Message (16 Bits): + * DEV_GRP[15:13] MT[12] RES_ID[11:4] RES_STATE[3:0] + */ + +#define MSG_BROADCAST(devgrp, grp, type, type2, state) \ + ( (devgrp) << 13 | 1 << 12 | (grp) << 9 | (type2) << 7 \ + | (type) << 4 | (state)) + +#define MSG_SINGULAR(devgrp, id, state) \ + ((devgrp) << 13 | 0 << 12 | (id) << 4 | (state)) + +/*----------------------------------------------------------------------*/ + struct twl4030_bci_platform_data { int *battery_tmp_tbl; unsigned int tblsize; -- cgit v1.2.3 From 5c13941acc513669c7d07b28789c3f9ba66ddddf Mon Sep 17 00:00:00 2001 From: David Brownell Date: Wed, 11 Mar 2009 03:30:43 -0800 Subject: MMC: regulator utilities Glue between MMC and regulator stacks ... 
verified with some OMAP3 boards using adjustable and configured-as-fixed regulators on several MMC controllers. These calls are intended to be used by MMC host adapters using at least one regulator per host. Examples include slots with regulators supporting multiple voltages and ones using multiple voltage rails (e.g. DAT4..DAT7 using a separate supply, or a split rail chip like certain SDIO WLAN or eMMC solutions). Signed-off-by: David Brownell Acked-by: Pierre Ossman Signed-off-by: Liam Girdwood --- drivers/mmc/core/core.c | 100 +++++++++++++++++++++++++++++++++++++++++++++++ include/linux/mmc/host.h | 5 +++ 2 files changed, 105 insertions(+) (limited to 'include/linux') diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index df6ce4a06cf3..1445ea8f10a6 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -523,6 +524,105 @@ u32 mmc_vddrange_to_ocrmask(int vdd_min, int vdd_max) } EXPORT_SYMBOL(mmc_vddrange_to_ocrmask); +#ifdef CONFIG_REGULATOR + +/** + * mmc_regulator_get_ocrmask - return mask of supported voltages + * @supply: regulator to use + * + * This returns either a negative errno, or a mask of voltages that + * can be provided to MMC/SD/SDIO devices using the specified voltage + * regulator. This would normally be called before registering the + * MMC host adapter. + */ +int mmc_regulator_get_ocrmask(struct regulator *supply) +{ + int result = 0; + int count; + int i; + + count = regulator_count_voltages(supply); + if (count < 0) + return count; + + for (i = 0; i < count; i++) { + int vdd_uV; + int vdd_mV; + + vdd_uV = regulator_list_voltage(supply, i); + if (vdd_uV <= 0) + continue; + + vdd_mV = vdd_uV / 1000; + result |= mmc_vddrange_to_ocrmask(vdd_mV, vdd_mV); + } + + return result; +} +EXPORT_SYMBOL(mmc_regulator_get_ocrmask); + +/** + * mmc_regulator_set_ocr - set regulator to match host->ios voltage + * @vdd_bit: zero for power off, else a bit number (host->ios.vdd) + * @supply: regulator to use + * + * Returns zero on success, else negative errno. + * + * MMC host drivers may use this to enable or disable a regulator using + * a particular supply voltage. This would normally be called from the + * set_ios() method. + */ +int mmc_regulator_set_ocr(struct regulator *supply, unsigned short vdd_bit) +{ + int result = 0; + int min_uV, max_uV; + int enabled; + + enabled = regulator_is_enabled(supply); + if (enabled < 0) + return enabled; + + if (vdd_bit) { + int tmp; + int voltage; + + /* REVISIT mmc_vddrange_to_ocrmask() may have set some + * bits this regulator doesn't quite support ... don't + * be too picky, most cards and regulators are OK with + * a 0.1V range goof (it's a small error percentage). 
+ */ + tmp = vdd_bit - ilog2(MMC_VDD_165_195); + if (tmp == 0) { + min_uV = 1650 * 1000; + max_uV = 1950 * 1000; + } else { + min_uV = 1900 * 1000 + tmp * 100 * 1000; + max_uV = min_uV + 100 * 1000; + } + + /* avoid needless changes to this voltage; the regulator + * might not allow this operation + */ + voltage = regulator_get_voltage(supply); + if (voltage < 0) + result = voltage; + else if (voltage < min_uV || voltage > max_uV) + result = regulator_set_voltage(supply, min_uV, max_uV); + else + result = 0; + + if (result == 0 && !enabled) + result = regulator_enable(supply); + } else if (enabled) { + result = regulator_disable(supply); + } + + return result; +} +EXPORT_SYMBOL(mmc_regulator_set_ocr); + +#endif + /* * Mask off any voltages we don't support and select * the lowest voltage diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 4e457256bd33..3e7615e9087e 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -192,5 +192,10 @@ static inline void mmc_signal_sdio_irq(struct mmc_host *host) wake_up_process(host->sdio_irq_thread); } +struct regulator; + +int mmc_regulator_get_ocrmask(struct regulator *supply); +int mmc_regulator_set_ocr(struct regulator *supply, unsigned short vdd_bit); + #endif -- cgit v1.2.3 From cacf90f24e80cec9334f98e0377149f943fe9f16 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 2 Mar 2009 16:32:46 +0000 Subject: regulator: Allow boot_on regulators to be disabled by clients Rather than incrementing the reference count for boot_on regulators (which prevents them being disabled later on) simply force the regulator to be enabled when applying the constraints. Previously boot_on was essentially equivalent to always_on. Signed-off-by: Mark Brown Signed-off-by: Liam Girdwood --- drivers/regulator/core.c | 13 ++++--------- include/linux/regulator/machine.h | 4 +++- 2 files changed, 7 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index 2ff76349f392..08441e24946e 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -776,10 +776,6 @@ static int set_machine_constraints(struct regulator_dev *rdev, } } - /* are we enabled at boot time by firmware / bootloader */ - if (rdev->constraints->boot_on) - rdev->use_count = 1; - /* do we need to setup our suspend state */ if (constraints->initial_state) { ret = suspend_prepare(rdev, constraints->initial_state); @@ -808,11 +804,10 @@ static int set_machine_constraints(struct regulator_dev *rdev, } } - /* if always_on is set then turn the regulator on if it's not - * already on. */ - if (constraints->always_on && ops->enable && - ((ops->is_enabled && !ops->is_enabled(rdev)) || - (!ops->is_enabled && !constraints->boot_on))) { + /* If the constraints say the regulator should be on at this point + * and we have control then make sure it is enabled. + */ + if ((constraints->always_on || constraints->boot_on) && ops->enable) { ret = ops->enable(rdev); if (ret < 0) { printk(KERN_ERR "%s: failed to enable %s\n", diff --git a/include/linux/regulator/machine.h b/include/linux/regulator/machine.h index 1eb861cf4b2c..5de7aa3b02a6 100644 --- a/include/linux/regulator/machine.h +++ b/include/linux/regulator/machine.h @@ -73,7 +73,9 @@ struct regulator_state { * * @always_on: Set if the regulator should never be disabled. * @boot_on: Set if the regulator is enabled when the system is initially - * started. + * started. 
If the regulator is not enabled by the hardware or + * bootloader then it will be enabled when the constraints are + * applied. * @apply_uV: Apply the voltage constraint when initialising. * * @input_uV: Input voltage for regulator when supplied by another regulator. -- cgit v1.2.3 From ca7255614e0861e36480103f4a402a115803d7b5 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 16 Mar 2009 19:36:34 +0000 Subject: regulator: Support disabling of unused regulators by machines At present it is not possible for machine constraints to disable regulators which have been left on when the system starts, for example as a result of fixed default configurations in hardware. This means that power may be wasted by these regulators if they are not in use. Provide initial support for this with a late_initcall which will disable any unused regulators if the machine has enabled this feature by calling regulator_has_full_constraints(). If this has not been called then print a warning to encourage users to fully specify their constraints so that we can change this to be the default behaviour in future. Signed-off-by: Mark Brown Signed-off-by: Liam Girdwood --- drivers/regulator/core.c | 92 +++++++++++++++++++++++++++++++++++++++ include/linux/regulator/machine.h | 2 + 2 files changed, 94 insertions(+) (limited to 'include/linux') diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index 8588a2490e0a..01f7702a805d 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -28,6 +28,7 @@ static DEFINE_MUTEX(regulator_list_mutex); static LIST_HEAD(regulator_list); static LIST_HEAD(regulator_map_list); +static int has_full_constraints; /* * struct regulator_map @@ -2142,6 +2143,23 @@ out: } EXPORT_SYMBOL_GPL(regulator_suspend_prepare); +/** + * regulator_has_full_constraints - the system has fully specified constraints + * + * Calling this function will cause the regulator API to disable all + * regulators which have a zero use count and don't have an always_on + * constraint in a late_initcall. + * + * The intention is that this will become the default behaviour in a + * future kernel release so users are encouraged to use this facility + * now. + */ +void regulator_has_full_constraints(void) +{ + has_full_constraints = 1; +} +EXPORT_SYMBOL_GPL(regulator_has_full_constraints); + /** * rdev_get_drvdata - get rdev regulator driver data * @rdev: regulator @@ -2209,3 +2227,77 @@ static int __init regulator_init(void) /* init early to allow our consumers to complete system booting */ core_initcall(regulator_init); + +static int __init regulator_init_complete(void) +{ + struct regulator_dev *rdev; + struct regulator_ops *ops; + struct regulation_constraints *c; + int enabled, ret; + const char *name; + + mutex_lock(&regulator_list_mutex); + + /* If we have a full configuration then disable any regulators + * which are not in use or always_on. This will become the + * default behaviour in the future. + */ + list_for_each_entry(rdev, &regulator_list, list) { + ops = rdev->desc->ops; + c = rdev->constraints; + + if (c->name) + name = c->name; + else if (rdev->desc->name) + name = rdev->desc->name; + else + name = "regulator"; + + if (!ops->disable || c->always_on) + continue; + + mutex_lock(&rdev->mutex); + + if (rdev->use_count) + goto unlock; + + /* If we can't read the status assume it's on. 
*/ + if (ops->is_enabled) + enabled = ops->is_enabled(rdev); + else + enabled = 1; + + if (!enabled) + goto unlock; + + if (has_full_constraints) { + /* We log since this may kill the system if it + * goes wrong. */ + printk(KERN_INFO "%s: disabling %s\n", + __func__, name); + ret = ops->disable(rdev); + if (ret != 0) { + printk(KERN_ERR + "%s: couldn't disable %s: %d\n", + __func__, name, ret); + } + } else { + /* The intention is that in future we will + * assume that full constraints are provided + * so warn even if we aren't going to do + * anything here. + */ + printk(KERN_WARNING + "%s: incomplete constraints, leaving %s on\n", + __func__, name); + } + +unlock: + mutex_unlock(&rdev->mutex); + } + + mutex_unlock(&regulator_list_mutex); + + return 0; +} +late_initcall(regulator_init_complete); diff --git a/include/linux/regulator/machine.h b/include/linux/regulator/machine.h index 5de7aa3b02a6..bac64fa390f2 100644 --- a/include/linux/regulator/machine.h +++ b/include/linux/regulator/machine.h @@ -166,4 +166,6 @@ struct regulator_init_data { int regulator_suspend_prepare(suspend_state_t state); +void regulator_has_full_constraints(void); + #endif -- cgit v1.2.3 From 7f1e2ca9f04b02794597f60e7b1d43f0a1317939 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Mar 2009 12:21:27 +0100 Subject: hrtimer: fix rq->lock inversion (again) It appears I inadvertently introduced rq->lock recursion to the hrtimer_start() path when I delegated running already expired timers to softirq context. This patch fixes it by introducing a __hrtimer_start_range_ns() method that will not use raise_softirq_irqoff() but __raise_softirq_irqoff() which avoids the wakeup. It then also changes schedule() to check for pending softirqs and do the wakeup then, I'm not quite sure I like this last bit, nor am I convinced it's really needed. Signed-off-by: Peter Zijlstra Cc: Peter Zijlstra Cc: paulus@samba.org LKML-Reference: <20090313112301.096138802@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/hrtimer.h | 5 +++++ include/linux/interrupt.h | 1 + kernel/hrtimer.c | 55 +++++++++++++++++++++++++++++------------------ kernel/sched.c | 14 +++++++++--- kernel/softirq.c | 2 +- 5 files changed, 52 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index bd37078c2d7d..0d2f7c8a33d6 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -336,6 +336,11 @@ extern int hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode); extern int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long range_ns, const enum hrtimer_mode mode); +extern int +__hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, + unsigned long delta_ns, + const enum hrtimer_mode mode, int wakeup); + extern int hrtimer_cancel(struct hrtimer *timer); extern int hrtimer_try_to_cancel(struct hrtimer *timer); diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index c68bffd182bb..4528bf70866a 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -294,6 +294,7 @@ extern void softirq_init(void); #define __raise_softirq_irqoff(nr) do { or_softirq_pending(1UL << (nr)); } while (0) extern void raise_softirq_irqoff(unsigned int nr); extern void raise_softirq(unsigned int nr); +extern void wakeup_softirqd(void); /* This is the worklist that queues up per-cpu softirq work. 
* diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index f394d2a42ca3..cb8a15c19583 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -651,14 +651,20 @@ static inline void hrtimer_init_timer_hres(struct hrtimer *timer) * and expiry check is done in the hrtimer_interrupt or in the softirq. */ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base) + struct hrtimer_clock_base *base, + int wakeup) { if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) { - spin_unlock(&base->cpu_base->lock); - raise_softirq_irqoff(HRTIMER_SOFTIRQ); - spin_lock(&base->cpu_base->lock); + if (wakeup) { + spin_unlock(&base->cpu_base->lock); + raise_softirq_irqoff(HRTIMER_SOFTIRQ); + spin_lock(&base->cpu_base->lock); + } else + __raise_softirq_irqoff(HRTIMER_SOFTIRQ); + return 1; } + return 0; } @@ -703,7 +709,8 @@ static inline int hrtimer_is_hres_enabled(void) { return 0; } static inline int hrtimer_switch_to_hres(void) { return 0; } static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base) { } static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base) + struct hrtimer_clock_base *base, + int wakeup) { return 0; } @@ -886,20 +893,9 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) return 0; } -/** - * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU - * @timer: the timer to be added - * @tim: expiry time - * @delta_ns: "slack" range for the timer - * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) - * - * Returns: - * 0 on success - * 1 when the timer was active - */ -int -hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_ns, - const enum hrtimer_mode mode) +int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, + unsigned long delta_ns, const enum hrtimer_mode mode, + int wakeup) { struct hrtimer_clock_base *base, *new_base; unsigned long flags; @@ -940,12 +936,29 @@ hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_n * XXX send_remote_softirq() ? 
*/ if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases)) - hrtimer_enqueue_reprogram(timer, new_base); + hrtimer_enqueue_reprogram(timer, new_base, wakeup); unlock_hrtimer_base(timer, &flags); return ret; } + +/** + * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU + * @timer: the timer to be added + * @tim: expiry time + * @delta_ns: "slack" range for the timer + * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) + * + * Returns: + * 0 on success + * 1 when the timer was active + */ +int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, + unsigned long delta_ns, const enum hrtimer_mode mode) +{ + return __hrtimer_start_range_ns(timer, tim, delta_ns, mode, 1); +} EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); /** @@ -961,7 +974,7 @@ EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); int hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) { - return hrtimer_start_range_ns(timer, tim, 0, mode); + return __hrtimer_start_range_ns(timer, tim, 0, mode, 1); } EXPORT_SYMBOL_GPL(hrtimer_start); diff --git a/kernel/sched.c b/kernel/sched.c index 196d48babbef..63256e3ede2a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -231,13 +231,20 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) spin_lock(&rt_b->rt_runtime_lock); for (;;) { + unsigned long delta; + ktime_t soft, hard; + if (hrtimer_active(&rt_b->rt_period_timer)) break; now = hrtimer_cb_get_time(&rt_b->rt_period_timer); hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); - hrtimer_start_expires(&rt_b->rt_period_timer, - HRTIMER_MODE_ABS); + + soft = hrtimer_get_softexpires(&rt_b->rt_period_timer); + hard = hrtimer_get_expires(&rt_b->rt_period_timer); + delta = ktime_to_ns(ktime_sub(hard, soft)); + __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta, + HRTIMER_MODE_ABS, 0); } spin_unlock(&rt_b->rt_runtime_lock); } @@ -1146,7 +1153,8 @@ static __init void init_hrtick(void) */ static void hrtick_start(struct rq *rq, u64 delay) { - hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL); + __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, + HRTIMER_MODE_REL, 0); } static inline void init_hrtick(void) diff --git a/kernel/softirq.c b/kernel/softirq.c index 487751604300..accc85197c49 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -58,7 +58,7 @@ static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); * to the pending events, so lets the scheduler to balance * the softirq load for us. */ -static inline void wakeup_softirqd(void) +void wakeup_softirqd(void) { /* Interrupts are disabled: no need to stop preemption */ struct task_struct *tsk = __get_cpu_var(ksoftirqd); -- cgit v1.2.3 From 9310933c832719b095f82dab30c6bf4e75e937ee Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Sat, 28 Mar 2009 15:07:16 -0600 Subject: powerpc: Remove unused symbols from fsl_devices.h Remove old artifacts leftover from the platform driver gianfar and fsl_i2c drivers. These symbols became unused when the drivers were migrated over to use the of_platform bus. 
Signed-off-by: Grant Likely Signed-off-by: Kumar Gala --- include/linux/fsl_devices.h | 22 ---------------------- 1 file changed, 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fsl_devices.h b/include/linux/fsl_devices.h index d9051d717d27..4953709f687a 100644 --- a/include/linux/fsl_devices.h +++ b/include/linux/fsl_devices.h @@ -18,7 +18,6 @@ #define _FSL_DEVICE_H_ #include -#include /* * Some conventions on how we handle peripherals on Freescale chips @@ -44,27 +43,6 @@ * */ -struct gianfar_platform_data { - /* device specific information */ - u32 device_flags; - char bus_id[BUS_ID_SIZE]; - phy_interface_t interface; -}; - -struct gianfar_mdio_data { - /* board specific information */ - int irq[32]; -}; - -/* Flags in gianfar_platform_data */ -#define FSL_GIANFAR_BRD_HAS_PHY_INTR 0x00000001 /* set or use a timer */ -#define FSL_GIANFAR_BRD_IS_REDUCED 0x00000002 /* Set if RGMII, RMII */ - -struct fsl_i2c_platform_data { - /* device specific information */ - u32 device_flags; -}; - /* Flags related to I2C device features */ #define FSL_I2C_DEV_SEPARATE_DFSRR 0x00000001 #define FSL_I2C_DEV_CLOCK_5200 0x00000002 -- cgit v1.2.3 From a08915ba594da66145f33a972db578a58b9135f1 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:15:13 +0200 Subject: ide-cd: use scatterlists for PIO transfers (fs requests) * Export ide_pio_bytes(). * Add ->last_xfer_len field to struct ide_cmd. * Add ide_cd_error_cmd() helper to ide-cd. * Convert ide-cd to use scatterlists also for PIO transfers (fs requests only for now) and get rid of partial completions (except when the error happens -- which is still subject to change later because looking at ATAPI spec it seems that the device is free to error the whole transfer with setting the Error bit only on the last transfer chunk). * Update ide_cd_{prepare_rw,restore_request,do_request}() accordingly. * Inline ide_cd_restore_request() into cdrom_start_rw(). Cc: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-cd.c | 146 +++++++++++++-------------------------------- drivers/ide/ide-taskfile.c | 5 +- include/linux/ide.h | 4 ++ 3 files changed, 50 insertions(+), 105 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 30113e69c8bb..5f15859c2c73 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -539,64 +539,12 @@ static ide_startstop_t ide_cd_prepare_rw_request(ide_drive_t *drive, { ide_debug_log(IDE_DBG_RQ, "rq->cmd_flags: 0x%x", rq->cmd_flags); - if (rq_data_dir(rq) == READ) { - unsigned short sectors_per_frame = - queue_hardsect_size(drive->queue) >> SECTOR_BITS; - int nskip = rq->sector & (sectors_per_frame - 1); - - /* - * If the requested sector doesn't start on a frame boundary, - * we must adjust the start of the transfer so that it does, - * and remember to skip the first few sectors. - * - * If the rq->current_nr_sectors field is larger than the size - * of the buffer, it will mean that we're to skip a number of - * sectors equal to the amount by which rq->current_nr_sectors - * is larger than the buffer size. - */ - if (nskip > 0) { - /* sanity check... 
*/ - if (rq->current_nr_sectors != - bio_cur_sectors(rq->bio)) { - printk(KERN_ERR PFX "%s: %s: buffer botch (%u)\n", - drive->name, __func__, - rq->current_nr_sectors); - return ide_stopped; - } - rq->current_nr_sectors += nskip; - } - } - /* set up the command */ rq->timeout = ATAPI_WAIT_PC; return ide_started; } -/* - * Fix up a possibly partially-processed request so that we can start it over - * entirely, or even put it back on the request queue. - */ -static void ide_cd_restore_request(ide_drive_t *drive, struct request *rq) -{ - - ide_debug_log(IDE_DBG_FUNC, "enter"); - - if (rq->buffer != bio_data(rq->bio)) { - sector_t n = - (rq->buffer - (char *)bio_data(rq->bio)) / SECTOR_SIZE; - - rq->buffer = bio_data(rq->bio); - rq->nr_sectors += n; - rq->sector -= n; - } - rq->current_nr_sectors = bio_cur_sectors(rq->bio); - rq->hard_cur_sectors = rq->current_nr_sectors; - rq->hard_nr_sectors = rq->nr_sectors; - rq->hard_sector = rq->sector; - rq->q->prep_rq_fn(rq->q, rq); -} - static void ide_cd_request_sense_fixup(ide_drive_t *drive, struct request *rq) { ide_debug_log(IDE_DBG_FUNC, "rq->cmd[0]: 0x%x", rq->cmd[0]); @@ -690,6 +638,17 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd, return (flags & REQ_FAILED) ? -EIO : 0; } +static void ide_cd_error_cmd(ide_drive_t *drive, struct ide_cmd *cmd) +{ + unsigned int nr_bytes = cmd->nbytes - cmd->nleft; + + if (cmd->tf_flags & IDE_TFLAG_WRITE) + nr_bytes -= cmd->last_xfer_len; + + if (nr_bytes > 0) + ide_complete_rq(drive, 0, nr_bytes); +} + /* * Called from blk_end_request_callback() after the data of the request is * completed and before the request itself is completed. By returning value '1', @@ -703,6 +662,7 @@ static int cdrom_newpc_intr_dummy_cb(struct request *rq) static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) { ide_hwif_t *hwif = drive->hwif; + struct ide_cmd *cmd = &hwif->cmd; struct request *rq = hwif->rq; xfer_func_t *xferfunc; ide_expiry_t *expiry = NULL; @@ -769,11 +729,10 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) * Otherwise, complete the command normally. */ uptodate = 1; - if (rq->current_nr_sectors > 0) { + if (cmd->nleft > 0) { printk(KERN_ERR PFX "%s: %s: data underrun " - "(%d blocks)\n", - drive->name, __func__, - rq->current_nr_sectors); + "(%u bytes)\n", drive->name, __func__, + cmd->nleft); if (!write) rq->cmd_flags |= REQ_FAILED; uptodate = 0; @@ -795,24 +754,10 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) if (blk_fs_request(rq)) { if (write == 0) { - int nskip; - if (ide_cd_check_transfer_size(drive, len)) goto out_end; - - /* - * First, figure out if we need to bit-bucket - * any of the leading sectors. - */ - nskip = min_t(int, rq->current_nr_sectors - - bio_cur_sectors(rq->bio), - thislen >> 9); - if (nskip > 0) { - ide_pad_transfer(drive, write, nskip << 9); - rq->current_nr_sectors -= nskip; - thislen -= (nskip << 9); - } } + cmd->last_xfer_len = 0; } if (ireason == 0) { @@ -835,15 +780,15 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) /* bio backed? 
*/ if (rq->bio) { if (blk_fs_request(rq)) { - ptr = rq->buffer; - blen = rq->current_nr_sectors << 9; + blen = min_t(int, thislen, cmd->nleft); } else { ptr = bio_data(rq->bio); blen = bio_iovec(rq->bio)->bv_len; } } - if (!ptr) { + if ((blk_fs_request(rq) && cmd->nleft == 0) || + (blk_fs_request(rq) == 0 && ptr == NULL)) { if (blk_fs_request(rq) && !write) /* * If the buffers are full, pipe the rest into @@ -863,26 +808,16 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) if (blen > thislen) blen = thislen; - xferfunc(drive, NULL, ptr, blen); + if (blk_fs_request(rq)) { + ide_pio_bytes(drive, cmd, write, blen); + cmd->last_xfer_len += blen; + } else + xferfunc(drive, NULL, ptr, blen); thislen -= blen; len -= blen; - if (blk_fs_request(rq)) { - rq->buffer += blen; - rq->nr_sectors -= (blen >> 9); - rq->current_nr_sectors -= (blen >> 9); - rq->sector += (blen >> 9); - - if (rq->current_nr_sectors == 0 && rq->nr_sectors) { - nsectors = rq->hard_cur_sectors; - - if (nsectors == 0) - nsectors = 1; - - ide_complete_rq(drive, 0, nsectors << 9); - } - } else { + if (blk_fs_request(rq) == 0) { rq->data_len -= blen; /* @@ -933,8 +868,10 @@ out_end: ide_cd_complete_failed_rq(drive, rq); if (blk_fs_request(rq)) { - if (rq->current_nr_sectors == 0) + if (cmd->nleft == 0) uptodate = 1; + if (uptodate == 0) + ide_cd_error_cmd(drive, cmd); } else { if (uptodate <= 0 && rq->errors == 0) rq->errors = -EIO; @@ -944,7 +881,7 @@ out_end: if (blk_pc_request(rq)) nsectors = (rq->data_len + 511) >> 9; else - nsectors = rq->hard_cur_sectors; + nsectors = rq->hard_nr_sectors; if (nsectors == 0) nsectors = 1; @@ -960,9 +897,10 @@ out_end: static ide_startstop_t cdrom_start_rw(ide_drive_t *drive, struct request *rq) { struct cdrom_info *cd = drive->driver_data; + struct request_queue *q = drive->queue; int write = rq_data_dir(rq) == WRITE; unsigned short sectors_per_frame = - queue_hardsect_size(drive->queue) >> SECTOR_BITS; + queue_hardsect_size(q) >> SECTOR_BITS; ide_debug_log(IDE_DBG_RQ, "rq->cmd[0]: 0x%x, write: 0x%x, " "secs_per_frame: %u", @@ -977,17 +915,16 @@ static ide_startstop_t cdrom_start_rw(ide_drive_t *drive, struct request *rq) * We may be retrying this request after an error. Fix up any * weirdness which might be present in the request packet. 
*/ - ide_cd_restore_request(drive, rq); + q->prep_rq_fn(q, rq); } - /* use DMA, if possible / writes *must* be hardware frame aligned */ + /* fs requests *must* be hardware frame aligned */ if ((rq->nr_sectors & (sectors_per_frame - 1)) || - (rq->sector & (sectors_per_frame - 1))) { - if (write) - return ide_stopped; - drive->dma = 0; - } else - drive->dma = !!(drive->dev_flags & IDE_DFLAG_USING_DMA); + (rq->sector & (sectors_per_frame - 1))) + return ide_stopped; + + /* use DMA, if possible */ + drive->dma = !!(drive->dev_flags & IDE_DFLAG_USING_DMA); if (write) cd->devinfo.media_written = 1; @@ -1050,8 +987,6 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq, if (blk_fs_request(rq)) { if (cdrom_start_rw(drive, rq) == ide_stopped || ide_cd_prepare_rw_request(drive, rq) == ide_stopped) { - if (rq->current_nr_sectors == 0) - uptodate = 1; goto out_end; } } else if (blk_sense_request(rq) || blk_pc_request(rq) || @@ -1078,6 +1013,11 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq, cmd.rq = rq; + if (blk_fs_request(rq)) { + ide_init_sg_cmd(&cmd, rq->nr_sectors << 9); + ide_map_sg(drive, &cmd); + } + return ide_issue_pc(drive, &cmd); out_end: nsectors = rq->hard_nr_sectors; diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index 0e333ecf2ad6..a3b7a50562b2 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -188,8 +188,8 @@ static u8 wait_drive_not_busy(ide_drive_t *drive) return stat; } -static void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd, - unsigned int write, unsigned int len) +void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd, + unsigned int write, unsigned int len) { ide_hwif_t *hwif = drive->hwif; struct scatterlist *sg = hwif->sg_table; @@ -243,6 +243,7 @@ static void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd, len -= nr_bytes; } } +EXPORT_SYMBOL_GPL(ide_pio_bytes); static void ide_pio_datablock(ide_drive_t *drive, struct ide_cmd *cmd, unsigned int write) diff --git a/include/linux/ide.h b/include/linux/ide.h index d5d832271f44..c2841c0c36c8 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -352,6 +352,8 @@ struct ide_cmd { unsigned int nbytes; unsigned int nleft; + unsigned int last_xfer_len; + struct scatterlist *cursg; unsigned int cursg_ofs; @@ -1226,6 +1228,8 @@ ide_startstop_t ide_issue_pc(ide_drive_t *, struct ide_cmd *); ide_startstop_t do_rw_taskfile(ide_drive_t *, struct ide_cmd *); +void ide_pio_bytes(ide_drive_t *, struct ide_cmd *, unsigned int, unsigned int); + void ide_finish_cmd(ide_drive_t *, struct ide_cmd *, u8); int ide_raw_taskfile(ide_drive_t *, struct ide_cmd *, u8 *, u16); -- cgit v1.2.3 From 06a449e30135aabb6686c95bf0c42b46d169a3b3 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:15:13 +0200 Subject: ide-cd: fix non-SECTOR_SIZE-multiples PIO transfers for fs requests We now support arbitrary number of bytes per-IRQ also for fs requests so remove ide_cd_check_transfer_size() and IDE_AFLAG_LIMIT_NFRAMES. 
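For illustration only, a minimal sketch of the per-interrupt transfer step that the two ide-cd changes in this series converge on for fs requests: the chunk length now comes from the scatterlist bookkeeping in struct ide_cmd rather than rq->current_nr_sectors, so chunks no longer have to be SECTOR_SIZE multiples. example_xfer_chunk() is a hypothetical helper name used here for clarity; the real logic lives inline in cdrom_newpc_intr().

static void example_xfer_chunk(ide_drive_t *drive, struct ide_cmd *cmd,
			       unsigned int write, int thislen)
{
	/* bound this chunk by what the command still expects */
	int blen = min_t(int, thislen, cmd->nleft);

	/* move blen bytes through the scatterlist-backed buffer */
	ide_pio_bytes(drive, cmd, write, blen);
	cmd->last_xfer_len += blen;
}
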
Cc: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-cd.c | 36 +----------------------------------- include/linux/ide.h | 5 ----- 2 files changed, 1 insertion(+), 40 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 5f15859c2c73..c0cefe5becf3 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -509,31 +509,6 @@ static int ide_cd_check_ireason(ide_drive_t *drive, struct request *rq, return -1; } -/* - * Assume that the drive will always provide data in multiples of at least - * SECTOR_SIZE, as it gets hairy to keep track of the transfers otherwise. - */ -static int ide_cd_check_transfer_size(ide_drive_t *drive, int len) -{ - ide_debug_log(IDE_DBG_FUNC, "len: %d", len); - - if ((len % SECTOR_SIZE) == 0) - return 0; - - printk(KERN_ERR PFX "%s: %s: Bad transfer size %d\n", drive->name, - __func__, len); - - if (drive->atapi_flags & IDE_AFLAG_LIMIT_NFRAMES) - printk(KERN_ERR PFX "This drive is not supported by this " - "version of the driver\n"); - else { - printk(KERN_ERR PFX "Trying to limit transfer sizes\n"); - drive->atapi_flags |= IDE_AFLAG_LIMIT_NFRAMES; - } - - return 1; -} - static ide_startstop_t ide_cd_prepare_rw_request(ide_drive_t *drive, struct request *rq) { @@ -752,13 +727,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) if (rc) goto out_end; - if (blk_fs_request(rq)) { - if (write == 0) { - if (ide_cd_check_transfer_size(drive, len)) - goto out_end; - } - cmd->last_xfer_len = 0; - } + cmd->last_xfer_len = 0; if (ireason == 0) { write = 1; @@ -1619,9 +1588,6 @@ static const struct ide_proc_devset *ide_cd_proc_devsets(ide_drive_t *drive) #endif static const struct cd_list_entry ide_cd_quirks_list[] = { - /* Limit transfer size per interrupt. */ - { "SAMSUNG CD-ROM SCR-2430", NULL, IDE_AFLAG_LIMIT_NFRAMES }, - { "SAMSUNG CD-ROM SCR-2432", NULL, IDE_AFLAG_LIMIT_NFRAMES }, /* SCR-3231 doesn't support the SET_CD_SPEED command. */ { "SAMSUNG CD-ROM SCR-3231", NULL, IDE_AFLAG_NO_SPEED_SELECT }, /* Old NEC260 (not R) was released before ATAPI 1.2 spec. */ diff --git a/include/linux/ide.h b/include/linux/ide.h index c2841c0c36c8..cb501bf78f7d 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -458,11 +458,6 @@ enum { IDE_AFLAG_TOCADDR_AS_BCD = (1 << 3), /* TOC track numbers are in BCD. */ IDE_AFLAG_TOCTRACKS_AS_BCD = (1 << 4), - /* - * Drive does not provide data in multiples of SECTOR_SIZE - * when more than one interrupt is needed. - */ - IDE_AFLAG_LIMIT_NFRAMES = (1 << 5), /* Saved TOC information is current. */ IDE_AFLAG_TOC_VALID = (1 << 6), /* We think that the drive door is locked. */ -- cgit v1.2.3 From 35c9b4daf4c94b30e5cede597d98016ebf31b5ad Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:15:19 +0200 Subject: ide: add ->dma_clear method and remove ->dma_timeout one All custom ->dma_timeout implementations call the generic one thus it is possible to have only an optional method for resetting DMA engine instead: * Add ->dma_clear method and convert hpt366, pdc202xx_old and sl82c105 host drivers to use it. * Always use ide_dma_timeout() in ide_dma_timeout_retry() and remove ->dma_timeout method. * Make ide_dma_timeout() static. There should be no functional changes caused by this patch. 
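As an illustrative sketch only (the names are hypothetical, not taken from the patch), a converted host driver now looks roughly like this: the optional ->dma_clear hook carries just the controller-specific reset, while the generic timeout handling stays in ide_dma_timeout_retry(). Compare the sl82c105 and pdc202xx_old hunks below.

static void example_reset_host(struct pci_dev *dev)
{
	/* hypothetical controller-specific DMA engine reset */
}

static void example_dma_clear(ide_drive_t *drive)
{
	/* no ide_dma_timeout() call here any more; the core handles it */
	example_reset_host(to_pci_dev(drive->hwif->dev));
}

static const struct ide_dma_ops example_dma_ops = {
	.dma_host_set		= ide_dma_host_set,
	.dma_setup		= ide_dma_setup,
	.dma_start		= ide_dma_start,
	.dma_end		= ide_dma_end,
	.dma_test_irq		= ide_dma_test_irq,
	.dma_lost_irq		= ide_dma_lost_irq,
	.dma_timer_expiry	= ide_dma_sff_timer_expiry,
	.dma_clear		= example_dma_clear,	/* optional */
	.dma_sff_read_status	= ide_dma_sff_read_status,
};
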
Acked-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/alim15x3.c | 1 - drivers/ide/au1xxx-ide.c | 1 - drivers/ide/cmd64x.c | 3 --- drivers/ide/cs5536.c | 1 - drivers/ide/hpt366.c | 10 +--------- drivers/ide/icside.c | 1 - drivers/ide/ide-dma-sff.c | 3 +-- drivers/ide/ide-dma.c | 10 ++++++---- drivers/ide/it821x.c | 3 +-- drivers/ide/ns87415.c | 1 - drivers/ide/pdc202xx_old.c | 10 ++-------- drivers/ide/pmac.c | 1 - drivers/ide/sc1200.c | 1 - drivers/ide/scc_pata.c | 1 - drivers/ide/sgiioc4.c | 1 - drivers/ide/siimage.c | 1 - drivers/ide/sl82c105.c | 7 +++---- drivers/ide/tc86c001.c | 1 - drivers/ide/trm290.c | 1 - drivers/ide/tx4939ide.c | 1 - include/linux/ide.h | 4 ++-- 21 files changed, 16 insertions(+), 47 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/alim15x3.c b/drivers/ide/alim15x3.c index d516168464fc..d3faf0b97f42 100644 --- a/drivers/ide/alim15x3.c +++ b/drivers/ide/alim15x3.c @@ -509,7 +509,6 @@ static const struct ide_dma_ops ali_dma_ops = { .dma_test_irq = ide_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, .dma_timer_expiry = ide_dma_sff_timer_expiry, - .dma_timeout = ide_dma_timeout, .dma_sff_read_status = ide_dma_sff_read_status, }; diff --git a/drivers/ide/au1xxx-ide.c b/drivers/ide/au1xxx-ide.c index d3a9d6c15328..0c08c5e01f2a 100644 --- a/drivers/ide/au1xxx-ide.c +++ b/drivers/ide/au1xxx-ide.c @@ -353,7 +353,6 @@ static const struct ide_dma_ops au1xxx_dma_ops = { .dma_end = auide_dma_end, .dma_test_irq = auide_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, - .dma_timeout = ide_dma_timeout, }; static int auide_ddma_init(ide_hwif_t *hwif, const struct ide_port_info *d) diff --git a/drivers/ide/cmd64x.c b/drivers/ide/cmd64x.c index bf0e3f470824..f0a49d2ff711 100644 --- a/drivers/ide/cmd64x.c +++ b/drivers/ide/cmd64x.c @@ -384,7 +384,6 @@ static const struct ide_dma_ops cmd64x_dma_ops = { .dma_test_irq = cmd64x_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, .dma_timer_expiry = ide_dma_sff_timer_expiry, - .dma_timeout = ide_dma_timeout, .dma_sff_read_status = ide_dma_sff_read_status, }; @@ -396,7 +395,6 @@ static const struct ide_dma_ops cmd646_rev1_dma_ops = { .dma_test_irq = ide_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, .dma_timer_expiry = ide_dma_sff_timer_expiry, - .dma_timeout = ide_dma_timeout, .dma_sff_read_status = ide_dma_sff_read_status, }; @@ -408,7 +406,6 @@ static const struct ide_dma_ops cmd648_dma_ops = { .dma_test_irq = cmd648_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, .dma_timer_expiry = ide_dma_sff_timer_expiry, - .dma_timeout = ide_dma_timeout, .dma_sff_read_status = ide_dma_sff_read_status, }; diff --git a/drivers/ide/cs5536.c b/drivers/ide/cs5536.c index d5dcf4899607..353a35bbba63 100644 --- a/drivers/ide/cs5536.c +++ b/drivers/ide/cs5536.c @@ -236,7 +236,6 @@ static const struct ide_dma_ops cs5536_dma_ops = { .dma_test_irq = ide_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, .dma_timer_expiry = ide_dma_sff_timer_expiry, - .dma_timeout = ide_dma_timeout, }; static const struct ide_port_info cs5536_info = { diff --git a/drivers/ide/hpt366.c b/drivers/ide/hpt366.c index dbaf184ed9c5..a0eb87f59134 100644 --- a/drivers/ide/hpt366.c +++ b/drivers/ide/hpt366.c @@ -835,12 +835,6 @@ static int hpt370_dma_end(ide_drive_t *drive) return ide_dma_end(drive); } -static void hpt370_dma_timeout(ide_drive_t *drive) -{ - hpt370_irq_timeout(drive); - ide_dma_timeout(drive); -} - /* returns 1 if DMA IRQ issued, 0 otherwise */ static int hpt374_dma_test_irq(ide_drive_t *drive) { @@ -1423,7 +1417,6 @@ static 
const struct ide_dma_ops hpt37x_dma_ops = { .dma_test_irq = hpt374_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, .dma_timer_expiry = ide_dma_sff_timer_expiry, - .dma_timeout = ide_dma_timeout, .dma_sff_read_status = ide_dma_sff_read_status, }; @@ -1435,7 +1428,7 @@ static const struct ide_dma_ops hpt370_dma_ops = { .dma_test_irq = ide_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, .dma_timer_expiry = ide_dma_sff_timer_expiry, - .dma_timeout = hpt370_dma_timeout, + .dma_clear = hpt370_irq_timeout, .dma_sff_read_status = ide_dma_sff_read_status, }; @@ -1447,7 +1440,6 @@ static const struct ide_dma_ops hpt36x_dma_ops = { .dma_test_irq = ide_dma_test_irq, .dma_lost_irq = hpt366_dma_lost_irq, .dma_timer_expiry = ide_dma_sff_timer_expiry, - .dma_timeout = ide_dma_timeout, .dma_sff_read_status = ide_dma_sff_read_status, }; diff --git a/drivers/ide/icside.c b/drivers/ide/icside.c index 51ce404fe532..f069f122ee6e 100644 --- a/drivers/ide/icside.c +++ b/drivers/ide/icside.c @@ -377,7 +377,6 @@ static const struct ide_dma_ops icside_v6_dma_ops = { .dma_start = icside_dma_start, .dma_end = icside_dma_end, .dma_test_irq = icside_dma_test_irq, - .dma_timeout = ide_dma_timeout, .dma_lost_irq = ide_dma_lost_irq, }; #else diff --git a/drivers/ide/ide-dma-sff.c b/drivers/ide/ide-dma-sff.c index 75a9ea2e4c82..7836d7e03fff 100644 --- a/drivers/ide/ide-dma-sff.c +++ b/drivers/ide/ide-dma-sff.c @@ -338,9 +338,8 @@ const struct ide_dma_ops sff_dma_ops = { .dma_start = ide_dma_start, .dma_end = ide_dma_end, .dma_test_irq = ide_dma_test_irq, - .dma_timer_expiry = ide_dma_sff_timer_expiry, - .dma_timeout = ide_dma_timeout, .dma_lost_irq = ide_dma_lost_irq, + .dma_timer_expiry = ide_dma_sff_timer_expiry, .dma_sff_read_status = ide_dma_sff_read_status, }; EXPORT_SYMBOL_GPL(sff_dma_ops); diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c index 3dbf80c15491..dc5d9bc4ced0 100644 --- a/drivers/ide/ide-dma.c +++ b/drivers/ide/ide-dma.c @@ -460,7 +460,7 @@ void ide_dma_lost_irq(ide_drive_t *drive) } EXPORT_SYMBOL_GPL(ide_dma_lost_irq); -void ide_dma_timeout(ide_drive_t *drive) +static void ide_dma_timeout(ide_drive_t *drive) { ide_hwif_t *hwif = drive->hwif; @@ -473,7 +473,6 @@ void ide_dma_timeout(ide_drive_t *drive) hwif->dma_ops->dma_end(drive); } -EXPORT_SYMBOL_GPL(ide_dma_timeout); /* * un-busy the port etc, and clear any pending DMA status. 
we want to @@ -483,6 +482,7 @@ EXPORT_SYMBOL_GPL(ide_dma_timeout); ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error) { ide_hwif_t *hwif = drive->hwif; + const struct ide_dma_ops *dma_ops = hwif->dma_ops; struct request *rq; ide_startstop_t ret = ide_stopped; @@ -492,12 +492,14 @@ ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error) if (error < 0) { printk(KERN_WARNING "%s: DMA timeout error\n", drive->name); - (void)hwif->dma_ops->dma_end(drive); + (void)dma_ops->dma_end(drive); ret = ide_error(drive, "dma timeout error", hwif->tp_ops->read_status(hwif)); } else { printk(KERN_WARNING "%s: DMA timeout retry\n", drive->name); - hwif->dma_ops->dma_timeout(drive); + if (dma_ops->dma_clear) + dma_ops->dma_clear(drive); + ide_dma_timeout(drive); } /* diff --git a/drivers/ide/it821x.c b/drivers/ide/it821x.c index 0d4ac65cf949..51aa745246dc 100644 --- a/drivers/ide/it821x.c +++ b/drivers/ide/it821x.c @@ -511,9 +511,8 @@ static struct ide_dma_ops it821x_pass_through_dma_ops = { .dma_start = it821x_dma_start, .dma_end = it821x_dma_end, .dma_test_irq = ide_dma_test_irq, - .dma_timer_expiry = ide_dma_sff_timer_expiry, - .dma_timeout = ide_dma_timeout, .dma_lost_irq = ide_dma_lost_irq, + .dma_timer_expiry = ide_dma_sff_timer_expiry, .dma_sff_read_status = ide_dma_sff_read_status, }; diff --git a/drivers/ide/ns87415.c b/drivers/ide/ns87415.c index 7b65fe5bf449..9039a373020f 100644 --- a/drivers/ide/ns87415.c +++ b/drivers/ide/ns87415.c @@ -306,7 +306,6 @@ static const struct ide_dma_ops ns87415_dma_ops = { .dma_test_irq = ide_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, .dma_timer_expiry = ide_dma_sff_timer_expiry, - .dma_timeout = ide_dma_timeout, .dma_sff_read_status = superio_dma_sff_read_status, }; diff --git a/drivers/ide/pdc202xx_old.c b/drivers/ide/pdc202xx_old.c index f7536d1943f7..248a54bd2386 100644 --- a/drivers/ide/pdc202xx_old.c +++ b/drivers/ide/pdc202xx_old.c @@ -258,12 +258,6 @@ static void pdc202xx_dma_lost_irq(ide_drive_t *drive) ide_dma_lost_irq(drive); } -static void pdc202xx_dma_timeout(ide_drive_t *drive) -{ - pdc202xx_reset(drive); - ide_dma_timeout(drive); -} - static int init_chipset_pdc202xx(struct pci_dev *dev) { unsigned long dmabase = pci_resource_start(dev, 4); @@ -336,7 +330,7 @@ static const struct ide_dma_ops pdc20246_dma_ops = { .dma_test_irq = pdc202xx_dma_test_irq, .dma_lost_irq = pdc202xx_dma_lost_irq, .dma_timer_expiry = ide_dma_sff_timer_expiry, - .dma_timeout = pdc202xx_dma_timeout, + .dma_clear = pdc202xx_reset, .dma_sff_read_status = ide_dma_sff_read_status, }; @@ -348,7 +342,7 @@ static const struct ide_dma_ops pdc2026x_dma_ops = { .dma_test_irq = pdc202xx_dma_test_irq, .dma_lost_irq = pdc202xx_dma_lost_irq, .dma_timer_expiry = ide_dma_sff_timer_expiry, - .dma_timeout = pdc202xx_dma_timeout, + .dma_clear = pdc202xx_reset, .dma_sff_read_status = ide_dma_sff_read_status, }; diff --git a/drivers/ide/pmac.c b/drivers/ide/pmac.c index 2bfcfedaa076..d15cc46a66e3 100644 --- a/drivers/ide/pmac.c +++ b/drivers/ide/pmac.c @@ -1650,7 +1650,6 @@ static const struct ide_dma_ops pmac_dma_ops = { .dma_start = pmac_ide_dma_start, .dma_end = pmac_ide_dma_end, .dma_test_irq = pmac_ide_dma_test_irq, - .dma_timeout = ide_dma_timeout, .dma_lost_irq = pmac_ide_dma_lost_irq, }; diff --git a/drivers/ide/sc1200.c b/drivers/ide/sc1200.c index 1c3a82914999..371549d18a01 100644 --- a/drivers/ide/sc1200.c +++ b/drivers/ide/sc1200.c @@ -291,7 +291,6 @@ static const struct ide_dma_ops sc1200_dma_ops = { .dma_test_irq = ide_dma_test_irq, .dma_lost_irq = 
ide_dma_lost_irq, .dma_timer_expiry = ide_dma_sff_timer_expiry, - .dma_timeout = ide_dma_timeout, .dma_sff_read_status = ide_dma_sff_read_status, }; diff --git a/drivers/ide/scc_pata.c b/drivers/ide/scc_pata.c index 0cc137cfe76d..64534d150b0c 100644 --- a/drivers/ide/scc_pata.c +++ b/drivers/ide/scc_pata.c @@ -872,7 +872,6 @@ static const struct ide_dma_ops scc_dma_ops = { .dma_end = scc_dma_end, .dma_test_irq = scc_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, - .dma_timeout = ide_dma_timeout, .dma_timer_expiry = ide_dma_sff_timer_expiry, .dma_sff_read_status = scc_dma_sff_read_status, }; diff --git a/drivers/ide/sgiioc4.c b/drivers/ide/sgiioc4.c index b12de8346c73..44df0c750bab 100644 --- a/drivers/ide/sgiioc4.c +++ b/drivers/ide/sgiioc4.c @@ -533,7 +533,6 @@ static const struct ide_dma_ops sgiioc4_dma_ops = { .dma_end = sgiioc4_dma_end, .dma_test_irq = sgiioc4_dma_test_irq, .dma_lost_irq = sgiioc4_dma_lost_irq, - .dma_timeout = ide_dma_timeout, }; static const struct ide_port_info sgiioc4_port_info __devinitconst = { diff --git a/drivers/ide/siimage.c b/drivers/ide/siimage.c index 075cb1243b2a..e4973cd1fba9 100644 --- a/drivers/ide/siimage.c +++ b/drivers/ide/siimage.c @@ -715,7 +715,6 @@ static const struct ide_dma_ops sil_dma_ops = { .dma_end = ide_dma_end, .dma_test_irq = siimage_dma_test_irq, .dma_timer_expiry = ide_dma_sff_timer_expiry, - .dma_timeout = ide_dma_timeout, .dma_lost_irq = ide_dma_lost_irq, .dma_sff_read_status = ide_dma_sff_read_status, }; diff --git a/drivers/ide/sl82c105.c b/drivers/ide/sl82c105.c index d25137b04e7a..d6f8977191c8 100644 --- a/drivers/ide/sl82c105.c +++ b/drivers/ide/sl82c105.c @@ -189,14 +189,13 @@ static void sl82c105_dma_start(ide_drive_t *drive) ide_dma_start(drive); } -static void sl82c105_dma_timeout(ide_drive_t *drive) +static void sl82c105_dma_clear(ide_drive_t *drive) { struct pci_dev *dev = to_pci_dev(drive->hwif->dev); - DBG(("sl82c105_dma_timeout(drive:%s)\n", drive->name)); + DBG(("sl82c105_dma_clear(drive:%s)\n", drive->name)); sl82c105_reset_host(dev); - ide_dma_timeout(drive); } static int sl82c105_dma_end(ide_drive_t *drive) @@ -298,7 +297,7 @@ static const struct ide_dma_ops sl82c105_dma_ops = { .dma_test_irq = ide_dma_test_irq, .dma_lost_irq = sl82c105_dma_lost_irq, .dma_timer_expiry = ide_dma_sff_timer_expiry, - .dma_timeout = sl82c105_dma_timeout, + .dma_clear = sl82c105_dma_clear, .dma_sff_read_status = ide_dma_sff_read_status, }; diff --git a/drivers/ide/tc86c001.c b/drivers/ide/tc86c001.c index 427d4b3c2c63..b4cf42dc8a6f 100644 --- a/drivers/ide/tc86c001.c +++ b/drivers/ide/tc86c001.c @@ -187,7 +187,6 @@ static const struct ide_dma_ops tc86c001_dma_ops = { .dma_test_irq = ide_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, .dma_timer_expiry = ide_dma_sff_timer_expiry, - .dma_timeout = ide_dma_timeout, .dma_sff_read_status = ide_dma_sff_read_status, }; diff --git a/drivers/ide/trm290.c b/drivers/ide/trm290.c index ed1496845a93..d6a950828e9f 100644 --- a/drivers/ide/trm290.c +++ b/drivers/ide/trm290.c @@ -314,7 +314,6 @@ static struct ide_dma_ops trm290_dma_ops = { .dma_end = trm290_dma_end, .dma_test_irq = trm290_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, - .dma_timeout = ide_dma_timeout, }; static const struct ide_port_info trm290_chipset __devinitdata = { diff --git a/drivers/ide/tx4939ide.c b/drivers/ide/tx4939ide.c index e0e0a803dde3..53f99853b065 100644 --- a/drivers/ide/tx4939ide.c +++ b/drivers/ide/tx4939ide.c @@ -632,7 +632,6 @@ static const struct ide_dma_ops tx4939ide_dma_ops = { .dma_test_irq = 
tx4939ide_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, .dma_timer_expiry = ide_dma_sff_timer_expiry, - .dma_timeout = ide_dma_timeout, .dma_sff_read_status = tx4939ide_dma_sff_read_status, }; diff --git a/include/linux/ide.h b/include/linux/ide.h index cb501bf78f7d..d3035f2f1250 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -716,8 +716,9 @@ struct ide_dma_ops { int (*dma_end)(struct ide_drive_s *); int (*dma_test_irq)(struct ide_drive_s *); void (*dma_lost_irq)(struct ide_drive_s *); + /* below ones are optional */ int (*dma_timer_expiry)(struct ide_drive_s *); - void (*dma_timeout)(struct ide_drive_s *); + void (*dma_clear)(struct ide_drive_s *); /* * The following method is optional and only required to be * implemented for the SFF-8038i compatible controllers. @@ -1461,7 +1462,6 @@ static inline int config_drive_for_dma(ide_drive_t *drive) { return 0; } #endif /* CONFIG_BLK_DEV_IDEDMA_SFF */ void ide_dma_lost_irq(ide_drive_t *); -void ide_dma_timeout(ide_drive_t *); ide_startstop_t ide_dma_timeout_retry(ide_drive_t *, int); #else -- cgit v1.2.3 From 4453011f959a5f5c6c7a33aea54fe17f5e43a867 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:15:20 +0200 Subject: ide: destroy DMA mappings after ending DMA (v2) Move ide_destroy_dmatable() call out from ->dma_end method to {ide_pc,cdrom_newpc,ide_dma}_intr(), ide_dma_timeout_retry() and sgiioc4_resetproc(). This causes minor/safe behavior changes w.r.t.: * cmd64x.c::cmd64{8,x}_dma_end() * cs5536.c::cs5536_dma_end() * icside.c::icside_dma_end() * it821x.c::it821x_dma_end() * scc_pata.c::__scc_dma_end() * sl82c105.c::sl82c105_dma_end() * tx4939ide.c::tx4939ide_dma_end() v2: * Fix build for CONFIG_BLK_DEV_IDEDMA=n (reported by Randy Dunlap). Cc: Randy Dunlap Acked-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/au1xxx-ide.c | 2 -- drivers/ide/cmd64x.c | 2 -- drivers/ide/icside.c | 3 --- drivers/ide/ide-atapi.c | 7 +++++-- drivers/ide/ide-cd.c | 1 + drivers/ide/ide-dma-sff.c | 2 -- drivers/ide/ide-dma.c | 3 +++ drivers/ide/ns87415.c | 2 -- drivers/ide/pmac.c | 2 -- drivers/ide/sc1200.c | 1 - drivers/ide/scc_pata.c | 2 -- drivers/ide/sgiioc4.c | 2 +- drivers/ide/trm290.c | 3 +-- drivers/ide/tx4939ide.c | 4 +--- include/linux/ide.h | 1 + 15 files changed, 13 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/au1xxx-ide.c b/drivers/ide/au1xxx-ide.c index 0c08c5e01f2a..ba2a211758a9 100644 --- a/drivers/ide/au1xxx-ide.c +++ b/drivers/ide/au1xxx-ide.c @@ -280,8 +280,6 @@ static int auide_build_dmatable(ide_drive_t *drive, struct ide_cmd *cmd) static int auide_dma_end(ide_drive_t *drive) { - ide_destroy_dmatable(drive); - return 0; } diff --git a/drivers/ide/cmd64x.c b/drivers/ide/cmd64x.c index f0a49d2ff711..f2edf280ef8b 100644 --- a/drivers/ide/cmd64x.c +++ b/drivers/ide/cmd64x.c @@ -327,8 +327,6 @@ static int cmd646_1_dma_end(ide_drive_t *drive) outb(dma_cmd & ~1, hwif->dma_base + ATA_DMA_CMD); /* clear the INTR & ERROR bits */ outb(dma_stat | 6, hwif->dma_base + ATA_DMA_STATUS); - /* and free any DMA resources */ - ide_destroy_dmatable(drive); /* verify good DMA status */ return (dma_stat & 7) != 4; } diff --git a/drivers/ide/icside.c b/drivers/ide/icside.c index f069f122ee6e..9bf57d7c8e57 100644 --- a/drivers/ide/icside.c +++ b/drivers/ide/icside.c @@ -291,9 +291,6 @@ static int icside_dma_end(ide_drive_t *drive) disable_dma(ec->dma); - /* Teardown mappings after DMA has completed. 
*/ - ide_destroy_dmatable(drive); - return get_dma_residue(ec->dma) != 0; } diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index f591166d2c93..1481f71f8173 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -342,8 +342,11 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) stat = tp_ops->read_status(hwif); if (pc->flags & PC_FLAG_DMA_IN_PROGRESS) { - if (hwif->dma_ops->dma_end(drive) || - (drive->media == ide_tape && (stat & ATA_ERR))) { + int rc = hwif->dma_ops->dma_end(drive); + + ide_destroy_dmatable(drive); + + if (rc || (drive->media == ide_tape && (stat & ATA_ERR))) { if (drive->media == ide_floppy) printk(KERN_ERR "%s: DMA %s error\n", drive->name, rq_data_dir(pc->rq) diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 5319e7a73708..4a0d66ee9547 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -639,6 +639,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) if (dma) { drive->dma = 0; dma_error = hwif->dma_ops->dma_end(drive); + ide_destroy_dmatable(drive); if (dma_error) { printk(KERN_ERR PFX "%s: DMA %s error\n", drive->name, write ? "write" : "read"); diff --git a/drivers/ide/ide-dma-sff.c b/drivers/ide/ide-dma-sff.c index 7836d7e03fff..f8adbb5eb339 100644 --- a/drivers/ide/ide-dma-sff.c +++ b/drivers/ide/ide-dma-sff.c @@ -310,8 +310,6 @@ int ide_dma_end(ide_drive_t *drive) /* clear INTR & ERROR bits */ ide_dma_sff_write_status(hwif, dma_stat | ATA_DMA_ERR | ATA_DMA_INTR); - /* purge DMA mappings */ - ide_destroy_dmatable(drive); wmb(); /* verify good DMA status */ diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c index 4e2005071113..b430898bbcd6 100644 --- a/drivers/ide/ide-dma.c +++ b/drivers/ide/ide-dma.c @@ -92,6 +92,7 @@ ide_startstop_t ide_dma_intr(ide_drive_t *drive) u8 stat = 0, dma_stat = 0; dma_stat = hwif->dma_ops->dma_end(drive); + ide_destroy_dmatable(drive); stat = hwif->tp_ops->read_status(hwif); if (OK_STAT(stat, DRIVE_READY, drive->bad_wstat | ATA_DRQ)) { @@ -479,6 +480,7 @@ ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error) if (error < 0) { printk(KERN_WARNING "%s: DMA timeout error\n", drive->name); (void)dma_ops->dma_end(drive); + ide_destroy_dmatable(drive); ret = ide_error(drive, "dma timeout error", hwif->tp_ops->read_status(hwif)); } else { @@ -490,6 +492,7 @@ ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error) ide_dump_status(drive, "DMA timeout", hwif->tp_ops->read_status(hwif)); (void)dma_ops->dma_end(drive); + ide_destroy_dmatable(drive); } } diff --git a/drivers/ide/ns87415.c b/drivers/ide/ns87415.c index 9039a373020f..9ad71a74f93f 100644 --- a/drivers/ide/ns87415.c +++ b/drivers/ide/ns87415.c @@ -210,8 +210,6 @@ static int ns87415_dma_end(ide_drive_t *drive) /* from ERRATA: clear the INTR & ERROR bits */ dma_cmd = inb(hwif->dma_base + ATA_DMA_CMD); outb(dma_cmd | 6, hwif->dma_base + ATA_DMA_CMD); - /* and free any DMA resources */ - ide_destroy_dmatable(drive); /* verify good DMA status */ return (dma_stat & 7) != 4; } diff --git a/drivers/ide/pmac.c b/drivers/ide/pmac.c index d15cc46a66e3..5643a8b957bf 100644 --- a/drivers/ide/pmac.c +++ b/drivers/ide/pmac.c @@ -1562,8 +1562,6 @@ pmac_ide_dma_end (ide_drive_t *drive) dstat = readl(&dma->status); writel(((RUN|WAKE|DEAD) << 16), &dma->control); - ide_destroy_dmatable(drive); - /* verify good dma status. we don't check for ACTIVE beeing 0. We should... 
* in theory, but with ATAPI decices doing buffer underruns, that would * cause us to disable DMA, which isn't what we want diff --git a/drivers/ide/sc1200.c b/drivers/ide/sc1200.c index 371549d18a01..d9c47034bedd 100644 --- a/drivers/ide/sc1200.c +++ b/drivers/ide/sc1200.c @@ -184,7 +184,6 @@ static int sc1200_dma_end(ide_drive_t *drive) outb(inb(dma_base)&~1, dma_base); /* !! DO THIS HERE !! stop DMA */ drive->waiting_for_dma = 0; - ide_destroy_dmatable(drive); /* purge DMA mappings */ return (dma_stat & 7) != 4; /* verify good DMA status */ } diff --git a/drivers/ide/scc_pata.c b/drivers/ide/scc_pata.c index 64534d150b0c..693536ebe331 100644 --- a/drivers/ide/scc_pata.c +++ b/drivers/ide/scc_pata.c @@ -365,8 +365,6 @@ static int __scc_dma_end(ide_drive_t *drive) dma_stat = scc_dma_sff_read_status(hwif); /* clear the INTR & ERROR bits */ scc_ide_outb(dma_stat | 6, hwif->dma_base + 4); - /* purge DMA mappings */ - ide_destroy_dmatable(drive); /* verify good DMA status */ wmb(); return (dma_stat & 7) != 4 ? (0x10 | dma_stat) : 0; diff --git a/drivers/ide/sgiioc4.c b/drivers/ide/sgiioc4.c index 44df0c750bab..457a762a1f29 100644 --- a/drivers/ide/sgiioc4.c +++ b/drivers/ide/sgiioc4.c @@ -259,7 +259,6 @@ static int sgiioc4_dma_end(ide_drive_t *drive) } drive->waiting_for_dma = 0; - ide_destroy_dmatable(drive); return dma_stat; } @@ -284,6 +283,7 @@ static void sgiioc4_resetproc(ide_drive_t * drive) { sgiioc4_dma_end(drive); + ide_destroy_dmatable(drive); sgiioc4_clearirq(drive); } diff --git a/drivers/ide/trm290.c b/drivers/ide/trm290.c index d6a950828e9f..8dd3d8226870 100644 --- a/drivers/ide/trm290.c +++ b/drivers/ide/trm290.c @@ -216,8 +216,7 @@ static int trm290_dma_end(ide_drive_t *drive) u16 status; drive->waiting_for_dma = 0; - /* purge DMA mappings */ - ide_destroy_dmatable(drive); + status = inw(drive->hwif->dma_base + 2); return status != 0x00ff; diff --git a/drivers/ide/tx4939ide.c b/drivers/ide/tx4939ide.c index 53f99853b065..f62ced855cf3 100644 --- a/drivers/ide/tx4939ide.c +++ b/drivers/ide/tx4939ide.c @@ -335,11 +335,9 @@ static int tx4939ide_dma_end(ide_drive_t *drive) /* read and clear the INTR & ERROR bits */ dma_stat = tx4939ide_clear_dma_status(base); - /* purge DMA mappings */ - ide_destroy_dmatable(drive); - /* verify good DMA status */ wmb(); + /* verify good DMA status */ if ((dma_stat & (ATA_DMA_INTR | ATA_DMA_ERR | ATA_DMA_ACTIVE)) == 0 && (ctl & (TX4939IDE_INT_XFEREND | TX4939IDE_INT_HOST)) == (TX4939IDE_INT_XFEREND | TX4939IDE_INT_HOST)) diff --git a/include/linux/ide.h b/include/linux/ide.h index d3035f2f1250..b6c4942fde11 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1479,6 +1479,7 @@ static inline ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int erro static inline void ide_release_dma_engine(ide_hwif_t *hwif) { ; } static inline int ide_build_sglist(ide_drive_t *drive, struct ide_cmd *cmd) { return 0; } +static inline void ide_destroy_dmatable(ide_drive_t *drive) { ; } #endif /* CONFIG_BLK_DEV_IDEDMA */ #ifdef CONFIG_BLK_DEV_IDEACPI -- cgit v1.2.3 From 5ae5412d9a23b05ab08461b202bad21ad8f6b66d Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:15:20 +0200 Subject: ide: add ide_dma_prepare() helper * Add ide_dma_prepare() helper. * Convert ide_issue_pc() and do_rw_taskfile() to use it. * Make ide_build_sglist() static. There should be no functional changes caused by this patch. 
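Illustrative usage sketch (the wrapper name is hypothetical, not part of the patch): ide_dma_prepare() returns 0 when the scatterlist was mapped and ->dma_setup() succeeded, and non-zero when the caller should fall back to PIO, so the converted callers reduce to a single assignment.

static void example_try_dma(ide_drive_t *drive, struct ide_cmd *cmd)
{
	/* 0 from ide_dma_prepare() means DMA is ready; anything else means PIO */
	drive->dma = !ide_dma_prepare(drive, cmd);
}
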
Acked-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-atapi.c | 18 ++++-------------- drivers/ide/ide-dma.c | 11 ++++++++++- drivers/ide/ide-taskfile.c | 4 +--- include/linux/ide.h | 7 ++++--- 4 files changed, 19 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 1481f71f8173..89d2339bdef3 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -638,7 +638,6 @@ ide_startstop_t ide_issue_pc(ide_drive_t *drive, struct ide_cmd *cmd) { struct ide_atapi_pc *pc; ide_hwif_t *hwif = drive->hwif; - const struct ide_dma_ops *dma_ops = hwif->dma_ops; ide_expiry_t *expiry = NULL; struct request *rq = hwif->rq; unsigned int timeout; @@ -652,12 +651,8 @@ ide_startstop_t ide_issue_pc(ide_drive_t *drive, struct ide_cmd *cmd) expiry = ide_cd_expiry; timeout = ATAPI_WAIT_PC; - if (drive->dma) { - if (ide_build_sglist(drive, cmd)) - drive->dma = !dma_ops->dma_setup(drive, cmd); - else - drive->dma = 0; - } + if (drive->dma) + drive->dma = !ide_dma_prepare(drive, cmd); } else { pc = drive->pc; @@ -675,13 +670,8 @@ ide_startstop_t ide_issue_pc(ide_drive_t *drive, struct ide_cmd *cmd) ide_dma_off(drive); } - if ((pc->flags & PC_FLAG_DMA_OK) && - (drive->dev_flags & IDE_DFLAG_USING_DMA)) { - if (ide_build_sglist(drive, cmd)) - drive->dma = !dma_ops->dma_setup(drive, cmd); - else - drive->dma = 0; - } + if (pc->flags & PC_FLAG_DMA_OK) + drive->dma = !ide_dma_prepare(drive, cmd); if (!drive->dma) pc->flags &= ~PC_FLAG_DMA_OK; diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c index b430898bbcd6..cf5897f5533f 100644 --- a/drivers/ide/ide-dma.c +++ b/drivers/ide/ide-dma.c @@ -128,7 +128,7 @@ int ide_dma_good_drive(ide_drive_t *drive) * operate in a portable fashion. 
*/ -int ide_build_sglist(ide_drive_t *drive, struct ide_cmd *cmd) +static int ide_build_sglist(ide_drive_t *drive, struct ide_cmd *cmd) { ide_hwif_t *hwif = drive->hwif; struct scatterlist *sg = hwif->sg_table; @@ -563,3 +563,12 @@ int ide_allocate_dma_engine(ide_hwif_t *hwif) return 0; } EXPORT_SYMBOL_GPL(ide_allocate_dma_engine); + +int ide_dma_prepare(ide_drive_t *drive, struct ide_cmd *cmd) +{ + if ((drive->dev_flags & IDE_DFLAG_USING_DMA) == 0 || + ide_build_sglist(drive, cmd) == 0 || + drive->hwif->dma_ops->dma_setup(drive, cmd)) + return 1; + return 0; +} diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index a3b7a50562b2..dba68db629bf 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -100,9 +100,7 @@ ide_startstop_t do_rw_taskfile(ide_drive_t *drive, struct ide_cmd *orig_cmd) ide_execute_command(drive, cmd, handler, WAIT_WORSTCASE); return ide_started; case ATA_PROT_DMA: - if ((drive->dev_flags & IDE_DFLAG_USING_DMA) == 0 || - ide_build_sglist(drive, cmd) == 0 || - dma_ops->dma_setup(drive, cmd)) + if (ide_dma_prepare(drive, cmd)) return ide_stopped; hwif->expiry = dma_ops->dma_timer_expiry; ide_execute_command(drive, cmd, ide_dma_intr, 2 * WAIT_CMD); diff --git a/include/linux/ide.h b/include/linux/ide.h index b6c4942fde11..78892e2a432c 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1443,7 +1443,8 @@ ide_startstop_t ide_dma_intr(ide_drive_t *); int ide_allocate_dma_engine(ide_hwif_t *); void ide_release_dma_engine(ide_hwif_t *); -int ide_build_sglist(ide_drive_t *, struct ide_cmd *); +int ide_dma_prepare(ide_drive_t *, struct ide_cmd *); + void ide_destroy_dmatable(ide_drive_t *); #ifdef CONFIG_BLK_DEV_IDEDMA_SFF @@ -1477,8 +1478,8 @@ static inline void ide_check_dma_crc(ide_drive_t *drive) { ; } static inline ide_startstop_t ide_dma_intr(ide_drive_t *drive) { return ide_stopped; } static inline ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error) { return ide_stopped; } static inline void ide_release_dma_engine(ide_hwif_t *hwif) { ; } -static inline int ide_build_sglist(ide_drive_t *drive, - struct ide_cmd *cmd) { return 0; } +static inline int ide_dma_prepare(ide_drive_t *drive, + struct ide_cmd *cmd) { return 1; } static inline void ide_destroy_dmatable(ide_drive_t *drive) { ; } #endif /* CONFIG_BLK_DEV_IDEDMA */ -- cgit v1.2.3 From 8a4a5738ba499083cf4c5668895efe220b1946d3 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:15:21 +0200 Subject: ide: add ->dma_check method * Add (an optional) ->dma_check method for checking if DMA can be used for a given command and fail DMA setup in ide_dma_prepare() if necessary. * Convert alim15x3 and trm290 host drivers to use ->dma_check. * Rename ali15x3_dma_setup() to ali_dma_check() while at it. There should be no functional changes caused by this patch. 
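Illustrative sketch of a ->dma_check implementation (the hook name is hypothetical): returning non-zero makes ide_dma_prepare() skip DMA setup for that command so it is issued via PIO instead, which is how the alim15x3 and trm290 conversions below treat writes.

static int example_dma_check(ide_drive_t *drive, struct ide_cmd *cmd)
{
	/* this (hypothetical) controller cannot DMA writes reliably */
	if (cmd->tf_flags & IDE_TFLAG_WRITE)
		return 1;	/* fall back to PIO */

	return 0;		/* DMA is OK */
}
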
Acked-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/alim15x3.c | 9 +++++---- drivers/ide/ide-dma.c | 5 ++++- drivers/ide/trm290.c | 17 ++++++++++------- include/linux/ide.h | 1 + 4 files changed, 20 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/alim15x3.c b/drivers/ide/alim15x3.c index d3faf0b97f42..537da1cde16d 100644 --- a/drivers/ide/alim15x3.c +++ b/drivers/ide/alim15x3.c @@ -189,20 +189,20 @@ static void ali_set_dma_mode(ide_drive_t *drive, const u8 speed) } /** - * ali15x3_dma_setup - begin a DMA phase + * ali_dma_check - DMA check * @drive: target device * @cmd: command * * Returns 1 if the DMA cannot be performed, zero on success. */ -static int ali15x3_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd) +static int ali_dma_check(ide_drive_t *drive, struct ide_cmd *cmd) { if (m5229_revision < 0xC2 && drive->media != ide_disk) { if (cmd->tf_flags & IDE_TFLAG_WRITE) return 1; /* try PIO instead of DMA */ } - return ide_dma_setup(drive, cmd); + return 0; } /** @@ -503,11 +503,12 @@ static const struct ide_port_ops ali_port_ops = { static const struct ide_dma_ops ali_dma_ops = { .dma_host_set = ide_dma_host_set, - .dma_setup = ali15x3_dma_setup, + .dma_setup = ide_dma_setup, .dma_start = ide_dma_start, .dma_end = ide_dma_end, .dma_test_irq = ide_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, + .dma_check = ali_dma_check, .dma_timer_expiry = ide_dma_sff_timer_expiry, .dma_sff_read_status = ide_dma_sff_read_status, }; diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c index cf5897f5533f..c0505e2dfc2e 100644 --- a/drivers/ide/ide-dma.c +++ b/drivers/ide/ide-dma.c @@ -566,9 +566,12 @@ EXPORT_SYMBOL_GPL(ide_allocate_dma_engine); int ide_dma_prepare(ide_drive_t *drive, struct ide_cmd *cmd) { + const struct ide_dma_ops *dma_ops = drive->hwif->dma_ops; + if ((drive->dev_flags & IDE_DFLAG_USING_DMA) == 0 || + (dma_ops->dma_check && dma_ops->dma_check(drive, cmd)) || ide_build_sglist(drive, cmd) == 0 || - drive->hwif->dma_ops->dma_setup(drive, cmd)) + dma_ops->dma_setup(drive, cmd)) return 1; return 0; } diff --git a/drivers/ide/trm290.c b/drivers/ide/trm290.c index b91bb709af40..1076efd050dc 100644 --- a/drivers/ide/trm290.c +++ b/drivers/ide/trm290.c @@ -176,19 +176,21 @@ static void trm290_selectproc (ide_drive_t *drive) trm290_prepare_drive(drive, !!(drive->dev_flags & IDE_DFLAG_USING_DMA)); } -static int trm290_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd) +static int trm290_dma_check(ide_drive_t *drive, struct ide_cmd *cmd) { - ide_hwif_t *hwif = drive->hwif; - unsigned int count, rw; - if (cmd->tf_flags & IDE_TFLAG_WRITE) { #ifdef TRM290_NO_DMA_WRITES /* always use PIO for writes */ return 1; #endif - rw = 1; - } else - rw = 2; + } + return 0; +} + +static int trm290_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd) +{ + ide_hwif_t *hwif = drive->hwif; + unsigned int count, rw = (cmd->tf_flags & IDE_TFLAG_WRITE) ? 
1 : 2; count = ide_build_dmatable(drive, cmd); if (count == 0) { @@ -312,6 +314,7 @@ static struct ide_dma_ops trm290_dma_ops = { .dma_end = trm290_dma_end, .dma_test_irq = trm290_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, + .dma_check = trm290_dma_check, }; static const struct ide_port_info trm290_chipset __devinitdata = { diff --git a/include/linux/ide.h b/include/linux/ide.h index 78892e2a432c..b350667b83ad 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -717,6 +717,7 @@ struct ide_dma_ops { int (*dma_test_irq)(struct ide_drive_s *); void (*dma_lost_irq)(struct ide_drive_s *); /* below ones are optional */ + int (*dma_check)(struct ide_drive_s *, struct ide_cmd *); int (*dma_timer_expiry)(struct ide_drive_s *); void (*dma_clear)(struct ide_drive_s *); /* -- cgit v1.2.3 From f094d4d83bccee9277ddb6aadccf35747889426b Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:15:24 +0200 Subject: ide: sanitize ide_build_sglist() and ide_destroy_dmatable() * Move ide_map_sg() calls out from ide_build_sglist() to ide_dma_prepare(). * Pass command to ide_destroy_dmatable(). * Rename ide_build_sglist() to ide_dma_map_sg() and ide_destroy_dmatable() to ide_dma_unmap_sg(). There should be no functional changes caused by this patch. Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-atapi.c | 3 ++- drivers/ide/ide-cd.c | 2 +- drivers/ide/ide-dma.c | 50 ++++++++++++++++++++++++------------------------- drivers/ide/sgiioc4.c | 7 ++++--- include/linux/ide.h | 6 +++--- 5 files changed, 35 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index db6e617790bd..3f3fc7c7b2fe 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -326,6 +326,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) { struct ide_atapi_pc *pc = drive->pc; ide_hwif_t *hwif = drive->hwif; + struct ide_cmd *cmd = &hwif->cmd; struct request *rq = hwif->rq; const struct ide_tp_ops *tp_ops = hwif->tp_ops; xfer_func_t *xferfunc; @@ -346,7 +347,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) drive->waiting_for_dma = 0; rc = hwif->dma_ops->dma_end(drive); - ide_destroy_dmatable(drive); + ide_dma_unmap_sg(drive, cmd); if (rc || (drive->media == ide_tape && (stat & ATA_ERR))) { if (drive->media == ide_floppy) diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index f5c7bb739f45..35729a47f797 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -640,7 +640,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) drive->dma = 0; drive->waiting_for_dma = 0; dma_error = hwif->dma_ops->dma_end(drive); - ide_destroy_dmatable(drive); + ide_dma_unmap_sg(drive, cmd); if (dma_error) { printk(KERN_ERR PFX "%s: DMA %s error\n", drive->name, write ? 
"write" : "read"); diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c index 4d3102887d9c..a5612eadc306 100644 --- a/drivers/ide/ide-dma.c +++ b/drivers/ide/ide-dma.c @@ -89,17 +89,16 @@ static const struct drive_list_entry drive_blacklist[] = { ide_startstop_t ide_dma_intr(ide_drive_t *drive) { ide_hwif_t *hwif = drive->hwif; + struct ide_cmd *cmd = &hwif->cmd; u8 stat = 0, dma_stat = 0; drive->waiting_for_dma = 0; dma_stat = hwif->dma_ops->dma_end(drive); - ide_destroy_dmatable(drive); + ide_dma_unmap_sg(drive, cmd); stat = hwif->tp_ops->read_status(hwif); if (OK_STAT(stat, DRIVE_READY, drive->bad_wstat | ATA_DRQ)) { if (!dma_stat) { - struct ide_cmd *cmd = &hwif->cmd; - if ((cmd->tf_flags & IDE_TFLAG_FS) == 0) ide_finish_cmd(drive, cmd, stat); else @@ -119,8 +118,8 @@ int ide_dma_good_drive(ide_drive_t *drive) } /** - * ide_build_sglist - map IDE scatter gather for DMA I/O - * @drive: the drive to build the DMA table for + * ide_dma_map_sg - map IDE scatter gather for DMA I/O + * @drive: the drive to map the DMA table for * @cmd: command * * Perform the DMA mapping magic necessary to access the source or @@ -129,23 +128,19 @@ int ide_dma_good_drive(ide_drive_t *drive) * operate in a portable fashion. */ -static int ide_build_sglist(ide_drive_t *drive, struct ide_cmd *cmd) +static int ide_dma_map_sg(ide_drive_t *drive, struct ide_cmd *cmd) { ide_hwif_t *hwif = drive->hwif; struct scatterlist *sg = hwif->sg_table; int i; - ide_map_sg(drive, cmd); - if (cmd->tf_flags & IDE_TFLAG_WRITE) cmd->sg_dma_direction = DMA_TO_DEVICE; else cmd->sg_dma_direction = DMA_FROM_DEVICE; i = dma_map_sg(hwif->dev, sg, cmd->sg_nents, cmd->sg_dma_direction); - if (i == 0) - ide_map_sg(drive, cmd); - else { + if (i) { cmd->orig_sg_nents = cmd->sg_nents; cmd->sg_nents = i; } @@ -154,7 +149,7 @@ static int ide_build_sglist(ide_drive_t *drive, struct ide_cmd *cmd) } /** - * ide_destroy_dmatable - clean up DMA mapping + * ide_dma_unmap_sg - clean up DMA mapping * @drive: The drive to unmap * * Teardown mappings after DMA has completed. This must be called @@ -164,15 +159,14 @@ static int ide_build_sglist(ide_drive_t *drive, struct ide_cmd *cmd) * time. 
*/ -void ide_destroy_dmatable(ide_drive_t *drive) +void ide_dma_unmap_sg(ide_drive_t *drive, struct ide_cmd *cmd) { ide_hwif_t *hwif = drive->hwif; - struct ide_cmd *cmd = &hwif->cmd; dma_unmap_sg(hwif->dev, hwif->sg_table, cmd->orig_sg_nents, cmd->sg_dma_direction); } -EXPORT_SYMBOL_GPL(ide_destroy_dmatable); +EXPORT_SYMBOL_GPL(ide_dma_unmap_sg); /** * ide_dma_off_quietly - Generic DMA kill @@ -471,6 +465,7 @@ ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error) { ide_hwif_t *hwif = drive->hwif; const struct ide_dma_ops *dma_ops = hwif->dma_ops; + struct ide_cmd *cmd = &hwif->cmd; struct request *rq; ide_startstop_t ret = ide_stopped; @@ -482,7 +477,7 @@ ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error) printk(KERN_WARNING "%s: DMA timeout error\n", drive->name); drive->waiting_for_dma = 0; (void)dma_ops->dma_end(drive); - ide_destroy_dmatable(drive); + ide_dma_unmap_sg(drive, cmd); ret = ide_error(drive, "dma timeout error", hwif->tp_ops->read_status(hwif)); } else { @@ -495,7 +490,7 @@ ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error) hwif->tp_ops->read_status(hwif)); drive->waiting_for_dma = 0; (void)dma_ops->dma_end(drive); - ide_destroy_dmatable(drive); + ide_dma_unmap_sg(drive, cmd); } } @@ -572,14 +567,19 @@ int ide_dma_prepare(ide_drive_t *drive, struct ide_cmd *cmd) const struct ide_dma_ops *dma_ops = drive->hwif->dma_ops; if ((drive->dev_flags & IDE_DFLAG_USING_DMA) == 0 || - (dma_ops->dma_check && dma_ops->dma_check(drive, cmd)) || - ide_build_sglist(drive, cmd) == 0) - return 1; - if (dma_ops->dma_setup(drive, cmd)) { - ide_destroy_dmatable(drive); - ide_map_sg(drive, cmd); - return 1; - } + (dma_ops->dma_check && dma_ops->dma_check(drive, cmd))) + goto out; + ide_map_sg(drive, cmd); + if (ide_dma_map_sg(drive, cmd) == 0) + goto out_map; + if (dma_ops->dma_setup(drive, cmd)) + goto out_dma_unmap; drive->waiting_for_dma = 1; return 0; +out_dma_unmap: + ide_dma_unmap_sg(drive, cmd); +out_map: + ide_map_sg(drive, cmd); +out: + return 1; } diff --git a/drivers/ide/sgiioc4.c b/drivers/ide/sgiioc4.c index cb2657c4c976..6ef5a567d377 100644 --- a/drivers/ide/sgiioc4.c +++ b/drivers/ide/sgiioc4.c @@ -277,11 +277,12 @@ static void sgiioc4_dma_host_set(ide_drive_t *drive, int on) sgiioc4_clearirq(drive); } -static void -sgiioc4_resetproc(ide_drive_t * drive) +static void sgiioc4_resetproc(ide_drive_t *drive) { + struct ide_cmd *cmd = &drive->hwif->cmd; + sgiioc4_dma_end(drive); - ide_destroy_dmatable(drive); + ide_dma_unmap_sg(drive, cmd); sgiioc4_clearirq(drive); } diff --git a/include/linux/ide.h b/include/linux/ide.h index b350667b83ad..03c520917b7a 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1445,8 +1445,7 @@ int ide_allocate_dma_engine(ide_hwif_t *); void ide_release_dma_engine(ide_hwif_t *); int ide_dma_prepare(ide_drive_t *, struct ide_cmd *); - -void ide_destroy_dmatable(ide_drive_t *); +void ide_dma_unmap_sg(ide_drive_t *, struct ide_cmd *); #ifdef CONFIG_BLK_DEV_IDEDMA_SFF int config_drive_for_dma(ide_drive_t *); @@ -1481,7 +1480,8 @@ static inline ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int erro static inline void ide_release_dma_engine(ide_hwif_t *hwif) { ; } static inline int ide_dma_prepare(ide_drive_t *drive, struct ide_cmd *cmd) { return 1; } -static inline void ide_destroy_dmatable(ide_drive_t *drive) { ; } +static inline void ide_dma_unmap_sg(ide_drive_t *drive, + struct ide_cmd *cmd) { ; } #endif /* CONFIG_BLK_DEV_IDEDMA */ #ifdef CONFIG_BLK_DEV_IDEACPI -- cgit v1.2.3 From 
41fa9f863baacd32dd049daf8050d55a0c9e6f1a Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:15:25 +0200 Subject: ide: decrease size of ->pc_buf field in struct ide_atapi_pc struct ide_atapi_pc is often allocated on the stack and size of ->pc_buf size is 256 bytes. However since only ide_floppy_create_read_capacity_cmd() and idetape_create_inquiry_cmd() require such size allocate buffers for these pc-s explicitely and decrease ->pc_buf size to 64 bytes. Cc: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-floppy.c | 5 ++++- drivers/ide/ide-floppy_ioctl.c | 5 ++++- drivers/ide/ide-tape.c | 4 ++++ include/linux/ide.h | 2 +- 4 files changed, 13 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index 7ae662334835..0faae3098295 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -385,7 +385,7 @@ static int ide_floppy_get_capacity(ide_drive_t *drive) struct gendisk *disk = floppy->disk; struct ide_atapi_pc pc; u8 *cap_desc; - u8 header_len, desc_cnt; + u8 pc_buf[256], header_len, desc_cnt; int i, rc = 1, blocks, length; drive->bios_cyl = 0; @@ -395,6 +395,9 @@ static int ide_floppy_get_capacity(ide_drive_t *drive) drive->capacity64 = 0; ide_floppy_create_read_capacity_cmd(&pc); + pc.buf = &pc_buf[0]; + pc.buf_size = sizeof(pc_buf); + if (ide_queue_pc_tail(drive, disk, &pc)) { printk(KERN_ERR PFX "Can't get floppy parameters\n"); return 1; diff --git a/drivers/ide/ide-floppy_ioctl.c b/drivers/ide/ide-floppy_ioctl.c index 8f8be8546038..cd8a42027ede 100644 --- a/drivers/ide/ide-floppy_ioctl.c +++ b/drivers/ide/ide-floppy_ioctl.c @@ -36,9 +36,9 @@ static int ide_floppy_get_format_capacities(ide_drive_t *drive, int __user *arg) { struct ide_disk_obj *floppy = drive->driver_data; - u8 header_len, desc_cnt; int i, blocks, length, u_array_size, u_index; int __user *argp; + u8 pc_buf[256], header_len, desc_cnt; if (get_user(u_array_size, arg)) return -EFAULT; @@ -47,6 +47,9 @@ static int ide_floppy_get_format_capacities(ide_drive_t *drive, return -EINVAL; ide_floppy_create_read_capacity_cmd(pc); + pc->buf = &pc_buf[0]; + pc->buf_size = sizeof(pc_buf); + if (ide_queue_pc_tail(drive, floppy->disk, pc)) { printk(KERN_ERR "ide-floppy: Can't get floppy parameters\n"); return -EIO; diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 64dfa7458f8d..cafc67d9e2e8 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -2014,9 +2014,13 @@ static void idetape_get_inquiry_results(ide_drive_t *drive) { idetape_tape_t *tape = drive->driver_data; struct ide_atapi_pc pc; + u8 pc_buf[256]; char fw_rev[4], vendor_id[8], product_id[16]; idetape_create_inquiry_cmd(&pc); + pc.buf = &pc_buf[0]; + pc.buf_size = sizeof(pc_buf); + if (ide_queue_pc_tail(drive, tape->disk, &pc)) { printk(KERN_ERR "ide-tape: %s: can't get INQUIRY results\n", tape->name); diff --git a/include/linux/ide.h b/include/linux/ide.h index 03c520917b7a..0f48fbd46028 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -377,7 +377,7 @@ enum { * With each packet command, we allocate a buffer of IDE_PC_BUFFER_SIZE bytes. * This is used for several packet commands (not for READ/WRITE commands). 
*/ -#define IDE_PC_BUFFER_SIZE 256 +#define IDE_PC_BUFFER_SIZE 64 #define ATAPI_WAIT_PC (60 * HZ) struct ide_atapi_pc { -- cgit v1.2.3 From 349d12a1fe57d49287a539909cf14f362634342d Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:15:26 +0200 Subject: ide-floppy: use ide_pio_bytes() * Fix ide_init_sg_cmd() setup for non-fs requests. * Convert ide_pc_intr() to use ide_pio_bytes() for floppy media. * Remove no longer needed ide_io_buffers() and sg/sg_cnt fields from struct ide_atapi_pc. * Remove partial completions; kill idefloppy_update_buffers(), as a result. * Add some more debugging statements. Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-atapi.c | 68 ++++++++++++------------------------------------ drivers/ide/ide-floppy.c | 24 ++++------------- include/linux/ide.h | 5 ---- 3 files changed, 21 insertions(+), 76 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 40f413b20bd8..100e6f94b4f0 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -71,49 +71,6 @@ int ide_check_atapi_device(ide_drive_t *drive, const char *s) } EXPORT_SYMBOL_GPL(ide_check_atapi_device); -/* PIO data transfer routine using the scatter gather table. */ -int ide_io_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc, - unsigned int bcount, int write) -{ - ide_hwif_t *hwif = drive->hwif; - const struct ide_tp_ops *tp_ops = hwif->tp_ops; - xfer_func_t *xf = write ? tp_ops->output_data : tp_ops->input_data; - struct scatterlist *sg = pc->sg; - char *buf; - int count, done = 0; - - while (bcount) { - count = min(sg->length - pc->b_count, bcount); - - if (PageHighMem(sg_page(sg))) { - unsigned long flags; - - local_irq_save(flags); - buf = kmap_atomic(sg_page(sg), KM_IRQ0) + sg->offset; - xf(drive, NULL, buf + pc->b_count, count); - kunmap_atomic(buf - sg->offset, KM_IRQ0); - local_irq_restore(flags); - } else { - buf = sg_virt(sg); - xf(drive, NULL, buf + pc->b_count, count); - } - - bcount -= count; - pc->b_count += count; - done += count; - - if (pc->b_count == sg->length) { - if (!--pc->sg_cnt) - break; - pc->sg = sg = sg_next(sg); - pc->b_count = 0; - } - } - - return done; -} -EXPORT_SYMBOL_GPL(ide_io_buffers); - void ide_init_pc(struct ide_atapi_pc *pc) { memset(pc, 0, sizeof(*pc)); @@ -353,6 +310,9 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) pc->xferred = pc->req_xfer; if (drive->pc_update_buffers) drive->pc_update_buffers(drive, pc); + + if (drive->media == ide_floppy) + ide_complete_rq(drive, 0, blk_rq_bytes(rq)); } debug_log("%s: DMA finished\n", drive->name); } @@ -408,12 +368,19 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) rq->errors = 0; ide_complete_rq(drive, 0, blk_rq_bytes(rq)); } else { + unsigned int done; + if (blk_fs_request(rq) == 0 && uptodate <= 0) { if (rq->errors == 0) rq->errors = -EIO; } - ide_complete_rq(drive, uptodate ? 0 : -EIO, - ide_rq_bytes(rq)); + + if (drive->media == ide_tape) + done = ide_rq_bytes(rq); /* FIXME */ + else + done = blk_rq_bytes(rq); + + ide_complete_rq(drive, uptodate ? 0 : -EIO, done); } return ide_stopped; @@ -446,14 +413,11 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) xferfunc = write ? 
tp_ops->output_data : tp_ops->input_data; - if ((drive->media == ide_floppy && !pc->buf) || - (drive->media == ide_tape && pc->bh)) { + if (drive->media == ide_floppy && pc->buf == NULL) { + done = min_t(unsigned int, bcount, cmd->nleft); + ide_pio_bytes(drive, cmd, write, done); + } else if (drive->media == ide_tape && pc->bh) { done = drive->pc_io_buffers(drive, pc, bcount, write); - - /* FIXME: don't do partial completions */ - if (drive->media == ide_floppy) - ide_complete_rq(drive, 0, - done ? done : ide_rq_bytes(rq)); } else { done = min_t(unsigned int, bcount, pc->req_xfer - pc->xferred); xferfunc(drive, NULL, pc->cur_pos, done); diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index 0faae3098295..2b4868d95f8b 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -61,16 +61,6 @@ */ #define IDEFLOPPY_PC_DELAY (HZ/20) /* default delay for ZIP 100 (50ms) */ -static void idefloppy_update_buffers(ide_drive_t *drive, - struct ide_atapi_pc *pc) -{ - struct request *rq = pc->rq; - struct bio *bio = rq->bio; - - while ((bio = rq->bio) != NULL) - ide_complete_rq(drive, 0, ide_rq_bytes(rq)); -} - static int ide_floppy_callback(ide_drive_t *drive, int dsc) { struct ide_disk_obj *floppy = drive->driver_data; @@ -213,7 +203,6 @@ static void idefloppy_create_rw_cmd(ide_drive_t *drive, memcpy(rq->cmd, pc->c, 12); pc->rq = rq; - pc->b_count = 0; if (rq->cmd_flags & REQ_RW) pc->flags |= PC_FLAG_WRITING; pc->buf = NULL; @@ -227,7 +216,6 @@ static void idefloppy_blockpc_cmd(struct ide_disk_obj *floppy, ide_init_pc(pc); memcpy(pc->c, rq->cmd, sizeof(pc->c)); pc->rq = rq; - pc->b_count = 0; if (rq->data_len && rq_data_dir(rq) == WRITE) pc->flags |= PC_FLAG_WRITING; pc->buf = rq->data; @@ -244,10 +232,11 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive, struct request *rq, sector_t block) { struct ide_disk_obj *floppy = drive->driver_data; - ide_hwif_t *hwif = drive->hwif; struct ide_cmd cmd; struct ide_atapi_pc *pc; + ide_debug_log(IDE_DBG_FUNC, "enter, cmd: 0x%x\n", rq->cmd[0]); + if (drive->debug_mask & IDE_DBG_RQ) blk_dump_rq_flags(rq, (rq->rq_disk ? rq->rq_disk->disk_name @@ -294,13 +283,10 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive, cmd.rq = rq; if (blk_fs_request(rq) || pc->req_xfer) { - ide_init_sg_cmd(&cmd, rq->nr_sectors << 9); + ide_init_sg_cmd(&cmd, pc->req_xfer); ide_map_sg(drive, &cmd); } - pc->sg = hwif->sg_table; - pc->sg_cnt = cmd.sg_nents; - pc->rq = rq; return ide_floppy_issue_pc(drive, &cmd, pc); @@ -388,6 +374,8 @@ static int ide_floppy_get_capacity(ide_drive_t *drive) u8 pc_buf[256], header_len, desc_cnt; int i, rc = 1, blocks, length; + ide_debug_log(IDE_DBG_FUNC, "enter"); + drive->bios_cyl = 0; drive->bios_head = drive->bios_sect = 0; floppy->blocks = 0; @@ -488,8 +476,6 @@ static void ide_floppy_setup(ide_drive_t *drive) u16 *id = drive->id; drive->pc_callback = ide_floppy_callback; - drive->pc_update_buffers = idefloppy_update_buffers; - drive->pc_io_buffers = ide_io_buffers; /* * We used to check revisions here. At this point however I'm giving up. 
diff --git a/include/linux/ide.h b/include/linux/ide.h index 0f48fbd46028..836c4c6cb7e3 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -415,9 +415,6 @@ struct ide_atapi_pc { struct idetape_bh *bh; char *b_data; - struct scatterlist *sg; - unsigned int sg_cnt; - unsigned long timeout; }; @@ -1177,8 +1174,6 @@ void ide_tf_read(ide_drive_t *, struct ide_cmd *); void ide_input_data(ide_drive_t *, struct ide_cmd *, void *, unsigned int); void ide_output_data(ide_drive_t *, struct ide_cmd *, void *, unsigned int); -int ide_io_buffers(ide_drive_t *, struct ide_atapi_pc *, unsigned int, int); - extern void SELECT_DRIVE(ide_drive_t *); void SELECT_MASK(ide_drive_t *, int); -- cgit v1.2.3 From ecf3a31d2a08a419bdf919456f1724f5b72bde2c Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Tue, 31 Mar 2009 20:15:30 +0200 Subject: ide: turn set_irq() method into write_devctl() method Turn set_irq() method with its software reset hack into write_devctl() method (for just writing a value into the device control register) at last... Signed-off-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/at91_ide.c | 2 +- drivers/ide/au1xxx-ide.c | 3 +-- drivers/ide/falconide.c | 3 +-- drivers/ide/ide-eh.c | 7 +++---- drivers/ide/ide-h8300.c | 3 +-- drivers/ide/ide-io-std.c | 16 +++------------- drivers/ide/ide-io.c | 4 +++- drivers/ide/ide-iops.c | 4 ++-- drivers/ide/ide-pm.c | 2 +- drivers/ide/ide-probe.c | 6 +++--- drivers/ide/ide-taskfile.c | 2 +- drivers/ide/ns87415.c | 3 +-- drivers/ide/pmac.c | 14 ++------------ drivers/ide/q40ide.c | 3 +-- drivers/ide/scc_pata.c | 14 ++------------ drivers/ide/sgiioc4.c | 3 +-- drivers/ide/tx4938ide.c | 3 +-- drivers/ide/tx4939ide.c | 6 ++---- include/linux/ide.h | 6 ++---- 19 files changed, 32 insertions(+), 72 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/at91_ide.c b/drivers/ide/at91_ide.c index 8fc6ae958b0b..e6e96743aa7b 100644 --- a/drivers/ide/at91_ide.c +++ b/drivers/ide/at91_ide.c @@ -295,7 +295,7 @@ static const struct ide_tp_ops at91_ide_tp_ops = { .exec_command = ide_exec_command, .read_status = ide_read_status, .read_altstatus = ide_read_altstatus, - .set_irq = ide_set_irq, + .write_devctl = ide_write_devctl, .tf_load = at91_ide_tf_load, .tf_read = at91_ide_tf_read, diff --git a/drivers/ide/au1xxx-ide.c b/drivers/ide/au1xxx-ide.c index 1bfb43d0d3a8..2ca10d533dad 100644 --- a/drivers/ide/au1xxx-ide.c +++ b/drivers/ide/au1xxx-ide.c @@ -467,8 +467,7 @@ static const struct ide_tp_ops au1xxx_tp_ops = { .exec_command = ide_exec_command, .read_status = ide_read_status, .read_altstatus = ide_read_altstatus, - - .set_irq = ide_set_irq, + .write_devctl = ide_write_devctl, .tf_load = ide_tf_load, .tf_read = ide_tf_read, diff --git a/drivers/ide/falconide.c b/drivers/ide/falconide.c index b368a5effc3a..5063be85dc33 100644 --- a/drivers/ide/falconide.c +++ b/drivers/ide/falconide.c @@ -89,8 +89,7 @@ static const struct ide_tp_ops falconide_tp_ops = { .exec_command = ide_exec_command, .read_status = ide_read_status, .read_altstatus = ide_read_altstatus, - - .set_irq = ide_set_irq, + .write_devctl = ide_write_devctl, .tf_load = ide_tf_load, .tf_read = ide_tf_read, diff --git a/drivers/ide/ide-eh.c b/drivers/ide/ide-eh.c index 11664976eea3..de4b7f1c9c9f 100644 --- a/drivers/ide/ide-eh.c +++ b/drivers/ide/ide-eh.c @@ -401,15 +401,14 @@ static ide_startstop_t do_reset1(ide_drive_t *drive, int do_not_try_atapi) * immediate interrupt due to the edge transition it produces. 
* This single interrupt gives us a "fast poll" for drives that * recover from reset very quickly, saving us the first 50ms wait time. - * - * TODO: add ->softreset method and stop abusing ->set_irq */ /* set SRST and nIEN */ - tp_ops->set_irq(hwif, 4); + tp_ops->write_devctl(hwif, ATA_SRST | ATA_NIEN | ATA_DEVCTL_OBS); /* more than enough time */ udelay(10); /* clear SRST, leave nIEN (unless device is on the quirk list) */ - tp_ops->set_irq(hwif, drive->quirk_list == 2); + tp_ops->write_devctl(hwif, (drive->quirk_list == 2 ? 0 : ATA_NIEN) | + ATA_DEVCTL_OBS); /* more than enough time */ udelay(10); hwif->poll_timeout = jiffies + WAIT_WORSTCASE; diff --git a/drivers/ide/ide-h8300.c b/drivers/ide/ide-h8300.c index 7492f28d1290..a57ccad61acf 100644 --- a/drivers/ide/ide-h8300.c +++ b/drivers/ide/ide-h8300.c @@ -159,8 +159,7 @@ static const struct ide_tp_ops h8300_tp_ops = { .exec_command = ide_exec_command, .read_status = ide_read_status, .read_altstatus = ide_read_altstatus, - - .set_irq = ide_set_irq, + .write_devctl = ide_write_devctl, .tf_load = h8300_tf_load, .tf_read = h8300_tf_read, diff --git a/drivers/ide/ide-io-std.c b/drivers/ide/ide-io-std.c index 3a867e49a0af..bbeedce6b17d 100644 --- a/drivers/ide/ide-io-std.c +++ b/drivers/ide/ide-io-std.c @@ -64,23 +64,14 @@ u8 ide_read_altstatus(ide_hwif_t *hwif) } EXPORT_SYMBOL_GPL(ide_read_altstatus); -void ide_set_irq(ide_hwif_t *hwif, int on) +void ide_write_devctl(ide_hwif_t *hwif, u8 ctl) { - u8 ctl = ATA_DEVCTL_OBS; - - if (on == 4) { /* hack for SRST */ - ctl |= 4; - on &= ~4; - } - - ctl |= on ? 0 : 2; - if (hwif->host_flags & IDE_HFLAG_MMIO) writeb(ctl, (void __iomem *)hwif->io_ports.ctl_addr); else outb(ctl, hwif->io_ports.ctl_addr); } -EXPORT_SYMBOL_GPL(ide_set_irq); +EXPORT_SYMBOL_GPL(ide_write_devctl); void ide_tf_load(ide_drive_t *drive, struct ide_cmd *cmd) { @@ -312,8 +303,7 @@ const struct ide_tp_ops default_tp_ops = { .exec_command = ide_exec_command, .read_status = ide_read_status, .read_altstatus = ide_read_altstatus, - - .set_irq = ide_set_irq, + .write_devctl = ide_write_devctl, .tf_load = ide_tf_load, .tf_read = ide_tf_read, diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 3c52317d8524..5b57905a7d71 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -494,7 +494,9 @@ repeat: * quirk_list may not like intr setups/cleanups */ if (prev_port && prev_port->cur_dev->quirk_list == 0) - prev_port->tp_ops->set_irq(prev_port, 0); + prev_port->tp_ops->write_devctl(prev_port, + ATA_NIEN | + ATA_DEVCTL_OBS); hwif->host->cur_port = hwif; } diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c index 0caca342802d..ae227dd8466f 100644 --- a/drivers/ide/ide-iops.c +++ b/drivers/ide/ide-iops.c @@ -360,7 +360,7 @@ int ide_config_drive_speed(ide_drive_t *drive, u8 speed) SELECT_DRIVE(drive); SELECT_MASK(drive, 1); udelay(1); - tp_ops->set_irq(hwif, 0); + tp_ops->write_devctl(hwif, ATA_NIEN | ATA_DEVCTL_OBS); memset(&cmd, 0, sizeof(cmd)); cmd.tf_flags = IDE_TFLAG_OUT_FEATURE | IDE_TFLAG_OUT_NSECT; @@ -372,7 +372,7 @@ int ide_config_drive_speed(ide_drive_t *drive, u8 speed) tp_ops->exec_command(hwif, ATA_CMD_SET_FEATURES); if (drive->quirk_list == 2) - tp_ops->set_irq(hwif, 1); + tp_ops->write_devctl(hwif, ATA_DEVCTL_OBS); error = __ide_wait_stat(drive, drive->ready_stat, ATA_BUSY | ATA_DRQ | ATA_ERR, diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c index ebf2d21ebdcb..20553d4c42a2 100644 --- a/drivers/ide/ide-pm.c +++ b/drivers/ide/ide-pm.c @@ -233,7 +233,7 @@ void ide_check_pm_state(ide_drive_t 
*drive, struct request *rq) if (rc) printk(KERN_WARNING "%s: bus not ready on wakeup\n", drive->name); SELECT_DRIVE(drive); - hwif->tp_ops->set_irq(hwif, 1); + hwif->tp_ops->write_devctl(hwif, ATA_DEVCTL_OBS); rc = ide_wait_not_busy(hwif, 100000); if (rc) printk(KERN_WARNING "%s: drive not ready on wakeup\n", drive->name); diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index 7c1f1bf81836..d240f76b0da6 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -260,7 +260,7 @@ int ide_dev_read_id(ide_drive_t *drive, u8 cmd, u16 *id) * during the identify phase that the IRQ handler isn't expecting. */ if (io_ports->ctl_addr) - tp_ops->set_irq(hwif, 0); + tp_ops->write_devctl(hwif, ATA_NIEN | ATA_DEVCTL_OBS); /* take a deep breath */ msleep(50); @@ -628,7 +628,7 @@ static int ide_port_wait_ready(ide_hwif_t *hwif) if ((drive->dev_flags & IDE_DFLAG_NOPROBE) == 0 || (drive->dev_flags & IDE_DFLAG_PRESENT)) { SELECT_DRIVE(drive); - hwif->tp_ops->set_irq(hwif, 1); + hwif->tp_ops->write_devctl(hwif, ATA_DEVCTL_OBS); mdelay(2); rc = ide_wait_not_busy(hwif, 35000); if (rc) @@ -845,7 +845,7 @@ static int init_irq (ide_hwif_t *hwif) irq_handler = ide_intr; if (io_ports->ctl_addr) - hwif->tp_ops->set_irq(hwif, 1); + hwif->tp_ops->write_devctl(hwif, ATA_DEVCTL_OBS); if (request_irq(hwif->irq, irq_handler, sa, hwif->name, hwif)) goto out_up; diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index dba68db629bf..47f13cd11031 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -80,7 +80,7 @@ ide_startstop_t do_rw_taskfile(ide_drive_t *drive, struct ide_cmd *orig_cmd) if ((cmd->tf_flags & IDE_TFLAG_DMA_PIO_FALLBACK) == 0) { ide_tf_dump(drive->name, tf); - tp_ops->set_irq(hwif, 1); + tp_ops->write_devctl(hwif, ATA_DEVCTL_OBS); SELECT_MASK(drive, 0); tp_ops->tf_load(drive, cmd); } diff --git a/drivers/ide/ns87415.c b/drivers/ide/ns87415.c index 13a9e00efa13..00ab0be7335a 100644 --- a/drivers/ide/ns87415.c +++ b/drivers/ide/ns87415.c @@ -109,8 +109,7 @@ static const struct ide_tp_ops superio_tp_ops = { .exec_command = ide_exec_command, .read_status = superio_read_status, .read_altstatus = ide_read_altstatus, - - .set_irq = ide_set_irq, + .write_devctl = ide_write_devctl, .tf_load = ide_tf_load, .tf_read = superio_tf_read, diff --git a/drivers/ide/pmac.c b/drivers/ide/pmac.c index 879c3d8d9f36..7aa45ea37eeb 100644 --- a/drivers/ide/pmac.c +++ b/drivers/ide/pmac.c @@ -476,17 +476,8 @@ static void pmac_exec_command(ide_hwif_t *hwif, u8 cmd) + IDE_TIMING_CONFIG)); } -static void pmac_set_irq(ide_hwif_t *hwif, int on) +static void pmac_write_devctl(ide_hwif_t *hwif, u8 ctl) { - u8 ctl = ATA_DEVCTL_OBS; - - if (on == 4) { /* hack for SRST */ - ctl |= 4; - on &= ~4; - } - - ctl |= on ? 
0 : 2; - writeb(ctl, (void __iomem *)hwif->io_ports.ctl_addr); (void)readl((void __iomem *)(hwif->io_ports.data_addr + IDE_TIMING_CONFIG)); @@ -954,8 +945,7 @@ static const struct ide_tp_ops pmac_tp_ops = { .exec_command = pmac_exec_command, .read_status = ide_read_status, .read_altstatus = ide_read_altstatus, - - .set_irq = pmac_set_irq, + .write_devctl = pmac_write_devctl, .tf_load = ide_tf_load, .tf_read = ide_tf_read, diff --git a/drivers/ide/q40ide.c b/drivers/ide/q40ide.c index 2a43a2f49633..7fddfd34fcce 100644 --- a/drivers/ide/q40ide.c +++ b/drivers/ide/q40ide.c @@ -99,8 +99,7 @@ static const struct ide_tp_ops q40ide_tp_ops = { .exec_command = ide_exec_command, .read_status = ide_read_status, .read_altstatus = ide_read_altstatus, - - .set_irq = ide_set_irq, + .write_devctl = ide_write_devctl, .tf_load = ide_tf_load, .tf_read = ide_tf_read, diff --git a/drivers/ide/scc_pata.c b/drivers/ide/scc_pata.c index 6e47eac1cd7f..6ba4983d831c 100644 --- a/drivers/ide/scc_pata.c +++ b/drivers/ide/scc_pata.c @@ -148,17 +148,8 @@ static u8 scc_dma_sff_read_status(ide_hwif_t *hwif) return (u8)in_be32((void *)(hwif->dma_base + 4)); } -static void scc_set_irq(ide_hwif_t *hwif, int on) +static void scc_write_devctl(ide_hwif_t *hwif, u8 ctl) { - u8 ctl = ATA_DEVCTL_OBS; - - if (on == 4) { /* hack for SRST */ - ctl |= 4; - on &= ~4; - } - - ctl |= on ? 0 : 2; - out_be32((void *)hwif->io_ports.ctl_addr, ctl); eieio(); in_be32((void *)(hwif->dma_base + 0x01c)); @@ -843,8 +834,7 @@ static const struct ide_tp_ops scc_tp_ops = { .exec_command = scc_exec_command, .read_status = scc_read_status, .read_altstatus = scc_read_altstatus, - - .set_irq = scc_set_irq, + .write_devctl = scc_write_devctl, .tf_load = scc_tf_load, .tf_read = scc_tf_read, diff --git a/drivers/ide/sgiioc4.c b/drivers/ide/sgiioc4.c index 6ef5a567d377..58980fcafc3b 100644 --- a/drivers/ide/sgiioc4.c +++ b/drivers/ide/sgiioc4.c @@ -503,8 +503,7 @@ static const struct ide_tp_ops sgiioc4_tp_ops = { .exec_command = ide_exec_command, .read_status = sgiioc4_read_status, .read_altstatus = ide_read_altstatus, - - .set_irq = ide_set_irq, + .write_devctl = ide_write_devctl, .tf_load = ide_tf_load, .tf_read = ide_tf_read, diff --git a/drivers/ide/tx4938ide.c b/drivers/ide/tx4938ide.c index 1c4a78ac1a20..ec3aa32fbbe0 100644 --- a/drivers/ide/tx4938ide.c +++ b/drivers/ide/tx4938ide.c @@ -204,8 +204,7 @@ static const struct ide_tp_ops tx4938ide_tp_ops = { .exec_command = ide_exec_command, .read_status = ide_read_status, .read_altstatus = ide_read_altstatus, - - .set_irq = ide_set_irq, + .write_devctl = ide_write_devctl, .tf_load = tx4938ide_tf_load, .tf_read = tx4938ide_tf_read, diff --git a/drivers/ide/tx4939ide.c b/drivers/ide/tx4939ide.c index 77aee5b2ce95..43bc0372413a 100644 --- a/drivers/ide/tx4939ide.c +++ b/drivers/ide/tx4939ide.c @@ -571,8 +571,7 @@ static const struct ide_tp_ops tx4939ide_tp_ops = { .exec_command = ide_exec_command, .read_status = ide_read_status, .read_altstatus = ide_read_altstatus, - - .set_irq = ide_set_irq, + .write_devctl = ide_write_devctl, .tf_load = tx4939ide_tf_load, .tf_read = tx4939ide_tf_read, @@ -595,8 +594,7 @@ static const struct ide_tp_ops tx4939ide_tp_ops = { .exec_command = ide_exec_command, .read_status = ide_read_status, .read_altstatus = ide_read_altstatus, - - .set_irq = ide_set_irq, + .write_devctl = ide_write_devctl, .tf_load = tx4939ide_tf_load, .tf_read = ide_tf_read, diff --git a/include/linux/ide.h b/include/linux/ide.h index 836c4c6cb7e3..ccb70abe991b 100644 --- a/include/linux/ide.h +++ 
b/include/linux/ide.h @@ -655,8 +655,7 @@ struct ide_tp_ops { void (*exec_command)(struct hwif_s *, u8); u8 (*read_status)(struct hwif_s *); u8 (*read_altstatus)(struct hwif_s *); - - void (*set_irq)(struct hwif_s *, int); + void (*write_devctl)(struct hwif_s *, u8); void (*tf_load)(ide_drive_t *, struct ide_cmd *); void (*tf_read)(ide_drive_t *, struct ide_cmd *); @@ -1165,8 +1164,7 @@ void ide_tf_dump(const char *, struct ide_taskfile *); void ide_exec_command(ide_hwif_t *, u8); u8 ide_read_status(ide_hwif_t *); u8 ide_read_altstatus(ide_hwif_t *); - -void ide_set_irq(ide_hwif_t *, int); +void ide_write_devctl(ide_hwif_t *, u8); void ide_tf_load(ide_drive_t *, struct ide_cmd *); void ide_tf_read(ide_drive_t *, struct ide_cmd *); -- cgit v1.2.3 From 6762511934e6e7287ce3c8baac0d52ef64e3787b Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Tue, 31 Mar 2009 20:15:30 +0200 Subject: ide: rename IDE_TFLAG_IN_[HOB_]FEATURE The feature register has never been readable -- when its location is read, one gets the error register value; hence rename IDE_TFLAG_IN_[HOB_]FEATURE into IDE_TFLAG_IN_[HOB_]ERROR and introduce the 'hob_error' field into the 'struct ide_taskfile' (despite the error register not really depending on the HOB bit). Signed-off-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/at91_ide.c | 16 ++++++++-------- drivers/ide/ide-h8300.c | 16 ++++++++-------- drivers/ide/ide-io-std.c | 16 ++++++++-------- drivers/ide/ide-iops.c | 2 +- drivers/ide/ns87415.c | 16 ++++++++-------- drivers/ide/scc_pata.c | 16 ++++++++-------- drivers/ide/tx4938ide.c | 17 ++++++++--------- drivers/ide/tx4939ide.c | 17 ++++++++--------- include/linux/ide.h | 12 ++++++++---- 9 files changed, 65 insertions(+), 63 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/at91_ide.c b/drivers/ide/at91_ide.c index e6e96743aa7b..9dce793d93b4 100644 --- a/drivers/ide/at91_ide.c +++ b/drivers/ide/at91_ide.c @@ -244,8 +244,8 @@ static void at91_ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) /* be sure we're looking at the low order bits */ ide_mm_outb(ATA_DEVCTL_OBS, io_ports->ctl_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_FEATURE) - tf->feature = ide_mm_inb(io_ports->feature_addr); + if (cmd->tf_flags & IDE_TFLAG_IN_ERROR) + tf->error = ide_mm_inb(io_ports->feature_addr); if (cmd->tf_flags & IDE_TFLAG_IN_NSECT) tf->nsect = ide_mm_inb(io_ports->nsect_addr); if (cmd->tf_flags & IDE_TFLAG_IN_LBAL) @@ -260,16 +260,16 @@ static void at91_ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) if (cmd->tf_flags & IDE_TFLAG_LBA48) { ide_mm_outb(ATA_HOB | ATA_DEVCTL_OBS, io_ports->ctl_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_HOB_FEATURE) - tf->hob_feature = ide_mm_inb(io_ports->feature_addr); + if (cmd->tf_flags & IDE_TFLAG_IN_HOB_ERROR) + tf->hob_error = ide_mm_inb(io_ports->feature_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_NSECT) - tf->hob_nsect = ide_mm_inb(io_ports->nsect_addr); + tf->hob_nsect = ide_mm_inb(io_ports->nsect_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAL) - tf->hob_lbal = ide_mm_inb(io_ports->lbal_addr); + tf->hob_lbal = ide_mm_inb(io_ports->lbal_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAM) - tf->hob_lbam = ide_mm_inb(io_ports->lbam_addr); + tf->hob_lbam = ide_mm_inb(io_ports->lbam_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAH) - tf->hob_lbah = ide_mm_inb(io_ports->lbah_addr); + tf->hob_lbah = ide_mm_inb(io_ports->lbah_addr); } } diff --git a/drivers/ide/ide-h8300.c b/drivers/ide/ide-h8300.c index a57ccad61acf..1d45cd5b6a1c 100644 --- 
a/drivers/ide/ide-h8300.c +++ b/drivers/ide/ide-h8300.c @@ -100,8 +100,8 @@ static void h8300_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) /* be sure we're looking at the low order bits */ outb(ATA_DEVCTL_OBS, io_ports->ctl_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_FEATURE) - tf->feature = inb(io_ports->feature_addr); + if (cmd->tf_flags & IDE_TFLAG_IN_ERROR) + tf->error = inb(io_ports->feature_addr); if (cmd->tf_flags & IDE_TFLAG_IN_NSECT) tf->nsect = inb(io_ports->nsect_addr); if (cmd->tf_flags & IDE_TFLAG_IN_LBAL) @@ -116,16 +116,16 @@ static void h8300_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) if (cmd->tf_flags & IDE_TFLAG_LBA48) { outb(ATA_HOB | ATA_DEVCTL_OBS, io_ports->ctl_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_HOB_FEATURE) - tf->hob_feature = inb(io_ports->feature_addr); + if (cmd->tf_flags & IDE_TFLAG_IN_HOB_ERROR) + tf->hob_error = inb(io_ports->feature_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_NSECT) - tf->hob_nsect = inb(io_ports->nsect_addr); + tf->hob_nsect = inb(io_ports->nsect_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAL) - tf->hob_lbal = inb(io_ports->lbal_addr); + tf->hob_lbal = inb(io_ports->lbal_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAM) - tf->hob_lbam = inb(io_ports->lbam_addr); + tf->hob_lbam = inb(io_ports->lbam_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAH) - tf->hob_lbah = inb(io_ports->lbah_addr); + tf->hob_lbah = inb(io_ports->lbah_addr); } } diff --git a/drivers/ide/ide-io-std.c b/drivers/ide/ide-io-std.c index bbeedce6b17d..31f5c5f4c093 100644 --- a/drivers/ide/ide-io-std.c +++ b/drivers/ide/ide-io-std.c @@ -159,8 +159,8 @@ void ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) /* be sure we're looking at the low order bits */ tf_outb(ATA_DEVCTL_OBS, io_ports->ctl_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_FEATURE) - tf->feature = tf_inb(io_ports->feature_addr); + if (cmd->tf_flags & IDE_TFLAG_IN_ERROR) + tf->error = tf_inb(io_ports->feature_addr); if (cmd->tf_flags & IDE_TFLAG_IN_NSECT) tf->nsect = tf_inb(io_ports->nsect_addr); if (cmd->tf_flags & IDE_TFLAG_IN_LBAL) @@ -175,16 +175,16 @@ void ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) if (cmd->tf_flags & IDE_TFLAG_LBA48) { tf_outb(ATA_HOB | ATA_DEVCTL_OBS, io_ports->ctl_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_HOB_FEATURE) - tf->hob_feature = tf_inb(io_ports->feature_addr); + if (cmd->tf_flags & IDE_TFLAG_IN_HOB_ERROR) + tf->hob_error = tf_inb(io_ports->feature_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_NSECT) - tf->hob_nsect = tf_inb(io_ports->nsect_addr); + tf->hob_nsect = tf_inb(io_ports->nsect_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAL) - tf->hob_lbal = tf_inb(io_ports->lbal_addr); + tf->hob_lbal = tf_inb(io_ports->lbal_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAM) - tf->hob_lbam = tf_inb(io_ports->lbam_addr); + tf->hob_lbam = tf_inb(io_ports->lbam_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAH) - tf->hob_lbah = tf_inb(io_ports->lbah_addr); + tf->hob_lbah = tf_inb(io_ports->lbah_addr); } } EXPORT_SYMBOL_GPL(ide_tf_read); diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c index ae227dd8466f..6f363a26700d 100644 --- a/drivers/ide/ide-iops.c +++ b/drivers/ide/ide-iops.c @@ -55,7 +55,7 @@ u8 ide_read_error(ide_drive_t *drive) struct ide_cmd cmd; memset(&cmd, 0, sizeof(cmd)); - cmd.tf_flags = IDE_TFLAG_IN_FEATURE; + cmd.tf_flags = IDE_TFLAG_IN_ERROR; drive->hwif->tp_ops->tf_read(drive, &cmd); diff --git a/drivers/ide/ns87415.c b/drivers/ide/ns87415.c index 00ab0be7335a..0a6cf74c3265 100644 --- a/drivers/ide/ns87415.c +++ b/drivers/ide/ns87415.c 
@@ -76,8 +76,8 @@ static void superio_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) /* be sure we're looking at the low order bits */ outb(ATA_DEVCTL_OBS, io_ports->ctl_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_FEATURE) - tf->feature = inb(io_ports->feature_addr); + if (cmd->tf_flags & IDE_TFLAG_IN_ERROR) + tf->error = inb(io_ports->feature_addr); if (cmd->tf_flags & IDE_TFLAG_IN_NSECT) tf->nsect = inb(io_ports->nsect_addr); if (cmd->tf_flags & IDE_TFLAG_IN_LBAL) @@ -92,16 +92,16 @@ static void superio_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) if (cmd->tf_flags & IDE_TFLAG_LBA48) { outb(ATA_HOB | ATA_DEVCTL_OBS, io_ports->ctl_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_HOB_FEATURE) - tf->hob_feature = inb(io_ports->feature_addr); + if (cmd->tf_flags & IDE_TFLAG_IN_HOB_ERROR) + tf->hob_error = inb(io_ports->feature_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_NSECT) - tf->hob_nsect = inb(io_ports->nsect_addr); + tf->hob_nsect = inb(io_ports->nsect_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAL) - tf->hob_lbal = inb(io_ports->lbal_addr); + tf->hob_lbal = inb(io_ports->lbal_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAM) - tf->hob_lbam = inb(io_ports->lbam_addr); + tf->hob_lbam = inb(io_ports->lbam_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAH) - tf->hob_lbah = inb(io_ports->lbah_addr); + tf->hob_lbah = inb(io_ports->lbah_addr); } } diff --git a/drivers/ide/scc_pata.c b/drivers/ide/scc_pata.c index 6ba4983d831c..ea0a9752c6f9 100644 --- a/drivers/ide/scc_pata.c +++ b/drivers/ide/scc_pata.c @@ -702,8 +702,8 @@ static void scc_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) /* be sure we're looking at the low order bits */ scc_ide_outb(ATA_DEVCTL_OBS, io_ports->ctl_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_FEATURE) - tf->feature = scc_ide_inb(io_ports->feature_addr); + if (cmd->tf_flags & IDE_TFLAG_IN_ERROR) + tf->error = scc_ide_inb(io_ports->feature_addr); if (cmd->tf_flags & IDE_TFLAG_IN_NSECT) tf->nsect = scc_ide_inb(io_ports->nsect_addr); if (cmd->tf_flags & IDE_TFLAG_IN_LBAL) @@ -718,16 +718,16 @@ static void scc_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) if (cmd->tf_flags & IDE_TFLAG_LBA48) { scc_ide_outb(ATA_HOB | ATA_DEVCTL_OBS, io_ports->ctl_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_HOB_FEATURE) - tf->hob_feature = scc_ide_inb(io_ports->feature_addr); + if (cmd->tf_flags & IDE_TFLAG_IN_HOB_ERROR) + tf->hob_error = scc_ide_inb(io_ports->feature_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_NSECT) - tf->hob_nsect = scc_ide_inb(io_ports->nsect_addr); + tf->hob_nsect = scc_ide_inb(io_ports->nsect_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAL) - tf->hob_lbal = scc_ide_inb(io_ports->lbal_addr); + tf->hob_lbal = scc_ide_inb(io_ports->lbal_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAM) - tf->hob_lbam = scc_ide_inb(io_ports->lbam_addr); + tf->hob_lbam = scc_ide_inb(io_ports->lbam_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAH) - tf->hob_lbah = scc_ide_inb(io_ports->lbah_addr); + tf->hob_lbah = scc_ide_inb(io_ports->lbah_addr); } } diff --git a/drivers/ide/tx4938ide.c b/drivers/ide/tx4938ide.c index ec3aa32fbbe0..606c37f5267d 100644 --- a/drivers/ide/tx4938ide.c +++ b/drivers/ide/tx4938ide.c @@ -144,8 +144,8 @@ static void tx4938ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) /* be sure we're looking at the low order bits */ tx4938ide_outb(ATA_DEVCTL_OBS, io_ports->ctl_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_FEATURE) - tf->feature = tx4938ide_inb(io_ports->feature_addr); + if (cmd->tf_flags & IDE_TFLAG_IN_ERROR) + tf->error = 
tx4938ide_inb(io_ports->feature_addr); if (cmd->tf_flags & IDE_TFLAG_IN_NSECT) tf->nsect = tx4938ide_inb(io_ports->nsect_addr); if (cmd->tf_flags & IDE_TFLAG_IN_LBAL) @@ -160,17 +160,16 @@ static void tx4938ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) if (cmd->tf_flags & IDE_TFLAG_LBA48) { tx4938ide_outb(ATA_HOB | ATA_DEVCTL_OBS, io_ports->ctl_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_HOB_FEATURE) - tf->hob_feature = - tx4938ide_inb(io_ports->feature_addr); + if (cmd->tf_flags & IDE_TFLAG_IN_HOB_ERROR) + tf->hob_error = tx4938ide_inb(io_ports->feature_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_NSECT) - tf->hob_nsect = tx4938ide_inb(io_ports->nsect_addr); + tf->hob_nsect = tx4938ide_inb(io_ports->nsect_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAL) - tf->hob_lbal = tx4938ide_inb(io_ports->lbal_addr); + tf->hob_lbal = tx4938ide_inb(io_ports->lbal_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAM) - tf->hob_lbam = tx4938ide_inb(io_ports->lbam_addr); + tf->hob_lbam = tx4938ide_inb(io_ports->lbam_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAH) - tf->hob_lbah = tx4938ide_inb(io_ports->lbah_addr); + tf->hob_lbah = tx4938ide_inb(io_ports->lbah_addr); } } diff --git a/drivers/ide/tx4939ide.c b/drivers/ide/tx4939ide.c index 43bc0372413a..f1e9da71110c 100644 --- a/drivers/ide/tx4939ide.c +++ b/drivers/ide/tx4939ide.c @@ -511,8 +511,8 @@ static void tx4939ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) /* be sure we're looking at the low order bits */ tx4939ide_outb(ATA_DEVCTL_OBS, io_ports->ctl_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_FEATURE) - tf->feature = tx4939ide_inb(io_ports->feature_addr); + if (cmd->tf_flags & IDE_TFLAG_IN_ERROR) + tf->error = tx4939ide_inb(io_ports->feature_addr); if (cmd->tf_flags & IDE_TFLAG_IN_NSECT) tf->nsect = tx4939ide_inb(io_ports->nsect_addr); if (cmd->tf_flags & IDE_TFLAG_IN_LBAL) @@ -527,17 +527,16 @@ static void tx4939ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) if (cmd->tf_flags & IDE_TFLAG_LBA48) { tx4939ide_outb(ATA_HOB | ATA_DEVCTL_OBS, io_ports->ctl_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_HOB_FEATURE) - tf->hob_feature = - tx4939ide_inb(io_ports->feature_addr); + if (cmd->tf_flags & IDE_TFLAG_IN_HOB_ERROR) + tf->hob_error = tx4939ide_inb(io_ports->feature_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_NSECT) - tf->hob_nsect = tx4939ide_inb(io_ports->nsect_addr); + tf->hob_nsect = tx4939ide_inb(io_ports->nsect_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAL) - tf->hob_lbal = tx4939ide_inb(io_ports->lbal_addr); + tf->hob_lbal = tx4939ide_inb(io_ports->lbal_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAM) - tf->hob_lbam = tx4939ide_inb(io_ports->lbam_addr); + tf->hob_lbam = tx4939ide_inb(io_ports->lbam_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAH) - tf->hob_lbah = tx4939ide_inb(io_ports->lbah_addr); + tf->hob_lbah = tx4939ide_inb(io_ports->lbah_addr); } } diff --git a/include/linux/ide.h b/include/linux/ide.h index ccb70abe991b..e919c865f0c7 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -265,7 +265,7 @@ enum { IDE_TFLAG_WRITE = (1 << 12), IDE_TFLAG_CUSTOM_HANDLER = (1 << 13), IDE_TFLAG_DMA_PIO_FALLBACK = (1 << 14), - IDE_TFLAG_IN_HOB_FEATURE = (1 << 15), + IDE_TFLAG_IN_HOB_ERROR = (1 << 15), IDE_TFLAG_IN_HOB_NSECT = (1 << 16), IDE_TFLAG_IN_HOB_LBAL = (1 << 17), IDE_TFLAG_IN_HOB_LBAM = (1 << 18), @@ -273,10 +273,10 @@ enum { IDE_TFLAG_IN_HOB_LBA = IDE_TFLAG_IN_HOB_LBAL | IDE_TFLAG_IN_HOB_LBAM | IDE_TFLAG_IN_HOB_LBAH, - IDE_TFLAG_IN_HOB = IDE_TFLAG_IN_HOB_FEATURE | + IDE_TFLAG_IN_HOB = IDE_TFLAG_IN_HOB_ERROR | 
IDE_TFLAG_IN_HOB_NSECT | IDE_TFLAG_IN_HOB_LBA, - IDE_TFLAG_IN_FEATURE = (1 << 20), + IDE_TFLAG_IN_ERROR = (1 << 20), IDE_TFLAG_IN_NSECT = (1 << 21), IDE_TFLAG_IN_LBAL = (1 << 22), IDE_TFLAG_IN_LBAM = (1 << 23), @@ -310,8 +310,12 @@ enum { struct ide_taskfile { u8 hob_data; /* 0: high data byte (for TASKFILE IOCTL) */ + /* 1-5: additional data to support LBA48 */ + union { + u8 hob_error; /* read: error */ + u8 hob_feature; /* write: feature */ + }; - u8 hob_feature; /* 1-5: additional data to support LBA48 */ u8 hob_nsect; u8 hob_lbal; u8 hob_lbam; -- cgit v1.2.3 From abb596b25edac1ec1acc4ef53df190771661c3d2 Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Tue, 31 Mar 2009 20:15:32 +0200 Subject: ide: turn selectproc() method into dev_select() method (take 5) Turn selectproc() method into dev_select() method by teaching it to write to the device register and moving it from 'struct ide_port_ops' to 'struct ide_tp_ops'. Signed-off-by: Sergei Shtylyov Cc: benh@kernel.crashing.org Cc: petkovbb@gmail.com [bart: add ->dev_select to at91_ide.c and tx4939.c (__BIG_ENDIAN case)] Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/at91_ide.c | 1 + drivers/ide/au1xxx-ide.c | 1 + drivers/ide/falconide.c | 1 + drivers/ide/ht6560b.c | 20 +++++++++++++++-- drivers/ide/ide-h8300.c | 1 + drivers/ide/ide-io-std.c | 13 +++++++++++ drivers/ide/ide-iops.c | 12 +--------- drivers/ide/ns87415.c | 25 ++++++++++++++++----- drivers/ide/pmac.c | 58 ++++++++++++++++++++++++++++++++---------------- drivers/ide/q40ide.c | 1 + drivers/ide/qd65xx.c | 21 +++++++++++++++--- drivers/ide/scc_pata.c | 1 + drivers/ide/sgiioc4.c | 1 + drivers/ide/trm290.c | 20 +++++++++++++---- drivers/ide/tx4938ide.c | 1 + drivers/ide/tx4939ide.c | 4 +++- include/linux/ide.h | 6 ++--- 17 files changed, 139 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/at91_ide.c b/drivers/ide/at91_ide.c index 04b39ff02d76..8eda552326e9 100644 --- a/drivers/ide/at91_ide.c +++ b/drivers/ide/at91_ide.c @@ -283,6 +283,7 @@ static const struct ide_tp_ops at91_ide_tp_ops = { .read_altstatus = ide_read_altstatus, .write_devctl = ide_write_devctl, + .dev_select = ide_dev_select, .tf_load = at91_ide_tf_load, .tf_read = at91_ide_tf_read, diff --git a/drivers/ide/au1xxx-ide.c b/drivers/ide/au1xxx-ide.c index 2ca10d533dad..46013644c965 100644 --- a/drivers/ide/au1xxx-ide.c +++ b/drivers/ide/au1xxx-ide.c @@ -469,6 +469,7 @@ static const struct ide_tp_ops au1xxx_tp_ops = { .read_altstatus = ide_read_altstatus, .write_devctl = ide_write_devctl, + .dev_select = ide_dev_select, .tf_load = ide_tf_load, .tf_read = ide_tf_read, diff --git a/drivers/ide/falconide.c b/drivers/ide/falconide.c index 5063be85dc33..afa2af9a362b 100644 --- a/drivers/ide/falconide.c +++ b/drivers/ide/falconide.c @@ -91,6 +91,7 @@ static const struct ide_tp_ops falconide_tp_ops = { .read_altstatus = ide_read_altstatus, .write_devctl = ide_write_devctl, + .dev_select = ide_dev_select, .tf_load = ide_tf_load, .tf_read = ide_tf_read, diff --git a/drivers/ide/ht6560b.c b/drivers/ide/ht6560b.c index c7e5c2246b79..2fb0f2965009 100644 --- a/drivers/ide/ht6560b.c +++ b/drivers/ide/ht6560b.c @@ -103,7 +103,7 @@ /* * This routine is invoked from ide.c to prepare for access to a given drive. 
*/ -static void ht6560b_selectproc (ide_drive_t *drive) +static void ht6560b_dev_select(ide_drive_t *drive) { ide_hwif_t *hwif = drive->hwif; unsigned long flags; @@ -143,6 +143,8 @@ static void ht6560b_selectproc (ide_drive_t *drive) #endif } local_irq_restore(flags); + + outb(drive->select | ATA_DEVICE_OBS, hwif->io_ports.device_addr); } /* @@ -305,15 +307,29 @@ static int probe_ht6560b; module_param_named(probe, probe_ht6560b, bool, 0); MODULE_PARM_DESC(probe, "probe for HT6560B chipset"); +static const struct ide_tp_ops ht6560b_tp_ops = { + .exec_command = ide_exec_command, + .read_status = ide_read_status, + .read_altstatus = ide_read_altstatus, + .write_devctl = ide_write_devctl, + + .dev_select = ht6560b_dev_select, + .tf_load = ide_tf_load, + .tf_read = ide_tf_read, + + .input_data = ide_input_data, + .output_data = ide_output_data, +}; + static const struct ide_port_ops ht6560b_port_ops = { .init_dev = ht6560b_init_dev, .set_pio_mode = ht6560b_set_pio_mode, - .selectproc = ht6560b_selectproc, }; static const struct ide_port_info ht6560b_port_info __initdata = { .name = DRV_NAME, .chipset = ide_ht6560b, + .tp_ops = &ht6560b_tp_ops, .port_ops = &ht6560b_port_ops, .host_flags = IDE_HFLAG_SERIALIZE | /* is this needed? */ IDE_HFLAG_NO_DMA | diff --git a/drivers/ide/ide-h8300.c b/drivers/ide/ide-h8300.c index 8541a9abd7ac..dac9a6d44963 100644 --- a/drivers/ide/ide-h8300.c +++ b/drivers/ide/ide-h8300.c @@ -151,6 +151,7 @@ static const struct ide_tp_ops h8300_tp_ops = { .read_altstatus = ide_read_altstatus, .write_devctl = ide_write_devctl, + .dev_select = ide_dev_select, .tf_load = h8300_tf_load, .tf_read = h8300_tf_read, diff --git a/drivers/ide/ide-io-std.c b/drivers/ide/ide-io-std.c index 7f77bb7db488..9cac281d82c4 100644 --- a/drivers/ide/ide-io-std.c +++ b/drivers/ide/ide-io-std.c @@ -73,6 +73,18 @@ void ide_write_devctl(ide_hwif_t *hwif, u8 ctl) } EXPORT_SYMBOL_GPL(ide_write_devctl); +void ide_dev_select(ide_drive_t *drive) +{ + ide_hwif_t *hwif = drive->hwif; + u8 select = drive->select | ATA_DEVICE_OBS; + + if (hwif->host_flags & IDE_HFLAG_MMIO) + writeb(select, (void __iomem *)hwif->io_ports.device_addr); + else + outb(select, hwif->io_ports.device_addr); +} +EXPORT_SYMBOL_GPL(ide_dev_select); + void ide_tf_load(ide_drive_t *drive, struct ide_cmd *cmd) { ide_hwif_t *hwif = drive->hwif; @@ -280,6 +292,7 @@ const struct ide_tp_ops default_tp_ops = { .read_altstatus = ide_read_altstatus, .write_devctl = ide_write_devctl, + .dev_select = ide_dev_select, .tf_load = ide_tf_load, .tf_read = ide_tf_read, diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c index 6f363a26700d..dfb0ec317fa3 100644 --- a/drivers/ide/ide-iops.c +++ b/drivers/ide/ide-iops.c @@ -29,17 +29,7 @@ void SELECT_DRIVE(ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; - const struct ide_port_ops *port_ops = hwif->port_ops; - struct ide_cmd cmd; - - if (port_ops && port_ops->selectproc) - port_ops->selectproc(drive); - - memset(&cmd, 0, sizeof(cmd)); - cmd.tf_flags = IDE_TFLAG_OUT_DEVICE; - - drive->hwif->tp_ops->tf_load(drive, &cmd); + drive->hwif->tp_ops->dev_select(drive); } void SELECT_MASK(ide_drive_t *drive, int mask) diff --git a/drivers/ide/ns87415.c b/drivers/ide/ns87415.c index 9f6dff83b141..af1b421eb450 100644 --- a/drivers/ide/ns87415.c +++ b/drivers/ide/ns87415.c @@ -98,12 +98,15 @@ static void superio_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) } } +static void ns87415_dev_select(ide_drive_t *drive); + static const struct ide_tp_ops superio_tp_ops = { .exec_command = ide_exec_command, 
.read_status = superio_read_status, .read_altstatus = ide_read_altstatus, .write_devctl = ide_write_devctl, + .dev_select = ns87415_dev_select, .tf_load = ide_tf_load, .tf_read = superio_tf_read, @@ -182,10 +185,12 @@ static void ns87415_prepare_drive (ide_drive_t *drive, unsigned int use_dma) local_irq_restore(flags); } -static void ns87415_selectproc (ide_drive_t *drive) +static void ns87415_dev_select(ide_drive_t *drive) { ns87415_prepare_drive(drive, !!(drive->dev_flags & IDE_DFLAG_USING_DMA)); + + outb(drive->select | ATA_DEVICE_OBS, drive->hwif->io_ports.device_addr); } static void ns87415_dma_start(ide_drive_t *drive) @@ -229,7 +234,7 @@ static void __devinit init_hwif_ns87415 (ide_hwif_t *hwif) * Also, leave IRQ masked during drive probing, to prevent infinite * interrupts from a potentially floating INTA.. * - * IRQs get unmasked in selectproc when drive is first used. + * IRQs get unmasked in dev_select() when drive is first used. */ (void) pci_read_config_dword(dev, 0x40, &ctrl); (void) pci_read_config_byte(dev, 0x09, &progif); @@ -281,8 +286,18 @@ static void __devinit init_hwif_ns87415 (ide_hwif_t *hwif) outb(0x60, hwif->dma_base + ATA_DMA_STATUS); } -static const struct ide_port_ops ns87415_port_ops = { - .selectproc = ns87415_selectproc, +static const struct ide_tp_ops ns87415_tp_ops = { + .exec_command = ide_exec_command, + .read_status = ide_read_status, + .read_altstatus = ide_read_altstatus, + .write_devctl = ide_write_devctl, + + .dev_select = ns87415_dev_select, + .tf_load = ide_tf_load, + .tf_read = ide_tf_read, + + .input_data = ide_input_data, + .output_data = ide_output_data, }; static const struct ide_dma_ops ns87415_dma_ops = { @@ -299,7 +314,7 @@ static const struct ide_dma_ops ns87415_dma_ops = { static const struct ide_port_info ns87415_chipset __devinitdata = { .name = DRV_NAME, .init_hwif = init_hwif_ns87415, - .port_ops = &ns87415_port_ops, + .tp_ops = &ns87415_tp_ops, .dma_ops = &ns87415_dma_ops, .host_flags = IDE_HFLAG_TRUST_BIOS_FOR_DMA | IDE_HFLAG_NO_ATAPI_DMA, diff --git a/drivers/ide/pmac.c b/drivers/ide/pmac.c index 7aa45ea37eeb..24ce1f805cd7 100644 --- a/drivers/ide/pmac.c +++ b/drivers/ide/pmac.c @@ -404,8 +404,6 @@ kauai_lookup_timing(struct kauai_timing* table, int cycle_time) #define IDE_WAKEUP_DELAY (1*HZ) static int pmac_ide_init_dma(ide_hwif_t *, const struct ide_port_info *); -static void pmac_ide_selectproc(ide_drive_t *drive); -static void pmac_ide_kauai_selectproc(ide_drive_t *drive); #define PMAC_IDE_REG(x) \ ((void __iomem *)((drive)->hwif->io_ports.data_addr + (x))) @@ -415,8 +413,7 @@ static void pmac_ide_kauai_selectproc(ide_drive_t *drive); * timing register when selecting that unit. This version is for * ASICs with a single timing register */ -static void -pmac_ide_selectproc(ide_drive_t *drive) +static void pmac_ide_apply_timings(ide_drive_t *drive) { ide_hwif_t *hwif = drive->hwif; pmac_ide_hwif_t *pmif = @@ -434,8 +431,7 @@ pmac_ide_selectproc(ide_drive_t *drive) * timing register when selecting that unit. 
This version is for * ASICs with a dual timing register (Kauai) */ -static void -pmac_ide_kauai_selectproc(ide_drive_t *drive) +static void pmac_ide_kauai_apply_timings(ide_drive_t *drive) { ide_hwif_t *hwif = drive->hwif; pmac_ide_hwif_t *pmif = @@ -464,9 +460,25 @@ pmac_ide_do_update_timings(ide_drive_t *drive) if (pmif->kind == controller_sh_ata6 || pmif->kind == controller_un_ata6 || pmif->kind == controller_k2_ata6) - pmac_ide_kauai_selectproc(drive); + pmac_ide_kauai_apply_timings(drive); else - pmac_ide_selectproc(drive); + pmac_ide_apply_timings(drive); +} + +static void pmac_dev_select(ide_drive_t *drive) +{ + pmac_ide_apply_timings(drive); + + writeb(drive->select | ATA_DEVICE_OBS, + (void __iomem *)drive->hwif->io_ports.device_addr); +} + +static void pmac_kauai_dev_select(ide_drive_t *drive) +{ + pmac_ide_kauai_apply_timings(drive); + + writeb(drive->select | ATA_DEVICE_OBS, + (void __iomem *)drive->hwif->io_ports.device_addr); } static void pmac_exec_command(ide_hwif_t *hwif, u8 cmd) @@ -947,6 +959,7 @@ static const struct ide_tp_ops pmac_tp_ops = { .read_altstatus = ide_read_altstatus, .write_devctl = pmac_write_devctl, + .dev_select = pmac_dev_select, .tf_load = ide_tf_load, .tf_read = ide_tf_read, @@ -954,19 +967,24 @@ static const struct ide_tp_ops pmac_tp_ops = { .output_data = ide_output_data, }; -static const struct ide_port_ops pmac_ide_ata6_port_ops = { - .init_dev = pmac_ide_init_dev, - .set_pio_mode = pmac_ide_set_pio_mode, - .set_dma_mode = pmac_ide_set_dma_mode, - .selectproc = pmac_ide_kauai_selectproc, - .cable_detect = pmac_ide_cable_detect, +static const struct ide_tp_ops pmac_ata6_tp_ops = { + .exec_command = pmac_exec_command, + .read_status = ide_read_status, + .read_altstatus = ide_read_altstatus, + .write_devctl = pmac_write_devctl, + + .dev_select = pmac_kauai_dev_select, + .tf_load = ide_tf_load, + .tf_read = ide_tf_read, + + .input_data = ide_input_data, + .output_data = ide_output_data, }; static const struct ide_port_ops pmac_ide_ata4_port_ops = { .init_dev = pmac_ide_init_dev, .set_pio_mode = pmac_ide_set_pio_mode, .set_dma_mode = pmac_ide_set_dma_mode, - .selectproc = pmac_ide_selectproc, .cable_detect = pmac_ide_cable_detect, }; @@ -974,7 +992,6 @@ static const struct ide_port_ops pmac_ide_port_ops = { .init_dev = pmac_ide_init_dev, .set_pio_mode = pmac_ide_set_pio_mode, .set_dma_mode = pmac_ide_set_dma_mode, - .selectproc = pmac_ide_selectproc, }; static const struct ide_dma_ops pmac_dma_ops; @@ -1011,15 +1028,18 @@ static int __devinit pmac_ide_setup_device(pmac_ide_hwif_t *pmif, hw_regs_t *hw) pmif->broken_dma = pmif->broken_dma_warn = 0; if (of_device_is_compatible(np, "shasta-ata")) { pmif->kind = controller_sh_ata6; - d.port_ops = &pmac_ide_ata6_port_ops; + d.tp_ops = &pmac_ata6_tp_ops; + d.port_ops = &pmac_ide_ata4_port_ops; d.udma_mask = ATA_UDMA6; } else if (of_device_is_compatible(np, "kauai-ata")) { pmif->kind = controller_un_ata6; - d.port_ops = &pmac_ide_ata6_port_ops; + d.tp_ops = &pmac_ata6_tp_ops; + d.port_ops = &pmac_ide_ata4_port_ops; d.udma_mask = ATA_UDMA5; } else if (of_device_is_compatible(np, "K2-UATA")) { pmif->kind = controller_k2_ata6; - d.port_ops = &pmac_ide_ata6_port_ops; + d.tp_ops = &pmac_ata6_tp_ops; + d.port_ops = &pmac_ide_ata4_port_ops; d.udma_mask = ATA_UDMA5; } else if (of_device_is_compatible(np, "keylargo-ata")) { if (strcmp(np->name, "ata-4") == 0) { diff --git a/drivers/ide/q40ide.c b/drivers/ide/q40ide.c index 7fddfd34fcce..d007e7f66598 100644 --- a/drivers/ide/q40ide.c +++ b/drivers/ide/q40ide.c @@ -101,6 
+101,7 @@ static const struct ide_tp_ops q40ide_tp_ops = { .read_altstatus = ide_read_altstatus, .write_devctl = ide_write_devctl, + .dev_select = ide_dev_select, .tf_load = ide_tf_load, .tf_read = ide_tf_read, diff --git a/drivers/ide/qd65xx.c b/drivers/ide/qd65xx.c index 08c4fa35e9b1..c9a134986891 100644 --- a/drivers/ide/qd65xx.c +++ b/drivers/ide/qd65xx.c @@ -90,13 +90,15 @@ static int timings[4]={-1,-1,-1,-1}; /* stores current timing for each timer */ * This routine is invoked to prepare for access to a given drive. */ -static void qd65xx_select(ide_drive_t *drive) +static void qd65xx_dev_select(ide_drive_t *drive) { u8 index = (( (QD_TIMREG(drive)) & 0x80 ) >> 7) | (QD_TIMREG(drive) & 0x02); if (timings[index] != QD_TIMING(drive)) outb(timings[index] = QD_TIMING(drive), QD_TIMREG(drive)); + + outb(drive->select | ATA_DEVICE_OBS, drive->hwif->io_ports.device_addr); } /* @@ -309,20 +311,33 @@ static void __init qd6580_init_dev(ide_drive_t *drive) drive->drive_data = (drive->dn & 1) ? t2 : t1; } +static const struct ide_tp_ops qd65xx_tp_ops = { + .exec_command = ide_exec_command, + .read_status = ide_read_status, + .read_altstatus = ide_read_altstatus, + .write_devctl = ide_write_devctl, + + .dev_select = qd65xx_dev_select, + .tf_load = ide_tf_load, + .tf_read = ide_tf_read, + + .input_data = ide_input_data, + .output_data = ide_output_data, +}; + static const struct ide_port_ops qd6500_port_ops = { .init_dev = qd6500_init_dev, .set_pio_mode = qd6500_set_pio_mode, - .selectproc = qd65xx_select, }; static const struct ide_port_ops qd6580_port_ops = { .init_dev = qd6580_init_dev, .set_pio_mode = qd6580_set_pio_mode, - .selectproc = qd65xx_select, }; static const struct ide_port_info qd65xx_port_info __initdata = { .name = DRV_NAME, + .tp_ops = &qd65xx_tp_ops, .chipset = ide_qd65xx, .host_flags = IDE_HFLAG_IO_32BIT | IDE_HFLAG_NO_DMA, diff --git a/drivers/ide/scc_pata.c b/drivers/ide/scc_pata.c index 97f8e0ef21b1..6d8dbd9c10bc 100644 --- a/drivers/ide/scc_pata.c +++ b/drivers/ide/scc_pata.c @@ -825,6 +825,7 @@ static const struct ide_tp_ops scc_tp_ops = { .read_altstatus = scc_read_altstatus, .write_devctl = scc_write_devctl, + .dev_select = ide_dev_select, .tf_load = scc_tf_load, .tf_read = scc_tf_read, diff --git a/drivers/ide/sgiioc4.c b/drivers/ide/sgiioc4.c index 58980fcafc3b..e5d2a48a84de 100644 --- a/drivers/ide/sgiioc4.c +++ b/drivers/ide/sgiioc4.c @@ -505,6 +505,7 @@ static const struct ide_tp_ops sgiioc4_tp_ops = { .read_altstatus = ide_read_altstatus, .write_devctl = ide_write_devctl, + .dev_select = ide_dev_select, .tf_load = ide_tf_load, .tf_read = ide_tf_read, diff --git a/drivers/ide/trm290.c b/drivers/ide/trm290.c index c0528f27fcae..4b42ca091534 100644 --- a/drivers/ide/trm290.c +++ b/drivers/ide/trm290.c @@ -171,9 +171,11 @@ static void trm290_prepare_drive (ide_drive_t *drive, unsigned int use_dma) local_irq_restore(flags); } -static void trm290_selectproc (ide_drive_t *drive) +static void trm290_dev_select(ide_drive_t *drive) { trm290_prepare_drive(drive, !!(drive->dev_flags & IDE_DFLAG_USING_DMA)); + + outb(drive->select | ATA_DEVICE_OBS, drive->hwif->io_ports.device_addr); } static int trm290_dma_check(ide_drive_t *drive, struct ide_cmd *cmd) @@ -298,8 +300,18 @@ static void __devinit init_hwif_trm290(ide_hwif_t *hwif) #endif } -static const struct ide_port_ops trm290_port_ops = { - .selectproc = trm290_selectproc, +static const struct ide_tp_ops trm290_tp_ops = { + .exec_command = ide_exec_command, + .read_status = ide_read_status, + .read_altstatus = 
ide_read_altstatus, + .write_devctl = ide_write_devctl, + + .dev_select = trm290_dev_select, + .tf_load = ide_tf_load, + .tf_read = ide_tf_read, + + .input_data = ide_input_data, + .output_data = ide_output_data, }; static struct ide_dma_ops trm290_dma_ops = { @@ -315,7 +327,7 @@ static struct ide_dma_ops trm290_dma_ops = { static const struct ide_port_info trm290_chipset __devinitdata = { .name = DRV_NAME, .init_hwif = init_hwif_trm290, - .port_ops = &trm290_port_ops, + .tp_ops = &trm290_tp_ops, .dma_ops = &trm290_dma_ops, .host_flags = IDE_HFLAG_TRM290 | IDE_HFLAG_NO_ATAPI_DMA | diff --git a/drivers/ide/tx4938ide.c b/drivers/ide/tx4938ide.c index be391b615963..4cb79c4c2604 100644 --- a/drivers/ide/tx4938ide.c +++ b/drivers/ide/tx4938ide.c @@ -189,6 +189,7 @@ static const struct ide_tp_ops tx4938ide_tp_ops = { .read_altstatus = ide_read_altstatus, .write_devctl = ide_write_devctl, + .dev_select = ide_dev_select, .tf_load = tx4938ide_tf_load, .tf_read = tx4938ide_tf_read, diff --git a/drivers/ide/tx4939ide.c b/drivers/ide/tx4939ide.c index 5a614d1c94f1..0040a9a3e26e 100644 --- a/drivers/ide/tx4939ide.c +++ b/drivers/ide/tx4939ide.c @@ -429,7 +429,7 @@ static void tx4939ide_tf_load_fixup(ide_drive_t *drive) * Fix ATA100 CORE System Control Register. (The write to the * Device/Head register may write wrong data to the System * Control Register) - * While Sys_Ctl is written here, selectproc is not needed. + * While Sys_Ctl is written here, dev_select() is not needed. */ tx4939ide_writew(sysctl, base, TX4939IDE_Sys_Ctl); } @@ -556,6 +556,7 @@ static const struct ide_tp_ops tx4939ide_tp_ops = { .read_altstatus = ide_read_altstatus, .write_devctl = ide_write_devctl, + .dev_select = ide_dev_select, .tf_load = tx4939ide_tf_load, .tf_read = tx4939ide_tf_read, @@ -579,6 +580,7 @@ static const struct ide_tp_ops tx4939ide_tp_ops = { .read_altstatus = ide_read_altstatus, .write_devctl = ide_write_devctl, + .dev_select = ide_dev_select, .tf_load = tx4939ide_tf_load, .tf_read = ide_tf_read, diff --git a/include/linux/ide.h b/include/linux/ide.h index e919c865f0c7..c69181c61fd8 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -603,7 +603,7 @@ struct ide_drive_s { unsigned int bios_cyl; /* BIOS/fdisk/LILO number of cyls */ unsigned int cyl; /* "real" number of cyls */ - unsigned int drive_data; /* used by set_pio_mode/selectproc */ + unsigned int drive_data; /* used by set_pio_mode/dev_select() */ unsigned int failures; /* current failure count */ unsigned int max_failures; /* maximum allowed failure count */ u64 probed_capacity;/* initial reported media capacity (ide-cd only currently) */ @@ -661,6 +661,7 @@ struct ide_tp_ops { u8 (*read_altstatus)(struct hwif_s *); void (*write_devctl)(struct hwif_s *, u8); + void (*dev_select)(ide_drive_t *); void (*tf_load)(ide_drive_t *, struct ide_cmd *); void (*tf_read)(ide_drive_t *, struct ide_cmd *); @@ -678,7 +679,6 @@ extern const struct ide_tp_ops default_tp_ops; * @init_dev: host specific initialization of a device * @set_pio_mode: routine to program host for PIO mode * @set_dma_mode: routine to program host for DMA mode - * @selectproc: tweaks hardware to select drive * @reset_poll: chipset polling based on hba specifics * @pre_reset: chipset specific changes to default for device-hba resets * @resetproc: routine to reset controller after a disk reset @@ -695,7 +695,6 @@ struct ide_port_ops { void (*init_dev)(ide_drive_t *); void (*set_pio_mode)(ide_drive_t *, const u8); void (*set_dma_mode)(ide_drive_t *, const u8); - void 
(*selectproc)(ide_drive_t *); int (*reset_poll)(ide_drive_t *); void (*pre_reset)(ide_drive_t *); void (*resetproc)(ide_drive_t *); @@ -1170,6 +1169,7 @@ u8 ide_read_status(ide_hwif_t *); u8 ide_read_altstatus(ide_hwif_t *); void ide_write_devctl(ide_hwif_t *, u8); +void ide_dev_select(ide_drive_t *); void ide_tf_load(ide_drive_t *, struct ide_cmd *); void ide_tf_read(ide_drive_t *, struct ide_cmd *); -- cgit v1.2.3 From fdd88f0af616db59a6a36bdf0185181d2b779f53 Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Tue, 31 Mar 2009 20:15:33 +0200 Subject: ide: inline SELECT_DRIVE() Since SELECT_DRIVE() has boiled down to a mere dev_select() method call, it now makes sense to just inline it... Signed-off-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-eh.c | 7 ++++--- drivers/ide/ide-io.c | 2 +- drivers/ide/ide-iops.c | 7 +------ drivers/ide/ide-pm.c | 5 +++-- drivers/ide/ide-probe.c | 15 ++++++++------- drivers/ide/ns87415.c | 2 +- include/linux/ide.h | 1 - 7 files changed, 18 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-eh.c b/drivers/ide/ide-eh.c index de4b7f1c9c9f..5d5fb961b5ce 100644 --- a/drivers/ide/ide-eh.c +++ b/drivers/ide/ide-eh.c @@ -165,11 +165,12 @@ static ide_startstop_t do_reset1(ide_drive_t *, int); static ide_startstop_t atapi_reset_pollfunc(ide_drive_t *drive) { ide_hwif_t *hwif = drive->hwif; + const struct ide_tp_ops *tp_ops = hwif->tp_ops; u8 stat; - SELECT_DRIVE(drive); + tp_ops->dev_select(drive); udelay(10); - stat = hwif->tp_ops->read_status(hwif); + stat = tp_ops->read_status(hwif); if (OK_STAT(stat, 0, ATA_BUSY)) printk(KERN_INFO "%s: ATAPI reset complete\n", drive->name); @@ -348,7 +349,7 @@ static ide_startstop_t do_reset1(ide_drive_t *drive, int do_not_try_atapi) /* For an ATAPI device, first try an ATAPI SRST. */ if (drive->media != ide_disk && !do_not_try_atapi) { pre_reset(drive); - SELECT_DRIVE(drive); + tp_ops->dev_select(drive); udelay(20); tp_ops->exec_command(hwif, ATA_CMD_DEV_RESET); ndelay(400); diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 5589dce88674..1deb6d29b186 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -348,7 +348,7 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq) if (blk_pm_request(rq)) ide_check_pm_state(drive, rq); - SELECT_DRIVE(drive); + drive->hwif->tp_ops->dev_select(drive); if (ide_wait_stat(&startstop, drive, drive->ready_stat, ATA_BUSY | ATA_DRQ, WAIT_READY)) { printk(KERN_ERR "%s: drive not ready for command\n", drive->name); diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c index dfb0ec317fa3..27bb70ddd459 100644 --- a/drivers/ide/ide-iops.c +++ b/drivers/ide/ide-iops.c @@ -27,11 +27,6 @@ #include #include -void SELECT_DRIVE(ide_drive_t *drive) -{ - drive->hwif->tp_ops->dev_select(drive); -} - void SELECT_MASK(ide_drive_t *drive, int mask) { const struct ide_port_ops *port_ops = drive->hwif->port_ops; @@ -347,7 +342,7 @@ int ide_config_drive_speed(ide_drive_t *drive, u8 speed) disable_irq_nosync(hwif->irq); udelay(1); - SELECT_DRIVE(drive); + tp_ops->dev_select(drive); SELECT_MASK(drive, 1); udelay(1); tp_ops->write_devctl(hwif, ATA_NIEN | ATA_DEVCTL_OBS); diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c index 20553d4c42a2..bb7858ebb7d1 100644 --- a/drivers/ide/ide-pm.c +++ b/drivers/ide/ide-pm.c @@ -223,6 +223,7 @@ void ide_check_pm_state(ide_drive_t *drive, struct request *rq) * point. 
*/ ide_hwif_t *hwif = drive->hwif; + const struct ide_tp_ops *tp_ops = hwif->tp_ops; struct request_queue *q = drive->queue; unsigned long flags; int rc; @@ -232,8 +233,8 @@ void ide_check_pm_state(ide_drive_t *drive, struct request *rq) rc = ide_wait_not_busy(hwif, 35000); if (rc) printk(KERN_WARNING "%s: bus not ready on wakeup\n", drive->name); - SELECT_DRIVE(drive); - hwif->tp_ops->write_devctl(hwif, ATA_DEVCTL_OBS); + tp_ops->dev_select(drive); + tp_ops->write_devctl(hwif, ATA_DEVCTL_OBS); rc = ide_wait_not_busy(hwif, 100000); if (rc) printk(KERN_WARNING "%s: drive not ready on wakeup\n", drive->name); diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index d240f76b0da6..d8c1c3e735bb 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -390,13 +390,13 @@ static int do_probe (ide_drive_t *drive, u8 cmd) * (e.g. crw9624 as drive0 with disk as slave) */ msleep(50); - SELECT_DRIVE(drive); + tp_ops->dev_select(drive); msleep(50); if (ide_read_device(drive) != drive->select && present == 0) { if (drive->dn & 1) { /* exit with drive0 selected */ - SELECT_DRIVE(hwif->devices[0]); + tp_ops->dev_select(hwif->devices[0]); /* allow ATA_BUSY to assert & clear */ msleep(50); } @@ -422,7 +422,7 @@ static int do_probe (ide_drive_t *drive, u8 cmd) printk(KERN_ERR "%s: no response (status = 0x%02x), " "resetting drive\n", drive->name, stat); msleep(50); - SELECT_DRIVE(drive); + tp_ops->dev_select(drive); msleep(50); tp_ops->exec_command(hwif, ATA_CMD_DEV_RESET); (void)ide_busy_sleep(hwif, WAIT_WORSTCASE, 0); @@ -441,7 +441,7 @@ static int do_probe (ide_drive_t *drive, u8 cmd) } if (drive->dn & 1) { /* exit with drive0 selected */ - SELECT_DRIVE(hwif->devices[0]); + tp_ops->dev_select(hwif->devices[0]); msleep(50); /* ensure drive irq is clear */ (void)tp_ops->read_status(hwif); @@ -605,6 +605,7 @@ out: static int ide_port_wait_ready(ide_hwif_t *hwif) { + const struct ide_tp_ops *tp_ops = hwif->tp_ops; ide_drive_t *drive; int i, rc; @@ -627,8 +628,8 @@ static int ide_port_wait_ready(ide_hwif_t *hwif) /* Ignore disks that we will not probe for later. */ if ((drive->dev_flags & IDE_DFLAG_NOPROBE) == 0 || (drive->dev_flags & IDE_DFLAG_PRESENT)) { - SELECT_DRIVE(drive); - hwif->tp_ops->write_devctl(hwif, ATA_DEVCTL_OBS); + tp_ops->dev_select(drive); + tp_ops->write_devctl(hwif, ATA_DEVCTL_OBS); mdelay(2); rc = ide_wait_not_busy(hwif, 35000); if (rc) @@ -640,7 +641,7 @@ static int ide_port_wait_ready(ide_hwif_t *hwif) out: /* Exit function with master reselected (let's be sane) */ if (i) - SELECT_DRIVE(hwif->devices[0]); + tp_ops->dev_select(hwif->devices[0]); return rc; } diff --git a/drivers/ide/ns87415.c b/drivers/ide/ns87415.c index af1b421eb450..71a39fb3856f 100644 --- a/drivers/ide/ns87415.c +++ b/drivers/ide/ns87415.c @@ -262,7 +262,7 @@ static void __devinit init_hwif_ns87415 (ide_hwif_t *hwif) #ifdef __sparc_v9__ /* * XXX: Reset the device, if we don't it will not respond to - * SELECT_DRIVE() properly during first ide_probe_port(). + * dev_select() properly during first ide_probe_port(). 
*/ timeout = 10000; outb(12, hwif->io_ports.ctl_addr); diff --git a/include/linux/ide.h b/include/linux/ide.h index c69181c61fd8..a5d26f66ef78 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1176,7 +1176,6 @@ void ide_tf_read(ide_drive_t *, struct ide_cmd *); void ide_input_data(ide_drive_t *, struct ide_cmd *, void *, unsigned int); void ide_output_data(ide_drive_t *, struct ide_cmd *, void *, unsigned int); -extern void SELECT_DRIVE(ide_drive_t *); void SELECT_MASK(ide_drive_t *, int); u8 ide_read_error(ide_drive_t *); -- cgit v1.2.3 From 3e93cd671813e204c258f1e6c797959920cf7772 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 29 Mar 2009 19:00:13 -0400 Subject: Take fs_struct handling to new file (fs/fs_struct.c) Pure code move; two new helper functions for nfsd and daemonize (unshare_fs_struct() and daemonize_fs_struct() resp.; for now - the same code as used to be in callers). unshare_fs_struct() exported (for nfsd, as copy_fs_struct()/exit_fs() used to be), copy_fs_struct() and exit_fs() don't need exports anymore. Signed-off-by: Al Viro --- fs/Makefile | 2 +- fs/fs_struct.c | 141 ++++++++++++++++++++++++++++++++++++++++++++++ fs/internal.h | 6 ++ fs/namei.c | 7 --- fs/namespace.c | 68 ---------------------- fs/nfsd/nfssvc.c | 7 +-- include/linux/fs_struct.h | 2 + kernel/exit.c | 31 +--------- kernel/fork.c | 29 +--------- 9 files changed, 155 insertions(+), 138 deletions(-) create mode 100644 fs/fs_struct.c (limited to 'include/linux') diff --git a/fs/Makefile b/fs/Makefile index 6e82a307bcd4..b5cd8e18dd9f 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table.o super.o \ attr.o bad_inode.o file.o filesystems.o namespace.o \ seq_file.o xattr.o libfs.o fs-writeback.o \ pnode.o drop_caches.o splice.o sync.o utimes.o \ - stack.o + stack.o fs_struct.o ifeq ($(CONFIG_BLOCK),y) obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o diff --git a/fs/fs_struct.c b/fs/fs_struct.c new file mode 100644 index 000000000000..36e0a123bbf3 --- /dev/null +++ b/fs/fs_struct.c @@ -0,0 +1,141 @@ +#include +#include +#include +#include +#include + +/* + * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values. + * It can block. + */ +void set_fs_root(struct fs_struct *fs, struct path *path) +{ + struct path old_root; + + write_lock(&fs->lock); + old_root = fs->root; + fs->root = *path; + path_get(path); + write_unlock(&fs->lock); + if (old_root.dentry) + path_put(&old_root); +} + +/* + * Replace the fs->{pwdmnt,pwd} with {mnt,dentry}. Put the old values. + * It can block. 
+ */ +void set_fs_pwd(struct fs_struct *fs, struct path *path) +{ + struct path old_pwd; + + write_lock(&fs->lock); + old_pwd = fs->pwd; + fs->pwd = *path; + path_get(path); + write_unlock(&fs->lock); + + if (old_pwd.dentry) + path_put(&old_pwd); +} + +void chroot_fs_refs(struct path *old_root, struct path *new_root) +{ + struct task_struct *g, *p; + struct fs_struct *fs; + int count = 0; + + read_lock(&tasklist_lock); + do_each_thread(g, p) { + task_lock(p); + fs = p->fs; + if (fs) { + write_lock(&fs->lock); + if (fs->root.dentry == old_root->dentry + && fs->root.mnt == old_root->mnt) { + path_get(new_root); + fs->root = *new_root; + count++; + } + if (fs->pwd.dentry == old_root->dentry + && fs->pwd.mnt == old_root->mnt) { + path_get(new_root); + fs->pwd = *new_root; + count++; + } + write_unlock(&fs->lock); + } + task_unlock(p); + } while_each_thread(g, p); + read_unlock(&tasklist_lock); + while (count--) + path_put(old_root); +} + +void put_fs_struct(struct fs_struct *fs) +{ + /* No need to hold fs->lock if we are killing it */ + if (atomic_dec_and_test(&fs->count)) { + path_put(&fs->root); + path_put(&fs->pwd); + kmem_cache_free(fs_cachep, fs); + } +} + +void exit_fs(struct task_struct *tsk) +{ + struct fs_struct * fs = tsk->fs; + + if (fs) { + task_lock(tsk); + tsk->fs = NULL; + task_unlock(tsk); + put_fs_struct(fs); + } +} + +struct fs_struct *copy_fs_struct(struct fs_struct *old) +{ + struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL); + /* We don't need to lock fs - think why ;-) */ + if (fs) { + atomic_set(&fs->count, 1); + rwlock_init(&fs->lock); + fs->umask = old->umask; + read_lock(&old->lock); + fs->root = old->root; + path_get(&old->root); + fs->pwd = old->pwd; + path_get(&old->pwd); + read_unlock(&old->lock); + } + return fs; +} + +int unshare_fs_struct(void) +{ + struct fs_struct *fsp = copy_fs_struct(current->fs); + if (!fsp) + return -ENOMEM; + exit_fs(current); + current->fs = fsp; + return 0; +} +EXPORT_SYMBOL_GPL(unshare_fs_struct); + +/* to be mentioned only in INIT_TASK */ +struct fs_struct init_fs = { + .count = ATOMIC_INIT(1), + .lock = __RW_LOCK_UNLOCKED(init_fs.lock), + .umask = 0022, +}; + +void daemonize_fs_struct(void) +{ + struct fs_struct *fs; + + exit_fs(current); /* current->fs->count--; */ + fs = &init_fs; + current->fs = fs; + atomic_inc(&fs->count); +} diff --git a/fs/internal.h b/fs/internal.h index 53af885f1732..477a105f8df3 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -11,6 +11,7 @@ struct super_block; struct linux_binprm; +struct path; /* * block_dev.c @@ -60,3 +61,8 @@ extern void umount_tree(struct vfsmount *, int, struct list_head *); extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int); extern void __init mnt_init(void); + +/* + * fs_struct.c + */ +extern void chroot_fs_refs(struct path *, struct path *); diff --git a/fs/namei.c b/fs/namei.c index d040ce11785d..4c65a6460138 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2897,10 +2897,3 @@ EXPORT_SYMBOL(vfs_symlink); EXPORT_SYMBOL(vfs_unlink); EXPORT_SYMBOL(dentry_unhash); EXPORT_SYMBOL(generic_readlink); - -/* to be mentioned only in INIT_TASK */ -struct fs_struct init_fs = { - .count = ATOMIC_INIT(1), - .lock = __RW_LOCK_UNLOCKED(init_fs.lock), - .umask = 0022, -}; diff --git a/fs/namespace.c b/fs/namespace.c index f7ec283ccfbb..1e56303c718e 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2092,74 +2092,6 @@ out1: return retval; } -/* - * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values. - * It can block. Requires the big lock held. 
- */ -void set_fs_root(struct fs_struct *fs, struct path *path) -{ - struct path old_root; - - write_lock(&fs->lock); - old_root = fs->root; - fs->root = *path; - path_get(path); - write_unlock(&fs->lock); - if (old_root.dentry) - path_put(&old_root); -} - -/* - * Replace the fs->{pwdmnt,pwd} with {mnt,dentry}. Put the old values. - * It can block. Requires the big lock held. - */ -void set_fs_pwd(struct fs_struct *fs, struct path *path) -{ - struct path old_pwd; - - write_lock(&fs->lock); - old_pwd = fs->pwd; - fs->pwd = *path; - path_get(path); - write_unlock(&fs->lock); - - if (old_pwd.dentry) - path_put(&old_pwd); -} - -static void chroot_fs_refs(struct path *old_root, struct path *new_root) -{ - struct task_struct *g, *p; - struct fs_struct *fs; - int count = 0; - - read_lock(&tasklist_lock); - do_each_thread(g, p) { - task_lock(p); - fs = p->fs; - if (fs) { - write_lock(&fs->lock); - if (fs->root.dentry == old_root->dentry - && fs->root.mnt == old_root->mnt) { - path_get(new_root); - fs->root = *new_root; - count++; - } - if (fs->pwd.dentry == old_root->dentry - && fs->pwd.mnt == old_root->mnt) { - path_get(new_root); - fs->pwd = *new_root; - count++; - } - write_unlock(&fs->lock); - } - task_unlock(p); - } while_each_thread(g, p); - read_unlock(&tasklist_lock); - while (count--) - path_put(old_root); -} - /* * pivot_root Semantics: * Moves the root file system of the current process to the directory put_old, diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 07e4f5d7baa8..144d69918614 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -404,7 +404,6 @@ static int nfsd(void *vrqstp) { struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp; - struct fs_struct *fsp; int err, preverr = 0; /* Lock module and set up kernel thread */ @@ -413,13 +412,11 @@ nfsd(void *vrqstp) /* At this point, the thread shares current->fs * with the init process. We need to create files with a * umask of 0 instead of init's umask. */ - fsp = copy_fs_struct(current->fs); - if (!fsp) { + if (unshare_fs_struct() < 0) { printk("Unable to start nfsd thread: out of memory\n"); goto out; } - exit_fs(current); - current->fs = fsp; + current->fs->umask = 0; /* diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h index 18b467dbe278..298cef1c0793 100644 --- a/include/linux/fs_struct.h +++ b/include/linux/fs_struct.h @@ -20,5 +20,7 @@ extern void set_fs_root(struct fs_struct *, struct path *); extern void set_fs_pwd(struct fs_struct *, struct path *); extern struct fs_struct *copy_fs_struct(struct fs_struct *); extern void put_fs_struct(struct fs_struct *); +extern void daemonize_fs_struct(void); +extern int unshare_fs_struct(void); #endif /* _LINUX_FS_STRUCT_H */ diff --git a/kernel/exit.c b/kernel/exit.c index 167e1e3ad7c6..ad8375758a79 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -429,7 +429,6 @@ EXPORT_SYMBOL(disallow_signal); void daemonize(const char *name, ...) { va_list args; - struct fs_struct *fs; sigset_t blocked; va_start(args, name); @@ -462,11 +461,7 @@ void daemonize(const char *name, ...) 
/* Become as one with the init task */ - exit_fs(current); /* current->fs->count--; */ - fs = init_task.fs; - current->fs = fs; - atomic_inc(&fs->count); - + daemonize_fs_struct(); exit_files(current); current->files = init_task.files; atomic_inc(¤t->files->count); @@ -565,30 +560,6 @@ void exit_files(struct task_struct *tsk) } } -void put_fs_struct(struct fs_struct *fs) -{ - /* No need to hold fs->lock if we are killing it */ - if (atomic_dec_and_test(&fs->count)) { - path_put(&fs->root); - path_put(&fs->pwd); - kmem_cache_free(fs_cachep, fs); - } -} - -void exit_fs(struct task_struct *tsk) -{ - struct fs_struct * fs = tsk->fs; - - if (fs) { - task_lock(tsk); - tsk->fs = NULL; - task_unlock(tsk); - put_fs_struct(fs); - } -} - -EXPORT_SYMBOL_GPL(exit_fs); - #ifdef CONFIG_MM_OWNER /* * Task p is exiting and it owned mm, lets find a new owner for it diff --git a/kernel/fork.c b/kernel/fork.c index 47c15840a381..05c02dc586b1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -681,38 +681,13 @@ fail_nomem: return retval; } -static struct fs_struct *__copy_fs_struct(struct fs_struct *old) -{ - struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL); - /* We don't need to lock fs - think why ;-) */ - if (fs) { - atomic_set(&fs->count, 1); - rwlock_init(&fs->lock); - fs->umask = old->umask; - read_lock(&old->lock); - fs->root = old->root; - path_get(&old->root); - fs->pwd = old->pwd; - path_get(&old->pwd); - read_unlock(&old->lock); - } - return fs; -} - -struct fs_struct *copy_fs_struct(struct fs_struct *old) -{ - return __copy_fs_struct(old); -} - -EXPORT_SYMBOL_GPL(copy_fs_struct); - static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) { if (clone_flags & CLONE_FS) { atomic_inc(¤t->fs->count); return 0; } - tsk->fs = __copy_fs_struct(current->fs); + tsk->fs = copy_fs_struct(current->fs); if (!tsk->fs) return -ENOMEM; return 0; @@ -1545,7 +1520,7 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) if ((unshare_flags & CLONE_FS) && (fs && atomic_read(&fs->count) > 1)) { - *new_fsp = __copy_fs_struct(current->fs); + *new_fsp = copy_fs_struct(current->fs); if (!*new_fsp) return -ENOMEM; } -- cgit v1.2.3 From 498052bba55ecaff58db6a1436b0e25bfd75a7ff Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 30 Mar 2009 07:20:30 -0400 Subject: New locking/refcounting for fs_struct * all changes of current->fs are done under task_lock and write_lock of old fs->lock * refcount is not atomic anymore (same protection) * its decrements are done when removing reference from current; at the same time we decide whether to free it. * put_fs_struct() is gone * new field - ->in_exec. Set by check_unsafe_exec() if we are trying to do execve() and only subthreads share fs_struct. Cleared when finishing exec (success and failure alike). Makes CLONE_FS fail with -EAGAIN if set. * check_unsafe_exec() may fail with -EAGAIN if another execve() from subthread is in progress. 
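For illustration only, the locking protocol listed above can be sketched as follows; this is not code from the patch, but it assumes the fields the patch introduces (the plain-int ->users count and the rwlock ->lock) and mirrors the shape of the exit_fs()/unshare_fs_struct() hunks further below:

static void drop_fs_reference(struct task_struct *tsk)
{
	struct fs_struct *fs = tsk->fs;
	int kill;

	task_lock(tsk);			/* changes of tsk->fs happen under task_lock */
	write_lock(&fs->lock);		/* ... and under the old fs_struct's lock */
	tsk->fs = NULL;			/* or point it at a replacement fs_struct */
	kill = !--fs->users;		/* non-atomic count, protected by fs->lock */
	write_unlock(&fs->lock);
	task_unlock(tsk);

	if (kill)			/* we dropped the last reference */
		free_fs_struct(fs);
}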
Signed-off-by: Al Viro --- fs/compat.c | 16 +++++++++-- fs/exec.c | 31 +++++++++++++++++---- fs/fs_struct.c | 69 +++++++++++++++++++++++++++++++++-------------- fs/internal.h | 2 +- fs/proc/task_nommu.c | 2 +- include/linux/fs_struct.h | 8 +++--- kernel/fork.c | 37 ++++++++++++++++++------- 7 files changed, 121 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/fs/compat.c b/fs/compat.c index 55efdfebdf5a..baabf203b847 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include @@ -1441,12 +1442,15 @@ int compat_do_execve(char * filename, bprm->cred = prepare_exec_creds(); if (!bprm->cred) goto out_unlock; - check_unsafe_exec(bprm); + + retval = check_unsafe_exec(bprm); + if (retval) + goto out_unlock; file = open_exec(filename); retval = PTR_ERR(file); if (IS_ERR(file)) - goto out_unlock; + goto out_unmark; sched_exec(); @@ -1488,6 +1492,9 @@ int compat_do_execve(char * filename, goto out; /* execve succeeded */ + write_lock(¤t->fs->lock); + current->fs->in_exec = 0; + write_unlock(¤t->fs->lock); current->in_execve = 0; mutex_unlock(¤t->cred_exec_mutex); acct_update_integrals(current); @@ -1506,6 +1513,11 @@ out_file: fput(bprm->file); } +out_unmark: + write_lock(¤t->fs->lock); + current->fs->in_exec = 0; + write_unlock(¤t->fs->lock); + out_unlock: current->in_execve = 0; mutex_unlock(¤t->cred_exec_mutex); diff --git a/fs/exec.c b/fs/exec.c index c5128fbc9165..07a059664b73 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1056,16 +1056,18 @@ EXPORT_SYMBOL(install_exec_creds); * - the caller must hold current->cred_exec_mutex to protect against * PTRACE_ATTACH */ -void check_unsafe_exec(struct linux_binprm *bprm) +int check_unsafe_exec(struct linux_binprm *bprm) { struct task_struct *p = current, *t; unsigned long flags; unsigned n_fs, n_sighand; + int res = 0; bprm->unsafe = tracehook_unsafe_exec(p); n_fs = 1; n_sighand = 1; + write_lock(&p->fs->lock); lock_task_sighand(p, &flags); for (t = next_thread(p); t != p; t = next_thread(t)) { if (t->fs == p->fs) @@ -1073,11 +1075,19 @@ void check_unsafe_exec(struct linux_binprm *bprm) n_sighand++; } - if (atomic_read(&p->fs->count) > n_fs || - atomic_read(&p->sighand->count) > n_sighand) + if (p->fs->users > n_fs || + atomic_read(&p->sighand->count) > n_sighand) { bprm->unsafe |= LSM_UNSAFE_SHARE; + } else { + if (p->fs->in_exec) + res = -EAGAIN; + p->fs->in_exec = 1; + } unlock_task_sighand(p, &flags); + write_unlock(&p->fs->lock); + + return res; } /* @@ -1296,12 +1306,15 @@ int do_execve(char * filename, bprm->cred = prepare_exec_creds(); if (!bprm->cred) goto out_unlock; - check_unsafe_exec(bprm); + + retval = check_unsafe_exec(bprm); + if (retval) + goto out_unlock; file = open_exec(filename); retval = PTR_ERR(file); if (IS_ERR(file)) - goto out_unlock; + goto out_unmark; sched_exec(); @@ -1344,6 +1357,9 @@ int do_execve(char * filename, goto out; /* execve succeeded */ + write_lock(¤t->fs->lock); + current->fs->in_exec = 0; + write_unlock(¤t->fs->lock); current->in_execve = 0; mutex_unlock(¤t->cred_exec_mutex); acct_update_integrals(current); @@ -1362,6 +1378,11 @@ out_file: fput(bprm->file); } +out_unmark: + write_lock(¤t->fs->lock); + current->fs->in_exec = 0; + write_unlock(¤t->fs->lock); + out_unlock: current->in_execve = 0; mutex_unlock(¤t->cred_exec_mutex); diff --git a/fs/fs_struct.c b/fs/fs_struct.c index 36e0a123bbf3..41cff72b377b 100644 --- a/fs/fs_struct.c +++ b/fs/fs_struct.c @@ -72,25 +72,27 @@ void chroot_fs_refs(struct path *old_root, struct path 
*new_root) path_put(old_root); } -void put_fs_struct(struct fs_struct *fs) +void free_fs_struct(struct fs_struct *fs) { - /* No need to hold fs->lock if we are killing it */ - if (atomic_dec_and_test(&fs->count)) { - path_put(&fs->root); - path_put(&fs->pwd); - kmem_cache_free(fs_cachep, fs); - } + path_put(&fs->root); + path_put(&fs->pwd); + kmem_cache_free(fs_cachep, fs); } void exit_fs(struct task_struct *tsk) { - struct fs_struct * fs = tsk->fs; + struct fs_struct *fs = tsk->fs; if (fs) { + int kill; task_lock(tsk); + write_lock(&fs->lock); tsk->fs = NULL; + kill = !--fs->users; + write_unlock(&fs->lock); task_unlock(tsk); - put_fs_struct(fs); + if (kill) + free_fs_struct(fs); } } @@ -99,7 +101,8 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old) struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL); /* We don't need to lock fs - think why ;-) */ if (fs) { - atomic_set(&fs->count, 1); + fs->users = 1; + fs->in_exec = 0; rwlock_init(&fs->lock); fs->umask = old->umask; read_lock(&old->lock); @@ -114,28 +117,54 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old) int unshare_fs_struct(void) { - struct fs_struct *fsp = copy_fs_struct(current->fs); - if (!fsp) + struct fs_struct *fs = current->fs; + struct fs_struct *new_fs = copy_fs_struct(fs); + int kill; + + if (!new_fs) return -ENOMEM; - exit_fs(current); - current->fs = fsp; + + task_lock(current); + write_lock(&fs->lock); + kill = !--fs->users; + current->fs = new_fs; + write_unlock(&fs->lock); + task_unlock(current); + + if (kill) + free_fs_struct(fs); + return 0; } EXPORT_SYMBOL_GPL(unshare_fs_struct); /* to be mentioned only in INIT_TASK */ struct fs_struct init_fs = { - .count = ATOMIC_INIT(1), + .users = 1, .lock = __RW_LOCK_UNLOCKED(init_fs.lock), .umask = 0022, }; void daemonize_fs_struct(void) { - struct fs_struct *fs; + struct fs_struct *fs = current->fs; + + if (fs) { + int kill; + + task_lock(current); - exit_fs(current); /* current->fs->count--; */ - fs = &init_fs; - current->fs = fs; - atomic_inc(&fs->count); + write_lock(&init_fs.lock); + init_fs.users++; + write_unlock(&init_fs.lock); + + write_lock(&fs->lock); + current->fs = &init_fs; + kill = !--fs->users; + write_unlock(&fs->lock); + + task_unlock(current); + if (kill) + free_fs_struct(fs); + } } diff --git a/fs/internal.h b/fs/internal.h index 477a105f8df3..b4dac4fb6b61 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -44,7 +44,7 @@ extern void __init chrdev_init(void); /* * exec.c */ -extern void check_unsafe_exec(struct linux_binprm *); +extern int check_unsafe_exec(struct linux_binprm *); /* * namespace.c diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 343ea1216bc8..6ca01052c5bc 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -49,7 +49,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) else bytes += kobjsize(mm); - if (current->fs && atomic_read(¤t->fs->count) > 1) + if (current->fs && current->fs->users > 1) sbytes += kobjsize(current->fs); else bytes += kobjsize(current->fs); diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h index 298cef1c0793..78a05bfcd8eb 100644 --- a/include/linux/fs_struct.h +++ b/include/linux/fs_struct.h @@ -4,12 +4,10 @@ #include struct fs_struct { - atomic_t count; /* This usage count is used by check_unsafe_exec() for - * security checking purposes - therefore it may not be - * incremented, except by clone(CLONE_FS). 
- */ + int users; rwlock_t lock; int umask; + int in_exec; struct path root, pwd; }; @@ -19,7 +17,7 @@ extern void exit_fs(struct task_struct *); extern void set_fs_root(struct fs_struct *, struct path *); extern void set_fs_pwd(struct fs_struct *, struct path *); extern struct fs_struct *copy_fs_struct(struct fs_struct *); -extern void put_fs_struct(struct fs_struct *); +extern void free_fs_struct(struct fs_struct *); extern void daemonize_fs_struct(void); extern int unshare_fs_struct(void); diff --git a/kernel/fork.c b/kernel/fork.c index 05c02dc586b1..51f138a131de 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -683,11 +683,19 @@ fail_nomem: static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) { + struct fs_struct *fs = current->fs; if (clone_flags & CLONE_FS) { - atomic_inc(¤t->fs->count); + /* tsk->fs is already what we want */ + write_lock(&fs->lock); + if (fs->in_exec) { + write_unlock(&fs->lock); + return -EAGAIN; + } + fs->users++; + write_unlock(&fs->lock); return 0; } - tsk->fs = copy_fs_struct(current->fs); + tsk->fs = copy_fs_struct(fs); if (!tsk->fs) return -ENOMEM; return 0; @@ -1518,12 +1526,16 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) { struct fs_struct *fs = current->fs; - if ((unshare_flags & CLONE_FS) && - (fs && atomic_read(&fs->count) > 1)) { - *new_fsp = copy_fs_struct(current->fs); - if (!*new_fsp) - return -ENOMEM; - } + if (!(unshare_flags & CLONE_FS) || !fs) + return 0; + + /* don't need lock here; in the worst case we'll do useless copy */ + if (fs->users == 1) + return 0; + + *new_fsp = copy_fs_struct(fs); + if (!*new_fsp) + return -ENOMEM; return 0; } @@ -1639,8 +1651,13 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) if (new_fs) { fs = current->fs; + write_lock(&fs->lock); current->fs = new_fs; - new_fs = fs; + if (--fs->users) + new_fs = NULL; + else + new_fs = fs; + write_unlock(&fs->lock); } if (new_mm) { @@ -1679,7 +1696,7 @@ bad_unshare_cleanup_sigh: bad_unshare_cleanup_fs: if (new_fs) - put_fs_struct(new_fs); + free_fs_struct(new_fs); bad_unshare_cleanup_thread: bad_unshare_out: -- cgit v1.2.3 From ce3b0f8d5c2203301fc87f3aaaed73e5819e2a48 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 29 Mar 2009 19:08:22 -0400 Subject: New helper - current_umask() current->fs->umask is what most of fs_struct users are doing. Put that into a helper function. 
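To make the conversion concrete, here is a hypothetical call site (the function name is illustrative and does not come from the diff below); the patch performs this one-for-one replacement across the tree:

static inline mode_t sanitize_create_mode(mode_t mode)
{
	/* before this patch: mode &= ~current->fs->umask; */
	return mode & ~current_umask();		/* after: go through the helper */
}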
Signed-off-by: Al Viro --- arch/powerpc/platforms/cell/spufs/inode.c | 2 +- fs/btrfs/acl.c | 2 +- fs/btrfs/ioctl.c | 2 +- fs/cifs/dir.c | 4 ++-- fs/cifs/inode.c | 4 ++-- fs/ext2/acl.c | 2 +- fs/ext3/acl.c | 2 +- fs/ext4/acl.c | 2 +- fs/fat/inode.c | 2 +- fs/fs_struct.c | 6 ++++++ fs/generic_acl.c | 2 +- fs/gfs2/acl.c | 2 +- fs/hfsplus/options.c | 2 +- fs/hpfs/super.c | 2 +- fs/jffs2/acl.c | 2 +- fs/jfs/acl.c | 2 +- fs/namei.c | 6 +++--- fs/nfs/nfs3proc.c | 6 +++--- fs/nfs/nfs4proc.c | 2 +- fs/ocfs2/acl.c | 2 +- fs/omfs/inode.c | 2 +- fs/reiserfs/xattr_acl.c | 2 +- fs/xfs/linux-2.6/xfs_iops.c | 4 ++-- include/linux/fs.h | 2 ++ ipc/mqueue.c | 2 +- net/unix/af_unix.c | 2 +- 26 files changed, 39 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c index 64f068540d0d..706eb5c7e2ee 100644 --- a/arch/powerpc/platforms/cell/spufs/inode.c +++ b/arch/powerpc/platforms/cell/spufs/inode.c @@ -635,7 +635,7 @@ long spufs_create(struct nameidata *nd, unsigned int flags, mode_t mode, if (dentry->d_inode) goto out_dput; - mode &= ~current->fs->umask; + mode &= ~current_umask(); if (flags & SPU_CREATE_GANG) ret = spufs_create_gang(nd->path.dentry->d_inode, diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 1d53b62dbba5..7fdd184a528d 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -256,7 +256,7 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir) } if (!acl) - inode->i_mode &= ~current->fs->umask; + inode->i_mode &= ~current_umask(); } if (IS_POSIXACL(dir) && acl) { diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index bca729fc80c8..7594bec1be10 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -267,7 +267,7 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name, goto out_dput; if (!IS_POSIXACL(parent->dentry->d_inode)) - mode &= ~current->fs->umask; + mode &= ~current_umask(); error = mnt_want_write(parent->mnt); if (error) diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 2f35cccfcd8d..54dce78fbb73 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -254,7 +254,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, return -ENOMEM; } - mode &= ~current->fs->umask; + mode &= ~current_umask(); if (oplockEnabled) oplock = REQ_OPLOCK; @@ -479,7 +479,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode, rc = -ENOMEM; else if (pTcon->unix_ext) { struct cifs_unix_set_info_args args = { - .mode = mode & ~current->fs->umask, + .mode = mode & ~current_umask(), .ctime = NO_CHANGE_64, .atime = NO_CHANGE_64, .mtime = NO_CHANGE_64, diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index a8797cc60805..f121a80fdd6f 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1125,7 +1125,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode) goto mkdir_out; } - mode &= ~current->fs->umask; + mode &= ~current_umask(); rc = CIFSPOSIXCreate(xid, pTcon, SMB_O_DIRECTORY | SMB_O_CREAT, mode, NULL /* netfid */, pInfo, &oplock, full_path, cifs_sb->local_nls, @@ -1204,7 +1204,7 @@ mkdir_get_info: if ((direntry->d_inode) && (direntry->d_inode->i_nlink < 2)) direntry->d_inode->i_nlink = 2; - mode &= ~current->fs->umask; + mode &= ~current_umask(); /* must turn on setgid bit if parent dir has it */ if (inode->i_mode & S_ISGID) mode |= S_ISGID; diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index ae8c4f850b27..d46e38cb85c5 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -318,7 +318,7 @@ ext2_init_acl(struct inode *inode, struct inode *dir) return 
PTR_ERR(acl); } if (!acl) - inode->i_mode &= ~current->fs->umask; + inode->i_mode &= ~current_umask(); } if (test_opt(inode->i_sb, POSIX_ACL) && acl) { struct posix_acl *clone; diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index b60bb241880c..d81ef2fdb08e 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -323,7 +323,7 @@ ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) return PTR_ERR(acl); } if (!acl) - inode->i_mode &= ~current->fs->umask; + inode->i_mode &= ~current_umask(); } if (test_opt(inode->i_sb, POSIX_ACL) && acl) { struct posix_acl *clone; diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 694ed6fadcc8..647e0d65a284 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -323,7 +323,7 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) return PTR_ERR(acl); } if (!acl) - inode->i_mode &= ~current->fs->umask; + inode->i_mode &= ~current_umask(); } if (test_opt(inode->i_sb, POSIX_ACL) && acl) { struct posix_acl *clone; diff --git a/fs/fat/inode.c b/fs/fat/inode.c index de0004fe6e00..ab657db4c94e 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -930,7 +930,7 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug, opts->fs_uid = current_uid(); opts->fs_gid = current_gid(); - opts->fs_fmask = opts->fs_dmask = current->fs->umask; + opts->fs_fmask = current_umask(); opts->allow_utime = -1; opts->codepage = fat_default_codepage; opts->iocharset = fat_default_iocharset; diff --git a/fs/fs_struct.c b/fs/fs_struct.c index 41cff72b377b..6ac219338670 100644 --- a/fs/fs_struct.c +++ b/fs/fs_struct.c @@ -138,6 +138,12 @@ int unshare_fs_struct(void) } EXPORT_SYMBOL_GPL(unshare_fs_struct); +int current_umask(void) +{ + return current->fs->umask; +} +EXPORT_SYMBOL(current_umask); + /* to be mentioned only in INIT_TASK */ struct fs_struct init_fs = { .users = 1, diff --git a/fs/generic_acl.c b/fs/generic_acl.c index 995d63b2e747..e0b53aa7bbec 100644 --- a/fs/generic_acl.c +++ b/fs/generic_acl.c @@ -134,7 +134,7 @@ generic_acl_init(struct inode *inode, struct inode *dir, mode_t mode = inode->i_mode; int error; - inode->i_mode = mode & ~current->fs->umask; + inode->i_mode = mode & ~current_umask(); if (!S_ISLNK(inode->i_mode)) acl = ops->getacl(dir, ACL_TYPE_DEFAULT); if (acl) { diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 43764f4fa763..fa881bdc3d85 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -215,7 +215,7 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip) if (error) return error; if (!acl) { - mode &= ~current->fs->umask; + mode &= ~current_umask(); if (mode != ip->i_inode.i_mode) error = munge_mode(ip, mode); return error; diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c index bab7f8d1bdfa..3fcbb0e1f6fc 100644 --- a/fs/hfsplus/options.c +++ b/fs/hfsplus/options.c @@ -48,7 +48,7 @@ void hfsplus_fill_defaults(struct hfsplus_sb_info *opts) opts->creator = HFSPLUS_DEF_CR_TYPE; opts->type = HFSPLUS_DEF_CR_TYPE; - opts->umask = current->fs->umask; + opts->umask = current_umask(); opts->uid = current_uid(); opts->gid = current_gid(); opts->part = -1; diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index 0d049b8919c4..c696d01bc8f7 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -477,7 +477,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) uid = current_uid(); gid = current_gid(); - umask = current->fs->umask; + umask = current_umask(); lowercase = 0; conv = CONV_BINARY; eas = 2; diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index d98713777a1b..77ccf8cb0823 100644 --- 
a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -336,7 +336,7 @@ int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode) return PTR_ERR(acl); if (!acl) { - *i_mode &= ~current->fs->umask; + *i_mode &= ~current_umask(); } else { if (S_ISDIR(*i_mode)) jffs2_iset_acl(inode, &f->i_acl_default, acl); diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index a166c1669e82..06ca1b8d2054 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -182,7 +182,7 @@ int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir) cleanup: posix_acl_release(acl); } else - inode->i_mode &= ~current->fs->umask; + inode->i_mode &= ~current_umask(); JFS_IP(inode)->mode2 = (JFS_IP(inode)->mode2 & 0xffff0000) | inode->i_mode; diff --git a/fs/namei.c b/fs/namei.c index 4c65a6460138..964c0249444b 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1578,7 +1578,7 @@ static int __open_namei_create(struct nameidata *nd, struct path *path, struct dentry *dir = nd->path.dentry; if (!IS_POSIXACL(dir->d_inode)) - mode &= ~current->fs->umask; + mode &= ~current_umask(); error = security_path_mknod(&nd->path, path->dentry, mode, 0); if (error) goto out_unlock; @@ -1989,7 +1989,7 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, int, mode, goto out_unlock; } if (!IS_POSIXACL(nd.path.dentry->d_inode)) - mode &= ~current->fs->umask; + mode &= ~current_umask(); error = may_mknod(mode); if (error) goto out_dput; @@ -2067,7 +2067,7 @@ SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, int, mode) goto out_unlock; if (!IS_POSIXACL(nd.path.dentry->d_inode)) - mode &= ~current->fs->umask; + mode &= ~current_umask(); error = mnt_want_write(nd.path.mnt); if (error) goto out_dput; diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index c55be7a7679e..e47d4400fb87 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -328,7 +328,7 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, data->arg.create.verifier[1] = current->pid; } - sattr->ia_mode &= ~current->fs->umask; + sattr->ia_mode &= ~current_umask(); for (;;) { status = nfs3_do_create(dir, dentry, data); @@ -528,7 +528,7 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) dprintk("NFS call mkdir %s\n", dentry->d_name.name); - sattr->ia_mode &= ~current->fs->umask; + sattr->ia_mode &= ~current_umask(); data = nfs3_alloc_createdata(); if (data == NULL) @@ -639,7 +639,7 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, dprintk("NFS call mknod %s %u:%u\n", dentry->d_name.name, MAJOR(rdev), MINOR(rdev)); - sattr->ia_mode &= ~current->fs->umask; + sattr->ia_mode &= ~current_umask(); data = nfs3_alloc_createdata(); if (data == NULL) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 8dde84b988d9..bbee587dd597 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1509,7 +1509,7 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd) attr.ia_mode = nd->intent.open.create_mode; attr.ia_valid = ATTR_MODE; if (!IS_POSIXACL(dir)) - attr.ia_mode &= ~current->fs->umask; + attr.ia_mode &= ~current_umask(); } else { attr.ia_valid = 0; BUG_ON(nd->intent.open.flags & O_CREAT); diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 12dfb44c22e5..fbeaec762103 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -296,7 +296,7 @@ int ocfs2_init_acl(handle_t *handle, return PTR_ERR(acl); } if (!acl) - inode->i_mode &= ~current->fs->umask; + inode->i_mode &= ~current_umask(); } if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) { struct posix_acl 
*clone; diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index 633e9dc972bb..aa6fc30772af 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -421,7 +421,7 @@ static int omfs_fill_super(struct super_block *sb, void *data, int silent) sbi->s_uid = current_uid(); sbi->s_gid = current_gid(); - sbi->s_dmask = sbi->s_fmask = current->fs->umask; + sbi->s_dmask = sbi->s_fmask = current_umask(); if (!parse_options((char *) data, sbi)) goto end; diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index d423416d93d1..c303c426fe2b 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -428,7 +428,7 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, } else { apply_umask: /* no ACL, apply umask */ - inode->i_mode &= ~current->fs->umask; + inode->i_mode &= ~current_umask(); } return err; diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 7aa53fefc67f..2940612e3aeb 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -227,7 +227,7 @@ xfs_vn_mknod( xfs_dentry_to_name(&name, dentry); if (IS_POSIXACL(dir) && !default_acl) - mode &= ~current->fs->umask; + mode &= ~current_umask(); switch (mode & S_IFMT) { case S_IFCHR: @@ -416,7 +416,7 @@ xfs_vn_symlink( mode_t mode; mode = S_IFLNK | - (irix_symlink_mode ? 0777 & ~current->fs->umask : S_IRWXUGO); + (irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO); xfs_dentry_to_name(&name, dentry); error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip, NULL); diff --git a/include/linux/fs.h b/include/linux/fs.h index 87e7bfc5ebd7..3d7bd5447ca3 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1741,6 +1741,8 @@ extern void drop_collected_mounts(struct vfsmount *); extern int vfs_statfs(struct dentry *, struct kstatfs *); +extern int current_umask(void); + /* /sys/fs */ extern struct kobject *fs_kobj; diff --git a/ipc/mqueue.c b/ipc/mqueue.c index a8ddadbc7459..916785363f0f 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -602,7 +602,7 @@ static struct file *do_create(struct dentry *dir, struct dentry *dentry, dentry->d_fsdata = attr; } - mode &= ~current->fs->umask; + mode &= ~current_umask(); ret = mnt_want_write(mqueue_mnt); if (ret) goto out; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index baac91049b0e..9dcc6e7f96ec 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -832,7 +832,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) * All right, let's create it. */ mode = S_IFSOCK | - (SOCK_INODE(sock)->i_mode & ~current->fs->umask); + (SOCK_INODE(sock)->i_mode & ~current_umask()); err = mnt_want_write(nd.path.mnt); if (err) goto out_mknod_dput; -- cgit v1.2.3 From 5ad4e53bd5406ee214ddc5a41f03f779b8b2d526 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 29 Mar 2009 19:50:06 -0400 Subject: Get rid of indirect include of fs_struct.h Don't pull it in sched.h; very few files actually need it and those can include directly. 
sched.h itself only needs forward declaration of struct fs_struct; Signed-off-by: Al Viro --- arch/cris/kernel/process.c | 1 - fs/dcache.c | 1 + fs/exec.c | 1 + fs/fs_struct.c | 1 + fs/namei.c | 1 + fs/namespace.c | 1 + fs/open.c | 1 + fs/proc/base.c | 1 + fs/proc/task_nommu.c | 1 + include/linux/mnt_namespace.h | 2 ++ include/linux/nsproxy.h | 1 + include/linux/sched.h | 3 ++- init/do_mounts.c | 1 + kernel/auditsc.c | 1 + kernel/exec_domain.c | 1 + kernel/exit.c | 1 + kernel/fork.c | 1 + kernel/sys.c | 1 + security/tomoyo/realpath.c | 1 + 19 files changed, 20 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/arch/cris/kernel/process.c b/arch/cris/kernel/process.c index 60816e876455..4df0b320d524 100644 --- a/arch/cris/kernel/process.c +++ b/arch/cris/kernel/process.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/dcache.c b/fs/dcache.c index 90bbd7e1b116..0dc4de21f088 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -32,6 +32,7 @@ #include #include #include +#include #include "internal.h" int sysctl_vfs_cache_pressure __read_mostly = 100; diff --git a/fs/exec.c b/fs/exec.c index 614991bf0c87..052a961e41aa 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -53,6 +53,7 @@ #include #include #include +#include #include #include diff --git a/fs/fs_struct.c b/fs/fs_struct.c index 6ac219338670..eee059052db5 100644 --- a/fs/fs_struct.c +++ b/fs/fs_struct.c @@ -3,6 +3,7 @@ #include #include #include +#include /* * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values. diff --git a/fs/namei.c b/fs/namei.c index 964c0249444b..b8433ebfae05 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE]) diff --git a/fs/namespace.c b/fs/namespace.c index 1e56303c718e..c6f54e4c4290 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include "pnode.h" diff --git a/fs/open.c b/fs/open.c index 75b61677daaf..377eb25b6abf 100644 --- a/fs/open.c +++ b/fs/open.c @@ -29,6 +29,7 @@ #include #include #include +#include int vfs_statfs(struct dentry *dentry, struct kstatfs *buf) { diff --git a/fs/proc/base.c b/fs/proc/base.c index e0afd326b688..f71559784bfb 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -80,6 +80,7 @@ #include #include #include +#include #include "internal.h" /* NOTE: diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 6ca01052c5bc..253afc04484c 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h index 830bbcd449d6..3a059298cc19 100644 --- a/include/linux/mnt_namespace.h +++ b/include/linux/mnt_namespace.h @@ -22,6 +22,8 @@ struct proc_mounts { int event; }; +struct fs_struct; + extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *, struct fs_struct *); extern void __put_mnt_ns(struct mnt_namespace *ns); diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index afad7dec1b36..7b370c7cfeff 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -8,6 +8,7 @@ struct mnt_namespace; struct uts_namespace; struct ipc_namespace; struct pid_namespace; +struct fs_struct; /* * A structure to contain pointers to all per-process diff --git a/include/linux/sched.h b/include/linux/sched.h index 29df6374d2de..b4e065ea0de1 100644 --- 
a/include/linux/sched.h +++ b/include/linux/sched.h @@ -68,7 +68,7 @@ struct sched_param { #include #include #include -#include +#include #include #include #include @@ -97,6 +97,7 @@ struct futex_pi_state; struct robust_list_head; struct bio; struct bts_tracer; +struct fs_struct; /* * List of flags we want to share for kernel threads, diff --git a/init/do_mounts.c b/init/do_mounts.c index 8d4ff5afc1d8..dd7ee5f203f3 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 8cbddff6c283..2bfc64786765 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -66,6 +66,7 @@ #include #include #include +#include #include "audit.h" diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index cb8e9626c215..c35452cadded 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c @@ -18,6 +18,7 @@ #include #include #include +#include static void default_handler(int, struct pt_regs *); diff --git a/kernel/exit.c b/kernel/exit.c index ad8375758a79..b5d656845c90 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include diff --git a/kernel/fork.c b/kernel/fork.c index 51f138a131de..e82a14577a98 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -60,6 +60,7 @@ #include #include #include +#include #include #include diff --git a/kernel/sys.c b/kernel/sys.c index 37f458e6882a..ce182aaed204 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include diff --git a/security/tomoyo/realpath.c b/security/tomoyo/realpath.c index d47f16b844b2..3bbe01a7a4b5 100644 --- a/security/tomoyo/realpath.c +++ b/security/tomoyo/realpath.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "common.h" #include "realpath.h" -- cgit v1.2.3 From 47e4491b40df73c3b117e3d80b31b5b512a4b19f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 1 Apr 2009 07:07:16 -0400 Subject: Cleanup after commit 585d3bc06f4ca57f975a5a1f698f65a45ea66225 fsync_bdev() export and a bunch of stubs for !CONFIG_BLOCK case had been left behind Signed-off-by: Al Viro --- fs/block_dev.c | 1 + fs/buffer.c | 1 - include/linux/buffer_head.h | 12 ------------ include/linux/fs.h | 12 ++++++++++++ 4 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/fs/block_dev.c b/fs/block_dev.c index 8c3c6899ccf3..f45dbc18dd17 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -204,6 +204,7 @@ int fsync_bdev(struct block_device *bdev) } return sync_blockdev(bdev); } +EXPORT_SYMBOL(fsync_bdev); /** * freeze_bdev -- lock a filesystem and force it into a consistent state diff --git a/fs/buffer.c b/fs/buffer.c index a2fd743d97cb..b71e52925c83 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3281,7 +3281,6 @@ EXPORT_SYMBOL(cont_write_begin); EXPORT_SYMBOL(end_buffer_read_sync); EXPORT_SYMBOL(end_buffer_write_sync); EXPORT_SYMBOL(file_fsync); -EXPORT_SYMBOL(fsync_bdev); EXPORT_SYMBOL(generic_block_bmap); EXPORT_SYMBOL(generic_cont_expand_simple); EXPORT_SYMBOL(init_buffer); diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index f19fd9045ea0..fc91665d39d0 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -332,22 +332,10 @@ extern int __set_page_dirty_buffers(struct page *page); static inline void buffer_init(void) {} static inline int try_to_free_buffers(struct page *page) { return 1; } -static inline int sync_blockdev(struct block_device *bdev) { 
return 0; } static inline int inode_has_buffers(struct inode *inode) { return 0; } static inline void invalidate_inode_buffers(struct inode *inode) {} static inline int remove_inode_buffers(struct inode *inode) { return 1; } static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; } -static inline void invalidate_bdev(struct block_device *bdev) {} - -static inline struct super_block *freeze_bdev(struct block_device *sb) -{ - return NULL; -} - -static inline int thaw_bdev(struct block_device *bdev, struct super_block *sb) -{ - return 0; -} #endif /* CONFIG_BLOCK */ #endif /* _LINUX_BUFFER_HEAD_H */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 3d7bd5447ca3..674134725597 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1886,6 +1886,18 @@ extern int fsync_super(struct super_block *); extern int fsync_no_super(struct block_device *); #else static inline void bd_forget(struct inode *inode) {} +static inline int sync_blockdev(struct block_device *bdev) { return 0; } +static inline void invalidate_bdev(struct block_device *bdev) {} + +static inline struct super_block *freeze_bdev(struct block_device *sb) +{ + return NULL; +} + +static inline int thaw_bdev(struct block_device *bdev, struct super_block *sb) +{ + return 0; +} #endif extern const struct file_operations def_blk_fops; extern const struct file_operations def_chr_fops; -- cgit v1.2.3 From ced117c73edc917e96dea7cca98c91383f0792f7 Mon Sep 17 00:00:00 2001 From: Dmitri Vorobiev Date: Tue, 31 Mar 2009 00:41:20 +0300 Subject: Remove two unneeded exports and make two symbols static in fs/mpage.c Commit 29a814d2ee0e43c2980f33f91c1311ec06c0aa35 (vfs: add hooks for ext4's delayed allocation support) exported the following functions mpage_bio_submit() __mpage_writepage() for the benefit of ext4's delayed allocation support. Since commit a1d6cc563bfdf1bf2829d3e6ce4d8b774251796b (ext4: Rework the ext4_da_writepages() function), these functions are not used by the ext4 driver anymore. However, the now unnecessary exports still remain, and this patch removes those. Moreover, these two functions can become static again. The issue was spotted by namespacecheck. Signed-off-by: Dmitri Vorobiev Reviewed-by: Aneesh Kumar K.V Signed-off-by: Al Viro --- fs/mpage.c | 13 +++++++++---- include/linux/mpage.h | 10 ---------- 2 files changed, 9 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/fs/mpage.c b/fs/mpage.c index 16c3ef37eae3..680ba60863ff 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -82,7 +82,7 @@ static void mpage_end_io_write(struct bio *bio, int err) bio_put(bio); } -struct bio *mpage_bio_submit(int rw, struct bio *bio) +static struct bio *mpage_bio_submit(int rw, struct bio *bio) { bio->bi_end_io = mpage_end_io_read; if (rw == WRITE) @@ -90,7 +90,6 @@ struct bio *mpage_bio_submit(int rw, struct bio *bio) submit_bio(rw, bio); return NULL; } -EXPORT_SYMBOL(mpage_bio_submit); static struct bio * mpage_alloc(struct block_device *bdev, @@ -439,7 +438,14 @@ EXPORT_SYMBOL(mpage_readpage); * just allocate full-size (16-page) BIOs. 
*/ -int __mpage_writepage(struct page *page, struct writeback_control *wbc, +struct mpage_data { + struct bio *bio; + sector_t last_block_in_bio; + get_block_t *get_block; + unsigned use_writepage; +}; + +static int __mpage_writepage(struct page *page, struct writeback_control *wbc, void *data) { struct mpage_data *mpd = data; @@ -648,7 +654,6 @@ out: mpd->bio = bio; return ret; } -EXPORT_SYMBOL(__mpage_writepage); /** * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them diff --git a/include/linux/mpage.h b/include/linux/mpage.h index 5c42821da2d1..068a0c9946af 100644 --- a/include/linux/mpage.h +++ b/include/linux/mpage.h @@ -11,21 +11,11 @@ */ #ifdef CONFIG_BLOCK -struct mpage_data { - struct bio *bio; - sector_t last_block_in_bio; - get_block_t *get_block; - unsigned use_writepage; -}; - struct writeback_control; -struct bio *mpage_bio_submit(int rw, struct bio *bio); int mpage_readpages(struct address_space *mapping, struct list_head *pages, unsigned nr_pages, get_block_t get_block); int mpage_readpage(struct page *page, get_block_t get_block); -int __mpage_writepage(struct page *page, struct writeback_control *wbc, - void *data); int mpage_writepages(struct address_space *mapping, struct writeback_control *wbc, get_block_t get_block); int mpage_writepage(struct page *page, get_block_t *get_block, -- cgit v1.2.3 From ae149b6bec64a09373ba20fce75f8aa6b14b78fd Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 31 Mar 2009 15:19:15 -0700 Subject: proc tty: add struct tty_operations::proc_fops Used for gradual switch of TTY drivers from using ->read_proc which helps with gradual switch from ->read_proc for the whole tree. As side effect, fix possible race condition when ->data initialized after PDE is hooked into proc tree. ->proc_fops takes precedence over ->read_proc. 
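For illustration only (the driver name and callbacks below are invented, not part of this patch), a driver converted to the new interface points ->proc_fops at an ordinary seq_file-backed file_operations instead of supplying a read_proc callback:

	static int foo_tty_proc_show(struct seq_file *m, void *v)
	{
		struct tty_driver *driver = m->private;

		seq_printf(m, "driver: %s\n", driver->driver_name);
		return 0;
	}

	static int foo_tty_proc_open(struct inode *inode, struct file *file)
	{
		/* proc_create_data() stored the tty_driver in the PDE */
		return single_open(file, foo_tty_proc_show, PDE(inode)->data);
	}

	static const struct file_operations foo_tty_proc_fops = {
		.owner		= THIS_MODULE,
		.open		= foo_tty_proc_open,
		.read		= seq_read,
		.llseek		= seq_lseek,
		.release	= single_release,
	};

	static const struct tty_operations foo_ops = {
		/* ... other callbacks ... */
		.proc_fops	= &foo_tty_proc_fops,
	};

Because proc_create_data() hooks the entry into the proc tree with its data pointer already set, this also avoids the ->data-after-registration race mentioned above.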
Signed-off-by: Alexey Dobriyan Cc: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_tty.c | 20 +++++++++++++------- include/linux/tty_driver.h | 1 + 2 files changed, 14 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c index 4a9e0f65ae60..854827b1d463 100644 --- a/fs/proc/proc_tty.c +++ b/fs/proc/proc_tty.c @@ -144,16 +144,22 @@ void proc_tty_register_driver(struct tty_driver *driver) { struct proc_dir_entry *ent; - if (!driver->ops->read_proc || !driver->driver_name || - driver->proc_entry) + if (!driver->driver_name || driver->proc_entry) return; - ent = create_proc_entry(driver->driver_name, 0, proc_tty_driver); - if (!ent) + if (driver->ops->proc_fops) { + ent = proc_create_data(driver->driver_name, 0, proc_tty_driver, + driver->ops->proc_fops, driver); + if (!ent) + return; + } else if (driver->ops->read_proc) { + ent = create_proc_entry(driver->driver_name, 0, proc_tty_driver); + if (!ent) + return; + ent->read_proc = driver->ops->read_proc; + ent->data = driver; + } else return; - ent->read_proc = driver->ops->read_proc; - ent->data = driver; - driver->proc_entry = ent; } diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index 08e088334dba..c9a69575ded6 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -264,6 +264,7 @@ struct tty_operations { int (*poll_get_char)(struct tty_driver *driver, int line); void (*poll_put_char)(struct tty_driver *driver, int line, char ch); #endif + const struct file_operations *proc_fops; }; struct tty_driver { -- cgit v1.2.3 From 0f043a81ebe84be3576667f04fdda481609e3816 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 31 Mar 2009 15:19:25 -0700 Subject: proc tty: remove struct tty_operations::read_proc struct tty_operations::proc_fops took it's place and there is one less create_proc_read_entry() user now! 
Signed-off-by: Alexey Dobriyan Cc: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/isdn/capi/capi.c | 7 ------- fs/proc/proc_tty.c | 18 ++++-------------- include/linux/tty_driver.h | 2 -- net/bluetooth/rfcomm/tty.c | 6 ------ 4 files changed, 4 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/drivers/isdn/capi/capi.c b/drivers/isdn/capi/capi.c index 3e468d2cf730..2d8352419c0d 100644 --- a/drivers/isdn/capi/capi.c +++ b/drivers/isdn/capi/capi.c @@ -1331,12 +1331,6 @@ static void capinc_tty_send_xchar(struct tty_struct *tty, char ch) #endif } -static int capinc_tty_read_proc(char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - return 0; -} - static struct tty_driver *capinc_tty_driver; static const struct tty_operations capinc_ops = { @@ -1358,7 +1352,6 @@ static const struct tty_operations capinc_ops = { .flush_buffer = capinc_tty_flush_buffer, .set_ldisc = capinc_tty_set_ldisc, .send_xchar = capinc_tty_send_xchar, - .read_proc = capinc_tty_read_proc, }; static int capinc_tty_init(void) diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c index 854827b1d463..83adcc869437 100644 --- a/fs/proc/proc_tty.c +++ b/fs/proc/proc_tty.c @@ -144,22 +144,12 @@ void proc_tty_register_driver(struct tty_driver *driver) { struct proc_dir_entry *ent; - if (!driver->driver_name || driver->proc_entry) + if (!driver->driver_name || driver->proc_entry || + !driver->ops->proc_fops) return; - if (driver->ops->proc_fops) { - ent = proc_create_data(driver->driver_name, 0, proc_tty_driver, - driver->ops->proc_fops, driver); - if (!ent) - return; - } else if (driver->ops->read_proc) { - ent = create_proc_entry(driver->driver_name, 0, proc_tty_driver); - if (!ent) - return; - ent->read_proc = driver->ops->read_proc; - ent->data = driver; - } else - return; + ent = proc_create_data(driver->driver_name, 0, proc_tty_driver, + driver->ops->proc_fops, driver); driver->proc_entry = ent; } diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index c9a69575ded6..8615d661ab60 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -252,8 +252,6 @@ struct tty_operations { void (*set_ldisc)(struct tty_struct *tty); void (*wait_until_sent)(struct tty_struct *tty, int timeout); void (*send_xchar)(struct tty_struct *tty, char ch); - int (*read_proc)(char *page, char **start, off_t off, - int count, int *eof, void *data); int (*tiocmget)(struct tty_struct *tty, struct file *file); int (*tiocmset)(struct tty_struct *tty, struct file *file, unsigned int set, unsigned int clear); diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c index abdc703a11d2..cab71ea2796d 100644 --- a/net/bluetooth/rfcomm/tty.c +++ b/net/bluetooth/rfcomm/tty.c @@ -1093,11 +1093,6 @@ static void rfcomm_tty_hangup(struct tty_struct *tty) } } -static int rfcomm_tty_read_proc(char *buf, char **start, off_t offset, int len, int *eof, void *unused) -{ - return 0; -} - static int rfcomm_tty_tiocmget(struct tty_struct *tty, struct file *filp) { struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; @@ -1156,7 +1151,6 @@ static const struct tty_operations rfcomm_ops = { .send_xchar = rfcomm_tty_send_xchar, .hangup = rfcomm_tty_hangup, .wait_until_sent = rfcomm_tty_wait_until_sent, - .read_proc = rfcomm_tty_read_proc, .tiocmget = rfcomm_tty_tiocmget, .tiocmset = rfcomm_tty_tiocmset, }; -- cgit v1.2.3 From 9de1581e75ba9d7979766d4ab6d56f0f2d87f7c6 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 31 Mar 2009 15:19:29 -0700 
Subject: get_mm_hiwater_xxx: trivial, s/define/inline/ Andrew pointed out get_mm_hiwater_xxx() evaluate "mm" argument thrice/twice, make them inline. Signed-off-by: Oleg Nesterov Cc: Hugh Dickins Reviewed-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 29df6374d2de..481fad3a9b42 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -391,8 +391,15 @@ extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); (mm)->hiwater_vm = (mm)->total_vm; \ } while (0) -#define get_mm_hiwater_rss(mm) max((mm)->hiwater_rss, get_mm_rss(mm)) -#define get_mm_hiwater_vm(mm) max((mm)->hiwater_vm, (mm)->total_vm) +static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm) +{ + return max(mm->hiwater_rss, get_mm_rss(mm)); +} + +static inline unsigned long get_mm_hiwater_vm(struct mm_struct *mm) +{ + return max(mm->hiwater_vm, mm->total_vm); +} extern void set_dumpable(struct mm_struct *mm, int value); extern int get_dumpable(struct mm_struct *mm); -- cgit v1.2.3 From ee99c71c59f897436ec65debb99372b3146f9985 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 31 Mar 2009 15:19:31 -0700 Subject: mm: introduce for_each_populated_zone() macro Impact: cleanup In almost cases, for_each_zone() is used with populated_zone(). It's because almost function doesn't need memoryless node information. Therefore, for_each_populated_zone() can help to make code simplify. This patch has no functional change. [akpm@linux-foundation.org: small cleanup] Signed-off-by: KOSAKI Motohiro Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Reviewed-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 8 ++++++++ kernel/power/snapshot.c | 9 +++------ kernel/power/swsusp.c | 17 ++++++++--------- mm/page_alloc.c | 26 +++++--------------------- mm/vmscan.c | 4 +--- mm/vmstat.c | 11 ++--------- 6 files changed, 27 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 1aca6cebbb78..26ef24076b76 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -806,6 +806,14 @@ extern struct zone *next_zone(struct zone *zone); zone; \ zone = next_zone(zone)) +#define for_each_populated_zone(zone) \ + for (zone = (first_online_pgdat())->node_zones; \ + zone; \ + zone = next_zone(zone)) \ + if (!populated_zone(zone)) \ + ; /* do nothing */ \ + else + static inline struct zone *zonelist_zone(struct zoneref *zoneref) { return zoneref->zone; diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index f5fc2d7680f2..33e2e4a819f9 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -321,13 +321,10 @@ static int create_mem_extents(struct list_head *list, gfp_t gfp_mask) INIT_LIST_HEAD(list); - for_each_zone(zone) { + for_each_populated_zone(zone) { unsigned long zone_start, zone_end; struct mem_extent *ext, *cur, *aux; - if (!populated_zone(zone)) - continue; - zone_start = zone->zone_start_pfn; zone_end = zone->zone_start_pfn + zone->spanned_pages; @@ -804,8 +801,8 @@ static unsigned int count_free_highmem_pages(void) struct zone *zone; unsigned int cnt = 0; - for_each_zone(zone) - if (populated_zone(zone) && is_highmem(zone)) + for_each_populated_zone(zone) + if (is_highmem(zone)) cnt += zone_page_state(zone, NR_FREE_PAGES); return cnt; diff --git 
a/kernel/power/swsusp.c b/kernel/power/swsusp.c index a92c91451559..1ee6636414b2 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -229,17 +229,16 @@ int swsusp_shrink_memory(void) size = count_data_pages() + PAGES_FOR_IO + SPARE_PAGES; tmp = size; size += highmem_size; - for_each_zone (zone) - if (populated_zone(zone)) { - tmp += snapshot_additional_pages(zone); - if (is_highmem(zone)) { - highmem_size -= + for_each_populated_zone(zone) { + tmp += snapshot_additional_pages(zone); + if (is_highmem(zone)) { + highmem_size -= zone_page_state(zone, NR_FREE_PAGES); - } else { - tmp -= zone_page_state(zone, NR_FREE_PAGES); - tmp += zone->lowmem_reserve[ZONE_NORMAL]; - } + } else { + tmp -= zone_page_state(zone, NR_FREE_PAGES); + tmp += zone->lowmem_reserve[ZONE_NORMAL]; } + } if (highmem_size < 0) highmem_size = 0; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a3803ea8c27d..cbd532161f68 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -922,13 +922,10 @@ static void drain_pages(unsigned int cpu) unsigned long flags; struct zone *zone; - for_each_zone(zone) { + for_each_populated_zone(zone) { struct per_cpu_pageset *pset; struct per_cpu_pages *pcp; - if (!populated_zone(zone)) - continue; - pset = zone_pcp(zone, cpu); pcp = &pset->pcp; @@ -1879,10 +1876,7 @@ void show_free_areas(void) int cpu; struct zone *zone; - for_each_zone(zone) { - if (!populated_zone(zone)) - continue; - + for_each_populated_zone(zone) { show_node(zone); printk("%s per-cpu:\n", zone->name); @@ -1922,12 +1916,9 @@ void show_free_areas(void) global_page_state(NR_PAGETABLE), global_page_state(NR_BOUNCE)); - for_each_zone(zone) { + for_each_populated_zone(zone) { int i; - if (!populated_zone(zone)) - continue; - show_node(zone); printk("%s" " free:%lukB" @@ -1967,12 +1958,9 @@ void show_free_areas(void) printk("\n"); } - for_each_zone(zone) { + for_each_populated_zone(zone) { unsigned long nr[MAX_ORDER], flags, order, total = 0; - if (!populated_zone(zone)) - continue; - show_node(zone); printk("%s: ", zone->name); @@ -2784,11 +2772,7 @@ static int __cpuinit process_zones(int cpu) node_set_state(node, N_CPU); /* this node has a cpu */ - for_each_zone(zone) { - - if (!populated_zone(zone)) - continue; - + for_each_populated_zone(zone) { zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), GFP_KERNEL, node); if (!zone_pcp(zone, cpu)) diff --git a/mm/vmscan.c b/mm/vmscan.c index 1bca60f0c527..301f057fd115 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2061,11 +2061,9 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio, struct zone *zone; unsigned long ret = 0; - for_each_zone(zone) { + for_each_populated_zone(zone) { enum lru_list l; - if (!populated_zone(zone)) - continue; if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY) continue; diff --git a/mm/vmstat.c b/mm/vmstat.c index 8cd81ea1ddc1..9826766f1274 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -135,11 +135,7 @@ static void refresh_zone_stat_thresholds(void) int cpu; int threshold; - for_each_zone(zone) { - - if (!zone->present_pages) - continue; - + for_each_populated_zone(zone) { threshold = calculate_threshold(zone); for_each_online_cpu(cpu) @@ -301,12 +297,9 @@ void refresh_cpu_vm_stats(int cpu) int i; int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; - for_each_zone(zone) { + for_each_populated_zone(zone) { struct per_cpu_pageset *p; - if (!populated_zone(zone)) - continue; - p = zone_pcp(zone, cpu); for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) -- cgit v1.2.3 From 
e3a7cca1ef4c1af9b0acef9bd66eff6582a737b5 Mon Sep 17 00:00:00 2001 From: Edward Shishkin Date: Tue, 31 Mar 2009 15:19:39 -0700 Subject: vfs: add/use account_page_dirtied() Add a helper function account_page_dirtied(). Use that from two callsites. reiser4 adds a function which adds a third callsite. Signed-off-by: Edward Shishkin Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 9 +-------- include/linux/mm.h | 1 + mm/page-writeback.c | 22 +++++++++++++++------- 3 files changed, 17 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/fs/buffer.c b/fs/buffer.c index a2fd743d97cb..73abe6d8218c 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -621,14 +621,7 @@ static void __set_page_dirty(struct page *page, spin_lock_irq(&mapping->tree_lock); if (page->mapping) { /* Race with truncate? */ WARN_ON_ONCE(warn && !PageUptodate(page)); - - if (mapping_cap_account_dirty(mapping)) { - __inc_zone_page_state(page, NR_FILE_DIRTY); - __inc_bdi_stat(mapping->backing_dev_info, - BDI_RECLAIMABLE); - task_dirty_inc(current); - task_io_account_write(PAGE_CACHE_SIZE); - } + account_page_dirtied(page, mapping); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } diff --git a/include/linux/mm.h b/include/linux/mm.h index b1ea37fc7a24..2223f8dfa568 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -834,6 +834,7 @@ int __set_page_dirty_nobuffers(struct page *page); int __set_page_dirty_no_writeback(struct page *page); int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page); +void account_page_dirtied(struct page *page, struct address_space *mapping); int set_page_dirty(struct page *page); int set_page_dirty_lock(struct page *page); int clear_page_dirty_for_io(struct page *page); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 40ca7cdb653e..6aa92b03c747 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1197,6 +1197,20 @@ int __set_page_dirty_no_writeback(struct page *page) return 0; } +/* + * Helper function for set_page_dirty family. + * NOTE: This relies on being atomic wrt interrupts. + */ +void account_page_dirtied(struct page *page, struct address_space *mapping) +{ + if (mapping_cap_account_dirty(mapping)) { + __inc_zone_page_state(page, NR_FILE_DIRTY); + __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); + task_dirty_inc(current); + task_io_account_write(PAGE_CACHE_SIZE); + } +} + /* * For address_spaces which do not use buffers. Just tag the page as dirty in * its radix tree. @@ -1226,13 +1240,7 @@ int __set_page_dirty_nobuffers(struct page *page) if (mapping2) { /* Race with truncate? */ BUG_ON(mapping2 != mapping); WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); - if (mapping_cap_account_dirty(mapping)) { - __inc_zone_page_state(page, NR_FILE_DIRTY); - __inc_bdi_stat(mapping->backing_dev_info, - BDI_RECLAIMABLE); - task_dirty_inc(current); - task_io_account_write(PAGE_CACHE_SIZE); - } + account_page_dirtied(page, mapping); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } -- cgit v1.2.3 From d1d7487173eab8352125cf6cc271940f24254bd4 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 31 Mar 2009 15:23:14 -0700 Subject: mm: remove pagevec_swap_free() pagevec_swap_free() is now unused. 
Signed-off-by: KOSAKI Motohiro Cc: Johannes Weiner Cc: Rik van Riel Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagevec.h | 1 - mm/swap.c | 23 ----------------------- 2 files changed, 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 7b2886fa7fdc..bab82f4c571c 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -24,7 +24,6 @@ void __pagevec_release(struct pagevec *pvec); void __pagevec_free(struct pagevec *pvec); void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru); void pagevec_strip(struct pagevec *pvec); -void pagevec_swap_free(struct pagevec *pvec); unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, pgoff_t start, unsigned nr_pages); unsigned pagevec_lookup_tag(struct pagevec *pvec, diff --git a/mm/swap.c b/mm/swap.c index 8adb9feb61e1..6e83084c1f6c 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -456,29 +456,6 @@ void pagevec_strip(struct pagevec *pvec) } } -/** - * pagevec_swap_free - try to free swap space from the pages in a pagevec - * @pvec: pagevec with swapcache pages to free the swap space of - * - * The caller needs to hold an extra reference to each page and - * not hold the page lock on the pages. This function uses a - * trylock on the page lock so it may not always free the swap - * space associated with a page. - */ -void pagevec_swap_free(struct pagevec *pvec) -{ - int i; - - for (i = 0; i < pagevec_count(pvec); i++) { - struct page *page = pvec->pages[i]; - - if (PageSwapCache(page) && trylock_page(page)) { - try_to_free_swap(page); - unlock_page(page); - } - } -} - /** * pagevec_lookup - gang pagecache lookup * @pvec: Where the resulting pages are placed -- cgit v1.2.3 From 610a77e04a8d9fe8764dc484e2182fa251ce1cc2 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 31 Mar 2009 15:23:16 -0700 Subject: memdup_user(): introduce I notice there are many places doing copy_from_user() which follows kmalloc(): dst = kmalloc(len, GFP_KERNEL); if (!dst) return -ENOMEM; if (copy_from_user(dst, src, len)) { kfree(dst); return -EFAULT } memdup_user() is a wrapper of the above code. With this new function, we don't have to write 'len' twice, which can lead to typos/mistakes. It also produces smaller code and kernel text. A quick grep shows 250+ places where memdup_user() *may* be used. I'll prepare a patchset to do this conversion. Signed-off-by: Li Zefan Cc: KOSAKI Motohiro Cc: Americo Wang Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/string.h | 1 + mm/util.c | 30 ++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) (limited to 'include/linux') diff --git a/include/linux/string.h b/include/linux/string.h index d18fc198aa2f..8852739f36df 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -12,6 +12,7 @@ #include /* for NULL */ extern char *strndup_user(const char __user *, long); +extern void *memdup_user(const void __user *, size_t); /* * Include machine specific inline routines diff --git a/mm/util.c b/mm/util.c index 37eaccdf3054..7c122e49f769 100644 --- a/mm/util.c +++ b/mm/util.c @@ -69,6 +69,36 @@ void *kmemdup(const void *src, size_t len, gfp_t gfp) } EXPORT_SYMBOL(kmemdup); +/** + * memdup_user - duplicate memory region from user space + * + * @src: source address in user space + * @len: number of bytes to copy + * + * Returns an ERR_PTR() on failure. 
+ */ +void *memdup_user(const void __user *src, size_t len) +{ + void *p; + + /* + * Always use GFP_KERNEL, since copy_from_user() can sleep and + * cause pagefault, which makes it pointless to use GFP_NOFS + * or GFP_ATOMIC. + */ + p = kmalloc_track_caller(len, GFP_KERNEL); + if (!p) + return ERR_PTR(-ENOMEM); + + if (copy_from_user(p, src, len)) { + kfree(p); + return ERR_PTR(-EFAULT); + } + + return p; +} +EXPORT_SYMBOL(memdup_user); + /** * __krealloc - like krealloc() but don't free @p. * @p: object to reallocate memory for. -- cgit v1.2.3 From 6a11f75b6a17b5d9ac5025f8d048382fd1f47377 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Tue, 31 Mar 2009 15:23:17 -0700 Subject: generic debug pagealloc CONFIG_DEBUG_PAGEALLOC is now supported by x86, powerpc, sparc64, and s390. This patch implements it for the rest of the architectures by filling the pages with poison byte patterns after free_pages() and verifying the poison patterns before alloc_pages(). This generic one cannot detect invalid page accesses immediately but invalid read access may cause invalid dereference by poisoned memory and invalid write access can be detected after a long delay. Signed-off-by: Akinobu Mita Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/avr32/mm/fault.c | 18 ------ arch/powerpc/Kconfig | 3 + arch/powerpc/Kconfig.debug | 1 + arch/s390/Kconfig | 3 + arch/s390/Kconfig.debug | 1 + arch/sparc/Kconfig | 3 + arch/sparc/Kconfig.debug | 3 +- arch/x86/Kconfig | 3 + arch/x86/Kconfig.debug | 1 + include/linux/mm_types.h | 4 ++ include/linux/page-debug-flags.h | 30 +++++++++ include/linux/poison.h | 3 + lib/Kconfig.debug | 1 + mm/Kconfig.debug | 17 ++++++ mm/Makefile | 1 + mm/debug-pagealloc.c | 129 +++++++++++++++++++++++++++++++++++++++ 16 files changed, 202 insertions(+), 19 deletions(-) create mode 100644 include/linux/page-debug-flags.h create mode 100644 mm/Kconfig.debug create mode 100644 mm/debug-pagealloc.c (limited to 'include/linux') diff --git a/arch/avr32/mm/fault.c b/arch/avr32/mm/fault.c index ce4e4296b954..62d4abbaa654 100644 --- a/arch/avr32/mm/fault.c +++ b/arch/avr32/mm/fault.c @@ -250,21 +250,3 @@ asmlinkage void do_bus_error(unsigned long addr, int write_access, dump_dtlb(); die("Bus Error", regs, SIGKILL); } - -/* - * This functionality is currently not possible to implement because - * we're using segmentation to ensure a fixed mapping of the kernel - * virtual address space. - * - * It would be possible to implement this, but it would require us to - * disable segmentation at startup and load the kernel mappings into - * the TLB like any other pages. There will be lots of trickery to - * avoid recursive invocation of the TLB miss handler, though... 
- */ -#ifdef CONFIG_DEBUG_PAGEALLOC -void kernel_map_pages(struct page *page, int numpages, int enable) -{ - -} -EXPORT_SYMBOL(kernel_map_pages); -#endif diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index ad6b1c084fe3..45192dce65c4 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -228,6 +228,9 @@ config PPC_OF_PLATFORM_PCI depends on PPC64 # not supported on 32 bits yet default n +config ARCH_SUPPORTS_DEBUG_PAGEALLOC + def_bool y + source "init/Kconfig" source "kernel/Kconfig.freezer" diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug index 22091bbfdc9b..6aa0b5e087cd 100644 --- a/arch/powerpc/Kconfig.debug +++ b/arch/powerpc/Kconfig.debug @@ -30,6 +30,7 @@ config DEBUG_STACK_USAGE config DEBUG_PAGEALLOC bool "Debug page memory allocations" depends on DEBUG_KERNEL && !HIBERNATION + depends on ARCH_SUPPORTS_DEBUG_PAGEALLOC help Unmap pages from the kernel linear mapping after free_pages(). This results in a large slowdown, but helps to find certain types diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 2a8af5e16345..dcb667c4375a 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -72,6 +72,9 @@ config PGSTE config VIRT_CPU_ACCOUNTING def_bool y +config ARCH_SUPPORTS_DEBUG_PAGEALLOC + def_bool y + mainmenu "Linux Kernel Configuration" config S390 diff --git a/arch/s390/Kconfig.debug b/arch/s390/Kconfig.debug index 4599fa06bd82..7e297a3cde34 100644 --- a/arch/s390/Kconfig.debug +++ b/arch/s390/Kconfig.debug @@ -9,6 +9,7 @@ source "lib/Kconfig.debug" config DEBUG_PAGEALLOC bool "Debug page memory allocations" depends on DEBUG_KERNEL + depends on ARCH_SUPPORTS_DEBUG_PAGEALLOC help Unmap pages from the kernel linear mapping after free_pages(). This results in a slowdown, but helps to find certain types of diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index c3ea215334f6..cc12cd48bbc5 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -124,6 +124,9 @@ config ARCH_NO_VIRT_TO_BUS config OF def_bool y +config ARCH_SUPPORTS_DEBUG_PAGEALLOC + def_bool y if SPARC64 + source "init/Kconfig" source "kernel/Kconfig.freezer" diff --git a/arch/sparc/Kconfig.debug b/arch/sparc/Kconfig.debug index b8a15e271bfa..d001b42041a5 100644 --- a/arch/sparc/Kconfig.debug +++ b/arch/sparc/Kconfig.debug @@ -24,7 +24,8 @@ config STACK_DEBUG config DEBUG_PAGEALLOC bool "Debug page memory allocations" - depends on SPARC64 && DEBUG_KERNEL && !HIBERNATION + depends on DEBUG_KERNEL && !HIBERNATION + depends on ARCH_SUPPORTS_DEBUG_PAGEALLOC help Unmap pages from the kernel linear mapping after free_pages(). This results in a large slowdown, but helps to find certain types diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 45161b816313..748e50a1a152 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -165,6 +165,9 @@ config AUDIT_ARCH config ARCH_SUPPORTS_OPTIMIZED_INLINING def_bool y +config ARCH_SUPPORTS_DEBUG_PAGEALLOC + def_bool y + # Use the generic interrupt handling code in kernel/irq/: config GENERIC_HARDIRQS bool diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index fdb45df608b6..a345cb5447a8 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -75,6 +75,7 @@ config DEBUG_STACK_USAGE config DEBUG_PAGEALLOC bool "Debug page memory allocations" depends on DEBUG_KERNEL + depends on ARCH_SUPPORTS_DEBUG_PAGEALLOC ---help--- Unmap pages from the kernel linear mapping after free_pages(). 
This results in a large slowdown, but helps to find certain types diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index d84feb7bdbf0..ddadb4defe00 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -174,6 +175,9 @@ struct vm_area_struct { #ifdef CONFIG_NUMA struct mempolicy *vm_policy; /* NUMA policy for the VMA */ #endif +#ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS + unsigned long debug_flags; /* Use atomic bitops on this */ +#endif }; struct core_thread { diff --git a/include/linux/page-debug-flags.h b/include/linux/page-debug-flags.h new file mode 100644 index 000000000000..b0638fd91e92 --- /dev/null +++ b/include/linux/page-debug-flags.h @@ -0,0 +1,30 @@ +#ifndef LINUX_PAGE_DEBUG_FLAGS_H +#define LINUX_PAGE_DEBUG_FLAGS_H + +/* + * page->debug_flags bits: + * + * PAGE_DEBUG_FLAG_POISON is set for poisoned pages. This is used to + * implement generic debug pagealloc feature. The pages are filled with + * poison patterns and set this flag after free_pages(). The poisoned + * pages are verified whether the patterns are not corrupted and clear + * the flag before alloc_pages(). + */ + +enum page_debug_flags { + PAGE_DEBUG_FLAG_POISON, /* Page is poisoned */ +}; + +/* + * Ensure that CONFIG_WANT_PAGE_DEBUG_FLAGS reliably + * gets turned off when no debug features are enabling it! + */ + +#ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS +#if !defined(CONFIG_PAGE_POISONING) \ +/* && !defined(CONFIG_PAGE_DEBUG_SOMETHING_ELSE) && ... */ +#error WANT_PAGE_DEBUG_FLAGS is turned on with no debug features! +#endif +#endif /* CONFIG_WANT_PAGE_DEBUG_FLAGS */ + +#endif /* LINUX_PAGE_DEBUG_FLAGS_H */ diff --git a/include/linux/poison.h b/include/linux/poison.h index 9f31683728fd..6729f7dcd60e 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -17,6 +17,9 @@ */ #define TIMER_ENTRY_STATIC ((void *) 0x74737461) +/********** mm/debug-pagealloc.c **********/ +#define PAGE_POISON 0xaa + /********** mm/slab.c **********/ /* * Magic nums for obj red zoning. diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 58bfe7e8faba..9638d99644af 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -796,6 +796,7 @@ config SYSCTL_SYSCALL_CHECK to properly maintain and use. This enables checks that help you to keep things correct. +source mm/Kconfig.debug source kernel/trace/Kconfig config PROVIDE_OHCI1394_DMA_INIT diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug new file mode 100644 index 000000000000..c8d62d49a44e --- /dev/null +++ b/mm/Kconfig.debug @@ -0,0 +1,17 @@ +config WANT_PAGE_DEBUG_FLAGS + bool + +config PAGE_POISONING + bool "Debug page memory allocations" + depends on DEBUG_KERNEL && !ARCH_SUPPORTS_DEBUG_PAGEALLOC + depends on !HIBERNATION + select DEBUG_PAGEALLOC + select WANT_PAGE_DEBUG_FLAGS + help + Fill the pages with poison patterns after free_pages() and verify + the patterns before alloc_pages(). This results in a large slowdown, + but helps to find certain types of memory corruptions. + + This option cannot enalbe with hibernation. Otherwise, it will get + wrong messages for memory corruption because the free pages are not + saved to the suspend image. 
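As a purely hypothetical sketch of the kind of bug this option catches (the structure and call site are invented for illustration), a write through a stale pointer lands in a page that was filled with PAGE_POISON on free; the damaged pattern is then reported, with a hex dump and stack trace, when the page is next handed out by alloc_pages():

	struct foo {
		int field;
	};

	struct foo *p = (struct foo *)__get_free_page(GFP_KERNEL);

	free_page((unsigned long)p);	/* page is now filled with 0xaa */
	p->field = 1;			/* use after free: corrupts the poison
					 * pattern; detected later, not here */

A single flipped bit in the pattern is reported as "pagealloc: single bit error", anything else as "pagealloc: memory corruption".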
diff --git a/mm/Makefile b/mm/Makefile index 818569b68f46..ec73c68b6015 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -24,6 +24,7 @@ obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o obj-$(CONFIG_SLOB) += slob.o obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o +obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o obj-$(CONFIG_SLAB) += slab.o obj-$(CONFIG_SLUB) += slub.o obj-$(CONFIG_FAILSLAB) += failslab.o diff --git a/mm/debug-pagealloc.c b/mm/debug-pagealloc.c new file mode 100644 index 000000000000..a1e3324de2b5 --- /dev/null +++ b/mm/debug-pagealloc.c @@ -0,0 +1,129 @@ +#include +#include +#include +#include + +static inline void set_page_poison(struct page *page) +{ + __set_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags); +} + +static inline void clear_page_poison(struct page *page) +{ + __clear_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags); +} + +static inline bool page_poison(struct page *page) +{ + return test_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags); +} + +static void poison_highpage(struct page *page) +{ + /* + * Page poisoning for highmem pages is not implemented. + * + * This can be called from interrupt contexts. + * So we need to create a new kmap_atomic slot for this + * application and it will need interrupt protection. + */ +} + +static void poison_page(struct page *page) +{ + void *addr; + + if (PageHighMem(page)) { + poison_highpage(page); + return; + } + set_page_poison(page); + addr = page_address(page); + memset(addr, PAGE_POISON, PAGE_SIZE); +} + +static void poison_pages(struct page *page, int n) +{ + int i; + + for (i = 0; i < n; i++) + poison_page(page + i); +} + +static bool single_bit_flip(unsigned char a, unsigned char b) +{ + unsigned char error = a ^ b; + + return error && !(error & (error - 1)); +} + +static void check_poison_mem(unsigned char *mem, size_t bytes) +{ + unsigned char *start; + unsigned char *end; + + for (start = mem; start < mem + bytes; start++) { + if (*start != PAGE_POISON) + break; + } + if (start == mem + bytes) + return; + + for (end = mem + bytes - 1; end > start; end--) { + if (*end != PAGE_POISON) + break; + } + + if (!printk_ratelimit()) + return; + else if (start == end && single_bit_flip(*start, PAGE_POISON)) + printk(KERN_ERR "pagealloc: single bit error\n"); + else + printk(KERN_ERR "pagealloc: memory corruption\n"); + + print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, start, + end - start + 1, 1); + dump_stack(); +} + +static void unpoison_highpage(struct page *page) +{ + /* + * See comment in poison_highpage(). 
+ * Highmem pages should not be poisoned for now + */ + BUG_ON(page_poison(page)); +} + +static void unpoison_page(struct page *page) +{ + if (PageHighMem(page)) { + unpoison_highpage(page); + return; + } + if (page_poison(page)) { + void *addr = page_address(page); + + check_poison_mem(addr, PAGE_SIZE); + clear_page_poison(page); + } +} + +static void unpoison_pages(struct page *page, int n) +{ + int i; + + for (i = 0; i < n; i++) + unpoison_page(page + i); +} + +void kernel_map_pages(struct page *page, int numpages, int enable) +{ + if (!debug_pagealloc_enabled) + return; + + if (enable) + unpoison_pages(page, numpages); + else + poison_pages(page, numpages); +} -- cgit v1.2.3 From 704503d836042d4a4c7685b7036e7de0418fbc0f Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 31 Mar 2009 15:23:18 -0700 Subject: mm: fix proc_dointvec_userhz_jiffies "breakage" Addresses http://bugzilla.kernel.org/show_bug.cgi?id=9838 On i386, HZ=1000, jiffies_to_clock_t() converts time in a somewhat strange way from the user's point of view: # echo 500 >/proc/sys/vm/dirty_writeback_centisecs # cat /proc/sys/vm/dirty_writeback_centisecs 499 So, we have 5000 jiffies converted to only 499 clock ticks and reported back. TICK_NSEC = 999848 ACTHZ = 256039 Keeping in-kernel variable in units passed from userspace will fix issue of course, but this probably won't be right for every sysctl. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Alexey Dobriyan Cc: Peter Zijlstra Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/writeback.h | 4 ++-- kernel/sysctl.c | 2 +- mm/page-writeback.c | 20 +++++++++++--------- 3 files changed, 14 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 7300ecdc480c..93445477f86a 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -109,8 +109,8 @@ extern int dirty_background_ratio; extern unsigned long dirty_background_bytes; extern int vm_dirty_ratio; extern unsigned long vm_dirty_bytes; -extern int dirty_writeback_interval; -extern int dirty_expire_interval; +extern unsigned int dirty_writeback_interval; +extern unsigned int dirty_expire_interval; extern int vm_highmem_is_dirtyable; extern int block_dump; extern int laptop_mode; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c5ef44ff850f..2e490a389dd2 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1010,7 +1010,7 @@ static struct ctl_table vm_table[] = { .data = &dirty_expire_interval, .maxlen = sizeof(dirty_expire_interval), .mode = 0644, - .proc_handler = &proc_dointvec_userhz_jiffies, + .proc_handler = &proc_dointvec, }, { .ctl_name = VM_NR_PDFLUSH_THREADS, diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 6aa92b03c747..30351f0063ac 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -92,14 +92,14 @@ int vm_dirty_ratio = 20; unsigned long vm_dirty_bytes; /* - * The interval between `kupdate'-style writebacks, in jiffies + * The interval between `kupdate'-style writebacks */ -int dirty_writeback_interval = 5 * HZ; +unsigned int dirty_writeback_interval = 5 * 100; /* sentiseconds */ /* - * The longest number of jiffies for which data is allowed to remain dirty + * The longest time for which data is allowed to remain dirty */ -int dirty_expire_interval = 30 * HZ; +unsigned int dirty_expire_interval = 30 * 100; /* sentiseconds */ /* * Flag that makes the machine dump writes/reads and block dirtyings. 
@@ -770,9 +770,9 @@ static void wb_kupdate(unsigned long arg) sync_supers(); - oldest_jif = jiffies - dirty_expire_interval; + oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval); start_jif = jiffies; - next_jif = start_jif + dirty_writeback_interval; + next_jif = start_jif + msecs_to_jiffies(dirty_writeback_interval * 10); nr_to_write = global_page_state(NR_FILE_DIRTY) + global_page_state(NR_UNSTABLE_NFS) + (inodes_stat.nr_inodes - inodes_stat.nr_unused); @@ -801,9 +801,10 @@ static void wb_kupdate(unsigned long arg) int dirty_writeback_centisecs_handler(ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, loff_t *ppos) { - proc_dointvec_userhz_jiffies(table, write, file, buffer, length, ppos); + proc_dointvec(table, write, file, buffer, length, ppos); if (dirty_writeback_interval) - mod_timer(&wb_timer, jiffies + dirty_writeback_interval); + mod_timer(&wb_timer, jiffies + + msecs_to_jiffies(dirty_writeback_interval * 10)); else del_timer(&wb_timer); return 0; @@ -905,7 +906,8 @@ void __init page_writeback_init(void) { int shift; - mod_timer(&wb_timer, jiffies + dirty_writeback_interval); + mod_timer(&wb_timer, + jiffies + msecs_to_jiffies(dirty_writeback_interval * 10)); writeback_set_ratelimit(); register_cpu_notifier(&ratelimit_nb); -- cgit v1.2.3 From c2fdf3a9b2d52842808a8e551b53b55dd9b45030 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Tue, 31 Mar 2009 15:23:19 -0700 Subject: mm: enable hashdist by default on 64bit NUMA On PowerPC we allocate large boot time hashes on node 0. This leads to an imbalance in the free memory, for example on a 64GB box (4 x 16GB nodes): Free memory: Node 0: 97.03% Node 1: 98.54% Node 2: 98.42% Node 3: 98.53% If we switch to using vmalloc (like ia64 and x86-64) things are more balanced: Free memory: Node 0: 97.53% Node 1: 98.35% Node 2: 98.33% Node 3: 98.33% For many HPC applications we are limited by the free available memory on the smallest node, so even though the same amount of memory is used the better balancing helps. Since all 64bit NUMA capable architectures should have sufficient vmalloc space, it makes sense to enable it via CONFIG_64BIT. Signed-off-by: Anton Blanchard Acked-by: David S. Miller Acked-by: Benjamin Herrenschmidt Acked-by: Ralf Baechle Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: Ivan Kokshaysky Cc: Richard Henderson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bootmem.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index 455d83219fae..bc3ab7073695 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -146,10 +146,10 @@ extern void *alloc_large_system_hash(const char *tablename, #define HASH_EARLY 0x00000001 /* Allocating during early boot? */ -/* Only NUMA needs hash distribution. - * IA64 and x86_64 have sufficient vmalloc space. +/* Only NUMA needs hash distribution. 64bit NUMA architectures have + * sufficient vmalloc space. */ -#if defined(CONFIG_NUMA) && (defined(CONFIG_IA64) || defined(CONFIG_X86_64)) +#if defined(CONFIG_NUMA) && defined(CONFIG_64BIT) #define HASHDIST_DEFAULT 1 #else #define HASHDIST_DEFAULT 0 -- cgit v1.2.3 From c2ec175c39f62949438354f603f4aa170846aabb Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 31 Mar 2009 15:23:21 -0700 Subject: mm: page_mkwrite change prototype to match fault Change the page_mkwrite prototype to take a struct vm_fault, and return VM_FAULT_xxx flags. 
There should be no functional change. This makes it possible to return much more detailed error information to the VM (and also can provide more information eg. virtual_address to the driver, which might be important in some special cases). This is required for a subsequent fix. And will also make it easier to merge page_mkwrite() with fault() in future. Signed-off-by: Nick Piggin Cc: Chris Mason Cc: Trond Myklebust Cc: Miklos Szeredi Cc: Steven Whitehouse Cc: Mark Fasheh Cc: Joel Becker Cc: Artem Bityutskiy Cc: Felix Blyakher Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/Locking | 2 +- drivers/video/fb_defio.c | 3 ++- fs/btrfs/ctree.h | 2 +- fs/btrfs/inode.c | 5 ++++- fs/buffer.c | 6 +++++- fs/ext4/ext4.h | 2 +- fs/ext4/inode.c | 5 ++++- fs/fuse/file.c | 3 ++- fs/gfs2/ops_file.c | 5 ++++- fs/nfs/file.c | 5 ++++- fs/ocfs2/mmap.c | 6 ++++-- fs/ubifs/file.c | 9 ++++++--- fs/xfs/linux-2.6/xfs_file.c | 4 ++-- include/linux/buffer_head.h | 2 +- include/linux/mm.h | 3 ++- mm/memory.c | 26 ++++++++++++++++++++++---- 16 files changed, 65 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 4e78ce677843..76efe5b71d7d 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -505,7 +505,7 @@ prototypes: void (*open)(struct vm_area_struct*); void (*close)(struct vm_area_struct*); int (*fault)(struct vm_area_struct*, struct vm_fault *); - int (*page_mkwrite)(struct vm_area_struct *, struct page *); + int (*page_mkwrite)(struct vm_area_struct *, struct vm_fault *); int (*access)(struct vm_area_struct *, unsigned long, void*, int, int); locking rules: diff --git a/drivers/video/fb_defio.c b/drivers/video/fb_defio.c index 082026546aee..0a7a6679ee6e 100644 --- a/drivers/video/fb_defio.c +++ b/drivers/video/fb_defio.c @@ -85,8 +85,9 @@ EXPORT_SYMBOL_GPL(fb_deferred_io_fsync); /* vm_ops->page_mkwrite handler */ static int fb_deferred_io_mkwrite(struct vm_area_struct *vma, - struct page *page) + struct vm_fault *vmf) { + struct page *page = vmf->page; struct fb_info *info = vma->vm_private_data; struct fb_deferred_io *fbdefio = info->fbdefio; struct page *cur; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 5e1d4e30e9d8..7dd1b6d0bf32 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2060,7 +2060,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, unsigned long btrfs_force_ra(struct address_space *mapping, struct file_ra_state *ra, struct file *file, pgoff_t offset, pgoff_t last_index); -int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page); +int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); int btrfs_readpage(struct file *file, struct page *page); void btrfs_delete_inode(struct inode *inode); void btrfs_put_inode(struct inode *inode); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7d4f948bc22a..ec5423790bbb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4292,8 +4292,9 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) * beyond EOF, then the page is guaranteed safe against truncation until we * unlock the page. 
*/ -int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page) +int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { + struct page *page = vmf->page; struct inode *inode = fdentry(vma->vm_file)->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; @@ -4362,6 +4363,8 @@ again: out_unlock: unlock_page(page); out: + if (ret) + ret = VM_FAULT_SIGBUS; return ret; } diff --git a/fs/buffer.c b/fs/buffer.c index 73abe6d8218c..6d51a3da362c 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2313,9 +2313,10 @@ int block_commit_write(struct page *page, unsigned from, unsigned to) * unlock the page. */ int -block_page_mkwrite(struct vm_area_struct *vma, struct page *page, +block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, get_block_t get_block) { + struct page *page = vmf->page; struct inode *inode = vma->vm_file->f_path.dentry->d_inode; unsigned long end; loff_t size; @@ -2340,6 +2341,9 @@ block_page_mkwrite(struct vm_area_struct *vma, struct page *page, ret = block_commit_write(page, 0, end); out_unlock: + if (ret) + ret = VM_FAULT_SIGBUS; + unlock_page(page); return ret; } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 6083bb38057b..990c94000924 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1098,7 +1098,7 @@ extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks); extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from); -extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page); +extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); extern qsize_t ext4_get_reserved_space(struct inode *inode); /* ioctl.c */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 71d3ecd5db79..dd82ff390067 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5146,8 +5146,9 @@ static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh) return !buffer_mapped(bh); } -int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page) +int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { + struct page *page = vmf->page; loff_t size; unsigned long len; int ret = -EINVAL; @@ -5199,6 +5200,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page) goto out_unlock; ret = 0; out_unlock: + if (ret) + ret = VM_FAULT_SIGBUS; up_read(&inode->i_alloc_sem); return ret; } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 821d10f719bd..4e340fedf768 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1234,8 +1234,9 @@ static void fuse_vma_close(struct vm_area_struct *vma) * - sync(2) * - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER */ -static int fuse_page_mkwrite(struct vm_area_struct *vma, struct page *page) +static int fuse_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { + struct page *page = vmf->page; /* * Don't use page->mapping as it may become NULL from a * concurrent truncate. diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c index 3b9e8de3500b..70b9b8548945 100644 --- a/fs/gfs2/ops_file.c +++ b/fs/gfs2/ops_file.c @@ -337,8 +337,9 @@ static int gfs2_allocate_page_backing(struct page *page) * blocks allocated on disk to back that page. 
*/ -static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page) +static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { + struct page *page = vmf->page; struct inode *inode = vma->vm_file->f_path.dentry->d_inode; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); @@ -412,6 +413,8 @@ out_unlock: gfs2_glock_dq(&gh); out: gfs2_holder_uninit(&gh); + if (ret) + ret = VM_FAULT_SIGBUS; return ret; } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 90f292b520d2..cec79392e4ba 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -451,8 +451,9 @@ const struct address_space_operations nfs_file_aops = { .launder_page = nfs_launder_page, }; -static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) +static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { + struct page *page = vmf->page; struct file *filp = vma->vm_file; struct dentry *dentry = filp->f_path.dentry; unsigned pagelen; @@ -483,6 +484,8 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) ret = pagelen; out_unlock: unlock_page(page); + if (ret) + ret = VM_FAULT_SIGBUS; return ret; } diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index eea1d24713ea..b606496b72ec 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -154,8 +154,9 @@ out: return ret; } -static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page) +static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { + struct page *page = vmf->page; struct inode *inode = vma->vm_file->f_path.dentry->d_inode; struct buffer_head *di_bh = NULL; sigset_t blocked, oldset; @@ -196,7 +197,8 @@ out: ret2 = ocfs2_vm_op_unblock_sigs(&oldset); if (ret2 < 0) mlog_errno(ret2); - + if (ret) + ret = VM_FAULT_SIGBUS; return ret; } diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 93b6de51f261..0ff89fe71e51 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1434,8 +1434,9 @@ static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags) * mmap()d file has taken write protection fault and is being made * writable. UBIFS must ensure page is budgeted for. 
*/ -static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) +static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { + struct page *page = vmf->page; struct inode *inode = vma->vm_file->f_path.dentry->d_inode; struct ubifs_info *c = inode->i_sb->s_fs_info; struct timespec now = ubifs_current_time(inode); @@ -1447,7 +1448,7 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) ubifs_assert(!(inode->i_sb->s_flags & MS_RDONLY)); if (unlikely(c->ro_media)) - return -EROFS; + return VM_FAULT_SIGBUS; /* -EROFS */ /* * We have not locked @page so far so we may budget for changing the @@ -1480,7 +1481,7 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) if (err == -ENOSPC) ubifs_warn("out of space for mmapped file " "(inode number %lu)", inode->i_ino); - return err; + return VM_FAULT_SIGBUS; } lock_page(page); @@ -1520,6 +1521,8 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) out_unlock: unlock_page(page); ubifs_release_budget(c, &req); + if (err) + err = VM_FAULT_SIGBUS; return err; } diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index e14c4e3aea0c..f4e255441574 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -234,9 +234,9 @@ xfs_file_mmap( STATIC int xfs_vm_page_mkwrite( struct vm_area_struct *vma, - struct page *page) + struct vm_fault *vmf) { - return block_page_mkwrite(vma, page, xfs_get_blocks); + return block_page_mkwrite(vma, vmf, xfs_get_blocks); } const struct file_operations xfs_file_operations = { diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index f19fd9045ea0..3d7bcde2e332 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -216,7 +216,7 @@ int cont_write_begin(struct file *, struct address_space *, loff_t, get_block_t *, loff_t *); int generic_cont_expand_simple(struct inode *inode, loff_t size); int block_commit_write(struct page *page, unsigned from, unsigned to); -int block_page_mkwrite(struct vm_area_struct *vma, struct page *page, +int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, get_block_t get_block); void block_sync_page(struct page *); sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *); diff --git a/include/linux/mm.h b/include/linux/mm.h index 2223f8dfa568..aeabe953ba4f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -135,6 +135,7 @@ extern pgprot_t protection_map[16]; #define FAULT_FLAG_WRITE 0x01 /* Fault was a write access */ #define FAULT_FLAG_NONLINEAR 0x02 /* Fault was via a nonlinear mapping */ +#define FAULT_FLAG_MKWRITE 0x04 /* Fault was mkwrite of existing pte */ /* * This interface is used by x86 PAT code to identify a pfn mapping that is @@ -187,7 +188,7 @@ struct vm_operations_struct { /* notification that a previously read-only page is about to become * writable, if an error is returned it will cause a SIGBUS */ - int (*page_mkwrite)(struct vm_area_struct *vma, struct page *page); + int (*page_mkwrite)(struct vm_area_struct *vma, struct vm_fault *vmf); /* called by access_process_vm when get_user_pages() fails, typically * for use by special VMAs that can switch between memory and hardware diff --git a/mm/memory.c b/mm/memory.c index 5b4ad5e4f98d..cf6873e91c6a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1945,6 +1945,15 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, * get_user_pages(.write=1, .force=1). 
*/ if (vma->vm_ops && vma->vm_ops->page_mkwrite) { + struct vm_fault vmf; + int tmp; + + vmf.virtual_address = (void __user *)(address & + PAGE_MASK); + vmf.pgoff = old_page->index; + vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; + vmf.page = old_page; + /* * Notify the address space that the page is about to * become writable so that it can prohibit this or wait @@ -1956,8 +1965,12 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, page_cache_get(old_page); pte_unmap_unlock(page_table, ptl); - if (vma->vm_ops->page_mkwrite(vma, old_page) < 0) + tmp = vma->vm_ops->page_mkwrite(vma, &vmf); + if (unlikely(tmp & + (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { + ret = tmp; goto unwritable_page; + } /* * Since we dropped the lock we need to revalidate @@ -2106,7 +2119,7 @@ oom: unwritable_page: page_cache_release(old_page); - return VM_FAULT_SIGBUS; + return ret; } /* @@ -2648,9 +2661,14 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, * to become writable */ if (vma->vm_ops->page_mkwrite) { + int tmp; + unlock_page(page); - if (vma->vm_ops->page_mkwrite(vma, page) < 0) { - ret = VM_FAULT_SIGBUS; + vmf.flags |= FAULT_FLAG_MKWRITE; + tmp = vma->vm_ops->page_mkwrite(vma, &vmf); + if (unlikely(tmp & + (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { + ret = tmp; anon = 1; /* no anon but release vmf.page */ goto out_unlocked; } -- cgit v1.2.3 From f4112de6b679d84bd9b9681c7504be7bdfb7c7d5 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Tue, 31 Mar 2009 15:23:25 -0700 Subject: mm: introduce debug_kmap_atomic x86 has debug_kmap_atomic_prot() which is error checking function for kmap_atomic. It is usefull for the other architectures, although it needs CONFIG_TRACE_IRQFLAGS_SUPPORT. This patch exposes it to the other architectures. Signed-off-by: Akinobu Mita Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/highmem_32.c | 45 +-------------------------------------------- include/linux/highmem.h | 12 ++++++++++++ mm/highmem.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 58 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 522db5e3d0bf..8126e8d1a2a4 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -19,49 +19,6 @@ void kunmap(struct page *page) kunmap_high(page); } -static void debug_kmap_atomic_prot(enum km_type type) -{ -#ifdef CONFIG_DEBUG_HIGHMEM - static unsigned warn_count = 10; - - if (unlikely(warn_count == 0)) - return; - - if (unlikely(in_interrupt())) { - if (in_irq()) { - if (type != KM_IRQ0 && type != KM_IRQ1 && - type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ && - type != KM_BOUNCE_READ) { - WARN_ON(1); - warn_count--; - } - } else if (!irqs_disabled()) { /* softirq */ - if (type != KM_IRQ0 && type != KM_IRQ1 && - type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 && - type != KM_SKB_SUNRPC_DATA && - type != KM_SKB_DATA_SOFTIRQ && - type != KM_BOUNCE_READ) { - WARN_ON(1); - warn_count--; - } - } - } - - if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ || - type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ) { - if (!irqs_disabled()) { - WARN_ON(1); - warn_count--; - } - } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) { - if (irq_count() == 0 && !irqs_disabled()) { - WARN_ON(1); - warn_count--; - } - } -#endif -} - /* * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because * no global lock is needed and because the kmap code must perform a global TLB @@ -81,7 +38,7 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) if (!PageHighMem(page)) return page_address(page); - debug_kmap_atomic_prot(type); + debug_kmap_atomic(type); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 13875ce9112a..7ff5c55f9b55 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -187,4 +187,16 @@ static inline void copy_highpage(struct page *to, struct page *from) kunmap_atomic(vto, KM_USER1); } +#if defined(CONFIG_DEBUG_HIGHMEM) && defined(CONFIG_TRACE_IRQFLAGS_SUPPORT) + +void debug_kmap_atomic(enum km_type type); + +#else + +static inline void debug_kmap_atomic(enum km_type type) +{ +} + +#endif + #endif /* _LINUX_HIGHMEM_H */ diff --git a/mm/highmem.c b/mm/highmem.c index 910198037bf5..68eb1d9b63fa 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -422,3 +422,48 @@ void __init page_address_init(void) } #endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */ + +#if defined(CONFIG_DEBUG_HIGHMEM) && defined(CONFIG_TRACE_IRQFLAGS_SUPPORT) + +void debug_kmap_atomic(enum km_type type) +{ + static unsigned warn_count = 10; + + if (unlikely(warn_count == 0)) + return; + + if (unlikely(in_interrupt())) { + if (in_irq()) { + if (type != KM_IRQ0 && type != KM_IRQ1 && + type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ && + type != KM_BOUNCE_READ) { + WARN_ON(1); + warn_count--; + } + } else if (!irqs_disabled()) { /* softirq */ + if (type != KM_IRQ0 && type != KM_IRQ1 && + type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 && + type != KM_SKB_SUNRPC_DATA && + type != KM_SKB_DATA_SOFTIRQ && + type != KM_BOUNCE_READ) { + WARN_ON(1); + warn_count--; + } + } + } + + if (type == KM_IRQ0 || type == KM_IRQ1 || type == 
KM_BOUNCE_READ || + type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ) { + if (!irqs_disabled()) { + WARN_ON(1); + warn_count--; + } + } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) { + if (irq_count() == 0 && !irqs_disabled()) { + WARN_ON(1); + warn_count--; + } + } +} + +#endif -- cgit v1.2.3 From 33925b25d2c00a29664f1994ab350a9bff70f7a2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 31 Mar 2009 15:23:26 -0700 Subject: nommu: there is no mlock() for NOMMU, so don't provide the bits The mlock() facility does not exist for NOMMU since all mappings are effectively locked anyway, so we don't make the bits available when they're not useful. Signed-off-by: David Howells Reviewed-by: KOSAKI Motohiro Cc: Peter Zijlstra Cc: Greg Ungerer Cc: Johannes Weiner Cc: Rik van Riel Cc: Lee Schermerhorn Cc: Enrik Berkhan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 20 +++++++++++++------- mm/Kconfig | 8 ++++++++ mm/internal.h | 8 +++++--- 3 files changed, 26 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 219a523ecdb0..61df1779b2a5 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -96,6 +96,8 @@ enum pageflags { PG_swapbacked, /* Page is backed by RAM/swap */ #ifdef CONFIG_UNEVICTABLE_LRU PG_unevictable, /* Page is "unevictable" */ +#endif +#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT PG_mlocked, /* Page is vma mlocked */ #endif #ifdef CONFIG_IA64_UNCACHED_ALLOCATOR @@ -234,20 +236,20 @@ PAGEFLAG_FALSE(SwapCache) #ifdef CONFIG_UNEVICTABLE_LRU PAGEFLAG(Unevictable, unevictable) __CLEARPAGEFLAG(Unevictable, unevictable) TESTCLEARFLAG(Unevictable, unevictable) +#else +PAGEFLAG_FALSE(Unevictable) TESTCLEARFLAG_FALSE(Unevictable) + SETPAGEFLAG_NOOP(Unevictable) CLEARPAGEFLAG_NOOP(Unevictable) + __CLEARPAGEFLAG_NOOP(Unevictable) +#endif +#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT #define MLOCK_PAGES 1 PAGEFLAG(Mlocked, mlocked) __CLEARPAGEFLAG(Mlocked, mlocked) TESTSCFLAG(Mlocked, mlocked) - #else - #define MLOCK_PAGES 0 PAGEFLAG_FALSE(Mlocked) SETPAGEFLAG_NOOP(Mlocked) TESTCLEARFLAG_FALSE(Mlocked) - -PAGEFLAG_FALSE(Unevictable) TESTCLEARFLAG_FALSE(Unevictable) - SETPAGEFLAG_NOOP(Unevictable) CLEARPAGEFLAG_NOOP(Unevictable) - __CLEARPAGEFLAG_NOOP(Unevictable) #endif #ifdef CONFIG_IA64_UNCACHED_ALLOCATOR @@ -367,9 +369,13 @@ static inline void __ClearPageTail(struct page *page) #ifdef CONFIG_UNEVICTABLE_LRU #define __PG_UNEVICTABLE (1 << PG_unevictable) -#define __PG_MLOCKED (1 << PG_mlocked) #else #define __PG_UNEVICTABLE 0 +#endif + +#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT +#define __PG_MLOCKED (1 << PG_mlocked) +#else #define __PG_MLOCKED 0 #endif diff --git a/mm/Kconfig b/mm/Kconfig index a5b77811fdf2..8c895973dfba 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -214,5 +214,13 @@ config UNEVICTABLE_LRU will use one page flag and increase the code size a little, say Y unless you know what you are doing. 
+config HAVE_MLOCK + bool + default y if MMU=y + +config HAVE_MLOCKED_PAGE_BIT + bool + default y if HAVE_MLOCK=y && UNEVICTABLE_LRU=y + config MMU_NOTIFIER bool diff --git a/mm/internal.h b/mm/internal.h index 478223b73a2a..987bb03fbdd8 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -63,6 +63,7 @@ static inline unsigned long page_order(struct page *page) return page_private(page); } +#ifdef CONFIG_HAVE_MLOCK extern long mlock_vma_pages_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); extern void munlock_vma_pages_range(struct vm_area_struct *vma, @@ -71,6 +72,7 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) { munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end); } +#endif #ifdef CONFIG_UNEVICTABLE_LRU /* @@ -90,7 +92,7 @@ static inline void unevictable_migrate_page(struct page *new, struct page *old) } #endif -#ifdef CONFIG_UNEVICTABLE_LRU +#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT /* * Called only in fault path via page_evictable() for a new page * to determine if it's being mapped into a LOCKED vma. @@ -165,7 +167,7 @@ static inline void free_page_mlock(struct page *page) } } -#else /* CONFIG_UNEVICTABLE_LRU */ +#else /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) { return 0; @@ -175,7 +177,7 @@ static inline void mlock_vma_page(struct page *page) { } static inline void mlock_migrate_page(struct page *new, struct page *old) { } static inline void free_page_mlock(struct page *page) { } -#endif /* CONFIG_UNEVICTABLE_LRU */ +#endif /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ /* * Return the mem_map entry representing the 'offset' subpage within -- cgit v1.2.3 From 327c0e968645f2601a43f5ea7c19c7b3a5fa0a34 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 31 Mar 2009 15:23:31 -0700 Subject: vmscan: fix it to take care of nodemask try_to_free_pages() is used for the direct reclaim of up to SWAP_CLUSTER_MAX pages when watermarks are low. The caller to alloc_pages_nodemask() can specify a nodemask of nodes that are allowed to be used but this is not passed to try_to_free_pages(). This can lead to unnecessary reclaim of pages that are unusable by the caller and in the worst case lead to allocation failure as progress was not being made where it is needed. This patch passes the nodemask used for alloc_pages_nodemask() to try_to_free_pages(). 
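For illustration only, a minimal sketch (the wrapper name below is invented and is not part of this patch): with the new prototype a direct-reclaim caller simply forwards the nodemask it used for the allocation attempt, and passing NULL keeps the old behaviour of scanning every node.

#include <linux/swap.h>
#include <linux/gfp.h>
#include <linux/nodemask.h>

/*
 * Hypothetical wrapper, shown only to illustrate the new calling
 * convention: a NULL nodemask means "all nodes may be scanned",
 * which matches the behaviour before this patch.
 */
static unsigned long reclaim_within_mask(struct zonelist *zonelist, int order,
					 gfp_t gfp_mask, nodemask_t *allowed)
{
	return try_to_free_pages(zonelist, order, gfp_mask, allowed);
}
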
Reviewed-by: KOSAKI Motohiro Acked-by: Mel Gorman Signed-off-by: KAMEZAWA Hiroyuki Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 2 +- include/linux/swap.h | 2 +- mm/page_alloc.c | 3 ++- mm/vmscan.c | 13 +++++++++++-- 4 files changed, 15 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/buffer.c b/fs/buffer.c index 0c14f8d52ee5..c77b848c3d43 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -290,7 +290,7 @@ static void free_more_memory(void) &zone); if (zone) try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0, - GFP_NOFS); + GFP_NOFS, NULL); } } diff --git a/include/linux/swap.h b/include/linux/swap.h index d30215578877..b8b0c4ce83e6 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -212,7 +212,7 @@ static inline void lru_cache_add_active_file(struct page *page) /* linux/mm/vmscan.c */ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, - gfp_t gfp_mask); + gfp_t gfp_mask, nodemask_t *mask); extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem, gfp_t gfp_mask, bool noswap, unsigned int swappiness); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cbd532161f68..0284e528748d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1582,7 +1582,8 @@ nofail_alloc: reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; - did_some_progress = try_to_free_pages(zonelist, order, gfp_mask); + did_some_progress = try_to_free_pages(zonelist, order, + gfp_mask, nodemask); p->reclaim_state = NULL; lockdep_clear_current_reclaim_state(); diff --git a/mm/vmscan.c b/mm/vmscan.c index f4619c6cd59e..06e72693b458 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -78,6 +78,12 @@ struct scan_control { /* Which cgroup do we reclaim from */ struct mem_cgroup *mem_cgroup; + /* + * Nodemask of nodes allowed by the caller. If NULL, all nodes + * are scanned. + */ + nodemask_t *nodemask; + /* Pluggable isolate pages callback */ unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst, unsigned long *scanned, int order, int mode, @@ -1538,7 +1544,8 @@ static void shrink_zones(int priority, struct zonelist *zonelist, struct zone *zone; sc->all_unreclaimable = 1; - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { + for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, + sc->nodemask) { if (!populated_zone(zone)) continue; /* @@ -1683,7 +1690,7 @@ out: } unsigned long try_to_free_pages(struct zonelist *zonelist, int order, - gfp_t gfp_mask) + gfp_t gfp_mask, nodemask_t *nodemask) { struct scan_control sc = { .gfp_mask = gfp_mask, @@ -1694,6 +1701,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, .order = order, .mem_cgroup = NULL, .isolate_pages = isolate_pages_global, + .nodemask = nodemask, }; return do_try_to_free_pages(zonelist, &sc); @@ -1714,6 +1722,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, .order = 0, .mem_cgroup = mem_cont, .isolate_pages = mem_cgroup_isolate_pages, + .nodemask = NULL, /* we don't care the placement */ }; struct zonelist *zonelist; -- cgit v1.2.3 From 9fab5619bdd7f84cdd22cc760778f759f9819a33 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 31 Mar 2009 15:23:33 -0700 Subject: shmem: writepage directly to swap Synopsis: if shmem_writepage calls swap_writepage directly, most shmem swap loads benefit, and a catastrophic interaction between SLUB and some flash storage is avoided. 
shmem_writepage() has always been peculiar in making no attempt to write: it has just transferred a shmem page from file cache to swap cache, then let that page make its way around the LRU again before being written and freed. The idea was that people use tmpfs because they want those pages to stay in RAM; so although we give it an overflow to swap, we should resist writing too soon, giving those pages a second chance before they can be reclaimed. That was always questionable, and I've toyed with this patch for years; but never had a clear justification to depart from the original design. It became more questionable in 2.6.28, when the split LRU patches classed shmem and tmpfs pages as SwapBacked rather than as file_cache: that in itself gives them more resistance to reclaim than normal file pages. I prepared this patch for 2.6.29, but the merge window arrived before I'd completed gathering statistics to justify sending it in. Then while comparing SLQB against SLUB, running SLUB on a laptop I'd habitually used with SLAB, I found SLUB to run my tmpfs kbuild swapping tests five times slower than SLAB or SLQB - other machines slower too, but nowhere near so bad. Simpler "cp -a" swapping tests showed the same. slub_max_order=0 brings sanity to all, but heavy swapping is too far from normal to justify such a tuning. The crucial factor on that laptop turns out to be that I'm using an SD card for swap. What happens is this: By default, SLUB uses order-2 pages for shmem_inode_cache (and many other fs inodes), so creating tmpfs files under memory pressure brings lumpy reclaim into play. One subpage of the order is chosen from the bottom of the LRU as usual, then the other three picked out from their random positions on the LRUs. In a tmpfs load, many of these pages will be ones which already passed through shmem_writepage, so already have swap allocated. And though their offsets on swap were probably allocated sequentially, now that the pages are picked off at random, their swap offsets are scattered. But the flash storage on the SD card is very sensitive to having its writes merged: once swap is written at scattered offsets, performance falls apart. Rotating disk seeks increase too, but less disastrously. So: stop giving shmem/tmpfs pages a second pass around the LRU, write them out to swap as soon as their swap has been allocated. It's surely possible to devise an artificial load which runs faster the old way, one whose sizing is such that the tmpfs pages on their second pass are the ones that are wanted again, and other pages not. But I've not yet found such a load: on all machines, under the loads I've tried, immediate swap_writepage speeds up shmem swapping: especially when using the SLUB allocator (and more effectively than slub_max_order=0), but also with the others; and it also reduces the variance between runs. How much faster varies widely: a factor of five is rare, 5% is common. One load which might have suffered: imagine a swapping shmem load in a limited mem_cgroup on a machine with plenty of memory. Before 2.6.29 the swapcache was not charged, and such a load would have run quickest with the shmem swapcache never written to swap. But now swapcache is charged, so even this load benefits from shmem_writepage directly to swap. Apologies for the #ifndef CONFIG_SWAP swap_writepage() stub in swap.h: it's silly because that will never get called; but refactoring shmem.c sensibly according to CONFIG_SWAP will be a separate task. 
Signed-off-by: Hugh Dickins Acked-by: Pekka Enberg Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 5 +++++ mm/shmem.c | 3 +-- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index b8b0c4ce83e6..62d81435347a 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -382,6 +382,11 @@ static inline struct page *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask, return NULL; } +static inline int swap_writepage(struct page *p, struct writeback_control *wbc) +{ + return 0; +} + static inline struct page *lookup_swap_cache(swp_entry_t swp) { return NULL; diff --git a/mm/shmem.c b/mm/shmem.c index 7ec78e24a30d..d94d2e9146bc 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1068,8 +1068,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) swap_duplicate(swap); BUG_ON(page_mapped(page)); page_cache_release(page); /* pagecache ref */ - set_page_dirty(page); - unlock_page(page); + swap_writepage(page, wbc); if (inode) { mutex_lock(&shmem_swaplist_mutex); /* move instead of add in case we're racing */ -- cgit v1.2.3 From a8af78982ff4c0b3731527b0217d286a343a3089 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 31 Mar 2009 15:23:37 -0700 Subject: pm: rework includes, remove arch ifdefs Make the following header file changes: - remove arch ifdefs and asm/suspend.h from linux/suspend.h - add asm/suspend.h to disk.c (for arch_prepare_suspend()) - add linux/io.h to swsusp.c (for ioremap()) - x86 32/64 bit compile fixes Signed-off-by: Magnus Damm Cc: Paul Mundt Acked-by: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/asm-offsets_32.c | 1 + arch/x86/kernel/asm-offsets_64.c | 1 + arch/x86/power/cpu_32.c | 1 + arch/x86/power/cpu_64.c | 1 + arch/x86/power/hibernate_64.c | 1 + include/linux/suspend.h | 3 --- kernel/power/disk.c | 1 + kernel/power/swsusp.c | 1 + 8 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index fbf2f33e3080..5a6aa1c1162f 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -18,6 +18,7 @@ #include #include #include +#include #include diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 8793ab33e2c1..e72f062fb4b5 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -16,6 +16,7 @@ #include #include #include +#include #include diff --git a/arch/x86/power/cpu_32.c b/arch/x86/power/cpu_32.c index 274d06082f48..ce702c5b3a2c 100644 --- a/arch/x86/power/cpu_32.c +++ b/arch/x86/power/cpu_32.c @@ -12,6 +12,7 @@ #include #include #include +#include static struct saved_context saved_context; diff --git a/arch/x86/power/cpu_64.c b/arch/x86/power/cpu_64.c index e3b6cf70d62c..5343540f2607 100644 --- a/arch/x86/power/cpu_64.c +++ b/arch/x86/power/cpu_64.c @@ -15,6 +15,7 @@ #include #include #include +#include static void fix_processor_context(void); diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index 6dd000dd7933..65fdc86e923f 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -14,6 +14,7 @@ #include #include #include +#include /* References to section boundaries */ extern const void __nosave_begin, __nosave_end; diff --git a/include/linux/suspend.h b/include/linux/suspend.h index c7d9bb1832ba..3e3a4364cbff 
100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -1,9 +1,6 @@ #ifndef _LINUX_SUSPEND_H #define _LINUX_SUSPEND_H -#if defined(CONFIG_X86) || defined(CONFIG_FRV) || defined(CONFIG_PPC32) || defined(CONFIG_PPC64) -#include -#endif #include #include #include diff --git a/kernel/power/disk.c b/kernel/power/disk.c index e886d1332a10..f3db382c2b2d 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "power.h" diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 1ee6636414b2..78c35047586d 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -51,6 +51,7 @@ #include #include #include +#include #include "power.h" -- cgit v1.2.3 From 53d6660836f233df66490707365ab177e5fb2bb4 Mon Sep 17 00:00:00 2001 From: "J. R. Okajima" Date: Tue, 31 Mar 2009 15:23:43 -0700 Subject: loop: add ioctl to resize a loop device Add the ability to 'resize' the loop device on the fly. One practical application is a loop file with XFS filesystem, already mounted: You can easily enlarge the file (append some bytes) and then call ioctl(fd, LOOP_SET_CAPACITY, new); The loop driver will learn about the new size and you can use xfs_growfs later on, which will allow you to use full capacity of the loop file without the need to unmount. Test app: #include #include #include #include #include #include #include #include #include #include #include #define _GNU_SOURCE #include char *me; void usage(FILE *f) { fprintf(f, "%s [options] loop_dev [backend_file]\n" "-s, --set new_size_in_bytes\n" "\twhen backend_file is given, " "it will be expanded too while keeping the original contents\n", me); } struct option opts[] = { { .name = "set", .has_arg = 1, .flag = NULL, .val = 's' }, { .name = "help", .has_arg = 0, .flag = NULL, .val = 'h' } }; void err_size(char *name, __u64 old) { fprintf(stderr, "size must be larger than current %s (%llu)\n", name, old); } int main(int argc, char *argv[]) { int fd, err, c, i, bfd; ssize_t ssz; size_t sz; __u64 old, new, append; char a[BUFSIZ]; struct stat st; FILE *out; char *backend, *dev; err = EINVAL; out = stderr; me = argv[0]; new = 0; while ((c = getopt_long(argc, argv, "s:h", opts, &i)) != -1) { switch (c) { case 's': errno = 0; new = strtoull(optarg, NULL, 0); if (errno) { err = errno; perror(argv[i]); goto out; } break; case 'h': err = 0; out = stdout; goto err; default: perror(argv[i]); goto err; } } if (optind < argc) dev = argv[optind++]; else goto err; fd = open(dev, O_RDONLY); if (fd < 0) { err = errno; perror(dev); goto out; } err = ioctl(fd, BLKGETSIZE64, &old); if (err) { err = errno; perror("ioctl BLKGETSIZE64"); goto out; } if (!new) { printf("%llu\n", old); goto out; } if (new < old) { err = EINVAL; err_size(dev, old); goto out; } if (optind < argc) { backend = argv[optind++]; bfd = open(backend, O_WRONLY|O_APPEND); if (bfd < 0) { err = errno; perror(backend); goto out; } err = fstat(bfd, &st); if (err) { err = errno; perror(backend); goto out; } if (new < st.st_size) { err = EINVAL; err_size(backend, st.st_size); goto out; } append = new - st.st_size; sz = sizeof(a); while (append > 0) { if (append < sz) sz = append; ssz = write(bfd, a, sz); if (ssz != sz) { err = errno; perror(backend); goto out; } append -= sz; } err = fsync(bfd); if (err) { err = errno; perror(backend); goto out; } } err = ioctl(fd, LOOP_SET_CAPACITY, new); if (err) { err = errno; perror("ioctl LOOP_SET_CAPACITY"); } goto out; err: usage(out); out: return err; } Signed-off-by: J. R. 
Okajima Signed-off-by: Tomas Matejicek Cc: Cc: Karel Zak Cc: Jens Axboe Cc: Al Viro Cc: Christoph Hellwig Cc: Akinobu Mita Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/loop.c | 30 ++++++++++++++++++++++++++++++ include/linux/loop.h | 1 + 2 files changed, 31 insertions(+) (limited to 'include/linux') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 2621ed2ce6d2..40b17d3b55a1 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1192,6 +1192,30 @@ loop_get_status64(struct loop_device *lo, struct loop_info64 __user *arg) { return err; } +static int loop_set_capacity(struct loop_device *lo, struct block_device *bdev) +{ + int err; + sector_t sec; + loff_t sz; + + err = -ENXIO; + if (unlikely(lo->lo_state != Lo_bound)) + goto out; + err = figure_loop_size(lo); + if (unlikely(err)) + goto out; + sec = get_capacity(lo->lo_disk); + /* the width of sector_t may be narrow for bit-shift */ + sz = sec; + sz <<= 9; + mutex_lock(&bdev->bd_mutex); + bd_set_size(bdev, sz); + mutex_unlock(&bdev->bd_mutex); + + out: + return err; +} + static int lo_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { @@ -1224,6 +1248,11 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode, case LOOP_GET_STATUS64: err = loop_get_status64(lo, (struct loop_info64 __user *) arg); break; + case LOOP_SET_CAPACITY: + err = -EPERM; + if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) + err = loop_set_capacity(lo, bdev); + break; default: err = lo->ioctl ? lo->ioctl(lo, cmd, arg) : -EINVAL; } @@ -1371,6 +1400,7 @@ static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode, lo, (struct compat_loop_info __user *) arg); mutex_unlock(&lo->lo_ctl_mutex); break; + case LOOP_SET_CAPACITY: case LOOP_CLR_FD: case LOOP_GET_STATUS64: case LOOP_SET_STATUS64: diff --git a/include/linux/loop.h b/include/linux/loop.h index 6ffd6db5bb0d..40725447f5e0 100644 --- a/include/linux/loop.h +++ b/include/linux/loop.h @@ -160,5 +160,6 @@ int loop_unregister_transfer(int number); #define LOOP_SET_STATUS64 0x4C04 #define LOOP_GET_STATUS64 0x4C05 #define LOOP_CHANGE_FD 0x4C06 +#define LOOP_SET_CAPACITY 0x4C07 #endif -- cgit v1.2.3 From c2d7543851849a6923680cdd7e1047ed1a84a1c5 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 31 Mar 2009 15:23:46 -0700 Subject: filesystem freeze: allow SysRq emergency thaw to thaw frozen filesystems Now that the filesystem freeze operation has been elevated to the VFS, and is just an ioctl away, some sort of safety net for unintentionally frozen root filesystems may be in order. The timeout thaw originally proposed did not get merged, but perhaps something like this would be useful in emergencies. For example, freeze /path/to/mountpoint may freeze your root filesystem if you forgot that you had that unmounted. I chose 'j' as the last remaining character other than 'h' which is sort of reserved for help (because help is generated on any unknown character). I've tested this on a non-root fs with multiple (nested) freezers, as well as on a system rendered unresponsive due to a frozen root fs. 
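A small userspace illustration (not part of this patch, error handling kept minimal): the new 'j' key can be driven through /proc/sysrq-trigger, the interface already described at the top of Documentation/sysrq.txt.

#include <fcntl.h>
#include <unistd.h>

/* Ask the kernel for an emergency thaw of all frozen filesystems. */
int main(void)
{
	int fd = open("/proc/sysrq-trigger", O_WRONLY);

	if (fd < 0)
		return 1;
	if (write(fd, "j", 1) != 1) {	/* 'j' == "Just thaw it" */
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}
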
[randy.dunlap@oracle.com: emergency thaw only if CONFIG_BLOCK enabled] Signed-off-by: Eric Sandeen Cc: Takashi Sato Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysrq.txt | 5 +++++ drivers/char/sysrq.c | 19 ++++++++++++++++++- fs/buffer.c | 33 +++++++++++++++++++++++++++++++++ include/linux/fs.h | 1 + 4 files changed, 57 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt index 9e592c718afb..afa2946892da 100644 --- a/Documentation/sysrq.txt +++ b/Documentation/sysrq.txt @@ -81,6 +81,8 @@ On all - write a character to /proc/sysrq-trigger. e.g.: 'i' - Send a SIGKILL to all processes, except for init. +'j' - Forcibly "Just thaw it" - filesystems frozen by the FIFREEZE ioctl. + 'k' - Secure Access Key (SAK) Kills all programs on the current virtual console. NOTE: See important comments below in SAK section. @@ -160,6 +162,9 @@ t'E'rm and k'I'll are useful if you have some sort of runaway process you are unable to kill any other way, especially if it's spawning other processes. +"'J'ust thaw it" is useful if your system becomes unresponsive due to a frozen +(probably root) filesystem via the FIFREEZE ioctl. + * Sometimes SysRq seems to get 'stuck' after using it, what can I do? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ That happens to me, also. I've found that tapping shift, alt, and control diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c index 33a9351c896d..5afe7316c72e 100644 --- a/drivers/char/sysrq.c +++ b/drivers/char/sysrq.c @@ -346,6 +346,19 @@ static struct sysrq_key_op sysrq_moom_op = { .enable_mask = SYSRQ_ENABLE_SIGNAL, }; +#ifdef CONFIG_BLOCK +static void sysrq_handle_thaw(int key, struct tty_struct *tty) +{ + emergency_thaw_all(); +} +static struct sysrq_key_op sysrq_thaw_op = { + .handler = sysrq_handle_thaw, + .help_msg = "thaw-filesystems(J)", + .action_msg = "Emergency Thaw of all frozen filesystems", + .enable_mask = SYSRQ_ENABLE_SIGNAL, +}; +#endif + static void sysrq_handle_kill(int key, struct tty_struct *tty) { send_sig_all(SIGKILL); @@ -396,9 +409,13 @@ static struct sysrq_key_op *sysrq_key_table[36] = { &sysrq_moom_op, /* f */ /* g: May be registered by ppc for kgdb */ NULL, /* g */ - NULL, /* h */ + NULL, /* h - reserved for help */ &sysrq_kill_op, /* i */ +#ifdef CONFIG_BLOCK + &sysrq_thaw_op, /* j */ +#else NULL, /* j */ +#endif &sysrq_SAK_op, /* k */ #ifdef CONFIG_SMP &sysrq_showallcpus_op, /* l */ diff --git a/fs/buffer.c b/fs/buffer.c index c77b848c3d43..f5f8b15a6e40 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -547,6 +547,39 @@ repeat: return err; } +void do_thaw_all(unsigned long unused) +{ + struct super_block *sb; + char b[BDEVNAME_SIZE]; + + spin_lock(&sb_lock); +restart: + list_for_each_entry(sb, &super_blocks, s_list) { + sb->s_count++; + spin_unlock(&sb_lock); + down_read(&sb->s_umount); + while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb)) + printk(KERN_WARNING "Emergency Thaw on %s\n", + bdevname(sb->s_bdev, b)); + up_read(&sb->s_umount); + spin_lock(&sb_lock); + if (__put_super_and_need_restart(sb)) + goto restart; + } + spin_unlock(&sb_lock); + printk(KERN_WARNING "Emergency Thaw complete\n"); +} + +/** + * emergency_thaw_all -- forcibly thaw every frozen filesystem + * + * Used for emergency unfreeze of all filesystems via SysRq + */ +void emergency_thaw_all(void) +{ + pdflush_operation(do_thaw_all, 0); +} + /** * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers 
* @mapping: the mapping which wants those buffers written diff --git a/include/linux/fs.h b/include/linux/fs.h index 87e7bfc5ebd7..61211ad823fe 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1878,6 +1878,7 @@ extern struct block_device *open_by_devnum(dev_t, fmode_t); extern void invalidate_bdev(struct block_device *); extern int sync_blockdev(struct block_device *bdev); extern struct super_block *freeze_bdev(struct block_device *); +extern void emergency_thaw_all(void); extern int thaw_bdev(struct block_device *bdev, struct super_block *sb); extern int fsync_bdev(struct block_device *); extern int fsync_super(struct super_block *); -- cgit v1.2.3 From 311d07611e8b354cc1ee6546e4c574c01111adc8 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Tue, 31 Mar 2009 15:23:51 -0700 Subject: introduce pr_cont() macro We cover all log-levels by pr_... macros except KERN_CONT one. Add it for convenience. Signed-off-by: Cyrill Gorcunov Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Harvey Harrison Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index f81d80f47dcb..e720b0da7751 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -353,6 +353,8 @@ static inline char *pack_hex_byte(char *buf, u8 byte) printk(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__) #define pr_info(fmt, ...) \ printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) +#define pr_cont(fmt, ...) \ + printk(KERN_CONT fmt, ##__VA_ARGS__) /* If you are writing a driver, please use dev_dbg instead */ #if defined(DEBUG) -- cgit v1.2.3 From bcd0b235bf3808dec5115c381cd55568f63b85f0 Mon Sep 17 00:00:00 2001 From: Davide Libenzi Date: Tue, 31 Mar 2009 15:24:18 -0700 Subject: eventfd: improve support for semaphore-like behavior People started using eventfd in a semaphore-like way where before they were using pipes. That is, counter-based resource access. Where a "wait()" returns immediately by decrementing the counter by one, if counter is greater than zero. Otherwise will wait. And where a "post(count)" will add count to the counter releasing the appropriate amount of waiters. If eventfd the "post" (write) part is fine, while the "wait" (read) does not dequeue 1, but the whole counter value. The problem with eventfd is that a read() on the fd returns and wipes the whole counter, making the use of it as semaphore a little bit more cumbersome. You can do a read() followed by a write() of COUNTER-1, but IMO it's pretty easy and cheap to make this work w/out extra steps. This patch introduces a new eventfd flag that tells eventfd to only dequeue 1 from the counter, allowing simple read/write to make it behave like a semaphore. Simple test here: http://www.xmailserver.org/eventfd-sem.c To be back-compatible with earlier kernels, userspace applications should probe for the availability of this feature via #ifdef EFD_SEMAPHORE fd = eventfd2 (CNT, EFD_SEMAPHORE); if (fd == -1 && errno == EINVAL) #else #endif Signed-off-by: Davide Libenzi Cc: Tested-by: Michael Kerrisk Cc: Ulrich Drepper Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventfd.c | 20 +++++++++++--------- include/linux/eventfd.h | 12 +++++++++++- 2 files changed, 22 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/fs/eventfd.c b/fs/eventfd.c index 5de2c2db3aa2..91c0829a7035 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -28,6 +28,7 @@ struct eventfd_ctx { * issue a wakeup. 
*/ __u64 count; + unsigned int flags; }; /* @@ -87,22 +88,20 @@ static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count, { struct eventfd_ctx *ctx = file->private_data; ssize_t res; - __u64 ucnt; + __u64 ucnt = 0; DECLARE_WAITQUEUE(wait, current); if (count < sizeof(ucnt)) return -EINVAL; spin_lock_irq(&ctx->wqh.lock); res = -EAGAIN; - ucnt = ctx->count; - if (ucnt > 0) + if (ctx->count > 0) res = sizeof(ucnt); else if (!(file->f_flags & O_NONBLOCK)) { __add_wait_queue(&ctx->wqh, &wait); for (res = 0;;) { set_current_state(TASK_INTERRUPTIBLE); if (ctx->count > 0) { - ucnt = ctx->count; res = sizeof(ucnt); break; } @@ -117,8 +116,9 @@ static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count, __remove_wait_queue(&ctx->wqh, &wait); __set_current_state(TASK_RUNNING); } - if (res > 0) { - ctx->count = 0; + if (likely(res > 0)) { + ucnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count; + ctx->count -= ucnt; if (waitqueue_active(&ctx->wqh)) wake_up_locked(&ctx->wqh); } @@ -166,7 +166,7 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c __remove_wait_queue(&ctx->wqh, &wait); __set_current_state(TASK_RUNNING); } - if (res > 0) { + if (likely(res > 0)) { ctx->count += ucnt; if (waitqueue_active(&ctx->wqh)) wake_up_locked(&ctx->wqh); @@ -207,7 +207,7 @@ SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags) BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC); BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK); - if (flags & ~(EFD_CLOEXEC | EFD_NONBLOCK)) + if (flags & ~EFD_FLAGS_SET) return -EINVAL; ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); @@ -216,13 +216,14 @@ SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags) init_waitqueue_head(&ctx->wqh); ctx->count = count; + ctx->flags = flags; /* * When we call this, the initialization must be complete, since * anon_inode_getfd() will install the fd. */ fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx, - flags & (O_CLOEXEC | O_NONBLOCK)); + flags & EFD_SHARED_FCNTL_FLAGS); if (fd < 0) kfree(ctx); return fd; @@ -232,3 +233,4 @@ SYSCALL_DEFINE1(eventfd, unsigned int, count) { return sys_eventfd2(count, 0); } + diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h index a667637b54e3..f45a8ae5f828 100644 --- a/include/linux/eventfd.h +++ b/include/linux/eventfd.h @@ -13,10 +13,20 @@ /* For O_CLOEXEC and O_NONBLOCK */ #include -/* Flags for eventfd2. */ +/* + * CAREFUL: Check include/asm-generic/fcntl.h when defining + * new flags, since they might collide with O_* ones. We want + * to re-use O_* flags that couldn't possibly have a meaning + * from eventfd, in order to leave a free define-space for + * shared O_* flags. + */ +#define EFD_SEMAPHORE (1 << 0) #define EFD_CLOEXEC O_CLOEXEC #define EFD_NONBLOCK O_NONBLOCK +#define EFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK) +#define EFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS | EFD_SEMAPHORE) + struct file *eventfd_fget(int fd); int eventfd_signal(struct file *file, int n); -- cgit v1.2.3 From 4ede816ac36e027db5fe0051ad9c73f76db63772 Mon Sep 17 00:00:00 2001 From: Davide Libenzi Date: Tue, 31 Mar 2009 15:24:20 -0700 Subject: epoll keyed wakeups: add __wake_up_locked_key() and __wake_up_sync_key() This patchset introduces wakeup hints for some of the most popular (from epoll POV) devices, so that epoll code can avoid spurious wakeups on its waiters. The problem with epoll is that the callback-based wakeups do not, ATM, carry any information about the events the wakeup is related to. 
So the only choice epoll has (not being able to call f_op->poll() from inside the callback), is to add the file* to a ready-list and resolve the real events later on, at epoll_wait() (or its own f_op->poll()) time. This can cause spurious wakeups, since the wake_up() itself might be for an event the caller is not interested into. The rate of these spurious wakeup can be pretty high in case of many network sockets being monitored. By allowing devices to report the events the wakeups refer to (at least the two major classes - POLLIN/POLLOUT), we are able to spare useless wakeups by proper handling inside the epoll's poll callback. Epoll will have in any case to call f_op->poll() on the file* later on, since the change to be done in order to have the full event set sent via wakeup, is too invasive for the way our f_op->poll() system works (the full event set is calculated inside the poll function - there are too many of them to even start thinking the change - also poll/select would need change too). Epoll is changed in a way that both devices which send event hints, and the ones that don't, are correctly handled. The former will gain some efficiency though. As a general rule for devices, would be to add an event mask by using key-aware wakeup macros, when making up poll wait queues. I tested it (together with the epoll's poll fix patch Andrew has in -mm) and wakeups for the supported devices are correctly filtered. Test program available here: http://www.xmailserver.org/epoll_test.c This patch: Nothing revolutionary here. Just using the available "key" that our wakeup core already support. The __wake_up_locked_key() was no brainer, since both __wake_up_locked() and __wake_up_locked_key() are thin wrappers around __wake_up_common(). The __wake_up_sync() function had a body, so the choice was between borrowing the body for __wake_up_sync_key() and calling it from __wake_up_sync(), or make an inline and calling it from both. I chose the former since in most archs it all resolves to "mov $0, REG; jmp ADDR". 
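A minimal sketch of the intended use (the driver function below is made up for illustration and is not part of this patch): a device that just became readable passes the event class as the wakeup key, so key-aware waiters such as epoll can discard wakeups for events they are not interested in.

#include <linux/wait.h>
#include <linux/sched.h>
#include <linux/poll.h>
#include <linux/spinlock.h>

/* Hypothetical driver fragment: report "readable" along with the wakeup. */
static void mydev_data_ready(wait_queue_head_t *wq)
{
	unsigned long flags;

	spin_lock_irqsave(&wq->lock, flags);
	__wake_up_locked_key(wq, TASK_INTERRUPTIBLE, (void *) POLLIN);
	spin_unlock_irqrestore(&wq->lock, flags);
}
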
Signed-off-by: Davide Libenzi Cc: Alan Cox Cc: Ingo Molnar Cc: David Miller Cc: William Lee Irwin III Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/wait.h | 7 +++++-- kernel/sched.c | 23 +++++++++++++++++++---- 2 files changed, 24 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/wait.h b/include/linux/wait.h index a210ede73b56..0d2eeb03a718 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -135,8 +135,11 @@ static inline void __remove_wait_queue(wait_queue_head_t *head, void __wake_up_common(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, int sync, void *key); void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key); -extern void __wake_up_locked(wait_queue_head_t *q, unsigned int mode); -extern void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr); +void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key); +void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr, + void *key); +void __wake_up_locked(wait_queue_head_t *q, unsigned int mode); +void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr); void __wake_up_bit(wait_queue_head_t *, void *, int); int __wait_on_bit(wait_queue_head_t *, struct wait_bit_queue *, int (*)(void *), unsigned); int __wait_on_bit_lock(wait_queue_head_t *, struct wait_bit_queue *, int (*)(void *), unsigned); diff --git a/kernel/sched.c b/kernel/sched.c index 196d48babbef..73513f4e19df 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5196,11 +5196,17 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) __wake_up_common(q, mode, 1, 0, NULL); } +void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) +{ + __wake_up_common(q, mode, 1, 0, key); +} + /** - * __wake_up_sync - wake up threads blocked on a waitqueue. + * __wake_up_sync_key - wake up threads blocked on a waitqueue. * @q: the waitqueue * @mode: which threads * @nr_exclusive: how many wake-one or wake-many threads to wake up + * @key: opaque value to be passed to wakeup targets * * The sync wakeup differs that the waker knows that it will schedule * away soon, so while the target thread will be woken up, it will not @@ -5209,8 +5215,8 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) * * On UP it can prevent extra preemption. */ -void -__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) +void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, + int nr_exclusive, void *key) { unsigned long flags; int sync = 1; @@ -5222,9 +5228,18 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) sync = 0; spin_lock_irqsave(&q->lock, flags); - __wake_up_common(q, mode, nr_exclusive, sync, NULL); + __wake_up_common(q, mode, nr_exclusive, sync, key); spin_unlock_irqrestore(&q->lock, flags); } +EXPORT_SYMBOL_GPL(__wake_up_sync_key); + +/* + * __wake_up_sync - see __wake_up_sync_key() + */ +void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) +{ + __wake_up_sync_key(q, mode, nr_exclusive, NULL); +} EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ /** -- cgit v1.2.3 From c0da37753695e010776ccf2200a5731e0f88a9f3 Mon Sep 17 00:00:00 2001 From: Davide Libenzi Date: Tue, 31 Mar 2009 15:24:20 -0700 Subject: epoll keyed wakeups: introduce new *_poll() wakeup macros Introduce new wakeup macros that allow passing an event mask to the wakeup targets. 
They exactly mimic their non-_poll() counterpart, with the added event mask passing capability. I did add only the ones currently requested, avoiding the _nr() and _all() for the moment. Signed-off-by: Davide Libenzi Cc: Alan Cox Cc: Ingo Molnar Cc: David Miller Cc: William Lee Irwin III Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/wait.h | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/wait.h b/include/linux/wait.h index 0d2eeb03a718..5d631c17eaee 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -158,21 +158,17 @@ wait_queue_head_t *bit_waitqueue(void *, int); #define wake_up_interruptible_all(x) __wake_up(x, TASK_INTERRUPTIBLE, 0, NULL) #define wake_up_interruptible_sync(x) __wake_up_sync((x), TASK_INTERRUPTIBLE, 1) -#ifdef CONFIG_DEBUG_LOCK_ALLOC /* - * macro to avoid include hell + * Wakeup macros to be used to report events to the targets. */ -#define wake_up_nested(x, s) \ -do { \ - unsigned long flags; \ - \ - spin_lock_irqsave_nested(&(x)->lock, flags, (s)); \ - wake_up_locked(x); \ - spin_unlock_irqrestore(&(x)->lock, flags); \ -} while (0) -#else -#define wake_up_nested(x, s) wake_up(x) -#endif +#define wake_up_poll(x, m) \ + __wake_up(x, TASK_NORMAL, 1, (void *) (m)) +#define wake_up_locked_poll(x, m) \ + __wake_up_locked_key((x), TASK_NORMAL, (void *) (m)) +#define wake_up_interruptible_poll(x, m) \ + __wake_up(x, TASK_INTERRUPTIBLE, 1, (void *) (m)) +#define wake_up_interruptible_sync_poll(x, m) \ + __wake_up_sync_key((x), TASK_INTERRUPTIBLE, 1, (void *) (m)) #define __wait_event(wq, condition) \ do { \ -- cgit v1.2.3 From 364fdbc00fbdd409ade63500710123fe323aa164 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Tue, 31 Mar 2009 15:24:36 -0700 Subject: spi_mpc83xx: rework chip selects handling The main purpose of this patch is to pass 'struct spi_device' to the chip select handling routines. This is needed so that we could implement full-fledged OpenFirmware support for this driver. While at it, also: - Replace two {de,activate}_cs routines by single cs_contol(). - Don't duplicate platform data callbacks in mpc83xx_spi struct. 
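As a sketch of the new board-side interface (board name, GPIO number and platform wiring are invented for illustration): a single cs_control() callback receives the spi_device and the level to drive, which the driver already derives from SPI_CS_HIGH, replacing the old activate/deactivate pair keyed by chip-select number.

#include <linux/spi/spi.h>
#include <linux/fsl_devices.h>
#include <linux/gpio.h>

#define MYBOARD_SPI_CS_GPIO	42	/* made-up chip-select GPIO */

/* Hypothetical board code: drive the chip-select line through a GPIO. */
static void myboard_spi_cs_control(struct spi_device *spi, bool on)
{
	/* 'on' is the level to drive; polarity is handled in the driver. */
	gpio_set_value(MYBOARD_SPI_CS_GPIO, on);
}

static struct fsl_spi_platform_data myboard_spi_pdata = {
	.max_chipselect	= 1,
	.cs_control	= myboard_spi_cs_control,
};
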
Signed-off-by: Anton Vorontsov Cc: David Brownell Cc: Benjamin Herrenschmidt Cc: Kumar Gala Cc: Grant Likely Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/platforms/83xx/mpc832x_rdb.c | 16 ++++------------ arch/powerpc/sysdev/fsl_soc.c | 14 ++++++-------- arch/powerpc/sysdev/fsl_soc.h | 5 +++-- drivers/spi/spi_mpc83xx.c | 20 +++++++------------- include/linux/fsl_devices.h | 5 +++-- 5 files changed, 23 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/platforms/83xx/mpc832x_rdb.c b/arch/powerpc/platforms/83xx/mpc832x_rdb.c index 2a1295f19832..28e23cde64a1 100644 --- a/arch/powerpc/platforms/83xx/mpc832x_rdb.c +++ b/arch/powerpc/platforms/83xx/mpc832x_rdb.c @@ -39,16 +39,10 @@ #endif #ifdef CONFIG_QUICC_ENGINE -static void mpc83xx_spi_activate_cs(u8 cs, u8 polarity) +static void mpc83xx_spi_cs_control(struct spi_device *spi, bool on) { - pr_debug("%s %d %d\n", __func__, cs, polarity); - par_io_data_set(3, 13, polarity); -} - -static void mpc83xx_spi_deactivate_cs(u8 cs, u8 polarity) -{ - pr_debug("%s %d %d\n", __func__, cs, polarity); - par_io_data_set(3, 13, !polarity); + pr_debug("%s %d %d\n", __func__, spi->chip_select, on); + par_io_data_set(3, 13, on); } static struct mmc_spi_platform_data mpc832x_mmc_pdata = { @@ -74,9 +68,7 @@ static int __init mpc832x_spi_init(void) par_io_config_pin(3, 14, 2, 0, 0, 0); /* SD_INSERT, I */ par_io_config_pin(3, 15, 2, 0, 0, 0); /* SD_PROTECT,I */ - return fsl_spi_init(&mpc832x_spi_boardinfo, 1, - mpc83xx_spi_activate_cs, - mpc83xx_spi_deactivate_cs); + return fsl_spi_init(&mpc832x_spi_boardinfo, 1, mpc83xx_spi_cs_control); } machine_device_initcall(mpc832x_rdb, mpc832x_spi_init); #endif /* CONFIG_QUICC_ENGINE */ diff --git a/arch/powerpc/sysdev/fsl_soc.c b/arch/powerpc/sysdev/fsl_soc.c index a01c89d3f9bd..a46c1c867930 100644 --- a/arch/powerpc/sysdev/fsl_soc.c +++ b/arch/powerpc/sysdev/fsl_soc.c @@ -420,8 +420,8 @@ arch_initcall(fsl_usb_of_init); static int __init of_fsl_spi_probe(char *type, char *compatible, u32 sysclk, struct spi_board_info *board_infos, unsigned int num_board_infos, - void (*activate_cs)(u8 cs, u8 polarity), - void (*deactivate_cs)(u8 cs, u8 polarity)) + void (*cs_control)(struct spi_device *dev, + bool on)) { struct device_node *np; unsigned int i = 0; @@ -433,8 +433,7 @@ static int __init of_fsl_spi_probe(char *type, char *compatible, u32 sysclk, struct resource res[2]; struct platform_device *pdev; struct fsl_spi_platform_data pdata = { - .activate_cs = activate_cs, - .deactivate_cs = deactivate_cs, + .cs_control = cs_control, }; memset(res, 0, sizeof(res)); @@ -501,8 +500,7 @@ next: int __init fsl_spi_init(struct spi_board_info *board_infos, unsigned int num_board_infos, - void (*activate_cs)(u8 cs, u8 polarity), - void (*deactivate_cs)(u8 cs, u8 polarity)) + void (*cs_control)(struct spi_device *spi, bool on)) { u32 sysclk = -1; int ret; @@ -518,10 +516,10 @@ int __init fsl_spi_init(struct spi_board_info *board_infos, } ret = of_fsl_spi_probe(NULL, "fsl,spi", sysclk, board_infos, - num_board_infos, activate_cs, deactivate_cs); + num_board_infos, cs_control); if (!ret) of_fsl_spi_probe("spi", "fsl_spi", sysclk, board_infos, - num_board_infos, activate_cs, deactivate_cs); + num_board_infos, cs_control); return spi_register_board_info(board_infos, num_board_infos); } diff --git a/arch/powerpc/sysdev/fsl_soc.h b/arch/powerpc/sysdev/fsl_soc.h index 9c744e4285a0..b5f3456780b8 100644 --- a/arch/powerpc/sysdev/fsl_soc.h +++ b/arch/powerpc/sysdev/fsl_soc.h @@ -4,6 +4,8 
@@ #include +struct spi_device; + extern phys_addr_t get_immrbase(void); #if defined(CONFIG_CPM2) || defined(CONFIG_QUICC_ENGINE) || defined(CONFIG_8xx) extern u32 get_brgfreq(void); @@ -19,8 +21,7 @@ struct device_node; extern int fsl_spi_init(struct spi_board_info *board_infos, unsigned int num_board_infos, - void (*activate_cs)(u8 cs, u8 polarity), - void (*deactivate_cs)(u8 cs, u8 polarity)); + void (*cs_control)(struct spi_device *spi, bool on)); extern void fsl_rstcr_restart(char *cmd); diff --git a/drivers/spi/spi_mpc83xx.c b/drivers/spi/spi_mpc83xx.c index df6420029004..b95085a46f90 100644 --- a/drivers/spi/spi_mpc83xx.c +++ b/drivers/spi/spi_mpc83xx.c @@ -89,9 +89,6 @@ struct mpc83xx_spi { bool qe_mode; - void (*activate_cs) (u8 cs, u8 polarity); - void (*deactivate_cs) (u8 cs, u8 polarity); - u8 busy; struct workqueue_struct *workqueue; @@ -153,15 +150,14 @@ MPC83XX_SPI_TX_BUF(u32) static void mpc83xx_spi_chipselect(struct spi_device *spi, int value) { - struct mpc83xx_spi *mpc83xx_spi; - u8 pol = spi->mode & SPI_CS_HIGH ? 1 : 0; + struct mpc83xx_spi *mpc83xx_spi = spi_master_get_devdata(spi->master); + struct fsl_spi_platform_data *pdata = spi->dev.parent->platform_data; + bool pol = spi->mode & SPI_CS_HIGH; struct spi_mpc83xx_cs *cs = spi->controller_state; - mpc83xx_spi = spi_master_get_devdata(spi->master); - if (value == BITBANG_CS_INACTIVE) { - if (mpc83xx_spi->deactivate_cs) - mpc83xx_spi->deactivate_cs(spi->chip_select, pol); + if (pdata->cs_control) + pdata->cs_control(spi, !pol); } if (value == BITBANG_CS_ACTIVE) { @@ -186,8 +182,8 @@ static void mpc83xx_spi_chipselect(struct spi_device *spi, int value) mpc83xx_spi_write_reg(mode, regval); local_irq_restore(flags); } - if (mpc83xx_spi->activate_cs) - mpc83xx_spi->activate_cs(spi->chip_select, pol); + if (pdata->cs_control) + pdata->cs_control(spi, pol); } } @@ -582,8 +578,6 @@ static int __init mpc83xx_spi_probe(struct platform_device *dev) master->cleanup = mpc83xx_spi_cleanup; mpc83xx_spi = spi_master_get_devdata(master); - mpc83xx_spi->activate_cs = pdata->activate_cs; - mpc83xx_spi->deactivate_cs = pdata->deactivate_cs; mpc83xx_spi->qe_mode = pdata->qe_mode; mpc83xx_spi->get_rx = mpc83xx_spi_rx_buf_u8; mpc83xx_spi->get_tx = mpc83xx_spi_tx_buf_u8; diff --git a/include/linux/fsl_devices.h b/include/linux/fsl_devices.h index d9051d717d27..7bc1b643d370 100644 --- a/include/linux/fsl_devices.h +++ b/include/linux/fsl_devices.h @@ -95,14 +95,15 @@ struct fsl_usb2_platform_data { #define FSL_USB2_PORT0_ENABLED 0x00000001 #define FSL_USB2_PORT1_ENABLED 0x00000002 +struct spi_device; + struct fsl_spi_platform_data { u32 initial_spmode; /* initial SPMODE value */ u16 bus_num; bool qe_mode; /* board specific information */ u16 max_chipselect; - void (*activate_cs)(u8 cs, u8 polarity); - void (*deactivate_cs)(u8 cs, u8 polarity); + void (*cs_control)(struct spi_device *spi, bool on); u32 sysclk; }; -- cgit v1.2.3 From 35b4b3c0c1265f1a7342574be393c157601401f0 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Tue, 31 Mar 2009 15:24:37 -0700 Subject: spi_mpc83xx: add OF platform driver bindings Implement full support for OF SPI bindings. Now the driver can manage its own chip selects without any help from the board files and/or fsl_soc constructors. The "legacy" code is well isolated and could be removed as time goes by. 
Signed-off-by: Anton Vorontsov Cc: David Brownell Cc: Benjamin Herrenschmidt Cc: Kumar Gala Cc: Grant Likely Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/spi/spi_mpc83xx.c | 330 +++++++++++++++++++++++++++++++++++++++----- include/linux/fsl_devices.h | 2 +- 2 files changed, 295 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/drivers/spi/spi_mpc83xx.c b/drivers/spi/spi_mpc83xx.c index b95085a46f90..f4573a96af24 100644 --- a/drivers/spi/spi_mpc83xx.c +++ b/drivers/spi/spi_mpc83xx.c @@ -14,6 +14,8 @@ #include #include #include +#include +#include #include #include #include @@ -23,7 +25,13 @@ #include #include #include +#include +#include +#include +#include +#include +#include #include #include @@ -79,7 +87,7 @@ struct mpc83xx_spi { u32(*get_tx) (struct mpc83xx_spi *); unsigned int count; - int irq; + unsigned int irq; unsigned nsecs; /* (clock cycle time)/2 */ @@ -543,36 +551,23 @@ static void mpc83xx_spi_cleanup(struct spi_device *spi) kfree(spi->controller_state); } -static int __init mpc83xx_spi_probe(struct platform_device *dev) +static struct spi_master * __devinit +mpc83xx_spi_probe(struct device *dev, struct resource *mem, unsigned int irq) { + struct fsl_spi_platform_data *pdata = dev->platform_data; struct spi_master *master; struct mpc83xx_spi *mpc83xx_spi; - struct fsl_spi_platform_data *pdata; - struct resource *r; u32 regval; int ret = 0; - /* Get resources(memory, IRQ) associated with the device */ - master = spi_alloc_master(&dev->dev, sizeof(struct mpc83xx_spi)); - + master = spi_alloc_master(dev, sizeof(struct mpc83xx_spi)); if (master == NULL) { ret = -ENOMEM; goto err; } - platform_set_drvdata(dev, master); - pdata = dev->dev.platform_data; - - if (pdata == NULL) { - ret = -ENODEV; - goto free_master; - } + dev_set_drvdata(dev, master); - r = platform_get_resource(dev, IORESOURCE_MEM, 0); - if (r == NULL) { - ret = -ENODEV; - goto free_master; - } master->setup = mpc83xx_spi_setup; master->transfer = mpc83xx_spi_transfer; master->cleanup = mpc83xx_spi_cleanup; @@ -592,18 +587,13 @@ static int __init mpc83xx_spi_probe(struct platform_device *dev) init_completion(&mpc83xx_spi->done); - mpc83xx_spi->base = ioremap(r->start, r->end - r->start + 1); + mpc83xx_spi->base = ioremap(mem->start, mem->end - mem->start + 1); if (mpc83xx_spi->base == NULL) { ret = -ENOMEM; goto put_master; } - mpc83xx_spi->irq = platform_get_irq(dev, 0); - - if (mpc83xx_spi->irq < 0) { - ret = -ENXIO; - goto unmap_io; - } + mpc83xx_spi->irq = irq; /* Register for SPI Interrupt */ ret = request_irq(mpc83xx_spi->irq, mpc83xx_spi_irq, @@ -645,9 +635,9 @@ static int __init mpc83xx_spi_probe(struct platform_device *dev) printk(KERN_INFO "%s: MPC83xx SPI Controller driver at 0x%p (irq = %d)\n", - dev_name(&dev->dev), mpc83xx_spi->base, mpc83xx_spi->irq); + dev_name(dev), mpc83xx_spi->base, mpc83xx_spi->irq); - return ret; + return master; unreg_master: destroy_workqueue(mpc83xx_spi->workqueue); @@ -657,18 +647,16 @@ unmap_io: iounmap(mpc83xx_spi->base); put_master: spi_master_put(master); -free_master: - kfree(master); err: - return ret; + return ERR_PTR(ret); } -static int __exit mpc83xx_spi_remove(struct platform_device *dev) +static int __devexit mpc83xx_spi_remove(struct device *dev) { struct mpc83xx_spi *mpc83xx_spi; struct spi_master *master; - master = platform_get_drvdata(dev); + master = dev_get_drvdata(dev); mpc83xx_spi = spi_master_get_devdata(master); flush_workqueue(mpc83xx_spi->workqueue); @@ -681,23 +669,293 @@ static int __exit 
mpc83xx_spi_remove(struct platform_device *dev) return 0; } +struct mpc83xx_spi_probe_info { + struct fsl_spi_platform_data pdata; + int *gpios; + bool *alow_flags; +}; + +static struct mpc83xx_spi_probe_info * +to_of_pinfo(struct fsl_spi_platform_data *pdata) +{ + return container_of(pdata, struct mpc83xx_spi_probe_info, pdata); +} + +static void mpc83xx_spi_cs_control(struct spi_device *spi, bool on) +{ + struct device *dev = spi->dev.parent; + struct mpc83xx_spi_probe_info *pinfo = to_of_pinfo(dev->platform_data); + u16 cs = spi->chip_select; + int gpio = pinfo->gpios[cs]; + bool alow = pinfo->alow_flags[cs]; + + gpio_set_value(gpio, on ^ alow); +} + +static int of_mpc83xx_spi_get_chipselects(struct device *dev) +{ + struct device_node *np = dev_archdata_get_node(&dev->archdata); + struct fsl_spi_platform_data *pdata = dev->platform_data; + struct mpc83xx_spi_probe_info *pinfo = to_of_pinfo(pdata); + unsigned int ngpios; + int i = 0; + int ret; + + ngpios = of_gpio_count(np); + if (!ngpios) { + /* + * SPI w/o chip-select line. One SPI device is still permitted + * though. + */ + pdata->max_chipselect = 1; + return 0; + } + + pinfo->gpios = kmalloc(ngpios * sizeof(pinfo->gpios), GFP_KERNEL); + if (!pinfo->gpios) + return -ENOMEM; + memset(pinfo->gpios, -1, ngpios * sizeof(pinfo->gpios)); + + pinfo->alow_flags = kzalloc(ngpios * sizeof(pinfo->alow_flags), + GFP_KERNEL); + if (!pinfo->alow_flags) { + ret = -ENOMEM; + goto err_alloc_flags; + } + + for (; i < ngpios; i++) { + int gpio; + enum of_gpio_flags flags; + + gpio = of_get_gpio_flags(np, i, &flags); + if (!gpio_is_valid(gpio)) { + dev_err(dev, "invalid gpio #%d: %d\n", i, gpio); + goto err_loop; + } + + ret = gpio_request(gpio, dev_name(dev)); + if (ret) { + dev_err(dev, "can't request gpio #%d: %d\n", i, ret); + goto err_loop; + } + + pinfo->gpios[i] = gpio; + pinfo->alow_flags[i] = flags & OF_GPIO_ACTIVE_LOW; + + ret = gpio_direction_output(pinfo->gpios[i], + pinfo->alow_flags[i]); + if (ret) { + dev_err(dev, "can't set output direction for gpio " + "#%d: %d\n", i, ret); + goto err_loop; + } + } + + pdata->max_chipselect = ngpios; + pdata->cs_control = mpc83xx_spi_cs_control; + + return 0; + +err_loop: + while (i >= 0) { + if (gpio_is_valid(pinfo->gpios[i])) + gpio_free(pinfo->gpios[i]); + i--; + } + + kfree(pinfo->alow_flags); + pinfo->alow_flags = NULL; +err_alloc_flags: + kfree(pinfo->gpios); + pinfo->gpios = NULL; + return ret; +} + +static int of_mpc83xx_spi_free_chipselects(struct device *dev) +{ + struct fsl_spi_platform_data *pdata = dev->platform_data; + struct mpc83xx_spi_probe_info *pinfo = to_of_pinfo(pdata); + int i; + + if (!pinfo->gpios) + return 0; + + for (i = 0; i < pdata->max_chipselect; i++) { + if (gpio_is_valid(pinfo->gpios[i])) + gpio_free(pinfo->gpios[i]); + } + + kfree(pinfo->gpios); + kfree(pinfo->alow_flags); + return 0; +} + +static int __devinit of_mpc83xx_spi_probe(struct of_device *ofdev, + const struct of_device_id *ofid) +{ + struct device *dev = &ofdev->dev; + struct device_node *np = ofdev->node; + struct mpc83xx_spi_probe_info *pinfo; + struct fsl_spi_platform_data *pdata; + struct spi_master *master; + struct resource mem; + struct resource irq; + const void *prop; + int ret = -ENOMEM; + + pinfo = kzalloc(sizeof(*pinfo), GFP_KERNEL); + if (!pinfo) + return -ENOMEM; + + pdata = &pinfo->pdata; + dev->platform_data = pdata; + + /* Allocate bus num dynamically. */ + pdata->bus_num = -1; + + /* SPI controller is either clocked from QE or SoC clock. 
*/ + pdata->sysclk = get_brgfreq(); + if (pdata->sysclk == -1) { + pdata->sysclk = fsl_get_sys_freq(); + if (pdata->sysclk == -1) { + ret = -ENODEV; + goto err_clk; + } + } + + prop = of_get_property(np, "mode", NULL); + if (prop && !strcmp(prop, "cpu-qe")) + pdata->qe_mode = 1; + + ret = of_mpc83xx_spi_get_chipselects(dev); + if (ret) + goto err; + + ret = of_address_to_resource(np, 0, &mem); + if (ret) + goto err; + + ret = of_irq_to_resource(np, 0, &irq); + if (!ret) { + ret = -EINVAL; + goto err; + } + + master = mpc83xx_spi_probe(dev, &mem, irq.start); + if (IS_ERR(master)) { + ret = PTR_ERR(master); + goto err; + } + + of_register_spi_devices(master, np); + + return 0; + +err: + of_mpc83xx_spi_free_chipselects(dev); +err_clk: + kfree(pinfo); + return ret; +} + +static int __devexit of_mpc83xx_spi_remove(struct of_device *ofdev) +{ + int ret; + + ret = mpc83xx_spi_remove(&ofdev->dev); + if (ret) + return ret; + of_mpc83xx_spi_free_chipselects(&ofdev->dev); + return 0; +} + +static const struct of_device_id of_mpc83xx_spi_match[] = { + { .compatible = "fsl,spi" }, + {}, +}; +MODULE_DEVICE_TABLE(of, of_mpc83xx_spi_match); + +static struct of_platform_driver of_mpc83xx_spi_driver = { + .name = "mpc83xx_spi", + .match_table = of_mpc83xx_spi_match, + .probe = of_mpc83xx_spi_probe, + .remove = __devexit_p(of_mpc83xx_spi_remove), +}; + +#ifdef CONFIG_MPC832x_RDB +/* + * XXX XXX XXX + * This is "legacy" platform driver, was used by the MPC8323E-RDB boards + * only. The driver should go away soon, since newer MPC8323E-RDB's device + * tree can work with OpenFirmware driver. But for now we support old trees + * as well. + */ +static int __devinit plat_mpc83xx_spi_probe(struct platform_device *pdev) +{ + struct resource *mem; + unsigned int irq; + struct spi_master *master; + + if (!pdev->dev.platform_data) + return -EINVAL; + + mem = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!mem) + return -EINVAL; + + irq = platform_get_irq(pdev, 0); + if (!irq) + return -EINVAL; + + master = mpc83xx_spi_probe(&pdev->dev, mem, irq); + if (IS_ERR(master)) + return PTR_ERR(master); + return 0; +} + +static int __devexit plat_mpc83xx_spi_remove(struct platform_device *pdev) +{ + return mpc83xx_spi_remove(&pdev->dev); +} + MODULE_ALIAS("platform:mpc83xx_spi"); static struct platform_driver mpc83xx_spi_driver = { - .remove = __exit_p(mpc83xx_spi_remove), + .probe = plat_mpc83xx_spi_probe, + .remove = __exit_p(plat_mpc83xx_spi_remove), .driver = { .name = "mpc83xx_spi", .owner = THIS_MODULE, }, }; +static bool legacy_driver_failed; + +static void __init legacy_driver_register(void) +{ + legacy_driver_failed = platform_driver_register(&mpc83xx_spi_driver); +} + +static void __exit legacy_driver_unregister(void) +{ + if (legacy_driver_failed) + return; + platform_driver_unregister(&mpc83xx_spi_driver); +} +#else +static void __init legacy_driver_register(void) {} +static void __exit legacy_driver_unregister(void) {} +#endif /* CONFIG_MPC832x_RDB */ + static int __init mpc83xx_spi_init(void) { - return platform_driver_probe(&mpc83xx_spi_driver, mpc83xx_spi_probe); + legacy_driver_register(); + return of_register_platform_driver(&of_mpc83xx_spi_driver); } static void __exit mpc83xx_spi_exit(void) { - platform_driver_unregister(&mpc83xx_spi_driver); + of_unregister_platform_driver(&of_mpc83xx_spi_driver); + legacy_driver_unregister(); } module_init(mpc83xx_spi_init); diff --git a/include/linux/fsl_devices.h b/include/linux/fsl_devices.h index 7bc1b643d370..7ef1caf50269 100644 --- a/include/linux/fsl_devices.h 
+++ b/include/linux/fsl_devices.h @@ -99,7 +99,7 @@ struct spi_device; struct fsl_spi_platform_data { u32 initial_spmode; /* initial SPMODE value */ - u16 bus_num; + s16 bus_num; bool qe_mode; /* board specific information */ u16 max_chipselect; -- cgit v1.2.3 From 79955898f961a870cbcc58f6ae13f3741a909da5 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Tue, 31 Mar 2009 15:24:45 -0700 Subject: autofs4: fix kernel includes autofs_dev-ioctl.h is included by both the kernel module and user space tools and it includes two kernel header files. Compiles work if the kernel headers are installed but fail otherwise. Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/auto_dev-ioctl.h | 7 ++++++- include/linux/auto_fs.h | 6 ++++-- 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/auto_dev-ioctl.h b/include/linux/auto_dev-ioctl.h index 91a773993a5c..850f39b33e74 100644 --- a/include/linux/auto_dev-ioctl.h +++ b/include/linux/auto_dev-ioctl.h @@ -10,8 +10,13 @@ #ifndef _LINUX_AUTO_DEV_IOCTL_H #define _LINUX_AUTO_DEV_IOCTL_H +#include + +#ifdef __KERNEL__ #include -#include +#else +#include +#endif /* __KERNEL__ */ #define AUTOFS_DEVICE_NAME "autofs" diff --git a/include/linux/auto_fs.h b/include/linux/auto_fs.h index c21e5972a3e8..63265852b7d1 100644 --- a/include/linux/auto_fs.h +++ b/include/linux/auto_fs.h @@ -17,11 +17,13 @@ #ifdef __KERNEL__ #include #include +#include +#include +#else #include +#include #endif /* __KERNEL__ */ -#include - /* This file describes autofs v3 */ #define AUTOFS_PROTO_VERSION 3 -- cgit v1.2.3 From 78d89ef40c2ff7265df077e20c4d76be7d415204 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 31 Mar 2009 15:24:48 -0700 Subject: rtc: convert LEAP_YEAR into an inline - the LEAP_YEAR macro is buggy - it references its arg multiple times. Fix this by turning it into a C function. - give it a more appropriate name - Move it to rtc.h so that other .c files can use it, instead of copying it. Cc: dann frazier Acked-by: Alessandro Zummo Cc: stephane eranian Cc: "Luck, Tony" Cc: David Brownell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-lib.c | 7 +++---- include/linux/rtc.h | 6 ++++++ 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/rtc/rtc-lib.c b/drivers/rtc/rtc-lib.c index dd70bf73ce9d..773851f338b8 100644 --- a/drivers/rtc/rtc-lib.c +++ b/drivers/rtc/rtc-lib.c @@ -26,14 +26,13 @@ static const unsigned short rtc_ydays[2][13] = { }; #define LEAPS_THRU_END_OF(y) ((y)/4 - (y)/100 + (y)/400) -#define LEAP_YEAR(year) ((!(year % 4) && (year % 100)) || !(year % 400)) /* * The number of days in the month.
*/ int rtc_month_days(unsigned int month, unsigned int year) { - return rtc_days_in_month[month] + (LEAP_YEAR(year) && month == 1); + return rtc_days_in_month[month] + (is_leap_year(year) && month == 1); } EXPORT_SYMBOL(rtc_month_days); @@ -42,7 +41,7 @@ EXPORT_SYMBOL(rtc_month_days); */ int rtc_year_days(unsigned int day, unsigned int month, unsigned int year) { - return rtc_ydays[LEAP_YEAR(year)][month] + day-1; + return rtc_ydays[is_leap_year(year)][month] + day-1; } EXPORT_SYMBOL(rtc_year_days); @@ -66,7 +65,7 @@ void rtc_time_to_tm(unsigned long time, struct rtc_time *tm) - LEAPS_THRU_END_OF(1970 - 1); if (days < 0) { year -= 1; - days += 365 + LEAP_YEAR(year); + days += 365 + is_leap_year(year); } tm->tm_year = year - 1900; tm->tm_yday = days + 1; diff --git a/include/linux/rtc.h b/include/linux/rtc.h index 4046b75563c1..60f88a7fb13d 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -99,6 +99,7 @@ struct rtc_pll_info { #ifdef __KERNEL__ +#include #include extern int rtc_month_days(unsigned int month, unsigned int year); @@ -232,6 +233,11 @@ int rtc_register(rtc_task_t *task); int rtc_unregister(rtc_task_t *task); int rtc_control(rtc_task_t *t, unsigned int cmd, unsigned long arg); +static inline bool is_leap_year(unsigned int year) +{ + return (!(year % 4) && (year % 100)) || !(year % 400); +} + #endif /* __KERNEL__ */ #endif /* _LINUX_RTC_H_ */ -- cgit v1.2.3 From 614c0dc93284404be2a4d5750c79bb95f2b6c980 Mon Sep 17 00:00:00 2001 From: Krzysztof Helt Date: Tue, 31 Mar 2009 15:25:15 -0700 Subject: cirrusfb: add accelerator constant Add an accelerator constant so almost all Cirrus are recognized as accelerators by the fbset command. Signed-off-by: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/cirrusfb.c | 11 +++++++++-- include/linux/fb.h | 1 + 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/video/cirrusfb.c b/drivers/video/cirrusfb.c index 15e2e6bfcbff..e9a2661669eb 100644 --- a/drivers/video/cirrusfb.c +++ b/drivers/video/cirrusfb.c @@ -519,6 +519,7 @@ static int cirrusfb_check_var(struct fb_var_screeninfo *var, int yres; /* memory size in pixels */ unsigned pixels = info->screen_size * 8 / var->bits_per_pixel; + struct cirrusfb_info *cinfo = info->par; switch (var->bits_per_pixel) { case 1: @@ -627,6 +628,9 @@ static int cirrusfb_check_var(struct fb_var_screeninfo *var, if (cirrusfb_check_pixclock(var, info)) return -EINVAL; + if (!is_laguna(cinfo)) + var->accel_flags = FB_ACCELF_TEXT; + return 0; } @@ -2029,8 +2033,12 @@ static int __devinit cirrusfb_set_fbinfo(struct fb_info *info) | FBINFO_HWACCEL_FILLRECT | FBINFO_HWACCEL_IMAGEBLIT | FBINFO_HWACCEL_COPYAREA; - if (noaccel || is_laguna(cinfo)) + if (noaccel || is_laguna(cinfo)) { info->flags |= FBINFO_HWACCEL_DISABLED; + info->fix.accel = FB_ACCEL_NONE; + } else + info->fix.accel = FB_ACCEL_CIRRUS_ALPINE; + info->fbops = &cirrusfb_ops; if (cinfo->btype == BT_GD5480) { @@ -2056,7 +2064,6 @@ static int __devinit cirrusfb_set_fbinfo(struct fb_info *info) /* FIXME: map region at 0xB8000 if available, fill in here */ info->fix.mmio_len = 0; - info->fix.accel = FB_ACCEL_NONE; fb_alloc_cmap(&info->cmap, 256, 0); diff --git a/include/linux/fb.h b/include/linux/fb.h index 31527e17076b..fe7d0d7907ab 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -123,6 +123,7 @@ struct dentry; #define FB_ACCEL_TRIDENT_3DIMAGE 51 /* Trident 3DImage */ #define FB_ACCEL_TRIDENT_BLADE3D 52 /* Trident Blade3D */ #define 
FB_ACCEL_TRIDENT_BLADEXP 53 /* Trident BladeXP */ +#define FB_ACCEL_CIRRUS_ALPINE 53 /* Cirrus Logic 543x/544x/5480 */ #define FB_ACCEL_NEOMAGIC_NM2070 90 /* NeoMagic NM2070 */ #define FB_ACCEL_NEOMAGIC_NM2090 91 /* NeoMagic NM2090 */ #define FB_ACCEL_NEOMAGIC_NM2093 92 /* NeoMagic NM2093 */ -- cgit v1.2.3 From 6a7f2829b5f8be124e168265f176dbbbea8861a0 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 31 Mar 2009 15:25:19 -0700 Subject: fbdev: uninline lock_fb_info() Before: text data bss dec hex filename 3648 2910 32 6590 19be drivers/video/backlight/backlight.o 3226 2812 32 6070 17b6 drivers/video/backlight/lcd.o 30990 16688 8480 56158 db5e drivers/video/console/fbcon.o 15488 8400 24 23912 5d68 drivers/video/fbmem.o After: text data bss dec hex filename 3537 2870 32 6439 1927 drivers/video/backlight/backlight.o 3131 2772 32 5935 172f drivers/video/backlight/lcd.o 30876 16648 8480 56004 dac4 drivers/video/console/fbcon.o 15506 8400 24 23930 5d7a drivers/video/fbmem.o Cc: Andrea Righi Cc: Geert Uytterhoeven Cc: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/fbmem.c | 11 +++++++++++ include/linux/fb.h | 10 +--------- 2 files changed, 12 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/video/fbmem.c b/drivers/video/fbmem.c index b64f061dd447..2ac32e6b5953 100644 --- a/drivers/video/fbmem.c +++ b/drivers/video/fbmem.c @@ -46,6 +46,17 @@ struct fb_info *registered_fb[FB_MAX] __read_mostly; int num_registered_fb __read_mostly; +int lock_fb_info(struct fb_info *info) +{ + mutex_lock(&info->lock); + if (!info->fbops) { + mutex_unlock(&info->lock); + return 0; + } + return 1; +} +EXPORT_SYMBOL(lock_fb_info); + /* * Helpers */ diff --git a/include/linux/fb.h b/include/linux/fb.h index fe7d0d7907ab..f563c5013932 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -961,15 +961,7 @@ extern struct fb_info *registered_fb[FB_MAX]; extern int num_registered_fb; extern struct class *fb_class; -static inline int lock_fb_info(struct fb_info *info) -{ - mutex_lock(&info->lock); - if (!info->fbops) { - mutex_unlock(&info->lock); - return 0; - } - return 1; -} +extern int lock_fb_info(struct fb_info *info); static inline void unlock_fb_info(struct fb_info *info) { -- cgit v1.2.3 From 6fd5c665d8fe9da5f2081f0b3ca8054f0f730b1a Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Wed, 1 Apr 2009 21:42:23 +0200 Subject: include/linux/hdreg.h: cover struct hd_driveid with #ifndef/#endif __KERNEL__ Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/hdreg.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hdreg.h b/include/linux/hdreg.h index ed21bd3dbd25..47fcb05af8b4 100644 --- a/include/linux/hdreg.h +++ b/include/linux/hdreg.h @@ -448,6 +448,7 @@ enum { #define __NEW_HD_DRIVE_ID +#ifndef __KERNEL__ /* * Structure returned by HDIO_GET_IDENTITY, as per ANSI NCITS ATA6 rev.1b spec. * @@ -699,6 +700,7 @@ struct hd_driveid { * 7:0 Signature */ }; +#endif /* __KERNEL__ */ /* * IDE "nice" flags. 
These are used on a per drive basis to determine -- cgit v1.2.3 From dafd01cc14a38690c87981eb2670d9c95f799ffd Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Wed, 1 Apr 2009 21:42:25 +0200 Subject: include/linux/hdreg.h: cover WIN_* and friends with #ifndef/#endif __KERNEL__ Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/hdreg.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hdreg.h b/include/linux/hdreg.h index 47fcb05af8b4..101eb91de22f 100644 --- a/include/linux/hdreg.h +++ b/include/linux/hdreg.h @@ -191,6 +191,7 @@ typedef struct hd_drive_hob_hdr { #define TASKFILE_INVALID 0x7fff #endif +#ifndef __KERNEL__ /* ATA/ATAPI Commands pre T13 Spec */ #define WIN_NOP 0x00 /* @@ -379,6 +380,7 @@ typedef struct hd_drive_hob_hdr { #define SECURITY_ERASE_UNIT 0xBD #define SECURITY_FREEZE_LOCK 0xBE #define SECURITY_DISABLE_PASSWORD 0xBF +#endif /* __KERNEL__ */ struct hd_geometry { unsigned char heads; -- cgit v1.2.3 From 4fe6e30645de0b7a179892d83049580bf72bcff2 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Wed, 1 Apr 2009 21:42:25 +0200 Subject: include/linux/hdreg.h: remove unused defines * Move HD_IRQ define to drivers/block/hd.c (only user). * Remove unused *_STAT, *_ERR, HD_*, CD, IO, REL and TAG_MASK defines. Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/block/hd.c | 2 ++ include/linux/hdreg.h | 58 --------------------------------------------------- 2 files changed, 2 insertions(+), 58 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/hd.c b/drivers/block/hd.c index 482c0c4b964f..3c11f062a18c 100644 --- a/drivers/block/hd.c +++ b/drivers/block/hd.c @@ -42,6 +42,8 @@ #include #include +#define HD_IRQ 14 + #define REALLY_SLOW_IO #include #include diff --git a/include/linux/hdreg.h b/include/linux/hdreg.h index 101eb91de22f..3bc8f9f986b6 100644 --- a/include/linux/hdreg.h +++ b/include/linux/hdreg.h @@ -3,64 +3,6 @@ #ifdef __KERNEL__ #include - -/* - * This file contains some defines for the AT-hd-controller. - * Various sources. - */ - -/* ide.c has its own port definitions in "ide.h" */ - -#define HD_IRQ 14 - -/* Hd controller regs. 
Ref: IBM AT Bios-listing */ -#define HD_DATA 0x1f0 /* _CTL when writing */ -#define HD_ERROR 0x1f1 /* see err-bits */ -#define HD_NSECTOR 0x1f2 /* nr of sectors to read/write */ -#define HD_SECTOR 0x1f3 /* starting sector */ -#define HD_LCYL 0x1f4 /* starting cylinder */ -#define HD_HCYL 0x1f5 /* high byte of starting cyl */ -#define HD_CURRENT 0x1f6 /* 101dhhhh , d=drive, hhhh=head */ -#define HD_STATUS 0x1f7 /* see status-bits */ -#define HD_FEATURE HD_ERROR /* same io address, read=error, write=feature */ -#define HD_PRECOMP HD_FEATURE /* obsolete use of this port - predates IDE */ -#define HD_COMMAND HD_STATUS /* same io address, read=status, write=cmd */ - -#define HD_CMD 0x3f6 /* used for resets */ -#define HD_ALTSTATUS 0x3f6 /* same as HD_STATUS but doesn't clear irq */ - -/* remainder is shared between hd.c, ide.c, ide-cd.c, and the hdparm utility */ - -/* Bits of HD_STATUS */ -#define ERR_STAT 0x01 -#define INDEX_STAT 0x02 -#define ECC_STAT 0x04 /* Corrected error */ -#define DRQ_STAT 0x08 -#define SEEK_STAT 0x10 -#define SRV_STAT 0x10 -#define WRERR_STAT 0x20 -#define READY_STAT 0x40 -#define BUSY_STAT 0x80 - -/* Bits for HD_ERROR */ -#define MARK_ERR 0x01 /* Bad address mark */ -#define ILI_ERR 0x01 /* Illegal Length Indication (ATAPI) */ -#define TRK0_ERR 0x02 /* couldn't find track 0 */ -#define EOM_ERR 0x02 /* End Of Media (ATAPI) */ -#define ABRT_ERR 0x04 /* Command aborted */ -#define MCR_ERR 0x08 /* media change request */ -#define ID_ERR 0x10 /* ID field not found */ -#define MC_ERR 0x20 /* media changed */ -#define ECC_ERR 0x40 /* Uncorrectable ECC error */ -#define BBD_ERR 0x80 /* pre-EIDE meaning: block marked bad */ -#define ICRC_ERR 0x80 /* new meaning: CRC error during transfer */ -#define LFS_ERR 0xf0 /* Last Failed Sense (ATAPI) */ - -/* Bits of HD_NSECTOR */ -#define CD 0x01 -#define IO 0x02 -#define REL 0x04 -#define TAG_MASK 0xf8 #endif /* __KERNEL__ */ #include -- cgit v1.2.3 From eae6c2b6414fc6673ac5415442fe463c01005366 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Wed, 1 Apr 2009 21:42:26 +0200 Subject: remove include from All users that need have been fixed to include it directly. Cc: Christoph Hellwig Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/hdreg.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hdreg.h b/include/linux/hdreg.h index 3bc8f9f986b6..29ee2873f4a8 100644 --- a/include/linux/hdreg.h +++ b/include/linux/hdreg.h @@ -1,10 +1,6 @@ #ifndef _LINUX_HDREG_H #define _LINUX_HDREG_H -#ifdef __KERNEL__ -#include -#endif /* __KERNEL__ */ - #include /* -- cgit v1.2.3 From d9de451989a88a2003ca06e524aca4665c0c7f06 Mon Sep 17 00:00:00 2001 From: Hans-Christian Egtvedt Date: Wed, 1 Apr 2009 15:47:02 +0200 Subject: dw_dmac: add cyclic API to DW DMA driver This patch adds a cyclic DMA interface to the DW DMA driver. This is very useful if you want to use the DMA controller in combination with a sound device which uses cyclic buffers. Using a DMA channel for cyclic DMA will disable the possibility to use it as a normal DMA engine until the user calls the cyclic free function on the DMA channel. Also a cyclic DMA list can not be prepared if the channel is already active. 
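For illustration only (not part of this patch), a minimal caller-side sketch of the new API as declared below in include/linux/dw_dmac.h; the channel, buffer addresses and the my_period_done()/my_dev names are placeholders, and error unwinding is trimmed:

    #include <linux/dmaengine.h>
    #include <linux/dma-mapping.h>
    #include <linux/dw_dmac.h>
    #include <linux/err.h>

    /* runs from the DMA tasklet once per completed period */
    static void my_period_done(void *param)
    {
    }

    static int my_start_cyclic_tx(struct dma_chan *chan, void *my_dev,
                                  dma_addr_t buf, size_t len, size_t period)
    {
            struct dw_cyclic_desc *cdesc;

            cdesc = dw_dma_cyclic_prep(chan, buf, len, period, DMA_TO_DEVICE);
            if (IS_ERR(cdesc))
                    return PTR_ERR(cdesc);

            cdesc->period_callback = my_period_done;
            cdesc->period_callback_param = my_dev;

            /* per the kerneldoc, call with soft interrupts disabled */
            return dw_dma_cyclic_start(chan);
    }

Tearing the transfer down is the reverse: dw_dma_cyclic_stop() followed by dw_dma_cyclic_free(), after which the channel behaves as a normal DMA engine again.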
Signed-off-by: Hans-Christian Egtvedt Acked-by: Haavard Skinnemoen Acked-by: Maciej Sosnowski Signed-off-by: Dan Williams --- drivers/dma/dw_dmac.c | 332 ++++++++++++++++++++++++++++++++++++++++++++- drivers/dma/dw_dmac_regs.h | 7 +- include/linux/dw_dmac.h | 19 +++ 3 files changed, 356 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma/dw_dmac.c b/drivers/dma/dw_dmac.c index 862fc9ce9d86..0b8aada08aa8 100644 --- a/drivers/dma/dw_dmac.c +++ b/drivers/dma/dw_dmac.c @@ -363,6 +363,82 @@ static void dwc_handle_error(struct dw_dma *dw, struct dw_dma_chan *dwc) dwc_descriptor_complete(dwc, bad_desc); } +/* --------------------- Cyclic DMA API extensions -------------------- */ + +inline dma_addr_t dw_dma_get_src_addr(struct dma_chan *chan) +{ + struct dw_dma_chan *dwc = to_dw_dma_chan(chan); + return channel_readl(dwc, SAR); +} +EXPORT_SYMBOL(dw_dma_get_src_addr); + +inline dma_addr_t dw_dma_get_dst_addr(struct dma_chan *chan) +{ + struct dw_dma_chan *dwc = to_dw_dma_chan(chan); + return channel_readl(dwc, DAR); +} +EXPORT_SYMBOL(dw_dma_get_dst_addr); + +/* called with dwc->lock held and all DMAC interrupts disabled */ +static void dwc_handle_cyclic(struct dw_dma *dw, struct dw_dma_chan *dwc, + u32 status_block, u32 status_err, u32 status_xfer) +{ + if (status_block & dwc->mask) { + void (*callback)(void *param); + void *callback_param; + + dev_vdbg(chan2dev(&dwc->chan), "new cyclic period llp 0x%08x\n", + channel_readl(dwc, LLP)); + dma_writel(dw, CLEAR.BLOCK, dwc->mask); + + callback = dwc->cdesc->period_callback; + callback_param = dwc->cdesc->period_callback_param; + if (callback) { + spin_unlock(&dwc->lock); + callback(callback_param); + spin_lock(&dwc->lock); + } + } + + /* + * Error and transfer complete are highly unlikely, and will most + * likely be due to a configuration error by the user. + */ + if (unlikely(status_err & dwc->mask) || + unlikely(status_xfer & dwc->mask)) { + int i; + + dev_err(chan2dev(&dwc->chan), "cyclic DMA unexpected %s " + "interrupt, stopping DMA transfer\n", + status_xfer ? 
"xfer" : "error"); + dev_err(chan2dev(&dwc->chan), + " SAR: 0x%x DAR: 0x%x LLP: 0x%x CTL: 0x%x:%08x\n", + channel_readl(dwc, SAR), + channel_readl(dwc, DAR), + channel_readl(dwc, LLP), + channel_readl(dwc, CTL_HI), + channel_readl(dwc, CTL_LO)); + + channel_clear_bit(dw, CH_EN, dwc->mask); + while (dma_readl(dw, CH_EN) & dwc->mask) + cpu_relax(); + + /* make sure DMA does not restart by loading a new list */ + channel_writel(dwc, LLP, 0); + channel_writel(dwc, CTL_LO, 0); + channel_writel(dwc, CTL_HI, 0); + + dma_writel(dw, CLEAR.BLOCK, dwc->mask); + dma_writel(dw, CLEAR.ERROR, dwc->mask); + dma_writel(dw, CLEAR.XFER, dwc->mask); + + for (i = 0; i < dwc->cdesc->periods; i++) + dwc_dump_lli(dwc, &dwc->cdesc->desc[i]->lli); + } +} + +/* ------------------------------------------------------------------------- */ + static void dw_dma_tasklet(unsigned long data) { struct dw_dma *dw = (struct dw_dma *)data; @@ -382,7 +458,10 @@ static void dw_dma_tasklet(unsigned long data) for (i = 0; i < dw->dma.chancnt; i++) { dwc = &dw->chan[i]; spin_lock(&dwc->lock); - if (status_err & (1 << i)) + if (test_bit(DW_DMA_IS_CYCLIC, &dwc->flags)) + dwc_handle_cyclic(dw, dwc, status_block, status_err, + status_xfer); + else if (status_err & (1 << i)) dwc_handle_error(dw, dwc); else if ((status_block | status_xfer) & (1 << i)) dwc_scan_descriptors(dw, dwc); @@ -883,6 +962,257 @@ static void dwc_free_chan_resources(struct dma_chan *chan) dev_vdbg(chan2dev(chan), "free_chan_resources done\n"); } +/* --------------------- Cyclic DMA API extensions -------------------- */ + +/** + * dw_dma_cyclic_start - start the cyclic DMA transfer + * @chan: the DMA channel to start + * + * Must be called with soft interrupts disabled. Returns zero on success or + * -errno on failure. + */ +int dw_dma_cyclic_start(struct dma_chan *chan) +{ + struct dw_dma_chan *dwc = to_dw_dma_chan(chan); + struct dw_dma *dw = to_dw_dma(dwc->chan.device); + + if (!test_bit(DW_DMA_IS_CYCLIC, &dwc->flags)) { + dev_err(chan2dev(&dwc->chan), "missing prep for cyclic DMA\n"); + return -ENODEV; + } + + spin_lock(&dwc->lock); + + /* assert channel is idle */ + if (dma_readl(dw, CH_EN) & dwc->mask) { + dev_err(chan2dev(&dwc->chan), + "BUG: Attempted to start non-idle channel\n"); + dev_err(chan2dev(&dwc->chan), + " SAR: 0x%x DAR: 0x%x LLP: 0x%x CTL: 0x%x:%08x\n", + channel_readl(dwc, SAR), + channel_readl(dwc, DAR), + channel_readl(dwc, LLP), + channel_readl(dwc, CTL_HI), + channel_readl(dwc, CTL_LO)); + spin_unlock(&dwc->lock); + return -EBUSY; + } + + dma_writel(dw, CLEAR.BLOCK, dwc->mask); + dma_writel(dw, CLEAR.ERROR, dwc->mask); + dma_writel(dw, CLEAR.XFER, dwc->mask); + + /* setup DMAC channel registers */ + channel_writel(dwc, LLP, dwc->cdesc->desc[0]->txd.phys); + channel_writel(dwc, CTL_LO, DWC_CTLL_LLP_D_EN | DWC_CTLL_LLP_S_EN); + channel_writel(dwc, CTL_HI, 0); + + channel_set_bit(dw, CH_EN, dwc->mask); + + spin_unlock(&dwc->lock); + + return 0; +} +EXPORT_SYMBOL(dw_dma_cyclic_start); + +/** + * dw_dma_cyclic_stop - stop the cyclic DMA transfer + * @chan: the DMA channel to stop + * + * Must be called with soft interrupts disabled. 
+ */ +void dw_dma_cyclic_stop(struct dma_chan *chan) +{ + struct dw_dma_chan *dwc = to_dw_dma_chan(chan); + struct dw_dma *dw = to_dw_dma(dwc->chan.device); + + spin_lock(&dwc->lock); + + channel_clear_bit(dw, CH_EN, dwc->mask); + while (dma_readl(dw, CH_EN) & dwc->mask) + cpu_relax(); + + spin_unlock(&dwc->lock); +} +EXPORT_SYMBOL(dw_dma_cyclic_stop); + +/** + * dw_dma_cyclic_prep - prepare the cyclic DMA transfer + * @chan: the DMA channel to prepare + * @buf_addr: physical DMA address where the buffer starts + * @buf_len: total number of bytes for the entire buffer + * @period_len: number of bytes for each period + * @direction: transfer direction, to or from device + * + * Must be called before trying to start the transfer. Returns a valid struct + * dw_cyclic_desc if successful or an ERR_PTR(-errno) if not successful. + */ +struct dw_cyclic_desc *dw_dma_cyclic_prep(struct dma_chan *chan, + dma_addr_t buf_addr, size_t buf_len, size_t period_len, + enum dma_data_direction direction) +{ + struct dw_dma_chan *dwc = to_dw_dma_chan(chan); + struct dw_cyclic_desc *cdesc; + struct dw_cyclic_desc *retval = NULL; + struct dw_desc *desc; + struct dw_desc *last = NULL; + struct dw_dma_slave *dws = chan->private; + unsigned long was_cyclic; + unsigned int reg_width; + unsigned int periods; + unsigned int i; + + spin_lock_bh(&dwc->lock); + if (!list_empty(&dwc->queue) || !list_empty(&dwc->active_list)) { + spin_unlock_bh(&dwc->lock); + dev_dbg(chan2dev(&dwc->chan), + "queue and/or active list are not empty\n"); + return ERR_PTR(-EBUSY); + } + + was_cyclic = test_and_set_bit(DW_DMA_IS_CYCLIC, &dwc->flags); + spin_unlock_bh(&dwc->lock); + if (was_cyclic) { + dev_dbg(chan2dev(&dwc->chan), + "channel already prepared for cyclic DMA\n"); + return ERR_PTR(-EBUSY); + } + + retval = ERR_PTR(-EINVAL); + reg_width = dws->reg_width; + periods = buf_len / period_len; + + /* Check for too big/unaligned periods and unaligned DMA buffer. 
*/ + if (period_len > (DWC_MAX_COUNT << reg_width)) + goto out_err; + if (unlikely(period_len & ((1 << reg_width) - 1))) + goto out_err; + if (unlikely(buf_addr & ((1 << reg_width) - 1))) + goto out_err; + if (unlikely(!(direction & (DMA_TO_DEVICE | DMA_FROM_DEVICE)))) + goto out_err; + + retval = ERR_PTR(-ENOMEM); + + if (periods > NR_DESCS_PER_CHANNEL) + goto out_err; + + cdesc = kzalloc(sizeof(struct dw_cyclic_desc), GFP_KERNEL); + if (!cdesc) + goto out_err; + + cdesc->desc = kzalloc(sizeof(struct dw_desc *) * periods, GFP_KERNEL); + if (!cdesc->desc) + goto out_err_alloc; + + for (i = 0; i < periods; i++) { + desc = dwc_desc_get(dwc); + if (!desc) + goto out_err_desc_get; + + switch (direction) { + case DMA_TO_DEVICE: + desc->lli.dar = dws->tx_reg; + desc->lli.sar = buf_addr + (period_len * i); + desc->lli.ctllo = (DWC_DEFAULT_CTLLO + | DWC_CTLL_DST_WIDTH(reg_width) + | DWC_CTLL_SRC_WIDTH(reg_width) + | DWC_CTLL_DST_FIX + | DWC_CTLL_SRC_INC + | DWC_CTLL_FC_M2P + | DWC_CTLL_INT_EN); + break; + case DMA_FROM_DEVICE: + desc->lli.dar = buf_addr + (period_len * i); + desc->lli.sar = dws->rx_reg; + desc->lli.ctllo = (DWC_DEFAULT_CTLLO + | DWC_CTLL_SRC_WIDTH(reg_width) + | DWC_CTLL_DST_WIDTH(reg_width) + | DWC_CTLL_DST_INC + | DWC_CTLL_SRC_FIX + | DWC_CTLL_FC_P2M + | DWC_CTLL_INT_EN); + break; + default: + break; + } + + desc->lli.ctlhi = (period_len >> reg_width); + cdesc->desc[i] = desc; + + if (last) { + last->lli.llp = desc->txd.phys; + dma_sync_single_for_device(chan2parent(chan), + last->txd.phys, sizeof(last->lli), + DMA_TO_DEVICE); + } + + last = desc; + } + + /* lets make a cyclic list */ + last->lli.llp = cdesc->desc[0]->txd.phys; + dma_sync_single_for_device(chan2parent(chan), last->txd.phys, + sizeof(last->lli), DMA_TO_DEVICE); + + dev_dbg(chan2dev(&dwc->chan), "cyclic prepared buf 0x%08x len %zu " + "period %zu periods %d\n", buf_addr, buf_len, + period_len, periods); + + cdesc->periods = periods; + dwc->cdesc = cdesc; + + return cdesc; + +out_err_desc_get: + while (i--) + dwc_desc_put(dwc, cdesc->desc[i]); +out_err_alloc: + kfree(cdesc); +out_err: + clear_bit(DW_DMA_IS_CYCLIC, &dwc->flags); + return (struct dw_cyclic_desc *)retval; +} +EXPORT_SYMBOL(dw_dma_cyclic_prep); + +/** + * dw_dma_cyclic_free - free a prepared cyclic DMA transfer + * @chan: the DMA channel to free + */ +void dw_dma_cyclic_free(struct dma_chan *chan) +{ + struct dw_dma_chan *dwc = to_dw_dma_chan(chan); + struct dw_dma *dw = to_dw_dma(dwc->chan.device); + struct dw_cyclic_desc *cdesc = dwc->cdesc; + int i; + + dev_dbg(chan2dev(&dwc->chan), "cyclic free\n"); + + if (!cdesc) + return; + + spin_lock_bh(&dwc->lock); + + channel_clear_bit(dw, CH_EN, dwc->mask); + while (dma_readl(dw, CH_EN) & dwc->mask) + cpu_relax(); + + dma_writel(dw, CLEAR.BLOCK, dwc->mask); + dma_writel(dw, CLEAR.ERROR, dwc->mask); + dma_writel(dw, CLEAR.XFER, dwc->mask); + + spin_unlock_bh(&dwc->lock); + + for (i = 0; i < cdesc->periods; i++) + dwc_desc_put(dwc, cdesc->desc[i]); + + kfree(cdesc->desc); + kfree(cdesc); + + clear_bit(DW_DMA_IS_CYCLIC, &dwc->flags); +} +EXPORT_SYMBOL(dw_dma_cyclic_free); + /*----------------------------------------------------------------------*/ static void dw_dma_off(struct dw_dma *dw) diff --git a/drivers/dma/dw_dmac_regs.h b/drivers/dma/dw_dmac_regs.h index b252b202c5cf..13a580767031 100644 --- a/drivers/dma/dw_dmac_regs.h +++ b/drivers/dma/dw_dmac_regs.h @@ -126,6 +126,10 @@ struct dw_dma_regs { #define DW_REGLEN 0x400 +enum dw_dmac_flags { + DW_DMA_IS_CYCLIC = 0, +}; + struct dw_dma_chan { struct 
dma_chan chan; void __iomem *ch_regs; @@ -134,10 +138,12 @@ struct dw_dma_chan { spinlock_t lock; /* these other elements are all protected by lock */ + unsigned long flags; dma_cookie_t completed; struct list_head active_list; struct list_head queue; struct list_head free_list; + struct dw_cyclic_desc *cdesc; unsigned int descs_allocated; }; @@ -158,7 +164,6 @@ static inline struct dw_dma_chan *to_dw_dma_chan(struct dma_chan *chan) return container_of(chan, struct dw_dma_chan, chan); } - struct dw_dma { struct dma_device dma; void __iomem *regs; diff --git a/include/linux/dw_dmac.h b/include/linux/dw_dmac.h index d797dde247f7..c8aad713a046 100644 --- a/include/linux/dw_dmac.h +++ b/include/linux/dw_dmac.h @@ -74,4 +74,23 @@ struct dw_dma_slave { #define DWC_CFGL_HS_DST_POL (1 << 18) /* dst handshake active low */ #define DWC_CFGL_HS_SRC_POL (1 << 19) /* src handshake active low */ +/* DMA API extensions */ +struct dw_cyclic_desc { + struct dw_desc **desc; + unsigned long periods; + void (*period_callback)(void *param); + void *period_callback_param; +}; + +struct dw_cyclic_desc *dw_dma_cyclic_prep(struct dma_chan *chan, + dma_addr_t buf_addr, size_t buf_len, size_t period_len, + enum dma_data_direction direction); +void dw_dma_cyclic_free(struct dma_chan *chan); +int dw_dma_cyclic_start(struct dma_chan *chan); +void dw_dma_cyclic_stop(struct dma_chan *chan); + +dma_addr_t dw_dma_get_src_addr(struct dma_chan *chan); + +dma_addr_t dw_dma_get_dst_addr(struct dma_chan *chan); + #endif /* DW_DMAC_H */ -- cgit v1.2.3 From 45194e4f89fbdd97a2b7d2698c05f0b00c19e820 Mon Sep 17 00:00:00 2001 From: Cheng Renquan Date: Thu, 2 Apr 2009 19:55:28 +0100 Subject: dm target: remove struct tt_internal The tt_internal is really just a list_head to manage registered target_type in a double linked list, Here embed the list_head into target_type directly, 1. to avoid kmalloc/kfree; 2. 
then tt_internal is really unneeded; Cc: stable@kernel.org Signed-off-by: Cheng Renquan Signed-off-by: Alasdair G Kergon Reviewed-by: Alasdair G Kergon --- drivers/md/dm-target.c | 90 +++++++++++++++---------------------------- drivers/md/dm.h | 2 +- include/linux/device-mapper.h | 3 ++ 3 files changed, 34 insertions(+), 61 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c index db72c9497bb4..04feccf2a997 100644 --- a/drivers/md/dm-target.c +++ b/drivers/md/dm-target.c @@ -14,40 +14,34 @@ #define DM_MSG_PREFIX "target" -struct tt_internal { - struct target_type tt; - - struct list_head list; -}; - static LIST_HEAD(_targets); static DECLARE_RWSEM(_lock); #define DM_MOD_NAME_SIZE 32 -static inline struct tt_internal *__find_target_type(const char *name) +static inline struct target_type *__find_target_type(const char *name) { - struct tt_internal *ti; + struct target_type *tt; - list_for_each_entry (ti, &_targets, list) - if (!strcmp(name, ti->tt.name)) - return ti; + list_for_each_entry(tt, &_targets, list) + if (!strcmp(name, tt->name)) + return tt; return NULL; } -static struct tt_internal *get_target_type(const char *name) +static struct target_type *get_target_type(const char *name) { - struct tt_internal *ti; + struct target_type *tt; down_read(&_lock); - ti = __find_target_type(name); - if (ti && !try_module_get(ti->tt.module)) - ti = NULL; + tt = __find_target_type(name); + if (tt && !try_module_get(tt->module)) + tt = NULL; up_read(&_lock); - return ti; + return tt; } static void load_module(const char *name) @@ -57,83 +51,59 @@ static void load_module(const char *name) struct target_type *dm_get_target_type(const char *name) { - struct tt_internal *ti = get_target_type(name); + struct target_type *tt = get_target_type(name); - if (!ti) { + if (!tt) { load_module(name); - ti = get_target_type(name); + tt = get_target_type(name); } - return ti ? 
&ti->tt : NULL; + return tt; } -void dm_put_target_type(struct target_type *t) +void dm_put_target_type(struct target_type *tt) { - struct tt_internal *ti = (struct tt_internal *) t; - down_read(&_lock); - module_put(ti->tt.module); + module_put(tt->module); up_read(&_lock); - - return; } -static struct tt_internal *alloc_target(struct target_type *t) -{ - struct tt_internal *ti = kzalloc(sizeof(*ti), GFP_KERNEL); - - if (ti) - ti->tt = *t; - - return ti; -} - - int dm_target_iterate(void (*iter_func)(struct target_type *tt, void *param), void *param) { - struct tt_internal *ti; + struct target_type *tt; down_read(&_lock); - list_for_each_entry (ti, &_targets, list) - iter_func(&ti->tt, param); + list_for_each_entry(tt, &_targets, list) + iter_func(tt, param); up_read(&_lock); return 0; } -int dm_register_target(struct target_type *t) +int dm_register_target(struct target_type *tt) { int rv = 0; - struct tt_internal *ti = alloc_target(t); - - if (!ti) - return -ENOMEM; down_write(&_lock); - if (__find_target_type(t->name)) + if (__find_target_type(tt->name)) rv = -EEXIST; else - list_add(&ti->list, &_targets); + list_add(&tt->list, &_targets); up_write(&_lock); - if (rv) - kfree(ti); return rv; } -void dm_unregister_target(struct target_type *t) +void dm_unregister_target(struct target_type *tt) { - struct tt_internal *ti; - down_write(&_lock); - if (!(ti = __find_target_type(t->name))) { - DMCRIT("Unregistering unrecognised target: %s", t->name); + if (!__find_target_type(tt->name)) { + DMCRIT("Unregistering unrecognised target: %s", tt->name); BUG(); } - list_del(&ti->list); - kfree(ti); + list_del(&tt->list); up_write(&_lock); } @@ -142,17 +112,17 @@ void dm_unregister_target(struct target_type *t) * io-err: always fails an io, useful for bringing * up LVs that have holes in them. */ -static int io_err_ctr(struct dm_target *ti, unsigned int argc, char **args) +static int io_err_ctr(struct dm_target *tt, unsigned int argc, char **args) { return 0; } -static void io_err_dtr(struct dm_target *ti) +static void io_err_dtr(struct dm_target *tt) { /* empty */ } -static int io_err_map(struct dm_target *ti, struct bio *bio, +static int io_err_map(struct dm_target *tt, struct bio *bio, union map_info *map_context) { return -EIO; diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 20194e000c5a..b48397c0abbd 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -60,7 +60,7 @@ int dm_table_barrier_ok(struct dm_table *t); int dm_target_init(void); void dm_target_exit(void); struct target_type *dm_get_target_type(const char *name); -void dm_put_target_type(struct target_type *t); +void dm_put_target_type(struct target_type *tt); int dm_target_iterate(void (*iter_func)(struct target_type *tt, void *param), void *param); diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 8209e08969f9..66ec05a57955 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -139,6 +139,9 @@ struct target_type { dm_ioctl_fn ioctl; dm_merge_fn merge; dm_busy_fn busy; + + /* For internal device-mapper use. */ + struct list_head list; }; struct io_restrictions { -- cgit v1.2.3 From ec44ab9d6681ddf9026b593e866bec9c0e075e1d Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 2 Apr 2009 19:55:30 +0100 Subject: dm log: remove struct dm_dirty_log_internal Remove the 'dm_dirty_log_internal' structure. The resulting cleanup eliminates extra memory allocations. Therefore exposing the internal list_head to the external 'dm_dirty_log_type' structure is a worthwhile compromise. 
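For illustration only (not part of this patch), the caller-visible effect: a log module now registers its own statically allocated type, which is linked straight into the internal _log_types list, so the object must stay valid while registered but no per-registration allocation is needed. A sketch with placeholder "mylog" names:

    static struct dm_dirty_log_type _mylog_type = {
            .name   = "mylog",
            .module = THIS_MODULE,
            /* .ctr/.dtr/.mark_region/... as usual; .list is for
             * internal device-mapper use and needs no initialisation */
    };

    static int __init mylog_init(void)
    {
            return dm_dirty_log_type_register(&_mylog_type);
    }

    static void __exit mylog_exit(void)
    {
            dm_dirty_log_type_unregister(&_mylog_type);
    }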
Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-log.c | 58 +++++++++++--------------------------------- include/linux/dm-dirty-log.h | 3 +++ 2 files changed, 17 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index 094c8f0e0097..be233bc4d917 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c @@ -16,34 +16,28 @@ #define DM_MSG_PREFIX "dirty region log" -struct dm_dirty_log_internal { - struct dm_dirty_log_type *type; - - struct list_head list; -}; - static LIST_HEAD(_log_types); static DEFINE_SPINLOCK(_lock); -static struct dm_dirty_log_internal *__find_dirty_log_type(const char *name) +static struct dm_dirty_log_type *__find_dirty_log_type(const char *name) { - struct dm_dirty_log_internal *log_type; + struct dm_dirty_log_type *log_type; list_for_each_entry(log_type, &_log_types, list) - if (!strcmp(name, log_type->type->name)) + if (!strcmp(name, log_type->name)) return log_type; return NULL; } -static struct dm_dirty_log_internal *_get_dirty_log_type(const char *name) +static struct dm_dirty_log_type *_get_dirty_log_type(const char *name) { - struct dm_dirty_log_internal *log_type; + struct dm_dirty_log_type *log_type; spin_lock(&_lock); log_type = __find_dirty_log_type(name); - if (log_type && !try_module_get(log_type->type->module)) + if (log_type && !try_module_get(log_type->module)) log_type = NULL; spin_unlock(&_lock); @@ -71,14 +65,14 @@ static struct dm_dirty_log_internal *_get_dirty_log_type(const char *name) static struct dm_dirty_log_type *get_type(const char *type_name) { char *p, *type_name_dup; - struct dm_dirty_log_internal *log_type; + struct dm_dirty_log_type *log_type; if (!type_name) return NULL; log_type = _get_dirty_log_type(type_name); if (log_type) - return log_type->type; + return log_type; type_name_dup = kstrdup(type_name, GFP_KERNEL); if (!type_name_dup) { @@ -100,19 +94,16 @@ static struct dm_dirty_log_type *get_type(const char *type_name) kfree(type_name_dup); - return log_type ? 
log_type->type : NULL; + return log_type; } static void put_type(struct dm_dirty_log_type *type) { - struct dm_dirty_log_internal *log_type; - if (!type) return; spin_lock(&_lock); - log_type = __find_dirty_log_type(type->name); - if (!log_type) + if (!__find_dirty_log_type(type->name)) goto out; module_put(type->module); @@ -121,32 +112,15 @@ out: spin_unlock(&_lock); } -static struct dm_dirty_log_internal *_alloc_dirty_log_type(struct dm_dirty_log_type *type) -{ - struct dm_dirty_log_internal *log_type = kzalloc(sizeof(*log_type), - GFP_KERNEL); - - if (log_type) - log_type->type = type; - - return log_type; -} - int dm_dirty_log_type_register(struct dm_dirty_log_type *type) { - struct dm_dirty_log_internal *log_type = _alloc_dirty_log_type(type); int r = 0; - if (!log_type) - return -ENOMEM; - spin_lock(&_lock); if (!__find_dirty_log_type(type->name)) - list_add(&log_type->list, &_log_types); - else { - kfree(log_type); + list_add(&type->list, &_log_types); + else r = -EEXIST; - } spin_unlock(&_lock); return r; @@ -155,20 +129,16 @@ EXPORT_SYMBOL(dm_dirty_log_type_register); int dm_dirty_log_type_unregister(struct dm_dirty_log_type *type) { - struct dm_dirty_log_internal *log_type; - spin_lock(&_lock); - log_type = __find_dirty_log_type(type->name); - if (!log_type) { + if (!__find_dirty_log_type(type->name)) { spin_unlock(&_lock); return -EINVAL; } - list_del(&log_type->list); + list_del(&type->list); spin_unlock(&_lock); - kfree(log_type); return 0; } diff --git a/include/linux/dm-dirty-log.h b/include/linux/dm-dirty-log.h index 600c5fb2daad..727602b686d4 100644 --- a/include/linux/dm-dirty-log.h +++ b/include/linux/dm-dirty-log.h @@ -28,6 +28,9 @@ struct dm_dirty_log_type { const char *name; struct module *module; + /* For internal device-mapper use */ + struct list_head list; + int (*ctr)(struct dm_dirty_log *log, struct dm_target *ti, unsigned argc, char **argv); void (*dtr)(struct dm_dirty_log *log); -- cgit v1.2.3 From 7513c2a761d69d2a93f17146b3563527d3618ba0 Mon Sep 17 00:00:00 2001 From: Jonathan Brassow Date: Thu, 2 Apr 2009 19:55:30 +0100 Subject: dm raid1: add is_remote_recovering hook for clusters The logging API needs an extra function to make cluster mirroring possible. This new function allows us to check whether a mirror region is being recovered on another machine in the cluster. This helps us prevent simultaneous recovery I/O and process I/O to the same locations on disk. Cluster-aware log modules will implement this function. Single machine log modules will not. So, there is no performance penalty for single machine mirrors. 
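For illustration only (not part of this patch), a sketch of how a cluster-aware log module might provide the hook; cluster_region_recovering() is a hypothetical helper standing in for the real cluster query. Single-machine log types simply leave .is_remote_recovering NULL, which is why the write path tests the pointer before calling it:

    /* may block while the cluster log is consulted */
    static int cluster_is_remote_recovering(struct dm_dirty_log *log,
                                            region_t region)
    {
            /* hypothetical helper querying the other nodes */
            return cluster_region_recovering(log->context, region);
    }

    static struct dm_dirty_log_type _cluster_log_type = {
            /* .name/.module/.ctr/... as for any other log type */
            .is_remote_recovering = cluster_is_remote_recovering,
    };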
Signed-off-by: Jonathan Brassow Acked-by: Heinz Mauelshagen Signed-off-by: Alasdair G Kergon --- drivers/md/dm-raid1.c | 25 +++++++++++++++++++++++-- include/linux/dm-dirty-log.h | 10 ++++++++++ 2 files changed, 33 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 62d594889ac3..536ef0bef154 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -588,6 +588,9 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes) int state; struct bio *bio; struct bio_list sync, nosync, recover, *this_list = NULL; + struct bio_list requeue; + struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); + region_t region; if (!writes->head) return; @@ -598,10 +601,18 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes) bio_list_init(&sync); bio_list_init(&nosync); bio_list_init(&recover); + bio_list_init(&requeue); while ((bio = bio_list_pop(writes))) { - state = dm_rh_get_state(ms->rh, - dm_rh_bio_to_region(ms->rh, bio), 1); + region = dm_rh_bio_to_region(ms->rh, bio); + + if (log->type->is_remote_recovering && + log->type->is_remote_recovering(log, region)) { + bio_list_add(&requeue, bio); + continue; + } + + state = dm_rh_get_state(ms->rh, region, 1); switch (state) { case DM_RH_CLEAN: case DM_RH_DIRTY: @@ -620,6 +631,16 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes) bio_list_add(this_list, bio); } + /* + * Add bios that are delayed due to remote recovery + * back on to the write queue + */ + if (unlikely(requeue.head)) { + spin_lock_irq(&ms->lock); + bio_list_merge(&ms->writes, &requeue); + spin_unlock_irq(&ms->lock); + } + /* * Increment the pending counts for any regions that will * be written to (writes to recover regions are going to diff --git a/include/linux/dm-dirty-log.h b/include/linux/dm-dirty-log.h index 727602b686d4..5e8b11d88f6f 100644 --- a/include/linux/dm-dirty-log.h +++ b/include/linux/dm-dirty-log.h @@ -116,6 +116,16 @@ struct dm_dirty_log_type { */ int (*status)(struct dm_dirty_log *log, status_type_t status_type, char *result, unsigned maxlen); + + /* + * is_remote_recovering is necessary for cluster mirroring. It provides + * a way to detect recovery on another node, so we aren't writing + * concurrently. This function is likely to block (when a cluster log + * is used). 
+ * + * Returns: 0, 1 + */ + int (*is_remote_recovering)(struct dm_dirty_log *log, region_t region); }; int dm_dirty_log_type_register(struct dm_dirty_log_type *type); -- cgit v1.2.3 From ee3b4290aec03022cfb67c9adba9f1b3215245f0 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Thu, 2 Apr 2009 16:56:30 -0700 Subject: generic debug pagealloc: build fix This fixes a build failure with generic debug pagealloc: mm/debug-pagealloc.c: In function 'set_page_poison': mm/debug-pagealloc.c:8: error: 'struct page' has no member named 'debug_flags' mm/debug-pagealloc.c: In function 'clear_page_poison': mm/debug-pagealloc.c:13: error: 'struct page' has no member named 'debug_flags' mm/debug-pagealloc.c: In function 'page_poison': mm/debug-pagealloc.c:18: error: 'struct page' has no member named 'debug_flags' mm/debug-pagealloc.c: At top level: mm/debug-pagealloc.c:120: error: redefinition of 'kernel_map_pages' include/linux/mm.h:1278: error: previous definition of 'kernel_map_pages' was here mm/debug-pagealloc.c: In function 'kernel_map_pages': mm/debug-pagealloc.c:122: error: 'debug_pagealloc_enabled' undeclared (first use in this function) by fixing - debug_flags should be in struct page - define DEBUG_PAGEALLOC config option for all architectures Signed-off-by: Akinobu Mita Reported-by: Alexander Beregalov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/Kconfig.debug | 10 ---------- arch/s390/Kconfig.debug | 9 --------- arch/sparc/Kconfig.debug | 9 --------- arch/x86/Kconfig.debug | 9 --------- include/linux/mm_types.h | 6 +++--- mm/Kconfig.debug | 9 +++++++++ 6 files changed, 12 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug index 6aa0b5e087cd..a1098e23221f 100644 --- a/arch/powerpc/Kconfig.debug +++ b/arch/powerpc/Kconfig.debug @@ -27,16 +27,6 @@ config DEBUG_STACK_USAGE This option will slow down process creation somewhat. -config DEBUG_PAGEALLOC - bool "Debug page memory allocations" - depends on DEBUG_KERNEL && !HIBERNATION - depends on ARCH_SUPPORTS_DEBUG_PAGEALLOC - help - Unmap pages from the kernel linear mapping after free_pages(). - This results in a large slowdown, but helps to find certain types - of memory corruptions. - - config HCALL_STATS bool "Hypervisor call instrumentation" depends on PPC_PSERIES && DEBUG_FS diff --git a/arch/s390/Kconfig.debug b/arch/s390/Kconfig.debug index 7e297a3cde34..2283933a9a93 100644 --- a/arch/s390/Kconfig.debug +++ b/arch/s390/Kconfig.debug @@ -6,13 +6,4 @@ config TRACE_IRQFLAGS_SUPPORT source "lib/Kconfig.debug" -config DEBUG_PAGEALLOC - bool "Debug page memory allocations" - depends on DEBUG_KERNEL - depends on ARCH_SUPPORTS_DEBUG_PAGEALLOC - help - Unmap pages from the kernel linear mapping after free_pages(). - This results in a slowdown, but helps to find certain types of - memory corruptions. - endmenu diff --git a/arch/sparc/Kconfig.debug b/arch/sparc/Kconfig.debug index d001b42041a5..90d5fe223a74 100644 --- a/arch/sparc/Kconfig.debug +++ b/arch/sparc/Kconfig.debug @@ -22,15 +22,6 @@ config DEBUG_DCFLUSH config STACK_DEBUG bool "Stack Overflow Detection Support" -config DEBUG_PAGEALLOC - bool "Debug page memory allocations" - depends on DEBUG_KERNEL && !HIBERNATION - depends on ARCH_SUPPORTS_DEBUG_PAGEALLOC - help - Unmap pages from the kernel linear mapping after free_pages(). - This results in a large slowdown, but helps to find certain types - of memory corruptions. 
- config MCOUNT bool depends on SPARC64 diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index a345cb5447a8..d8359e73317f 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -72,15 +72,6 @@ config DEBUG_STACK_USAGE This option will slow down process creation somewhat. -config DEBUG_PAGEALLOC - bool "Debug page memory allocations" - depends on DEBUG_KERNEL - depends on ARCH_SUPPORTS_DEBUG_PAGEALLOC - ---help--- - Unmap pages from the kernel linear mapping after free_pages(). - This results in a large slowdown, but helps to find certain types - of memory corruptions. - config DEBUG_PER_CPU_MAPS bool "Debug access to per_cpu maps" depends on DEBUG_KERNEL diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index ddadb4defe00..0e80e26ecf21 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -95,6 +95,9 @@ struct page { void *virtual; /* Kernel virtual address (NULL if not kmapped, ie. highmem) */ #endif /* WANT_PAGE_VIRTUAL */ +#ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS + unsigned long debug_flags; /* Use atomic bitops on this */ +#endif }; /* @@ -175,9 +178,6 @@ struct vm_area_struct { #ifdef CONFIG_NUMA struct mempolicy *vm_policy; /* NUMA policy for the VMA */ #endif -#ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS - unsigned long debug_flags; /* Use atomic bitops on this */ -#endif }; struct core_thread { diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index c8d62d49a44e..bb01e298f260 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -1,3 +1,12 @@ +config DEBUG_PAGEALLOC + bool "Debug page memory allocations" + depends on DEBUG_KERNEL && ARCH_SUPPORTS_DEBUG_PAGEALLOC + depends on !HIBERNATION || !PPC && !SPARC + ---help--- + Unmap pages from the kernel linear mapping after free_pages(). + This results in a large slowdown, but helps to find certain types + of memory corruptions. + config WANT_PAGE_DEBUG_FLAGS bool -- cgit v1.2.3 From 33e5d76979cf01e3834814fe0aea569d1d602c1a Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 2 Apr 2009 16:56:32 -0700 Subject: nommu: fix a number of issues with the per-MM VMA patch Fix a number of issues with the per-MM VMA patch: (1) Make mmap_pages_allocated an atomic_long_t, just in case this is used on a NOMMU system with more than 2G pages. Makes no difference on a 32-bit system. (2) Report vma->vm_pgoff * PAGE_SIZE as a 64-bit value, not a 32-bit value, lest it overflow. (3) Move the allocation of the vm_area_struct slab back for fork.c. (4) Use KMEM_CACHE() for both vm_area_struct and vm_region slabs. (5) Use BUG_ON() rather than if () BUG(). (6) Make the default validate_nommu_regions() a static inline rather than a #define. (7) Make free_page_series()'s objection to pages with a refcount != 1 more informative. (8) Adjust the __put_nommu_region() banner comment to indicate that the semaphore must be held for writing. (9) Limit the number of warnings about munmaps of non-mmapped regions. 
Reported-by: Andrew Morton Signed-off-by: David Howells Cc: Greg Ungerer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/meminfo.c | 2 +- fs/proc/task_nommu.c | 4 ++-- include/linux/mm.h | 2 +- kernel/fork.c | 1 + mm/mmap.c | 3 --- mm/nommu.c | 52 +++++++++++++++++++++++++--------------------------- 6 files changed, 30 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 43d23948384a..74ea974f5ca6 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -120,7 +120,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) K(i.freeram-i.freehigh), #endif #ifndef CONFIG_MMU - K((unsigned long) atomic_read(&mmap_pages_allocated)), + K((unsigned long) atomic_long_read(&mmap_pages_allocated)), #endif K(i.totalswap), K(i.freeswap), diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 343ea1216bc8..370be0a2c909 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -136,14 +136,14 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) } seq_printf(m, - "%08lx-%08lx %c%c%c%c %08lx %02x:%02x %lu %n", + "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", vma->vm_start, vma->vm_end, flags & VM_READ ? 'r' : '-', flags & VM_WRITE ? 'w' : '-', flags & VM_EXEC ? 'x' : '-', flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', - vma->vm_pgoff << PAGE_SHIFT, + (unsigned long long) vma->vm_pgoff << PAGE_SHIFT, MAJOR(dev), MINOR(dev), ino, &len); if (file) { diff --git a/include/linux/mm.h b/include/linux/mm.h index aeabe953ba4f..bff1f0d475c7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1079,7 +1079,7 @@ static inline void setup_per_cpu_pageset(void) {} #endif /* nommu.c */ -extern atomic_t mmap_pages_allocated; +extern atomic_long_t mmap_pages_allocated; /* prio_tree.c */ void vma_prio_tree_add(struct vm_area_struct *, struct vm_area_struct *old); diff --git a/kernel/fork.c b/kernel/fork.c index 47c15840a381..51d1aa21483b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1488,6 +1488,7 @@ void __init proc_caches_init(void) mm_cachep = kmem_cache_create("mm_struct", sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC); mmap_init(); } diff --git a/mm/mmap.c b/mm/mmap.c index 1abb9185a686..4a3841186c11 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2481,7 +2481,4 @@ void mm_drop_all_locks(struct mm_struct *mm) */ void __init mmap_init(void) { - vm_area_cachep = kmem_cache_create("vm_area_struct", - sizeof(struct vm_area_struct), 0, - SLAB_PANIC, NULL); } diff --git a/mm/nommu.c b/mm/nommu.c index 2fcf47d449b4..72eda4aee2cb 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -69,7 +69,7 @@ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; int sysctl_nr_trim_pages = 1; /* page trimming behaviour */ int heap_stack_gap = 0; -atomic_t mmap_pages_allocated; +atomic_long_t mmap_pages_allocated; EXPORT_SYMBOL(mem_map); EXPORT_SYMBOL(num_physpages); @@ -463,12 +463,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) */ void __init mmap_init(void) { - vm_region_jar = kmem_cache_create("vm_region_jar", - sizeof(struct vm_region), 0, - SLAB_PANIC, NULL); - vm_area_cachep = kmem_cache_create("vm_area_struct", - sizeof(struct vm_area_struct), 0, - SLAB_PANIC, NULL); + vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC); } /* @@ -486,27 +481,24 @@ static noinline void validate_nommu_regions(void) return; last = rb_entry(lastp, struct vm_region, vm_rb); - if (unlikely(last->vm_end <= last->vm_start)) - 
BUG(); - if (unlikely(last->vm_top < last->vm_end)) - BUG(); + BUG_ON(unlikely(last->vm_end <= last->vm_start)); + BUG_ON(unlikely(last->vm_top < last->vm_end)); while ((p = rb_next(lastp))) { region = rb_entry(p, struct vm_region, vm_rb); last = rb_entry(lastp, struct vm_region, vm_rb); - if (unlikely(region->vm_end <= region->vm_start)) - BUG(); - if (unlikely(region->vm_top < region->vm_end)) - BUG(); - if (unlikely(region->vm_start < last->vm_top)) - BUG(); + BUG_ON(unlikely(region->vm_end <= region->vm_start)); + BUG_ON(unlikely(region->vm_top < region->vm_end)); + BUG_ON(unlikely(region->vm_start < last->vm_top)); lastp = p; } } #else -#define validate_nommu_regions() do {} while(0) +static void validate_nommu_regions(void) +{ +} #endif /* @@ -563,16 +555,17 @@ static void free_page_series(unsigned long from, unsigned long to) struct page *page = virt_to_page(from); kdebug("- free %lx", from); - atomic_dec(&mmap_pages_allocated); + atomic_long_dec(&mmap_pages_allocated); if (page_count(page) != 1) - kdebug("free page %p [%d]", page, page_count(page)); + kdebug("free page %p: refcount not one: %d", + page, page_count(page)); put_page(page); } } /* * release a reference to a region - * - the caller must hold the region semaphore, which this releases + * - the caller must hold the region semaphore for writing, which this releases * - the region may not have been added to the tree yet, in which case vm_top * will equal vm_start */ @@ -1096,7 +1089,7 @@ static int do_mmap_private(struct vm_area_struct *vma, goto enomem; total = 1 << order; - atomic_add(total, &mmap_pages_allocated); + atomic_long_add(total, &mmap_pages_allocated); point = rlen >> PAGE_SHIFT; @@ -1107,7 +1100,7 @@ static int do_mmap_private(struct vm_area_struct *vma, order = ilog2(total - point); n = 1 << order; kdebug("shave %lu/%lu @%lu", n, total - point, total); - atomic_sub(n, &mmap_pages_allocated); + atomic_long_sub(n, &mmap_pages_allocated); total -= n; set_page_refcounted(pages + total); __free_pages(pages + total, order); @@ -1536,10 +1529,15 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) /* find the first potentially overlapping VMA */ vma = find_vma(mm, start); if (!vma) { - printk(KERN_WARNING - "munmap of memory not mmapped by process %d (%s):" - " 0x%lx-0x%lx\n", - current->pid, current->comm, start, start + len - 1); + static int limit = 0; + if (limit < 5) { + printk(KERN_WARNING + "munmap of memory not mmapped by process %d" + " (%s): 0x%lx-0x%lx\n", + current->pid, current->comm, + start, start + len - 1); + limit++; + } return -EINVAL; } -- cgit v1.2.3 From 8e2c3795c78d5c4e2e1f14ce751e9d08decbe9d3 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Thu, 2 Apr 2009 16:56:44 -0700 Subject: add fiemap.h to header-y Include fiemap.h in header-y; it defines the interface for the FS_IOC_FIEMAP file mapping ioctl. 
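For illustration only (not part of this patch), a minimal userspace sketch of the ioctl this header describes; with fm_extent_count set to zero the kernel only reports how many extents map the range, so no variable-length fm_extents[] buffer is needed (error handling trimmed, fd assumed to be an open file):

    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/fs.h>       /* FS_IOC_FIEMAP */
    #include <linux/fiemap.h>   /* struct fiemap, FIEMAP_MAX_OFFSET */

    static int count_extents(int fd)
    {
            struct fiemap fm = {
                    .fm_start        = 0,
                    .fm_length       = FIEMAP_MAX_OFFSET,  /* whole file */
                    .fm_extent_count = 0,   /* count only, return no extent records */
            };

            if (ioctl(fd, FS_IOC_FIEMAP, &fm) < 0)
                    return -1;

            printf("%u extents\n", fm.fm_mapped_extents);
            return 0;
    }

Without fiemap.h in header-y, the structure and flag definitions used above would not be exported by "make headers_install".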
Signed-off-by: Eric Sandeen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/Kbuild | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/Kbuild b/include/linux/Kbuild index a67b6227d272..ca9b9b9bd331 100644 --- a/include/linux/Kbuild +++ b/include/linux/Kbuild @@ -67,6 +67,7 @@ header-y += falloc.h header-y += fd.h header-y += fdreg.h header-y += fib_rules.h +header-y += fiemap.h header-y += firewire-cdev.h header-y += firewire-constants.h header-y += fuse.h -- cgit v1.2.3 From 9a896c9a48ac6704c0ce8ee081b836644d0afe40 Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Thu, 2 Apr 2009 16:56:45 -0700 Subject: mm: define a UNIQUE value for AS_UNEVICTABLE flag A new "address_space flag"--AS_MM_ALL_LOCKS--was defined to use the next available AS flag while the Unevictable LRU was under development. The Unevictable LRU was using the same flag and "no one" noticed. Current mainline, since 2.6.28, has same value for two symbolic flag names. So, define a unique flag value for AS_UNEVICTABLE--up close to the other flags, [at the cost of an additional #ifdef] so we'll notice next time. Note that #ifdef is not actually required, if we don't mind having the unused flag value defined. Replace #defines with an enum. Signed-off-by: Lee Schermerhorn Cc: [2.6.28.x, 2.6.29.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 01ca0856caff..076a7dc67c2b 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -18,9 +18,14 @@ * Bits in mapping->flags. The lower __GFP_BITS_SHIFT bits are the page * allocation mode flags. */ -#define AS_EIO (__GFP_BITS_SHIFT + 0) /* IO error on async write */ -#define AS_ENOSPC (__GFP_BITS_SHIFT + 1) /* ENOSPC on async write */ -#define AS_MM_ALL_LOCKS (__GFP_BITS_SHIFT + 2) /* under mm_take_all_locks() */ +enum mapping_flags { + AS_EIO = __GFP_BITS_SHIFT + 0, /* IO error on async write */ + AS_ENOSPC = __GFP_BITS_SHIFT + 1, /* ENOSPC on async write */ + AS_MM_ALL_LOCKS = __GFP_BITS_SHIFT + 2, /* under mm_take_all_locks() */ +#ifdef CONFIG_UNEVICTABLE_LRU + AS_UNEVICTABLE = __GFP_BITS_SHIFT + 3, /* e.g., ramdisk, SHM_LOCK */ +#endif +}; static inline void mapping_set_error(struct address_space *mapping, int error) { @@ -33,7 +38,6 @@ static inline void mapping_set_error(struct address_space *mapping, int error) } #ifdef CONFIG_UNEVICTABLE_LRU -#define AS_UNEVICTABLE (__GFP_BITS_SHIFT + 2) /* e.g., ramdisk, SHM_LOCK */ static inline void mapping_set_unevictable(struct address_space *mapping) { -- cgit v1.2.3 From bf6aede712334d7338d5c47a5ee5ba3883c82a61 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Thu, 2 Apr 2009 16:56:54 -0700 Subject: workqueue: add to_delayed_work() helper function It is a fairly common operation to have a pointer to a work and to need a pointer to the delayed work it is contained in. In particular, all delayed works which want to rearm themselves will have to do that. So it would seem fair to offer a helper function for this operation. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Jean Delvare Acked-by: Ingo Molnar Cc: "David S. 
Miller" Cc: Herbert Xu Cc: Benjamin Herrenschmidt Cc: Martin Schwidefsky Cc: Greg KH Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/kernel/vio.c | 2 +- drivers/crypto/hifn_795x.c | 2 +- drivers/input/mouse/hgpk.c | 2 +- drivers/net/dm9000.c | 2 +- drivers/net/mlx4/en_netdev.c | 2 +- drivers/net/mlx4/en_rx.c | 2 +- drivers/net/mlx4/sense.c | 2 +- drivers/net/phy/phy.c | 3 +-- drivers/s390/scsi/zfcp_fc.c | 2 +- drivers/staging/rtl8187se/ieee80211/ieee80211_softmac.c | 8 ++++---- drivers/staging/rtl8187se/r8180_core.c | 8 ++++---- drivers/usb/wusbcore/devconnect.c | 2 +- include/linux/workqueue.h | 5 +++++ mm/slab.c | 3 +-- 14 files changed, 24 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c index d3694498f3af..819e59f6f7c7 100644 --- a/arch/powerpc/kernel/vio.c +++ b/arch/powerpc/kernel/vio.c @@ -482,7 +482,7 @@ static void vio_cmo_balance(struct work_struct *work) cmo->excess.size = cmo->entitled - cmo->reserve.size; cmo->excess.free = cmo->excess.size - need; - cancel_delayed_work(container_of(work, struct delayed_work, work)); + cancel_delayed_work(to_delayed_work(work)); spin_unlock_irqrestore(&vio_cmo.lock, flags); } diff --git a/drivers/crypto/hifn_795x.c b/drivers/crypto/hifn_795x.c index 0c79fe7f1567..4d85402a9e4a 100644 --- a/drivers/crypto/hifn_795x.c +++ b/drivers/crypto/hifn_795x.c @@ -1882,7 +1882,7 @@ static void hifn_clear_rings(struct hifn_device *dev, int error) static void hifn_work(struct work_struct *work) { - struct delayed_work *dw = container_of(work, struct delayed_work, work); + struct delayed_work *dw = to_delayed_work(work); struct hifn_device *dev = container_of(dw, struct hifn_device, work); unsigned long flags; int reset = 0; diff --git a/drivers/input/mouse/hgpk.c b/drivers/input/mouse/hgpk.c index 81e6ebf323e9..55cd0fa68339 100644 --- a/drivers/input/mouse/hgpk.c +++ b/drivers/input/mouse/hgpk.c @@ -381,7 +381,7 @@ static void hgpk_disconnect(struct psmouse *psmouse) static void hgpk_recalib_work(struct work_struct *work) { - struct delayed_work *w = container_of(work, struct delayed_work, work); + struct delayed_work *w = to_delayed_work(work); struct hgpk_data *priv = container_of(w, struct hgpk_data, recalib_wq); struct psmouse *psmouse = priv->psmouse; diff --git a/drivers/net/dm9000.c b/drivers/net/dm9000.c index 254ec62b5f58..d8350860c0f8 100644 --- a/drivers/net/dm9000.c +++ b/drivers/net/dm9000.c @@ -559,7 +559,7 @@ static void dm9000_show_carrier(board_info_t *db, static void dm9000_poll_work(struct work_struct *w) { - struct delayed_work *dw = container_of(w, struct delayed_work, work); + struct delayed_work *dw = to_delayed_work(w); board_info_t *db = container_of(dw, board_info_t, phy_poll); struct net_device *ndev = db->ndev; diff --git a/drivers/net/mlx4/en_netdev.c b/drivers/net/mlx4/en_netdev.c index 9f6644a44030..303c23de6cac 100644 --- a/drivers/net/mlx4/en_netdev.c +++ b/drivers/net/mlx4/en_netdev.c @@ -505,7 +505,7 @@ out: static void mlx4_en_do_get_stats(struct work_struct *work) { - struct delayed_work *delay = container_of(work, struct delayed_work, work); + struct delayed_work *delay = to_delayed_work(work); struct mlx4_en_priv *priv = container_of(delay, struct mlx4_en_priv, stats_task); struct mlx4_en_dev *mdev = priv->mdev; diff --git a/drivers/net/mlx4/en_rx.c b/drivers/net/mlx4/en_rx.c index a4130e764991..7e40741fb7d8 100644 --- a/drivers/net/mlx4/en_rx.c +++ b/drivers/net/mlx4/en_rx.c @@ -298,7 +298,7 @@ 
static void mlx4_en_free_rx_buf(struct mlx4_en_priv *priv, void mlx4_en_rx_refill(struct work_struct *work) { - struct delayed_work *delay = container_of(work, struct delayed_work, work); + struct delayed_work *delay = to_delayed_work(work); struct mlx4_en_priv *priv = container_of(delay, struct mlx4_en_priv, refill_task); struct mlx4_en_dev *mdev = priv->mdev; diff --git a/drivers/net/mlx4/sense.c b/drivers/net/mlx4/sense.c index 6d5089ecb5af..f36ae691cab3 100644 --- a/drivers/net/mlx4/sense.c +++ b/drivers/net/mlx4/sense.c @@ -103,7 +103,7 @@ void mlx4_do_sense_ports(struct mlx4_dev *dev, static void mlx4_sense_port(struct work_struct *work) { - struct delayed_work *delay = container_of(work, struct delayed_work, work); + struct delayed_work *delay = to_delayed_work(work); struct mlx4_sense *sense = container_of(delay, struct mlx4_sense, sense_poll); struct mlx4_dev *dev = sense->dev; diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 58b73b08dde0..3ff1f425f1bb 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -757,8 +757,7 @@ EXPORT_SYMBOL(phy_start); */ static void phy_state_machine(struct work_struct *work) { - struct delayed_work *dwork = - container_of(work, struct delayed_work, work); + struct delayed_work *dwork = to_delayed_work(work); struct phy_device *phydev = container_of(dwork, struct phy_device, state_queue); int needs_aneg = 0; diff --git a/drivers/s390/scsi/zfcp_fc.c b/drivers/s390/scsi/zfcp_fc.c index aab8123c5966..e8d032b9dfbd 100644 --- a/drivers/s390/scsi/zfcp_fc.c +++ b/drivers/s390/scsi/zfcp_fc.c @@ -94,7 +94,7 @@ static int zfcp_wka_port_get(struct zfcp_wka_port *wka_port) static void zfcp_wka_port_offline(struct work_struct *work) { - struct delayed_work *dw = container_of(work, struct delayed_work, work); + struct delayed_work *dw = to_delayed_work(work); struct zfcp_wka_port *wka_port = container_of(dw, struct zfcp_wka_port, work); diff --git a/drivers/staging/rtl8187se/ieee80211/ieee80211_softmac.c b/drivers/staging/rtl8187se/ieee80211/ieee80211_softmac.c index e5752f615e09..80f9cc7137c2 100644 --- a/drivers/staging/rtl8187se/ieee80211/ieee80211_softmac.c +++ b/drivers/staging/rtl8187se/ieee80211/ieee80211_softmac.c @@ -719,7 +719,7 @@ void ieee80211_softmac_scan(struct ieee80211_device *ieee) #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)) void ieee80211_softmac_scan_wq(struct work_struct *work) { - struct delayed_work *dwork = container_of(work, struct delayed_work, work); + struct delayed_work *dwork = to_delayed_work(work); struct ieee80211_device *ieee = container_of(dwork, struct ieee80211_device, softmac_scan_wq); #else void ieee80211_softmac_scan_wq(struct ieee80211_device *ieee) @@ -777,7 +777,7 @@ out: #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)) void ieee80211_softmac_scan_wq(struct work_struct *work) { - struct delayed_work *dwork = container_of(work, struct delayed_work, work); + struct delayed_work *dwork = to_delayed_work(work); struct ieee80211_device *ieee = container_of(work, struct ieee80211_device, softmac_scan_wq); #else void ieee80211_softmac_scan_wq(struct ieee80211_device *ieee) @@ -2980,7 +2980,7 @@ void ieee80211_start_monitor_mode(struct ieee80211_device *ieee) #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)) void ieee80211_start_ibss_wq(struct work_struct *work) { - struct delayed_work *dwork = container_of(work, struct delayed_work, work); + struct delayed_work *dwork = to_delayed_work(work); struct ieee80211_device *ieee = container_of(dwork, struct ieee80211_device, start_ibss_wq); #else 
void ieee80211_start_ibss_wq(struct ieee80211_device *ieee) @@ -3162,7 +3162,7 @@ void ieee80211_disassociate(struct ieee80211_device *ieee) #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)) void ieee80211_associate_retry_wq(struct work_struct *work) { - struct delayed_work *dwork = container_of(work, struct delayed_work, work); + struct delayed_work *dwork = to_delayed_work(work); struct ieee80211_device *ieee = container_of(dwork, struct ieee80211_device, associate_retry_wq); #else void ieee80211_associate_retry_wq(struct ieee80211_device *ieee) diff --git a/drivers/staging/rtl8187se/r8180_core.c b/drivers/staging/rtl8187se/r8180_core.c index 66de5cc8ddf1..ff1f23f99f27 100644 --- a/drivers/staging/rtl8187se/r8180_core.c +++ b/drivers/staging/rtl8187se/r8180_core.c @@ -5438,7 +5438,7 @@ void rtl8180_hw_wakeup_wq (struct work_struct *work) // struct r8180_priv *priv = container_of(work, struct r8180_priv, watch_dog_wq); // struct ieee80211_device * ieee = (struct ieee80211_device*) // container_of(work, struct ieee80211_device, watch_dog_wq); - struct delayed_work *dwork = container_of(work,struct delayed_work,work); + struct delayed_work *dwork = to_delayed_work(work); struct ieee80211_device *ieee = container_of(dwork,struct ieee80211_device,hw_wakeup_wq); struct net_device *dev = ieee->dev; #else @@ -5459,7 +5459,7 @@ void rtl8180_hw_sleep_wq (struct work_struct *work) // struct r8180_priv *priv = container_of(work, struct r8180_priv, watch_dog_wq); // struct ieee80211_device * ieee = (struct ieee80211_device*) // container_of(work, struct ieee80211_device, watch_dog_wq); - struct delayed_work *dwork = container_of(work,struct delayed_work,work); + struct delayed_work *dwork = to_delayed_work(work); struct ieee80211_device *ieee = container_of(dwork,struct ieee80211_device,hw_sleep_wq); struct net_device *dev = ieee->dev; #else @@ -6407,7 +6407,7 @@ priv->txnpring)/8); void rtl8180_tx_irq_wq(struct work_struct *work) { //struct r8180_priv *priv = container_of(work, struct r8180_priv, reset_wq); - struct delayed_work *dwork = container_of(work,struct delayed_work,work); + struct delayed_work *dwork = to_delayed_work(work); struct ieee80211_device * ieee = (struct ieee80211_device*) container_of(dwork, struct ieee80211_device, watch_dog_wq); struct net_device *dev = ieee->dev; @@ -6691,7 +6691,7 @@ lizhaoming--------------------------- RF power on/power off ----------------- #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)) void GPIOChangeRFWorkItemCallBack(struct work_struct *work) { - //struct delayed_work *dwork = container_of(work, struct delayed_work, work); + //struct delayed_work *dwork = to_delayed_work(work); struct ieee80211_device *ieee = container_of(work, struct ieee80211_device, GPIOChangeRFWorkItem.work); struct net_device *dev = ieee->dev; struct r8180_priv *priv = ieee80211_priv(dev); diff --git a/drivers/usb/wusbcore/devconnect.c b/drivers/usb/wusbcore/devconnect.c index f0aac0cf315a..386eaa22d215 100644 --- a/drivers/usb/wusbcore/devconnect.c +++ b/drivers/usb/wusbcore/devconnect.c @@ -471,7 +471,7 @@ static void __wusbhc_keep_alive(struct wusbhc *wusbhc) */ static void wusbhc_keep_alive_run(struct work_struct *ws) { - struct delayed_work *dw = container_of(ws, struct delayed_work, work); + struct delayed_work *dw = to_delayed_work(ws); struct wusbhc *wusbhc = container_of(dw, struct wusbhc, keep_alive_timer); mutex_lock(&wusbhc->mutex); diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 3cd51e579ab1..13e1adf55c4c 100644 --- 
a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -41,6 +41,11 @@ struct delayed_work { struct timer_list timer; }; +static inline struct delayed_work *to_delayed_work(struct work_struct *work) +{ + return container_of(work, struct delayed_work, work); +} + struct execute_work { struct work_struct work; }; diff --git a/mm/slab.c b/mm/slab.c index 825c606f691d..208323fd37bc 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3992,8 +3992,7 @@ static void cache_reap(struct work_struct *w) struct kmem_cache *searchp; struct kmem_list3 *l3; int node = numa_node_id(); - struct delayed_work *work = - container_of(w, struct delayed_work, work); + struct delayed_work *work = to_delayed_work(w); if (!mutex_trylock(&cache_chain_mutex)) /* Give up. Setup the next iteration. */ -- cgit v1.2.3 From 06c421ee0d5af95c8c6749ca0ba620cd5010707f Mon Sep 17 00:00:00 2001 From: Kevin Hilman Date: Thu, 2 Apr 2009 16:56:56 -0700 Subject: memory_accessor: new interface for reading/writing persistent memory Add an interface by which other kernel code can read/write persistent memory such as I2C or SPI EEPROMs, or devices which provide NVRAM. Use cases include storage of board-specific configuration data like Ethernet addresses and sensor calibrations. Original idea, review and improvement suggestions by David Brownell. Acked-by: David Brownell Signed-off-by: Kevin Hilman Cc: Jean Delvare Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memory.h b/include/linux/memory.h index 3fdc10806d31..42767d1a62e7 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -99,4 +99,15 @@ enum mem_add_context { BOOT, HOTPLUG }; #define hotplug_memory_notifier(fn, pri) do { } while (0) #endif +/* + * 'struct memory_accessor' is a generic interface to provide + * in-kernel access to persistent memory such as i2c or SPI EEPROMs + */ +struct memory_accessor { + ssize_t (*read)(struct memory_accessor *, char *buf, off_t offset, + size_t count); + ssize_t (*write)(struct memory_accessor *, const char *buf, + off_t offset, size_t count); +}; + #endif /* _LINUX_MEMORY_H_ */ -- cgit v1.2.3 From 7274ec8bd71e99018642f474528ea7de4bb3ae25 Mon Sep 17 00:00:00 2001 From: Kevin Hilman Date: Thu, 2 Apr 2009 16:56:57 -0700 Subject: memory_accessor: implement the new memory_accessor interface for I2C EEPROM In the case of at24, the platform code registers a 'setup' callback with the at24_platform_data. When the at24 driver detects an EEPROM, it fills out the read and write functions of the memory_accessor and calls the setup callback passing the memory_accessor struct. The platform code can then use the read/write functions in the memory_accessor struct for reading and writing the EEPROM. 
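For illustration, board support code might consume the new at24 hook roughly as sketched below. This is not part of the patch: the "24c02" device, the 0x50 address, the EEPROM geometry and the MAC offset are assumptions made up for the example.

	#include <linux/i2c.h>
	#include <linux/i2c/at24.h>
	#include <linux/memory.h>

	static struct memory_accessor *board_eeprom_macc;

	/* Called by the at24 driver once the EEPROM has been probed. */
	static void board_at24_setup(struct memory_accessor *macc, void *context)
	{
		char mac[6];

		board_eeprom_macc = macc;	/* keep the accessor for later use */

		/* e.g. fetch the Ethernet address stored at offset 0 (assumed layout) */
		if (macc->read(macc, mac, 0, sizeof(mac)) != sizeof(mac))
			pr_warning("board: could not read MAC from EEPROM\n");
	}

	static struct at24_platform_data board_eeprom_pdata = {
		.byte_len	= 256,		/* assumed 24c02 part */
		.page_size	= 8,
		.setup		= board_at24_setup,
		.context	= NULL,
	};

	static struct i2c_board_info board_i2c_devices[] __initdata = {
		{
			I2C_BOARD_INFO("24c02", 0x50),
			.platform_data	= &board_eeprom_pdata,
		},
	};

The accessor is only handed out after a successful probe (the driver invokes the setup hook last), so board code must not use it before then.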
Signed-off-by: Kevin Hilman Cc: David Brownell Cc: Jean Delvare Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/misc/eeprom/at24.c | 67 +++++++++++++++++++++++++++++++++++++--------- include/linux/i2c/at24.h | 4 +++ 2 files changed, 58 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/misc/eeprom/at24.c b/drivers/misc/eeprom/at24.c index d4775528abc6..d184dfab9631 100644 --- a/drivers/misc/eeprom/at24.c +++ b/drivers/misc/eeprom/at24.c @@ -53,6 +53,7 @@ struct at24_data { struct at24_platform_data chip; + struct memory_accessor macc; bool use_smbus; /* @@ -225,14 +226,11 @@ static ssize_t at24_eeprom_read(struct at24_data *at24, char *buf, return status; } -static ssize_t at24_bin_read(struct kobject *kobj, struct bin_attribute *attr, +static ssize_t at24_read(struct at24_data *at24, char *buf, loff_t off, size_t count) { - struct at24_data *at24; ssize_t retval = 0; - at24 = dev_get_drvdata(container_of(kobj, struct device, kobj)); - if (unlikely(!count)) return count; @@ -262,12 +260,14 @@ static ssize_t at24_bin_read(struct kobject *kobj, struct bin_attribute *attr, return retval; } +static ssize_t at24_bin_read(struct kobject *kobj, struct bin_attribute *attr, + char *buf, loff_t off, size_t count) +{ + struct at24_data *at24; -/* - * REVISIT: export at24_bin{read,write}() to let other kernel code use - * eeprom data. For example, it might hold a board's Ethernet address, or - * board-specific calibration data generated on the manufacturing floor. - */ + at24 = dev_get_drvdata(container_of(kobj, struct device, kobj)); + return at24_read(at24, buf, off, count); +} /* @@ -347,14 +347,11 @@ static ssize_t at24_eeprom_write(struct at24_data *at24, char *buf, return -ETIMEDOUT; } -static ssize_t at24_bin_write(struct kobject *kobj, struct bin_attribute *attr, +static ssize_t at24_write(struct at24_data *at24, char *buf, loff_t off, size_t count) { - struct at24_data *at24; ssize_t retval = 0; - at24 = dev_get_drvdata(container_of(kobj, struct device, kobj)); - if (unlikely(!count)) return count; @@ -384,6 +381,39 @@ static ssize_t at24_bin_write(struct kobject *kobj, struct bin_attribute *attr, return retval; } +static ssize_t at24_bin_write(struct kobject *kobj, struct bin_attribute *attr, + char *buf, loff_t off, size_t count) +{ + struct at24_data *at24; + + at24 = dev_get_drvdata(container_of(kobj, struct device, kobj)); + return at24_write(at24, buf, off, count); +} + +/*-------------------------------------------------------------------------*/ + +/* + * This lets other kernel code access the eeprom data. For example, it + * might hold a board's Ethernet address, or board-specific calibration + * data generated on the manufacturing floor. + */ + +static ssize_t at24_macc_read(struct memory_accessor *macc, char *buf, + off_t offset, size_t count) +{ + struct at24_data *at24 = container_of(macc, struct at24_data, macc); + + return at24_read(at24, buf, offset, count); +} + +static ssize_t at24_macc_write(struct memory_accessor *macc, char *buf, + off_t offset, size_t count) +{ + struct at24_data *at24 = container_of(macc, struct at24_data, macc); + + return at24_write(at24, buf, offset, count); +} + /*-------------------------------------------------------------------------*/ static int at24_probe(struct i2c_client *client, const struct i2c_device_id *id) @@ -413,6 +443,9 @@ static int at24_probe(struct i2c_client *client, const struct i2c_device_id *id) * is recommended anyhow. 
*/ chip.page_size = 1; + + chip.setup = NULL; + chip.context = NULL; } if (!is_power_of_2(chip.byte_len)) @@ -463,6 +496,8 @@ static int at24_probe(struct i2c_client *client, const struct i2c_device_id *id) at24->bin.read = at24_bin_read; at24->bin.size = chip.byte_len; + at24->macc.read = at24_macc_read; + writable = !(chip.flags & AT24_FLAG_READONLY); if (writable) { if (!use_smbus || i2c_check_functionality(client->adapter, @@ -470,6 +505,8 @@ static int at24_probe(struct i2c_client *client, const struct i2c_device_id *id) unsigned write_max = chip.page_size; + at24->macc.write = at24_macc_write; + at24->bin.write = at24_bin_write; at24->bin.attr.mode |= S_IWUSR; @@ -520,6 +557,10 @@ static int at24_probe(struct i2c_client *client, const struct i2c_device_id *id) at24->write_max, use_smbus ? ", use_smbus" : ""); + /* export data to kernel code */ + if (chip.setup) + chip.setup(&at24->macc, chip.context); + return 0; err_clients: diff --git a/include/linux/i2c/at24.h b/include/linux/i2c/at24.h index f6edd522a929..8ace93024d60 100644 --- a/include/linux/i2c/at24.h +++ b/include/linux/i2c/at24.h @@ -2,6 +2,7 @@ #define _LINUX_AT24_H #include +#include /* * As seen through Linux I2C, differences between the most common types of I2C @@ -23,6 +24,9 @@ struct at24_platform_data { #define AT24_FLAG_READONLY 0x40 /* sysfs-entry will be read-only */ #define AT24_FLAG_IRUGO 0x20 /* sysfs-entry will be world-readable */ #define AT24_FLAG_TAKE8ADDR 0x10 /* take always 8 addresses (24c00) */ + + void (*setup)(struct memory_accessor *, void *context); + void *context; }; #endif /* _LINUX_AT24_H */ -- cgit v1.2.3 From 14dd1ff0f9e75dd4ae2f1ff8e48becb76d14f4ab Mon Sep 17 00:00:00 2001 From: David Brownell Date: Thu, 2 Apr 2009 16:56:58 -0700 Subject: memory_accessor: implement the new memory_accessor interfaces for SPI EEPROMs - Define new setup() hook to export the accessor - Implement accessor methods Moves some error checking out of the sysfs interface code into the layer below it, which is now shared by both sysfs and memory access code. 
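A comparable sketch for the SPI side follows; the bus number, chip select, EEPROM size and the 0x20 offset are invented for the example, not taken from the patch.

	#include <linux/spi/spi.h>
	#include <linux/spi/eeprom.h>
	#include <linux/memory.h>

	/* Called by the at25 driver after the EEPROM has been registered. */
	static void board_at25_setup(struct memory_accessor *mem, void *context)
	{
		char serial[16];

		/* e.g. read a board serial number assumed to live at offset 0x20 */
		if (mem->read(mem, serial, 0x20, sizeof(serial)) < 0)
			pr_warning("board: could not read serial from EEPROM\n");
	}

	static struct spi_eeprom board_at25_chip = {
		.name		= "at25640",	/* assumed part */
		.byte_len	= 8 * 1024,
		.page_size	= 32,
		.flags		= EE_ADDR2,
		.setup		= board_at25_setup,
		.context	= NULL,
	};

	static struct spi_board_info board_spi_devices[] __initdata = {
		{
			.modalias	= "at25",
			.platform_data	= &board_at25_chip,
			.max_speed_hz	= 1000000,
			.bus_num	= 1,
			.chip_select	= 0,
		},
	};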
Signed-off-by: David Brownell Signed-off-by: Kevin Hilman Cc: Jean Delvare Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/misc/eeprom/at25.c | 58 +++++++++++++++++++++++++++++++++++----------- include/linux/spi/eeprom.h | 6 +++++ 2 files changed, 50 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/misc/eeprom/at25.c b/drivers/misc/eeprom/at25.c index 290dbe99647a..6bc0dac5c1e8 100644 --- a/drivers/misc/eeprom/at25.c +++ b/drivers/misc/eeprom/at25.c @@ -30,6 +30,7 @@ struct at25_data { struct spi_device *spi; + struct memory_accessor mem; struct mutex lock; struct spi_eeprom chip; struct bin_attribute bin; @@ -75,6 +76,13 @@ at25_ee_read( struct spi_transfer t[2]; struct spi_message m; + if (unlikely(offset >= at25->bin.size)) + return 0; + if ((offset + count) > at25->bin.size) + count = at25->bin.size - offset; + if (unlikely(!count)) + return count; + cp = command; *cp++ = AT25_READ; @@ -127,13 +135,6 @@ at25_bin_read(struct kobject *kobj, struct bin_attribute *bin_attr, dev = container_of(kobj, struct device, kobj); at25 = dev_get_drvdata(dev); - if (unlikely(off >= at25->bin.size)) - return 0; - if ((off + count) > at25->bin.size) - count = at25->bin.size - off; - if (unlikely(!count)) - return count; - return at25_ee_read(at25, buf, off, count); } @@ -146,6 +147,13 @@ at25_ee_write(struct at25_data *at25, char *buf, loff_t off, size_t count) unsigned buf_size; u8 *bounce; + if (unlikely(off >= at25->bin.size)) + return -EFBIG; + if ((off + count) > at25->bin.size) + count = at25->bin.size - off; + if (unlikely(!count)) + return count; + /* Temp buffer starts with command and address */ buf_size = at25->chip.page_size; if (buf_size > io_limit) @@ -253,18 +261,31 @@ at25_bin_write(struct kobject *kobj, struct bin_attribute *bin_attr, dev = container_of(kobj, struct device, kobj); at25 = dev_get_drvdata(dev); - if (unlikely(off >= at25->bin.size)) - return -EFBIG; - if ((off + count) > at25->bin.size) - count = at25->bin.size - off; - if (unlikely(!count)) - return count; - return at25_ee_write(at25, buf, off, count); } /*-------------------------------------------------------------------------*/ +/* Let in-kernel code access the eeprom data. */ + +static ssize_t at25_mem_read(struct memory_accessor *mem, char *buf, + off_t offset, size_t count) +{ + struct at25_data *at25 = container_of(mem, struct at25_data, mem); + + return at25_ee_read(at25, buf, offset, count); +} + +static ssize_t at25_mem_write(struct memory_accessor *mem, char *buf, + off_t offset, size_t count) +{ + struct at25_data *at25 = container_of(mem, struct at25_data, mem); + + return at25_ee_write(at25, buf, offset, count); +} + +/*-------------------------------------------------------------------------*/ + static int at25_probe(struct spi_device *spi) { struct at25_data *at25 = NULL; @@ -317,6 +338,10 @@ static int at25_probe(struct spi_device *spi) at25->addrlen = addrlen; /* Export the EEPROM bytes through sysfs, since that's convenient. + * And maybe to other kernel code; it might hold a board's Ethernet + * address, or board-specific calibration data generated on the + * manufacturing floor. + * * Default to root-only access to the data; EEPROMs often hold data * that's sensitive for read and/or write, like ethernet addresses, * security codes, board-specific manufacturing calibrations, etc. 
@@ -324,17 +349,22 @@ static int at25_probe(struct spi_device *spi) at25->bin.attr.name = "eeprom"; at25->bin.attr.mode = S_IRUSR; at25->bin.read = at25_bin_read; + at25->mem.read = at25_mem_read; at25->bin.size = at25->chip.byte_len; if (!(chip->flags & EE_READONLY)) { at25->bin.write = at25_bin_write; at25->bin.attr.mode |= S_IWUSR; + at25->mem.write = at25_mem_write; } err = sysfs_create_bin_file(&spi->dev.kobj, &at25->bin); if (err) goto fail; + if (chip->setup) + chip->setup(&at25->mem, chip->context); + dev_info(&spi->dev, "%Zd %s %s eeprom%s, pagesize %u\n", (at25->bin.size < 1024) ? at25->bin.size diff --git a/include/linux/spi/eeprom.h b/include/linux/spi/eeprom.h index 1085212c446e..306e7b1c69ed 100644 --- a/include/linux/spi/eeprom.h +++ b/include/linux/spi/eeprom.h @@ -1,6 +1,8 @@ #ifndef __LINUX_SPI_EEPROM_H #define __LINUX_SPI_EEPROM_H +#include + /* * Put one of these structures in platform_data for SPI EEPROMS handled * by the "at25" driver. On SPI, most EEPROMS understand the same core @@ -17,6 +19,10 @@ struct spi_eeprom { #define EE_ADDR2 0x0002 /* 16 bit addrs */ #define EE_ADDR3 0x0004 /* 24 bit addrs */ #define EE_READONLY 0x0008 /* disallow writes */ + + /* for exporting this chip's data to other kernel code */ + void (*setup)(struct memory_accessor *mem, void *context); + void *context; }; #endif /* __LINUX_SPI_EEPROM_H */ -- cgit v1.2.3 From 6f2c55b843836d26528c56a0968689accaedbc67 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 2 Apr 2009 16:56:59 -0700 Subject: Simplify copy_thread() First argument unused since 2.3.11. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Alexey Dobriyan Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/kernel/process.c | 2 +- arch/arm/kernel/process.c | 2 +- arch/avr32/kernel/process.c | 2 +- arch/blackfin/kernel/process.c | 2 +- arch/cris/arch-v10/kernel/process.c | 2 +- arch/cris/arch-v32/kernel/process.c | 2 +- arch/frv/kernel/process.c | 2 +- arch/h8300/kernel/process.c | 2 +- arch/ia64/kernel/process.c | 2 +- arch/m32r/kernel/process.c | 2 +- arch/m68k/kernel/process.c | 2 +- arch/m68knommu/kernel/process.c | 2 +- arch/mips/kernel/process.c | 2 +- arch/mn10300/kernel/process.c | 2 +- arch/parisc/kernel/process.c | 2 +- arch/powerpc/kernel/process.c | 2 +- arch/s390/kernel/process.c | 2 +- arch/sh/kernel/process_32.c | 2 +- arch/sh/kernel/process_64.c | 2 +- arch/sparc/kernel/process_32.c | 2 +- arch/sparc/kernel/process_64.c | 2 +- arch/um/kernel/process.c | 2 +- arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 2 +- arch/xtensa/kernel/process.c | 2 +- include/linux/sched.h | 3 ++- kernel/fork.c | 2 +- 27 files changed, 28 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index 8d0097f10208..3a2fb7a02db4 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -272,7 +272,7 @@ alpha_vfork(struct pt_regs *regs) */ int -copy_thread(int nr, unsigned long clone_flags, unsigned long usp, +copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long unused, struct task_struct * p, struct pt_regs * regs) { diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 2de14e2afdc5..c3265a2e7cd4 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -301,7 +301,7 @@ void release_thread(struct task_struct *dead_task) asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); int -copy_thread(int nr, unsigned long clone_flags, 
unsigned long stack_start, +copy_thread(unsigned long clone_flags, unsigned long stack_start, unsigned long stk_sz, struct task_struct *p, struct pt_regs *regs) { struct thread_info *thread = task_thread_info(p); diff --git a/arch/avr32/kernel/process.c b/arch/avr32/kernel/process.c index 43ae555ecb33..1bbe1da54869 100644 --- a/arch/avr32/kernel/process.c +++ b/arch/avr32/kernel/process.c @@ -332,7 +332,7 @@ int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fpu) asmlinkage void ret_from_fork(void); -int copy_thread(int nr, unsigned long clone_flags, unsigned long usp, +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/blackfin/kernel/process.c b/arch/blackfin/kernel/process.c index 33e2e8993f7f..f49427293ca1 100644 --- a/arch/blackfin/kernel/process.c +++ b/arch/blackfin/kernel/process.c @@ -193,7 +193,7 @@ asmlinkage int bfin_clone(struct pt_regs *regs) } int -copy_thread(int nr, unsigned long clone_flags, +copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long topstk, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/cris/arch-v10/kernel/process.c b/arch/cris/arch-v10/kernel/process.c index bd9b3ff63f6c..c4c69cf721e5 100644 --- a/arch/cris/arch-v10/kernel/process.c +++ b/arch/cris/arch-v10/kernel/process.c @@ -115,7 +115,7 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) */ asmlinkage void ret_from_fork(void); -int copy_thread(int nr, unsigned long clone_flags, unsigned long usp, +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/cris/arch-v32/kernel/process.c b/arch/cris/arch-v32/kernel/process.c index ced5b725d9bd..120e7f796fea 100644 --- a/arch/cris/arch-v32/kernel/process.c +++ b/arch/cris/arch-v32/kernel/process.c @@ -131,7 +131,7 @@ kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) extern asmlinkage void ret_from_fork(void); int -copy_thread(int nr, unsigned long clone_flags, unsigned long usp, +copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/frv/kernel/process.c b/arch/frv/kernel/process.c index 9583a338e9d6..0de50df74970 100644 --- a/arch/frv/kernel/process.c +++ b/arch/frv/kernel/process.c @@ -204,7 +204,7 @@ void prepare_to_copy(struct task_struct *tsk) /* * set up the kernel stack and exception frames for a new process */ -int copy_thread(int nr, unsigned long clone_flags, +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long topstk, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/h8300/kernel/process.c b/arch/h8300/kernel/process.c index a8ef654a5a0b..e2f33d0f9969 100644 --- a/arch/h8300/kernel/process.c +++ b/arch/h8300/kernel/process.c @@ -191,7 +191,7 @@ asmlinkage int h8300_clone(struct pt_regs *regs) } -int copy_thread(int nr, unsigned long clone_flags, +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long topstk, struct task_struct * p, struct pt_regs * regs) { diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index c57162705147..5d7c0e5b9e76 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -413,7 +413,7 @@ ia64_load_extra (struct task_struct *task) * so there is nothing to worry about. 
*/ int -copy_thread (int nr, unsigned long clone_flags, +copy_thread(unsigned long clone_flags, unsigned long user_stack_base, unsigned long user_stack_size, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/m32r/kernel/process.c b/arch/m32r/kernel/process.c index 7103d91e1a2f..3e876f0baebc 100644 --- a/arch/m32r/kernel/process.c +++ b/arch/m32r/kernel/process.c @@ -225,7 +225,7 @@ int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fpu) return 0; /* Task didn't use the fpu at all. */ } -int copy_thread(int nr, unsigned long clone_flags, unsigned long spu, +int copy_thread(unsigned long clone_flags, unsigned long spu, unsigned long unused, struct task_struct *tsk, struct pt_regs *regs) { struct pt_regs *childregs = task_pt_regs(tsk); diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c index 632ce016014d..ec37fb56c127 100644 --- a/arch/m68k/kernel/process.c +++ b/arch/m68k/kernel/process.c @@ -233,7 +233,7 @@ asmlinkage int m68k_clone(struct pt_regs *regs) parent_tidptr, child_tidptr); } -int copy_thread(int nr, unsigned long clone_flags, unsigned long usp, +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long unused, struct task_struct * p, struct pt_regs * regs) { diff --git a/arch/m68knommu/kernel/process.c b/arch/m68knommu/kernel/process.c index 3f2d7745f31e..1e96c6eb6312 100644 --- a/arch/m68knommu/kernel/process.c +++ b/arch/m68knommu/kernel/process.c @@ -199,7 +199,7 @@ asmlinkage int m68k_clone(struct pt_regs *regs) return do_fork(clone_flags, newsp, regs, 0, NULL, NULL); } -int copy_thread(int nr, unsigned long clone_flags, +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long topstk, struct task_struct * p, struct pt_regs * regs) { diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index ca2e4026ad20..1eaaa450e20c 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -99,7 +99,7 @@ void flush_thread(void) { } -int copy_thread(int nr, unsigned long clone_flags, unsigned long usp, +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { struct thread_info *ti = task_thread_info(p); diff --git a/arch/mn10300/kernel/process.c b/arch/mn10300/kernel/process.c index b28c9a60445b..234cf344cdce 100644 --- a/arch/mn10300/kernel/process.c +++ b/arch/mn10300/kernel/process.c @@ -193,7 +193,7 @@ void prepare_to_copy(struct task_struct *tsk) * set up the kernel stack for a new thread and copy arch-specific thread * control information */ -int copy_thread(int nr, unsigned long clone_flags, +int copy_thread(unsigned long clone_flags, unsigned long c_usp, unsigned long ustk_size, struct task_struct *p, struct pt_regs *kregs) { diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index b80e02a4d81d..8aa591ed9127 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -263,7 +263,7 @@ sys_vfork(struct pt_regs *regs) } int -copy_thread(int nr, unsigned long clone_flags, unsigned long usp, +copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long unused, /* in ia64 this is "user_stack_size" */ struct task_struct * p, struct pt_regs * pregs) { diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index eac064948780..7b44a33f03c2 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -598,7 +598,7 @@ void prepare_to_copy(struct task_struct *tsk) /* * Copy a thread.. 
*/ -int copy_thread(int nr, unsigned long clone_flags, unsigned long usp, +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index b48e961a38f6..a3acd8e60aff 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -160,7 +160,7 @@ void release_thread(struct task_struct *dead_task) { } -int copy_thread(int nr, unsigned long clone_flags, unsigned long new_stackp, +int copy_thread(unsigned long clone_flags, unsigned long new_stackp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c index ddafbbbab2ab..694bc15f84fd 100644 --- a/arch/sh/kernel/process_32.c +++ b/arch/sh/kernel/process_32.c @@ -170,7 +170,7 @@ int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fpu) asmlinkage void ret_from_fork(void); -int copy_thread(int nr, unsigned long clone_flags, unsigned long usp, +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/sh/kernel/process_64.c b/arch/sh/kernel/process_64.c index c90c7e5e5fee..96be839040f8 100644 --- a/arch/sh/kernel/process_64.c +++ b/arch/sh/kernel/process_64.c @@ -425,7 +425,7 @@ int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fpu) asmlinkage void ret_from_fork(void); -int copy_thread(int nr, unsigned long clone_flags, unsigned long usp, +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c index f4bee35a1b46..2830b415e214 100644 --- a/arch/sparc/kernel/process_32.c +++ b/arch/sparc/kernel/process_32.c @@ -455,7 +455,7 @@ asmlinkage int sparc_do_fork(unsigned long clone_flags, */ extern void ret_from_fork(void); -int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, +int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index a73954b87f0a..4041f94e7724 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -561,7 +561,7 @@ asmlinkage long sparc_do_fork(unsigned long clone_flags, * Parent --> %o0 == childs pid, %o1 == 0 * Child --> %o0 == parents pid, %o1 == 1 */ -int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, +int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index a1c6d07cac3e..4a28a1568d85 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -179,7 +179,7 @@ void fork_handler(void) { userspace(&current->thread.regs.regs); } -int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, +int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long stack_top, struct task_struct * p, struct pt_regs *regs) { diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 14014d766cad..76f8f84043a2 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -245,7 +245,7 @@ void prepare_to_copy(struct task_struct *tsk) unlazy_fpu(tsk); } -int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, +int copy_thread(unsigned long clone_flags, 
unsigned long sp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index abb7e6a7f0c6..b751a41392b1 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -278,7 +278,7 @@ void prepare_to_copy(struct task_struct *tsk) unlazy_fpu(tsk); } -int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, +int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c index 9185597eb6a0..031f36685710 100644 --- a/arch/xtensa/kernel/process.c +++ b/arch/xtensa/kernel/process.c @@ -172,7 +172,7 @@ void prepare_to_copy(struct task_struct *tsk) * childregs. */ -int copy_thread(int nr, unsigned long clone_flags, unsigned long usp, +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long unused, struct task_struct * p, struct pt_regs * regs) { diff --git a/include/linux/sched.h b/include/linux/sched.h index 481fad3a9b42..9186f8c5d5f2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1975,7 +1975,8 @@ extern void mm_release(struct task_struct *, struct mm_struct *); /* Allocate a new mm structure and copy contents from tsk->mm */ extern struct mm_struct *dup_mm(struct task_struct *tsk); -extern int copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *); +extern int copy_thread(unsigned long, unsigned long, unsigned long, + struct task_struct *, struct pt_regs *); extern void flush_thread(void); extern void exit_thread(void); diff --git a/kernel/fork.c b/kernel/fork.c index 51d1aa21483b..d7eb727eb535 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1125,7 +1125,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto bad_fork_cleanup_mm; if ((retval = copy_io(clone_flags, p))) goto bad_fork_cleanup_namespaces; - retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); + retval = copy_thread(clone_flags, stack_start, stack_size, p, regs); if (retval) goto bad_fork_cleanup_io; -- cgit v1.2.3 From 96615841e170f0108832e64a90d51b469573a472 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 2 Apr 2009 16:57:01 -0700 Subject: rtc-v3020: add ability to access v3020 chip with GPIOs The v3020 RTC can be connected to GPIOs as well as to memory-like interface. Add ability to use GPIO bit-bang for v3020 read-write access. 
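A minimal board-side sketch of the new GPIO mode might look like this; the GPIO numbers are placeholders and the "v3020" platform device name is assumed from the existing driver, not introduced by this patch.

	#include <linux/platform_device.h>
	#include <linux/rtc-v3020.h>

	/* Wire the v3020 to four GPIO lines instead of a memory-mapped window. */
	static struct v3020_platform_data board_v3020_pdata = {
		.use_gpio	= 1,
		.gpio_cs	= 10,	/* placeholder GPIO numbers */
		.gpio_wr	= 11,
		.gpio_rd	= 12,
		.gpio_io	= 13,
	};

	static struct platform_device board_v3020_device = {
		.name	= "v3020",
		.id	= -1,
		.dev	= {
			.platform_data	= &board_v3020_pdata,
		},
	};

	/* registered from board init code with
	 * platform_device_register(&board_v3020_device);
	 * no memory resource is needed in GPIO mode.
	 */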
[akpm@linux-foundation.org: fix off-by-one in error path] Signed-off-by: Mike Rapoport Acked-by: Alessandro Zummo Cc: David Brownell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-v3020.c | 190 +++++++++++++++++++++++++++++++++++++++++----- include/linux/rtc-v3020.h | 6 ++ 2 files changed, 176 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/drivers/rtc/rtc-v3020.c b/drivers/rtc/rtc-v3020.c index 66955cc9c746..ad164056feb6 100644 --- a/drivers/rtc/rtc-v3020.c +++ b/drivers/rtc/rtc-v3020.c @@ -27,17 +27,162 @@ #include #include #include +#include #include #undef DEBUG +struct v3020; + +struct v3020_chip_ops { + int (*map_io)(struct v3020 *chip, struct platform_device *pdev, + struct v3020_platform_data *pdata); + void (*unmap_io)(struct v3020 *chip); + unsigned char (*read_bit)(struct v3020 *chip); + void (*write_bit)(struct v3020 *chip, unsigned char bit); +}; + +#define V3020_CS 0 +#define V3020_WR 1 +#define V3020_RD 2 +#define V3020_IO 3 + +struct v3020_gpio { + const char *name; + unsigned int gpio; +}; + struct v3020 { + /* MMIO access */ void __iomem *ioaddress; int leftshift; + + /* GPIO access */ + struct v3020_gpio *gpio; + + struct v3020_chip_ops *ops; + struct rtc_device *rtc; }; + +static int v3020_mmio_map(struct v3020 *chip, struct platform_device *pdev, + struct v3020_platform_data *pdata) +{ + if (pdev->num_resources != 1) + return -EBUSY; + + if (pdev->resource[0].flags != IORESOURCE_MEM) + return -EBUSY; + + chip->leftshift = pdata->leftshift; + chip->ioaddress = ioremap(pdev->resource[0].start, 1); + if (chip->ioaddress == NULL) + return -EBUSY; + + return 0; +} + +static void v3020_mmio_unmap(struct v3020 *chip) +{ + iounmap(chip->ioaddress); +} + +static void v3020_mmio_write_bit(struct v3020 *chip, unsigned char bit) +{ + writel(bit << chip->leftshift, chip->ioaddress); +} + +static unsigned char v3020_mmio_read_bit(struct v3020 *chip) +{ + return readl(chip->ioaddress) & (1 << chip->leftshift); +} + +static struct v3020_chip_ops v3020_mmio_ops = { + .map_io = v3020_mmio_map, + .unmap_io = v3020_mmio_unmap, + .read_bit = v3020_mmio_read_bit, + .write_bit = v3020_mmio_write_bit, +}; + +static struct v3020_gpio v3020_gpio[] = { + { "RTC CS", 0 }, + { "RTC WR", 0 }, + { "RTC RD", 0 }, + { "RTC IO", 0 }, +}; + +static int v3020_gpio_map(struct v3020 *chip, struct platform_device *pdev, + struct v3020_platform_data *pdata) +{ + int i, err; + + v3020_gpio[V3020_CS].gpio = pdata->gpio_cs; + v3020_gpio[V3020_WR].gpio = pdata->gpio_wr; + v3020_gpio[V3020_RD].gpio = pdata->gpio_rd; + v3020_gpio[V3020_IO].gpio = pdata->gpio_io; + + for (i = 0; i < ARRAY_SIZE(v3020_gpio); i++) { + err = gpio_request(v3020_gpio[i].gpio, v3020_gpio[i].name); + if (err) + goto err_request; + + gpio_direction_output(v3020_gpio[i].gpio, 1); + } + + chip->gpio = v3020_gpio; + + return 0; + +err_request: + while (--i >= 0) + gpio_free(v3020_gpio[i].gpio); + + return err; +} + +static void v3020_gpio_unmap(struct v3020 *chip) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(v3020_gpio); i++) + gpio_free(v3020_gpio[i].gpio); +} + +static void v3020_gpio_write_bit(struct v3020 *chip, unsigned char bit) +{ + gpio_direction_output(chip->gpio[V3020_IO].gpio, bit); + gpio_set_value(chip->gpio[V3020_CS].gpio, 0); + gpio_set_value(chip->gpio[V3020_WR].gpio, 0); + udelay(1); + gpio_set_value(chip->gpio[V3020_WR].gpio, 1); + gpio_set_value(chip->gpio[V3020_CS].gpio, 1); +} + +static unsigned char v3020_gpio_read_bit(struct v3020 *chip) +{ + int bit; + + 
gpio_direction_input(chip->gpio[V3020_IO].gpio); + gpio_set_value(chip->gpio[V3020_CS].gpio, 0); + gpio_set_value(chip->gpio[V3020_RD].gpio, 0); + udelay(1); + bit = !!gpio_get_value(chip->gpio[V3020_IO].gpio); + udelay(1); + gpio_set_value(chip->gpio[V3020_RD].gpio, 1); + gpio_set_value(chip->gpio[V3020_CS].gpio, 1); + + return bit; +} + +static struct v3020_chip_ops v3020_gpio_ops = { + .map_io = v3020_gpio_map, + .unmap_io = v3020_gpio_unmap, + .read_bit = v3020_gpio_read_bit, + .write_bit = v3020_gpio_write_bit, +}; + static void v3020_set_reg(struct v3020 *chip, unsigned char address, unsigned char data) { @@ -46,7 +191,7 @@ static void v3020_set_reg(struct v3020 *chip, unsigned char address, tmp = address; for (i = 0; i < 4; i++) { - writel((tmp & 1) << chip->leftshift, chip->ioaddress); + chip->ops->write_bit(chip, (tmp & 1)); tmp >>= 1; udelay(1); } @@ -54,7 +199,7 @@ static void v3020_set_reg(struct v3020 *chip, unsigned char address, /* Commands dont have data */ if (!V3020_IS_COMMAND(address)) { for (i = 0; i < 8; i++) { - writel((data & 1) << chip->leftshift, chip->ioaddress); + chip->ops->write_bit(chip, (data & 1)); data >>= 1; udelay(1); } @@ -67,14 +212,14 @@ static unsigned char v3020_get_reg(struct v3020 *chip, unsigned char address) int i; for (i = 0; i < 4; i++) { - writel((address & 1) << chip->leftshift, chip->ioaddress); + chip->ops->write_bit(chip, (address & 1)); address >>= 1; udelay(1); } for (i = 0; i < 8; i++) { data >>= 1; - if (readl(chip->ioaddress) & (1 << chip->leftshift)) + if (chip->ops->read_bit(chip)) data |= 0x80; udelay(1); } @@ -164,25 +309,23 @@ static int rtc_probe(struct platform_device *pdev) int i; int temp; - if (pdev->num_resources != 1) - return -EBUSY; - - if (pdev->resource[0].flags != IORESOURCE_MEM) - return -EBUSY; - chip = kzalloc(sizeof *chip, GFP_KERNEL); if (!chip) return -ENOMEM; - chip->leftshift = pdata->leftshift; - chip->ioaddress = ioremap(pdev->resource[0].start, 1); - if (chip->ioaddress == NULL) + if (pdata->use_gpio) + chip->ops = &v3020_gpio_ops; + else + chip->ops = &v3020_mmio_ops; + + retval = chip->ops->map_io(chip, pdev, pdata); + if (retval) goto err_chip; /* Make sure the v3020 expects a communication cycle * by reading 8 times */ for (i = 0; i < 8; i++) - temp = readl(chip->ioaddress); + temp = chip->ops->read_bit(chip); /* Test chip by doing a write/read sequence * to the chip ram */ @@ -196,10 +339,17 @@ static int rtc_probe(struct platform_device *pdev) * are all disabled */ v3020_set_reg(chip, V3020_STATUS_0, 0x0); - dev_info(&pdev->dev, "Chip available at physical address 0x%llx," - "data connected to D%d\n", - (unsigned long long)pdev->resource[0].start, - chip->leftshift); + if (pdata->use_gpio) + dev_info(&pdev->dev, "Chip available at GPIOs " + "%d, %d, %d, %d\n", + chip->gpio[V3020_CS].gpio, chip->gpio[V3020_WR].gpio, + chip->gpio[V3020_RD].gpio, chip->gpio[V3020_IO].gpio); + else + dev_info(&pdev->dev, "Chip available at " + "physical address 0x%llx," + "data connected to D%d\n", + (unsigned long long)pdev->resource[0].start, + chip->leftshift); platform_set_drvdata(pdev, chip); @@ -214,7 +364,7 @@ static int rtc_probe(struct platform_device *pdev) return 0; err_io: - iounmap(chip->ioaddress); + chip->ops->unmap_io(chip); err_chip: kfree(chip); @@ -229,7 +379,7 @@ static int rtc_remove(struct platform_device *dev) if (rtc) rtc_device_unregister(rtc); - iounmap(chip->ioaddress); + chip->ops->unmap_io(chip); kfree(chip); return 0; diff --git a/include/linux/rtc-v3020.h b/include/linux/rtc-v3020.h index 
bf74e63c98fe..8ba646e610d9 100644 --- a/include/linux/rtc-v3020.h +++ b/include/linux/rtc-v3020.h @@ -14,6 +14,12 @@ * is used depends on the board. */ struct v3020_platform_data { int leftshift; /* (1<<(leftshift)) & readl() */ + + int use_gpio:1; + unsigned int gpio_cs; + unsigned int gpio_wr; + unsigned int gpio_rd; + unsigned int gpio_io; }; #define V3020_STATUS_0 0x00 -- cgit v1.2.3 From bfb9bcdbda9a61bca469bf899a589918c60c4c18 Mon Sep 17 00:00:00 2001 From: Michael Buesch Date: Thu, 2 Apr 2009 16:57:07 -0700 Subject: spi-gpio: allow operation without CS signal Change spi-gpio so that it is possible to drive SPI communications over GPIO without the need for a chipselect signal. This is useful in very small setups where there's only one slave device on the bus. This patch does not affect existing setups. I use this for a tiny communication channel between an embedded device and a microcontroller. There are not enough GPIOs available for chipselect and it's not needed anyway in this case. Signed-off-by: Michael Buesch Cc: David Brownell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/spi/spi_gpio.c | 21 +++++++++++++-------- include/linux/spi/spi_gpio.h | 6 ++++++ 2 files changed, 19 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/spi/spi_gpio.c b/drivers/spi/spi_gpio.c index d2866c293dee..26bd03e61855 100644 --- a/drivers/spi/spi_gpio.c +++ b/drivers/spi/spi_gpio.c @@ -178,8 +178,10 @@ static void spi_gpio_chipselect(struct spi_device *spi, int is_active) if (is_active) setsck(spi, spi->mode & SPI_CPOL); - /* SPI is normally active-low */ - gpio_set_value(cs, (spi->mode & SPI_CS_HIGH) ? is_active : !is_active); + if (cs != SPI_GPIO_NO_CHIPSELECT) { + /* SPI is normally active-low */ + gpio_set_value(cs, (spi->mode & SPI_CS_HIGH) ? is_active : !is_active); + } } static int spi_gpio_setup(struct spi_device *spi) @@ -191,15 +193,17 @@ static int spi_gpio_setup(struct spi_device *spi) return -EINVAL; if (!spi->controller_state) { - status = gpio_request(cs, dev_name(&spi->dev)); - if (status) - return status; - status = gpio_direction_output(cs, spi->mode & SPI_CS_HIGH); + if (cs != SPI_GPIO_NO_CHIPSELECT) { + status = gpio_request(cs, dev_name(&spi->dev)); + if (status) + return status; + status = gpio_direction_output(cs, spi->mode & SPI_CS_HIGH); + } } if (!status) status = spi_bitbang_setup(spi); if (status) { - if (!spi->controller_state) + if (!spi->controller_state && cs != SPI_GPIO_NO_CHIPSELECT) gpio_free(cs); } return status; @@ -209,7 +213,8 @@ static void spi_gpio_cleanup(struct spi_device *spi) { unsigned long cs = (unsigned long) spi->controller_data; - gpio_free(cs); + if (cs != SPI_GPIO_NO_CHIPSELECT) + gpio_free(cs); spi_bitbang_cleanup(spi); } diff --git a/include/linux/spi/spi_gpio.h b/include/linux/spi/spi_gpio.h index 0f01a0f1f40c..ca6782ee4b9f 100644 --- a/include/linux/spi/spi_gpio.h +++ b/include/linux/spi/spi_gpio.h @@ -25,10 +25,16 @@ * ... * }; * + * If chipselect is not used (there's only one device on the bus), assign + * SPI_GPIO_NO_CHIPSELECT to the controller_data: + * .controller_data = (void *) SPI_GPIO_NO_CHIPSELECT; + * * If the bitbanged bus is later switched to a "native" controller, * that platform_device and controller_data should be removed. 
*/ +#define SPI_GPIO_NO_CHIPSELECT ((unsigned long)-1l) + /** * struct spi_gpio_platform_data - parameter for bitbanged SPI master * @sck: number of the GPIO used for clock output -- cgit v1.2.3 From 039fd8ce6258e01ec29f1637f9bf1868dd877c55 Mon Sep 17 00:00:00 2001 From: Cyrus Massoumi Date: Thu, 2 Apr 2009 16:57:12 -0700 Subject: ext3: remove the BKL in ext3/ioctl.c Reformat ext3/ioctl.c to make it look more like ext4/ioctl.c and remove the BKL around ext3_ioctl(). Signed-off-by: Cyrus Massoumi Cc: Acked-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/dir.c | 2 +- fs/ext3/file.c | 2 +- fs/ext3/ioctl.c | 59 +++++++++++++++++-------------------------------- include/linux/ext3_fs.h | 5 ++--- 4 files changed, 24 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c index 5853f4440af4..3d724a95882f 100644 --- a/fs/ext3/dir.c +++ b/fs/ext3/dir.c @@ -42,7 +42,7 @@ const struct file_operations ext3_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .readdir = ext3_readdir, /* we take BKL. needed?*/ - .ioctl = ext3_ioctl, /* BKL held */ + .unlocked_ioctl = ext3_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext3_compat_ioctl, #endif diff --git a/fs/ext3/file.c b/fs/ext3/file.c index 3be1e0689c9a..521f8238b2fa 100644 --- a/fs/ext3/file.c +++ b/fs/ext3/file.c @@ -112,7 +112,7 @@ const struct file_operations ext3_file_operations = { .write = do_sync_write, .aio_read = generic_file_aio_read, .aio_write = ext3_file_write, - .ioctl = ext3_ioctl, + .unlocked_ioctl = ext3_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext3_compat_ioctl, #endif diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c index 5e86ce9a86e0..88974814783a 100644 --- a/fs/ext3/ioctl.c +++ b/fs/ext3/ioctl.c @@ -15,12 +15,11 @@ #include #include #include -#include #include -int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, - unsigned long arg) +long ext3_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { + struct inode *inode = filp->f_dentry->d_inode; struct ext3_inode_info *ei = EXT3_I(inode); unsigned int flags; unsigned short rsv_window_size; @@ -39,29 +38,25 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, unsigned int oldflags; unsigned int jflag; + if (!is_owner_or_cap(inode)) + return -EACCES; + + if (get_user(flags, (int __user *) arg)) + return -EFAULT; + err = mnt_want_write(filp->f_path.mnt); if (err) return err; - if (!is_owner_or_cap(inode)) { - err = -EACCES; - goto flags_out; - } - - if (get_user(flags, (int __user *) arg)) { - err = -EFAULT; - goto flags_out; - } - flags = ext3_mask_flags(inode->i_mode, flags); mutex_lock(&inode->i_mutex); + /* Is it quota file? Do not allow user to mess with it */ - if (IS_NOQUOTA(inode)) { - mutex_unlock(&inode->i_mutex); - err = -EPERM; + err = -EPERM; + if (IS_NOQUOTA(inode)) goto flags_out; - } + oldflags = ei->i_flags; /* The JOURNAL_DATA flag is modifiable only by root */ @@ -74,11 +69,8 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, * This test looks nicer. Thanks to Pauline Middelink */ if ((flags ^ oldflags) & (EXT3_APPEND_FL | EXT3_IMMUTABLE_FL)) { - if (!capable(CAP_LINUX_IMMUTABLE)) { - mutex_unlock(&inode->i_mutex); - err = -EPERM; + if (!capable(CAP_LINUX_IMMUTABLE)) goto flags_out; - } } /* @@ -86,17 +78,12 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, * the relevant capability. 
*/ if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) { - if (!capable(CAP_SYS_RESOURCE)) { - mutex_unlock(&inode->i_mutex); - err = -EPERM; + if (!capable(CAP_SYS_RESOURCE)) goto flags_out; - } } - handle = ext3_journal_start(inode, 1); if (IS_ERR(handle)) { - mutex_unlock(&inode->i_mutex); err = PTR_ERR(handle); goto flags_out; } @@ -116,15 +103,13 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, err = ext3_mark_iloc_dirty(handle, inode, &iloc); flags_err: ext3_journal_stop(handle); - if (err) { - mutex_unlock(&inode->i_mutex); - return err; - } + if (err) + goto flags_out; if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) err = ext3_change_inode_journal_flag(inode, jflag); - mutex_unlock(&inode->i_mutex); flags_out: + mutex_unlock(&inode->i_mutex); mnt_drop_write(filp->f_path.mnt); return err; } @@ -140,6 +125,7 @@ flags_out: if (!is_owner_or_cap(inode)) return -EPERM; + err = mnt_want_write(filp->f_path.mnt); if (err) return err; @@ -147,6 +133,7 @@ flags_out: err = -EFAULT; goto setversion_out; } + handle = ext3_journal_start(inode, 1); if (IS_ERR(handle)) { err = PTR_ERR(handle); @@ -299,9 +286,6 @@ group_add_out: #ifdef CONFIG_COMPAT long ext3_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - struct inode *inode = file->f_path.dentry->d_inode; - int ret; - /* These are just misnamed, they actually get/put from/to user an int */ switch (cmd) { case EXT3_IOC32_GETFLAGS: @@ -341,9 +325,6 @@ long ext3_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) default: return -ENOIOCTLCMD; } - lock_kernel(); - ret = ext3_ioctl(inode, file, cmd, (unsigned long) compat_ptr(arg)); - unlock_kernel(); - return ret; + return ext3_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); } #endif diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h index dd495b8c3091..e263acaa405b 100644 --- a/include/linux/ext3_fs.h +++ b/include/linux/ext3_fs.h @@ -893,9 +893,8 @@ extern int ext3_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); /* ioctl.c */ -extern int ext3_ioctl (struct inode *, struct file *, unsigned int, - unsigned long); -extern long ext3_compat_ioctl (struct file *, unsigned int, unsigned long); +extern long ext3_ioctl(struct file *, unsigned int, unsigned long); +extern long ext3_compat_ioctl(struct file *, unsigned int, unsigned long); /* namei.c */ extern int ext3_orphan_add(handle_t *, struct inode *); -- cgit v1.2.3 From d20a390a0ee2bf2f692c539c6ce1c829e1080bb5 Mon Sep 17 00:00:00 2001 From: Paul Menage Date: Thu, 2 Apr 2009 16:57:22 -0700 Subject: cgroups: fix cgroup.h comments Fix the style of some multi-line comments in cgroup.h to match Documentation/CodingStyle Signed-off-by: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 71 +++++++++++++++++++++++++++++--------------------- 1 file changed, 41 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 499900d0cee7..bb8feb9feccd 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -47,14 +47,18 @@ enum cgroup_subsys_id { /* Per-subsystem/per-cgroup state maintained by the system. */ struct cgroup_subsys_state { - /* The cgroup that this subsystem is attached to. Useful + /* + * The cgroup that this subsystem is attached to. 
Useful * for subsystems that want to know about the cgroup - * hierarchy structure */ + * hierarchy structure + */ struct cgroup *cgroup; - /* State maintained by the cgroup system to allow subsystems + /* + * State maintained by the cgroup system to allow subsystems * to be "busy". Should be accessed via css_get(), - * css_tryget() and and css_put(). */ + * css_tryget() and and css_put(). + */ atomic_t refcnt; @@ -120,8 +124,10 @@ static inline void css_put(struct cgroup_subsys_state *css) enum { /* Control Group is dead */ CGRP_REMOVED, - /* Control Group has previously had a child cgroup or a task, - * but no longer (only if CGRP_NOTIFY_ON_RELEASE is set) */ + /* + * Control Group has previously had a child cgroup or a task, + * but no longer (only if CGRP_NOTIFY_ON_RELEASE is set) + */ CGRP_RELEASABLE, /* Control Group requires release notifications to userspace */ CGRP_NOTIFY_ON_RELEASE, @@ -130,9 +136,10 @@ enum { struct cgroup { unsigned long flags; /* "unsigned long" so bitops work */ - /* count users of this cgroup. >0 means busy, but doesn't - * necessarily indicate the number of tasks in the - * cgroup */ + /* + * count users of this cgroup. >0 means busy, but doesn't + * necessarily indicate the number of tasks in the cgroup + */ atomic_t count; /* @@ -142,7 +149,7 @@ struct cgroup { struct list_head sibling; /* my parent's children */ struct list_head children; /* my children */ - struct cgroup *parent; /* my parent */ + struct cgroup *parent; /* my parent */ struct dentry *dentry; /* cgroup fs entry, RCU protected */ /* Private pointers for each registered subsystem */ @@ -177,11 +184,12 @@ struct cgroup { struct rcu_head rcu_head; }; -/* A css_set is a structure holding pointers to a set of +/* + * A css_set is a structure holding pointers to a set of * cgroup_subsys_state objects. This saves space in the task struct * object and speeds up fork()/exit(), since a single inc/dec and a - * list_add()/del() can bump the reference count on the entire - * cgroup set for a task. + * list_add()/del() can bump the reference count on the entire cgroup + * set for a task. */ struct css_set { @@ -226,13 +234,8 @@ struct cgroup_map_cb { void *state; }; -/* struct cftype: - * - * The files in the cgroup filesystem mostly have a very simple read/write - * handling, some common function will take care of it. Nevertheless some cases - * (read tasks) are special and therefore I define this structure for every - * kind of file. - * +/* + * struct cftype: handler definitions for cgroup control files * * When reading/writing to a file: * - the cgroup to use is file->f_dentry->d_parent->d_fsdata @@ -241,8 +244,10 @@ struct cgroup_map_cb { #define MAX_CFTYPE_NAME 64 struct cftype { - /* By convention, the name should begin with the name of the - * subsystem, followed by a period */ + /* + * By convention, the name should begin with the name of the + * subsystem, followed by a period + */ char name[MAX_CFTYPE_NAME]; int private; @@ -321,13 +326,17 @@ struct cgroup_scanner { struct ptr_heap *heap; }; -/* Add a new file to the given cgroup directory. Should only be - * called by subsystems from within a populate() method */ +/* + * Add a new file to the given cgroup directory. Should only be + * called by subsystems from within a populate() method + */ int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, const struct cftype *cft); -/* Add a set of new files to the given cgroup directory. 
Should - * only be called by subsystems from within a populate() method */ +/* + * Add a set of new files to the given cgroup directory. Should + * only be called by subsystems from within a populate() method + */ int cgroup_add_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, const struct cftype cft[], @@ -419,7 +428,8 @@ struct cgroup_iter { struct list_head *task; }; -/* To iterate across the tasks in a cgroup: +/* + * To iterate across the tasks in a cgroup: * * 1) call cgroup_iter_start to intialize an iterator * @@ -428,9 +438,10 @@ struct cgroup_iter { * * 3) call cgroup_iter_end() to destroy the iterator. * - * Or, call cgroup_scan_tasks() to iterate through every task in a cpuset. - * - cgroup_scan_tasks() holds the css_set_lock when calling the test_task() - * callback, but not while calling the process_task() callback. + * Or, call cgroup_scan_tasks() to iterate through every task in a + * cgroup - cgroup_scan_tasks() holds the css_set_lock when calling + * the test_task() callback, but not while calling the process_task() + * callback. */ void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it); struct task_struct *cgroup_iter_next(struct cgroup *cgrp, -- cgit v1.2.3 From 313e924c0852943e67335fad9d2608701f0dfe8e Mon Sep 17 00:00:00 2001 From: Grzegorz Nosek Date: Thu, 2 Apr 2009 16:57:23 -0700 Subject: cgroups: relax ns_can_attach checks to allow attaching to grandchild cgroups The ns_proxy cgroup allows moving processes to child cgroups only one level deep at a time. This commit relaxes this restriction and makes it possible to attach tasks directly to grandchild cgroups, e.g.: ($pid is in the root cgroup) echo $pid > /cgroup/CG1/CG2/tasks Previously this operation would fail with -EPERM and would have to be performed as two steps: echo $pid > /cgroup/CG1/tasks echo $pid > /cgroup/CG1/CG2/tasks Also, the target cgroup no longer needs to be empty to move a task there. Signed-off-by: Grzegorz Nosek Acked-by: Serge Hallyn Reviewed-by: Li Zefan Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 4 ++-- kernel/cgroup.c | 11 ++++++----- kernel/ns_cgroup.c | 14 ++++---------- 3 files changed, 12 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index bb8feb9feccd..788c4964c142 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -348,8 +348,8 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen); int cgroup_task_count(const struct cgroup *cgrp); -/* Return true if the cgroup is a descendant of the current cgroup */ -int cgroup_is_descendant(const struct cgroup *cgrp); +/* Return true if cgrp is a descendant of the task's cgroup */ +int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task); /* Control Group subsystem type. See Documentation/cgroups.txt for details */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c500ca7239b2..27792bcb0758 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3084,18 +3084,19 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys, } /** - * cgroup_is_descendant - see if @cgrp is a descendant of current task's cgrp + * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp * @cgrp: the cgroup in question + * @task: the task in question * - * See if @cgrp is a descendant of the current task's cgroup in - * the appropriate hierarchy. + * See if @cgrp is a descendant of @task's cgroup in the appropriate + * hierarchy. 
* * If we are sending in dummytop, then presumably we are creating * the top cgroup in the subsystem. * * Called only by the ns (nsproxy) cgroup. */ -int cgroup_is_descendant(const struct cgroup *cgrp) +int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task) { int ret; struct cgroup *target; @@ -3105,7 +3106,7 @@ int cgroup_is_descendant(const struct cgroup *cgrp) return 1; get_first_subsys(cgrp, NULL, &subsys_id); - target = task_cgroup(current, subsys_id); + target = task_cgroup(task, subsys_id); while (cgrp != target && cgrp!= cgrp->top_cgroup) cgrp = cgrp->parent; ret = (cgrp == target); diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c index 78bc3fdac0d2..5aa854f9e5ae 100644 --- a/kernel/ns_cgroup.c +++ b/kernel/ns_cgroup.c @@ -34,7 +34,7 @@ int ns_cgroup_clone(struct task_struct *task, struct pid *pid) /* * Rules: - * 1. you can only enter a cgroup which is a child of your current + * 1. you can only enter a cgroup which is a descendant of your current * cgroup * 2. you can only place another process into a cgroup if * a. you have CAP_SYS_ADMIN @@ -45,21 +45,15 @@ int ns_cgroup_clone(struct task_struct *task, struct pid *pid) static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup, struct task_struct *task) { - struct cgroup *orig; - if (current != task) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!cgroup_is_descendant(new_cgroup)) + if (!cgroup_is_descendant(new_cgroup, current)) return -EPERM; } - if (atomic_read(&new_cgroup->count) != 0) - return -EPERM; - - orig = task_cgroup(task, ns_subsys_id); - if (orig && orig != new_cgroup->parent) + if (!cgroup_is_descendant(new_cgroup, task)) return -EPERM; return 0; @@ -77,7 +71,7 @@ static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss, if (!capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); - if (!cgroup_is_descendant(cgroup)) + if (!cgroup_is_descendant(cgroup, current)) return ERR_PTR(-EPERM); ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL); -- cgit v1.2.3 From 38460b48d06440de46b34cb778bd6c4855030754 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 2 Apr 2009 16:57:25 -0700 Subject: cgroup: CSS ID support Patch for Per-CSS(Cgroup Subsys State) ID and private hierarchy code. This patch attaches unique ID to each css and provides following. - css_lookup(subsys, id) returns pointer to struct cgroup_subysys_state of id. - css_get_next(subsys, id, rootid, depth, foundid) returns the next css under "root" by scanning When cgroup_subsys->use_id is set, an id for css is maintained. The cgroup framework only parepares - css_id of root css for subsys - id is automatically attached at creation of css. - id is *not* freed automatically. Because the cgroup framework don't know lifetime of cgroup_subsys_state. free_css_id() function is provided. This must be called by subsys. There are several reasons to develop this. - Saving space .... For example, memcg's swap_cgroup is array of pointers to cgroup. But it is not necessary to be very fast. By replacing pointers(8bytes per ent) to ID (2byes per ent), we can reduce much amount of memory usage. - Scanning without lock. CSS_ID provides "scan id under this ROOT" function. By this, scanning css under root can be written without locks. ex) do { rcu_read_lock(); next = cgroup_get_next(subsys, id, root, &found); /* check sanity of next here */ css_tryget(); rcu_read_unlock(); id = found + 1 } while(...) Characteristics: - Each css has unique ID under subsys. - Lifetime of ID is controlled by subsys. 
- css ID contains "ID" and "Depth in hierarchy" and stack of hierarchy - Allowed ID is 1-65535, ID 0 is UNUSED ID. Design Choices: - scan-by-ID v.s. scan-by-tree-walk. As /proc's pid scan does, scan-by-ID is robust when scanning is done by following kind of routine. scan -> rest a while(release a lock) -> conitunue from interrupted memcg's hierarchical reclaim does this. - When subsys->use_id is set, # of css in the system is limited to 65535. [bharata@linux.vnet.ibm.com: remove rcu_read_lock() from css_get_next()] Signed-off-by: KAMEZAWA Hiroyuki Acked-by: Paul Menage Cc: Li Zefan Cc: Balbir Singh Cc: Daisuke Nishimura Signed-off-by: Bharata B Rao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 50 +++++++++ include/linux/idr.h | 1 + kernel/cgroup.c | 286 ++++++++++++++++++++++++++++++++++++++++++++++++- lib/idr.c | 46 ++++++++ 4 files changed, 382 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 788c4964c142..9a23bb098205 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -15,6 +15,7 @@ #include #include #include +#include #ifdef CONFIG_CGROUPS @@ -22,6 +23,7 @@ struct cgroupfs_root; struct cgroup_subsys; struct inode; struct cgroup; +struct css_id; extern int cgroup_init_early(void); extern int cgroup_init(void); @@ -63,6 +65,8 @@ struct cgroup_subsys_state { atomic_t refcnt; unsigned long flags; + /* ID for this css, if possible */ + struct css_id *id; }; /* bits in struct cgroup_subsys_state flags field */ @@ -373,6 +377,11 @@ struct cgroup_subsys { int active; int disabled; int early_init; + /* + * True if this subsys uses ID. ID is not available before cgroup_init() + * (not available in early_init time.) + */ + bool use_id; #define MAX_CGROUP_TYPE_NAMELEN 32 const char *name; @@ -395,6 +404,9 @@ struct cgroup_subsys { */ struct cgroupfs_root *root; struct list_head sibling; + /* used when use_id == true */ + struct idr idr; + spinlock_t id_lock; }; #define SUBSYS(_x) extern struct cgroup_subsys _x ## _subsys; @@ -450,6 +462,44 @@ void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it); int cgroup_scan_tasks(struct cgroup_scanner *scan); int cgroup_attach_task(struct cgroup *, struct task_struct *); +/* + * CSS ID is ID for cgroup_subsys_state structs under subsys. This only works + * if cgroup_subsys.use_id == true. It can be used for looking up and scanning. + * CSS ID is assigned at cgroup allocation (create) automatically + * and removed when subsys calls free_css_id() function. This is because + * the lifetime of cgroup_subsys_state is subsys's matter. + * + * Looking up and scanning function should be called under rcu_read_lock(). + * Taking cgroup_mutex()/hierarchy_mutex() is not necessary for following calls. + * But the css returned by this routine can be "not populated yet" or "being + * destroyed". The caller should check css and cgroup's status. + */ + +/* + * Typically Called at ->destroy(), or somewhere the subsys frees + * cgroup_subsys_state. + */ +void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css); + +/* Find a cgroup_subsys_state which has given ID */ + +struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id); + +/* + * Get a cgroup whose id is greater than or equal to id under tree of root. + * Returning a cgroup_subsys_state or NULL. 
+ */ +struct cgroup_subsys_state *css_get_next(struct cgroup_subsys *ss, int id, + struct cgroup_subsys_state *root, int *foundid); + +/* Returns true if root is ancestor of cg */ +bool css_is_ancestor(struct cgroup_subsys_state *cg, + struct cgroup_subsys_state *root); + +/* Get id and depth of css */ +unsigned short css_id(struct cgroup_subsys_state *css); +unsigned short css_depth(struct cgroup_subsys_state *css); + #else /* !CONFIG_CGROUPS */ static inline int cgroup_init_early(void) { return 0; } diff --git a/include/linux/idr.h b/include/linux/idr.h index dd846df8cd32..e968db71e33a 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -106,6 +106,7 @@ int idr_get_new(struct idr *idp, void *ptr, int *id); int idr_get_new_above(struct idr *idp, void *ptr, int starting_id, int *id); int idr_for_each(struct idr *idp, int (*fn)(int id, void *p, void *data), void *data); +void *idr_get_next(struct idr *idp, int *nextid); void *idr_replace(struct idr *idp, void *ptr, int id); void idr_remove(struct idr *idp, int id); void idr_remove_all(struct idr *idp); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 27792bcb0758..d3c521137425 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -94,7 +94,6 @@ struct cgroupfs_root { char release_agent_path[PATH_MAX]; }; - /* * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the * subsystems that are otherwise unattached - it never has more than a @@ -102,6 +101,39 @@ struct cgroupfs_root { */ static struct cgroupfs_root rootnode; +/* + * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when + * cgroup_subsys->use_id != 0. + */ +#define CSS_ID_MAX (65535) +struct css_id { + /* + * The css to which this ID points. This pointer is set to valid value + * after cgroup is populated. If cgroup is removed, this will be NULL. + * This pointer is expected to be RCU-safe because destroy() + * is called after synchronize_rcu(). But for safe use, css_is_removed() + * css_tryget() should be used for avoiding race. + */ + struct cgroup_subsys_state *css; + /* + * ID of this css. + */ + unsigned short id; + /* + * Depth in hierarchy which this ID belongs to. + */ + unsigned short depth; + /* + * ID is freed by RCU. (and lookup routine is RCU safe.) + */ + struct rcu_head rcu_head; + /* + * Hierarchy of CSS ID belongs to. + */ + unsigned short stack[0]; /* Array of Length (depth+1) */ +}; + + /* The list of hierarchy roots */ static LIST_HEAD(roots); @@ -185,6 +217,8 @@ struct cg_cgroup_link { static struct css_set init_css_set; static struct cg_cgroup_link init_css_set_link; +static int cgroup_subsys_init_idr(struct cgroup_subsys *ss); + /* css_set_lock protects the list of css_set objects, and the * chain of tasks off each css_set. Nests outside task->alloc_lock * due to cgroup_iter_start() */ @@ -567,6 +601,9 @@ static struct backing_dev_info cgroup_backing_dev_info = { .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; +static int alloc_css_id(struct cgroup_subsys *ss, + struct cgroup *parent, struct cgroup *child); + static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) { struct inode *inode = new_inode(sb); @@ -2327,6 +2364,17 @@ static int cgroup_populate_dir(struct cgroup *cgrp) if (ss->populate && (err = ss->populate(ss, cgrp)) < 0) return err; } + /* This cgroup is ready now */ + for_each_subsys(cgrp->root, ss) { + struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + /* + * Update id->css pointer and make this css visible from + * CSS ID functions. 
This pointer will be dereferened + * from RCU-read-side without locks. + */ + if (css->id) + rcu_assign_pointer(css->id->css, css); + } return 0; } @@ -2338,6 +2386,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, css->cgroup = cgrp; atomic_set(&css->refcnt, 1); css->flags = 0; + css->id = NULL; if (cgrp == dummytop) set_bit(CSS_ROOT, &css->flags); BUG_ON(cgrp->subsys[ss->subsys_id]); @@ -2413,6 +2462,10 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, goto err_destroy; } init_cgroup_css(css, ss, cgrp); + if (ss->use_id) + if (alloc_css_id(ss, parent, cgrp)) + goto err_destroy; + /* At error, ->destroy() callback has to free assigned ID. */ } cgroup_lock_hierarchy(root); @@ -2708,6 +2761,8 @@ int __init cgroup_init(void) struct cgroup_subsys *ss = subsys[i]; if (!ss->early_init) cgroup_init_subsys(ss); + if (ss->use_id) + cgroup_subsys_init_idr(ss); } /* Add init_css_set to the hash table */ @@ -3242,3 +3297,232 @@ static int __init cgroup_disable(char *str) return 1; } __setup("cgroup_disable=", cgroup_disable); + +/* + * Functons for CSS ID. + */ + +/* + *To get ID other than 0, this should be called when !cgroup_is_removed(). + */ +unsigned short css_id(struct cgroup_subsys_state *css) +{ + struct css_id *cssid = rcu_dereference(css->id); + + if (cssid) + return cssid->id; + return 0; +} + +unsigned short css_depth(struct cgroup_subsys_state *css) +{ + struct css_id *cssid = rcu_dereference(css->id); + + if (cssid) + return cssid->depth; + return 0; +} + +bool css_is_ancestor(struct cgroup_subsys_state *child, + struct cgroup_subsys_state *root) +{ + struct css_id *child_id = rcu_dereference(child->id); + struct css_id *root_id = rcu_dereference(root->id); + + if (!child_id || !root_id || (child_id->depth < root_id->depth)) + return false; + return child_id->stack[root_id->depth] == root_id->id; +} + +static void __free_css_id_cb(struct rcu_head *head) +{ + struct css_id *id; + + id = container_of(head, struct css_id, rcu_head); + kfree(id); +} + +void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) +{ + struct css_id *id = css->id; + /* When this is called before css_id initialization, id can be NULL */ + if (!id) + return; + + BUG_ON(!ss->use_id); + + rcu_assign_pointer(id->css, NULL); + rcu_assign_pointer(css->id, NULL); + spin_lock(&ss->id_lock); + idr_remove(&ss->idr, id->id); + spin_unlock(&ss->id_lock); + call_rcu(&id->rcu_head, __free_css_id_cb); +} + +/* + * This is called by init or create(). Then, calls to this function are + * always serialized (By cgroup_mutex() at create()). + */ + +static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) +{ + struct css_id *newid; + int myid, error, size; + + BUG_ON(!ss->use_id); + + size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1); + newid = kzalloc(size, GFP_KERNEL); + if (!newid) + return ERR_PTR(-ENOMEM); + /* get id */ + if (unlikely(!idr_pre_get(&ss->idr, GFP_KERNEL))) { + error = -ENOMEM; + goto err_out; + } + spin_lock(&ss->id_lock); + /* Don't use 0. 
allocates an ID of 1-65535 */ + error = idr_get_new_above(&ss->idr, newid, 1, &myid); + spin_unlock(&ss->id_lock); + + /* Returns error when there are no free spaces for new ID.*/ + if (error) { + error = -ENOSPC; + goto err_out; + } + if (myid > CSS_ID_MAX) + goto remove_idr; + + newid->id = myid; + newid->depth = depth; + return newid; +remove_idr: + error = -ENOSPC; + spin_lock(&ss->id_lock); + idr_remove(&ss->idr, myid); + spin_unlock(&ss->id_lock); +err_out: + kfree(newid); + return ERR_PTR(error); + +} + +static int __init cgroup_subsys_init_idr(struct cgroup_subsys *ss) +{ + struct css_id *newid; + struct cgroup_subsys_state *rootcss; + + spin_lock_init(&ss->id_lock); + idr_init(&ss->idr); + + rootcss = init_css_set.subsys[ss->subsys_id]; + newid = get_new_cssid(ss, 0); + if (IS_ERR(newid)) + return PTR_ERR(newid); + + newid->stack[0] = newid->id; + newid->css = rootcss; + rootcss->id = newid; + return 0; +} + +static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent, + struct cgroup *child) +{ + int subsys_id, i, depth = 0; + struct cgroup_subsys_state *parent_css, *child_css; + struct css_id *child_id, *parent_id = NULL; + + subsys_id = ss->subsys_id; + parent_css = parent->subsys[subsys_id]; + child_css = child->subsys[subsys_id]; + depth = css_depth(parent_css) + 1; + parent_id = parent_css->id; + + child_id = get_new_cssid(ss, depth); + if (IS_ERR(child_id)) + return PTR_ERR(child_id); + + for (i = 0; i < depth; i++) + child_id->stack[i] = parent_id->stack[i]; + child_id->stack[depth] = child_id->id; + /* + * child_id->css pointer will be set after this cgroup is available + * see cgroup_populate_dir() + */ + rcu_assign_pointer(child_css->id, child_id); + + return 0; +} + +/** + * css_lookup - lookup css by id + * @ss: cgroup subsys to be looked into. + * @id: the id + * + * Returns pointer to cgroup_subsys_state if there is valid one with id. + * NULL if not. Should be called under rcu_read_lock() + */ +struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id) +{ + struct css_id *cssid = NULL; + + BUG_ON(!ss->use_id); + cssid = idr_find(&ss->idr, id); + + if (unlikely(!cssid)) + return NULL; + + return rcu_dereference(cssid->css); +} + +/** + * css_get_next - lookup next cgroup under specified hierarchy. + * @ss: pointer to subsystem + * @id: current position of iteration. + * @root: pointer to css. search tree under this. + * @foundid: position of found object. + * + * Search next css under the specified hierarchy of rootid. Calling under + * rcu_read_lock() is necessary. Returns NULL if it reaches the end. + */ +struct cgroup_subsys_state * +css_get_next(struct cgroup_subsys *ss, int id, + struct cgroup_subsys_state *root, int *foundid) +{ + struct cgroup_subsys_state *ret = NULL; + struct css_id *tmp; + int tmpid; + int rootid = css_id(root); + int depth = css_depth(root); + + if (!rootid) + return NULL; + + BUG_ON(!ss->use_id); + /* fill start point for scan */ + tmpid = id; + while (1) { + /* + * scan next entry from bitmap(tree), tmpid is updated after + * idr_get_next(). 
+ */ + spin_lock(&ss->id_lock); + tmp = idr_get_next(&ss->idr, &tmpid); + spin_unlock(&ss->id_lock); + + if (!tmp) + break; + if (tmp->depth >= depth && tmp->stack[depth] == rootid) { + ret = rcu_dereference(tmp->css); + if (ret) { + *foundid = tmpid; + break; + } + } + /* continue to scan from next id */ + tmpid = tmpid + 1; + } + return ret; +} + diff --git a/lib/idr.c b/lib/idr.c index dab4bca86f5d..80ca9aca038b 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -578,6 +578,52 @@ int idr_for_each(struct idr *idp, } EXPORT_SYMBOL(idr_for_each); +/** + * idr_get_next - lookup next object of id to given id. + * @idp: idr handle + * @id: pointer to lookup key + * + * Returns pointer to registered object with id, which is next number to + * given id. + */ + +void *idr_get_next(struct idr *idp, int *nextidp) +{ + struct idr_layer *p, *pa[MAX_LEVEL]; + struct idr_layer **paa = &pa[0]; + int id = *nextidp; + int n, max; + + /* find first ent */ + n = idp->layers * IDR_BITS; + max = 1 << n; + p = rcu_dereference(idp->top); + if (!p) + return NULL; + + while (id < max) { + while (n > 0 && p) { + n -= IDR_BITS; + *paa++ = p; + p = rcu_dereference(p->ary[(id >> n) & IDR_MASK]); + } + + if (p) { + *nextidp = id; + return p; + } + + id += 1 << n; + while (n < fls(id)) { + n += IDR_BITS; + p = *--paa; + } + } + return NULL; +} + + + /** * idr_replace - replace pointer for given id * @idp: idr handle -- cgit v1.2.3 From ec64f51545fffbc4cb968f0cea56341a4b07e85a Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 2 Apr 2009 16:57:26 -0700 Subject: cgroup: fix frequent -EBUSY at rmdir In following situation, with memory subsystem, /groupA use_hierarchy==1 /01 some tasks /02 some tasks /03 some tasks /04 empty When tasks under 01/02/03 hit limit on /groupA, hierarchical reclaim is triggered and the kernel walks tree under groupA. In this case, rmdir /groupA/04 fails with -EBUSY frequently because of temporal refcnt from the kernel. In general. cgroup can be rmdir'd if there are no children groups and no tasks. Frequent fails of rmdir() is not useful to users. (And the reason for -EBUSY is unknown to users.....in most cases) This patch tries to modify above behavior, by - retries if css_refcnt is got by someone. - add "return value" to pre_destroy() and allows subsystem to say "we're really busy!" Signed-off-by: KAMEZAWA Hiroyuki Cc: Paul Menage Cc: Li Zefan Cc: Balbir Singh Cc: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/cgroups.txt | 6 ++- include/linux/cgroup.h | 6 ++- kernel/cgroup.c | 81 ++++++++++++++++++++++++++++++++------- mm/memcontrol.c | 5 ++- 4 files changed, 79 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt index 93feb8444489..cdc46a501b85 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt @@ -476,11 +476,13 @@ cgroup->parent is still valid. (Note - can also be called for a newly-created cgroup if an error occurs after this subsystem's create() method has been called for the new cgroup). -void pre_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp); +int pre_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp); Called before checking the reference count on each subsystem. This may be useful for subsystems which have some extra references even if -there are not tasks in the cgroup. +there are not tasks in the cgroup. If pre_destroy() returns error code, +rmdir() will fail with it. 
From this behavior, pre_destroy() can be +called multiple times against a cgroup. int can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, struct task_struct *task) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 9a23bb098205..7d824b80b3d7 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -135,6 +135,10 @@ enum { CGRP_RELEASABLE, /* Control Group requires release notifications to userspace */ CGRP_NOTIFY_ON_RELEASE, + /* + * A thread in rmdir() is wating for this cgroup. + */ + CGRP_WAIT_ON_RMDIR, }; struct cgroup { @@ -360,7 +364,7 @@ int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task); struct cgroup_subsys { struct cgroup_subsys_state *(*create)(struct cgroup_subsys *ss, struct cgroup *cgrp); - void (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp); + int (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp); void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp); int (*can_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp, struct task_struct *tsk); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d3c521137425..fc5e4a48582f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -622,13 +622,18 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) * Call subsys's pre_destroy handler. * This is called before css refcnt check. */ -static void cgroup_call_pre_destroy(struct cgroup *cgrp) +static int cgroup_call_pre_destroy(struct cgroup *cgrp) { struct cgroup_subsys *ss; + int ret = 0; + for_each_subsys(cgrp->root, ss) - if (ss->pre_destroy) - ss->pre_destroy(ss, cgrp); - return; + if (ss->pre_destroy) { + ret = ss->pre_destroy(ss, cgrp); + if (ret) + break; + } + return ret; } static void free_cgroup_rcu(struct rcu_head *obj) @@ -722,6 +727,22 @@ static void cgroup_d_remove_dir(struct dentry *dentry) remove_dir(dentry); } +/* + * A queue for waiters to do rmdir() cgroup. A tasks will sleep when + * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some + * reference to css->refcnt. In general, this refcnt is expected to goes down + * to zero, soon. + * + * CGRP_WAIT_ON_RMDIR flag is modified under cgroup's inode->i_mutex; + */ +DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); + +static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp) +{ + if (unlikely(test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) + wake_up_all(&cgroup_rmdir_waitq); +} + static int rebind_subsystems(struct cgroupfs_root *root, unsigned long final_bits) { @@ -1317,6 +1338,12 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) set_bit(CGRP_RELEASABLE, &oldcgrp->flags); synchronize_rcu(); put_css_set(cg); + + /* + * wake up rmdir() waiter. the rmdir should fail since the cgroup + * is no longer empty. + */ + cgroup_wakeup_rmdir_waiters(cgrp); return 0; } @@ -2608,9 +2635,11 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) struct cgroup *cgrp = dentry->d_fsdata; struct dentry *d; struct cgroup *parent; + DEFINE_WAIT(wait); + int ret; /* the vfs holds both inode->i_mutex already */ - +again: mutex_lock(&cgroup_mutex); if (atomic_read(&cgrp->count) != 0) { mutex_unlock(&cgroup_mutex); @@ -2626,17 +2655,39 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) * Call pre_destroy handlers of subsys. Notify subsystems * that rmdir() request comes. 
*/ - cgroup_call_pre_destroy(cgrp); + ret = cgroup_call_pre_destroy(cgrp); + if (ret) + return ret; mutex_lock(&cgroup_mutex); parent = cgrp->parent; - - if (atomic_read(&cgrp->count) - || !list_empty(&cgrp->children) - || !cgroup_clear_css_refs(cgrp)) { + if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { mutex_unlock(&cgroup_mutex); return -EBUSY; } + /* + * css_put/get is provided for subsys to grab refcnt to css. In typical + * case, subsystem has no reference after pre_destroy(). But, under + * hierarchy management, some *temporal* refcnt can be hold. + * To avoid returning -EBUSY to a user, waitqueue is used. If subsys + * is really busy, it should return -EBUSY at pre_destroy(). wake_up + * is called when css_put() is called and refcnt goes down to 0. + */ + set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); + prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); + + if (!cgroup_clear_css_refs(cgrp)) { + mutex_unlock(&cgroup_mutex); + schedule(); + finish_wait(&cgroup_rmdir_waitq, &wait); + clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); + if (signal_pending(current)) + return -EINTR; + goto again; + } + /* NO css_tryget() can success after here. */ + finish_wait(&cgroup_rmdir_waitq, &wait); + clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); spin_lock(&release_list_lock); set_bit(CGRP_REMOVED, &cgrp->flags); @@ -3194,10 +3245,12 @@ void __css_put(struct cgroup_subsys_state *css) { struct cgroup *cgrp = css->cgroup; rcu_read_lock(); - if ((atomic_dec_return(&css->refcnt) == 1) && - notify_on_release(cgrp)) { - set_bit(CGRP_RELEASABLE, &cgrp->flags); - check_for_release(cgrp); + if (atomic_dec_return(&css->refcnt) == 1) { + if (notify_on_release(cgrp)) { + set_bit(CGRP_RELEASABLE, &cgrp->flags); + check_for_release(cgrp); + } + cgroup_wakeup_rmdir_waiters(cgrp); } rcu_read_unlock(); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8e4be9cb2a6a..8ffec674c5ac 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2272,11 +2272,12 @@ free_out: return ERR_PTR(-ENOMEM); } -static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss, +static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss, struct cgroup *cont) { struct mem_cgroup *mem = mem_cgroup_from_cont(cont); - mem_cgroup_force_empty(mem, false); + + return mem_cgroup_force_empty(mem, false); } static void mem_cgroup_destroy(struct cgroup_subsys *ss, -- cgit v1.2.3 From 099fca3225b39f7a3ed853036038054172b55581 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 2 Apr 2009 16:57:29 -0700 Subject: cgroups: show correct file mode We have some read-only files and write-only files, but currently they are all set to 0644, which is counter-intuitive and cause trouble for some cgroup tools like libcgroup. This patch adds 'mode' to struct cftype to allow cgroup subsys to set it's own files' file mode, and for the most cases cft->mode can be default to 0 and cgroup will figure out proper mode. 
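[editor's note: a minimal sketch of how a subsystem might fill in the new
->mode field; the "foo" names and handlers below are hypothetical and not
part of this patch]

	static struct cftype foo_files[] = {
		{
			/* read handler only: mode is deduced as S_IRUGO */
			.name = "foo.stat",
			.read_u64 = foo_stat_read,
		},
		{
			/* explicit mode overrides the deduced S_IRUGO|S_IWUSR */
			.name = "foo.control",
			.read_u64 = foo_control_read,
			.write_u64 = foo_control_write,
			.mode = S_IRUSR | S_IWUSR,
		},
	};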
Acked-by: Paul Menage Reviewed-by: KAMEZAWA Hiroyuki Signed-off-by: Li Zefan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 5 +++++ kernel/cgroup.c | 38 ++++++++++++++++++++++++++++++++++---- kernel/cpuset.c | 1 + 3 files changed, 40 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 7d824b80b3d7..b2816fba5306 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -258,6 +258,11 @@ struct cftype { */ char name[MAX_CFTYPE_NAME]; int private; + /* + * If not 0, file mode is set to this value, otherwise it will + * be figured out automatically + */ + mode_t mode; /* * If non-zero, defines the maximum length of string that can diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 9a6c2bfa1d9f..fea11c5c990c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1686,7 +1686,7 @@ static struct inode_operations cgroup_dir_inode_operations = { .rename = cgroup_rename, }; -static int cgroup_create_file(struct dentry *dentry, int mode, +static int cgroup_create_file(struct dentry *dentry, mode_t mode, struct super_block *sb) { static const struct dentry_operations cgroup_dops = { @@ -1732,7 +1732,7 @@ static int cgroup_create_file(struct dentry *dentry, int mode, * @mode: mode to set on new directory. */ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, - int mode) + mode_t mode) { struct dentry *parent; int error = 0; @@ -1750,6 +1750,33 @@ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, return error; } +/** + * cgroup_file_mode - deduce file mode of a control file + * @cft: the control file in question + * + * returns cft->mode if ->mode is not 0 + * returns S_IRUGO|S_IWUSR if it has both a read and a write handler + * returns S_IRUGO if it has only a read handler + * returns S_IWUSR if it has only a write hander + */ +static mode_t cgroup_file_mode(const struct cftype *cft) +{ + mode_t mode = 0; + + if (cft->mode) + return cft->mode; + + if (cft->read || cft->read_u64 || cft->read_s64 || + cft->read_map || cft->read_seq_string) + mode |= S_IRUGO; + + if (cft->write || cft->write_u64 || cft->write_s64 || + cft->write_string || cft->trigger) + mode |= S_IWUSR; + + return mode; +} + int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, const struct cftype *cft) @@ -1757,6 +1784,7 @@ int cgroup_add_file(struct cgroup *cgrp, struct dentry *dir = cgrp->dentry; struct dentry *dentry; int error; + mode_t mode; char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { @@ -1767,7 +1795,8 @@ int cgroup_add_file(struct cgroup *cgrp, BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex)); dentry = lookup_one_len(name, dir, strlen(name)); if (!IS_ERR(dentry)) { - error = cgroup_create_file(dentry, 0644 | S_IFREG, + mode = cgroup_file_mode(cft); + error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb); if (!error) dentry->d_fsdata = (void *)cft; @@ -2349,6 +2378,7 @@ static struct cftype files[] = { .write_u64 = cgroup_tasks_write, .release = cgroup_tasks_release, .private = FILE_TASKLIST, + .mode = S_IRUGO | S_IWUSR, }, { @@ -2449,7 +2479,7 @@ static void cgroup_unlock_hierarchy(struct cgroupfs_root *root) * Must be called with the mutex on the parent inode held */ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, - int mode) + mode_t mode) { struct cgroup *cgrp; struct cgroupfs_root *root = parent->root; diff --git 
a/kernel/cpuset.c b/kernel/cpuset.c index f76db9dcaa05..ee5ec386aa8b 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1706,6 +1706,7 @@ static struct cftype files[] = { .read_u64 = cpuset_read_u64, .write_u64 = cpuset_write_u64, .private = FILE_MEMORY_PRESSURE, + .mode = S_IRUGO, }, { -- cgit v1.2.3 From 0b7f569e45bb6be142d87017030669a6a7d327a1 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 2 Apr 2009 16:57:38 -0700 Subject: memcg: fix OOM killer under memcg This patch tries to fix OOM Killer problems caused by hierarchy. Now, memcg itself has OOM KILL function (in oom_kill.c) and tries to kill a task in memcg. But, when hierarchy is used, it's broken and correct task cannot be killed. For example, in following cgroup /groupA/ hierarchy=1, limit=1G, 01 nolimit 02 nolimit All tasks' memory usage under /groupA, /groupA/01, groupA/02 is limited to groupA's 1Gbytes but OOM Killer just kills tasks in groupA. This patch provides makes the bad process be selected from all tasks under hierarchy. BTW, currently, oom_jiffies is updated against groupA in above case. oom_jiffies of tree should be updated. To see how oom_jiffies is used, please check mem_cgroup_oom_called() callers. [akpm@linux-foundation.org: build fix] [akpm@linux-foundation.org: const fix] Signed-off-by: KAMEZAWA Hiroyuki Cc: Paul Menage Cc: Li Zefan Cc: Balbir Singh Cc: Daisuke Nishimura Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/memcg_test.txt | 20 +++++++++++++++++++- include/linux/cgroup.h | 2 +- kernel/cgroup.c | 2 +- mm/memcontrol.c | 30 ++++++++++++++++++++++++++++-- 4 files changed, 49 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/Documentation/cgroups/memcg_test.txt b/Documentation/cgroups/memcg_test.txt index 523a9c16c400..8a11caf417a0 100644 --- a/Documentation/cgroups/memcg_test.txt +++ b/Documentation/cgroups/memcg_test.txt @@ -1,5 +1,5 @@ Memory Resource Controller(Memcg) Implementation Memo. -Last Updated: 2009/1/19 +Last Updated: 2009/1/20 Base Kernel Version: based on 2.6.29-rc2. Because VM is getting complex (one of reasons is memcg...), memcg's behavior @@ -360,3 +360,21 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y. # kill malloc task. Of course, tmpfs v.s. swapoff test should be tested, too. + + 9.8 OOM-Killer + Out-of-memory caused by memcg's limit will kill tasks under + the memcg. When hierarchy is used, a task under hierarchy + will be killed by the kernel. + In this case, panic_on_oom shouldn't be invoked and tasks + in other groups shouldn't be killed. + + It's not difficult to cause OOM under memcg as following. + Case A) when you can swapoff + #swapoff -a + #echo 50M > /memory.limit_in_bytes + run 51M of malloc + + Case B) when you use mem+swap limitation. 
+ #echo 50M > memory.limit_in_bytes + #echo 50M > memory.memsw.limit_in_bytes + run 51M of malloc diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b2816fba5306..43763bd772b9 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -503,7 +503,7 @@ struct cgroup_subsys_state *css_get_next(struct cgroup_subsys *ss, int id, /* Returns true if root is ancestor of cg */ bool css_is_ancestor(struct cgroup_subsys_state *cg, - struct cgroup_subsys_state *root); + const struct cgroup_subsys_state *root); /* Get id and depth of css */ unsigned short css_id(struct cgroup_subsys_state *css); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f2a3f5c9936c..382109b5baeb 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3405,7 +3405,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css) } bool css_is_ancestor(struct cgroup_subsys_state *child, - struct cgroup_subsys_state *root) + const struct cgroup_subsys_state *root) { struct css_id *child_id = rcu_dereference(child->id); struct css_id *root_id = rcu_dereference(root->id); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6f6a575e77ad..025f8abfae2d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -295,6 +295,9 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) { struct mem_cgroup *mem = NULL; + + if (!mm) + return NULL; /* * Because we have no locks, mm->owner's may be being moved to other * cgroup. We use css_tryget() here even if this looks @@ -486,10 +489,20 @@ void mem_cgroup_move_lists(struct page *page, int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) { int ret; + struct mem_cgroup *curr = NULL; task_lock(task); - ret = task->mm && mm_match_cgroup(task->mm, mem); + rcu_read_lock(); + curr = try_get_mem_cgroup_from_mm(task->mm); + rcu_read_unlock(); task_unlock(task); + if (!curr) + return 0; + if (curr->use_hierarchy) + ret = css_is_ancestor(&curr->css, &mem->css); + else + ret = (curr == mem); + css_put(&curr->css); return ret; } @@ -820,6 +833,19 @@ bool mem_cgroup_oom_called(struct task_struct *task) rcu_read_unlock(); return ret; } + +static int record_last_oom_cb(struct mem_cgroup *mem, void *data) +{ + mem->last_oom_jiffies = jiffies; + return 0; +} + +static void record_last_oom(struct mem_cgroup *mem) +{ + mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb); +} + + /* * Unlike exported interface, "oom" parameter is added. if oom==true, * oom-killer can be invoked. @@ -902,7 +928,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, mutex_lock(&memcg_tasklist); mem_cgroup_out_of_memory(mem_over_limit, gfp_mask); mutex_unlock(&memcg_tasklist); - mem_over_limit->last_oom_jiffies = jiffies; + record_last_oom(mem_over_limit); } goto nomem; } -- cgit v1.2.3 From e222432bfa7dcf6ec008622a978c9f284ed5e3a9 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Thu, 2 Apr 2009 16:57:39 -0700 Subject: memcg: show memcg information during OOM Add RSS and swap to OOM output from memcg Display memcg values like failcnt, usage and limit when an OOM occurs due to memcg. Thanks to Johannes Weiner, Li Zefan, David Rientjes, Kamezawa Hiroyuki, Daisuke Nishimura and KOSAKI Motohiro for review. 
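[editor's note: the call site added by this patch, shown in context; "mem" is
the memory cgroup that hit its limit, or NULL for a global OOM, in which case
mem_cgroup_print_oom_info() simply returns without printing]

	/* in oom_kill_process(), mm/oom_kill.c */
	dump_stack();
	mem_cgroup_print_oom_info(mem, current);
	show_mem();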
Sample output ------------- Task in /a/x killed as a result of limit of /a memory: usage 1048576kB, limit 1048576kB, failcnt 4183 memory+swap: usage 1400964kB, limit 9007199254740991kB, failcnt 0 [akpm@linux-foundation.org: compilation fix] [akpm@linux-foundation.org: fix kerneldoc and whitespace] [akpm@linux-foundation.org: add printk facility level] Signed-off-by: Balbir Singh Cc: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Cc: Li Zefan Cc: Paul Menage Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 7 +++++ mm/memcontrol.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++ mm/oom_kill.c | 1 + 3 files changed, 77 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 326f45c86530..7aba9f264622 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -104,6 +104,8 @@ struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, struct zone *zone); struct zone_reclaim_stat* mem_cgroup_get_reclaim_stat_from_page(struct page *page); +extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, + struct task_struct *p); #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP extern int do_swap_account; @@ -270,6 +272,11 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page) return NULL; } +static inline void +mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) +{ +} + #endif /* CONFIG_CGROUP_MEM_CONT */ #endif /* _LINUX_MEMCONTROL_H */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 025f8abfae2d..2bdb6149faeb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -721,6 +722,74 @@ static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data) (*val)++; return 0; } + +/** + * mem_cgroup_print_mem_info: Called from OOM with tasklist_lock held in read mode. + * @memcg: The memory cgroup that went over limit + * @p: Task that is going to be killed + * + * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is + * enabled + */ +void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) +{ + struct cgroup *task_cgrp; + struct cgroup *mem_cgrp; + /* + * Need a buffer in BSS, can't rely on allocations. The code relies + * on the assumption that OOM is serialized for memory controller. + * If this assumption is broken, revisit this code. 
+ */ + static char memcg_name[PATH_MAX]; + int ret; + + if (!memcg) + return; + + + rcu_read_lock(); + + mem_cgrp = memcg->css.cgroup; + task_cgrp = task_cgroup(p, mem_cgroup_subsys_id); + + ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX); + if (ret < 0) { + /* + * Unfortunately, we are unable to convert to a useful name + * But we'll still print out the usage information + */ + rcu_read_unlock(); + goto done; + } + rcu_read_unlock(); + + printk(KERN_INFO "Task in %s killed", memcg_name); + + rcu_read_lock(); + ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX); + if (ret < 0) { + rcu_read_unlock(); + goto done; + } + rcu_read_unlock(); + + /* + * Continues from above, so we don't need an KERN_ level + */ + printk(KERN_CONT " as a result of limit of %s\n", memcg_name); +done: + + printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n", + res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, + res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, + res_counter_read_u64(&memcg->res, RES_FAILCNT)); + printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, " + "failcnt %llu\n", + res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, + res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, + res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); +} + /* * This function returns the number of memcg under hierarchy tree. Returns * 1(self count) if no children. diff --git a/mm/oom_kill.c b/mm/oom_kill.c index d3b9bac085b5..2f3166e308d9 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -394,6 +394,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, cpuset_print_task_mems_allowed(current); task_unlock(current); dump_stack(); + mem_cgroup_print_oom_info(mem, current); show_mem(); if (sysctl_oom_dump_tasks) dump_tasks(mem); -- cgit v1.2.3 From c137b5ece4b111e46981aae7da77315b9909809f Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Thu, 2 Apr 2009 16:57:40 -0700 Subject: memcg: remove mem_cgroup_calc_mapped_ratio() Currently, mem_cgroup_calc_mapped_ratio() is unused at all. it can be removed and KAMEZAWA-san suggested it. Signed-off-by: KOSAKI Motohiro Cc: KAMEZAWA Hiroyuki Acked-by: Balbir Singh Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 6 ------ mm/memcontrol.c | 17 ----------------- 2 files changed, 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 7aba9f264622..4562d09ab964 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -88,7 +88,6 @@ extern void mem_cgroup_end_migration(struct mem_cgroup *mem, /* * For memory reclaim. */ -extern int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem); extern long mem_cgroup_reclaim_imbalance(struct mem_cgroup *mem); extern int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem); @@ -211,11 +210,6 @@ static inline void mem_cgroup_end_migration(struct mem_cgroup *mem, { } -static inline int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem) -{ - return 0; -} - static inline int mem_cgroup_reclaim_imbalance(struct mem_cgroup *mem) { return 0; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2bdb6149faeb..7bb14fdc780c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -507,23 +507,6 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) return ret; } -/* - * Calculate mapped_ratio under memory controller. This will be used in - * vmscan.c for deteremining we have to reclaim mapped pages. 
- */ -int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem) -{ - long total, rss; - - /* - * usage is recorded in bytes. But, here, we assume the number of - * physical pages can be represented by "long" on any arch. - */ - total = (long) (mem->res.usage >> PAGE_SHIFT) + 1L; - rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); - return (int)((rss * 100L) / total); -} - /* * prev_priority control...this will be used in memory reclaim path. */ -- cgit v1.2.3 From 3918b96e03b2b8dd05889320623f6870e81d35ec Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Thu, 2 Apr 2009 16:57:41 -0700 Subject: memcg: remove mem_cgroup_reclaim_imbalance() remnants commit 4f98a2fee8acdb4ac84545df98cccecfd130f8db (vmscan: split LRU lists into anon & file sets) removed mem_cgroup_reclaim_imbalance(), but there are some leftovers in memcontrol.h. Signed-off-by: KOSAKI Motohiro Cc: KAMEZAWA Hiroyuki Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 4562d09ab964..18146c980b68 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -88,8 +88,6 @@ extern void mem_cgroup_end_migration(struct mem_cgroup *mem, /* * For memory reclaim. */ -extern long mem_cgroup_reclaim_imbalance(struct mem_cgroup *mem); - extern int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem); extern void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority); @@ -210,11 +208,6 @@ static inline void mem_cgroup_end_migration(struct mem_cgroup *mem, { } -static inline int mem_cgroup_reclaim_imbalance(struct mem_cgroup *mem) -{ - return 0; -} - static inline int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem) { return 0; -- cgit v1.2.3 From a3b2d692690aef228e493b1beaafe5364cab3237 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 2 Apr 2009 16:57:45 -0700 Subject: cgroups: use css id in swap cgroup for saving memory v5 Try to use CSS ID for records in swap_cgroup. By this, on 64bit machine, size of swap_cgroup goes down to 2 bytes from 8bytes. This means, when 2GB of swap is equipped, (assume the page size is 4096bytes) From size of swap_cgroup = 2G/4k * 8 = 4Mbytes. To size of swap_cgroup = 2G/4k * 2 = 1Mbytes. Reduction is large. Of course, there are trade-offs. This CSS ID will add overhead to swap-in/swap-out/swap-free. But in general, - swap is a resource which the user tend to avoid use. - If swap is never used, swap_cgroup area is not used. - Reading traditional manuals, size of swap should be proportional to size of memory. Memory size of machine is increasing now. I think reducing size of swap_cgroup makes sense. Note: - ID->CSS lookup routine has no locks, it's under RCU-Read-Side. - memcg can be obsolete at rmdir() but not freed while refcnt from swap_cgroup is available. Changelog v4->v5: - reworked on to memcg-charge-swapcache-to-proper-memcg.patch Changlog ->v4: - fixed not configured case. - deleted unnecessary comments. - fixed NULL pointer bug. - fixed message in dmesg. 
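[editor's note: the ID round trip, condensed from the
mem_cgroup_uncharge_swapcache()/mem_cgroup_uncharge_swap() hunks below;
error handling and the do_swap_account checks are omitted]

	/* at swap-out: store the 2-byte CSS ID instead of a memcg pointer */
	swap_cgroup_record(ent, css_id(&memcg->css));
	mem_cgroup_get(memcg);

	/* at swap-free: clear the slot and map the old ID back to a memcg */
	id = swap_cgroup_record(ent, 0);
	rcu_read_lock();
	memcg = mem_cgroup_lookup(id);	/* css_lookup() under the hood */
	if (memcg) {
		res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
		mem_cgroup_put(memcg);
	}
	rcu_read_unlock();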
[nishimura@mxp.nes.nec.co.jp: css_tryget can be called twice in !PageCgroupUsed case] Signed-off-by: KAMEZAWA Hiroyuki Cc: Li Zefan Cc: Balbir Singh Cc: Paul Menage Cc: Hugh Dickins Signed-off-by: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_cgroup.h | 13 ++++---- mm/memcontrol.c | 74 +++++++++++++++++++++++++++++++++++++-------- mm/page_cgroup.c | 32 +++++++++----------- 3 files changed, 82 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index 602cc1fdee90..7339c7bf7331 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h @@ -91,24 +91,23 @@ static inline void page_cgroup_init(void) #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP #include -extern struct mem_cgroup * -swap_cgroup_record(swp_entry_t ent, struct mem_cgroup *mem); -extern struct mem_cgroup *lookup_swap_cgroup(swp_entry_t ent); +extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id); +extern unsigned short lookup_swap_cgroup(swp_entry_t ent); extern int swap_cgroup_swapon(int type, unsigned long max_pages); extern void swap_cgroup_swapoff(int type); #else #include static inline -struct mem_cgroup *swap_cgroup_record(swp_entry_t ent, struct mem_cgroup *mem) +unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) { - return NULL; + return 0; } static inline -struct mem_cgroup *lookup_swap_cgroup(swp_entry_t ent) +unsigned short lookup_swap_cgroup(swp_entry_t ent) { - return NULL; + return 0; } static inline int diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 81b0ae8183d0..55dea5968464 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -991,10 +991,31 @@ nomem: return -ENOMEM; } + +/* + * A helper function to get mem_cgroup from ID. must be called under + * rcu_read_lock(). The caller must check css_is_removed() or some if + * it's concern. (dropping refcnt from swap can be called against removed + * memcg.) + */ +static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) +{ + struct cgroup_subsys_state *css; + + /* ID 0 is unused ID */ + if (!id) + return NULL; + css = css_lookup(&mem_cgroup_subsys, id); + if (!css) + return NULL; + return container_of(css, struct mem_cgroup, css); +} + static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page) { struct mem_cgroup *mem; struct page_cgroup *pc; + unsigned short id; swp_entry_t ent; VM_BUG_ON(!PageLocked(page)); @@ -1006,16 +1027,19 @@ static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page) /* * Used bit of swapcache is solid under page lock. */ - if (PageCgroupUsed(pc)) + if (PageCgroupUsed(pc)) { mem = pc->mem_cgroup; - else { + if (mem && !css_tryget(&mem->css)) + mem = NULL; + } else { ent.val = page_private(page); - mem = lookup_swap_cgroup(ent); + id = lookup_swap_cgroup(ent); + rcu_read_lock(); + mem = mem_cgroup_lookup(id); + if (mem && !css_tryget(&mem->css)) + mem = NULL; + rcu_read_unlock(); } - if (!mem) - return NULL; - if (!css_tryget(&mem->css)) - return NULL; return mem; } @@ -1276,12 +1300,22 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, if (do_swap_account && !ret && PageSwapCache(page)) { swp_entry_t ent = {.val = page_private(page)}; + unsigned short id; /* avoid double counting */ - mem = swap_cgroup_record(ent, NULL); + id = swap_cgroup_record(ent, 0); + rcu_read_lock(); + mem = mem_cgroup_lookup(id); if (mem) { + /* + * We did swap-in. 
Then, this entry is doubly counted + * both in mem and memsw. We uncharge it, here. + * Recorded ID can be obsolete. We avoid calling + * css_tryget() + */ res_counter_uncharge(&mem->memsw, PAGE_SIZE); mem_cgroup_put(mem); } + rcu_read_unlock(); } return ret; } @@ -1346,13 +1380,21 @@ void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) */ if (do_swap_account && PageSwapCache(page)) { swp_entry_t ent = {.val = page_private(page)}; + unsigned short id; struct mem_cgroup *memcg; - memcg = swap_cgroup_record(ent, NULL); + + id = swap_cgroup_record(ent, 0); + rcu_read_lock(); + memcg = mem_cgroup_lookup(id); if (memcg) { + /* + * This recorded memcg can be obsolete one. So, avoid + * calling css_tryget + */ res_counter_uncharge(&memcg->memsw, PAGE_SIZE); mem_cgroup_put(memcg); } - + rcu_read_unlock(); } /* add this page(page_cgroup) to the LRU we want. */ @@ -1473,7 +1515,7 @@ void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent) MEM_CGROUP_CHARGE_TYPE_SWAPOUT); /* record memcg information */ if (do_swap_account && memcg) { - swap_cgroup_record(ent, memcg); + swap_cgroup_record(ent, css_id(&memcg->css)); mem_cgroup_get(memcg); } if (memcg) @@ -1488,15 +1530,23 @@ void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent) void mem_cgroup_uncharge_swap(swp_entry_t ent) { struct mem_cgroup *memcg; + unsigned short id; if (!do_swap_account) return; - memcg = swap_cgroup_record(ent, NULL); + id = swap_cgroup_record(ent, 0); + rcu_read_lock(); + memcg = mem_cgroup_lookup(id); if (memcg) { + /* + * We uncharge this because swap is freed. + * This memcg can be obsolete one. We avoid calling css_tryget + */ res_counter_uncharge(&memcg->memsw, PAGE_SIZE); mem_cgroup_put(memcg); } + rcu_read_unlock(); } #endif diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index ceecfbb143fa..ebf81074bed4 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -285,12 +285,8 @@ struct swap_cgroup_ctrl { struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; -/* - * This 8bytes seems big..maybe we can reduce this when we can use "id" for - * cgroup rather than pointer. - */ struct swap_cgroup { - struct mem_cgroup *val; + unsigned short id; }; #define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup)) #define SC_POS_MASK (SC_PER_PAGE - 1) @@ -342,10 +338,10 @@ not_enough_page: * @ent: swap entry to be recorded into * @mem: mem_cgroup to be recorded * - * Returns old value at success, NULL at failure. - * (Of course, old value can be NULL.) + * Returns old value at success, 0 at failure. + * (Of course, old value can be 0.) */ -struct mem_cgroup *swap_cgroup_record(swp_entry_t ent, struct mem_cgroup *mem) +unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) { int type = swp_type(ent); unsigned long offset = swp_offset(ent); @@ -354,18 +350,18 @@ struct mem_cgroup *swap_cgroup_record(swp_entry_t ent, struct mem_cgroup *mem) struct swap_cgroup_ctrl *ctrl; struct page *mappage; struct swap_cgroup *sc; - struct mem_cgroup *old; + unsigned short old; if (!do_swap_account) - return NULL; + return 0; ctrl = &swap_cgroup_ctrl[type]; mappage = ctrl->map[idx]; sc = page_address(mappage); sc += pos; - old = sc->val; - sc->val = mem; + old = sc->id; + sc->id = id; return old; } @@ -374,9 +370,9 @@ struct mem_cgroup *swap_cgroup_record(swp_entry_t ent, struct mem_cgroup *mem) * lookup_swap_cgroup - lookup mem_cgroup tied to swap entry * @ent: swap entry to be looked up. * - * Returns pointer to mem_cgroup at success. NULL at failure. 
+ * Returns CSS ID of mem_cgroup at success. 0 at failure. (0 is invalid ID) */ -struct mem_cgroup *lookup_swap_cgroup(swp_entry_t ent) +unsigned short lookup_swap_cgroup(swp_entry_t ent) { int type = swp_type(ent); unsigned long offset = swp_offset(ent); @@ -385,16 +381,16 @@ struct mem_cgroup *lookup_swap_cgroup(swp_entry_t ent) struct swap_cgroup_ctrl *ctrl; struct page *mappage; struct swap_cgroup *sc; - struct mem_cgroup *ret; + unsigned short ret; if (!do_swap_account) - return NULL; + return 0; ctrl = &swap_cgroup_ctrl[type]; mappage = ctrl->map[idx]; sc = page_address(mappage); sc += pos; - ret = sc->val; + ret = sc->id; return ret; } @@ -432,7 +428,7 @@ int swap_cgroup_swapon(int type, unsigned long max_pages) printk(KERN_INFO "swap_cgroup: uses %ld bytes of vmalloc for pointer array space" - " and %ld bytes to hold mem_cgroup pointers on swap\n", + " and %ld bytes to hold mem_cgroup information per swap ents\n", array_size, length * PAGE_SIZE); printk(KERN_INFO "swap_cgroup can be disabled by noswapaccount boot option.\n"); -- cgit v1.2.3 From bd1a8ab73edd449fecda633449cc277b856ad4f5 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 2 Apr 2009 16:57:50 -0700 Subject: cgroups: add 'data' field to struct cgroup_scanner We need to pass some data to test_task() or process_task() in some cases. Will be used later. Signed-off-by: Li Zefan Cc: KAMEZAWA Hiroyuki Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 43763bd772b9..4316a546beb5 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -337,6 +337,7 @@ struct cgroup_scanner { void (*process_task)(struct task_struct *p, struct cgroup_scanner *scan); struct ptr_heap *heap; + void *data; }; /* -- cgit v1.2.3 From a1bc5a4eee990a1f290735c8694d0aebdad095fa Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 2 Apr 2009 16:57:54 -0700 Subject: cpusets: replace zone allowed functions with node allowed The cpuset_zone_allowed() variants are actually only a function of the zone's node. 
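[editor's note: a sketch of what the conversion looks like for a caller,
based on the new inline wrappers below, where the zone variants now just
do zone_to_nid() and call the node variant; the loop context is illustrative]

	/* before: the helper extracted the node from the zone itself */
	if (!cpuset_zone_allowed_softwall(zone, gfp_mask))
		continue;

	/* after: pass the node id directly (zone-based callers keep working
	 * via the cpuset_zone_allowed_*() inline wrappers) */
	if (!cpuset_node_allowed_softwall(zone_to_nid(zone), gfp_mask))
		continue;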
Cc: Paul Menage Acked-by: Christoph Lameter Cc: Randy Dunlap Signed-off-by: David Rientjes Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpuset.h | 33 +++++++++++++++++++++++----- kernel/cpuset.c | 59 +++++++++++++++++++++----------------------------- 2 files changed, 52 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 2e0d79678deb..05ea1dd7d681 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -12,6 +12,7 @@ #include #include #include +#include #ifdef CONFIG_CPUSETS @@ -29,19 +30,29 @@ void cpuset_init_current_mems_allowed(void); void cpuset_update_task_memory_state(void); int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask); -extern int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask); -extern int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask); +extern int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask); +extern int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask); -static int inline cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask) +static inline int cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) { return number_of_cpusets <= 1 || - __cpuset_zone_allowed_softwall(z, gfp_mask); + __cpuset_node_allowed_softwall(node, gfp_mask); } -static int inline cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask) +static inline int cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask) { return number_of_cpusets <= 1 || - __cpuset_zone_allowed_hardwall(z, gfp_mask); + __cpuset_node_allowed_hardwall(node, gfp_mask); +} + +static inline int cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask) +{ + return cpuset_node_allowed_softwall(zone_to_nid(z), gfp_mask); +} + +static inline int cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask) +{ + return cpuset_node_allowed_hardwall(zone_to_nid(z), gfp_mask); } extern int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, @@ -112,6 +123,16 @@ static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) return 1; } +static inline int cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) +{ + return 1; +} + +static inline int cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask) +{ + return 1; +} + static inline int cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask) { return 1; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 0619f109d38d..3ff910eb30d3 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2181,26 +2181,24 @@ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) } /** - * cpuset_zone_allowed_softwall - Can we allocate on zone z's memory node? - * @z: is this zone on an allowed node? + * cpuset_node_allowed_softwall - Can we allocate on a memory node? + * @node: is this an allowed node? * @gfp_mask: memory allocation flags * - * If we're in interrupt, yes, we can always allocate. If - * __GFP_THISNODE is set, yes, we can always allocate. If zone - * z's node is in our tasks mems_allowed, yes. If it's not a - * __GFP_HARDWALL request and this zone's nodes is in the nearest - * hardwalled cpuset ancestor to this tasks cpuset, yes. - * If the task has been OOM killed and has access to memory reserves - * as specified by the TIF_MEMDIE flag, yes. + * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is + * set, yes, we can always allocate. If node is in our task's mems_allowed, + * yes. 
If it's not a __GFP_HARDWALL request and this node is in the nearest + * hardwalled cpuset ancestor to this task's cpuset, yes. If the task has been + * OOM killed and has access to memory reserves as specified by the TIF_MEMDIE + * flag, yes. * Otherwise, no. * - * If __GFP_HARDWALL is set, cpuset_zone_allowed_softwall() - * reduces to cpuset_zone_allowed_hardwall(). Otherwise, - * cpuset_zone_allowed_softwall() might sleep, and might allow a zone - * from an enclosing cpuset. + * If __GFP_HARDWALL is set, cpuset_node_allowed_softwall() reduces to + * cpuset_node_allowed_hardwall(). Otherwise, cpuset_node_allowed_softwall() + * might sleep, and might allow a node from an enclosing cpuset. * - * cpuset_zone_allowed_hardwall() only handles the simpler case of - * hardwall cpusets, and never sleeps. + * cpuset_node_allowed_hardwall() only handles the simpler case of hardwall + * cpusets, and never sleeps. * * The __GFP_THISNODE placement logic is really handled elsewhere, * by forcibly using a zonelist starting at a specified node, and by @@ -2239,20 +2237,17 @@ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) * GFP_USER - only nodes in current tasks mems allowed ok. * * Rule: - * Don't call cpuset_zone_allowed_softwall if you can't sleep, unless you + * Don't call cpuset_node_allowed_softwall if you can't sleep, unless you * pass in the __GFP_HARDWALL flag set in gfp_flag, which disables * the code that might scan up ancestor cpusets and sleep. */ - -int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask) +int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) { - int node; /* node that zone z is on */ const struct cpuset *cs; /* current cpuset ancestors */ int allowed; /* is allocation in zone z allowed? */ if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) return 1; - node = zone_to_nid(z); might_sleep_if(!(gfp_mask & __GFP_HARDWALL)); if (node_isset(node, current->mems_allowed)) return 1; @@ -2281,15 +2276,15 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask) } /* - * cpuset_zone_allowed_hardwall - Can we allocate on zone z's memory node? - * @z: is this zone on an allowed node? + * cpuset_node_allowed_hardwall - Can we allocate on a memory node? + * @node: is this an allowed node? * @gfp_mask: memory allocation flags * - * If we're in interrupt, yes, we can always allocate. - * If __GFP_THISNODE is set, yes, we can always allocate. If zone - * z's node is in our tasks mems_allowed, yes. If the task has been - * OOM killed and has access to memory reserves as specified by the - * TIF_MEMDIE flag, yes. Otherwise, no. + * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is + * set, yes, we can always allocate. If node is in our task's mems_allowed, + * yes. If the task has been OOM killed and has access to memory reserves as + * specified by the TIF_MEMDIE flag, yes. + * Otherwise, no. * * The __GFP_THISNODE placement logic is really handled elsewhere, * by forcibly using a zonelist starting at a specified node, and by @@ -2297,20 +2292,16 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask) * any node on the zonelist except the first. By the time any such * calls get to this routine, we should just shut up and say 'yes'. 
* - * Unlike the cpuset_zone_allowed_softwall() variant, above, - * this variant requires that the zone be in the current tasks + * Unlike the cpuset_node_allowed_softwall() variant, above, + * this variant requires that the node be in the current task's * mems_allowed or that we're in interrupt. It does not scan up the * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset. * It never sleeps. */ - -int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask) +int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask) { - int node; /* node that zone z is on */ - if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) return 1; - node = zone_to_nid(z); if (node_isset(node, current->mems_allowed)) return 1; /* -- cgit v1.2.3 From 43918f2bf4806675943416d539d9d5e4d585ebff Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 2 Apr 2009 16:58:00 -0700 Subject: signals: remove 'handler' parameter to tracehook functions Container-init must behave like global-init to processes within the container and hence it must be immune to unhandled fatal signals from within the container (i.e SIG_DFL signals that terminate the process). But the same container-init must behave like a normal process to processes in ancestor namespaces and so if it receives the same fatal signal from a process in ancestor namespace, the signal must be processed. Implementing these semantics requires that send_signal() determine pid namespace of the sender but since signals can originate from workqueues/ interrupt-handlers, determining pid namespace of sender may not always be possible or safe. This patchset implements the design/simplified semantics suggested by Oleg Nesterov. The simplified semantics for container-init are: - container-init must never be terminated by a signal from a descendant process. - container-init must never be immune to SIGKILL from an ancestor namespace (so a process in parent namespace must always be able to terminate a descendant container). - container-init may be immune to unhandled fatal signals (like SIGUSR1) even if they are from ancestor namespace. SIGKILL/SIGSTOP are the only reliable signals to a container-init from ancestor namespace. This patch: Based on an earlier patch submitted by Oleg Nesterov and comments from Roland McGrath (http://lkml.org/lkml/2008/11/19/258). The handler parameter is currently unused in the tracehook functions. Besides, the tracehook functions are called with siglock held, so the functions can check the handler if they later need to. Removing the parameter simiplifies changes to sig_ignored() in a follow-on patch. Signed-off-by: Sukadev Bhattiprolu Acked-by: Roland McGrath Signed-off-by: Oleg Nesterov Cc: "Eric W. Biederman" Cc: Daniel Lezcano Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/ptrace.c | 2 +- include/linux/tracehook.h | 13 ++++--------- kernel/signal.c | 6 +++--- 3 files changed, 8 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 19378715f415..b7cc21bc6ae0 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1455,6 +1455,6 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs) * system call instruction. 
*/ if (test_thread_flag(TIF_SINGLESTEP) && - tracehook_consider_fatal_signal(current, SIGTRAP, SIG_DFL)) + tracehook_consider_fatal_signal(current, SIGTRAP)) send_sigtrap(current, regs, 0, TRAP_BRKPT); } diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index 6186a789d6c7..eb4c6545b384 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -388,17 +388,14 @@ static inline void tracehook_signal_handler(int sig, siginfo_t *info, * tracehook_consider_ignored_signal - suppress short-circuit of ignored signal * @task: task receiving the signal * @sig: signal number being sent - * @handler: %SIG_IGN or %SIG_DFL * * Return zero iff tracing doesn't care to examine this ignored signal, * so it can short-circuit normal delivery and never even get queued. - * Either @handler is %SIG_DFL and @sig's default is ignore, or it's %SIG_IGN. * * Called with @task->sighand->siglock held. */ static inline int tracehook_consider_ignored_signal(struct task_struct *task, - int sig, - void __user *handler) + int sig) { return (task_ptrace(task) & PT_PTRACED) != 0; } @@ -407,19 +404,17 @@ static inline int tracehook_consider_ignored_signal(struct task_struct *task, * tracehook_consider_fatal_signal - suppress special handling of fatal signal * @task: task receiving the signal * @sig: signal number being sent - * @handler: %SIG_DFL or %SIG_IGN * * Return nonzero to prevent special handling of this termination signal. - * Normally @handler is %SIG_DFL. It can be %SIG_IGN if @sig is ignored, - * in which case force_sig() is about to reset it to %SIG_DFL. + * Normally handler for signal is %SIG_DFL. It can be %SIG_IGN if @sig is + * ignored, in which case force_sig() is about to reset it to %SIG_DFL. * When this returns zero, this signal might cause a quick termination * that does not give the debugger a chance to intercept the signal. * * Called with or without @task->sighand->siglock held. */ static inline int tracehook_consider_fatal_signal(struct task_struct *task, - int sig, - void __user *handler) + int sig) { return (task_ptrace(task) & PT_PTRACED) != 0; } diff --git a/kernel/signal.c b/kernel/signal.c index 1c8814481a11..92a1ab004498 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -74,7 +74,7 @@ static int sig_ignored(struct task_struct *t, int sig) /* * Tracers may want to know about even ignored signals. */ - return !tracehook_consider_ignored_signal(t, sig, handler); + return !tracehook_consider_ignored_signal(t, sig); } /* @@ -318,7 +318,7 @@ int unhandled_signal(struct task_struct *tsk, int sig) return 1; if (handler != SIG_IGN && handler != SIG_DFL) return 0; - return !tracehook_consider_fatal_signal(tsk, sig, handler); + return !tracehook_consider_fatal_signal(tsk, sig); } @@ -777,7 +777,7 @@ static void complete_signal(int sig, struct task_struct *p, int group) !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) && !sigismember(&t->real_blocked, sig) && (sig == SIGKILL || - !tracehook_consider_fatal_signal(t, sig, SIG_DFL))) { + !tracehook_consider_fatal_signal(t, sig))) { /* * This signal will be fatal to the whole group. */ -- cgit v1.2.3 From 4576145c1ecdaaea9ef8976a48335206aa1ebf91 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 2 Apr 2009 16:58:14 -0700 Subject: ptrace: fix possible zombie leak on PTRACE_DETACH When ptrace_detach() takes tasklist, the tracee can be SIGKILL'ed. 
If it has already passed exit_notify() we can leak a zombie, because a) ptracing disables the auto-reaping logic, and b) ->real_parent was not notified about the child's death. ptrace_detach() should follow the ptrace_exit's logic, change the code accordingly. Signed-off-by: Oleg Nesterov Cc: Jerome Marchand Cc: Roland McGrath Tested-by: Denys Vlasenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ptrace.h | 1 + kernel/ptrace.c | 9 +++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 98b93ca4db06..1a2b0cb55535 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -94,6 +94,7 @@ extern void ptrace_notify(int exit_code); extern void __ptrace_link(struct task_struct *child, struct task_struct *new_parent); extern void __ptrace_unlink(struct task_struct *child); +extern int __ptrace_detach(struct task_struct *tracer, struct task_struct *p); extern void ptrace_fork(struct task_struct *task, unsigned long clone_flags); #define PTRACE_MODE_READ 1 #define PTRACE_MODE_ATTACH 2 diff --git a/kernel/ptrace.c b/kernel/ptrace.c index f62a568e84ec..ee553b6ad125 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -237,6 +237,8 @@ out: int ptrace_detach(struct task_struct *child, unsigned int data) { + int dead = 0; + if (!valid_signal(data)) return -EIO; @@ -244,18 +246,21 @@ int ptrace_detach(struct task_struct *child, unsigned int data) ptrace_disable(child); clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); - /* protect against de_thread()->release_task() */ write_lock_irq(&tasklist_lock); + /* protect against de_thread()->release_task() */ if (child->ptrace) { child->exit_code = data; - __ptrace_unlink(child); + dead = __ptrace_detach(current, child); if (!child->exit_state) wake_up_process(child); } write_unlock_irq(&tasklist_lock); + if (unlikely(dead)) + release_task(child); + return 0; } -- cgit v1.2.3 From 39c626ae47c469abdfd30c6e42eff884931380d6 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 2 Apr 2009 16:58:18 -0700 Subject: forget_original_parent: split out the un-ptrace part By discussion with Roland. - Rename ptrace_exit() to exit_ptrace(), and change it to do all the necessary work with ->ptraced list by its own. - Move this code from exit.c to ptrace.c - Update the comment in ptrace_detach() to explain the rechecking of the child->ptrace. Signed-off-by: Oleg Nesterov Cc: "Eric W. 
Biederman" Cc: "Metzger, Markus T" Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ptrace.h | 2 +- include/linux/sched.h | 5 +++ kernel/exit.c | 95 ++++---------------------------------------------- kernel/ptrace.c | 78 +++++++++++++++++++++++++++++++++++++++-- 4 files changed, 88 insertions(+), 92 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 1a2b0cb55535..67c15653fc23 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -94,7 +94,7 @@ extern void ptrace_notify(int exit_code); extern void __ptrace_link(struct task_struct *child, struct task_struct *new_parent); extern void __ptrace_unlink(struct task_struct *child); -extern int __ptrace_detach(struct task_struct *tracer, struct task_struct *p); +extern void exit_ptrace(struct task_struct *tracer); extern void ptrace_fork(struct task_struct *task, unsigned long clone_flags); #define PTRACE_MODE_READ 1 #define PTRACE_MODE_ATTACH 2 diff --git a/include/linux/sched.h b/include/linux/sched.h index 9186f8c5d5f2..b47c94e7560b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2061,6 +2061,11 @@ static inline int thread_group_empty(struct task_struct *p) #define delay_group_leader(p) \ (thread_group_leader(p) && !thread_group_empty(p)) +static inline int task_detached(struct task_struct *p) +{ + return p->exit_signal == -1; +} + /* * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring * subscriptions and synchronises with wait4(). Also used in procfs. Also diff --git a/kernel/exit.c b/kernel/exit.c index 3e09b7cb3b20..506693dfdd4e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -61,11 +61,6 @@ DEFINE_TRACE(sched_process_wait); static void exit_mm(struct task_struct * tsk); -static inline int task_detached(struct task_struct *p) -{ - return p->exit_signal == -1; -} - static void __unhash_process(struct task_struct *p) { nr_threads--; @@ -731,85 +726,6 @@ static void exit_mm(struct task_struct * tsk) mmput(mm); } -/* - * Called with irqs disabled, returns true if childs should reap themselves. - */ -static int ignoring_children(struct sighand_struct *sigh) -{ - int ret; - spin_lock(&sigh->siglock); - ret = (sigh->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) || - (sigh->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT); - spin_unlock(&sigh->siglock); - return ret; -} - -/* Returns nonzero if the tracee should be released. */ -int __ptrace_detach(struct task_struct *tracer, struct task_struct *p) -{ - __ptrace_unlink(p); - - if (p->exit_state != EXIT_ZOMBIE) - return 0; - /* - * If it's a zombie, our attachedness prevented normal - * parent notification or self-reaping. Do notification - * now if it would have happened earlier. If it should - * reap itself we return true. - * - * If it's our own child, there is no notification to do. - * But if our normal children self-reap, then this child - * was prevented by ptrace and we must reap it now. - */ - if (!task_detached(p) && thread_group_empty(p)) { - if (!same_thread_group(p->real_parent, tracer)) - do_notify_parent(p, p->exit_signal); - else if (ignoring_children(tracer->sighand)) - p->exit_signal = -1; - } - - if (!task_detached(p)) - return 0; - - /* Mark it as in the process of being reaped. */ - p->exit_state = EXIT_DEAD; - return 1; -} - -/* - * Detach all tasks we were using ptrace on. - * Any that need to be release_task'd are put on the @dead list. - * - * Called with write_lock(&tasklist_lock) held. 
- */ -static void ptrace_exit(struct task_struct *parent, struct list_head *dead) -{ - struct task_struct *p, *n; - - list_for_each_entry_safe(p, n, &parent->ptraced, ptrace_entry) { - if (__ptrace_detach(parent, p)) - list_add(&p->ptrace_entry, dead); - } -} - -/* - * Finish up exit-time ptrace cleanup. - * - * Called without locks. - */ -static void ptrace_exit_finish(struct task_struct *parent, - struct list_head *dead) -{ - struct task_struct *p, *n; - - BUG_ON(!list_empty(&parent->ptraced)); - - list_for_each_entry_safe(p, n, dead, ptrace_entry) { - list_del_init(&p->ptrace_entry); - release_task(p); - } -} - /* Returns nonzero if the child should be released. */ static int reparent_thread(struct task_struct *p, struct task_struct *father) { @@ -894,12 +810,10 @@ static void forget_original_parent(struct task_struct *father) struct task_struct *p, *n, *reaper; LIST_HEAD(ptrace_dead); + exit_ptrace(father); + write_lock_irq(&tasklist_lock); reaper = find_new_reaper(father); - /* - * First clean up ptrace if we were using it. - */ - ptrace_exit(father, &ptrace_dead); list_for_each_entry_safe(p, n, &father->children, sibling) { p->real_parent = reaper; @@ -914,7 +828,10 @@ static void forget_original_parent(struct task_struct *father) write_unlock_irq(&tasklist_lock); BUG_ON(!list_empty(&father->children)); - ptrace_exit_finish(father, &ptrace_dead); + list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) { + list_del_init(&p->ptrace_entry); + release_task(p); + } } /* diff --git a/kernel/ptrace.c b/kernel/ptrace.c index ee553b6ad125..f5a9fa5aafa1 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -235,9 +235,57 @@ out: return retval; } +/* + * Called with irqs disabled, returns true if childs should reap themselves. + */ +static int ignoring_children(struct sighand_struct *sigh) +{ + int ret; + spin_lock(&sigh->siglock); + ret = (sigh->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) || + (sigh->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT); + spin_unlock(&sigh->siglock); + return ret; +} + +/* + * Called with tasklist_lock held for writing. + * Unlink a traced task, and clean it up if it was a traced zombie. + * Return true if it needs to be reaped with release_task(). + * (We can't call release_task() here because we already hold tasklist_lock.) + * + * If it's a zombie, our attachedness prevented normal parent notification + * or self-reaping. Do notification now if it would have happened earlier. + * If it should reap itself, return true. + * + * If it's our own child, there is no notification to do. + * But if our normal children self-reap, then this child + * was prevented by ptrace and we must reap it now. + */ +static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) +{ + __ptrace_unlink(p); + + if (p->exit_state == EXIT_ZOMBIE) { + if (!task_detached(p) && thread_group_empty(p)) { + if (!same_thread_group(p->real_parent, tracer)) + do_notify_parent(p, p->exit_signal); + else if (ignoring_children(tracer->sighand)) + p->exit_signal = -1; + } + if (task_detached(p)) { + /* Mark it as in the process of being reaped. 
*/ + p->exit_state = EXIT_DEAD; + return true; + } + } + + return false; +} + int ptrace_detach(struct task_struct *child, unsigned int data) { - int dead = 0; + bool dead = false; if (!valid_signal(data)) return -EIO; @@ -247,7 +295,10 @@ int ptrace_detach(struct task_struct *child, unsigned int data) clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); write_lock_irq(&tasklist_lock); - /* protect against de_thread()->release_task() */ + /* + * This child can be already killed. Make sure de_thread() or + * our sub-thread doing do_wait() didn't do release_task() yet. + */ if (child->ptrace) { child->exit_code = data; @@ -264,6 +315,29 @@ int ptrace_detach(struct task_struct *child, unsigned int data) return 0; } +/* + * Detach all tasks we were using ptrace on. + */ +void exit_ptrace(struct task_struct *tracer) +{ + struct task_struct *p, *n; + LIST_HEAD(ptrace_dead); + + write_lock_irq(&tasklist_lock); + list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { + if (__ptrace_detach(tracer, p)) + list_add(&p->ptrace_entry, &ptrace_dead); + } + write_unlock_irq(&tasklist_lock); + + BUG_ON(!list_empty(&tracer->ptraced)); + + list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) { + list_del_init(&p->ptrace_entry); + release_task(p); + } +} + int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len) { int copied = 0; -- cgit v1.2.3 From bb24c679a51b1a9b726b901330649e3861814ac0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 2 Apr 2009 16:58:20 -0700 Subject: tracehook_notify_death: use task_detached() helper Now that task_detached() is exported, change tracehook_notify_death() to use this helper, nobody else checks ->exit_signal == -1 by hand. Signed-off-by: Oleg Nesterov Cc: "Eric W. Biederman" Cc: "Metzger, Markus T" Acked-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/tracehook.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index eb4c6545b384..c7aa154f4bfc 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -502,7 +502,7 @@ static inline int tracehook_notify_jctl(int notify, int why) static inline int tracehook_notify_death(struct task_struct *task, void **death_cookie, int group_dead) { - if (task->exit_signal == -1) + if (task_detached(task)) return task->ptrace ? SIGCHLD : DEATH_REAP; /* -- cgit v1.2.3 From 40e8a10de2c9f87e892dcd5a6f9d1b208329ffea Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 2 Apr 2009 16:58:25 -0700 Subject: cpu hotplug: remove unused cpuhotplug_mutex_lock() cpuhotplug_mutex_lock() is not used, remove it. Signed-off-by: Lai Jiangshan Cc: Ingo Molnar Cc: Rusty Russell Acked-by: Gautham R Shenoy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpu.h | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index c2747ac2ae43..2643d848df90 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -23,7 +23,6 @@ #include #include #include -#include struct cpu { int node_id; /* The node which contains the CPU */ @@ -103,16 +102,6 @@ extern struct sysdev_class cpu_sysdev_class; #ifdef CONFIG_HOTPLUG_CPU /* Stop CPUs going up and down. 
*/ -static inline void cpuhotplug_mutex_lock(struct mutex *cpu_hp_mutex) -{ - mutex_lock(cpu_hp_mutex); -} - -static inline void cpuhotplug_mutex_unlock(struct mutex *cpu_hp_mutex) -{ - mutex_unlock(cpu_hp_mutex); -} - extern void get_online_cpus(void); extern void put_online_cpus(void); #define hotcpu_notifier(fn, pri) { \ @@ -126,11 +115,6 @@ int cpu_down(unsigned int cpu); #else /* CONFIG_HOTPLUG_CPU */ -static inline void cpuhotplug_mutex_lock(struct mutex *cpu_hp_mutex) -{ } -static inline void cpuhotplug_mutex_unlock(struct mutex *cpu_hp_mutex) -{ } - #define get_online_cpus() do { } while (0) #define put_online_cpus() do { } while (0) #define hotcpu_notifier(fn, pri) do { (void)(fn); } while (0) -- cgit v1.2.3 From a50b0aa4bd9a7d42112442a385f3dc0e775284dd Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 2 Apr 2009 16:58:29 -0700 Subject: struct linux_binprm: drop unused fields Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/binfmts.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 77b4a9e46004..6638b8148de7 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -35,8 +35,7 @@ struct linux_binprm{ #endif struct mm_struct *mm; unsigned long p; /* current top of mem */ - unsigned int sh_bang:1, - misc_bang:1, + unsigned int cred_prepared:1,/* true if creds already prepared (multiple * preps happen for interpreters) */ cap_effective:1;/* true if has elevated effective capabilities, -- cgit v1.2.3 From 1f80769ffd36e74357fe896dc43dddf1af1510f3 Mon Sep 17 00:00:00 2001 From: Paul Fulghum Date: Thu, 2 Apr 2009 16:58:30 -0700 Subject: synclink_gt: add clock options Add support for x8 asynchronous sample rate and ability to specify base clock frequency. 
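A rough sketch of the new sampling decision (illustrative only; the helper name is made up, and the real driver also checks the JCR feature bit before enabling x8 mode): when the configured base clock cannot cleanly provide 16x oversampling for the requested data rate, the driver falls back to 8x sampling, doubling the asynchronous rates reachable from a given base clock.

static unsigned int pick_async_oversampling(unsigned int base_clock,
					    unsigned int data_rate)
{
	if (!data_rate)
		return 16;
	/* x16 needs base_clock to be an exact multiple of data_rate * 16. */
	if (base_clock < data_rate * 16 || base_clock % (data_rate * 16))
		return 8;	/* fall back to x8 sampling */
	return 16;		/* default x16 sampling */
}

As the diff below shows, applications supply the base clock through the existing params ioctls: a mode of MGSL_MODE_BASE_CLOCK stores clock_speed as the device's base clock instead of reprogramming the line parameters.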
Signed-off-by: Paul Fulghum Acked-by: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/synclink_gt.c | 58 ++++++++++++++++++++++++++++++---------------- include/linux/synclink.h | 1 + 2 files changed, 39 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/synclink_gt.c b/drivers/char/synclink_gt.c index 6ec6e13d47d7..5e256494686a 100644 --- a/drivers/char/synclink_gt.c +++ b/drivers/char/synclink_gt.c @@ -298,6 +298,7 @@ struct slgt_info { unsigned int rbuf_fill_level; unsigned int if_mode; + unsigned int base_clock; /* device status */ @@ -1156,22 +1157,26 @@ static long set_params32(struct slgt_info *info, struct MGSL_PARAMS32 __user *ne return -EFAULT; spin_lock(&info->lock); - info->params.mode = tmp_params.mode; - info->params.loopback = tmp_params.loopback; - info->params.flags = tmp_params.flags; - info->params.encoding = tmp_params.encoding; - info->params.clock_speed = tmp_params.clock_speed; - info->params.addr_filter = tmp_params.addr_filter; - info->params.crc_type = tmp_params.crc_type; - info->params.preamble_length = tmp_params.preamble_length; - info->params.preamble = tmp_params.preamble; - info->params.data_rate = tmp_params.data_rate; - info->params.data_bits = tmp_params.data_bits; - info->params.stop_bits = tmp_params.stop_bits; - info->params.parity = tmp_params.parity; + if (tmp_params.mode == MGSL_MODE_BASE_CLOCK) { + info->base_clock = tmp_params.clock_speed; + } else { + info->params.mode = tmp_params.mode; + info->params.loopback = tmp_params.loopback; + info->params.flags = tmp_params.flags; + info->params.encoding = tmp_params.encoding; + info->params.clock_speed = tmp_params.clock_speed; + info->params.addr_filter = tmp_params.addr_filter; + info->params.crc_type = tmp_params.crc_type; + info->params.preamble_length = tmp_params.preamble_length; + info->params.preamble = tmp_params.preamble; + info->params.data_rate = tmp_params.data_rate; + info->params.data_bits = tmp_params.data_bits; + info->params.stop_bits = tmp_params.stop_bits; + info->params.parity = tmp_params.parity; + } spin_unlock(&info->lock); - change_params(info); + program_hw(info); return 0; } @@ -2559,10 +2564,13 @@ static int set_params(struct slgt_info *info, MGSL_PARAMS __user *new_params) return -EFAULT; spin_lock_irqsave(&info->lock, flags); - memcpy(&info->params, &tmp_params, sizeof(MGSL_PARAMS)); + if (tmp_params.mode == MGSL_MODE_BASE_CLOCK) + info->base_clock = tmp_params.clock_speed; + else + memcpy(&info->params, &tmp_params, sizeof(MGSL_PARAMS)); spin_unlock_irqrestore(&info->lock, flags); - change_params(info); + program_hw(info); return 0; } @@ -3432,6 +3440,7 @@ static struct slgt_info *alloc_dev(int adapter_num, int port_num, struct pci_dev info->magic = MGSL_MAGIC; INIT_WORK(&info->task, bh_handler); info->max_frame_size = 4096; + info->base_clock = 14745600; info->rbuf_fill_level = DMABUFSIZE; info->port.close_delay = 5*HZ/10; info->port.closing_wait = 30*HZ; @@ -3779,7 +3788,7 @@ static void enable_loopback(struct slgt_info *info) static void set_rate(struct slgt_info *info, u32 rate) { unsigned int div; - static unsigned int osc = 14745600; + unsigned int osc = info->base_clock; /* div = osc/rate - 1 * @@ -4083,18 +4092,27 @@ static void async_mode(struct slgt_info *info) * 06 CTS IRQ enable * 05 DCD IRQ enable * 04 RI IRQ enable - * 03 reserved, must be zero + * 03 0=16x sampling, 1=8x sampling * 02 1=txd->rxd internal loopback enable * 01 reserved, must be zero * 00 1=master IRQ enable */ val = BIT15 
+ BIT14 + BIT0; + /* JCR[8] : 1 = x8 async mode feature available */ + if ((rd_reg32(info, JCR) & BIT8) && info->params.data_rate && + ((info->base_clock < (info->params.data_rate * 16)) || + (info->base_clock % (info->params.data_rate * 16)))) { + /* use 8x sampling */ + val |= BIT3; + set_rate(info, info->params.data_rate * 8); + } else { + /* use 16x sampling */ + set_rate(info, info->params.data_rate * 16); + } wr_reg16(info, SCR, val); slgt_irq_on(info, IRQ_RXBREAK | IRQ_RXOVER); - set_rate(info, info->params.data_rate * 16); - if (info->params.loopback) enable_loopback(info); } diff --git a/include/linux/synclink.h b/include/linux/synclink.h index 99b8bdb17b2b..0ff2779c44d0 100644 --- a/include/linux/synclink.h +++ b/include/linux/synclink.h @@ -125,6 +125,7 @@ #define MGSL_MODE_MONOSYNC 3 #define MGSL_MODE_BISYNC 4 #define MGSL_MODE_RAW 6 +#define MGSL_MODE_BASE_CLOCK 7 #define MGSL_BUS_TYPE_ISA 1 #define MGSL_BUS_TYPE_EISA 2 -- cgit v1.2.3 From 6dda81f4384b94930826eded254d8c16f89a9248 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 2 Apr 2009 16:58:35 -0700 Subject: pids: document task_pgrp/task_session is not safe without tasklist/rcu Even if task == current, it is not safe to dereference the result of task_pgrp/task_session. We can race with another thread which changes the special pid via setpgid/setsid. Document this. The next 2 patches give an example of the unsafe usage, we have more bad users. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Oleg Nesterov Cc: Louis Rilling Cc: "Eric W. Biederman" Cc: Pavel Emelyanov Cc: Sukadev Bhattiprolu Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index b47c94e7560b..722dd313bf8a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1489,6 +1489,11 @@ static inline struct pid *task_tgid(struct task_struct *task) return task->group_leader->pids[PIDTYPE_PID].pid; } +/* + * Without tasklist or rcu lock it is not safe to dereference + * the result of task_pgrp/task_session even if task == current, + * we can race with another thread doing sys_setsid/sys_setpgid. + */ static inline struct pid *task_pgrp(struct task_struct *task) { return task->group_leader->pids[PIDTYPE_PGID].pid; -- cgit v1.2.3 From 52ee2dfdd4f51cf422ea6a96a0846dc94244aa37 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 2 Apr 2009 16:58:38 -0700 Subject: pids: refactor vnr/nr_ns helpers to make them safe Inho, the safety rules for vnr/nr_ns helpers are horrible and buggy. task_pid_nr_ns(task) needs rcu/tasklist depending on task == current. As for "special" pids, vnr/nr_ns helpers always need rcu. However, if task != current, they are unsafe even under rcu lock, we can't trust task->group_leader without the special checks. And almost every helper has a callsite which needs a fix. Also, it is a bit annoying that the implementations of, say, task_pgrp_vnr() and task_pgrp_nr_ns() are not "symmetrical". This patch introduces the new helper, __task_pid_nr_ns(), which is always safe to use, and turns all other helpers into the trivial wrappers. After this I'll send another patch which converts task_tgid_xxx() as well, they're are a bit special. Signed-off-by: Oleg Nesterov Cc: Louis Rilling Cc: "Eric W. 
Biederman" Cc: Pavel Emelyanov Cc: Sukadev Bhattiprolu Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 27 ++++++++++++++++++++------- kernel/pid.c | 31 ++++++++++++++++--------------- 2 files changed, 36 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 722dd313bf8a..49df878a0cad 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1519,17 +1519,23 @@ struct pid_namespace; * * see also pid_nr() etc in include/linux/pid.h */ +pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, + struct pid_namespace *ns); static inline pid_t task_pid_nr(struct task_struct *tsk) { return tsk->pid; } -pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns); +static inline pid_t task_pid_nr_ns(struct task_struct *tsk, + struct pid_namespace *ns) +{ + return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns); +} static inline pid_t task_pid_vnr(struct task_struct *tsk) { - return pid_vnr(task_pid(tsk)); + return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL); } @@ -1551,11 +1557,15 @@ static inline pid_t task_pgrp_nr(struct task_struct *tsk) return tsk->signal->__pgrp; } -pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns); +static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, + struct pid_namespace *ns) +{ + return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns); +} static inline pid_t task_pgrp_vnr(struct task_struct *tsk) { - return pid_vnr(task_pgrp(tsk)); + return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL); } @@ -1564,14 +1574,17 @@ static inline pid_t task_session_nr(struct task_struct *tsk) return tsk->signal->__session; } -pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns); +static inline pid_t task_session_nr_ns(struct task_struct *tsk, + struct pid_namespace *ns) +{ + return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns); +} static inline pid_t task_session_vnr(struct task_struct *tsk) { - return pid_vnr(task_session(tsk)); + return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL); } - /** * pid_alive - check that a task structure is not stale * @p: Task structure to be checked. 
diff --git a/kernel/pid.c b/kernel/pid.c index 6628abcc520e..b2e5f78fd281 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -452,11 +452,24 @@ pid_t pid_vnr(struct pid *pid) } EXPORT_SYMBOL_GPL(pid_vnr); -pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) +pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, + struct pid_namespace *ns) { - return pid_nr_ns(task_pid(tsk), ns); + pid_t nr = 0; + + rcu_read_lock(); + if (!ns) + ns = current->nsproxy->pid_ns; + if (likely(pid_alive(task))) { + if (type != PIDTYPE_PID) + task = task->group_leader; + nr = pid_nr_ns(task->pids[type].pid, ns); + } + rcu_read_unlock(); + + return nr; } -EXPORT_SYMBOL(task_pid_nr_ns); +EXPORT_SYMBOL(__task_pid_nr_ns); pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) { @@ -464,18 +477,6 @@ pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) } EXPORT_SYMBOL(task_tgid_nr_ns); -pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) -{ - return pid_nr_ns(task_pgrp(tsk), ns); -} -EXPORT_SYMBOL(task_pgrp_nr_ns); - -pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) -{ - return pid_nr_ns(task_session(tsk), ns); -} -EXPORT_SYMBOL(task_session_nr_ns); - struct pid_namespace *task_active_pid_ns(struct task_struct *tsk) { return ns_of_pid(task_pid(tsk)); -- cgit v1.2.3 From 1b0f7ffd0ea27cd3a0b9ca04e3df9522048c32a3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 2 Apr 2009 16:58:39 -0700 Subject: pids: kill signal_struct-> __pgrp/__session and friends We are wasting 2 words in signal_struct without any reason to implement task_pgrp_nr() and task_session_nr(). task_session_nr() has no callers since 2e2ba22ea4fd4bb85f0fa37c521066db6775cbef, we can remove it. task_pgrp_nr() is still (I believe wrongly) used in fs/autofsX and fs/coda. This patch reimplements task_pgrp_nr() via task_pgrp_nr_ns(), and kills __pgrp/__session and the related helpers. The change in drivers/char/tty_io.c is cosmetic, but hopefully makes sense anyway. 
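A minimal sketch of what the surviving task_pgrp_nr() callers now get (not taken from the patch): with the cached __pgrp/__session fields removed, the numeric ids are derived from the struct pid links on demand, and the obsolete helper simply resolves the pgrp in the init pid namespace.

/* Equivalent expansion of the new task_pgrp_nr(), for illustration only. */
static inline pid_t example_task_pgrp_nr(struct task_struct *tsk)
{
	return __task_pid_nr_ns(tsk, PIDTYPE_PGID, &init_pid_ns);
}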
Signed-off-by: Oleg Nesterov Acked-by: Alan Cox [tty parts] Cc: Cedric Le Goater Cc: Dave Hansen Cc: Eric Biederman Cc: Pavel Emelyanov Cc: Serge Hallyn Cc: Sukadev Bhattiprolu Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/tty_io.c | 4 ++-- include/linux/sched.h | 43 ++++++------------------------------------- kernel/exit.c | 10 +++------- kernel/fork.c | 2 -- kernel/sys.c | 4 +--- 5 files changed, 12 insertions(+), 51 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c index a44b701c5bba..66b99a2049e3 100644 --- a/drivers/char/tty_io.c +++ b/drivers/char/tty_io.c @@ -2681,7 +2681,7 @@ void __do_SAK(struct tty_struct *tty) /* Kill the entire session */ do_each_pid_task(session, PIDTYPE_SID, p) { printk(KERN_NOTICE "SAK: killed process %d" - " (%s): task_session_nr(p)==tty->session\n", + " (%s): task_session(p)==tty->session\n", task_pid_nr(p), p->comm); send_sig(SIGKILL, p, 1); } while_each_pid_task(session, PIDTYPE_SID, p); @@ -2691,7 +2691,7 @@ void __do_SAK(struct tty_struct *tty) do_each_thread(g, p) { if (p->signal->tty == tty) { printk(KERN_NOTICE "SAK: killed process %d" - " (%s): task_session_nr(p)==tty->session\n", + " (%s): task_session(p)==tty->session\n", task_pid_nr(p), p->comm); send_sig(SIGKILL, p, 1); continue; diff --git a/include/linux/sched.h b/include/linux/sched.h index 49df878a0cad..206ac003e8c0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -547,25 +547,8 @@ struct signal_struct { struct list_head cpu_timers[3]; - /* job control IDs */ - - /* - * pgrp and session fields are deprecated. - * use the task_session_Xnr and task_pgrp_Xnr routines below - */ - - union { - pid_t pgrp __deprecated; - pid_t __pgrp; - }; - struct pid *tty_old_pgrp; - union { - pid_t session __deprecated; - pid_t __session; - }; - /* boolean value for session group leader */ int leader; @@ -1469,16 +1452,6 @@ static inline int rt_task(struct task_struct *p) return rt_prio(p->prio); } -static inline void set_task_session(struct task_struct *tsk, pid_t session) -{ - tsk->signal->__session = session; -} - -static inline void set_task_pgrp(struct task_struct *tsk, pid_t pgrp) -{ - tsk->signal->__pgrp = pgrp; -} - static inline struct pid *task_pid(struct task_struct *task) { return task->pids[PIDTYPE_PID].pid; @@ -1552,11 +1525,6 @@ static inline pid_t task_tgid_vnr(struct task_struct *tsk) } -static inline pid_t task_pgrp_nr(struct task_struct *tsk) -{ - return tsk->signal->__pgrp; -} - static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) { @@ -1569,11 +1537,6 @@ static inline pid_t task_pgrp_vnr(struct task_struct *tsk) } -static inline pid_t task_session_nr(struct task_struct *tsk) -{ - return tsk->signal->__session; -} - static inline pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) { @@ -1585,6 +1548,12 @@ static inline pid_t task_session_vnr(struct task_struct *tsk) return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL); } +/* obsolete, do not use */ +static inline pid_t task_pgrp_nr(struct task_struct *tsk) +{ + return task_pgrp_nr_ns(tsk, &init_pid_ns); +} + /** * pid_alive - check that a task structure is not stale * @p: Task structure to be checked. 
diff --git a/kernel/exit.c b/kernel/exit.c index 384f09caf2ef..3bec141c82f6 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -357,16 +357,12 @@ static void reparent_to_kthreadd(void) void __set_special_pids(struct pid *pid) { struct task_struct *curr = current->group_leader; - pid_t nr = pid_nr(pid); - if (task_session(curr) != pid) { + if (task_session(curr) != pid) change_pid(curr, PIDTYPE_SID, pid); - set_task_session(curr, nr); - } - if (task_pgrp(curr) != pid) { + + if (task_pgrp(curr) != pid) change_pid(curr, PIDTYPE_PGID, pid); - set_task_pgrp(curr, nr); - } } static void set_special_pids(struct pid *pid) diff --git a/kernel/fork.c b/kernel/fork.c index adbea16ec649..f74458231449 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1265,8 +1265,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->signal->leader_pid = pid; tty_kref_put(p->signal->tty); p->signal->tty = tty_kref_get(current->signal->tty); - set_task_pgrp(p, task_pgrp_nr(current)); - set_task_session(p, task_session_nr(current)); attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); attach_pid(p, PIDTYPE_SID, task_session(current)); list_add_tail_rcu(&p->tasks, &init_task.tasks); diff --git a/kernel/sys.c b/kernel/sys.c index 37f458e6882a..742cefa527e6 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1013,10 +1013,8 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) if (err) goto out; - if (task_pgrp(p) != pgrp) { + if (task_pgrp(p) != pgrp) change_pid(p, PIDTYPE_PGID, pgrp); - set_task_pgrp(p, pid_nr(pgrp)); - } err = 0; out: -- cgit v1.2.3 From 7c5ff4f92e2b47c56d777a5adbadd9a52841b635 Mon Sep 17 00:00:00 2001 From: Harry Ciao Date: Thu, 2 Apr 2009 16:58:48 -0700 Subject: pci: Add AMD8111 PCI Bridge PCI Device ID Add the PCI Device ID of the PCI Bridge Controller on AMD8111 chip. Signed-off-by: Harry Ciao Cc: Doug Thompson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pci_ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index cb14fd260837..170f8b1f22db 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -526,6 +526,7 @@ #define PCI_DEVICE_ID_AMD_OPUS_7443 0x7443 #define PCI_DEVICE_ID_AMD_VIPER_7443 0x7443 #define PCI_DEVICE_ID_AMD_OPUS_7445 0x7445 +#define PCI_DEVICE_ID_AMD_8111_PCI 0x7460 #define PCI_DEVICE_ID_AMD_8111_LPC 0x7468 #define PCI_DEVICE_ID_AMD_8111_IDE 0x7469 #define PCI_DEVICE_ID_AMD_8111_SMBUS2 0x746a -- cgit v1.2.3 From 04d491ab2a53008a1aa98ac09561768c7f3adda3 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Thu, 2 Apr 2009 16:58:57 -0700 Subject: kexec: add dmesg log symbols to /proc/vmcoreinfo lists It would be nice to be able to extract the dmesg log from a vmcore file without needing to keep the debug symbols for the running kernel handy all the time. We have a facility to do this in /proc/vmcore. This patch adds the log_buf and log_end symbols to the vmcoreinfo area so that tools (like makedumpfile) can easily extract the dmesg logs from a vmcore image. 
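For illustration (not part of the patch): the vmcoreinfo data is a flat text blob of "KEY=value" lines, so once a tool such as makedumpfile has pulled that note out of the dump it can locate the newly exported symbols with a scan like the hypothetical userspace helper below (the "SYMBOL(name)=<hex address>" line format follows from VMCOREINFO_SYMBOL()).

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* @info: NUL-terminated vmcoreinfo text extracted from the dump's ELF note. */
unsigned long long vmcoreinfo_symbol(const char *info, const char *name)
{
	char key[64];
	const char *p = info;

	snprintf(key, sizeof(key), "SYMBOL(%s)=", name);
	while (p) {
		if (!strncmp(p, key, strlen(key)))
			return strtoull(p + strlen(key), NULL, 16);
		p = strchr(p, '\n');
		if (p)
			p++;
	}
	return 0;
}

With the addresses of log_buf, log_end, log_buf_len and logged_chars in hand, the dmesg ring buffer can be carved out of the memory image without the running kernel's debug symbols.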
[akpm@linux-foundation.org: several fixes and cleanups] [akpm@linux-foundation.org: fix unused log_buf_kexec_setup()] [akpm@linux-foundation.org: build fix] Signed-off-by: Neil Horman Cc: Simon Horman Acked-by: Vivek Goyal Cc: Neil Horman Cc: Simon Horman Cc: Vivek Goyal Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 4 ++++ kernel/kexec.c | 1 + kernel/printk.c | 19 +++++++++++++++++++ 3 files changed, 24 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index e720b0da7751..556d781e69fe 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -242,6 +242,7 @@ extern struct ratelimit_state printk_ratelimit_state; extern int printk_ratelimit(void); extern bool printk_timed_ratelimit(unsigned long *caller_jiffies, unsigned int interval_msec); +void log_buf_kexec_setup(void); #else static inline int vprintk(const char *s, va_list args) __attribute__ ((format (printf, 1, 0))); @@ -253,6 +254,9 @@ static inline int printk_ratelimit(void) { return 0; } static inline bool printk_timed_ratelimit(unsigned long *caller_jiffies, \ unsigned int interval_msec) \ { return false; } +static inline void log_buf_kexec_setup(void) +{ +} #endif extern int printk_needs_cpu(int cpu); diff --git a/kernel/kexec.c b/kernel/kexec.c index 93eed85fe017..589832aac41f 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1409,6 +1409,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_OFFSET(list_head, prev); VMCOREINFO_OFFSET(vm_struct, addr); VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); + log_buf_kexec_setup(); VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); VMCOREINFO_NUMBER(NR_FREE_PAGES); VMCOREINFO_NUMBER(PG_lru); diff --git a/kernel/printk.c b/kernel/printk.c index e3602d0755b0..a5f61a9acedb 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -32,6 +32,7 @@ #include #include #include +#include #include @@ -135,6 +136,24 @@ static char *log_buf = __log_buf; static int log_buf_len = __LOG_BUF_LEN; static unsigned logged_chars; /* Number of chars produced since last read+clear operation */ +#ifdef CONFIG_KEXEC +/* + * This appends the listed symbols to /proc/vmcoreinfo + * + * /proc/vmcoreinfo is used by various utiilties, like crash and makedumpfile to + * obtain access to symbols that are otherwise very difficult to locate. These + * symbols are specifically used so that utilities can access and extract the + * dmesg log from a vmcore file after a crash. + */ +void log_buf_kexec_setup(void) +{ + VMCOREINFO_SYMBOL(log_buf); + VMCOREINFO_SYMBOL(log_end); + VMCOREINFO_SYMBOL(log_buf_len); + VMCOREINFO_SYMBOL(logged_chars); +} +#endif + static int __init log_buf_len_setup(char *str) { unsigned size = memparse(str, &str); -- cgit v1.2.3 From f3554f4bc69803ac2baaf7cf2aa4339e1f4b693e Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Thu, 2 Apr 2009 16:59:23 -0700 Subject: preadv/pwritev: Add preadv and pwritev system calls. This patch adds preadv and pwritev system calls. These syscalls are a pretty straightforward combination of pread and readv (same for write). They are quite useful for doing vectored I/O in threaded applications. Using lseek+readv instead opens race windows you'll have to plug with locking. 
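To make that race concrete (a userspace sketch, not part of the patch): with a file descriptor shared between threads, the file position can move between the lseek() and the readv(); preadv() takes the offset as an explicit argument and leaves the file position untouched, so no locking is needed. The prototypes follow the *BSD-compatible interface quoted below.

#define _GNU_SOURCE
#include <sys/uio.h>
#include <unistd.h>

/* Racy: another thread may reposition fd between the two calls. */
ssize_t racy_read_at(int fd, const struct iovec *iov, int cnt, off_t off)
{
	if (lseek(fd, off, SEEK_SET) == (off_t)-1)
		return -1;
	return readv(fd, iov, cnt);
}

/* Atomic with respect to the file position: one syscall, explicit offset. */
ssize_t vectored_read_at(int fd, const struct iovec *iov, int cnt, off_t off)
{
	return preadv(fd, iov, cnt, off);
}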
Other systems have such system calls too, for example NetBSD, check here: http://www.daemon-systems.org/man/preadv.2.html The application-visible interface provided by glibc should look like this to be compatible to the existing implementations in the *BSD family: ssize_t preadv(int d, const struct iovec *iov, int iovcnt, off_t offset); ssize_t pwritev(int d, const struct iovec *iov, int iovcnt, off_t offset); This prototype has one problem though: On 32bit archs is the (64bit) offset argument unaligned, which the syscall ABI of several archs doesn't allow to do. At least s390 needs a wrapper in glibc to handle this. As we'll need a wrappers in glibc anyway I've decided to push problem to glibc entriely and use a syscall prototype which works without arch-specific wrappers inside the kernel: The offset argument is explicitly splitted into two 32bit values. The patch sports the actual system call implementation and the windup in the x86 system call tables. Other archs follow as separate patches. Signed-off-by: Gerd Hoffmann Cc: Arnd Bergmann Cc: Al Viro Cc: Cc: Cc: Ralf Baechle Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/ia32/ia32entry.S | 2 ++ arch/x86/include/asm/unistd_32.h | 2 ++ arch/x86/include/asm/unistd_64.h | 4 +++ arch/x86/kernel/syscall_table_32.S | 2 ++ fs/compat.c | 36 +++++++++++++++++++++++++++ fs/read_write.c | 50 ++++++++++++++++++++++++++++++++++++++ include/linux/compat.h | 6 +++++ include/linux/syscalls.h | 4 +++ 8 files changed, 106 insertions(+) (limited to 'include/linux') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index db0c803170ab..a505202086e8 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -828,4 +828,6 @@ ia32_sys_call_table: .quad sys_dup3 /* 330 */ .quad sys_pipe2 .quad sys_inotify_init1 + .quad compat_sys_preadv + .quad compat_sys_pwritev ia32_syscall_end: diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index f2bba78430a4..6e72d74cf8dc 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -338,6 +338,8 @@ #define __NR_dup3 330 #define __NR_pipe2 331 #define __NR_inotify_init1 332 +#define __NR_preadv 333 +#define __NR_pwritev 334 #ifdef __KERNEL__ diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index d2e415e6666f..f81829462325 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -653,6 +653,10 @@ __SYSCALL(__NR_dup3, sys_dup3) __SYSCALL(__NR_pipe2, sys_pipe2) #define __NR_inotify_init1 294 __SYSCALL(__NR_inotify_init1, sys_inotify_init1) +#define __NR_preadv 295 +__SYSCALL(__NR_preadv, sys_preadv) +#define __NR_pwritev 296 +__SYSCALL(__NR_pwritev, sys_pwritev) #ifndef __NO_STUBS diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index 3bdb64829b82..ff5c8736b491 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -332,3 +332,5 @@ ENTRY(sys_call_table) .long sys_dup3 /* 330 */ .long sys_pipe2 .long sys_inotify_init1 + .long sys_preadv + .long sys_pwritev diff --git a/fs/compat.c b/fs/compat.c index e04b4660db84..7c1615183d1e 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1232,6 +1232,24 @@ compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, return ret; } +asmlinkage ssize_t +compat_sys_preadv(unsigned long fd, const struct compat_iovec __user *vec, + unsigned long vlen, u32 pos_high, u32 pos_low) +{ + 
loff_t pos = ((loff_t)pos_high << 32) | pos_low; + struct file *file; + ssize_t ret; + + if (pos < 0) + return -EINVAL; + file = fget(fd); + if (!file) + return -EBADF; + ret = compat_readv(file, vec, vlen, &pos); + fput(file); + return ret; +} + static size_t compat_writev(struct file *file, const struct compat_iovec __user *vec, unsigned long vlen, loff_t *pos) @@ -1269,6 +1287,24 @@ compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, return ret; } +asmlinkage ssize_t +compat_sys_pwritev(unsigned long fd, const struct compat_iovec __user *vec, + unsigned long vlen, u32 pos_high, u32 pos_low) +{ + loff_t pos = ((loff_t)pos_high << 32) | pos_low; + struct file *file; + ssize_t ret; + + if (pos < 0) + return -EINVAL; + file = fget(fd); + if (!file) + return -EBADF; + ret = compat_writev(file, vec, vlen, &pos); + fput(file); + return ret; +} + asmlinkage long compat_sys_vmsplice(int fd, const struct compat_iovec __user *iov32, unsigned int nr_segs, unsigned int flags) diff --git a/fs/read_write.c b/fs/read_write.c index 400fe81c973e..6d5d8ff238aa 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -731,6 +731,56 @@ SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec, return ret; } +SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec, + unsigned long, vlen, u32, pos_high, u32, pos_low) +{ + loff_t pos = ((loff_t)pos_high << 32) | pos_low; + struct file *file; + ssize_t ret = -EBADF; + int fput_needed; + + if (pos < 0) + return -EINVAL; + + file = fget_light(fd, &fput_needed); + if (file) { + ret = -ESPIPE; + if (file->f_mode & FMODE_PREAD) + ret = vfs_readv(file, vec, vlen, &pos); + fput_light(file, fput_needed); + } + + if (ret > 0) + add_rchar(current, ret); + inc_syscr(current); + return ret; +} + +SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec, + unsigned long, vlen, u32, pos_high, u32, pos_low) +{ + loff_t pos = ((loff_t)pos_high << 32) | pos_low; + struct file *file; + ssize_t ret = -EBADF; + int fput_needed; + + if (pos < 0) + return -EINVAL; + + file = fget_light(fd, &fput_needed); + if (file) { + ret = -ESPIPE; + if (file->f_mode & FMODE_PWRITE) + ret = vfs_writev(file, vec, vlen, &pos); + fput_light(file, fput_needed); + } + + if (ret > 0) + add_wchar(current, ret); + inc_syscw(current); + return ret; +} + static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count, loff_t max) { diff --git a/include/linux/compat.h b/include/linux/compat.h index b880864672de..9723edd6455c 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -191,6 +191,12 @@ asmlinkage ssize_t compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, unsigned long vlen); asmlinkage ssize_t compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, unsigned long vlen); +asmlinkage ssize_t compat_sys_preadv(unsigned long fd, + const struct compat_iovec __user *vec, + unsigned long vlen, u32 pos_high, u32 pos_low); +asmlinkage ssize_t compat_sys_pwritev(unsigned long fd, + const struct compat_iovec __user *vec, + unsigned long vlen, u32 pos_high, u32 pos_low); int compat_do_execve(char * filename, compat_uptr_t __user *argv, compat_uptr_t __user *envp, struct pt_regs * regs); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index f9f900cfd066..b299a82a05e7 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -461,6 +461,10 @@ asmlinkage long sys_pread64(unsigned int fd, char __user *buf, size_t count, loff_t pos); 
asmlinkage long sys_pwrite64(unsigned int fd, const char __user *buf, size_t count, loff_t pos); +asmlinkage long sys_preadv(unsigned long fd, const struct iovec __user *vec, + unsigned long vlen, u32 pos_high, u32 pos_low); +asmlinkage long sys_pwritev(unsigned long fd, const struct iovec __user *vec, + unsigned long vlen, u32 pos_high, u32 pos_low); asmlinkage long sys_getcwd(char __user *buf, unsigned long size); asmlinkage long sys_mkdir(const char __user *pathname, int mode); asmlinkage long sys_chdir(const char __user *filename); -- cgit v1.2.3 From e8c158bb313c1df421eab7dc4299cd39cbbf5895 Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Thu, 2 Apr 2009 16:59:45 -0700 Subject: Factor out #ifdefs from kernel/spinlock.c to LOCK_CONTENDED_FLAGS SGI has observed that on large systems, interrupts are not serviced for a long period of time when waiting for a rwlock. The following patch series re-enables irqs while waiting for the lock, resembling the code which is already there for spinlocks. I only made the ia64 version, because the patch adds some overhead to the fast path. I assume there is currently no demand to have this for other architectures, because the systems are not so large. Of course, the possibility to implement raw_{read|write}_lock_flags for any architecture is still there. This patch: The new macro LOCK_CONTENDED_FLAGS expands to the correct implementation depending on the config options, so that IRQ's are re-enabled when possible, but they remain disabled if CONFIG_LOCKDEP is set. Signed-off-by: Petr Tesarik Signed-off-by: Robin Holt Cc: Cc: Ingo Molnar Cc: Peter Zijlstra Cc: "Luck, Tony" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/lockdep.h | 17 +++++++++++++++++ kernel/spinlock.c | 12 ++---------- 2 files changed, 19 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 5a58ea3e91e9..da5a5a1f4cd2 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -364,6 +364,23 @@ do { \ #endif /* CONFIG_LOCK_STAT */ +#ifdef CONFIG_LOCKDEP + +/* + * On lockdep we dont want the hand-coded irq-enable of + * _raw_*_lock_flags() code, because lockdep assumes + * that interrupts are not re-enabled during lock-acquire: + */ +#define LOCK_CONTENDED_FLAGS(_lock, try, lock, lockfl, flags) \ + LOCK_CONTENDED((_lock), (try), (lock)) + +#else /* CONFIG_LOCKDEP */ + +#define LOCK_CONTENDED_FLAGS(_lock, try, lock, lockfl, flags) \ + lockfl((_lock), (flags)) + +#endif /* CONFIG_LOCKDEP */ + #ifdef CONFIG_GENERIC_HARDIRQS extern void early_init_irq_lock_class(void); #else diff --git a/kernel/spinlock.c b/kernel/spinlock.c index 29ab20749dd3..7283c6dc2d59 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -299,16 +299,8 @@ unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclas local_irq_save(flags); preempt_disable(); spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); - /* - * On lockdep we dont want the hand-coded irq-enable of - * _raw_spin_lock_flags() code, because lockdep assumes - * that interrupts are not re-enabled during lock-acquire: - */ -#ifdef CONFIG_LOCKDEP - LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); -#else - _raw_spin_lock_flags(lock, &flags); -#endif + LOCK_CONTENDED_FLAGS(lock, _raw_spin_trylock, _raw_spin_lock, + _raw_spin_lock_flags, &flags); return flags; } EXPORT_SYMBOL(_spin_lock_irqsave_nested); -- cgit v1.2.3 From f5f7eac41db827a47b2163330eecd7bb55ae9f12 Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Thu, 
2 Apr 2009 16:59:46 -0700 Subject: Allow rwlocks to re-enable interrupts Pass the original flags to rwlock arch-code, so that it can re-enable interrupts if implemented for that architecture. Initially, make __raw_read_lock_flags and __raw_write_lock_flags stubs which just do the same thing as non-flags variants. Signed-off-by: Petr Tesarik Signed-off-by: Robin Holt Acked-by: Peter Zijlstra Cc: Acked-by: Ingo Molnar Cc: "Luck, Tony" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/spinlock.h | 3 +++ arch/arm/include/asm/spinlock.h | 3 +++ arch/cris/include/arch-v32/arch/spinlock.h | 2 ++ arch/ia64/include/asm/spinlock.h | 3 +++ arch/mips/include/asm/spinlock.h | 2 ++ arch/parisc/include/asm/spinlock.h | 3 +++ arch/powerpc/include/asm/spinlock.h | 3 +++ arch/s390/include/asm/spinlock.h | 3 +++ arch/sh/include/asm/spinlock.h | 3 +++ arch/sparc/include/asm/spinlock_32.h | 2 ++ arch/sparc/include/asm/spinlock_64.h | 2 ++ arch/x86/include/asm/spinlock.h | 3 +++ include/asm-m32r/spinlock.h | 3 +++ include/linux/spinlock.h | 6 ++++++ kernel/spinlock.c | 6 ++++-- 15 files changed, 45 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/arch/alpha/include/asm/spinlock.h b/arch/alpha/include/asm/spinlock.h index aeeb125f6851..e38fb95cb335 100644 --- a/arch/alpha/include/asm/spinlock.h +++ b/arch/alpha/include/asm/spinlock.h @@ -166,6 +166,9 @@ static inline void __raw_write_unlock(raw_rwlock_t * lock) lock->lock = 0; } +#define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock) +#define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock) + #define _raw_spin_relax(lock) cpu_relax() #define _raw_read_relax(lock) cpu_relax() #define _raw_write_relax(lock) cpu_relax() diff --git a/arch/arm/include/asm/spinlock.h b/arch/arm/include/asm/spinlock.h index 2b41ebbfa7ff..c13681ac1ede 100644 --- a/arch/arm/include/asm/spinlock.h +++ b/arch/arm/include/asm/spinlock.h @@ -217,6 +217,9 @@ static inline int __raw_read_trylock(raw_rwlock_t *rw) /* read_can_lock - would read_trylock() succeed? 
*/ #define __raw_read_can_lock(x) ((x)->lock < 0x80000000) +#define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock) +#define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock) + #define _raw_spin_relax(lock) cpu_relax() #define _raw_read_relax(lock) cpu_relax() #define _raw_write_relax(lock) cpu_relax() diff --git a/arch/cris/include/arch-v32/arch/spinlock.h b/arch/cris/include/arch-v32/arch/spinlock.h index 0d5709b983a1..129756b96661 100644 --- a/arch/cris/include/arch-v32/arch/spinlock.h +++ b/arch/cris/include/arch-v32/arch/spinlock.h @@ -121,6 +121,8 @@ static inline int __raw_write_trylock(raw_rwlock_t *rw) return 1; } +#define _raw_read_lock_flags(lock, flags) _raw_read_lock(lock) +#define _raw_write_lock_flags(lock, flags) _raw_write_lock(lock) #define _raw_spin_relax(lock) cpu_relax() #define _raw_read_relax(lock) cpu_relax() diff --git a/arch/ia64/include/asm/spinlock.h b/arch/ia64/include/asm/spinlock.h index 0229fb95fb38..0a619618b2fa 100644 --- a/arch/ia64/include/asm/spinlock.h +++ b/arch/ia64/include/asm/spinlock.h @@ -213,6 +213,9 @@ static inline int __raw_read_trylock(raw_rwlock_t *x) return (u32)ia64_cmpxchg4_acq((__u32 *)(x), new.word, old.word) == old.word; } +#define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock) +#define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock) + #define _raw_spin_relax(lock) cpu_relax() #define _raw_read_relax(lock) cpu_relax() #define _raw_write_relax(lock) cpu_relax() diff --git a/arch/mips/include/asm/spinlock.h b/arch/mips/include/asm/spinlock.h index 10e82441b496..5b60a09a0f08 100644 --- a/arch/mips/include/asm/spinlock.h +++ b/arch/mips/include/asm/spinlock.h @@ -480,6 +480,8 @@ static inline int __raw_write_trylock(raw_rwlock_t *rw) return ret; } +#define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock) +#define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock) #define _raw_spin_relax(lock) cpu_relax() #define _raw_read_relax(lock) cpu_relax() diff --git a/arch/parisc/include/asm/spinlock.h b/arch/parisc/include/asm/spinlock.h index f3d2090a18dc..fae03e136fa8 100644 --- a/arch/parisc/include/asm/spinlock.h +++ b/arch/parisc/include/asm/spinlock.h @@ -187,6 +187,9 @@ static __inline__ int __raw_write_can_lock(raw_rwlock_t *rw) return !rw->counter; } +#define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock) +#define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock) + #define _raw_spin_relax(lock) cpu_relax() #define _raw_read_relax(lock) cpu_relax() #define _raw_write_relax(lock) cpu_relax() diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h index 36864364e601..c3b193121f81 100644 --- a/arch/powerpc/include/asm/spinlock.h +++ b/arch/powerpc/include/asm/spinlock.h @@ -287,6 +287,9 @@ static inline void __raw_write_unlock(raw_rwlock_t *rw) rw->lock = 0; } +#define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock) +#define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock) + #define _raw_spin_relax(lock) __spin_yield(lock) #define _raw_read_relax(lock) __rw_yield(lock) #define _raw_write_relax(lock) __rw_yield(lock) diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h index df84ae96915f..f3861b09ebb0 100644 --- a/arch/s390/include/asm/spinlock.h +++ b/arch/s390/include/asm/spinlock.h @@ -172,6 +172,9 @@ static inline int __raw_write_trylock(raw_rwlock_t *rw) return _raw_write_trylock_retry(rw); } +#define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock) +#define 
__raw_write_lock_flags(lock, flags) __raw_write_lock(lock) + #define _raw_read_relax(lock) cpu_relax() #define _raw_write_relax(lock) cpu_relax() diff --git a/arch/sh/include/asm/spinlock.h b/arch/sh/include/asm/spinlock.h index e793181d64da..60283565f89b 100644 --- a/arch/sh/include/asm/spinlock.h +++ b/arch/sh/include/asm/spinlock.h @@ -216,6 +216,9 @@ static inline int __raw_write_trylock(raw_rwlock_t *rw) return (oldval > (RW_LOCK_BIAS - 1)); } +#define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock) +#define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock) + #define _raw_spin_relax(lock) cpu_relax() #define _raw_read_relax(lock) cpu_relax() #define _raw_write_relax(lock) cpu_relax() diff --git a/arch/sparc/include/asm/spinlock_32.h b/arch/sparc/include/asm/spinlock_32.h index bf2d532593e3..46f91ab66a50 100644 --- a/arch/sparc/include/asm/spinlock_32.h +++ b/arch/sparc/include/asm/spinlock_32.h @@ -177,6 +177,8 @@ static inline int __read_trylock(raw_rwlock_t *rw) #define __raw_write_unlock(rw) do { (rw)->lock = 0; } while(0) #define __raw_spin_lock_flags(lock, flags) __raw_spin_lock(lock) +#define __raw_read_lock_flags(rw, flags) __raw_read_lock(rw) +#define __raw_write_lock_flags(rw, flags) __raw_write_lock(rw) #define _raw_spin_relax(lock) cpu_relax() #define _raw_read_relax(lock) cpu_relax() diff --git a/arch/sparc/include/asm/spinlock_64.h b/arch/sparc/include/asm/spinlock_64.h index c4d274d330e9..f6b2b92ad8d2 100644 --- a/arch/sparc/include/asm/spinlock_64.h +++ b/arch/sparc/include/asm/spinlock_64.h @@ -211,9 +211,11 @@ static int inline __write_trylock(raw_rwlock_t *lock) } #define __raw_read_lock(p) __read_lock(p) +#define __raw_read_lock_flags(p, f) __read_lock(p) #define __raw_read_trylock(p) __read_trylock(p) #define __raw_read_unlock(p) __read_unlock(p) #define __raw_write_lock(p) __write_lock(p) +#define __raw_write_lock_flags(p, f) __write_lock(p) #define __raw_write_unlock(p) __write_unlock(p) #define __raw_write_trylock(p) __write_trylock(p) diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index 3a5696656680..e5e6caffec87 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -295,6 +295,9 @@ static inline void __raw_write_unlock(raw_rwlock_t *rw) : "+m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory"); } +#define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock) +#define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock) + #define _raw_spin_relax(lock) cpu_relax() #define _raw_read_relax(lock) cpu_relax() #define _raw_write_relax(lock) cpu_relax() diff --git a/include/asm-m32r/spinlock.h b/include/asm-m32r/spinlock.h index f5cfba81ee10..dded923883b2 100644 --- a/include/asm-m32r/spinlock.h +++ b/include/asm-m32r/spinlock.h @@ -316,6 +316,9 @@ static inline int __raw_write_trylock(raw_rwlock_t *lock) return 0; } +#define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock) +#define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock) + #define _raw_spin_relax(lock) cpu_relax() #define _raw_read_relax(lock) cpu_relax() #define _raw_write_relax(lock) cpu_relax() diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index a0c66a2e00ad..252b245cfcf4 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -153,9 +153,11 @@ do { \ extern int _raw_spin_trylock(spinlock_t *lock); extern void _raw_spin_unlock(spinlock_t *lock); extern void _raw_read_lock(rwlock_t *lock); +#define _raw_read_lock_flags(lock, flags) _raw_read_lock(lock) extern int 
_raw_read_trylock(rwlock_t *lock); extern void _raw_read_unlock(rwlock_t *lock); extern void _raw_write_lock(rwlock_t *lock); +#define _raw_write_lock_flags(lock, flags) _raw_write_lock(lock) extern int _raw_write_trylock(rwlock_t *lock); extern void _raw_write_unlock(rwlock_t *lock); #else @@ -165,9 +167,13 @@ do { \ # define _raw_spin_trylock(lock) __raw_spin_trylock(&(lock)->raw_lock) # define _raw_spin_unlock(lock) __raw_spin_unlock(&(lock)->raw_lock) # define _raw_read_lock(rwlock) __raw_read_lock(&(rwlock)->raw_lock) +# define _raw_read_lock_flags(lock, flags) \ + __raw_read_lock_flags(&(lock)->raw_lock, *(flags)) # define _raw_read_trylock(rwlock) __raw_read_trylock(&(rwlock)->raw_lock) # define _raw_read_unlock(rwlock) __raw_read_unlock(&(rwlock)->raw_lock) # define _raw_write_lock(rwlock) __raw_write_lock(&(rwlock)->raw_lock) +# define _raw_write_lock_flags(lock, flags) \ + __raw_write_lock_flags(&(lock)->raw_lock, *(flags)) # define _raw_write_trylock(rwlock) __raw_write_trylock(&(rwlock)->raw_lock) # define _raw_write_unlock(rwlock) __raw_write_unlock(&(rwlock)->raw_lock) #endif diff --git a/kernel/spinlock.c b/kernel/spinlock.c index 7283c6dc2d59..7932653c4ebd 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -121,7 +121,8 @@ unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock) local_irq_save(flags); preempt_disable(); rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); - LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); + LOCK_CONTENDED_FLAGS(lock, _raw_read_trylock, _raw_read_lock, + _raw_read_lock_flags, &flags); return flags; } EXPORT_SYMBOL(_read_lock_irqsave); @@ -151,7 +152,8 @@ unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock) local_irq_save(flags); preempt_disable(); rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); - LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); + LOCK_CONTENDED_FLAGS(lock, _raw_write_trylock, _raw_write_lock, + _raw_write_lock_flags, &flags); return flags; } EXPORT_SYMBOL(_write_lock_irqsave); -- cgit v1.2.3 From f7ab34ea723ed304b19698efca85d6f40cecd99b Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 3 Apr 2009 01:34:35 -0400 Subject: ext3: Add replace-on-truncate hueristics for data=writeback mode In data=writeback mode, start an asynchronous flush when closing a file which had been previously truncated down to zero. This lowers the probability of data loss in the case of applications that attempt to replace a file using truncate. 
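For illustration only (this sketch is not part of the patch), the replace-via-truncate pattern the heuristic targets looks roughly like the userspace code below; under data=writeback, truncating the file to zero now tags the inode, and the final close() kicks off an asynchronous flush of the freshly written data:

    #include <sys/types.h>
    #include <fcntl.h>
    #include <unistd.h>

    /* Replace a file's contents in place via truncate: the risky pattern
     * (compared with write-to-temp-then-rename) that this heuristic is
     * aimed at. */
    static int replace_in_place(const char *path, const char *buf, size_t len)
    {
            int fd = open(path, O_WRONLY);

            if (fd < 0)
                    return -1;
            if (ftruncate(fd, 0) != 0 ||            /* inode gets flagged here */
                write(fd, buf, len) != (ssize_t)len) {
                    close(fd);
                    return -1;
            }
            return close(fd);                       /* release triggers filemap_flush() */
    }
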
Signed-off-by: "Theodore Ts'o" --- fs/ext3/file.c | 4 ++++ fs/ext3/inode.c | 3 +++ include/linux/ext3_fs.h | 1 + 3 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/fs/ext3/file.c b/fs/ext3/file.c index 3be1e0689c9a..4a04cbb1c231 100644 --- a/fs/ext3/file.c +++ b/fs/ext3/file.c @@ -33,6 +33,10 @@ */ static int ext3_release_file (struct inode * inode, struct file * filp) { + if (EXT3_I(inode)->i_state & EXT3_STATE_FLUSH_ON_CLOSE) { + filemap_flush(inode->i_mapping); + EXT3_I(inode)->i_state &= ~EXT3_STATE_FLUSH_ON_CLOSE; + } /* if we are the last writer on the inode, drop the block reservation */ if ((filp->f_mode & FMODE_WRITE) && (atomic_read(&inode->i_writecount) == 1)) diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 5fa453b49a64..0f5bca0d82fc 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -2346,6 +2346,9 @@ void ext3_truncate(struct inode *inode) if (!ext3_can_truncate(inode)) return; + if (inode->i_size == 0 && ext3_should_writeback_data(inode)) + ei->i_state |= EXT3_STATE_FLUSH_ON_CLOSE; + /* * We have to lock the EOF page here, because lock_page() nests * outside journal_start(). diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h index dd495b8c3091..d2630c56cb34 100644 --- a/include/linux/ext3_fs.h +++ b/include/linux/ext3_fs.h @@ -208,6 +208,7 @@ static inline __u32 ext3_mask_flags(umode_t mode, __u32 flags) #define EXT3_STATE_JDATA 0x00000001 /* journaled data exists */ #define EXT3_STATE_NEW 0x00000002 /* inode is newly created */ #define EXT3_STATE_XATTR 0x00000004 /* has in-inode xattrs */ +#define EXT3_STATE_FLUSH_ON_CLOSE 0x00000008 /* Used to pass group descriptor data when online resize is done */ struct ext3_new_group_input { -- cgit v1.2.3 From 3d544f411f2971eb82f5c52322251eb04494542a Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Tue, 24 Mar 2009 11:59:23 +0200 Subject: kmemtrace, fs, security: move alloc_secdata() and free_secdata() to linux/security.h Impact: cleanup We want to remove percpu.h from rcupdate.h (for upcoming kmemtrace changes), but this is not possible currently without breaking the build because fs.h has implicit include file depedencies: it uses GFP_* types in inlines but does not include gfp.h. In practice most fs.h using .c files get gfp.h included implicitly, via an indirect route: via rcupdate.h inclusion - so this underlying problem gets masked in practice. So we want to solve fs.h's dependency on gfp.h. gfp.h can not be included here directly because it is not exported and it would break the build the following way: /home/mingo/tip/usr/include/linux/bsg.h:11: found __[us]{8,16,32,64} type without #include /home/mingo/tip/usr/include/linux/fs.h:11: included file 'linux/gfp.h' is not exported make[3]: *** [/home/mingo/tip/usr/include/linux/.check] Error 1 make[2]: *** [linux] Error 2 As suggested by Alexey Dobriyan, move alloc_secdata() and free_secdata() to linux/security.h - they belong there. This also cleans fs.h of GFP_* usage. 
Suggested-by: Alexey Dobriyan Signed-off-by: Pekka Enberg Cc: Eduard - Gabriel Munteanu LKML-Reference: <1237906803.25315.96.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- include/linux/fs.h | 21 --------------------- include/linux/security.h | 24 ++++++++++++++++++++++++ 2 files changed, 24 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 61211ad823fe..fc4dc28c5735 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2382,27 +2382,6 @@ ssize_t simple_attr_read(struct file *file, char __user *buf, ssize_t simple_attr_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos); - -#ifdef CONFIG_SECURITY -static inline char *alloc_secdata(void) -{ - return (char *)get_zeroed_page(GFP_KERNEL); -} - -static inline void free_secdata(void *secdata) -{ - free_page((unsigned long)secdata); -} -#else -static inline char *alloc_secdata(void) -{ - return (char *)1; -} - -static inline void free_secdata(void *secdata) -{ } -#endif /* CONFIG_SECURITY */ - struct ctl_table; int proc_nr_files(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); diff --git a/include/linux/security.h b/include/linux/security.h index 54ed15799a83..d5fd6163606f 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -32,6 +32,7 @@ #include #include #include +#include #include /* Maximum number of letters for an LSM name string */ @@ -2953,5 +2954,28 @@ static inline void securityfs_remove(struct dentry *dentry) #endif +#ifdef CONFIG_SECURITY + +static inline char *alloc_secdata(void) +{ + return (char *)get_zeroed_page(GFP_KERNEL); +} + +static inline void free_secdata(void *secdata) +{ + free_page((unsigned long)secdata); +} + +#else + +static inline char *alloc_secdata(void) +{ + return (char *)1; +} + +static inline void free_secdata(void *secdata) +{ } +#endif /* CONFIG_SECURITY */ + #endif /* ! __LINUX_SECURITY_H */ -- cgit v1.2.3 From 76791ab2d5e00c1eef728a8df4347ba133760fb8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 25 Mar 2009 16:48:35 +0100 Subject: kmemtrace, fs: uninline simple_transaction_set() Impact: cleanup We want to remove percpu.h from rcupdate.h (for upcoming kmemtrace changes), but this is not possible currently without breaking the build because fs.h has an implicit include file depedency: it uses PAGE_SIZE but does not include asm/page.h which defines it. This problem gets masked in practice because most fs.h using sites use rcupreempt.h (and other headers) which includes percpu.h which brings in asm/page.h indirectly. We cannot add asm/page.h to asm/fs.h because page.h is not an exported header. Move simple_transaction_set() to the other simple-transaction file helpers in fs/libfs.c. This removes the include file hell and also reduces kernel size a bit. Acked-by: Al Viro Cc: Alexey Dobriyan Cc: Pekka Enberg Cc: Eduard - Gabriel Munteanu Cc: paulmck@linux.vnet.ibm.com LKML-Reference: <1237898630.25315.83.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- fs/libfs.c | 16 ++++++++++++++++ include/linux/fs.h | 14 +------------- 2 files changed, 17 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/fs/libfs.c b/fs/libfs.c index 4910a36f516e..cd223190c4e9 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -575,6 +575,21 @@ ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, * possibly a read which collects the result - which is stored in a * file-local buffer. 
*/ + +void simple_transaction_set(struct file *file, size_t n) +{ + struct simple_transaction_argresp *ar = file->private_data; + + BUG_ON(n > SIMPLE_TRANSACTION_LIMIT); + + /* + * The barrier ensures that ar->size will really remain zero until + * ar->data is ready for reading. + */ + smp_mb(); + ar->size = n; +} + char *simple_transaction_get(struct file *file, const char __user *buf, size_t size) { struct simple_transaction_argresp *ar; @@ -820,6 +835,7 @@ EXPORT_SYMBOL(simple_sync_file); EXPORT_SYMBOL(simple_unlink); EXPORT_SYMBOL(simple_read_from_buffer); EXPORT_SYMBOL(memory_read_from_buffer); +EXPORT_SYMBOL(simple_transaction_set); EXPORT_SYMBOL(simple_transaction_get); EXPORT_SYMBOL(simple_transaction_read); EXPORT_SYMBOL(simple_transaction_release); diff --git a/include/linux/fs.h b/include/linux/fs.h index fc4dc28c5735..e4de2b543a73 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2323,19 +2323,7 @@ ssize_t simple_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos); int simple_transaction_release(struct inode *inode, struct file *file); -static inline void simple_transaction_set(struct file *file, size_t n) -{ - struct simple_transaction_argresp *ar = file->private_data; - - BUG_ON(n > SIMPLE_TRANSACTION_LIMIT); - - /* - * The barrier ensures that ar->size will really remain zero until - * ar->data is ready for reading. - */ - smp_mb(); - ar->size = n; -} +void simple_transaction_set(struct file *file, size_t n); /* * simple attribute files -- cgit v1.2.3 From 21e5445928af0d80f7cf803a77f2b65e9e147890 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 25 Mar 2009 16:29:05 +0100 Subject: kmemtrace, fs: fix linux/fdtable.h header file dependencies Impact: cleanup We want to remove percpu.h from rcupdate.h (for upcoming kmemtrace changes), but this is not possible currently without breaking the build because fdtable.h has an implicit include file dependency: it uses __init does not include init.h. This can cause build failures on non-x86 architectures: /home/mingo/tip/include/linux/fdtable.h:66: error: expected '=', ',', ';', 'asm' or '__attribute__' before 'files_defer_init' make[2]: *** [fs/locks.o] Error 1 We got this header included indirectly via rcupdate.h's percpu.h inclusion - but if that is not there the build will break. Fix it. 
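The underlying rule, as a minimal sketch (the declaration below is reconstructed from the error message and is illustrative rather than quoted from fdtable.h): a header that marks declarations __init must include <linux/init.h> itself instead of relying on an indirect include:

    #include <linux/init.h>                 /* defines the __init section annotation */

    void __init files_defer_init(void);     /* the declaration the build error pointed at */
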
Cc: Pekka Enberg Cc: Eduard - Gabriel Munteanu Cc: paulmck@linux.vnet.ibm.com LKML-Reference: <1237898630.25315.83.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- include/linux/fdtable.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index 09d6c5bbdddd..a2ec74bc4812 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -5,12 +5,14 @@ #ifndef __LINUX_FDTABLE_H #define __LINUX_FDTABLE_H -#include #include #include #include #include #include +#include + +#include /* * The default fd array needs to be at least BITS_PER_LONG, -- cgit v1.2.3 From aa84442d674ef83e1cbf2bce52a20ecff4ce515e Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Tue, 24 Mar 2009 10:54:46 +0200 Subject: kmemtrace, security: fix linux/key.h header file dependencies MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: cleanup We want to remove percpu.h from rcupdate.h (for upcoming kmemtrace changes), but this is not possible currently without breaking the build because key.h has an implicit include file dependency on rwsem.h: CC [M] fs/cifs/cifs_spnego.o In file included from include/keys/user-type.h:15, from fs/cifs/cifs_spnego.c:24: include/linux/key.h:128: error: field ‘sem’ has incomplete type make[2]: *** [fs/cifs/cifs_spnego.o] Error 1 make[1]: *** [fs/cifs] Error 2 make: *** [fs] Error 2 Fix it by making the dependency explicit. Signed-off-by: Pekka Enberg Cc: Eduard - Gabriel Munteanu LKML-Reference: <1237884886.25315.39.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- include/linux/key.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/key.h b/include/linux/key.h index 21d32a142c00..e544f466d69a 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -20,6 +20,7 @@ #include #include #include +#include #include #ifdef __KERNEL__ -- cgit v1.2.3 From b1f77b0581b8fd837acb4a973f7d5496cae6efee Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 13 Mar 2009 03:20:49 +0100 Subject: kmemtrace, rcu: fix linux/rcutree.h and linux/rcuclassic.h dependencies Impact: build fix for all non-x86 architectures We want to remove percpu.h from rcuclassic.h/rcutree.h (for upcoming kmemtrace changes) but that would break the DECLARE_PER_CPU based declarations in these files. Move the quiescent counter management functions to their respective RCU implementation .c files - they were slightly above the inlining limit anyway. Cc: Pekka Enberg Cc: Eduard - Gabriel Munteanu Cc: paulmck@linux.vnet.ibm.com LKML-Reference: <1237898630.25315.83.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- include/linux/rcuclassic.h | 15 ++------------- include/linux/rcutree.h | 26 ++------------------------ kernel/rcuclassic.c | 23 +++++++++++++++++++++-- kernel/rcutree.c | 28 ++++++++++++++++++++++++---- 4 files changed, 49 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h index 80044a4f3ab9..2d688b4461f7 100644 --- a/include/linux/rcuclassic.h +++ b/include/linux/rcuclassic.h @@ -108,25 +108,14 @@ struct rcu_data { struct rcu_head barrier; }; -DECLARE_PER_CPU(struct rcu_data, rcu_data); -DECLARE_PER_CPU(struct rcu_data, rcu_bh_data); - /* * Increment the quiescent state counter. * The counter is a bit degenerated: We do not need to know * how many quiescent states passed, just if there was at least * one since the start of the grace period. Thus just a flag. 
*/ -static inline void rcu_qsctr_inc(int cpu) -{ - struct rcu_data *rdp = &per_cpu(rcu_data, cpu); - rdp->passed_quiesc = 1; -} -static inline void rcu_bh_qsctr_inc(int cpu) -{ - struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); - rdp->passed_quiesc = 1; -} +extern void rcu_qsctr_inc(int cpu); +extern void rcu_bh_qsctr_inc(int cpu); extern int rcu_pending(int cpu); extern int rcu_needs_cpu(int cpu); diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index a722fb67bb2d..5d6f425260bc 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -236,30 +236,8 @@ struct rcu_state { #endif /* #ifdef CONFIG_NO_HZ */ }; -extern struct rcu_state rcu_state; -DECLARE_PER_CPU(struct rcu_data, rcu_data); - -extern struct rcu_state rcu_bh_state; -DECLARE_PER_CPU(struct rcu_data, rcu_bh_data); - -/* - * Increment the quiescent state counter. - * The counter is a bit degenerated: We do not need to know - * how many quiescent states passed, just if there was at least - * one since the start of the grace period. Thus just a flag. - */ -static inline void rcu_qsctr_inc(int cpu) -{ - struct rcu_data *rdp = &per_cpu(rcu_data, cpu); - rdp->passed_quiesc = 1; - rdp->passed_quiesc_completed = rdp->completed; -} -static inline void rcu_bh_qsctr_inc(int cpu) -{ - struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); - rdp->passed_quiesc = 1; - rdp->passed_quiesc_completed = rdp->completed; -} +extern void rcu_qsctr_inc(int cpu); +extern void rcu_bh_qsctr_inc(int cpu); extern int rcu_pending(int cpu); extern int rcu_needs_cpu(int cpu); diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index 654c640a6b9c..0f2b0b311304 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -65,6 +65,7 @@ static struct rcu_ctrlblk rcu_ctrlblk = { .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), .cpumask = CPU_BITS_NONE, }; + static struct rcu_ctrlblk rcu_bh_ctrlblk = { .cur = -300, .completed = -300, @@ -73,8 +74,26 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = { .cpumask = CPU_BITS_NONE, }; -DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; -DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; +static DEFINE_PER_CPU(struct rcu_data, rcu_data); +static DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); + +/* + * Increment the quiescent state counter. + * The counter is a bit degenerated: We do not need to know + * how many quiescent states passed, just if there was at least + * one since the start of the grace period. Thus just a flag. + */ +void rcu_qsctr_inc(int cpu) +{ + struct rcu_data *rdp = &per_cpu(rcu_data, cpu); + rdp->passed_quiesc = 1; +} + +void rcu_bh_qsctr_inc(int cpu) +{ + struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); + rdp->passed_quiesc = 1; +} static int blimit = 10; static int qhimark = 10000; diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 97ce31579ec0..a2015edfe167 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -72,11 +72,31 @@ EXPORT_SYMBOL_GPL(rcu_lock_map); .n_force_qs_ngp = 0, \ } -struct rcu_state rcu_state = RCU_STATE_INITIALIZER(rcu_state); -DEFINE_PER_CPU(struct rcu_data, rcu_data); +static struct rcu_state rcu_state = RCU_STATE_INITIALIZER(rcu_state); +static DEFINE_PER_CPU(struct rcu_data, rcu_data); -struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); -DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); +static struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); +static DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); + +/* + * Increment the quiescent state counter. 
+ * The counter is a bit degenerated: We do not need to know + * how many quiescent states passed, just if there was at least + * one since the start of the grace period. Thus just a flag. + */ +void rcu_qsctr_inc(int cpu) +{ + struct rcu_data *rdp = &per_cpu(rcu_data, cpu); + rdp->passed_quiesc = 1; + rdp->passed_quiesc_completed = rdp->completed; +} + +void rcu_bh_qsctr_inc(int cpu) +{ + struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); + rdp->passed_quiesc = 1; + rdp->passed_quiesc_completed = rdp->completed; +} #ifdef CONFIG_NO_HZ DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { -- cgit v1.2.3 From a979241c532f07c201fe94e0a632107268f02578 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 25 Mar 2009 16:42:24 +0100 Subject: kmemtrace, rcu: fix rcupreempt.c data structure dependencies Impact: cleanup We want to remove percpu.h from rcupreempt.h, but if we do that the percpu primitives there wont build anymore. Move them to the .c file instead. Cc: Pekka Enberg Cc: Eduard - Gabriel Munteanu Cc: paulmck@linux.vnet.ibm.com LKML-Reference: <1237898630.25315.83.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- include/linux/rcupreempt.h | 51 ++++++++-------------------------------------- kernel/rcupreempt.c | 48 +++++++++++++++++++++++++++++++++++++++---- 2 files changed, 53 insertions(+), 46 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupreempt.h b/include/linux/rcupreempt.h index 74304b4538d8..3eb8fdd19445 100644 --- a/include/linux/rcupreempt.h +++ b/include/linux/rcupreempt.h @@ -40,30 +40,15 @@ #include #include -struct rcu_dyntick_sched { - int dynticks; - int dynticks_snap; - int sched_qs; - int sched_qs_snap; - int sched_dynticks_snap; -}; - -DECLARE_PER_CPU(struct rcu_dyntick_sched, rcu_dyntick_sched); - -static inline void rcu_qsctr_inc(int cpu) -{ - struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); - - rdssp->sched_qs++; -} -#define rcu_bh_qsctr_inc(cpu) +extern void rcu_qsctr_inc(int cpu); +static inline void rcu_bh_qsctr_inc(int cpu) { } /* * Someone might want to pass call_rcu_bh as a function pointer. * So this needs to just be a rename and not a macro function. * (no parentheses) */ -#define call_rcu_bh call_rcu +#define call_rcu_bh call_rcu /** * call_rcu_sched - Queue RCU callback for invocation after sched grace period. 
@@ -117,30 +102,12 @@ extern struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu); struct softirq_action; #ifdef CONFIG_NO_HZ - -static inline void rcu_enter_nohz(void) -{ - static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1); - - smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ - __get_cpu_var(rcu_dyntick_sched).dynticks++; - WARN_ON_RATELIMIT(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1, &rs); -} - -static inline void rcu_exit_nohz(void) -{ - static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1); - - __get_cpu_var(rcu_dyntick_sched).dynticks++; - smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ - WARN_ON_RATELIMIT(!(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1), - &rs); -} - -#else /* CONFIG_NO_HZ */ -#define rcu_enter_nohz() do { } while (0) -#define rcu_exit_nohz() do { } while (0) -#endif /* CONFIG_NO_HZ */ +extern void rcu_enter_nohz(void); +extern void rcu_exit_nohz(void); +#else +# define rcu_enter_nohz() do { } while (0) +# define rcu_exit_nohz() do { } while (0) +#endif /* * A context switch is a grace period for rcupreempt synchronize_rcu() diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index 5d59e850fb71..ce97a4df64d3 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -147,7 +147,51 @@ struct rcu_ctrlblk { wait_queue_head_t sched_wq; /* Place for rcu_sched to sleep. */ }; +struct rcu_dyntick_sched { + int dynticks; + int dynticks_snap; + int sched_qs; + int sched_qs_snap; + int sched_dynticks_snap; +}; + +static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = { + .dynticks = 1, +}; + +void rcu_qsctr_inc(int cpu) +{ + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); + + rdssp->sched_qs++; +} + +#ifdef CONFIG_NO_HZ + +void rcu_enter_nohz(void) +{ + static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1); + + smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ + __get_cpu_var(rcu_dyntick_sched).dynticks++; + WARN_ON_RATELIMIT(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1, &rs); +} + +void rcu_exit_nohz(void) +{ + static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1); + + __get_cpu_var(rcu_dyntick_sched).dynticks++; + smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ + WARN_ON_RATELIMIT(!(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1), + &rs); +} + +#endif /* CONFIG_NO_HZ */ + + static DEFINE_PER_CPU(struct rcu_data, rcu_data); + static struct rcu_ctrlblk rcu_ctrlblk = { .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock), .completed = 0, @@ -427,10 +471,6 @@ static void __rcu_advance_callbacks(struct rcu_data *rdp) } } -DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = { - .dynticks = 1, -}; - #ifdef CONFIG_NO_HZ static DEFINE_PER_CPU(int, rcu_update_flag); -- cgit v1.2.3 From ac44021fccd8f1f2b267b004f23a2e8d7ef05f7b Mon Sep 17 00:00:00 2001 From: Eduard - Gabriel Munteanu Date: Mon, 23 Mar 2009 15:12:21 +0200 Subject: kmemtrace, rcu: don't include unnecessary headers, allow kmemtrace w/ tracepoints Impact: cleanup linux/percpu.h includes linux/slab.h, which generates circular inclusion dependencies when trying to switch kmemtrace to use tracepoints instead of markers. This patch allows tracing within slab headers' inline functions. 
Signed-off-by: Pekka Enberg Cc: Eduard - Gabriel Munteanu Cc: paulmck@linux.vnet.ibm.com LKML-Reference: <1237898630.25315.83.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- include/linux/rcuclassic.h | 1 - include/linux/rcupdate.h | 1 - include/linux/rcupreempt.h | 2 +- include/linux/rcutree.h | 1 - 4 files changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h index 2d688b4461f7..bfd92e1e5d2c 100644 --- a/include/linux/rcuclassic.h +++ b/include/linux/rcuclassic.h @@ -36,7 +36,6 @@ #include #include #include -#include #include #include diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 528343e6da51..15fbb3ca634d 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -36,7 +36,6 @@ #include #include #include -#include #include #include #include diff --git a/include/linux/rcupreempt.h b/include/linux/rcupreempt.h index 3eb8fdd19445..fce522782ffa 100644 --- a/include/linux/rcupreempt.h +++ b/include/linux/rcupreempt.h @@ -36,7 +36,7 @@ #include #include #include -#include +#include #include #include diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 5d6f425260bc..0cdda00f2b2a 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -33,7 +33,6 @@ #include #include #include -#include #include #include -- cgit v1.2.3 From ca2b84cb3c4a0d4d2143b46ec072cdff5d1b3b87 Mon Sep 17 00:00:00 2001 From: Eduard - Gabriel Munteanu Date: Mon, 23 Mar 2009 15:12:24 +0200 Subject: kmemtrace: use tracepoints kmemtrace now uses tracepoints instead of markers. We no longer need to use format specifiers to pass arguments. Signed-off-by: Eduard - Gabriel Munteanu [ folded: Use the new TP_PROTO and TP_ARGS to fix the build. ] [ folded: fix build when CONFIG_KMEMTRACE is disabled. ] [ folded: define tracepoints when CONFIG_TRACEPOINTS is enabled. 
] Signed-off-by: Pekka Enberg LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/slab_def.h | 10 +-- include/linux/slub_def.h | 12 +-- include/trace/kmemtrace.h | 92 +++++++++------------ kernel/trace/kmemtrace.c | 206 ++++++++++++++++++++++++++++++++-------------- kernel/trace/trace.h | 6 ++ mm/slab.c | 24 +++--- mm/slob.c | 28 +++---- mm/slub.c | 30 +++---- mm/util.c | 16 ++++ 9 files changed, 251 insertions(+), 173 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index f4523651fa42..5ac9b0bcaf9a 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -73,8 +73,8 @@ found: ret = kmem_cache_alloc_notrace(cachep, flags); - kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC, _THIS_IP_, ret, - size, slab_buffer_size(cachep), flags); + trace_kmalloc(_THIS_IP_, ret, + size, slab_buffer_size(cachep), flags); return ret; } @@ -128,9 +128,9 @@ found: ret = kmem_cache_alloc_node_notrace(cachep, flags, node); - kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, _THIS_IP_, - ret, size, slab_buffer_size(cachep), - flags, node); + trace_kmalloc_node(_THIS_IP_, ret, + size, slab_buffer_size(cachep), + flags, node); return ret; } diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index a1f90528e70b..5046f90c1171 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -233,8 +233,7 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags) unsigned int order = get_order(size); void *ret = (void *) __get_free_pages(flags | __GFP_COMP, order); - kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC, _THIS_IP_, ret, - size, PAGE_SIZE << order, flags); + trace_kmalloc(_THIS_IP_, ret, size, PAGE_SIZE << order, flags); return ret; } @@ -255,9 +254,7 @@ static __always_inline void *kmalloc(size_t size, gfp_t flags) ret = kmem_cache_alloc_notrace(s, flags); - kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC, - _THIS_IP_, ret, - size, s->size, flags); + trace_kmalloc(_THIS_IP_, ret, size, s->size, flags); return ret; } @@ -296,9 +293,8 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) ret = kmem_cache_alloc_node_notrace(s, flags, node); - kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, - _THIS_IP_, ret, - size, s->size, flags, node); + trace_kmalloc_node(_THIS_IP_, ret, + size, s->size, flags, node); return ret; } diff --git a/include/trace/kmemtrace.h b/include/trace/kmemtrace.h index ad8b7857855a..28ee69f9cd46 100644 --- a/include/trace/kmemtrace.h +++ b/include/trace/kmemtrace.h @@ -9,65 +9,53 @@ #ifdef __KERNEL__ +#include #include -#include - -enum kmemtrace_type_id { - KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */ - KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */ - KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. 
*/ -}; #ifdef CONFIG_KMEMTRACE - extern void kmemtrace_init(void); - -extern void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, - unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node); - -extern void kmemtrace_mark_free(enum kmemtrace_type_id type_id, - unsigned long call_site, - const void *ptr); - -#else /* CONFIG_KMEMTRACE */ - +#else static inline void kmemtrace_init(void) { } - -static inline void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, - unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node) -{ -} - -static inline void kmemtrace_mark_free(enum kmemtrace_type_id type_id, - unsigned long call_site, - const void *ptr) -{ -} - -#endif /* CONFIG_KMEMTRACE */ - -static inline void kmemtrace_mark_alloc(enum kmemtrace_type_id type_id, - unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags) -{ - kmemtrace_mark_alloc_node(type_id, call_site, ptr, - bytes_req, bytes_alloc, gfp_flags, -1); -} +#endif + +DECLARE_TRACE(kmalloc, + TP_PROTO(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags), + TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags)); +DECLARE_TRACE(kmem_cache_alloc, + TP_PROTO(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags), + TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags)); +DECLARE_TRACE(kmalloc_node, + TP_PROTO(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node), + TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node)); +DECLARE_TRACE(kmem_cache_alloc_node, + TP_PROTO(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node), + TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node)); +DECLARE_TRACE(kfree, + TP_PROTO(unsigned long call_site, const void *ptr), + TP_ARGS(call_site, ptr)); +DECLARE_TRACE(kmem_cache_free, + TP_PROTO(unsigned long call_site, const void *ptr), + TP_ARGS(call_site, ptr)); #endif /* __KERNEL__ */ diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index ae201b3eda89..4f7b5db5dd06 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include "trace.h" @@ -29,10 +30,150 @@ static struct tracer_flags kmem_tracer_flags = { .opts = kmem_opts }; - -static bool kmem_tracing_enabled __read_mostly; static struct trace_array *kmemtrace_array; +/* Trace allocations */ +static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node) +{ + struct ring_buffer_event *event; + struct kmemtrace_alloc_entry *entry; + struct trace_array *tr = kmemtrace_array; + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); + if (!event) + return; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, 0, 0); + + entry->ent.type = TRACE_KMEM_ALLOC; + entry->call_site = call_site; + entry->ptr = ptr; + entry->bytes_req = bytes_req; + entry->bytes_alloc = bytes_alloc; + entry->gfp_flags = gfp_flags; + entry->node = node; + + ring_buffer_unlock_commit(tr->buffer, event); + + trace_wake_up(); +} + +static inline void kmemtrace_free(enum kmemtrace_type_id type_id, + 
unsigned long call_site, + const void *ptr) +{ + struct ring_buffer_event *event; + struct kmemtrace_free_entry *entry; + struct trace_array *tr = kmemtrace_array; + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); + if (!event) + return; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, 0, 0); + + entry->ent.type = TRACE_KMEM_FREE; + entry->type_id = type_id; + entry->call_site = call_site; + entry->ptr = ptr; + + ring_buffer_unlock_commit(tr->buffer, event); + + trace_wake_up(); +} + +static void kmemtrace_kmalloc(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags) +{ + kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr, + bytes_req, bytes_alloc, gfp_flags, -1); +} + +static void kmemtrace_kmem_cache_alloc(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags) +{ + kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr, + bytes_req, bytes_alloc, gfp_flags, -1); +} + +static void kmemtrace_kmalloc_node(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node) +{ + kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr, + bytes_req, bytes_alloc, gfp_flags, node); +} + +static void kmemtrace_kmem_cache_alloc_node(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node) +{ + kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr, + bytes_req, bytes_alloc, gfp_flags, node); +} + +static void kmemtrace_kfree(unsigned long call_site, const void *ptr) +{ + kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr); +} + +static void kmemtrace_kmem_cache_free(unsigned long call_site, const void *ptr) +{ + kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr); +} + +static int kmemtrace_start_probes(void) +{ + int err; + + err = register_trace_kmalloc(kmemtrace_kmalloc); + if (err) + return err; + err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc); + if (err) + return err; + err = register_trace_kmalloc_node(kmemtrace_kmalloc_node); + if (err) + return err; + err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node); + if (err) + return err; + err = register_trace_kfree(kmemtrace_kfree); + if (err) + return err; + err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free); + + return err; +} + +static void kmemtrace_stop_probes(void) +{ + unregister_trace_kmalloc(kmemtrace_kmalloc); + unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc); + unregister_trace_kmalloc_node(kmemtrace_kmalloc_node); + unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node); + unregister_trace_kfree(kmemtrace_kfree); + unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free); +} + static int kmem_trace_init(struct trace_array *tr) { int cpu; @@ -41,14 +182,14 @@ static int kmem_trace_init(struct trace_array *tr) for_each_cpu_mask(cpu, cpu_possible_map) tracing_reset(tr, cpu); - kmem_tracing_enabled = true; + kmemtrace_start_probes(); return 0; } static void kmem_trace_reset(struct trace_array *tr) { - kmem_tracing_enabled = false; + kmemtrace_stop_probes(); } static void kmemtrace_headers(struct seq_file *s) @@ -260,63 +401,6 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter) } } -/* Trace allocations */ -void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, - unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t 
gfp_flags, - int node) -{ - struct ring_buffer_event *event; - struct kmemtrace_alloc_entry *entry; - struct trace_array *tr = kmemtrace_array; - - if (!kmem_tracing_enabled) - return; - - event = trace_buffer_lock_reserve(tr, TRACE_KMEM_ALLOC, - sizeof(*entry), 0, 0); - if (!event) - return; - entry = ring_buffer_event_data(event); - - entry->call_site = call_site; - entry->ptr = ptr; - entry->bytes_req = bytes_req; - entry->bytes_alloc = bytes_alloc; - entry->gfp_flags = gfp_flags; - entry->node = node; - - trace_buffer_unlock_commit(tr, event, 0, 0); -} -EXPORT_SYMBOL(kmemtrace_mark_alloc_node); - -void kmemtrace_mark_free(enum kmemtrace_type_id type_id, - unsigned long call_site, - const void *ptr) -{ - struct ring_buffer_event *event; - struct kmemtrace_free_entry *entry; - struct trace_array *tr = kmemtrace_array; - - if (!kmem_tracing_enabled) - return; - - event = trace_buffer_lock_reserve(tr, TRACE_KMEM_FREE, - sizeof(*entry), 0, 0); - if (!event) - return; - entry = ring_buffer_event_data(event); - entry->type_id = type_id; - entry->call_site = call_site; - entry->ptr = ptr; - - trace_buffer_unlock_commit(tr, event, 0, 0); -} -EXPORT_SYMBOL(kmemtrace_mark_free); - static struct tracer kmem_tracer __read_mostly = { .name = "kmemtrace", .init = kmem_trace_init, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index cb0ce3fc36d3..cbc168f1e43d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -182,6 +182,12 @@ struct trace_power { struct power_trace state_data; }; +enum kmemtrace_type_id { + KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */ + KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */ + KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */ +}; + struct kmemtrace_alloc_entry { struct trace_entry ent; enum kmemtrace_type_id type_id; diff --git a/mm/slab.c b/mm/slab.c index 9ec66c3e6ee0..fa00fd6a644d 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3565,8 +3565,8 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) { void *ret = __cache_alloc(cachep, flags, __builtin_return_address(0)); - kmemtrace_mark_alloc(KMEMTRACE_TYPE_CACHE, _RET_IP_, ret, - obj_size(cachep), cachep->buffer_size, flags); + trace_kmem_cache_alloc(_RET_IP_, ret, + obj_size(cachep), cachep->buffer_size, flags); return ret; } @@ -3627,9 +3627,9 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) void *ret = __cache_alloc_node(cachep, flags, nodeid, __builtin_return_address(0)); - kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_CACHE, _RET_IP_, ret, - obj_size(cachep), cachep->buffer_size, - flags, nodeid); + trace_kmem_cache_alloc_node(_RET_IP_, ret, + obj_size(cachep), cachep->buffer_size, + flags, nodeid); return ret; } @@ -3657,9 +3657,8 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) return cachep; ret = kmem_cache_alloc_node_notrace(cachep, flags, node); - kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, - (unsigned long) caller, ret, - size, cachep->buffer_size, flags, node); + trace_kmalloc_node((unsigned long) caller, ret, + size, cachep->buffer_size, flags, node); return ret; } @@ -3709,9 +3708,8 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, return cachep; ret = __cache_alloc(cachep, flags, caller); - kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC, - (unsigned long) caller, ret, - size, cachep->buffer_size, flags); + trace_kmalloc((unsigned long) caller, ret, + size, cachep->buffer_size, flags); return ret; } @@ -3757,7 +3755,7 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp) 
__cache_free(cachep, objp); local_irq_restore(flags); - kmemtrace_mark_free(KMEMTRACE_TYPE_CACHE, _RET_IP_, objp); + trace_kmem_cache_free(_RET_IP_, objp); } EXPORT_SYMBOL(kmem_cache_free); @@ -3785,7 +3783,7 @@ void kfree(const void *objp) __cache_free(c, (void *)objp); local_irq_restore(flags); - kmemtrace_mark_free(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, objp); + trace_kfree(_RET_IP_, objp); } EXPORT_SYMBOL(kfree); diff --git a/mm/slob.c b/mm/slob.c index 4dd6516447f2..00003587ebfa 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -490,9 +490,8 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node) *m = size; ret = (void *)m + align; - kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, - _RET_IP_, ret, - size, size + align, gfp, node); + trace_kmalloc_node(_RET_IP_, ret, + size, size + align, gfp, node); } else { unsigned int order = get_order(size); @@ -503,9 +502,8 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node) page->private = size; } - kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, - _RET_IP_, ret, - size, PAGE_SIZE << order, gfp, node); + trace_kmalloc_node(_RET_IP_, ret, + size, PAGE_SIZE << order, gfp, node); } return ret; @@ -527,7 +525,7 @@ void kfree(const void *block) } else put_page(&sp->page); - kmemtrace_mark_free(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, block); + trace_kfree(_RET_IP_, block); } EXPORT_SYMBOL(kfree); @@ -599,16 +597,14 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) if (c->size < PAGE_SIZE) { b = slob_alloc(c->size, flags, c->align, node); - kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_CACHE, - _RET_IP_, b, c->size, - SLOB_UNITS(c->size) * SLOB_UNIT, - flags, node); + trace_kmem_cache_alloc_node(_RET_IP_, b, c->size, + SLOB_UNITS(c->size) * SLOB_UNIT, + flags, node); } else { b = slob_new_pages(flags, get_order(c->size), node); - kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_CACHE, - _RET_IP_, b, c->size, - PAGE_SIZE << get_order(c->size), - flags, node); + trace_kmem_cache_alloc_node(_RET_IP_, b, c->size, + PAGE_SIZE << get_order(c->size), + flags, node); } if (c->ctor) @@ -646,7 +642,7 @@ void kmem_cache_free(struct kmem_cache *c, void *b) __kmem_cache_free(b, c->size); } - kmemtrace_mark_free(KMEMTRACE_TYPE_CACHE, _RET_IP_, b); + trace_kmem_cache_free(_RET_IP_, b); } EXPORT_SYMBOL(kmem_cache_free); diff --git a/mm/slub.c b/mm/slub.c index 7aaa121d0ea9..a98078bf738b 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1621,8 +1621,7 @@ void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) { void *ret = slab_alloc(s, gfpflags, -1, _RET_IP_); - kmemtrace_mark_alloc(KMEMTRACE_TYPE_CACHE, _RET_IP_, ret, - s->objsize, s->size, gfpflags); + trace_kmem_cache_alloc(_RET_IP_, ret, s->objsize, s->size, gfpflags); return ret; } @@ -1641,8 +1640,8 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) { void *ret = slab_alloc(s, gfpflags, node, _RET_IP_); - kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_CACHE, _RET_IP_, ret, - s->objsize, s->size, gfpflags, node); + trace_kmem_cache_alloc_node(_RET_IP_, ret, + s->objsize, s->size, gfpflags, node); return ret; } @@ -1767,7 +1766,7 @@ void kmem_cache_free(struct kmem_cache *s, void *x) slab_free(s, page, x, _RET_IP_); - kmemtrace_mark_free(KMEMTRACE_TYPE_CACHE, _RET_IP_, x); + trace_kmem_cache_free(_RET_IP_, x); } EXPORT_SYMBOL(kmem_cache_free); @@ -2702,8 +2701,7 @@ void *__kmalloc(size_t size, gfp_t flags) ret = slab_alloc(s, flags, -1, _RET_IP_); - kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, ret, - size, s->size, flags); + trace_kmalloc(_RET_IP_, ret, size, s->size, flags); 
return ret; } @@ -2729,10 +2727,9 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) if (unlikely(size > SLUB_MAX_SIZE)) { ret = kmalloc_large_node(size, flags, node); - kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, - _RET_IP_, ret, - size, PAGE_SIZE << get_order(size), - flags, node); + trace_kmalloc_node(_RET_IP_, ret, + size, PAGE_SIZE << get_order(size), + flags, node); return ret; } @@ -2744,8 +2741,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) ret = slab_alloc(s, flags, node, _RET_IP_); - kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, ret, - size, s->size, flags, node); + trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); return ret; } @@ -2807,7 +2803,7 @@ void kfree(const void *x) } slab_free(page->slab, page, object, _RET_IP_); - kmemtrace_mark_free(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, x); + trace_kfree(_RET_IP_, x); } EXPORT_SYMBOL(kfree); @@ -3290,8 +3286,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) ret = slab_alloc(s, gfpflags, -1, caller); /* Honor the call site pointer we recieved. */ - kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC, caller, ret, size, - s->size, gfpflags); + trace_kmalloc(caller, ret, size, s->size, gfpflags); return ret; } @@ -3313,8 +3308,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, ret = slab_alloc(s, gfpflags, node, caller); /* Honor the call site pointer we recieved. */ - kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, caller, ret, - size, s->size, gfpflags, node); + trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node); return ret; } diff --git a/mm/util.c b/mm/util.c index 7c122e49f769..2599e83eea17 100644 --- a/mm/util.c +++ b/mm/util.c @@ -4,6 +4,7 @@ #include #include #include +#include #include /** @@ -236,3 +237,18 @@ int __attribute__((weak)) get_user_pages_fast(unsigned long start, return ret; } EXPORT_SYMBOL_GPL(get_user_pages_fast); + +/* Tracepoints definitions. */ +DEFINE_TRACE(kmalloc); +DEFINE_TRACE(kmem_cache_alloc); +DEFINE_TRACE(kmalloc_node); +DEFINE_TRACE(kmem_cache_alloc_node); +DEFINE_TRACE(kfree); +DEFINE_TRACE(kmem_cache_free); + +EXPORT_TRACEPOINT_SYMBOL(kmalloc); +EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); +EXPORT_TRACEPOINT_SYMBOL(kmalloc_node); +EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc_node); +EXPORT_TRACEPOINT_SYMBOL(kfree); +EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free); -- cgit v1.2.3 From 07fe7cb7c7c179f473fd9c823348fd3eb5dad369 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 3 Apr 2009 16:42:35 +0100 Subject: Create a dynamically sized pool of threads for doing very slow work items Create a dynamically sized pool of threads for doing very slow work items, such as invoking mkdir() or rmdir() - things that may take a long time and may sleep, holding mutexes/semaphores and hogging a thread, and are thus unsuitable for workqueues. The number of threads is always at least a settable minimum, but more are started when there's more work to do, up to a limit. Because of the nature of the load, it's not suitable for a 1-thread-per-CPU type pool. A system with one CPU may well want several threads. This is used by FS-Cache to do slow caching operations in the background, such as looking up, creating or deleting cache objects. 
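A hedged usage sketch follows (illustrative only, not taken from this patch or from FS-Cache; all my_* names are invented): a user embeds a struct slow_work in its own refcounted object, supplies the three callbacks, registers with the pool once, and then queues items:

    #include <linux/kernel.h>       /* container_of() */
    #include <linux/list.h>
    #include <asm/atomic.h>
    #include <linux/slow-work.h>

    struct my_object {
            atomic_t                usage;
            struct slow_work        work;   /* embedded slow work item */
    };

    static int my_get_ref(struct slow_work *work)
    {
            atomic_inc(&container_of(work, struct my_object, work)->usage);
            return 0;
    }

    static void my_put_ref(struct slow_work *work)
    {
            atomic_dec(&container_of(work, struct my_object, work)->usage);
    }

    static void my_execute(struct slow_work *work)
    {
            /* the long-running, sleepable part: lookups, mkdir(), etc. */
    }

    static const struct slow_work_ops my_ops = {
            .get_ref        = my_get_ref,
            .put_ref        = my_put_ref,
            .execute        = my_execute,
    };

    /* typically called once from the user's module init */
    static int my_pool_init(void)
    {
            return slow_work_register_user();
    }

    static int my_queue(struct my_object *obj)
    {
            slow_work_init(&obj->work, &my_ops);    /* or vslow_work_init() for very slow items */
            return slow_work_enqueue(&obj->work);
    }
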
Signed-off-by: David Howells Acked-by: Serge Hallyn Acked-by: Steve Dickson Acked-by: Trond Myklebust Acked-by: Al Viro Tested-by: Daire Byrne --- include/linux/slow-work.h | 88 +++++++++++ init/Kconfig | 12 ++ kernel/Makefile | 1 + kernel/slow-work.c | 388 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 489 insertions(+) create mode 100644 include/linux/slow-work.h create mode 100644 kernel/slow-work.c (limited to 'include/linux') diff --git a/include/linux/slow-work.h b/include/linux/slow-work.h new file mode 100644 index 000000000000..4dd754af393e --- /dev/null +++ b/include/linux/slow-work.h @@ -0,0 +1,88 @@ +/* Worker thread pool for slow items, such as filesystem lookups or mkdirs + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#ifndef _LINUX_SLOW_WORK_H +#define _LINUX_SLOW_WORK_H + +#ifdef CONFIG_SLOW_WORK + +struct slow_work; + +/* + * The operations used to support slow work items + */ +struct slow_work_ops { + /* get a ref on a work item + * - return 0 if successful, -ve if not + */ + int (*get_ref)(struct slow_work *work); + + /* discard a ref to a work item */ + void (*put_ref)(struct slow_work *work); + + /* execute a work item */ + void (*execute)(struct slow_work *work); +}; + +/* + * A slow work item + * - A reference is held on the parent object by the thread pool when it is + * queued + */ +struct slow_work { + unsigned long flags; +#define SLOW_WORK_PENDING 0 /* item pending (further) execution */ +#define SLOW_WORK_EXECUTING 1 /* item currently executing */ +#define SLOW_WORK_ENQ_DEFERRED 2 /* item enqueue deferred */ +#define SLOW_WORK_VERY_SLOW 3 /* item is very slow */ + const struct slow_work_ops *ops; /* operations table for this item */ + struct list_head link; /* link in queue */ +}; + +/** + * slow_work_init - Initialise a slow work item + * @work: The work item to initialise + * @ops: The operations to use to handle the slow work item + * + * Initialise a slow work item. + */ +static inline void slow_work_init(struct slow_work *work, + const struct slow_work_ops *ops) +{ + work->flags = 0; + work->ops = ops; + INIT_LIST_HEAD(&work->link); +} + +/** + * slow_work_init - Initialise a very slow work item + * @work: The work item to initialise + * @ops: The operations to use to handle the slow work item + * + * Initialise a very slow work item. This item will be restricted such that + * only a certain number of the pool threads will be able to execute items of + * this type. 
+ */ +static inline void vslow_work_init(struct slow_work *work, + const struct slow_work_ops *ops) +{ + work->flags = 1 << SLOW_WORK_VERY_SLOW; + work->ops = ops; + INIT_LIST_HEAD(&work->link); +} + +extern int slow_work_enqueue(struct slow_work *work); +extern int slow_work_register_user(void); +extern void slow_work_unregister_user(void); + + +#endif /* CONFIG_SLOW_WORK */ +#endif /* _LINUX_SLOW_WORK_H */ diff --git a/init/Kconfig b/init/Kconfig index 1398a14b0191..236a79377b8e 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1014,6 +1014,18 @@ config MARKERS source "arch/Kconfig" +config SLOW_WORK + default n + bool "Enable slow work thread pool" + help + The slow work thread pool provides a number of dynamically allocated + threads that can be used by the kernel to perform operations that + take a relatively long time. + + An example of this would be CacheFiles doing a path lookup followed + by a series of mkdirs and a create call, all of which have to touch + disk. + endmenu # General setup config HAVE_GENERIC_DMA_COHERENT diff --git a/kernel/Makefile b/kernel/Makefile index e4791b3ba55d..bab1dffe37e9 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -93,6 +93,7 @@ obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o obj-$(CONFIG_FUNCTION_TRACER) += trace/ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o +obj-$(CONFIG_SLOW_WORK) += slow-work.o ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff --git a/kernel/slow-work.c b/kernel/slow-work.c new file mode 100644 index 000000000000..5a7392734c82 --- /dev/null +++ b/kernel/slow-work.c @@ -0,0 +1,388 @@ +/* Worker thread pool for slow items, such as filesystem lookups or mkdirs + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include + +/* + * The pool of threads has at least min threads in it as long as someone is + * using the facility, and may have as many as max. + * + * A portion of the pool may be processing very slow operations. + */ +static unsigned slow_work_min_threads = 2; +static unsigned slow_work_max_threads = 4; +static unsigned vslow_work_proportion = 50; /* % of threads that may process + * very slow work */ +static atomic_t slow_work_thread_count; +static atomic_t vslow_work_executing_count; + +/* + * The queues of work items and the lock governing access to them. These are + * shared between all the CPUs. It doesn't make sense to have per-CPU queues + * as the number of threads bears no relation to the number of CPUs. + * + * There are two queues of work items: one for slow work items, and one for + * very slow work items. + */ +static LIST_HEAD(slow_work_queue); +static LIST_HEAD(vslow_work_queue); +static DEFINE_SPINLOCK(slow_work_queue_lock); + +/* + * The thread controls. A variable used to signal to the threads that they + * should exit when the queue is empty, a waitqueue used by the threads to wait + * for signals, and a completion set by the last thread to exit. 
+ */ +static bool slow_work_threads_should_exit; +static DECLARE_WAIT_QUEUE_HEAD(slow_work_thread_wq); +static DECLARE_COMPLETION(slow_work_last_thread_exited); + +/* + * The number of users of the thread pool and its lock. Whilst this is zero we + * have no threads hanging around, and when this reaches zero, we wait for all + * active or queued work items to complete and kill all the threads we do have. + */ +static int slow_work_user_count; +static DEFINE_MUTEX(slow_work_user_lock); + +/* + * Calculate the maximum number of active threads in the pool that are + * permitted to process very slow work items. + * + * The answer is rounded up to at least 1, but may not equal or exceed the + * maximum number of the threads in the pool. This means we always have at + * least one thread that can process slow work items, and we always have at + * least one thread that won't get tied up doing so. + */ +static unsigned slow_work_calc_vsmax(void) +{ + unsigned vsmax; + + vsmax = atomic_read(&slow_work_thread_count) * vslow_work_proportion; + vsmax /= 100; + vsmax = max(vsmax, 1U); + return min(vsmax, slow_work_max_threads - 1); +} + +/* + * Attempt to execute stuff queued on a slow thread. Return true if we managed + * it, false if there was nothing to do. + */ +static bool slow_work_execute(void) +{ + struct slow_work *work = NULL; + unsigned vsmax; + bool very_slow; + + vsmax = slow_work_calc_vsmax(); + + /* find something to execute */ + spin_lock_irq(&slow_work_queue_lock); + if (!list_empty(&vslow_work_queue) && + atomic_read(&vslow_work_executing_count) < vsmax) { + work = list_entry(vslow_work_queue.next, + struct slow_work, link); + if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags)) + BUG(); + list_del_init(&work->link); + atomic_inc(&vslow_work_executing_count); + very_slow = true; + } else if (!list_empty(&slow_work_queue)) { + work = list_entry(slow_work_queue.next, + struct slow_work, link); + if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags)) + BUG(); + list_del_init(&work->link); + very_slow = false; + } else { + very_slow = false; /* avoid the compiler warning */ + } + spin_unlock_irq(&slow_work_queue_lock); + + if (!work) + return false; + + if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags)) + BUG(); + + work->ops->execute(work); + + if (very_slow) + atomic_dec(&vslow_work_executing_count); + clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags); + + /* if someone tried to enqueue the item whilst we were executing it, + * then it'll be left unenqueued to avoid multiple threads trying to + * execute it simultaneously + * + * there is, however, a race between us testing the pending flag and + * getting the spinlock, and between the enqueuer setting the pending + * flag and getting the spinlock, so we use a deferral bit to tell us + * if the enqueuer got there first + */ + if (test_bit(SLOW_WORK_PENDING, &work->flags)) { + spin_lock_irq(&slow_work_queue_lock); + + if (!test_bit(SLOW_WORK_EXECUTING, &work->flags) && + test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags)) + goto auto_requeue; + + spin_unlock_irq(&slow_work_queue_lock); + } + + work->ops->put_ref(work); + return true; + +auto_requeue: + /* we must complete the enqueue operation + * - we transfer our ref on the item back to the appropriate queue + * - don't wake another thread up as we're awake already + */ + if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) + list_add_tail(&work->link, &vslow_work_queue); + else + list_add_tail(&work->link, &slow_work_queue); + 
spin_unlock_irq(&slow_work_queue_lock); + return true; +} + +/** + * slow_work_enqueue - Schedule a slow work item for processing + * @work: The work item to queue + * + * Schedule a slow work item for processing. If the item is already undergoing + * execution, this guarantees not to re-enter the execution routine until the + * first execution finishes. + * + * The item is pinned by this function as it retains a reference to it, managed + * through the item operations. The item is unpinned once it has been + * executed. + * + * An item may hog the thread that is running it for a relatively large amount + * of time, sufficient, for example, to perform several lookup, mkdir, create + * and setxattr operations. It may sleep on I/O and may sleep to obtain locks. + * + * Conversely, if a number of items are awaiting processing, it may take some + * time before any given item is given attention. The number of threads in the + * pool may be increased to deal with demand, but only up to a limit. + * + * If SLOW_WORK_VERY_SLOW is set on the work item, then it will be placed in + * the very slow queue, from which only a portion of the threads will be + * allowed to pick items to execute. This ensures that very slow items won't + * overly block ones that are just ordinarily slow. + * + * Returns 0 if successful, -EAGAIN if not. + */ +int slow_work_enqueue(struct slow_work *work) +{ + unsigned long flags; + + BUG_ON(slow_work_user_count <= 0); + BUG_ON(!work); + BUG_ON(!work->ops); + BUG_ON(!work->ops->get_ref); + + /* when honouring an enqueue request, we only promise that we will run + * the work function in the future; we do not promise to run it once + * per enqueue request + * + * we use the PENDING bit to merge together repeat requests without + * having to disable IRQs and take the spinlock, whilst still + * maintaining our promise + */ + if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) { + spin_lock_irqsave(&slow_work_queue_lock, flags); + + /* we promise that we will not attempt to execute the work + * function in more than one thread simultaneously + * + * this, however, leaves us with a problem if we're asked to + * enqueue the work whilst someone is executing the work + * function as simply queueing the work immediately means that + * another thread may try executing it whilst it is already + * under execution + * + * to deal with this, we set the ENQ_DEFERRED bit instead of + * enqueueing, and the thread currently executing the work + * function will enqueue the work item when the work function + * returns and it has cleared the EXECUTING bit + */ + if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) { + set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags); + } else { + if (work->ops->get_ref(work) < 0) + goto cant_get_ref; + if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) + list_add_tail(&work->link, &vslow_work_queue); + else + list_add_tail(&work->link, &slow_work_queue); + wake_up(&slow_work_thread_wq); + } + + spin_unlock_irqrestore(&slow_work_queue_lock, flags); + } + return 0; + +cant_get_ref: + spin_unlock_irqrestore(&slow_work_queue_lock, flags); + return -EAGAIN; +} +EXPORT_SYMBOL(slow_work_enqueue); + +/* + * Determine if there is slow work available for dispatch + */ +static inline bool slow_work_available(int vsmax) +{ + return !list_empty(&slow_work_queue) || + (!list_empty(&vslow_work_queue) && + atomic_read(&vslow_work_executing_count) < vsmax); +} + +/* + * Worker thread dispatcher + */ +static int slow_work_thread(void *_data) +{ + int vsmax; + + DEFINE_WAIT(wait); + 
+ set_freezable(); + set_user_nice(current, -5); + + for (;;) { + vsmax = vslow_work_proportion; + vsmax *= atomic_read(&slow_work_thread_count); + vsmax /= 100; + + prepare_to_wait(&slow_work_thread_wq, &wait, + TASK_INTERRUPTIBLE); + if (!freezing(current) && + !slow_work_threads_should_exit && + !slow_work_available(vsmax)) + schedule(); + finish_wait(&slow_work_thread_wq, &wait); + + try_to_freeze(); + + vsmax = vslow_work_proportion; + vsmax *= atomic_read(&slow_work_thread_count); + vsmax /= 100; + + if (slow_work_available(vsmax) && slow_work_execute()) { + cond_resched(); + continue; + } + + if (slow_work_threads_should_exit) + break; + } + + if (atomic_dec_and_test(&slow_work_thread_count)) + complete_and_exit(&slow_work_last_thread_exited, 0); + return 0; +} + +/** + * slow_work_register_user - Register a user of the facility + * + * Register a user of the facility, starting up the initial threads if there + * aren't any other users at this point. This will return 0 if successful, or + * an error if not. + */ +int slow_work_register_user(void) +{ + struct task_struct *p; + int loop; + + mutex_lock(&slow_work_user_lock); + + if (slow_work_user_count == 0) { + printk(KERN_NOTICE "Slow work thread pool: Starting up\n"); + init_completion(&slow_work_last_thread_exited); + + slow_work_threads_should_exit = false; + + /* start the minimum number of threads */ + for (loop = 0; loop < slow_work_min_threads; loop++) { + atomic_inc(&slow_work_thread_count); + p = kthread_run(slow_work_thread, NULL, "kslowd"); + if (IS_ERR(p)) + goto error; + } + printk(KERN_NOTICE "Slow work thread pool: Ready\n"); + } + + slow_work_user_count++; + mutex_unlock(&slow_work_user_lock); + return 0; + +error: + if (atomic_dec_and_test(&slow_work_thread_count)) + complete(&slow_work_last_thread_exited); + if (loop > 0) { + printk(KERN_ERR "Slow work thread pool:" + " Aborting startup on ENOMEM\n"); + slow_work_threads_should_exit = true; + wake_up_all(&slow_work_thread_wq); + wait_for_completion(&slow_work_last_thread_exited); + printk(KERN_ERR "Slow work thread pool: Aborted\n"); + } + mutex_unlock(&slow_work_user_lock); + return PTR_ERR(p); +} +EXPORT_SYMBOL(slow_work_register_user); + +/** + * slow_work_unregister_user - Unregister a user of the facility + * + * Unregister a user of the facility, killing all the threads if this was the + * last one. + */ +void slow_work_unregister_user(void) +{ + mutex_lock(&slow_work_user_lock); + + BUG_ON(slow_work_user_count <= 0); + + slow_work_user_count--; + if (slow_work_user_count == 0) { + printk(KERN_NOTICE "Slow work thread pool: Shutting down\n"); + slow_work_threads_should_exit = true; + wake_up_all(&slow_work_thread_wq); + wait_for_completion(&slow_work_last_thread_exited); + printk(KERN_NOTICE "Slow work thread pool:" + " Shut down complete\n"); + } + + mutex_unlock(&slow_work_user_lock); +} +EXPORT_SYMBOL(slow_work_unregister_user); + +/* + * Initialise the slow work facility + */ +static int __init init_slow_work(void) +{ + unsigned nr_cpus = num_possible_cpus(); + + if (nr_cpus > slow_work_max_threads) + slow_work_max_threads = nr_cpus; + return 0; +} + +subsys_initcall(init_slow_work); -- cgit v1.2.3 From 12e22c5e4bc08ab4b05ac079fe40d9891c5e81a0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 3 Apr 2009 16:42:35 +0100 Subject: Make the slow work pool configurable Make the slow work pool configurable through /proc/sys/kernel/slow-work. 
(*) /proc/sys/kernel/slow-work/min-threads The minimum number of threads that should be in the pool as long as it is in use. This may be anywhere between 2 and max-threads. (*) /proc/sys/kernel/slow-work/max-threads The maximum number of threads that should in the pool. This may be anywhere between min-threads and 255 or NR_CPUS * 2, whichever is greater. (*) /proc/sys/kernel/slow-work/vslow-percentage The percentage of active threads in the pool that may be used to execute very slow work items. This may be between 1 and 99. The resultant number is bounded to between 1 and one fewer than the number of active threads. This ensures there is always at least one thread that can process very slow work items, and always at least one thread that won't. Signed-off-by: David Howells Acked-by: Serge Hallyn Acked-by: Steve Dickson Acked-by: Trond Myklebust Acked-by: Al Viro Tested-by: Daire Byrne --- include/linux/slow-work.h | 5 ++ kernel/slow-work.c | 118 +++++++++++++++++++++++++++++++++++++++++++++- kernel/sysctl.c | 9 ++++ 3 files changed, 130 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slow-work.h b/include/linux/slow-work.h index 4dd754af393e..8262809dfa8b 100644 --- a/include/linux/slow-work.h +++ b/include/linux/slow-work.h @@ -14,6 +14,8 @@ #ifdef CONFIG_SLOW_WORK +#include + struct slow_work; /* @@ -83,6 +85,9 @@ extern int slow_work_enqueue(struct slow_work *work); extern int slow_work_register_user(void); extern void slow_work_unregister_user(void); +#ifdef CONFIG_SYSCTL +extern ctl_table slow_work_sysctls[]; +#endif #endif /* CONFIG_SLOW_WORK */ #endif /* _LINUX_SLOW_WORK_H */ diff --git a/kernel/slow-work.c b/kernel/slow-work.c index 454abb21c8bd..3f65900aa3cb 100644 --- a/kernel/slow-work.c +++ b/kernel/slow-work.c @@ -14,7 +14,6 @@ #include #include #include -#include #define SLOW_WORK_CULL_TIMEOUT (5 * HZ) /* cull threads 5s after running out of * things to do */ @@ -24,6 +23,14 @@ static void slow_work_cull_timeout(unsigned long); static void slow_work_oom_timeout(unsigned long); +#ifdef CONFIG_SYSCTL +static int slow_work_min_threads_sysctl(struct ctl_table *, int, struct file *, + void __user *, size_t *, loff_t *); + +static int slow_work_max_threads_sysctl(struct ctl_table *, int , struct file *, + void __user *, size_t *, loff_t *); +#endif + /* * The pool of threads has at least min threads in it as long as someone is * using the facility, and may have as many as max. 
@@ -34,6 +41,51 @@ static unsigned slow_work_min_threads = 2; static unsigned slow_work_max_threads = 4; static unsigned vslow_work_proportion = 50; /* % of threads that may process * very slow work */ + +#ifdef CONFIG_SYSCTL +static const int slow_work_min_min_threads = 2; +static int slow_work_max_max_threads = 255; +static const int slow_work_min_vslow = 1; +static const int slow_work_max_vslow = 99; + +ctl_table slow_work_sysctls[] = { + { + .ctl_name = CTL_UNNUMBERED, + .procname = "min-threads", + .data = &slow_work_min_threads, + .maxlen = sizeof(unsigned), + .mode = 0644, + .proc_handler = slow_work_min_threads_sysctl, + .extra1 = (void *) &slow_work_min_min_threads, + .extra2 = &slow_work_max_threads, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "max-threads", + .data = &slow_work_max_threads, + .maxlen = sizeof(unsigned), + .mode = 0644, + .proc_handler = slow_work_max_threads_sysctl, + .extra1 = &slow_work_min_threads, + .extra2 = (void *) &slow_work_max_max_threads, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "vslow-percentage", + .data = &vslow_work_proportion, + .maxlen = sizeof(unsigned), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = (void *) &slow_work_min_vslow, + .extra2 = (void *) &slow_work_max_vslow, + }, + { .ctl_name = 0 } +}; +#endif + +/* + * The active state of the thread pool + */ static atomic_t slow_work_thread_count; static atomic_t vslow_work_executing_count; @@ -427,6 +479,64 @@ static void slow_work_oom_timeout(unsigned long data) slow_work_may_not_start_new_thread = false; } +#ifdef CONFIG_SYSCTL +/* + * Handle adjustment of the minimum number of threads + */ +static int slow_work_min_threads_sysctl(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + int n; + + if (ret == 0) { + mutex_lock(&slow_work_user_lock); + if (slow_work_user_count > 0) { + /* see if we need to start or stop threads */ + n = atomic_read(&slow_work_thread_count) - + slow_work_min_threads; + + if (n < 0 && !slow_work_may_not_start_new_thread) + slow_work_enqueue(&slow_work_new_thread); + else if (n > 0) + mod_timer(&slow_work_cull_timer, + jiffies + SLOW_WORK_CULL_TIMEOUT); + } + mutex_unlock(&slow_work_user_lock); + } + + return ret; +} + +/* + * Handle adjustment of the maximum number of threads + */ +static int slow_work_max_threads_sysctl(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + int n; + + if (ret == 0) { + mutex_lock(&slow_work_user_lock); + if (slow_work_user_count > 0) { + /* see if we need to stop threads */ + n = slow_work_max_threads - + atomic_read(&slow_work_thread_count); + + if (n < 0) + mod_timer(&slow_work_cull_timer, + jiffies + SLOW_WORK_CULL_TIMEOUT); + } + mutex_unlock(&slow_work_user_lock); + } + + return ret; +} +#endif /* CONFIG_SYSCTL */ + /** * slow_work_register_user - Register a user of the facility * @@ -516,8 +626,12 @@ static int __init init_slow_work(void) { unsigned nr_cpus = num_possible_cpus(); - if (nr_cpus > slow_work_max_threads) + if (slow_work_max_threads < nr_cpus) slow_work_max_threads = nr_cpus; +#ifdef CONFIG_SYSCTL + if (slow_work_max_max_threads < nr_cpus * 2) + slow_work_max_max_threads = nr_cpus * 2; +#endif return 0; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 5ec4543dfc06..82350f8f04f6 100644 --- 
a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -48,6 +48,7 @@ #include #include #include +#include #include #include @@ -897,6 +898,14 @@ static struct ctl_table kern_table[] = { .proc_handler = &scan_unevictable_handler, }, #endif +#ifdef CONFIG_SLOW_WORK + { + .ctl_name = CTL_UNNUMBERED, + .procname = "slow-work", + .mode = 0555, + .child = slow_work_sysctls, + }, +#endif /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt -- cgit v1.2.3 From 8f0aa2f25b31ba27db84259141e52ee6ec0d2820 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 3 Apr 2009 16:42:35 +0100 Subject: Document the slow work thread pool Document the slow work thread pool. Signed-off-by: David Howells Acked-by: Steve Dickson Acked-by: Trond Myklebust Acked-by: Al Viro Tested-by: Daire Byrne --- Documentation/slow-work.txt | 174 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/slow-work.h | 2 + kernel/slow-work.c | 2 + 3 files changed, 178 insertions(+) create mode 100644 Documentation/slow-work.txt (limited to 'include/linux') diff --git a/Documentation/slow-work.txt b/Documentation/slow-work.txt new file mode 100644 index 000000000000..ebc50f808ea4 --- /dev/null +++ b/Documentation/slow-work.txt @@ -0,0 +1,174 @@ + ==================================== + SLOW WORK ITEM EXECUTION THREAD POOL + ==================================== + +By: David Howells + +The slow work item execution thread pool is a pool of threads for performing +things that take a relatively long time, such as making mkdir calls. +Typically, when processing something, these items will spend a lot of time +blocking a thread on I/O, thus making that thread unavailable for doing other +work. + +The standard workqueue model is unsuitable for this class of work item as that +limits the owner to a single thread or a single thread per CPU. For some +tasks, however, more threads - or fewer - are required. + +There is just one pool per system. It contains no threads unless something +wants to use it - and that something must register its interest first. When +the pool is active, the number of threads it contains is dynamic, varying +between a maximum and minimum setting, depending on the load. + + +==================== +CLASSES OF WORK ITEM +==================== + +This pool support two classes of work items: + + (*) Slow work items. + + (*) Very slow work items. + +The former are expected to finish much quicker than the latter. + +An operation of the very slow class may do a batch combination of several +lookups, mkdirs, and a create for instance. + +An operation of the ordinarily slow class may, for example, write stuff or +expand files, provided the time taken to do so isn't too long. + +Operations of both types may sleep during execution, thus tying up the thread +loaned to it. + + +THREAD-TO-CLASS ALLOCATION +-------------------------- + +Not all the threads in the pool are available to work on very slow work items. +The number will be between one and one fewer than the number of active threads. +This is configurable (see the "Pool Configuration" section). + +All the threads are available to work on ordinarily slow work items, but a +percentage of the threads will prefer to work on very slow work items. + +The configuration ensures that at least one thread will be available to work on +very slow work items, and at least one thread will be available that won't work +on very slow work items at all. 
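As a worked illustration of that bound, using the defaults set in kernel/slow-work.c above: with four active threads and vslow-percentage left at 50, slow_work_calc_vsmax() computes 4 * 50 / 100 = 2 and clamps the result to the range from 1 to one fewer than max-threads (here 1..3), so at most two of the four threads may be executing very slow items at any one time.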
+ + +===================== +USING SLOW WORK ITEMS +===================== + +Firstly, a module or subsystem wanting to make use of slow work items must +register its interest: + + int ret = slow_work_register_user(); + +This will return 0 if successful, or a -ve error upon failure. + + +Slow work items may then be set up by: + + (1) Declaring a slow_work struct type variable: + + #include + + struct slow_work myitem; + + (2) Declaring the operations to be used for this item: + + struct slow_work_ops myitem_ops = { + .get_ref = myitem_get_ref, + .put_ref = myitem_put_ref, + .execute = myitem_execute, + }; + + [*] For a description of the ops, see section "Item Operations". + + (3) Initialising the item: + + slow_work_init(&myitem, &myitem_ops); + + or: + + vslow_work_init(&myitem, &myitem_ops); + + depending on its class. + +A suitably set up work item can then be enqueued for processing: + + int ret = slow_work_enqueue(&myitem); + +This will return a -ve error if the thread pool is unable to gain a reference +on the item, 0 otherwise. + + +The items are reference counted, so there ought to be no need for a flush +operation. When all a module's slow work items have been processed, and the +module has no further interest in the facility, it should unregister its +interest: + + slow_work_unregister_user(); + + +=============== +ITEM OPERATIONS +=============== + +Each work item requires a table of operations of type struct slow_work_ops. +All members are required: + + (*) Get a reference on an item: + + int (*get_ref)(struct slow_work *work); + + This allows the thread pool to attempt to pin an item by getting a + reference on it. This function should return 0 if the reference was + granted, or a -ve error otherwise. If an error is returned, + slow_work_enqueue() will fail. + + The reference is held whilst the item is queued and whilst it is being + executed. The item may then be requeued with the same reference held, or + the reference will be released. + + (*) Release a reference on an item: + + void (*put_ref)(struct slow_work *work); + + This allows the thread pool to unpin an item by releasing the reference on + it. The thread pool will not touch the item again once this has been + called. + + (*) Execute an item: + + void (*execute)(struct slow_work *work); + + This should perform the work required of the item. It may sleep, it may + perform disk I/O and it may wait for locks. + + +================== +POOL CONFIGURATION +================== + +The slow-work thread pool has a number of configurables: + + (*) /proc/sys/kernel/slow-work/min-threads + + The minimum number of threads that should be in the pool whilst it is in + use. This may be anywhere between 2 and max-threads. + + (*) /proc/sys/kernel/slow-work/max-threads + + The maximum number of threads that should in the pool. This may be + anywhere between min-threads and 255 or NR_CPUS * 2, whichever is greater. + + (*) /proc/sys/kernel/slow-work/vslow-percentage + + The percentage of active threads in the pool that may be used to execute + very slow work items. This may be between 1 and 99. The resultant number + is bounded to between 1 and one fewer than the number of active threads. + This ensures there is always at least one thread that can process very + slow work items, and always at least one thread that won't. 
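Pulling the steps above together, a minimal user of this facility might look like the sketch below. The "myfs_lookup" object and its kref-based lifetime handling are purely illustrative; only the slow_work_* calls and the shape of the operations table are taken from the slow-work.h interface added by this patch.

	#include <linux/kref.h>
	#include <linux/slab.h>
	#include <linux/slow-work.h>

	struct myfs_lookup {
		struct kref		ref;
		struct slow_work	work;
		/* ... per-request state (path, results, ...) ... */
	};

	static void myfs_lookup_free(struct kref *ref)
	{
		kfree(container_of(ref, struct myfs_lookup, ref));
	}

	/* pin the item for the thread pool; returning 0 grants the reference */
	static int myfs_lookup_get_ref(struct slow_work *work)
	{
		kref_get(&container_of(work, struct myfs_lookup, work)->ref);
		return 0;
	}

	static void myfs_lookup_put_ref(struct slow_work *work)
	{
		kref_put(&container_of(work, struct myfs_lookup, work)->ref,
			 myfs_lookup_free);
	}

	/* runs in a pool thread; may sleep, do disk I/O and take locks */
	static void myfs_lookup_execute(struct slow_work *work)
	{
		struct myfs_lookup *lookup =
			container_of(work, struct myfs_lookup, work);

		/* ... do the slow path walk / mkdirs / creates here ... */
	}

	static const struct slow_work_ops myfs_lookup_ops = {
		.get_ref	= myfs_lookup_get_ref,
		.put_ref	= myfs_lookup_put_ref,
		.execute	= myfs_lookup_execute,
	};

	/* once per module: register interest, then queue items as they arise */
	ret = slow_work_register_user();
	...
	kref_init(&lookup->ref);
	slow_work_init(&lookup->work, &myfs_lookup_ops);
	ret = slow_work_enqueue(&lookup->work);

Had the item been of the very slow class, vslow_work_init() would be used in place of slow_work_init(); everything else stays the same.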
diff --git a/include/linux/slow-work.h b/include/linux/slow-work.h index 8262809dfa8b..85958277f83d 100644 --- a/include/linux/slow-work.h +++ b/include/linux/slow-work.h @@ -7,6 +7,8 @@ * modify it under the terms of the GNU General Public Licence * as published by the Free Software Foundation; either version * 2 of the Licence, or (at your option) any later version. + * + * See Documentation/slow-work.txt */ #ifndef _LINUX_SLOW_WORK_H diff --git a/kernel/slow-work.c b/kernel/slow-work.c index 3f65900aa3cb..cf2bc01186ef 100644 --- a/kernel/slow-work.c +++ b/kernel/slow-work.c @@ -7,6 +7,8 @@ * modify it under the terms of the GNU General Public Licence * as published by the Free Software Foundation; either version * 2 of the Licence, or (at your option) any later version. + * + * See Documentation/slow-work.txt */ #include -- cgit v1.2.3 From 03fb3d2af96c2783c3a5bc03f3d984cf422f0e69 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 3 Apr 2009 16:42:35 +0100 Subject: FS-Cache: Release page->private after failed readahead The attached patch causes read_cache_pages() to release page-private data on a page for which add_to_page_cache() fails. If the filler function fails, then the problematic page is left attached to the pagecache (with appropriate flags set, one presumes) and the remaining to-be-attached pages are invalidated and discarded. This permits pages with caching references associated with them to be cleaned up. The invalidatepage() address space op is called (indirectly) to do the honours. Signed-off-by: David Howells Acked-by: Steve Dickson Acked-by: Trond Myklebust Acked-by: Rik van Riel Acked-by: Al Viro Tested-by: Daire Byrne --- include/linux/page-flags.h | 2 +- mm/readahead.c | 39 +++++++++++++++++++++++++++++++++++++-- 2 files changed, 38 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 61df1779b2a5..9d99e7471ade 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -182,7 +182,7 @@ static inline int TestClearPage##uname(struct page *page) { return 0; } struct page; /* forward declaration */ -TESTPAGEFLAG(Locked, locked) +TESTPAGEFLAG(Locked, locked) TESTSETFLAG(Locked, locked) PAGEFLAG(Error, error) PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced) PAGEFLAG(Dirty, dirty) TESTSCFLAG(Dirty, dirty) __CLEARPAGEFLAG(Dirty, dirty) diff --git a/mm/readahead.c b/mm/readahead.c index 9ce303d4b810..6be927569cf6 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -31,6 +31,41 @@ EXPORT_SYMBOL_GPL(file_ra_state_init); #define list_to_page(head) (list_entry((head)->prev, struct page, lru)) +/* + * see if a page needs releasing upon read_cache_pages() failure + * - the caller of read_cache_pages() may have set PG_private before calling, + * such as the NFS fs marking pages that are cached locally on disk, thus we + * need to give the fs a chance to clean up in the event of an error + */ +static void read_cache_pages_invalidate_page(struct address_space *mapping, + struct page *page) +{ + if (PagePrivate(page)) { + if (!trylock_page(page)) + BUG(); + page->mapping = mapping; + do_invalidatepage(page, 0); + page->mapping = NULL; + unlock_page(page); + } + page_cache_release(page); +} + +/* + * release a list of pages, invalidating them first if need be + */ +static void read_cache_pages_invalidate_pages(struct address_space *mapping, + struct list_head *pages) +{ + struct page *victim; + + while (!list_empty(pages)) { + victim = list_to_page(pages); + 
list_del(&victim->lru); + read_cache_pages_invalidate_page(mapping, victim); + } +} + /** * read_cache_pages - populate an address space with some pages & start reads against them * @mapping: the address_space @@ -52,14 +87,14 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages, list_del(&page->lru); if (add_to_page_cache_lru(page, mapping, page->index, GFP_KERNEL)) { - page_cache_release(page); + read_cache_pages_invalidate_page(mapping, page); continue; } page_cache_release(page); ret = filler(data, page); if (unlikely(ret)) { - put_pages_list(pages); + read_cache_pages_invalidate_pages(mapping, pages); break; } task_io_account_read(PAGE_CACHE_SIZE); -- cgit v1.2.3 From 266cf658efcf6ac33541a46740f74f50c79d2b6b Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 3 Apr 2009 16:42:36 +0100 Subject: FS-Cache: Recruit a page flags for cache management Recruit a page flag to aid in cache management. The following extra flag is defined: (1) PG_fscache (PG_private_2) The marked page is backed by a local cache and is pinning resources in the cache driver. If PG_fscache is set, then things that checked for PG_private will now also check for that. This includes things like truncation and page invalidation. The function page_has_private() had been added to make the checks for both PG_private and PG_private_2 at the same time. Signed-off-by: David Howells Acked-by: Steve Dickson Acked-by: Trond Myklebust Acked-by: Rik van Riel Acked-by: Al Viro Tested-by: Daire Byrne --- fs/splice.c | 3 ++- include/linux/page-flags.h | 38 +++++++++++++++++++++++++++++++++----- mm/filemap.c | 3 +++ mm/migrate.c | 10 +++++----- mm/readahead.c | 9 +++++---- mm/swap.c | 4 ++-- mm/truncate.c | 10 +++++----- mm/vmscan.c | 6 +++--- 8 files changed, 58 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/fs/splice.c b/fs/splice.c index 4ed0ba44a966..dd727d43e5b7 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -59,7 +59,8 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe, */ wait_on_page_writeback(page); - if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL)) + if (page_has_private(page) && + !try_to_release_page(page, GFP_KERNEL)) goto out_unlock; /* diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 9d99e7471ade..62214c7d2d93 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -82,6 +82,7 @@ enum pageflags { PG_arch_1, PG_reserved, PG_private, /* If pagecache, has fs-private data */ + PG_private_2, /* If pagecache, has fs aux data */ PG_writeback, /* Page is under writeback */ #ifdef CONFIG_PAGEFLAGS_EXTENDED PG_head, /* A head page */ @@ -108,6 +109,12 @@ enum pageflags { /* Filesystems */ PG_checked = PG_owner_priv_1, + /* Two page bits are conscripted by FS-Cache to maintain local caching + * state. These bits are set on pages belonging to the netfs's inodes + * when those inodes are being locally cached. 
+ */ + PG_fscache = PG_private_2, /* page backed by cache */ + /* XEN */ PG_pinned = PG_owner_priv_1, PG_savepinned = PG_dirty, @@ -194,8 +201,6 @@ PAGEFLAG(Checked, checked) /* Used by some filesystems */ PAGEFLAG(Pinned, pinned) TESTSCFLAG(Pinned, pinned) /* Xen */ PAGEFLAG(SavePinned, savepinned); /* Xen */ PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved) -PAGEFLAG(Private, private) __CLEARPAGEFLAG(Private, private) - __SETPAGEFLAG(Private, private) PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked) __PAGEFLAG(SlobPage, slob_page) @@ -204,6 +209,16 @@ __PAGEFLAG(SlobFree, slob_free) __PAGEFLAG(SlubFrozen, slub_frozen) __PAGEFLAG(SlubDebug, slub_debug) +/* + * Private page markings that may be used by the filesystem that owns the page + * for its own purposes. + * - PG_private and PG_private_2 cause releasepage() and co to be invoked + */ +PAGEFLAG(Private, private) __SETPAGEFLAG(Private, private) + __CLEARPAGEFLAG(Private, private) +PAGEFLAG(Private2, private_2) TESTSCFLAG(Private2, private_2) +PAGEFLAG(OwnerPriv1, owner_priv_1) TESTCLEARFLAG(OwnerPriv1, owner_priv_1) + /* * Only test-and-set exist for PG_writeback. The unconditional operators are * risky: they bypass page accounting. @@ -384,9 +399,10 @@ static inline void __ClearPageTail(struct page *page) * these flags set. It they are, there is a problem. */ #define PAGE_FLAGS_CHECK_AT_FREE \ - (1 << PG_lru | 1 << PG_private | 1 << PG_locked | \ - 1 << PG_buddy | 1 << PG_writeback | 1 << PG_reserved | \ - 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \ + (1 << PG_lru | 1 << PG_locked | \ + 1 << PG_private | 1 << PG_private_2 | \ + 1 << PG_buddy | 1 << PG_writeback | 1 << PG_reserved | \ + 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \ __PG_UNEVICTABLE | __PG_MLOCKED) /* @@ -397,4 +413,16 @@ static inline void __ClearPageTail(struct page *page) #define PAGE_FLAGS_CHECK_AT_PREP ((1 << NR_PAGEFLAGS) - 1) #endif /* !__GENERATING_BOUNDS_H */ + +/** + * page_has_private - Determine if page has private stuff + * @page: The page to be checked + * + * Determine if a page has private stuff, indicating that release routines + * should be invoked upon it. + */ +#define page_has_private(page) \ + ((page)->flags & ((1 << PG_private) | \ + (1 << PG_private_2))) + #endif /* PAGE_FLAGS_H */ diff --git a/mm/filemap.c b/mm/filemap.c index 126d3973b3d1..cbc5772e7171 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2463,6 +2463,9 @@ EXPORT_SYMBOL(generic_file_aio_write); * (presumably at page->private). If the release was successful, return `1'. * Otherwise return zero. * + * This may also be called if PG_fscache is set on a page, indicating that the + * page is known to the local caching routines. + * * The @gfp_mask argument specifies whether I/O may be performed to release * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS). * diff --git a/mm/migrate.c b/mm/migrate.c index a9eff3f092f6..068655d8f883 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -250,7 +250,7 @@ out: * The number of remaining references must be: * 1 for anonymous pages without a mapping * 2 for pages with a mapping - * 3 for pages with a mapping and PagePrivate set. + * 3 for pages with a mapping and PagePrivate/PagePrivate2 set. 
*/ static int migrate_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page) @@ -270,7 +270,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, pslot = radix_tree_lookup_slot(&mapping->page_tree, page_index(page)); - expected_count = 2 + !!PagePrivate(page); + expected_count = 2 + !!page_has_private(page); if (page_count(page) != expected_count || (struct page *)radix_tree_deref_slot(pslot) != page) { spin_unlock_irq(&mapping->tree_lock); @@ -386,7 +386,7 @@ EXPORT_SYMBOL(fail_migrate_page); /* * Common logic to directly migrate a single page suitable for - * pages that do not use PagePrivate. + * pages that do not use PagePrivate/PagePrivate2. * * Pages are locked upon entry and exit. */ @@ -522,7 +522,7 @@ static int fallback_migrate_page(struct address_space *mapping, * Buffers may be managed in a filesystem specific way. * We must have no buffers or drop them. */ - if (PagePrivate(page) && + if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) return -EAGAIN; @@ -655,7 +655,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, * free the metadata, so the page can be freed. */ if (!page->mapping) { - if (!PageAnon(page) && PagePrivate(page)) { + if (!PageAnon(page) && page_has_private(page)) { /* * Go direct to try_to_free_buffers() here because * a) that's what try_to_release_page() would do anyway diff --git a/mm/readahead.c b/mm/readahead.c index 6be927569cf6..133b6d525513 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -33,14 +33,15 @@ EXPORT_SYMBOL_GPL(file_ra_state_init); /* * see if a page needs releasing upon read_cache_pages() failure - * - the caller of read_cache_pages() may have set PG_private before calling, - * such as the NFS fs marking pages that are cached locally on disk, thus we - * need to give the fs a chance to clean up in the event of an error + * - the caller of read_cache_pages() may have set PG_private or PG_fscache + * before calling, such as the NFS fs marking pages that are cached locally + * on disk, thus we need to give the fs a chance to clean up in the event of + * an error */ static void read_cache_pages_invalidate_page(struct address_space *mapping, struct page *page) { - if (PagePrivate(page)) { + if (page_has_private(page)) { if (!trylock_page(page)) BUG(); page->mapping = mapping; diff --git a/mm/swap.c b/mm/swap.c index 6e83084c1f6c..bede23ce64ea 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -448,8 +448,8 @@ void pagevec_strip(struct pagevec *pvec) for (i = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; - if (PagePrivate(page) && trylock_page(page)) { - if (PagePrivate(page)) + if (page_has_private(page) && trylock_page(page)) { + if (page_has_private(page)) try_to_release_page(page, 0); unlock_page(page); } diff --git a/mm/truncate.c b/mm/truncate.c index 1229211104f8..55206fab7b99 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -50,7 +50,7 @@ void do_invalidatepage(struct page *page, unsigned long offset) static inline void truncate_partial_page(struct page *page, unsigned partial) { zero_user_segment(page, partial, PAGE_CACHE_SIZE); - if (PagePrivate(page)) + if (page_has_private(page)) do_invalidatepage(page, partial); } @@ -99,7 +99,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page) if (page->mapping != mapping) return; - if (PagePrivate(page)) + if (page_has_private(page)) do_invalidatepage(page, 0); cancel_dirty_page(page, PAGE_CACHE_SIZE); @@ -126,7 +126,7 @@ invalidate_complete_page(struct 
address_space *mapping, struct page *page) if (page->mapping != mapping) return 0; - if (PagePrivate(page) && !try_to_release_page(page, 0)) + if (page_has_private(page) && !try_to_release_page(page, 0)) return 0; clear_page_mlock(page); @@ -348,7 +348,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) if (page->mapping != mapping) return 0; - if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL)) + if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) return 0; spin_lock_irq(&mapping->tree_lock); @@ -356,7 +356,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) goto failed; clear_page_mlock(page); - BUG_ON(PagePrivate(page)); + BUG_ON(page_has_private(page)); __remove_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); page_cache_release(page); /* pagecache ref */ diff --git a/mm/vmscan.c b/mm/vmscan.c index 06e72693b458..425244988bb2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -283,7 +283,7 @@ static inline int page_mapping_inuse(struct page *page) static inline int is_page_cache_freeable(struct page *page) { - return page_count(page) - !!PagePrivate(page) == 2; + return page_count(page) - !!page_has_private(page) == 2; } static int may_write_to_queue(struct backing_dev_info *bdi) @@ -367,7 +367,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, * Some data journaling orphaned pages can have * page->mapping == NULL while being dirty with clean buffers. */ - if (PagePrivate(page)) { + if (page_has_private(page)) { if (try_to_free_buffers(page)) { ClearPageDirty(page); printk("%s: orphaned page\n", __func__); @@ -727,7 +727,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, * process address space (page_count == 1) it can be freed. * Otherwise, leave the page on the LRU so it is swappable. */ - if (PagePrivate(page)) { + if (page_has_private(page)) { if (!try_to_release_page(page, sc->gfp_mask)) goto activate_locked; if (!mapping && page_count(page) == 1) { -- cgit v1.2.3 From 2d6fff637037395cc946ef910a880b5fa67b5370 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 3 Apr 2009 16:42:36 +0100 Subject: FS-Cache: Add the FS-Cache netfs API and documentation Add the API for a generic facility (FS-Cache) by which filesystems (such as AFS or NFS) may call on local caching capabilities without having to know anything about how the cache works, or even if there is a cache: +---------+ | | +--------------+ | NFS |--+ | | | | | +-->| CacheFS | +---------+ | +----------+ | | /dev/hda5 | | | | | +--------------+ +---------+ +-->| | | | | | |--+ | AFS |----->| FS-Cache | | | | |--+ +---------+ +-->| | | | | | | +--------------+ +---------+ | +----------+ | | | | | | +-->| CacheFiles | | ISOFS |--+ | /var/cache | | | +--------------+ +---------+ General documentation and documentation of the netfs specific API are provided in addition to the header files. As this patch stands, it is possible to build a filesystem against the facility and attempt to use it. All that will happen is that all requests will be immediately denied as if no cache is present. Further patches will implement the core of the facility. The facility will transfer requests from networking filesystems to appropriate caches if possible, or else gracefully deny them. If this facility is disabled in the kernel configuration, then all its operations will trivially reduce to nothing during compilation. WHY NOT I_MAPPING? 
================== I have added my own API to implement caching rather than using i_mapping to do this for a number of reasons. These have been discussed a lot on the LKML and CacheFS mailing lists, but to summarise the basics: (1) Most filesystems don't do hole reportage. Holes in files are treated as blocks of zeros and can't be distinguished otherwise, making it difficult to distinguish blocks that have been read from the network and cached from those that haven't. (2) The backing inode must be fully populated before being exposed to userspace through the main inode because the VM/VFS goes directly to the backing inode and does not interrogate the front inode's VM ops. Therefore: (a) The backing inode must fit entirely within the cache. (b) All backed files currently open must fit entirely within the cache at the same time. (c) A working set of files in total larger than the cache may not be cached. (d) A file may not grow larger than the available space in the cache. (e) A file that's open and cached, and remotely grows larger than the cache is potentially stuffed. (3) Writes go to the backing filesystem, and can only be transferred to the network when the file is closed. (4) There's no record of what changes have been made, so the whole file must be written back. (5) The pages belong to the backing filesystem, and all metadata associated with that page are relevant only to the backing filesystem, and not anything stacked atop it. OVERVIEW ======== FS-Cache provides (or will provide) the following facilities: (1) Caches can be added / removed at any time, even whilst in use. (2) Adds a facility by which tags can be used to refer to caches, even if they're not available yet. (3) More than one cache can be used at once. Caches can be selected explicitly by use of tags. (4) The netfs is provided with an interface that allows either party to withdraw caching facilities from a file (required for (1)). (5) A netfs may annotate cache objects that belongs to it. This permits the storage of coherency maintenance data. (6) Cache objects will be pinnable and space reservations will be possible. (7) The interface to the netfs returns as few errors as possible, preferring rather to let the netfs remain oblivious. (8) Cookies are used to represent indices, files and other objects to the netfs. The simplest cookie is just a NULL pointer - indicating nothing cached there. (9) The netfs is allowed to propose - dynamically - any index hierarchy it desires, though it must be aware that the index search function is recursive, stack space is limited, and indices can only be children of indices. (10) Indices can be used to group files together to reduce key size and to make group invalidation easier. The use of indices may make lookup quicker, but that's cache dependent. (11) Data I/O is effectively done directly to and from the netfs's pages. The netfs indicates that page A is at index B of the data-file represented by cookie C, and that it should be read or written. The cache backend may or may not start I/O on that page, but if it does, a netfs callback will be invoked to indicate completion. The I/O may be either synchronous or asynchronous. (12) Cookies can be "retired" upon release. At this point FS-Cache will mark them as obsolete and the index hierarchy rooted at that point will get recycled. (13) The netfs provides a "match" function for index searches. In addition to saying whether a match was made or not, this can also specify that an entry should be updated or deleted. 
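To give a flavour of the netfs-facing side of this, hooking a filesystem in comes down to describing itself and registering. The "myfs" below is a hypothetical netfs, and the registration helpers named here are assumed to be the ones declared by the new <linux/fscache.h> header (see "Network filesystem (un)registration" in the netfs API document added below).

	#include <linux/fscache.h>

	static struct fscache_netfs myfs_cache_netfs = {
		.version	= 0,
		.name		= "myfs",
	};

	/* at module init / first mount; failure just means running uncached */
	ret = fscache_register_netfs(&myfs_cache_netfs);

	/* at module exit / last unmount */
	fscache_unregister_netfs(&myfs_cache_netfs);

On a successful registration the primary_index cookie in the structure is filled in by FS-Cache and becomes the parent under which the netfs acquires cookies for its own index hierarchy.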
FS-Cache maintains a virtual index tree in which all indices, files, objects and pages are kept. Bits of this tree may actually reside in one or more caches. FSDEF | +------------------------------------+ | | NFS AFS | | +--------------------------+ +-----------+ | | | | homedir mirror afs.org redhat.com | | | +------------+ +---------------+ +----------+ | | | | | | 00001 00002 00007 00125 vol00001 vol00002 | | | | | +---+---+ +-----+ +---+ +------+------+ +-----+----+ | | | | | | | | | | | | | PG0 PG1 PG2 PG0 XATTR PG0 PG1 DIRENT DIRENT DIRENT R/W R/O Bak | | PG0 +-------+ | | 00001 00003 | +---+---+ | | | PG0 PG1 PG2 In the example above, two netfs's can be seen to be backed: NFS and AFS. These have different index hierarchies: (*) The NFS primary index will probably contain per-server indices. Each server index is indexed by NFS file handles to get data file objects. Each data file objects can have an array of pages, but may also have further child objects, such as extended attributes and directory entries. Extended attribute objects themselves have page-array contents. (*) The AFS primary index contains per-cell indices. Each cell index contains per-logical-volume indices. Each of volume index contains up to three indices for the read-write, read-only and backup mirrors of those volumes. Each of these contains vnode data file objects, each of which contains an array of pages. The very top index is the FS-Cache master index in which individual netfs's have entries. Any index object may reside in more than one cache, provided it only has index children. Any index with non-index object children will be assumed to only reside in one cache. The FS-Cache overview can be found in: Documentation/filesystems/caching/fscache.txt The netfs API to FS-Cache can be found in: Documentation/filesystems/caching/netfs-api.txt Signed-off-by: David Howells Acked-by: Steve Dickson Acked-by: Trond Myklebust Acked-by: Al Viro Tested-by: Daire Byrne --- Documentation/filesystems/caching/fscache.txt | 330 ++++++++++ Documentation/filesystems/caching/netfs-api.txt | 778 ++++++++++++++++++++++++ include/linux/fscache.h | 548 +++++++++++++++++ 3 files changed, 1656 insertions(+) create mode 100644 Documentation/filesystems/caching/fscache.txt create mode 100644 Documentation/filesystems/caching/netfs-api.txt create mode 100644 include/linux/fscache.h (limited to 'include/linux') diff --git a/Documentation/filesystems/caching/fscache.txt b/Documentation/filesystems/caching/fscache.txt new file mode 100644 index 000000000000..a759d916273e --- /dev/null +++ b/Documentation/filesystems/caching/fscache.txt @@ -0,0 +1,330 @@ + ========================== + General Filesystem Caching + ========================== + +======== +OVERVIEW +======== + +This facility is a general purpose cache for network filesystems, though it +could be used for caching other things such as ISO9660 filesystems too. 
+ +FS-Cache mediates between cache backends (such as CacheFS) and network +filesystems: + + +---------+ + | | +--------------+ + | NFS |--+ | | + | | | +-->| CacheFS | + +---------+ | +----------+ | | /dev/hda5 | + | | | | +--------------+ + +---------+ +-->| | | + | | | |--+ + | AFS |----->| FS-Cache | + | | | |--+ + +---------+ +-->| | | + | | | | +--------------+ + +---------+ | +----------+ | | | + | | | +-->| CacheFiles | + | ISOFS |--+ | /var/cache | + | | +--------------+ + +---------+ + +Or to look at it another way, FS-Cache is a module that provides a caching +facility to a network filesystem such that the cache is transparent to the +user: + + +---------+ + | | + | Server | + | | + +---------+ + | NETWORK + ~~~~~|~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + | + | +----------+ + V | | + +---------+ | | + | | | | + | NFS |----->| FS-Cache | + | | | |--+ + +---------+ | | | +--------------+ +--------------+ + | | | | | | | | + V +----------+ +-->| CacheFiles |-->| Ext3 | + +---------+ | /var/cache | | /dev/sda6 | + | | +--------------+ +--------------+ + | VFS | ^ ^ + | | | | + +---------+ +--------------+ | + | KERNEL SPACE | | + ~~~~~|~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~|~~~~~~|~~~~ + | USER SPACE | | + V | | + +---------+ +--------------+ + | | | | + | Process | | cachefilesd | + | | | | + +---------+ +--------------+ + + +FS-Cache does not follow the idea of completely loading every netfs file +opened in its entirety into a cache before permitting it to be accessed and +then serving the pages out of that cache rather than the netfs inode because: + + (1) It must be practical to operate without a cache. + + (2) The size of any accessible file must not be limited to the size of the + cache. + + (3) The combined size of all opened files (this includes mapped libraries) + must not be limited to the size of the cache. + + (4) The user should not be forced to download an entire file just to do a + one-off access of a small portion of it (such as might be done with the + "file" program). + +It instead serves the cache out in PAGE_SIZE chunks as and when requested by +the netfs('s) using it. + + +FS-Cache provides the following facilities: + + (1) More than one cache can be used at once. Caches can be selected + explicitly by use of tags. + + (2) Caches can be added / removed at any time. + + (3) The netfs is provided with an interface that allows either party to + withdraw caching facilities from a file (required for (2)). + + (4) The interface to the netfs returns as few errors as possible, preferring + rather to let the netfs remain oblivious. + + (5) Cookies are used to represent indices, files and other objects to the + netfs. The simplest cookie is just a NULL pointer - indicating nothing + cached there. + + (6) The netfs is allowed to propose - dynamically - any index hierarchy it + desires, though it must be aware that the index search function is + recursive, stack space is limited, and indices can only be children of + indices. + + (7) Data I/O is done direct to and from the netfs's pages. The netfs + indicates that page A is at index B of the data-file represented by cookie + C, and that it should be read or written. The cache backend may or may + not start I/O on that page, but if it does, a netfs callback will be + invoked to indicate completion. The I/O may be either synchronous or + asynchronous. + + (8) Cookies can be "retired" upon release. 
At this point FS-Cache will mark + them as obsolete and the index hierarchy rooted at that point will get + recycled. + + (9) The netfs provides a "match" function for index searches. In addition to + saying whether a match was made or not, this can also specify that an + entry should be updated or deleted. + +(10) As much as possible is done asynchronously. + + +FS-Cache maintains a virtual indexing tree in which all indices, files, objects +and pages are kept. Bits of this tree may actually reside in one or more +caches. + + FSDEF + | + +------------------------------------+ + | | + NFS AFS + | | + +--------------------------+ +-----------+ + | | | | + homedir mirror afs.org redhat.com + | | | + +------------+ +---------------+ +----------+ + | | | | | | + 00001 00002 00007 00125 vol00001 vol00002 + | | | | | + +---+---+ +-----+ +---+ +------+------+ +-----+----+ + | | | | | | | | | | | | | +PG0 PG1 PG2 PG0 XATTR PG0 PG1 DIRENT DIRENT DIRENT R/W R/O Bak + | | + PG0 +-------+ + | | + 00001 00003 + | + +---+---+ + | | | + PG0 PG1 PG2 + +In the example above, you can see two netfs's being backed: NFS and AFS. These +have different index hierarchies: + + (*) The NFS primary index contains per-server indices. Each server index is + indexed by NFS file handles to get data file objects. Each data file + objects can have an array of pages, but may also have further child + objects, such as extended attributes and directory entries. Extended + attribute objects themselves have page-array contents. + + (*) The AFS primary index contains per-cell indices. Each cell index contains + per-logical-volume indices. Each of volume index contains up to three + indices for the read-write, read-only and backup mirrors of those volumes. + Each of these contains vnode data file objects, each of which contains an + array of pages. + +The very top index is the FS-Cache master index in which individual netfs's +have entries. + +Any index object may reside in more than one cache, provided it only has index +children. Any index with non-index object children will be assumed to only +reside in one cache. + + +The netfs API to FS-Cache can be found in: + + Documentation/filesystems/caching/netfs-api.txt + +The cache backend API to FS-Cache can be found in: + + Documentation/filesystems/caching/backend-api.txt + + +======================= +STATISTICAL INFORMATION +======================= + +If FS-Cache is compiled with the following options enabled: + + CONFIG_FSCACHE_PROC=y (implied by the following two) + CONFIG_FSCACHE_STATS=y + CONFIG_FSCACHE_HISTOGRAM=y + +then it will gather certain statistics and display them through a number of +proc files. 
+ + (*) /proc/fs/fscache/stats + + This shows counts of a number of events that can happen in FS-Cache: + + CLASS EVENT MEANING + ======= ======= ======================================================= + Cookies idx=N Number of index cookies allocated + dat=N Number of data storage cookies allocated + spc=N Number of special cookies allocated + Objects alc=N Number of objects allocated + nal=N Number of object allocation failures + avl=N Number of objects that reached the available state + ded=N Number of objects that reached the dead state + ChkAux non=N Number of objects that didn't have a coherency check + ok=N Number of objects that passed a coherency check + upd=N Number of objects that needed a coherency data update + obs=N Number of objects that were declared obsolete + Pages mrk=N Number of pages marked as being cached + unc=N Number of uncache page requests seen + Acquire n=N Number of acquire cookie requests seen + nul=N Number of acq reqs given a NULL parent + noc=N Number of acq reqs rejected due to no cache available + ok=N Number of acq reqs succeeded + nbf=N Number of acq reqs rejected due to error + oom=N Number of acq reqs failed on ENOMEM + Lookups n=N Number of lookup calls made on cache backends + neg=N Number of negative lookups made + pos=N Number of positive lookups made + crt=N Number of objects created by lookup + Updates n=N Number of update cookie requests seen + nul=N Number of upd reqs given a NULL parent + run=N Number of upd reqs granted CPU time + Relinqs n=N Number of relinquish cookie requests seen + nul=N Number of rlq reqs given a NULL parent + wcr=N Number of rlq reqs waited on completion of creation + AttrChg n=N Number of attribute changed requests seen + ok=N Number of attr changed requests queued + nbf=N Number of attr changed rejected -ENOBUFS + oom=N Number of attr changed failed -ENOMEM + run=N Number of attr changed ops given CPU time + Allocs n=N Number of allocation requests seen + ok=N Number of successful alloc reqs + wt=N Number of alloc reqs that waited on lookup completion + nbf=N Number of alloc reqs rejected -ENOBUFS + ops=N Number of alloc reqs submitted + owt=N Number of alloc reqs waited for CPU time + Retrvls n=N Number of retrieval (read) requests seen + ok=N Number of successful retr reqs + wt=N Number of retr reqs that waited on lookup completion + nod=N Number of retr reqs returned -ENODATA + nbf=N Number of retr reqs rejected -ENOBUFS + int=N Number of retr reqs aborted -ERESTARTSYS + oom=N Number of retr reqs failed -ENOMEM + ops=N Number of retr reqs submitted + owt=N Number of retr reqs waited for CPU time + Stores n=N Number of storage (write) requests seen + ok=N Number of successful store reqs + agn=N Number of store reqs on a page already pending storage + nbf=N Number of store reqs rejected -ENOBUFS + oom=N Number of store reqs failed -ENOMEM + ops=N Number of store reqs submitted + run=N Number of store reqs granted CPU time + Ops pend=N Number of times async ops added to pending queues + run=N Number of times async ops given CPU time + enq=N Number of times async ops queued for processing + dfr=N Number of async ops queued for deferred release + rel=N Number of async ops released + gc=N Number of deferred-release async ops garbage collected + + + (*) /proc/fs/fscache/histogram + + cat /proc/fs/fscache/histogram + +HZ +TIME OBJ INST OP RUNS OBJ RUNS RETRV DLY RETRIEVLS + ===== ===== ========= ========= ========= ========= ========= + + This shows the breakdown of the number of times each amount of time + between 0 
jiffies and HZ-1 jiffies a variety of tasks took to run. The + columns are as follows: + + COLUMN TIME MEASUREMENT + ======= ======================================================= + OBJ INST Length of time to instantiate an object + OP RUNS Length of time a call to process an operation took + OBJ RUNS Length of time a call to process an object event took + RETRV DLY Time between an requesting a read and lookup completing + RETRIEVLS Time between beginning and end of a retrieval + + Each row shows the number of events that took a particular range of times. + Each step is 1 jiffy in size. The +HZ column indicates the particular + jiffy range covered, and the +TIME field the equivalent number of seconds. + + +========= +DEBUGGING +========= + +The FS-Cache facility can have runtime debugging enabled by adjusting the value +in: + + /sys/module/fscache/parameters/debug + +This is a bitmask of debugging streams to enable: + + BIT VALUE STREAM POINT + ======= ======= =============================== ======================= + 0 1 Cache management Function entry trace + 1 2 Function exit trace + 2 4 General + 3 8 Cookie management Function entry trace + 4 16 Function exit trace + 5 32 General + 6 64 Page handling Function entry trace + 7 128 Function exit trace + 8 256 General + 9 512 Operation management Function entry trace + 10 1024 Function exit trace + 11 2048 General + +The appropriate set of values should be OR'd together and the result written to +the control file. For example: + + echo $((1|8|64)) >/sys/module/fscache/parameters/debug + +will turn on all function entry debugging. + diff --git a/Documentation/filesystems/caching/netfs-api.txt b/Documentation/filesystems/caching/netfs-api.txt new file mode 100644 index 000000000000..4db125b3a5c6 --- /dev/null +++ b/Documentation/filesystems/caching/netfs-api.txt @@ -0,0 +1,778 @@ + =============================== + FS-CACHE NETWORK FILESYSTEM API + =============================== + +There's an API by which a network filesystem can make use of the FS-Cache +facilities. This is based around a number of principles: + + (1) Caches can store a number of different object types. There are two main + object types: indices and files. The first is a special type used by + FS-Cache to make finding objects faster and to make retiring of groups of + objects easier. + + (2) Every index, file or other object is represented by a cookie. This cookie + may or may not have anything associated with it, but the netfs doesn't + need to care. + + (3) Barring the top-level index (one entry per cached netfs), the index + hierarchy for each netfs is structured according the whim of the netfs. + +This API is declared in . + +This document contains the following sections: + + (1) Network filesystem definition + (2) Index definition + (3) Object definition + (4) Network filesystem (un)registration + (5) Cache tag lookup + (6) Index registration + (7) Data file registration + (8) Miscellaneous object registration + (9) Setting the data file size + (10) Page alloc/read/write + (11) Page uncaching + (12) Index and data file update + (13) Miscellaneous cookie operations + (14) Cookie unregistration + (15) Index and data file invalidation + (16) FS-Cache specific page flags. + + +============================= +NETWORK FILESYSTEM DEFINITION +============================= + +FS-Cache needs a description of the network filesystem. 
This is specified
+using a record of the following structure:
+
+	struct fscache_netfs {
+		uint32_t			version;
+		const char			*name;
+		struct fscache_cookie		*primary_index;
+		...
+	};
+
+The first two fields should be filled in before registration, and the third
+will be filled in by the registration function; any other fields should just be
+ignored and are for internal use only.
+
+The fields are:
+
+ (1) The name of the netfs (used as the key in the toplevel index).
+
+ (2) The version of the netfs (if the name matches but the version doesn't, the
+     entire in-cache hierarchy for this netfs will be scrapped and begun
+     afresh).
+
+ (3) The cookie representing the primary index will be allocated according to
+     another parameter passed into the registration function.
+
+For example, kAFS (linux/fs/afs/) uses the following definitions to describe
+itself:
+
+	struct fscache_netfs afs_cache_netfs = {
+		.version	= 0,
+		.name		= "afs",
+	};
+
+
+================
+INDEX DEFINITION
+================
+
+Indices are used for two purposes:
+
+ (1) To aid the finding of a file based on a series of keys (such as AFS's
+     "cell", "volume ID", "vnode ID").
+
+ (2) To make it easier to discard a subset of all the files cached based around
+     a particular key - for instance to mirror the removal of an AFS volume.
+
+However, since it's unlikely that any two netfs's are going to want to define
+their index hierarchies in quite the same way, FS-Cache tries to impose as few
+restraints as possible on how an index is structured and where it is placed in
+the tree.  The netfs can even mix indices and data files at the same level, but
+it's not recommended.
+
+Each index entry consists of a key of indeterminate length plus some auxiliary
+data, also of indeterminate length.
+
+There are some limits on indices:
+
+ (1) Any index containing non-index objects should be restricted to a single
+     cache.  Any such objects created within an index will be created in the
+     first cache only.  The cache in which an index is created can be
+     controlled by cache tags (see below).
+
+ (2) The entry data must be atomically journallable, so it is limited to about
+     400 bytes at present.  At least 400 bytes will be available.
+
+ (3) The depth of the index tree should be judged with care as the search
+     function is recursive.  Too many layers will run the kernel out of stack.
+
+
+=================
+OBJECT DEFINITION
+=================
+
+To define an object, a structure of the following type should be filled out:
+
+	struct fscache_cookie_def
+	{
+		uint8_t name[16];
+		uint8_t type;
+
+		struct fscache_cache_tag *(*select_cache)(
+			const void *parent_netfs_data,
+			const void *cookie_netfs_data);
+
+		uint16_t (*get_key)(const void *cookie_netfs_data,
+				    void *buffer,
+				    uint16_t bufmax);
+
+		void (*get_attr)(const void *cookie_netfs_data,
+				 uint64_t *size);
+
+		uint16_t (*get_aux)(const void *cookie_netfs_data,
+				    void *buffer,
+				    uint16_t bufmax);
+
+		enum fscache_checkaux (*check_aux)(void *cookie_netfs_data,
+						   const void *data,
+						   uint16_t datalen);
+
+		void (*get_context)(void *cookie_netfs_data, void *context);
+
+		void (*put_context)(void *cookie_netfs_data, void *context);
+
+		void (*mark_pages_cached)(void *cookie_netfs_data,
+					  struct address_space *mapping,
+					  struct pagevec *cached_pvec);
+
+		void (*now_uncached)(void *cookie_netfs_data);
+	};
+
+This has the following fields:
+
+ (1) The type of the object [mandatory].
+
+     This is one of the following values:
+
+	(*) FSCACHE_COOKIE_TYPE_INDEX
+
+	    This defines an index, which is a special FS-Cache type.
+
+	(*) FSCACHE_COOKIE_TYPE_DATAFILE
+
+	    This defines an ordinary data file.
+
+	(*) Any other value between 2 and 255
+
+	    This defines an extraordinary object such as an XATTR.
+
+ (2) The name of the object type (NUL terminated unless all 16 chars are used)
+     [optional].
+
+ (3) A function to select the cache in which to store an index [optional].
+
+     This function is invoked when an index needs to be instantiated in a cache
+     during the instantiation of a non-index object.  Only the immediate index
+     parent for the non-index object will be queried.  Any indices above that
+     in the hierarchy may be stored in multiple caches.  This function does not
+     need to be supplied for any non-index object or any index that will only
+     have index children.
+
+     If this function is not supplied or if it returns NULL then the first
+     cache in the parent's list will be chosen, or failing that, the first
+     cache in the master list.
+
+ (4) A function to retrieve an object's key from the netfs [mandatory].
+
+     This function will be called with the netfs data that was passed to the
+     cookie acquisition function and the maximum length of key data that it may
+     provide.  It should write the required key data into the given buffer and
+     return the quantity it wrote.
+
+ (5) A function to retrieve attribute data from the netfs [optional].
+
+     This function will be called with the netfs data that was passed to the
+     cookie acquisition function.  It should return the size of the file if
+     this is a data file.  The size may be used to govern how much cache must
+     be reserved for this file in the cache.
+
+     If the function is absent, a file size of 0 is assumed.
+
+ (6) A function to retrieve auxiliary data from the netfs [optional].
+
+     This function will be called with the netfs data that was passed to the
+     cookie acquisition function and the maximum length of auxiliary data that
+     it may provide.  It should write the auxiliary data into the given buffer
+     and return the quantity it wrote.
+
+     If this function is absent, the auxiliary data length will be set to 0.
+
+     The length of the auxiliary data buffer may be dependent on the key
+     length.  A netfs mustn't rely on being able to provide more than 400 bytes
+     for both.
+
+ (7) A function to check the auxiliary data [optional].
+
+     This function will be called to check that a match found in the cache for
+     this object is valid.  For instance with AFS it could check the auxiliary
+     data against the data version number returned by the server to determine
+     whether the index entry in a cache is still valid.
+
+     If this function is absent, it will be assumed that matching objects in a
+     cache are always valid.
+
+     If present, the function should return one of the following values:
+
+	(*) FSCACHE_CHECKAUX_OKAY		- the entry is okay as is
+	(*) FSCACHE_CHECKAUX_NEEDS_UPDATE	- the entry requires update
+	(*) FSCACHE_CHECKAUX_OBSOLETE		- the entry should be deleted
+
+     This function can also be used to extract data from the auxiliary data in
+     the cache and copy it into the netfs's structures.
+
+ (8) A pair of functions to manage contexts for the completion callback
+     [optional].
+
+     The cache read/write functions are passed a context which is then passed
+     to the I/O completion callback function.
To ensure this context remains + valid until after the I/O completion is called, two functions may be + provided: one to get an extra reference on the context, and one to drop a + reference to it. + + If the context is not used or is a type of object that won't go out of + scope, then these functions are not required. These functions are not + required for indices as indices may not contain data. These functions may + be called in interrupt context and so may not sleep. + + (9) A function to mark a page as retaining cache metadata [optional]. + + This is called by the cache to indicate that it is retaining in-memory + information for this page and that the netfs should uncache the page when + it has finished. This does not indicate whether there's data on the disk + or not. Note that several pages at once may be presented for marking. + + The PG_fscache bit is set on the pages before this function would be + called, so the function need not be provided if this is sufficient. + + This function is not required for indices as they're not permitted data. + +(10) A function to unmark all the pages retaining cache metadata [mandatory]. + + This is called by FS-Cache to indicate that a backing store is being + unbound from a cookie and that all the marks on the pages should be + cleared to prevent confusion. Note that the cache will have torn down all + its tracking information so that the pages don't need to be explicitly + uncached. + + This function is not required for indices as they're not permitted data. + + +=================================== +NETWORK FILESYSTEM (UN)REGISTRATION +=================================== + +The first step is to declare the network filesystem to the cache. This also +involves specifying the layout of the primary index (for AFS, this would be the +"cell" level). + +The registration function is: + + int fscache_register_netfs(struct fscache_netfs *netfs); + +It just takes a pointer to the netfs definition. It returns 0 or an error as +appropriate. + +For kAFS, registration is done as follows: + + ret = fscache_register_netfs(&afs_cache_netfs); + +The last step is, of course, unregistration: + + void fscache_unregister_netfs(struct fscache_netfs *netfs); + + +================ +CACHE TAG LOOKUP +================ + +FS-Cache permits the use of more than one cache. To permit particular index +subtrees to be bound to particular caches, the second step is to look up cache +representation tags. This step is optional; it can be left entirely up to +FS-Cache as to which cache should be used. The problem with doing that is that +FS-Cache will always pick the first cache that was registered. + +To get the representation for a named tag: + + struct fscache_cache_tag *fscache_lookup_cache_tag(const char *name); + +This takes a text string as the name and returns a representation of a tag. It +will never return an error. It may return a dummy tag, however, if it runs out +of memory; this will inhibit caching with this tag. + +Any representation so obtained must be released by passing it to this function: + + void fscache_release_cache_tag(struct fscache_cache_tag *tag); + +The tag will be retrieved by FS-Cache when it calls the object definition +operation select_cache(). + + +================== +INDEX REGISTRATION +================== + +The third step is to inform FS-Cache about part of an index hierarchy that can +be used to locate files. 
This is done by requesting a cookie for each index in +the path to the file: + + struct fscache_cookie * + fscache_acquire_cookie(struct fscache_cookie *parent, + const struct fscache_object_def *def, + void *netfs_data); + +This function creates an index entry in the index represented by parent, +filling in the index entry by calling the operations pointed to by def. + +Note that this function never returns an error - all errors are handled +internally. It may, however, return NULL to indicate no cookie. It is quite +acceptable to pass this token back to this function as the parent to another +acquisition (or even to the relinquish cookie, read page and write page +functions - see below). + +Note also that no indices are actually created in a cache until a non-index +object needs to be created somewhere down the hierarchy. Furthermore, an index +may be created in several different caches independently at different times. +This is all handled transparently, and the netfs doesn't see any of it. + +For example, with AFS, a cell would be added to the primary index. This index +entry would have a dependent inode containing a volume location index for the +volume mappings within this cell: + + cell->cache = + fscache_acquire_cookie(afs_cache_netfs.primary_index, + &afs_cell_cache_index_def, + cell); + +Then when a volume location was accessed, it would be entered into the cell's +index and an inode would be allocated that acts as a volume type and hash chain +combination: + + vlocation->cache = + fscache_acquire_cookie(cell->cache, + &afs_vlocation_cache_index_def, + vlocation); + +And then a particular flavour of volume (R/O for example) could be added to +that index, creating another index for vnodes (AFS inode equivalents): + + volume->cache = + fscache_acquire_cookie(vlocation->cache, + &afs_volume_cache_index_def, + volume); + + +====================== +DATA FILE REGISTRATION +====================== + +The fourth step is to request a data file be created in the cache. This is +identical to index cookie acquisition. The only difference is that the type in +the object definition should be something other than index type. + + vnode->cache = + fscache_acquire_cookie(volume->cache, + &afs_vnode_cache_object_def, + vnode); + + +================================= +MISCELLANEOUS OBJECT REGISTRATION +================================= + +An optional step is to request an object of miscellaneous type be created in +the cache. This is almost identical to index cookie acquisition. The only +difference is that the type in the object definition should be something other +than index type. Whilst the parent object could be an index, it's more likely +it would be some other type of object such as a data file. + + xattr->cache = + fscache_acquire_cookie(vnode->cache, + &afs_xattr_cache_object_def, + xattr); + +Miscellaneous objects might be used to store extended attributes or directory +entries for example. + + +========================== +SETTING THE DATA FILE SIZE +========================== + +The fifth step is to set the physical attributes of the file, such as its size. +This doesn't automatically reserve any space in the cache, but permits the +cache to adjust its metadata for data tracking appropriately: + + int fscache_attr_changed(struct fscache_cookie *cookie); + +The cache will return -ENOBUFS if there is no backing cache or if there is no +space to allocate any extra metadata required in the cache. The attributes +will be accessed with the get_attr() cookie definition operation. 
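+
+For illustration only, here is a minimal sketch of how a netfs might issue this
+call after learning of a size change; the my_netfs_inode structure and the
+helper name are hypothetical and not part of the FS-Cache API:
+
+	#include <linux/fscache.h>
+
+	/* Hypothetical netfs inode wrapper carrying an FS-Cache cookie. */
+	struct my_netfs_inode {
+		struct fscache_cookie	*cache;
+		loff_t			size;
+	};
+
+	/* Called by the netfs once it learns that the file's size changed. */
+	static void my_netfs_size_changed(struct my_netfs_inode *ni, loff_t new_size)
+	{
+		ni->size = new_size;
+
+		/* FS-Cache will call back get_attr() to pick up the new size. */
+		if (fscache_attr_changed(ni->cache) < 0) {
+			/* -ENOBUFS: no backing cache or no space for metadata;
+			 * the netfs simply carries on without caching. */
+		}
+	}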
+
+Note that attempts to read or write data pages in the cache over this size may
+be rebuffed with -ENOBUFS.
+
+This operation schedules an attribute adjustment to happen asynchronously at
+some point in the future, and as such, it may happen after the function returns
+to the caller.  The attribute adjustment excludes read and write operations.
+
+
+=====================
+PAGE READ/ALLOC/WRITE
+=====================
+
+And the sixth step is to store and retrieve pages in the cache.  There are
+three functions that are used to do this.
+
+Note:
+
+ (1) A page should not be re-read or re-allocated without uncaching it first.
+
+ (2) A read or allocated page must be uncached when the netfs page is released
+     from the pagecache.
+
+ (3) A page should only be written to the cache if previously read or
+     allocated.
+
+This permits the cache to maintain its page tracking in proper order.
+
+
+PAGE READ
+---------
+
+Firstly, the netfs should ask FS-Cache to examine the caches and read the
+contents cached for a particular page of a particular file if present, or else
+allocate space to store the contents if not:
+
+	typedef
+	void (*fscache_rw_complete_t)(struct page *page,
+				      void *context,
+				      int error);
+
+	int fscache_read_or_alloc_page(struct fscache_cookie *cookie,
+				       struct page *page,
+				       fscache_rw_complete_t end_io_func,
+				       void *context,
+				       gfp_t gfp);
+
+The cookie argument must specify a cookie for an object that isn't an index,
+the page specified will have the data loaded into it (and is also used to
+specify the page number), and the gfp argument is used to control how any
+memory allocations made are satisfied.
+
+If the cookie indicates the inode is not cached:
+
+ (1) The function will return -ENOBUFS.
+
+Else if there's a copy of the page resident in the cache:
+
+ (1) The mark_pages_cached() cookie operation will be called on that page.
+
+ (2) The function will submit a request to read the data from the cache's
+     backing device directly into the page specified.
+
+ (3) The function will return 0.
+
+ (4) When the read is complete, end_io_func() will be invoked with:
+
+     (*) The netfs data supplied when the cookie was created.
+
+     (*) The page descriptor.
+
+     (*) The context argument passed to the above function.  This will be
+         maintained with the get_context/put_context functions mentioned above.
+
+     (*) An argument that's 0 on success or negative for an error code.
+
+     If an error occurs, it should be assumed that the page contains no usable
+     data.
+
+     end_io_func() will be called in process context if the read results in
+     an error, but it might be called in interrupt context if the read is
+     successful.
+
+Otherwise, if there's not a copy available in cache, but the cache may be able
+to store the page:
+
+ (1) The mark_pages_cached() cookie operation will be called on that page.
+
+ (2) A block may be reserved in the cache and attached to the object at the
+     appropriate place.
+
+ (3) The function will return -ENODATA.
+
+This function may also return -ENOMEM or -EINTR, in which case it won't have
+read any data from the cache.
+
+
+PAGE ALLOCATE
+-------------
+
+Alternatively, if there's not expected to be any data in the cache for a page
+because the file has been extended, a block can simply be allocated instead:
+
+	int fscache_alloc_page(struct fscache_cookie *cookie,
+			       struct page *page,
+			       gfp_t gfp);
+
+This is similar to the fscache_read_or_alloc_page() function, except that it
+never reads from the cache.
It will return 0 if a block has been allocated, +rather than -ENODATA as the other would. One or the other must be performed +before writing to the cache. + +The mark_pages_cached() cookie operation will be called on the page if +successful. + + +PAGE WRITE +---------- + +Secondly, if the netfs changes the contents of the page (either due to an +initial download or if a user performs a write), then the page should be +written back to the cache: + + int fscache_write_page(struct fscache_cookie *cookie, + struct page *page, + gfp_t gfp); + +The cookie argument must specify a data file cookie, the page specified should +contain the data to be written (and is also used to specify the page number), +and the gfp argument is used to control how any memory allocations made are +satisfied. + +The page must have first been read or allocated successfully and must not have +been uncached before writing is performed. + +If the cookie indicates the inode is not cached then: + + (1) The function will return -ENOBUFS. + +Else if space can be allocated in the cache to hold this page: + + (1) PG_fscache_write will be set on the page. + + (2) The function will submit a request to write the data to cache's backing + device directly from the page specified. + + (3) The function will return 0. + + (4) When the write is complete PG_fscache_write is cleared on the page and + anyone waiting for that bit will be woken up. + +Else if there's no space available in the cache, -ENOBUFS will be returned. It +is also possible for the PG_fscache_write bit to be cleared when no write took +place if unforeseen circumstances arose (such as a disk error). + +Writing takes place asynchronously. + + +MULTIPLE PAGE READ +------------------ + +A facility is provided to read several pages at once, as requested by the +readpages() address space operation: + + int fscache_read_or_alloc_pages(struct fscache_cookie *cookie, + struct address_space *mapping, + struct list_head *pages, + int *nr_pages, + fscache_rw_complete_t end_io_func, + void *context, + gfp_t gfp); + +This works in a similar way to fscache_read_or_alloc_page(), except: + + (1) Any page it can retrieve data for is removed from pages and nr_pages and + dispatched for reading to the disk. Reads of adjacent pages on disk may + be merged for greater efficiency. + + (2) The mark_pages_cached() cookie operation will be called on several pages + at once if they're being read or allocated. + + (3) If there was an general error, then that error will be returned. + + Else if some pages couldn't be allocated or read, then -ENOBUFS will be + returned. + + Else if some pages couldn't be read but were allocated, then -ENODATA will + be returned. + + Otherwise, if all pages had reads dispatched, then 0 will be returned, the + list will be empty and *nr_pages will be 0. + + (4) end_io_func will be called once for each page being read as the reads + complete. It will be called in process context if error != 0, but it may + be called in interrupt context if there is no error. + +Note that a return of -ENODATA, -ENOBUFS or any other error does not preclude +some of the pages being read and some being allocated. Those pages will have +been marked appropriately and will need uncaching. + + +============== +PAGE UNCACHING +============== + +To uncache a page, this function should be called: + + void fscache_uncache_page(struct fscache_cookie *cookie, + struct page *page); + +This function permits the cache to release any in-memory representation it +might be holding for this netfs page. 
This function must be called once for
+each page on which the read or write page functions above have been called to
+make sure the cache's in-memory tracking information gets torn down.
+
+Note that pages can't be explicitly deleted from a data file.  The whole
+data file must be retired (see the relinquish cookie function below).
+
+Furthermore, note that this does not cancel the asynchronous read or write
+operation started by the read/alloc and write functions, so the page
+invalidation and release functions must use:
+
+	bool fscache_check_page_write(struct fscache_cookie *cookie,
+				      struct page *page);
+
+to see if a page is being written to the cache, and:
+
+	void fscache_wait_on_page_write(struct fscache_cookie *cookie,
+					struct page *page);
+
+to wait for it to finish if it is.
+
+
+==========================
+INDEX AND DATA FILE UPDATE
+==========================
+
+To request an update of the index data for an index or other object, the
+following function should be called:
+
+	void fscache_update_cookie(struct fscache_cookie *cookie);
+
+This function will refer back to the netfs_data pointer stored in the cookie by
+the acquisition function to obtain the data to write into each revised index
+entry.  The update method in the parent index definition will be called to
+transfer the data.
+
+Note that partial updates may happen automatically at other times, such as when
+data blocks are added to a data file object.
+
+
+===============================
+MISCELLANEOUS COOKIE OPERATIONS
+===============================
+
+There are a number of operations that can be used to control cookies:
+
+ (*) Cookie pinning:
+
+	int fscache_pin_cookie(struct fscache_cookie *cookie);
+	void fscache_unpin_cookie(struct fscache_cookie *cookie);
+
+     These operations permit data cookies to be pinned into the cache and to
+     have the pinning removed.  They are not permitted on index cookies.
+
+     The pinning function will return 0 if successful, -ENOBUFS if the cookie
+     isn't backed by a cache, -EOPNOTSUPP if the cache doesn't support pinning,
+     -ENOSPC if there isn't enough space to honour the operation, -ENOMEM or
+     -EIO if there's any other problem.
+
+ (*) Data space reservation:
+
+	int fscache_reserve_space(struct fscache_cookie *cookie, loff_t size);
+
+     This permits a netfs to request cache space be reserved to store up to the
+     given amount of a file.  It is permitted to ask for more than the current
+     size of the file to allow for future file expansion.
+
+     If size is given as zero then the reservation will be cancelled.
+
+     The function will return 0 if successful, -ENOBUFS if the cookie isn't
+     backed by a cache, -EOPNOTSUPP if the cache doesn't support reservations,
+     -ENOSPC if there isn't enough space to honour the operation, -ENOMEM or
+     -EIO if there's any other problem.
+
+     Note that this doesn't pin an object in a cache; it can still be culled to
+     make space if it's not in use.
+
+
+=====================
+COOKIE UNREGISTRATION
+=====================
+
+To get rid of a cookie, this function should be called:
+
+	void fscache_relinquish_cookie(struct fscache_cookie *cookie,
+				       int retire);
+
+If retire is non-zero, then the object will be marked for recycling, and all
+copies of it will be removed from all active caches in which it is present.
+Not only that but all child objects will also be retired.
+
+If retire is zero, then the object may be available again when next the
+acquisition function is called.  Retirement here will overrule the pinning on a
+cookie.
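+
+As an illustrative sketch only (the my_vnode structure and the helper are
+hypothetical, not part of the API), a netfs might drop a data file cookie like
+this, retiring it if the corresponding server object has been deleted:
+
+	#include <linux/fscache.h>
+
+	struct my_vnode {
+		struct fscache_cookie	*cache;		/* per-file cookie */
+	};
+
+	static void my_vnode_drop_cache(struct my_vnode *vnode, int deleted)
+	{
+		/* retire != 0 scraps the cached object and its children;
+		 * retire == 0 merely lets go of it for the time being. */
+		fscache_relinquish_cookie(vnode->cache, deleted ? 1 : 0);
+		vnode->cache = NULL;
+	}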
+ +One very important note - relinquish must NOT be called for a cookie unless all +the cookies for "child" indices, objects and pages have been relinquished +first. + + +================================ +INDEX AND DATA FILE INVALIDATION +================================ + +There is no direct way to invalidate an index subtree or a data file. To do +this, the caller should relinquish and retire the cookie they have, and then +acquire a new one. + + +=========================== +FS-CACHE SPECIFIC PAGE FLAG +=========================== + +FS-Cache makes use of a page flag, PG_private_2, for its own purpose. This is +given the alternative name PG_fscache. + +PG_fscache is used to indicate that the page is known by the cache, and that +the cache must be informed if the page is going to go away. It's an indication +to the netfs that the cache has an interest in this page, where an interest may +be a pointer to it, resources allocated or reserved for it, or I/O in progress +upon it. + +The netfs can use this information in methods such as releasepage() to +determine whether it needs to uncache a page or update it. + +Furthermore, if this bit is set, releasepage() and invalidatepage() operations +will be called on a page to get rid of it, even if PG_private is not set. This +allows caching to attempted on a page before read_cache_pages() to be called +after fscache_read_or_alloc_pages() as the former will try and release pages it +was given under certain circumstances. + +This bit does not overlap with such as PG_private. This means that FS-Cache +can be used with a filesystem that uses the block buffering code. + +There are a number of operations defined on this flag: + + int PageFsCache(struct page *page); + void SetPageFsCache(struct page *page) + void ClearPageFsCache(struct page *page) + int TestSetPageFsCache(struct page *page) + int TestClearPageFsCache(struct page *page) + +These functions are bit test, bit set, bit clear, bit test and set and bit +test and clear operations on PG_fscache. diff --git a/include/linux/fscache.h b/include/linux/fscache.h new file mode 100644 index 000000000000..feb3b0e0af4d --- /dev/null +++ b/include/linux/fscache.h @@ -0,0 +1,548 @@ +/* General filesystem caching interface + * + * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * NOTE!!! See: + * + * Documentation/filesystems/caching/netfs-api.txt + * + * for a description of the network filesystem interface declared here. 
+ */ + +#ifndef _LINUX_FSCACHE_H +#define _LINUX_FSCACHE_H + +#include +#include +#include +#include + +#if defined(CONFIG_FSCACHE) || defined(CONFIG_FSCACHE_MODULE) +#define fscache_available() (1) +#define fscache_cookie_valid(cookie) (cookie) +#else +#define fscache_available() (0) +#define fscache_cookie_valid(cookie) (0) +#endif + + +/* + * overload PG_private_2 to give us PG_fscache - this is used to indicate that + * a page is currently backed by a local disk cache + */ +#define PageFsCache(page) PagePrivate2((page)) +#define SetPageFsCache(page) SetPagePrivate2((page)) +#define ClearPageFsCache(page) ClearPagePrivate2((page)) +#define TestSetPageFsCache(page) TestSetPagePrivate2((page)) +#define TestClearPageFsCache(page) TestClearPagePrivate2((page)) + +/* pattern used to fill dead space in an index entry */ +#define FSCACHE_INDEX_DEADFILL_PATTERN 0x79 + +struct pagevec; +struct fscache_cache_tag; +struct fscache_cookie; +struct fscache_netfs; + +typedef void (*fscache_rw_complete_t)(struct page *page, + void *context, + int error); + +/* result of index entry consultation */ +enum fscache_checkaux { + FSCACHE_CHECKAUX_OKAY, /* entry okay as is */ + FSCACHE_CHECKAUX_NEEDS_UPDATE, /* entry requires update */ + FSCACHE_CHECKAUX_OBSOLETE, /* entry requires deletion */ +}; + +/* + * fscache cookie definition + */ +struct fscache_cookie_def { + /* name of cookie type */ + char name[16]; + + /* cookie type */ + uint8_t type; +#define FSCACHE_COOKIE_TYPE_INDEX 0 +#define FSCACHE_COOKIE_TYPE_DATAFILE 1 + + /* select the cache into which to insert an entry in this index + * - optional + * - should return a cache identifier or NULL to cause the cache to be + * inherited from the parent if possible or the first cache picked + * for a non-index file if not + */ + struct fscache_cache_tag *(*select_cache)( + const void *parent_netfs_data, + const void *cookie_netfs_data); + + /* get an index key + * - should store the key data in the buffer + * - should return the amount of amount stored + * - not permitted to return an error + * - the netfs data from the cookie being used as the source is + * presented + */ + uint16_t (*get_key)(const void *cookie_netfs_data, + void *buffer, + uint16_t bufmax); + + /* get certain file attributes from the netfs data + * - this function can be absent for an index + * - not permitted to return an error + * - the netfs data from the cookie being used as the source is + * presented + */ + void (*get_attr)(const void *cookie_netfs_data, uint64_t *size); + + /* get the auxilliary data from netfs data + * - this function can be absent if the index carries no state data + * - should store the auxilliary data in the buffer + * - should return the amount of amount stored + * - not permitted to return an error + * - the netfs data from the cookie being used as the source is + * presented + */ + uint16_t (*get_aux)(const void *cookie_netfs_data, + void *buffer, + uint16_t bufmax); + + /* consult the netfs about the state of an object + * - this function can be absent if the index carries no state data + * - the netfs data from the cookie being used as the target is + * presented, as is the auxilliary data + */ + enum fscache_checkaux (*check_aux)(void *cookie_netfs_data, + const void *data, + uint16_t datalen); + + /* get an extra reference on a read context + * - this function can be absent if the completion function doesn't + * require a context + */ + void (*get_context)(void *cookie_netfs_data, void *context); + + /* release an extra reference on a read context + * - 
this function can be absent if the completion function doesn't + * require a context + */ + void (*put_context)(void *cookie_netfs_data, void *context); + + /* indicate pages that now have cache metadata retained + * - this function should mark the specified pages as now being cached + * - the pages will have been marked with PG_fscache before this is + * called, so this is optional + */ + void (*mark_pages_cached)(void *cookie_netfs_data, + struct address_space *mapping, + struct pagevec *cached_pvec); + + /* indicate the cookie is no longer cached + * - this function is called when the backing store currently caching + * a cookie is removed + * - the netfs should use this to clean up any markers indicating + * cached pages + * - this is mandatory for any object that may have data + */ + void (*now_uncached)(void *cookie_netfs_data); +}; + +/* + * fscache cached network filesystem type + * - name, version and ops must be filled in before registration + * - all other fields will be set during registration + */ +struct fscache_netfs { + uint32_t version; /* indexing version */ + const char *name; /* filesystem name */ + struct fscache_cookie *primary_index; + struct list_head link; /* internal link */ +}; + +/* + * slow-path functions for when there is actually caching available, and the + * netfs does actually have a valid token + * - these are not to be called directly + * - these are undefined symbols when FS-Cache is not configured and the + * optimiser takes care of not using them + */ + +/** + * fscache_register_netfs - Register a filesystem as desiring caching services + * @netfs: The description of the filesystem + * + * Register a filesystem as desiring caching services if they're available. + * + * See Documentation/filesystems/caching/netfs-api.txt for a complete + * description. + */ +static inline +int fscache_register_netfs(struct fscache_netfs *netfs) +{ + return 0; +} + +/** + * fscache_unregister_netfs - Indicate that a filesystem no longer desires + * caching services + * @netfs: The description of the filesystem + * + * Indicate that a filesystem no longer desires caching services for the + * moment. + * + * See Documentation/filesystems/caching/netfs-api.txt for a complete + * description. + */ +static inline +void fscache_unregister_netfs(struct fscache_netfs *netfs) +{ +} + +/** + * fscache_lookup_cache_tag - Look up a cache tag + * @name: The name of the tag to search for + * + * Acquire a specific cache referral tag that can be used to select a specific + * cache in which to cache an index. + * + * See Documentation/filesystems/caching/netfs-api.txt for a complete + * description. + */ +static inline +struct fscache_cache_tag *fscache_lookup_cache_tag(const char *name) +{ + return NULL; +} + +/** + * fscache_release_cache_tag - Release a cache tag + * @tag: The tag to release + * + * Release a reference to a cache referral tag previously looked up. + * + * See Documentation/filesystems/caching/netfs-api.txt for a complete + * description. 
+ */ +static inline +void fscache_release_cache_tag(struct fscache_cache_tag *tag) +{ +} + +/** + * fscache_acquire_cookie - Acquire a cookie to represent a cache object + * @parent: The cookie that's to be the parent of this one + * @def: A description of the cache object, including callback operations + * @netfs_data: An arbitrary piece of data to be kept in the cookie to + * represent the cache object to the netfs + * + * This function is used to inform FS-Cache about part of an index hierarchy + * that can be used to locate files. This is done by requesting a cookie for + * each index in the path to the file. + * + * See Documentation/filesystems/caching/netfs-api.txt for a complete + * description. + */ +static inline +struct fscache_cookie *fscache_acquire_cookie( + struct fscache_cookie *parent, + const struct fscache_cookie_def *def, + void *netfs_data) +{ + return NULL; +} + +/** + * fscache_relinquish_cookie - Return the cookie to the cache, maybe discarding + * it + * @cookie: The cookie being returned + * @retire: True if the cache object the cookie represents is to be discarded + * + * This function returns a cookie to the cache, forcibly discarding the + * associated cache object if retire is set to true. + * + * See Documentation/filesystems/caching/netfs-api.txt for a complete + * description. + */ +static inline +void fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire) +{ +} + +/** + * fscache_update_cookie - Request that a cache object be updated + * @cookie: The cookie representing the cache object + * + * Request an update of the index data for the cache object associated with the + * cookie. + * + * See Documentation/filesystems/caching/netfs-api.txt for a complete + * description. + */ +static inline +void fscache_update_cookie(struct fscache_cookie *cookie) +{ +} + +/** + * fscache_pin_cookie - Pin a data-storage cache object in its cache + * @cookie: The cookie representing the cache object + * + * Permit data-storage cache objects to be pinned in the cache. + * + * See Documentation/filesystems/caching/netfs-api.txt for a complete + * description. + */ +static inline +int fscache_pin_cookie(struct fscache_cookie *cookie) +{ + return -ENOBUFS; +} + +/** + * fscache_pin_cookie - Unpin a data-storage cache object in its cache + * @cookie: The cookie representing the cache object + * + * Permit data-storage cache objects to be unpinned from the cache. + * + * See Documentation/filesystems/caching/netfs-api.txt for a complete + * description. + */ +static inline +void fscache_unpin_cookie(struct fscache_cookie *cookie) +{ +} + +/** + * fscache_attr_changed - Notify cache that an object's attributes changed + * @cookie: The cookie representing the cache object + * + * Send a notification to the cache indicating that an object's attributes have + * changed. This includes the data size. These attributes will be obtained + * through the get_attr() cookie definition op. + * + * See Documentation/filesystems/caching/netfs-api.txt for a complete + * description. + */ +static inline +int fscache_attr_changed(struct fscache_cookie *cookie) +{ + return -ENOBUFS; +} + +/** + * fscache_reserve_space - Reserve data space for a cached object + * @cookie: The cookie representing the cache object + * @i_size: The amount of space to be reserved + * + * Reserve an amount of space in the cache for the cache object attached to a + * cookie so that a write to that object within the space can always be + * honoured. 
+ * + * See Documentation/filesystems/caching/netfs-api.txt for a complete + * description. + */ +static inline +int fscache_reserve_space(struct fscache_cookie *cookie, loff_t size) +{ + return -ENOBUFS; +} + +/** + * fscache_read_or_alloc_page - Read a page from the cache or allocate a block + * in which to store it + * @cookie: The cookie representing the cache object + * @page: The netfs page to fill if possible + * @end_io_func: The callback to invoke when and if the page is filled + * @context: An arbitrary piece of data to pass on to end_io_func() + * @gfp: The conditions under which memory allocation should be made + * + * Read a page from the cache, or if that's not possible make a potential + * one-block reservation in the cache into which the page may be stored once + * fetched from the server. + * + * If the page is not backed by the cache object, or if it there's some reason + * it can't be, -ENOBUFS will be returned and nothing more will be done for + * that page. + * + * Else, if that page is backed by the cache, a read will be initiated directly + * to the netfs's page and 0 will be returned by this function. The + * end_io_func() callback will be invoked when the operation terminates on a + * completion or failure. Note that the callback may be invoked before the + * return. + * + * Else, if the page is unbacked, -ENODATA is returned and a block may have + * been allocated in the cache. + * + * See Documentation/filesystems/caching/netfs-api.txt for a complete + * description. + */ +static inline +int fscache_read_or_alloc_page(struct fscache_cookie *cookie, + struct page *page, + fscache_rw_complete_t end_io_func, + void *context, + gfp_t gfp) +{ + return -ENOBUFS; +} + +/** + * fscache_read_or_alloc_pages - Read pages from the cache and/or allocate + * blocks in which to store them + * @cookie: The cookie representing the cache object + * @mapping: The netfs inode mapping to which the pages will be attached + * @pages: A list of potential netfs pages to be filled + * @end_io_func: The callback to invoke when and if each page is filled + * @context: An arbitrary piece of data to pass on to end_io_func() + * @gfp: The conditions under which memory allocation should be made + * + * Read a set of pages from the cache, or if that's not possible, attempt to + * make a potential one-block reservation for each page in the cache into which + * that page may be stored once fetched from the server. + * + * If some pages are not backed by the cache object, or if it there's some + * reason they can't be, -ENOBUFS will be returned and nothing more will be + * done for that pages. + * + * Else, if some of the pages are backed by the cache, a read will be initiated + * directly to the netfs's page and 0 will be returned by this function. The + * end_io_func() callback will be invoked when the operation terminates on a + * completion or failure. Note that the callback may be invoked before the + * return. + * + * Else, if a page is unbacked, -ENODATA is returned and a block may have + * been allocated in the cache. + * + * Because the function may want to return all of -ENOBUFS, -ENODATA and 0 in + * regard to different pages, the return values are prioritised in that order. + * Any pages submitted for reading are removed from the pages list. + * + * See Documentation/filesystems/caching/netfs-api.txt for a complete + * description. 
+ */ +static inline +int fscache_read_or_alloc_pages(struct fscache_cookie *cookie, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages, + fscache_rw_complete_t end_io_func, + void *context, + gfp_t gfp) +{ + return -ENOBUFS; +} + +/** + * fscache_alloc_page - Allocate a block in which to store a page + * @cookie: The cookie representing the cache object + * @page: The netfs page to allocate a page for + * @gfp: The conditions under which memory allocation should be made + * + * Request Allocation a block in the cache in which to store a netfs page + * without retrieving any contents from the cache. + * + * If the page is not backed by a file then -ENOBUFS will be returned and + * nothing more will be done, and no reservation will be made. + * + * Else, a block will be allocated if one wasn't already, and 0 will be + * returned + * + * See Documentation/filesystems/caching/netfs-api.txt for a complete + * description. + */ +static inline +int fscache_alloc_page(struct fscache_cookie *cookie, + struct page *page, + gfp_t gfp) +{ + return -ENOBUFS; +} + +/** + * fscache_write_page - Request storage of a page in the cache + * @cookie: The cookie representing the cache object + * @page: The netfs page to store + * @gfp: The conditions under which memory allocation should be made + * + * Request the contents of the netfs page be written into the cache. This + * request may be ignored if no cache block is currently allocated, in which + * case it will return -ENOBUFS. + * + * If a cache block was already allocated, a write will be initiated and 0 will + * be returned. The PG_fscache_write page bit is set immediately and will then + * be cleared at the completion of the write to indicate the success or failure + * of the operation. Note that the completion may happen before the return. + * + * See Documentation/filesystems/caching/netfs-api.txt for a complete + * description. + */ +static inline +int fscache_write_page(struct fscache_cookie *cookie, + struct page *page, + gfp_t gfp) +{ + return -ENOBUFS; +} + +/** + * fscache_uncache_page - Indicate that caching is no longer required on a page + * @cookie: The cookie representing the cache object + * @page: The netfs page that was being cached. + * + * Tell the cache that we no longer want a page to be cached and that it should + * remove any knowledge of the netfs page it may have. + * + * Note that this cannot cancel any outstanding I/O operations between this + * page and the cache. + * + * See Documentation/filesystems/caching/netfs-api.txt for a complete + * description. + */ +static inline +void fscache_uncache_page(struct fscache_cookie *cookie, + struct page *page) +{ +} + +/** + * fscache_check_page_write - Ask if a page is being writing to the cache + * @cookie: The cookie representing the cache object + * @page: The netfs page that is being cached. + * + * Ask the cache if a page is being written to the cache. + * + * See Documentation/filesystems/caching/netfs-api.txt for a complete + * description. + */ +static inline +bool fscache_check_page_write(struct fscache_cookie *cookie, + struct page *page) +{ + return false; +} + +/** + * fscache_wait_on_page_write - Wait for a page to complete writing to the cache + * @cookie: The cookie representing the cache object + * @page: The netfs page that is being cached. + * + * Ask the cache to wake us up when a page is no longer being written to the + * cache. + * + * See Documentation/filesystems/caching/netfs-api.txt for a complete + * description. 
+ */
+static inline
+void fscache_wait_on_page_write(struct fscache_cookie *cookie,
+				struct page *page)
+{
+}
+
+#endif /* _LINUX_FSCACHE_H */
--
cgit v1.2.3


From 0dfc41d1efcc4180abfd32f68f0ade540e636ff6 Mon Sep 17 00:00:00 2001
From: David Howells
Date: Fri, 3 Apr 2009 16:42:36 +0100
Subject: FS-Cache: Add the FS-Cache cache backend API and documentation

Add the API for a generic facility (FS-Cache) by which caches may declare
themselves open for business, and may obtain work to be done from network
filesystems.

The header file is included by:

	#include <linux/fscache-cache.h>

Documentation for the API is also added to:

	Documentation/filesystems/caching/backend-api.txt

This API is not usable without the implementation of the utility functions
which will be added in further patches.

Signed-off-by: David Howells
Acked-by: Steve Dickson
Acked-by: Trond Myklebust
Acked-by: Al Viro
Tested-by: Daire Byrne
---
 Documentation/filesystems/caching/backend-api.txt | 664 ++++++++++++++++++++++
 include/linux/fscache-cache.h                     | 509 +++++++++++++++++
 2 files changed, 1173 insertions(+)
 create mode 100644 Documentation/filesystems/caching/backend-api.txt
 create mode 100644 include/linux/fscache-cache.h

(limited to 'include/linux')

diff --git a/Documentation/filesystems/caching/backend-api.txt b/Documentation/filesystems/caching/backend-api.txt
new file mode 100644
index 000000000000..17723053aa91
--- /dev/null
+++ b/Documentation/filesystems/caching/backend-api.txt
@@ -0,0 +1,664 @@
+			  ==========================
+			  FS-CACHE CACHE BACKEND API
+			  ==========================
+
+The FS-Cache system provides an API by which actual caches can be supplied to
+FS-Cache for it to then serve out to network filesystems and other interested
+parties.
+
+This API is declared in <linux/fscache-cache.h>.
+
+
+====================================
+INITIALISING AND REGISTERING A CACHE
+====================================
+
+To start off, a cache definition must be initialised and registered for each
+cache the backend wants to make available.  For instance, CacheFS does this in
+the fill_super() operation on mounting.
+
+The cache definition (struct fscache_cache) should be initialised by calling:
+
+	void fscache_init_cache(struct fscache_cache *cache,
+				struct fscache_cache_ops *ops,
+				const char *idfmt,
+				...);
+
+Where:
+
+ (*) "cache" is a pointer to the cache definition;
+
+ (*) "ops" is a pointer to the table of operations that the backend supports on
+     this cache; and
+
+ (*) "idfmt" is a format and printf-style arguments for constructing a label
+     for the cache.
+
+
+The cache should then be registered with FS-Cache by passing a pointer to the
+previously initialised cache definition to:
+
+	int fscache_add_cache(struct fscache_cache *cache,
+			      struct fscache_object *fsdef,
+			      const char *tagname);
+
+Two extra arguments should also be supplied:
+
+ (*) "fsdef" which should point to the object representation for the FS-Cache
+     master index in this cache.  Netfs primary index entries will be created
+     here.  FS-Cache keeps the caller's reference to the index object if
+     successful and will release it upon withdrawal of the cache.
+
+ (*) "tagname" which, if given, should be a text string naming this cache.  If
+     this is NULL, the identifier will be used instead.  For CacheFS, the
+     identifier is set to name the underlying block device and the tag can be
+     supplied by mount.
+
+This function may return -ENOMEM if it ran out of memory or -EEXIST if the tag
+is already in use.  0 will be returned on success.
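+
+To make the calling sequence concrete, here is a minimal, illustrative sketch
+of a backend registering a cache; my_cache, my_cache_ops, my_fsdef_object and
+the helper are hypothetical stand-ins for the backend's real definitions:
+
+	#include <linux/fscache-cache.h>
+
+	static struct fscache_cache my_cache;		/* this backend's cache definition */
+	extern struct fscache_cache_ops my_cache_ops;	/* method table (see CACHE OPERATIONS) */
+	extern struct fscache_object my_fsdef_object;	/* backs the FS-Cache master index */
+
+	static int my_cache_register(const char *tagname, const char *dev_name)
+	{
+		fscache_init_cache(&my_cache, &my_cache_ops, "mycache(%s)", dev_name);
+		fscache_object_init(&my_fsdef_object);
+
+		/* 0 on success, -ENOMEM, or -EEXIST if the tag is already in use. */
+		return fscache_add_cache(&my_cache, &my_fsdef_object, tagname);
+	}
+
+On unmounting or module unload, the backend would undo this with
+fscache_withdraw_cache() as described in the next section.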
+
+
+=====================
+UNREGISTERING A CACHE
+=====================
+
+A cache can be withdrawn from the system by calling this function with a
+pointer to the cache definition:
+
+	void fscache_withdraw_cache(struct fscache_cache *cache);
+
+In CacheFS's case, this is called by put_super().
+
+
+========
+SECURITY
+========
+
+The cache methods are executed in one of two contexts:
+
+ (1) that of the userspace process that issued the netfs operation that caused
+     the cache method to be invoked, or
+
+ (2) that of one of the processes in the FS-Cache thread pool.
+
+In either case, this may not be an appropriate context in which to access the
+cache.
+
+The calling process's fsuid, fsgid and SELinux security identities may need to
+be masqueraded for the duration of the cache driver's access to the cache.
+This is left to the cache to handle; FS-Cache makes no effort in this regard.
+
+
+===================================
+CONTROL AND STATISTICS PRESENTATION
+===================================
+
+The cache may present data to the outside world through FS-Cache's interfaces
+in sysfs and procfs - the former for control and the latter for statistics.
+
+A sysfs directory called /sys/fs/fscache// is created if CONFIG_SYSFS
+is enabled.  This is accessible through the kobject struct fscache_cache::kobj
+and is for use by the cache as it sees fit.
+
+The cache driver may create itself a directory named for the cache type in the
+/proc/fs/fscache/ directory.  This is available if CONFIG_FSCACHE_PROC is
+enabled and is accessible through:
+
+	struct proc_dir_entry *proc_fscache;
+
+
+========================
+RELEVANT DATA STRUCTURES
+========================
+
+ (*) Index/Data file FS-Cache representation cookie:
+
+	struct fscache_cookie {
+		struct fscache_object_def	*def;
+		struct fscache_netfs		*netfs;
+		void				*netfs_data;
+		...
+	};
+
+     The fields that might be of use to the backend describe the object
+     definition, the netfs definition and the netfs's data for this cookie.
+     The object definition contains functions supplied by the netfs for loading
+     and matching index entries; these are required to provide some of the
+     cache operations.
+
+
+ (*) In-cache object representation:
+
+	struct fscache_object {
+		int				debug_id;
+		enum {
+			FSCACHE_OBJECT_RECYCLING,
+			...
+		}				state;
+		spinlock_t			lock;
+		struct fscache_cache		*cache;
+		struct fscache_cookie		*cookie;
+		...
+	};
+
+     Structures of this type should be allocated by the cache backend and
+     passed to FS-Cache when requested by the appropriate cache operation.  In
+     the case of CacheFS, they're embedded in CacheFS's internal object
+     structures.
+
+     The debug_id is a simple integer that can be used in debugging messages
+     that refer to a particular object.  In such a case it should be printed
+     using "OBJ%x" to be consistent with FS-Cache.
+
+     Each object contains a pointer to the cookie that represents the object it
+     is backing.  An object should be retired when put_object() is called if it
+     is in state FSCACHE_OBJECT_RECYCLING.  The fscache_object struct should be
+     initialised by calling fscache_object_init(object).
+
+
+ (*) FS-Cache operation record:
+
+	struct fscache_operation {
+		atomic_t		usage;
+		struct fscache_object	*object;
+		unsigned long		flags;
+	#define FSCACHE_OP_EXCLUSIVE
+		void (*processor)(struct fscache_operation *op);
+		void (*release)(struct fscache_operation *op);
+		...
+ }; + + FS-Cache has a pool of threads that it uses to give CPU time to the + various asynchronous operations that need to be done as part of driving + the cache. These are represented by the above structure. The processor + method is called to give the op CPU time, and the release method to get + rid of it when its usage count reaches 0. + + An operation can be made exclusive upon an object by setting the + appropriate flag before enqueuing it with fscache_enqueue_operation(). If + an operation needs more processing time, it should be enqueued again. + + + (*) FS-Cache retrieval operation record: + + struct fscache_retrieval { + struct fscache_operation op; + struct address_space *mapping; + struct list_head *to_do; + ... + }; + + A structure of this type is allocated by FS-Cache to record retrieval and + allocation requests made by the netfs. This struct is then passed to the + backend to do the operation. The backend may get extra refs to it by + calling fscache_get_retrieval() and refs may be discarded by calling + fscache_put_retrieval(). + + A retrieval operation can be used by the backend to do retrieval work. To + do this, the retrieval->op.processor method pointer should be set + appropriately by the backend and fscache_enqueue_retrieval() called to + submit it to the thread pool. CacheFiles, for example, uses this to queue + page examination when it detects PG_lock being cleared. + + The to_do field is an empty list available for the cache backend to use as + it sees fit. + + + (*) FS-Cache storage operation record: + + struct fscache_storage { + struct fscache_operation op; + pgoff_t store_limit; + ... + }; + + A structure of this type is allocated by FS-Cache to record outstanding + writes to be made. FS-Cache itself enqueues this operation and invokes + the write_page() method on the object at appropriate times to effect + storage. + + +================ +CACHE OPERATIONS +================ + +The cache backend provides FS-Cache with a table of operations that can be +performed on the denizens of the cache. These are held in a structure of type: + + struct fscache_cache_ops + + (*) Name of cache provider [mandatory]: + + const char *name + + This isn't strictly an operation, but should be pointed at a string naming + the backend. + + + (*) Allocate a new object [mandatory]: + + struct fscache_object *(*alloc_object)(struct fscache_cache *cache, + struct fscache_cookie *cookie) + + This method is used to allocate a cache object representation to back a + cookie in a particular cache. fscache_object_init() should be called on + the object to initialise it prior to returning. + + This function may also be used to parse the index key to be used for + multiple lookup calls to turn it into a more convenient form. FS-Cache + will call the lookup_complete() method to allow the cache to release the + form once lookup is complete or aborted. + + + (*) Look up and create object [mandatory]: + + void (*lookup_object)(struct fscache_object *object) + + This method is used to look up an object, given that the object is already + allocated and attached to the cookie. This should instantiate that object + in the cache if it can. + + The method should call fscache_object_lookup_negative() as soon as + possible if it determines the object doesn't exist in the cache. If the + object is found to exist and the netfs indicates that it is valid then + fscache_obtained_object() should be called once the object is in a + position to have data stored in it. 
Similarly, fscache_obtained_object() + should also be called once a non-present object has been created. + + If a lookup error occurs, fscache_object_lookup_error() should be called + to abort the lookup of that object. + + + (*) Release lookup data [mandatory]: + + void (*lookup_complete)(struct fscache_object *object) + + This method is called to ask the cache to release any resources it was + using to perform a lookup. + + + (*) Increment object refcount [mandatory]: + + struct fscache_object *(*grab_object)(struct fscache_object *object) + + This method is called to increment the reference count on an object. It + may fail (for instance if the cache is being withdrawn) by returning NULL. + It should return the object pointer if successful. + + + (*) Lock/Unlock object [mandatory]: + + void (*lock_object)(struct fscache_object *object) + void (*unlock_object)(struct fscache_object *object) + + These methods are used to exclusively lock an object. It must be possible + to schedule with the lock held, so a spinlock isn't sufficient. + + + (*) Pin/Unpin object [optional]: + + int (*pin_object)(struct fscache_object *object) + void (*unpin_object)(struct fscache_object *object) + + These methods are used to pin an object into the cache. Once pinned an + object cannot be reclaimed to make space. Return -ENOSPC if there's not + enough space in the cache to permit this. + + + (*) Update object [mandatory]: + + int (*update_object)(struct fscache_object *object) + + This is called to update the index entry for the specified object. The + new information should be in object->cookie->netfs_data. This can be + obtained by calling object->cookie->def->get_aux()/get_attr(). + + + (*) Discard object [mandatory]: + + void (*drop_object)(struct fscache_object *object) + + This method is called to indicate that an object has been unbound from its + cookie, and that the cache should release the object's resources and + retire it if it's in state FSCACHE_OBJECT_RECYCLING. + + This method should not attempt to release any references held by the + caller. The caller will invoke the put_object() method as appropriate. + + + (*) Release object reference [mandatory]: + + void (*put_object)(struct fscache_object *object) + + This method is used to discard a reference to an object. The object may + be freed when all the references to it are released. + + + (*) Synchronise a cache [mandatory]: + + void (*sync)(struct fscache_cache *cache) + + This is called to ask the backend to synchronise a cache with its backing + device. + + + (*) Dissociate a cache [mandatory]: + + void (*dissociate_pages)(struct fscache_cache *cache) + + This is called to ask a cache to perform any page dissociations as part of + cache withdrawal. + + + (*) Notification that the attributes on a netfs file changed [mandatory]: + + int (*attr_changed)(struct fscache_object *object); + + This is called to indicate to the cache that certain attributes on a netfs + file have changed (for example the maximum size a file may reach). The + cache can read these from the netfs by calling the cookie's get_attr() + method. + + The cache may use the file size information to reserve space on the cache. + It should also call fscache_set_store_limit() to indicate to FS-Cache the + highest byte it's willing to store for an object. + + This method may return -ve if an error occurred or the cache object cannot + be expanded. In such a case, the object will be withdrawn from service. 
+ + This operation is run asynchronously from FS-Cache's thread pool, and + storage and retrieval operations from the netfs are excluded during the + execution of this operation. + + + (*) Reserve cache space for an object's data [optional]: + + int (*reserve_space)(struct fscache_object *object, loff_t size); + + This is called to request that cache space be reserved to hold the data + for an object and the metadata used to track it. Zero size should be + taken as a request to cancel a reservation. + + This should return 0 if successful, -ENOSPC if there isn't enough space + available, or -ENOMEM or -EIO on other errors. + + The reservation may exceed the current size of the object, thus permitting + future expansion. If the amount of space consumed by an object would + exceed the reservation, it's permitted to refuse requests to allocate + pages, but not required. An object may be pruned down to its reservation + size if larger than that already. + + + (*) Request page be read from cache [mandatory]: + + int (*read_or_alloc_page)(struct fscache_retrieval *op, + struct page *page, + gfp_t gfp) + + This is called to attempt to read a netfs page from the cache, or to + reserve a backing block if not. FS-Cache will have done as much checking + as it can before calling, but most of the work belongs to the backend. + + If there's no page in the cache, then -ENODATA should be returned if the + backend managed to reserve a backing block; -ENOBUFS or -ENOMEM if it + didn't. + + If there is suitable data in the cache, then a read operation should be + queued and 0 returned. When the read finishes, fscache_end_io() should be + called. + + fscache_mark_pages_cached() should be called for the page if any cache + metadata is retained. This will indicate to the netfs that the page needs + explicit uncaching. This operation takes a pagevec, thus allowing several + pages to be marked at once. + + The retrieval record pointed to by op should be retained for each page + queued and released when I/O on the page has been formally ended. + fscache_get/put_retrieval() are available for this purpose. + + The retrieval record may be used to get CPU time via the FS-Cache thread + pool. If this is desired, the op->op.processor should be set to point to + the appropriate processing routine, and fscache_enqueue_retrieval() should + be called at an appropriate point to request CPU time. For instance, the + retrieval routine could be enqueued upon the completion of a disk read. + The to_do field in the retrieval record is provided to aid in this. + + If an I/O error occurs, fscache_io_error() should be called and -ENOBUFS + returned if possible or fscache_end_io() called with a suitable error + code. + + + (*) Request pages be read from cache [mandatory]: + + int (*read_or_alloc_pages)(struct fscache_retrieval *op, + struct list_head *pages, + unsigned *nr_pages, + gfp_t gfp) + + This is like the read_or_alloc_page() method, except it is handed a list + of pages instead of one page. Any pages on which a read operation is + started must be added to the page cache for the specified mapping and also + to the LRU. Such pages must also be removed from the pages list and + *nr_pages decremented per page. + + If there was an error such as -ENOMEM, then that should be returned; else + if one or more pages couldn't be read or allocated, then -ENOBUFS should + be returned; else if one or more pages couldn't be read, then -ENODATA + should be returned. If all the pages are dispatched then 0 should be + returned.
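To make the single-page read contract above concrete, here is a minimal, illustrative sketch of a backend's read_or_alloc_page() method. It is not taken from any real backend: the mycache_* type and helpers, and the assumption that the backend object embeds its struct fscache_object in a field named "fscache", are hypothetical stand-ins for a backend's own block lookup and I/O submission code.

	static int mycache_read_or_alloc_page(struct fscache_retrieval *op,
					      struct page *page,
					      gfp_t gfp)
	{
		/* Hypothetical backend object wrapping a struct fscache_object. */
		struct mycache_object *obj =
			container_of(op->op.object, struct mycache_object, fscache);

		if (!mycache_has_block(obj, page->index)) {
			/* No data for this page; try to reserve a backing block
			 * so that a later write_page() can store it. */
			if (mycache_reserve_block(obj, page->index, gfp) < 0)
				return -ENOBUFS;	/* couldn't even reserve */
			return -ENODATA;		/* reserved, nothing to read */
		}

		/* Data is present: pin the retrieval record and queue the read.
		 * The (assumed) completion handler behind mycache_submit_read()
		 * calls fscache_end_io() and then fscache_put_retrieval() when
		 * I/O on the page is formally ended. */
		fscache_get_retrieval(op);
		if (mycache_submit_read(obj, op, page, gfp) < 0) {
			fscache_put_retrieval(op);
			return -ENOBUFS;
		}
		return 0;
	}

The error path and return values follow the rules above; everything else is a placeholder for whatever block management the backend actually implements.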
+ + + (*) Request page be allocated in the cache [mandatory]: + + int (*allocate_page)(struct fscache_retrieval *op, + struct page *page, + gfp_t gfp) + + This is like the read_or_alloc_page() method, except that it shouldn't + read from the cache, even if there's data there that could be retrieved. + It should, however, set up any internal metadata required such that + the write_page() method can write to the cache. + + If there's no backing block available, then -ENOBUFS should be returned + (or -ENOMEM if there were other problems). If a block is successfully + allocated, then the netfs page should be marked and 0 returned. + + + (*) Request pages be allocated in the cache [mandatory]: + + int (*allocate_pages)(struct fscache_retrieval *op, + struct list_head *pages, + unsigned *nr_pages, + gfp_t gfp) + + This is a multiple page version of the allocate_page() method. pages and + nr_pages should be treated as for the read_or_alloc_pages() method. + + + (*) Request page be written to cache [mandatory]: + + int (*write_page)(struct fscache_storage *op, + struct page *page); + + This is called to write from a page on which there was a previously + successful read_or_alloc_page() call or similar. FS-Cache filters out + pages that don't have mappings. + + This method is called asynchronously from the FS-Cache thread pool. It is + not required to actually store anything, provided -ENODATA is then + returned to the next read of this page. + + If an error occurred, then a negative error code should be returned, + otherwise zero should be returned. FS-Cache will take appropriate action + in response to an error, such as withdrawing this object. + + If this method returns success then FS-Cache will inform the netfs + appropriately. + + + (*) Discard retained per-page metadata [mandatory]: + + void (*uncache_page)(struct fscache_object *object, struct page *page) + + This is called when a netfs page is being evicted from the pagecache. The + cache backend should tear down any internal representation or tracking it + maintains for this page. + + +================== +FS-CACHE UTILITIES +================== + +FS-Cache provides some utilities that a cache backend may make use of: + + (*) Note occurrence of an I/O error in a cache: + + void fscache_io_error(struct fscache_cache *cache) + + This tells FS-Cache that an I/O error occurred in the cache. After this + has been called, only resource dissociation operations (object and page + release) will be passed from the netfs to the cache backend for the + specified cache. + + This does not actually withdraw the cache. That must be done separately. + + + (*) Invoke the retrieval I/O completion function: + + void fscache_end_io(struct fscache_retrieval *op, struct page *page, + int error); + + This is called to note the end of an attempt to retrieve a page. The + error value should be 0 if successful and an error otherwise. + + + (*) Set highest store limit: + + void fscache_set_store_limit(struct fscache_object *object, + loff_t i_size); + + This sets the limit FS-Cache imposes on the highest byte it's willing to + try and store for a netfs. Any page over this limit is automatically + rejected by fscache_read_alloc_page() and co with -ENOBUFS. + + + (*) Mark pages as being cached: + + void fscache_mark_pages_cached(struct fscache_retrieval *op, + struct pagevec *pagevec); + + This marks a set of pages as being cached. After this has been called, + the netfs must call fscache_uncache_page() to unmark the pages.
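As an illustration of the pagevec-based marking interface, a backend's multi-page retrieval path might batch pages up before handing them to fscache_mark_pages_cached(). This is only a sketch: the function name is hypothetical, the pagevec helpers come from linux/pagevec.h, and it is assumed the netfs pages are still linked through page->lru as they are on the list given to read_or_alloc_pages().

	static void mycache_mark_pages(struct fscache_retrieval *op,
				       struct list_head *pages)
	{
		struct pagevec pagevec;
		struct page *page;

		pagevec_init(&pagevec, 0);
		list_for_each_entry(page, pages, lru) {
			if (!pagevec_add(&pagevec, page)) {
				/* The pagevec is full: hand this batch over. */
				fscache_mark_pages_cached(op, &pagevec);
				pagevec_reinit(&pagevec);
			}
		}
		if (pagevec_count(&pagevec) > 0)
			fscache_mark_pages_cached(op, &pagevec);
	}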
+ + + (*) Perform coherency check on an object: + + enum fscache_checkaux fscache_check_aux(struct fscache_object *object, + const void *data, + uint16_t datalen); + + This asks the netfs to perform a coherency check on an object that has + just been looked up. The cookie attached to the object will determine the + netfs to use. data and datalen should specify where the auxiliary data + retrieved from the cache can be found. + + One of three values will be returned: + + (*) FSCACHE_CHECKAUX_OKAY + + The coherency data indicates the object is valid as is. + + (*) FSCACHE_CHECKAUX_NEEDS_UPDATE + + The coherency data needs updating, but otherwise the object is + valid. + + (*) FSCACHE_CHECKAUX_OBSOLETE + + The coherency data indicates that the object is obsolete and should + be discarded. + + + (*) Initialise a freshly allocated object: + + void fscache_object_init(struct fscache_object *object); + + This initialises all the fields in an object representation. + + + (*) Indicate the destruction of an object: + + void fscache_object_destroyed(struct fscache_cache *cache); + + This must be called to inform FS-Cache that an object that belonged to a + cache has been destroyed and deallocated. This will allow continuation + of the cache withdrawal process when it is stopped pending destruction of + all the objects. + + + (*) Indicate negative lookup on an object: + + void fscache_object_lookup_negative(struct fscache_object *object); + + This is called to indicate to FS-Cache that a lookup process for an object + found a negative result. + + This changes the state of an object to permit reads pending on lookup + completion to go off and start fetching data from the netfs server as it's + known at this point that there can't be any data in the cache. + + This may be called multiple times on an object. Only the first call is + significant - all subsequent calls are ignored. + + + (*) Indicate an object has been obtained: + + void fscache_obtained_object(struct fscache_object *object); + + This is called to indicate to FS-Cache that a lookup process for an object + produced a positive result, or that an object was created. This should + only be called once for any particular object. + + This changes the state of an object to indicate: + + (1) if no call to fscache_object_lookup_negative() has been made on + this object, that there may be data available, and that reads can + now go and look for it; and + + (2) that writes may now proceed against this object. + + + (*) Indicate that object lookup failed: + + void fscache_object_lookup_error(struct fscache_object *object); + + This marks an object as having encountered a fatal error (usually EIO) + and causes it to move into a state whereby it will be withdrawn as soon + as possible. + + + (*) Get and release references on a retrieval record: + + void fscache_get_retrieval(struct fscache_retrieval *op); + void fscache_put_retrieval(struct fscache_retrieval *op); + + These two functions are used to retain a retrieval record whilst doing + asynchronous data retrieval and block allocation. + + + (*) Enqueue a retrieval record for processing. + + void fscache_enqueue_retrieval(struct fscache_retrieval *op); + + This enqueues a retrieval record for processing by the FS-Cache thread + pool. One of the threads in the pool will invoke the retrieval record's + op->op.processor callback function. This function may be called from + within the callback function. 
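Taken together, the lookup notification functions above are normally driven from a backend's lookup_object() method. The following is a hedged sketch of one plausible ordering; mycache_lookup_on_disk() and mycache_create_on_disk() are hypothetical helpers standing in for the backend's real index lookup and creation code, and the embedded-object layout is the same assumption as in the earlier sketches.

	static void mycache_lookup_object(struct fscache_object *object)
	{
		struct mycache_object *obj =
			container_of(object, struct mycache_object, fscache);
		int ret;

		ret = mycache_lookup_on_disk(obj);
		if (ret == -ENOENT) {
			/* Not in the cache: say so as soon as possible so that
			 * reads pending on lookup can go to the server... */
			fscache_object_lookup_negative(object);

			/* ...then create the object and report it usable. */
			if (mycache_create_on_disk(obj) < 0) {
				fscache_object_lookup_error(object);
				return;
			}
			fscache_obtained_object(object);
		} else if (ret < 0) {
			/* Fatal error (usually EIO): abort the lookup. */
			fscache_object_lookup_error(object);
		} else {
			/* Found, and the netfs judged the coherency data valid. */
			fscache_obtained_object(object);
		}
	}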
+ + + (*) List of object state names: + + const char *fscache_object_states[]; + + For debugging purposes, this may be used to turn the state that an object + is in into a text string for display purposes. diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h new file mode 100644 index 000000000000..b2a9a484c4cf --- /dev/null +++ b/include/linux/fscache-cache.h @@ -0,0 +1,509 @@ +/* General filesystem caching backing cache interface + * + * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * NOTE!!! See: + * + * Documentation/filesystems/caching/backend-api.txt + * + * for a description of the cache backend interface declared here. + */ + +#ifndef _LINUX_FSCACHE_CACHE_H +#define _LINUX_FSCACHE_CACHE_H + +#include +#include +#include + +#define NR_MAXCACHES BITS_PER_LONG + +struct fscache_cache; +struct fscache_cache_ops; +struct fscache_object; +struct fscache_operation; + +#ifdef CONFIG_FSCACHE_PROC +extern struct proc_dir_entry *proc_fscache; +#endif + +/* + * cache tag definition + */ +struct fscache_cache_tag { + struct list_head link; + struct fscache_cache *cache; /* cache referred to by this tag */ + unsigned long flags; +#define FSCACHE_TAG_RESERVED 0 /* T if tag is reserved for a cache */ + atomic_t usage; + char name[0]; /* tag name */ +}; + +/* + * cache definition + */ +struct fscache_cache { + const struct fscache_cache_ops *ops; + struct fscache_cache_tag *tag; /* tag representing this cache */ + struct kobject *kobj; /* system representation of this cache */ + struct list_head link; /* link in list of caches */ + size_t max_index_size; /* maximum size of index data */ + char identifier[36]; /* cache label */ + + /* node management */ + struct work_struct op_gc; /* operation garbage collector */ + struct list_head object_list; /* list of data/index objects */ + struct list_head op_gc_list; /* list of ops to be deleted */ + spinlock_t object_list_lock; + spinlock_t op_gc_list_lock; + atomic_t object_count; /* no. 
of live objects in this cache */ + struct fscache_object *fsdef; /* object for the fsdef index */ + unsigned long flags; +#define FSCACHE_IOERROR 0 /* cache stopped on I/O error */ +#define FSCACHE_CACHE_WITHDRAWN 1 /* cache has been withdrawn */ +}; + +extern wait_queue_head_t fscache_cache_cleared_wq; + +/* + * operation to be applied to a cache object + * - retrieval initiation operations are done in the context of the process + * that issued them, and not in an async thread pool + */ +typedef void (*fscache_operation_release_t)(struct fscache_operation *op); +typedef void (*fscache_operation_processor_t)(struct fscache_operation *op); + +struct fscache_operation { + union { + struct work_struct fast_work; /* record for fast ops */ + struct slow_work slow_work; /* record for (very) slow ops */ + }; + struct list_head pend_link; /* link in object->pending_ops */ + struct fscache_object *object; /* object to be operated upon */ + + unsigned long flags; +#define FSCACHE_OP_TYPE 0x000f /* operation type */ +#define FSCACHE_OP_FAST 0x0001 /* - fast op, processor may not sleep for disk */ +#define FSCACHE_OP_SLOW 0x0002 /* - (very) slow op, processor may sleep for disk */ +#define FSCACHE_OP_MYTHREAD 0x0003 /* - processing is done be issuing thread, not pool */ +#define FSCACHE_OP_WAITING 4 /* cleared when op is woken */ +#define FSCACHE_OP_EXCLUSIVE 5 /* exclusive op, other ops must wait */ +#define FSCACHE_OP_DEAD 6 /* op is now dead */ + + atomic_t usage; + unsigned debug_id; /* debugging ID */ + + /* operation processor callback + * - can be NULL if FSCACHE_OP_WAITING is going to be used to perform + * the op in a non-pool thread */ + fscache_operation_processor_t processor; + + /* operation releaser */ + fscache_operation_release_t release; +}; + +extern atomic_t fscache_op_debug_id; +extern const struct slow_work_ops fscache_op_slow_work_ops; + +extern void fscache_enqueue_operation(struct fscache_operation *); +extern void fscache_put_operation(struct fscache_operation *); + +/** + * fscache_operation_init - Do basic initialisation of an operation + * @op: The operation to initialise + * @release: The release function to assign + * + * Do basic initialisation of an operation. The caller must still set flags, + * object, either fast_work or slow_work if necessary, and processor if needed. + */ +static inline void fscache_operation_init(struct fscache_operation *op, + fscache_operation_release_t release) +{ + atomic_set(&op->usage, 1); + op->debug_id = atomic_inc_return(&fscache_op_debug_id); + op->release = release; + INIT_LIST_HEAD(&op->pend_link); +} + +/** + * fscache_operation_init_slow - Do additional initialisation of a slow op + * @op: The operation to initialise + * @processor: The processor function to assign + * + * Do additional initialisation of an operation as required for slow work. 
+ */ +static inline +void fscache_operation_init_slow(struct fscache_operation *op, + fscache_operation_processor_t processor) +{ + op->processor = processor; + slow_work_init(&op->slow_work, &fscache_op_slow_work_ops); +} + +/* + * data read operation + */ +struct fscache_retrieval { + struct fscache_operation op; + struct address_space *mapping; /* netfs pages */ + fscache_rw_complete_t end_io_func; /* function to call on I/O completion */ + void *context; /* netfs read context (pinned) */ + struct list_head to_do; /* list of things to be done by the backend */ + unsigned long start_time; /* time at which retrieval started */ +}; + +typedef int (*fscache_page_retrieval_func_t)(struct fscache_retrieval *op, + struct page *page, + gfp_t gfp); + +typedef int (*fscache_pages_retrieval_func_t)(struct fscache_retrieval *op, + struct list_head *pages, + unsigned *nr_pages, + gfp_t gfp); + +/** + * fscache_get_retrieval - Get an extra reference on a retrieval operation + * @op: The retrieval operation to get a reference on + * + * Get an extra reference on a retrieval operation. + */ +static inline +struct fscache_retrieval *fscache_get_retrieval(struct fscache_retrieval *op) +{ + atomic_inc(&op->op.usage); + return op; +} + +/** + * fscache_enqueue_retrieval - Enqueue a retrieval operation for processing + * @op: The retrieval operation affected + * + * Enqueue a retrieval operation for processing by the FS-Cache thread pool. + */ +static inline void fscache_enqueue_retrieval(struct fscache_retrieval *op) +{ + fscache_enqueue_operation(&op->op); +} + +/** + * fscache_put_retrieval - Drop a reference to a retrieval operation + * @op: The retrieval operation affected + * + * Drop a reference to a retrieval operation. + */ +static inline void fscache_put_retrieval(struct fscache_retrieval *op) +{ + fscache_put_operation(&op->op); +} + +/* + * cached page storage work item + * - used to do three things: + * - batch writes to the cache + * - do cache writes asynchronously + * - defer writes until cache object lookup completion + */ +struct fscache_storage { + struct fscache_operation op; + pgoff_t store_limit; /* don't write more than this */ +}; + +/* + * cache operations + */ +struct fscache_cache_ops { + /* name of cache provider */ + const char *name; + + /* allocate an object record for a cookie */ + struct fscache_object *(*alloc_object)(struct fscache_cache *cache, + struct fscache_cookie *cookie); + + /* look up the object for a cookie */ + void (*lookup_object)(struct fscache_object *object); + + /* finished looking up */ + void (*lookup_complete)(struct fscache_object *object); + + /* increment the usage count on this object (may fail if unmounting) */ + struct fscache_object *(*grab_object)(struct fscache_object *object); + + /* pin an object in the cache */ + int (*pin_object)(struct fscache_object *object); + + /* unpin an object in the cache */ + void (*unpin_object)(struct fscache_object *object); + + /* store the updated auxilliary data on an object */ + void (*update_object)(struct fscache_object *object); + + /* discard the resources pinned by an object and effect retirement if + * necessary */ + void (*drop_object)(struct fscache_object *object); + + /* dispose of a reference to an object */ + void (*put_object)(struct fscache_object *object); + + /* sync a cache */ + void (*sync_cache)(struct fscache_cache *cache); + + /* notification that the attributes of a non-index object (such as + * i_size) have changed */ + int (*attr_changed)(struct fscache_object *object); + + /* reserve 
space for an object's data and associated metadata */ + int (*reserve_space)(struct fscache_object *object, loff_t i_size); + + /* request a backing block for a page be read or allocated in the + * cache */ + fscache_page_retrieval_func_t read_or_alloc_page; + + /* request backing blocks for a list of pages be read or allocated in + * the cache */ + fscache_pages_retrieval_func_t read_or_alloc_pages; + + /* request a backing block for a page be allocated in the cache so that + * it can be written directly */ + fscache_page_retrieval_func_t allocate_page; + + /* request backing blocks for pages be allocated in the cache so that + * they can be written directly */ + fscache_pages_retrieval_func_t allocate_pages; + + /* write a page to its backing block in the cache */ + int (*write_page)(struct fscache_storage *op, struct page *page); + + /* detach backing block from a page (optional) + * - must release the cookie lock before returning + * - may sleep + */ + void (*uncache_page)(struct fscache_object *object, + struct page *page); + + /* dissociate a cache from all the pages it was backing */ + void (*dissociate_pages)(struct fscache_cache *cache); +}; + +/* + * data file or index object cookie + * - a file will only appear in one cache + * - a request to cache a file may or may not be honoured, subject to + * constraints such as disk space + * - indices are created on disk just-in-time + */ +struct fscache_cookie { + atomic_t usage; /* number of users of this cookie */ + atomic_t n_children; /* number of children of this cookie */ + spinlock_t lock; + struct hlist_head backing_objects; /* object(s) backing this file/index */ + const struct fscache_cookie_def *def; /* definition */ + struct fscache_cookie *parent; /* parent of this entry */ + void *netfs_data; /* back pointer to netfs */ + struct radix_tree_root stores; /* pages to be stored on this cookie */ +#define FSCACHE_COOKIE_PENDING_TAG 0 /* pages tag: pending write to cache */ + + unsigned long flags; +#define FSCACHE_COOKIE_LOOKING_UP 0 /* T if non-index cookie being looked up still */ +#define FSCACHE_COOKIE_CREATING 1 /* T if non-index object being created still */ +#define FSCACHE_COOKIE_NO_DATA_YET 2 /* T if new object with no cached data yet */ +#define FSCACHE_COOKIE_PENDING_FILL 3 /* T if pending initial fill on object */ +#define FSCACHE_COOKIE_FILLING 4 /* T if filling object incrementally */ +#define FSCACHE_COOKIE_UNAVAILABLE 5 /* T if cookie is unavailable (error, etc) */ +}; + +extern struct fscache_cookie fscache_fsdef_index; + +/* + * on-disk cache file or index handle + */ +struct fscache_object { + enum fscache_object_state { + FSCACHE_OBJECT_INIT, /* object in initial unbound state */ + FSCACHE_OBJECT_LOOKING_UP, /* looking up object */ + FSCACHE_OBJECT_CREATING, /* creating object */ + + /* active states */ + FSCACHE_OBJECT_AVAILABLE, /* cleaning up object after creation */ + FSCACHE_OBJECT_ACTIVE, /* object is usable */ + FSCACHE_OBJECT_UPDATING, /* object is updating */ + + /* terminal states */ + FSCACHE_OBJECT_DYING, /* object waiting for accessors to finish */ + FSCACHE_OBJECT_LC_DYING, /* object cleaning up after lookup/create */ + FSCACHE_OBJECT_ABORT_INIT, /* abort the init state */ + FSCACHE_OBJECT_RELEASING, /* releasing object */ + FSCACHE_OBJECT_RECYCLING, /* retiring object */ + FSCACHE_OBJECT_WITHDRAWING, /* withdrawing object */ + FSCACHE_OBJECT_DEAD, /* object is now dead */ + } state; + + int debug_id; /* debugging ID */ + int n_children; /* number of child objects */ + int n_ops; /* number of 
ops outstanding on object */ + int n_obj_ops; /* number of object ops outstanding on object */ + int n_in_progress; /* number of ops in progress */ + int n_exclusive; /* number of exclusive ops queued */ + spinlock_t lock; /* state and operations lock */ + + unsigned long lookup_jif; /* time at which lookup started */ + unsigned long event_mask; /* events this object is interested in */ + unsigned long events; /* events to be processed by this object + * (order is important - using fls) */ +#define FSCACHE_OBJECT_EV_REQUEUE 0 /* T if object should be requeued */ +#define FSCACHE_OBJECT_EV_UPDATE 1 /* T if object should be updated */ +#define FSCACHE_OBJECT_EV_CLEARED 2 /* T if accessors all gone */ +#define FSCACHE_OBJECT_EV_ERROR 3 /* T if fatal error occurred during processing */ +#define FSCACHE_OBJECT_EV_RELEASE 4 /* T if netfs requested object release */ +#define FSCACHE_OBJECT_EV_RETIRE 5 /* T if netfs requested object retirement */ +#define FSCACHE_OBJECT_EV_WITHDRAW 6 /* T if cache requested object withdrawal */ + + unsigned long flags; +#define FSCACHE_OBJECT_LOCK 0 /* T if object is busy being processed */ +#define FSCACHE_OBJECT_PENDING_WRITE 1 /* T if object has pending write */ +#define FSCACHE_OBJECT_WAITING 2 /* T if object is waiting on its parent */ + + struct list_head cache_link; /* link in cache->object_list */ + struct hlist_node cookie_link; /* link in cookie->backing_objects */ + struct fscache_cache *cache; /* cache that supplied this object */ + struct fscache_cookie *cookie; /* netfs's file/index object */ + struct fscache_object *parent; /* parent object */ + struct slow_work work; /* attention scheduling record */ + struct list_head dependents; /* FIFO of dependent objects */ + struct list_head dep_link; /* link in parent's dependents list */ + struct list_head pending_ops; /* unstarted operations on this object */ + pgoff_t store_limit; /* current storage limit */ +}; + +extern const char *fscache_object_states[]; + +#define fscache_object_is_active(obj) \ + (!test_bit(FSCACHE_IOERROR, &(obj)->cache->flags) && \ + (obj)->state >= FSCACHE_OBJECT_AVAILABLE && \ + (obj)->state < FSCACHE_OBJECT_DYING) + +extern const struct slow_work_ops fscache_object_slow_work_ops; + +/** + * fscache_object_init - Initialise a cache object description + * @object: Object description + * + * Initialise a cache object description to its basic values. + * + * See Documentation/filesystems/caching/backend-api.txt for a complete + * description. 
+ */ +static inline +void fscache_object_init(struct fscache_object *object, + struct fscache_cookie *cookie, + struct fscache_cache *cache) +{ + atomic_inc(&cache->object_count); + + object->state = FSCACHE_OBJECT_INIT; + spin_lock_init(&object->lock); + INIT_LIST_HEAD(&object->cache_link); + INIT_HLIST_NODE(&object->cookie_link); + vslow_work_init(&object->work, &fscache_object_slow_work_ops); + INIT_LIST_HEAD(&object->dependents); + INIT_LIST_HEAD(&object->dep_link); + INIT_LIST_HEAD(&object->pending_ops); + object->n_children = 0; + object->n_ops = object->n_in_progress = object->n_exclusive = 0; + object->events = object->event_mask = 0; + object->flags = 0; + object->store_limit = 0; + object->cache = cache; + object->cookie = cookie; + object->parent = NULL; +} + +extern void fscache_object_lookup_negative(struct fscache_object *object); +extern void fscache_obtained_object(struct fscache_object *object); + +/** + * fscache_object_destroyed - Note destruction of an object in a cache + * @cache: The cache from which the object came + * + * Note the destruction and deallocation of an object record in a cache. + */ +static inline void fscache_object_destroyed(struct fscache_cache *cache) +{ + if (atomic_dec_and_test(&cache->object_count)) + wake_up_all(&fscache_cache_cleared_wq); +} + +/** + * fscache_object_lookup_error - Note an object encountered an error + * @object: The object on which the error was encountered + * + * Note that an object encountered a fatal error (usually an I/O error) and + * that it should be withdrawn as soon as possible. + */ +static inline void fscache_object_lookup_error(struct fscache_object *object) +{ + set_bit(FSCACHE_OBJECT_EV_ERROR, &object->events); +} + +/** + * fscache_set_store_limit - Set the maximum size to be stored in an object + * @object: The object to set the maximum on + * @i_size: The limit to set in bytes + * + * Set the maximum size an object is permitted to reach, implying the highest + * byte that may be written. Intended to be called by the attr_changed() op. + * + * See Documentation/filesystems/caching/backend-api.txt for a complete + * description. + */ +static inline +void fscache_set_store_limit(struct fscache_object *object, loff_t i_size) +{ + object->store_limit = i_size >> PAGE_SHIFT; + if (i_size & ~PAGE_MASK) + object->store_limit++; +} + +/** + * fscache_end_io - End a retrieval operation on a page + * @op: The FS-Cache operation covering the retrieval + * @page: The page that was to be fetched + * @error: The error code (0 if successful) + * + * Note the end of an operation to retrieve a page, as covered by a particular + * operation record. + */ +static inline void fscache_end_io(struct fscache_retrieval *op, + struct page *page, int error) +{ + op->end_io_func(page, op->context, error); +} + +/* + * out-of-line cache backend functions + */ +extern void fscache_init_cache(struct fscache_cache *cache, + const struct fscache_cache_ops *ops, + const char *idfmt, + ...) 
__attribute__ ((format (printf, 3, 4))); + +extern int fscache_add_cache(struct fscache_cache *cache, + struct fscache_object *fsdef, + const char *tagname); +extern void fscache_withdraw_cache(struct fscache_cache *cache); + +extern void fscache_io_error(struct fscache_cache *cache); + +extern void fscache_mark_pages_cached(struct fscache_retrieval *op, + struct pagevec *pagevec); + +extern enum fscache_checkaux fscache_check_aux(struct fscache_object *object, + const void *data, + uint16_t datalen); + +#endif /* _LINUX_FSCACHE_CACHE_H */ -- cgit v1.2.3 From 7394daa8c61dfda4baa687f133748fa0b599b017 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 3 Apr 2009 16:42:37 +0100 Subject: FS-Cache: Add use of /proc and presentation of statistics Make FS-Cache create its /proc interface and present various statistical information through it. Also provide the functions for updating this information. These features are enabled by: CONFIG_FSCACHE_PROC CONFIG_FSCACHE_STATS CONFIG_FSCACHE_HISTOGRAM The /proc directory for FS-Cache is also exported so that caching modules can add their own statistics there too. The FS-Cache module is loadable at this point, and the statistics files can be examined by userspace: cat /proc/fs/fscache/stats cat /proc/fs/fscache/histogram Signed-off-by: David Howells Acked-by: Steve Dickson Acked-by: Trond Myklebust Acked-by: Al Viro Tested-by: Daire Byrne --- Documentation/filesystems/caching/backend-api.txt | 6 - Documentation/filesystems/caching/fscache.txt | 12 +- fs/fscache/Kconfig | 34 ++++ fs/fscache/Makefile | 4 + fs/fscache/histogram.c | 109 +++++++++++ fs/fscache/internal.h | 127 +++++++++++++ fs/fscache/main.c | 7 + fs/fscache/proc.c | 68 +++++++ fs/fscache/stats.c | 212 ++++++++++++++++++++++ include/linux/fscache-cache.h | 4 - 10 files changed, 566 insertions(+), 17 deletions(-) create mode 100644 fs/fscache/histogram.c create mode 100644 fs/fscache/proc.c create mode 100644 fs/fscache/stats.c (limited to 'include/linux') diff --git a/Documentation/filesystems/caching/backend-api.txt b/Documentation/filesystems/caching/backend-api.txt index 17723053aa91..382d52cdaf2d 100644 --- a/Documentation/filesystems/caching/backend-api.txt +++ b/Documentation/filesystems/caching/backend-api.txt @@ -100,12 +100,6 @@ A sysfs directory called /sys/fs/fscache// is created if CONFIG_SYSFS is enabled. This is accessible through the kobject struct fscache_cache::kobj and is for use by the cache as it sees fit. -The cache driver may create itself a directory named for the cache type in the -/proc/fs/fscache/ directory. This is available if CONFIG_FSCACHE_PROC is -enabled and is accessible through: - - struct proc_dir_entry *proc_fscache; - ======================== RELEVANT DATA STRUCTURES diff --git a/Documentation/filesystems/caching/fscache.txt b/Documentation/filesystems/caching/fscache.txt index a759d916273e..0a751f3c2c70 100644 --- a/Documentation/filesystems/caching/fscache.txt +++ b/Documentation/filesystems/caching/fscache.txt @@ -195,7 +195,6 @@ STATISTICAL INFORMATION If FS-Cache is compiled with the following options enabled: - CONFIG_FSCACHE_PROC=y (implied by the following two) CONFIG_FSCACHE_STATS=y CONFIG_FSCACHE_HISTOGRAM=y @@ -275,7 +274,7 @@ proc files. 
(*) /proc/fs/fscache/histogram cat /proc/fs/fscache/histogram - +HZ +TIME OBJ INST OP RUNS OBJ RUNS RETRV DLY RETRIEVLS + JIFS SECS OBJ INST OP RUNS OBJ RUNS RETRV DLY RETRIEVLS ===== ===== ========= ========= ========= ========= ========= This shows the breakdown of the number of times each amount of time @@ -291,16 +290,16 @@ proc files. RETRIEVLS Time between beginning and end of a retrieval Each row shows the number of events that took a particular range of times. - Each step is 1 jiffy in size. The +HZ column indicates the particular - jiffy range covered, and the +TIME field the equivalent number of seconds. + Each step is 1 jiffy in size. The JIFS column indicates the particular + jiffy range covered, and the SECS field the equivalent number of seconds. ========= DEBUGGING ========= -The FS-Cache facility can have runtime debugging enabled by adjusting the value -in: +If CONFIG_FSCACHE_DEBUG is enabled, the FS-Cache facility can have runtime +debugging enabled by adjusting the value in: /sys/module/fscache/parameters/debug @@ -327,4 +326,3 @@ the control file. For example: echo $((1|8|64)) >/sys/module/fscache/parameters/debug will turn on all function entry debugging. - diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig index 7c7bccd5eee4..9bbb8ce7bea0 100644 --- a/fs/fscache/Kconfig +++ b/fs/fscache/Kconfig @@ -11,6 +11,40 @@ config FSCACHE See Documentation/filesystems/caching/fscache.txt for more information. +config FSCACHE_STATS + bool "Gather statistical information on local caching" + depends on FSCACHE && PROC_FS + help + This option causes statistical information to be gathered on local + caching and exported through file: + + /proc/fs/fscache/stats + + The gathering of statistics adds a certain amount of overhead to + execution as there are a quite a few stats gathered, and on a + multi-CPU system these may be on cachelines that keep bouncing + between CPUs. On the other hand, the stats are very useful for + debugging purposes. Saying 'Y' here is recommended. + + See Documentation/filesystems/caching/fscache.txt for more information. + +config FSCACHE_HISTOGRAM + bool "Gather latency information on local caching" + depends on FSCACHE && PROC_FS + help + This option causes latency information to be gathered on local + caching and exported through file: + + /proc/fs/fscache/histogram + + The generation of this histogram adds a certain amount of overhead to + execution as there are a number of points at which data is gathered, + and on a multi-CPU system these may be on cachelines that keep + bouncing between CPUs. On the other hand, the histogram may be + useful for debugging purposes. Saying 'N' here is recommended. + + See Documentation/filesystems/caching/fscache.txt for more information. + config FSCACHE_DEBUG bool "Debug FS-Cache" depends on FSCACHE diff --git a/fs/fscache/Makefile b/fs/fscache/Makefile index f8038b83e0ef..1384823a160c 100644 --- a/fs/fscache/Makefile +++ b/fs/fscache/Makefile @@ -5,4 +5,8 @@ fscache-y := \ main.o +fscache-$(CONFIG_PROC_FS) += proc.o +fscache-$(CONFIG_FSCACHE_STATS) += stats.o +fscache-$(CONFIG_FSCACHE_HISTOGRAM) += histogram.o + obj-$(CONFIG_FSCACHE) := fscache.o diff --git a/fs/fscache/histogram.c b/fs/fscache/histogram.c new file mode 100644 index 000000000000..bad496748a59 --- /dev/null +++ b/fs/fscache/histogram.c @@ -0,0 +1,109 @@ +/* FS-Cache latency histogram + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#define FSCACHE_DEBUG_LEVEL THREAD +#include +#include +#include +#include "internal.h" + +atomic_t fscache_obj_instantiate_histogram[HZ]; +atomic_t fscache_objs_histogram[HZ]; +atomic_t fscache_ops_histogram[HZ]; +atomic_t fscache_retrieval_delay_histogram[HZ]; +atomic_t fscache_retrieval_histogram[HZ]; + +/* + * display the time-taken histogram + */ +static int fscache_histogram_show(struct seq_file *m, void *v) +{ + unsigned long index; + unsigned n[5], t; + + switch ((unsigned long) v) { + case 1: + seq_puts(m, "JIFS SECS OBJ INST OP RUNS OBJ RUNS " + " RETRV DLY RETRIEVLS\n"); + return 0; + case 2: + seq_puts(m, "===== ===== ========= ========= =========" + " ========= =========\n"); + return 0; + default: + index = (unsigned long) v - 3; + n[0] = atomic_read(&fscache_obj_instantiate_histogram[index]); + n[1] = atomic_read(&fscache_ops_histogram[index]); + n[2] = atomic_read(&fscache_objs_histogram[index]); + n[3] = atomic_read(&fscache_retrieval_delay_histogram[index]); + n[4] = atomic_read(&fscache_retrieval_histogram[index]); + if (!(n[0] | n[1] | n[2] | n[3] | n[4])) + return 0; + + t = (index * 1000) / HZ; + + seq_printf(m, "%4lu 0.%03u %9u %9u %9u %9u %9u\n", + index, t, n[0], n[1], n[2], n[3], n[4]); + return 0; + } +} + +/* + * set up the iterator to start reading from the first line + */ +static void *fscache_histogram_start(struct seq_file *m, loff_t *_pos) +{ + if ((unsigned long long)*_pos >= HZ + 2) + return NULL; + if (*_pos == 0) + *_pos = 1; + return (void *)(unsigned long) *_pos; +} + +/* + * move to the next line + */ +static void *fscache_histogram_next(struct seq_file *m, void *v, loff_t *pos) +{ + (*pos)++; + return (unsigned long long)*pos > HZ + 2 ? 
+ NULL : (void *)(unsigned long) *pos; +} + +/* + * clean up after reading + */ +static void fscache_histogram_stop(struct seq_file *m, void *v) +{ +} + +static const struct seq_operations fscache_histogram_ops = { + .start = fscache_histogram_start, + .stop = fscache_histogram_stop, + .next = fscache_histogram_next, + .show = fscache_histogram_show, +}; + +/* + * open "/proc/fs/fscache/histogram" to provide latency data + */ +static int fscache_histogram_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &fscache_histogram_ops); +} + +const struct file_operations fscache_histogram_fops = { + .owner = THIS_MODULE, + .open = fscache_histogram_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index 95dc92da7152..16f9f1f46e4d 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -27,6 +27,30 @@ #define FSCACHE_MIN_THREADS 4 #define FSCACHE_MAX_THREADS 32 +/* + * fsc-histogram.c + */ +#ifdef CONFIG_FSCACHE_HISTOGRAM +extern atomic_t fscache_obj_instantiate_histogram[HZ]; +extern atomic_t fscache_objs_histogram[HZ]; +extern atomic_t fscache_ops_histogram[HZ]; +extern atomic_t fscache_retrieval_delay_histogram[HZ]; +extern atomic_t fscache_retrieval_histogram[HZ]; + +static inline void fscache_hist(atomic_t histogram[], unsigned long start_jif) +{ + unsigned long jif = jiffies - start_jif; + if (jif >= HZ) + jif = HZ - 1; + atomic_inc(&histogram[jif]); +} + +extern const struct file_operations fscache_histogram_fops; + +#else +#define fscache_hist(hist, start_jif) do {} while (0) +#endif + /* * fsc-main.c */ @@ -35,6 +59,109 @@ extern unsigned fscache_defer_create; extern unsigned fscache_debug; extern struct kobject *fscache_root; +/* + * fsc-proc.c + */ +#ifdef CONFIG_PROC_FS +extern int __init fscache_proc_init(void); +extern void fscache_proc_cleanup(void); +#else +#define fscache_proc_init() (0) +#define fscache_proc_cleanup() do {} while (0) +#endif + +/* + * fsc-stats.c + */ +#ifdef CONFIG_FSCACHE_STATS +extern atomic_t fscache_n_ops_processed[FSCACHE_MAX_THREADS]; +extern atomic_t fscache_n_objs_processed[FSCACHE_MAX_THREADS]; + +extern atomic_t fscache_n_op_pend; +extern atomic_t fscache_n_op_run; +extern atomic_t fscache_n_op_enqueue; +extern atomic_t fscache_n_op_deferred_release; +extern atomic_t fscache_n_op_release; +extern atomic_t fscache_n_op_gc; + +extern atomic_t fscache_n_attr_changed; +extern atomic_t fscache_n_attr_changed_ok; +extern atomic_t fscache_n_attr_changed_nobufs; +extern atomic_t fscache_n_attr_changed_nomem; +extern atomic_t fscache_n_attr_changed_calls; + +extern atomic_t fscache_n_allocs; +extern atomic_t fscache_n_allocs_ok; +extern atomic_t fscache_n_allocs_wait; +extern atomic_t fscache_n_allocs_nobufs; +extern atomic_t fscache_n_alloc_ops; +extern atomic_t fscache_n_alloc_op_waits; + +extern atomic_t fscache_n_retrievals; +extern atomic_t fscache_n_retrievals_ok; +extern atomic_t fscache_n_retrievals_wait; +extern atomic_t fscache_n_retrievals_nodata; +extern atomic_t fscache_n_retrievals_nobufs; +extern atomic_t fscache_n_retrievals_intr; +extern atomic_t fscache_n_retrievals_nomem; +extern atomic_t fscache_n_retrieval_ops; +extern atomic_t fscache_n_retrieval_op_waits; + +extern atomic_t fscache_n_stores; +extern atomic_t fscache_n_stores_ok; +extern atomic_t fscache_n_stores_again; +extern atomic_t fscache_n_stores_nobufs; +extern atomic_t fscache_n_stores_oom; +extern atomic_t fscache_n_store_ops; +extern atomic_t 
fscache_n_store_calls; + +extern atomic_t fscache_n_marks; +extern atomic_t fscache_n_uncaches; + +extern atomic_t fscache_n_acquires; +extern atomic_t fscache_n_acquires_null; +extern atomic_t fscache_n_acquires_no_cache; +extern atomic_t fscache_n_acquires_ok; +extern atomic_t fscache_n_acquires_nobufs; +extern atomic_t fscache_n_acquires_oom; + +extern atomic_t fscache_n_updates; +extern atomic_t fscache_n_updates_null; +extern atomic_t fscache_n_updates_run; + +extern atomic_t fscache_n_relinquishes; +extern atomic_t fscache_n_relinquishes_null; +extern atomic_t fscache_n_relinquishes_waitcrt; + +extern atomic_t fscache_n_cookie_index; +extern atomic_t fscache_n_cookie_data; +extern atomic_t fscache_n_cookie_special; + +extern atomic_t fscache_n_object_alloc; +extern atomic_t fscache_n_object_no_alloc; +extern atomic_t fscache_n_object_lookups; +extern atomic_t fscache_n_object_lookups_negative; +extern atomic_t fscache_n_object_lookups_positive; +extern atomic_t fscache_n_object_created; +extern atomic_t fscache_n_object_avail; +extern atomic_t fscache_n_object_dead; + +extern atomic_t fscache_n_checkaux_none; +extern atomic_t fscache_n_checkaux_okay; +extern atomic_t fscache_n_checkaux_update; +extern atomic_t fscache_n_checkaux_obsolete; + +static inline void fscache_stat(atomic_t *stat) +{ + atomic_inc(stat); +} + +extern const struct file_operations fscache_stats_fops; +#else + +#define fscache_stat(stat) do {} while (0) +#endif + /*****************************************************************************/ /* * debug tracing diff --git a/fs/fscache/main.c b/fs/fscache/main.c index 76f7c69079c0..7c734b7fb18e 100644 --- a/fs/fscache/main.c +++ b/fs/fscache/main.c @@ -52,9 +52,15 @@ static int __init fscache_init(void) if (ret < 0) goto error_slow_work; + ret = fscache_proc_init(); + if (ret < 0) + goto error_proc; + printk(KERN_NOTICE "FS-Cache: Loaded\n"); return 0; +error_proc: + slow_work_unregister_user(); error_slow_work: return ret; } @@ -68,6 +74,7 @@ static void __exit fscache_exit(void) { _enter(""); + fscache_proc_cleanup(); slow_work_unregister_user(); printk(KERN_NOTICE "FS-Cache: Unloaded\n"); } diff --git a/fs/fscache/proc.c b/fs/fscache/proc.c new file mode 100644 index 000000000000..beeab44bc31a --- /dev/null +++ b/fs/fscache/proc.c @@ -0,0 +1,68 @@ +/* FS-Cache statistics viewing interface + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#define FSCACHE_DEBUG_LEVEL OPERATION +#include +#include +#include +#include "internal.h" + +/* + * initialise the /proc/fs/fscache/ directory + */ +int __init fscache_proc_init(void) +{ + _enter(""); + + if (!proc_mkdir("fs/fscache", NULL)) + goto error_dir; + +#ifdef CONFIG_FSCACHE_STATS + if (!proc_create("fs/fscache/stats", S_IFREG | 0444, NULL, + &fscache_stats_fops)) + goto error_stats; +#endif + +#ifdef CONFIG_FSCACHE_HISTOGRAM + if (!proc_create("fs/fscache/histogram", S_IFREG | 0444, NULL, + &fscache_histogram_fops)) + goto error_histogram; +#endif + + _leave(" = 0"); + return 0; + +#ifdef CONFIG_FSCACHE_HISTOGRAM +error_histogram: +#endif +#ifdef CONFIG_FSCACHE_STATS + remove_proc_entry("fs/fscache/stats", NULL); +error_stats: +#endif + remove_proc_entry("fs/fscache", NULL); +error_dir: + _leave(" = -ENOMEM"); + return -ENOMEM; +} + +/* + * clean up the /proc/fs/fscache/ directory + */ +void fscache_proc_cleanup(void) +{ +#ifdef CONFIG_FSCACHE_HISTOGRAM + remove_proc_entry("fs/fscache/histogram", NULL); +#endif +#ifdef CONFIG_FSCACHE_STATS + remove_proc_entry("fs/fscache/stats", NULL); +#endif + remove_proc_entry("fs/fscache", NULL); +} diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c new file mode 100644 index 000000000000..65deb99e756b --- /dev/null +++ b/fs/fscache/stats.c @@ -0,0 +1,212 @@ +/* FS-Cache statistics + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define FSCACHE_DEBUG_LEVEL THREAD +#include +#include +#include +#include "internal.h" + +/* + * operation counters + */ +atomic_t fscache_n_op_pend; +atomic_t fscache_n_op_run; +atomic_t fscache_n_op_enqueue; +atomic_t fscache_n_op_requeue; +atomic_t fscache_n_op_deferred_release; +atomic_t fscache_n_op_release; +atomic_t fscache_n_op_gc; + +atomic_t fscache_n_attr_changed; +atomic_t fscache_n_attr_changed_ok; +atomic_t fscache_n_attr_changed_nobufs; +atomic_t fscache_n_attr_changed_nomem; +atomic_t fscache_n_attr_changed_calls; + +atomic_t fscache_n_allocs; +atomic_t fscache_n_allocs_ok; +atomic_t fscache_n_allocs_wait; +atomic_t fscache_n_allocs_nobufs; +atomic_t fscache_n_alloc_ops; +atomic_t fscache_n_alloc_op_waits; + +atomic_t fscache_n_retrievals; +atomic_t fscache_n_retrievals_ok; +atomic_t fscache_n_retrievals_wait; +atomic_t fscache_n_retrievals_nodata; +atomic_t fscache_n_retrievals_nobufs; +atomic_t fscache_n_retrievals_intr; +atomic_t fscache_n_retrievals_nomem; +atomic_t fscache_n_retrieval_ops; +atomic_t fscache_n_retrieval_op_waits; + +atomic_t fscache_n_stores; +atomic_t fscache_n_stores_ok; +atomic_t fscache_n_stores_again; +atomic_t fscache_n_stores_nobufs; +atomic_t fscache_n_stores_oom; +atomic_t fscache_n_store_ops; +atomic_t fscache_n_store_calls; + +atomic_t fscache_n_marks; +atomic_t fscache_n_uncaches; + +atomic_t fscache_n_acquires; +atomic_t fscache_n_acquires_null; +atomic_t fscache_n_acquires_no_cache; +atomic_t fscache_n_acquires_ok; +atomic_t fscache_n_acquires_nobufs; +atomic_t fscache_n_acquires_oom; + +atomic_t fscache_n_updates; +atomic_t fscache_n_updates_null; +atomic_t fscache_n_updates_run; + +atomic_t fscache_n_relinquishes; +atomic_t fscache_n_relinquishes_null; +atomic_t fscache_n_relinquishes_waitcrt; + +atomic_t 
fscache_n_cookie_index; +atomic_t fscache_n_cookie_data; +atomic_t fscache_n_cookie_special; + +atomic_t fscache_n_object_alloc; +atomic_t fscache_n_object_no_alloc; +atomic_t fscache_n_object_lookups; +atomic_t fscache_n_object_lookups_negative; +atomic_t fscache_n_object_lookups_positive; +atomic_t fscache_n_object_created; +atomic_t fscache_n_object_avail; +atomic_t fscache_n_object_dead; + +atomic_t fscache_n_checkaux_none; +atomic_t fscache_n_checkaux_okay; +atomic_t fscache_n_checkaux_update; +atomic_t fscache_n_checkaux_obsolete; + +/* + * display the general statistics + */ +static int fscache_stats_show(struct seq_file *m, void *v) +{ + seq_puts(m, "FS-Cache statistics\n"); + + seq_printf(m, "Cookies: idx=%u dat=%u spc=%u\n", + atomic_read(&fscache_n_cookie_index), + atomic_read(&fscache_n_cookie_data), + atomic_read(&fscache_n_cookie_special)); + + seq_printf(m, "Objects: alc=%u nal=%u avl=%u ded=%u\n", + atomic_read(&fscache_n_object_alloc), + atomic_read(&fscache_n_object_no_alloc), + atomic_read(&fscache_n_object_avail), + atomic_read(&fscache_n_object_dead)); + seq_printf(m, "ChkAux : non=%u ok=%u upd=%u obs=%u\n", + atomic_read(&fscache_n_checkaux_none), + atomic_read(&fscache_n_checkaux_okay), + atomic_read(&fscache_n_checkaux_update), + atomic_read(&fscache_n_checkaux_obsolete)); + + seq_printf(m, "Pages : mrk=%u unc=%u\n", + atomic_read(&fscache_n_marks), + atomic_read(&fscache_n_uncaches)); + + seq_printf(m, "Acquire: n=%u nul=%u noc=%u ok=%u nbf=%u" + " oom=%u\n", + atomic_read(&fscache_n_acquires), + atomic_read(&fscache_n_acquires_null), + atomic_read(&fscache_n_acquires_no_cache), + atomic_read(&fscache_n_acquires_ok), + atomic_read(&fscache_n_acquires_nobufs), + atomic_read(&fscache_n_acquires_oom)); + + seq_printf(m, "Lookups: n=%u neg=%u pos=%u crt=%u\n", + atomic_read(&fscache_n_object_lookups), + atomic_read(&fscache_n_object_lookups_negative), + atomic_read(&fscache_n_object_lookups_positive), + atomic_read(&fscache_n_object_created)); + + seq_printf(m, "Updates: n=%u nul=%u run=%u\n", + atomic_read(&fscache_n_updates), + atomic_read(&fscache_n_updates_null), + atomic_read(&fscache_n_updates_run)); + + seq_printf(m, "Relinqs: n=%u nul=%u wcr=%u\n", + atomic_read(&fscache_n_relinquishes), + atomic_read(&fscache_n_relinquishes_null), + atomic_read(&fscache_n_relinquishes_waitcrt)); + + seq_printf(m, "AttrChg: n=%u ok=%u nbf=%u oom=%u run=%u\n", + atomic_read(&fscache_n_attr_changed), + atomic_read(&fscache_n_attr_changed_ok), + atomic_read(&fscache_n_attr_changed_nobufs), + atomic_read(&fscache_n_attr_changed_nomem), + atomic_read(&fscache_n_attr_changed_calls)); + + seq_printf(m, "Allocs : n=%u ok=%u wt=%u nbf=%u\n", + atomic_read(&fscache_n_allocs), + atomic_read(&fscache_n_allocs_ok), + atomic_read(&fscache_n_allocs_wait), + atomic_read(&fscache_n_allocs_nobufs)); + seq_printf(m, "Allocs : ops=%u owt=%u\n", + atomic_read(&fscache_n_alloc_ops), + atomic_read(&fscache_n_alloc_op_waits)); + + seq_printf(m, "Retrvls: n=%u ok=%u wt=%u nod=%u nbf=%u" + " int=%u oom=%u\n", + atomic_read(&fscache_n_retrievals), + atomic_read(&fscache_n_retrievals_ok), + atomic_read(&fscache_n_retrievals_wait), + atomic_read(&fscache_n_retrievals_nodata), + atomic_read(&fscache_n_retrievals_nobufs), + atomic_read(&fscache_n_retrievals_intr), + atomic_read(&fscache_n_retrievals_nomem)); + seq_printf(m, "Retrvls: ops=%u owt=%u\n", + atomic_read(&fscache_n_retrieval_ops), + atomic_read(&fscache_n_retrieval_op_waits)); + + seq_printf(m, "Stores : n=%u ok=%u agn=%u nbf=%u oom=%u\n", + 
atomic_read(&fscache_n_stores), + atomic_read(&fscache_n_stores_ok), + atomic_read(&fscache_n_stores_again), + atomic_read(&fscache_n_stores_nobufs), + atomic_read(&fscache_n_stores_oom)); + seq_printf(m, "Stores : ops=%u run=%u\n", + atomic_read(&fscache_n_store_ops), + atomic_read(&fscache_n_store_calls)); + + seq_printf(m, "Ops : pend=%u run=%u enq=%u\n", + atomic_read(&fscache_n_op_pend), + atomic_read(&fscache_n_op_run), + atomic_read(&fscache_n_op_enqueue)); + seq_printf(m, "Ops : dfr=%u rel=%u gc=%u\n", + atomic_read(&fscache_n_op_deferred_release), + atomic_read(&fscache_n_op_release), + atomic_read(&fscache_n_op_gc)); + return 0; +} + +/* + * open "/proc/fs/fscache/stats" allowing provision of a statistical summary + */ +static int fscache_stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, fscache_stats_show, NULL); +} + +const struct file_operations fscache_stats_fops = { + .owner = THIS_MODULE, + .open = fscache_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index b2a9a484c4cf..84d3532dd3ea 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -29,10 +29,6 @@ struct fscache_cache_ops; struct fscache_object; struct fscache_operation; -#ifdef CONFIG_FSCACHE_PROC -extern struct proc_dir_entry *proc_fscache; -#endif - /* * cache tag definition */ -- cgit v1.2.3 From 0e04d4cefcf4d8fbbdb2c50e93ad541582933fd2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 3 Apr 2009 16:42:37 +0100 Subject: FS-Cache: Add cache tag handling Implement two features of FS-Cache: (1) The ability to request and release cache tags - names by which a cache may be known to a netfs, and thus selected for use. (2) An internal function by which a cache is selected by consulting the netfs, if the netfs wishes to be consulted. Signed-off-by: David Howells Acked-by: Steve Dickson Acked-by: Trond Myklebust Acked-by: Al Viro Tested-by: Daire Byrne --- fs/fscache/Makefile | 1 + fs/fscache/cache.c | 166 ++++++++++++++++++++++++++++++++++++++++++++++++ fs/fscache/internal.h | 20 ++++++ include/linux/fscache.h | 9 ++- 4 files changed, 195 insertions(+), 1 deletion(-) create mode 100644 fs/fscache/cache.c (limited to 'include/linux') diff --git a/fs/fscache/Makefile b/fs/fscache/Makefile index bc1f3b9d811a..556708bb9796 100644 --- a/fs/fscache/Makefile +++ b/fs/fscache/Makefile @@ -3,6 +3,7 @@ # fscache-y := \ + cache.o \ fsdef.o \ main.o diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c new file mode 100644 index 000000000000..1a28df36dd93 --- /dev/null +++ b/fs/fscache/cache.c @@ -0,0 +1,166 @@ +/* FS-Cache cache handling + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#define FSCACHE_DEBUG_LEVEL CACHE +#include +#include +#include "internal.h" + +LIST_HEAD(fscache_cache_list); +DECLARE_RWSEM(fscache_addremove_sem); + +static LIST_HEAD(fscache_cache_tag_list); + +/* + * look up a cache tag + */ +struct fscache_cache_tag *__fscache_lookup_cache_tag(const char *name) +{ + struct fscache_cache_tag *tag, *xtag; + + /* firstly check for the existence of the tag under read lock */ + down_read(&fscache_addremove_sem); + + list_for_each_entry(tag, &fscache_cache_tag_list, link) { + if (strcmp(tag->name, name) == 0) { + atomic_inc(&tag->usage); + up_read(&fscache_addremove_sem); + return tag; + } + } + + up_read(&fscache_addremove_sem); + + /* the tag does not exist - create a candidate */ + xtag = kzalloc(sizeof(*xtag) + strlen(name) + 1, GFP_KERNEL); + if (!xtag) + /* return a dummy tag if out of memory */ + return ERR_PTR(-ENOMEM); + + atomic_set(&xtag->usage, 1); + strcpy(xtag->name, name); + + /* write lock, search again and add if still not present */ + down_write(&fscache_addremove_sem); + + list_for_each_entry(tag, &fscache_cache_tag_list, link) { + if (strcmp(tag->name, name) == 0) { + atomic_inc(&tag->usage); + up_write(&fscache_addremove_sem); + kfree(xtag); + return tag; + } + } + + list_add_tail(&xtag->link, &fscache_cache_tag_list); + up_write(&fscache_addremove_sem); + return xtag; +} + +/* + * release a reference to a cache tag + */ +void __fscache_release_cache_tag(struct fscache_cache_tag *tag) +{ + if (tag != ERR_PTR(-ENOMEM)) { + down_write(&fscache_addremove_sem); + + if (atomic_dec_and_test(&tag->usage)) + list_del_init(&tag->link); + else + tag = NULL; + + up_write(&fscache_addremove_sem); + + kfree(tag); + } +} + +/* + * select a cache in which to store an object + * - the cache addremove semaphore must be at least read-locked by the caller + * - the object will never be an index + */ +struct fscache_cache *fscache_select_cache_for_object( + struct fscache_cookie *cookie) +{ + struct fscache_cache_tag *tag; + struct fscache_object *object; + struct fscache_cache *cache; + + _enter(""); + + if (list_empty(&fscache_cache_list)) { + _leave(" = NULL [no cache]"); + return NULL; + } + + /* we check the parent to determine the cache to use */ + spin_lock(&cookie->lock); + + /* the first in the parent's backing list should be the preferred + * cache */ + if (!hlist_empty(&cookie->backing_objects)) { + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + + cache = object->cache; + if (object->state >= FSCACHE_OBJECT_DYING || + test_bit(FSCACHE_IOERROR, &cache->flags)) + cache = NULL; + + spin_unlock(&cookie->lock); + _leave(" = %p [parent]", cache); + return cache; + } + + /* the parent is unbacked */ + if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) { + /* cookie not an index and is unbacked */ + spin_unlock(&cookie->lock); + _leave(" = NULL [cookie ub,ni]"); + return NULL; + } + + spin_unlock(&cookie->lock); + + if (!cookie->def->select_cache) + goto no_preference; + + /* ask the netfs for its preference */ + tag = cookie->def->select_cache(cookie->parent->netfs_data, + cookie->netfs_data); + if (!tag) + goto no_preference; + + if (tag == ERR_PTR(-ENOMEM)) { + _leave(" = NULL [nomem tag]"); + return NULL; + } + + if (!tag->cache) { + _leave(" = NULL [unbacked tag]"); + return NULL; + } + + if (test_bit(FSCACHE_IOERROR, &tag->cache->flags)) + return NULL; + + _leave(" = %p [specific]", tag->cache); + return tag->cache; + +no_preference: + /* netfs has no preference - just select first cache */ + 
cache = list_entry(fscache_cache_list.next, + struct fscache_cache, link); + _leave(" = %p [first]", cache); + return cache; +} diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index 4113af8d1660..0a2069afa417 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -27,6 +27,15 @@ #define FSCACHE_MIN_THREADS 4 #define FSCACHE_MAX_THREADS 32 +/* + * fsc-cache.c + */ +extern struct list_head fscache_cache_list; +extern struct rw_semaphore fscache_addremove_sem; + +extern struct fscache_cache *fscache_select_cache_for_object( + struct fscache_cookie *); + /* * fsc-fsdef.c */ @@ -168,6 +177,17 @@ extern const struct file_operations fscache_stats_fops; #define fscache_stat(stat) do {} while (0) #endif +/* + * raise an event on an object + * - if the event is not masked for that object, then the object is + * queued for attention by the thread pool. + */ +static inline void fscache_raise_event(struct fscache_object *object, + unsigned event) +{ + BUG(); // TODO +} + /*****************************************************************************/ /* * debug tracing diff --git a/include/linux/fscache.h b/include/linux/fscache.h index feb3b0e0af4d..9584c094d69f 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -173,6 +173,8 @@ struct fscache_netfs { * - these are undefined symbols when FS-Cache is not configured and the * optimiser takes care of not using them */ +extern struct fscache_cache_tag *__fscache_lookup_cache_tag(const char *); +extern void __fscache_release_cache_tag(struct fscache_cache_tag *); /** * fscache_register_netfs - Register a filesystem as desiring caching services @@ -218,7 +220,10 @@ void fscache_unregister_netfs(struct fscache_netfs *netfs) static inline struct fscache_cache_tag *fscache_lookup_cache_tag(const char *name) { - return NULL; + if (fscache_available()) + return __fscache_lookup_cache_tag(name); + else + return NULL; } /** @@ -233,6 +238,8 @@ struct fscache_cache_tag *fscache_lookup_cache_tag(const char *name) static inline void fscache_release_cache_tag(struct fscache_cache_tag *tag) { + if (fscache_available()) + __fscache_release_cache_tag(tag); } /** -- cgit v1.2.3 From 726dd7ff10c217dd74329c94643dc8ebea27334b Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 3 Apr 2009 16:42:38 +0100 Subject: FS-Cache: Add netfs registration Add functions to register and unregister a network filesystem or other client of the FS-Cache service. This allocates and releases the cookie representing the top-level index for a netfs, and makes it available to the netfs. If the FS-Cache facility is disabled, then the calls are optimised away at compile time. Note that whilst this patch may appear to work with FS-Cache enabled and a netfs attempting to use it, it will leak the cookie it allocates for the netfs as fscache_relinquish_cookie() is implemented in a later patch. This will cause the slab code to emit a warning when the module is removed. 
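For illustration only, a client filesystem would typically declare a struct fscache_netfs and call the registration pair added by this patch from its module init and exit paths. This is a sketch under assumed names ("examplefs" is not part of this series), with error handling reduced to the bare minimum:

#include <linux/init.h>
#include <linux/module.h>
#include <linux/fscache.h>

/* Hypothetical netfs description; .name and .version key the netfs's
 * primary index within the FS-Cache top-level index. */
static struct fscache_netfs examplefs_cache_netfs = {
	.name		= "examplefs",
	.version	= 0,
};

static int __init examplefs_init(void)
{
	/* Allocates examplefs_cache_netfs.primary_index; collapses to
	 * "return 0" when FS-Cache is not available. */
	return fscache_register_netfs(&examplefs_cache_netfs);
}

static void __exit examplefs_exit(void)
{
	/* All cookies obtained under the primary index must already have
	 * been relinquished at this point. */
	fscache_unregister_netfs(&examplefs_cache_netfs);
}

module_init(examplefs_init);
module_exit(examplefs_exit);
MODULE_LICENSE("GPL");
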
Signed-off-by: David Howells Acked-by: Steve Dickson Acked-by: Trond Myklebust Acked-by: Al Viro Tested-by: Daire Byrne --- fs/fscache/Makefile | 3 +- fs/fscache/netfs.c | 103 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/fscache.h | 9 ++++- 3 files changed, 113 insertions(+), 2 deletions(-) create mode 100644 fs/fscache/netfs.c (limited to 'include/linux') diff --git a/fs/fscache/Makefile b/fs/fscache/Makefile index f88ac1764ce3..ecf6946eaeb3 100644 --- a/fs/fscache/Makefile +++ b/fs/fscache/Makefile @@ -6,7 +6,8 @@ fscache-y := \ cache.o \ cookie.o \ fsdef.o \ - main.o + main.o \ + netfs.o fscache-$(CONFIG_PROC_FS) += proc.o fscache-$(CONFIG_FSCACHE_STATS) += stats.o diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c new file mode 100644 index 000000000000..e028b8eb1c40 --- /dev/null +++ b/fs/fscache/netfs.c @@ -0,0 +1,103 @@ +/* FS-Cache netfs (client) registration + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#define FSCACHE_DEBUG_LEVEL COOKIE +#include +#include +#include "internal.h" + +static LIST_HEAD(fscache_netfs_list); + +/* + * register a network filesystem for caching + */ +int __fscache_register_netfs(struct fscache_netfs *netfs) +{ + struct fscache_netfs *ptr; + int ret; + + _enter("{%s}", netfs->name); + + INIT_LIST_HEAD(&netfs->link); + + /* allocate a cookie for the primary index */ + netfs->primary_index = + kmem_cache_zalloc(fscache_cookie_jar, GFP_KERNEL); + + if (!netfs->primary_index) { + _leave(" = -ENOMEM"); + return -ENOMEM; + } + + /* initialise the primary index cookie */ + atomic_set(&netfs->primary_index->usage, 1); + atomic_set(&netfs->primary_index->n_children, 0); + + netfs->primary_index->def = &fscache_fsdef_netfs_def; + netfs->primary_index->parent = &fscache_fsdef_index; + netfs->primary_index->netfs_data = netfs; + + atomic_inc(&netfs->primary_index->parent->usage); + atomic_inc(&netfs->primary_index->parent->n_children); + + spin_lock_init(&netfs->primary_index->lock); + INIT_HLIST_HEAD(&netfs->primary_index->backing_objects); + + /* check the netfs type is not already present */ + down_write(&fscache_addremove_sem); + + ret = -EEXIST; + list_for_each_entry(ptr, &fscache_netfs_list, link) { + if (strcmp(ptr->name, netfs->name) == 0) + goto already_registered; + } + + list_add(&netfs->link, &fscache_netfs_list); + ret = 0; + + printk(KERN_NOTICE "FS-Cache: Netfs '%s' registered for caching\n", + netfs->name); + +already_registered: + up_write(&fscache_addremove_sem); + + if (ret < 0) { + netfs->primary_index->parent = NULL; + __fscache_cookie_put(netfs->primary_index); + netfs->primary_index = NULL; + } + + _leave(" = %d", ret); + return ret; +} +EXPORT_SYMBOL(__fscache_register_netfs); + +/* + * unregister a network filesystem from the cache + * - all cookies must have been released first + */ +void __fscache_unregister_netfs(struct fscache_netfs *netfs) +{ + _enter("{%s.%u}", netfs->name, netfs->version); + + down_write(&fscache_addremove_sem); + + list_del(&netfs->link); + fscache_relinquish_cookie(netfs->primary_index, 0); + + up_write(&fscache_addremove_sem); + + printk(KERN_NOTICE "FS-Cache: Netfs '%s' unregistered from caching\n", + netfs->name); + + _leave(""); +} 
+EXPORT_SYMBOL(__fscache_unregister_netfs); diff --git a/include/linux/fscache.h b/include/linux/fscache.h index 9584c094d69f..b195c2e1ef6a 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -173,6 +173,8 @@ struct fscache_netfs { * - these are undefined symbols when FS-Cache is not configured and the * optimiser takes care of not using them */ +extern int __fscache_register_netfs(struct fscache_netfs *); +extern void __fscache_unregister_netfs(struct fscache_netfs *); extern struct fscache_cache_tag *__fscache_lookup_cache_tag(const char *); extern void __fscache_release_cache_tag(struct fscache_cache_tag *); @@ -188,7 +190,10 @@ extern void __fscache_release_cache_tag(struct fscache_cache_tag *); static inline int fscache_register_netfs(struct fscache_netfs *netfs) { - return 0; + if (fscache_available()) + return __fscache_register_netfs(netfs); + else + return 0; } /** @@ -205,6 +210,8 @@ int fscache_register_netfs(struct fscache_netfs *netfs) static inline void fscache_unregister_netfs(struct fscache_netfs *netfs) { + if (fscache_available()) + __fscache_unregister_netfs(netfs); } /** -- cgit v1.2.3 From ccc4fc3d11e91477036d1f82bfa2d442f6ce77f0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 3 Apr 2009 16:42:38 +0100 Subject: FS-Cache: Implement the cookie management part of the netfs API Implement the cookie management part of the FS-Cache netfs client API. The documentation and API header file were added in a previous patch. This patch implements the following three functions: (1) fscache_acquire_cookie(). Acquire a cookie to represent an object to the netfs. If the object in question is a non-index object, then that object and its parent indices will be created on disk at this point if they don't already exist. Index creation is deferred because an index may reside in multiple caches. (2) fscache_relinquish_cookie(). Retire or release a cookie previously acquired. At this point, the object on disk may be destroyed. (3) fscache_update_cookie(). Update the in-cache representation of a cookie. This is used to update the auxiliary data for coherency management purposes. With this patch it is possible to have a netfs instruct a cache backend to look up, validate and create metadata on disk and to destroy it again. The ability to actually store and retrieve data in the objects so created is added in later patches. Note that these functions will never return an error. _All_ errors are handled internally to FS-Cache. The worst that can happen is that fscache_acquire_cookie() may return a NULL pointer - which is considered a negative cookie pointer and can be passed back to any function that takes a cookie without harm. A negative cookie pointer merely suppresses caching at that level. The stub in linux/fscache.h will detect inline the negative cookie pointer and abort the operation as fast as possible. This means that the compiler doesn't have to set up for a call in that case. See the documentation in Documentation/filesystems/caching/netfs-api.txt for more information. 
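As a rough usage sketch (the examplefs_* names and per-inode layout are assumptions, not code from this series), a netfs might acquire a cookie per inode under its primary index, update it when coherency data changes, and relinquish it when the inode is dropped. Errors never surface to the caller, since a NULL (negative) cookie merely disables caching:

#include <linux/fscache.h>

/* Assumed netfs-side objects: the registered netfs description and a
 * cookie definition for data files (its get_key/get_attr/get_aux ops
 * describe the file to the cache). */
extern struct fscache_netfs examplefs_cache_netfs;
extern const struct fscache_cookie_def examplefs_file_cookie_def;

struct examplefs_inode {
	struct fscache_cookie	*fscache;	/* assumed per-inode cookie slot */
	/* ... other netfs-private state ... */
};

static void examplefs_enable_caching(struct examplefs_inode *ei)
{
	/* NULL on failure or when caching is unavailable - a negative
	 * cookie that later calls quietly ignore. */
	ei->fscache = fscache_acquire_cookie(examplefs_cache_netfs.primary_index,
					     &examplefs_file_cookie_def,
					     ei);	/* netfs_data passed to the def's ops */
}

static void examplefs_auxdata_changed(struct examplefs_inode *ei)
{
	/* Ask the cache to rewrite the index entry (coherency data). */
	fscache_update_cookie(ei->fscache);
}

static void examplefs_disable_caching(struct examplefs_inode *ei, int retire)
{
	/* retire != 0 marks the on-disk object for recycling rather than
	 * keeping it for later reuse. */
	fscache_relinquish_cookie(ei->fscache, retire);
	ei->fscache = NULL;
}
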
Signed-off-by: David Howells Acked-by: Steve Dickson Acked-by: Trond Myklebust Acked-by: Al Viro Tested-by: Daire Byrne --- fs/fscache/cookie.c | 444 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/fscache.h | 16 +- 2 files changed, 459 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 47fd75b832e1..72fd18f6c71f 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -7,6 +7,9 @@ * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. + * + * See Documentation/filesystems/caching/netfs-api.txt for more information on + * the netfs API. */ #define FSCACHE_DEBUG_LEVEL COOKIE @@ -16,6 +19,14 @@ struct kmem_cache *fscache_cookie_jar; +static atomic_t fscache_object_debug_id = ATOMIC_INIT(0); + +static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie); +static int fscache_alloc_object(struct fscache_cache *cache, + struct fscache_cookie *cookie); +static int fscache_attach_object(struct fscache_cookie *cookie, + struct fscache_object *object); + /* * initialise an cookie jar slab element prior to any use */ @@ -28,6 +39,439 @@ void fscache_cookie_init_once(void *_cookie) INIT_HLIST_HEAD(&cookie->backing_objects); } +/* + * request a cookie to represent an object (index, datafile, xattr, etc) + * - parent specifies the parent object + * - the top level index cookie for each netfs is stored in the fscache_netfs + * struct upon registration + * - def points to the definition + * - the netfs_data will be passed to the functions pointed to in *def + * - all attached caches will be searched to see if they contain this object + * - index objects aren't stored on disk until there's a dependent file that + * needs storing + * - other objects are stored in a selected cache immediately, and all the + * indices forming the path to it are instantiated if necessary + * - we never let on to the netfs about errors + * - we may set a negative cookie pointer, but that's okay + */ +struct fscache_cookie *__fscache_acquire_cookie( + struct fscache_cookie *parent, + const struct fscache_cookie_def *def, + void *netfs_data) +{ + struct fscache_cookie *cookie; + + BUG_ON(!def); + + _enter("{%s},{%s},%p", + parent ? 
(char *) parent->def->name : "", + def->name, netfs_data); + + fscache_stat(&fscache_n_acquires); + + /* if there's no parent cookie, then we don't create one here either */ + if (!parent) { + fscache_stat(&fscache_n_acquires_null); + _leave(" [no parent]"); + return NULL; + } + + /* validate the definition */ + BUG_ON(!def->get_key); + BUG_ON(!def->name[0]); + + BUG_ON(def->type == FSCACHE_COOKIE_TYPE_INDEX && + parent->def->type != FSCACHE_COOKIE_TYPE_INDEX); + + /* allocate and initialise a cookie */ + cookie = kmem_cache_alloc(fscache_cookie_jar, GFP_KERNEL); + if (!cookie) { + fscache_stat(&fscache_n_acquires_oom); + _leave(" [ENOMEM]"); + return NULL; + } + + atomic_set(&cookie->usage, 1); + atomic_set(&cookie->n_children, 0); + + atomic_inc(&parent->usage); + atomic_inc(&parent->n_children); + + cookie->def = def; + cookie->parent = parent; + cookie->netfs_data = netfs_data; + cookie->flags = 0; + + INIT_RADIX_TREE(&cookie->stores, GFP_NOFS); + + switch (cookie->def->type) { + case FSCACHE_COOKIE_TYPE_INDEX: + fscache_stat(&fscache_n_cookie_index); + break; + case FSCACHE_COOKIE_TYPE_DATAFILE: + fscache_stat(&fscache_n_cookie_data); + break; + default: + fscache_stat(&fscache_n_cookie_special); + break; + } + + /* if the object is an index then we need do nothing more here - we + * create indices on disk when we need them as an index may exist in + * multiple caches */ + if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) { + if (fscache_acquire_non_index_cookie(cookie) < 0) { + atomic_dec(&parent->n_children); + __fscache_cookie_put(cookie); + fscache_stat(&fscache_n_acquires_nobufs); + _leave(" = NULL"); + return NULL; + } + } + + fscache_stat(&fscache_n_acquires_ok); + _leave(" = %p", cookie); + return cookie; +} +EXPORT_SYMBOL(__fscache_acquire_cookie); + +/* + * acquire a non-index cookie + * - this must make sure the index chain is instantiated and instantiate the + * object representation too + */ +static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie) +{ + struct fscache_object *object; + struct fscache_cache *cache; + uint64_t i_size; + int ret; + + _enter(""); + + cookie->flags = 1 << FSCACHE_COOKIE_UNAVAILABLE; + + /* now we need to see whether the backing objects for this cookie yet + * exist, if not there'll be nothing to search */ + down_read(&fscache_addremove_sem); + + if (list_empty(&fscache_cache_list)) { + up_read(&fscache_addremove_sem); + _leave(" = 0 [no caches]"); + return 0; + } + + /* select a cache in which to store the object */ + cache = fscache_select_cache_for_object(cookie->parent); + if (!cache) { + up_read(&fscache_addremove_sem); + fscache_stat(&fscache_n_acquires_no_cache); + _leave(" = -ENOMEDIUM [no cache]"); + return -ENOMEDIUM; + } + + _debug("cache %s", cache->tag->name); + + cookie->flags = + (1 << FSCACHE_COOKIE_LOOKING_UP) | + (1 << FSCACHE_COOKIE_CREATING) | + (1 << FSCACHE_COOKIE_NO_DATA_YET); + + /* ask the cache to allocate objects for this cookie and its parent + * chain */ + ret = fscache_alloc_object(cache, cookie); + if (ret < 0) { + up_read(&fscache_addremove_sem); + _leave(" = %d", ret); + return ret; + } + + /* pass on how big the object we're caching is supposed to be */ + cookie->def->get_attr(cookie->netfs_data, &i_size); + + spin_lock(&cookie->lock); + if (hlist_empty(&cookie->backing_objects)) { + spin_unlock(&cookie->lock); + goto unavailable; + } + + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + + fscache_set_store_limit(object, i_size); + + /* initiate the 
process of looking up all the objects in the chain + * (done by fscache_initialise_object()) */ + fscache_enqueue_object(object); + + spin_unlock(&cookie->lock); + + /* we may be required to wait for lookup to complete at this point */ + if (!fscache_defer_lookup) { + _debug("non-deferred lookup %p", &cookie->flags); + wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP, + fscache_wait_bit, TASK_UNINTERRUPTIBLE); + _debug("complete"); + if (test_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags)) + goto unavailable; + } + + up_read(&fscache_addremove_sem); + _leave(" = 0 [deferred]"); + return 0; + +unavailable: + up_read(&fscache_addremove_sem); + _leave(" = -ENOBUFS"); + return -ENOBUFS; +} + +/* + * recursively allocate cache object records for a cookie/cache combination + * - caller must be holding the addremove sem + */ +static int fscache_alloc_object(struct fscache_cache *cache, + struct fscache_cookie *cookie) +{ + struct fscache_object *object; + struct hlist_node *_n; + int ret; + + _enter("%p,%p{%s}", cache, cookie, cookie->def->name); + + spin_lock(&cookie->lock); + hlist_for_each_entry(object, _n, &cookie->backing_objects, + cookie_link) { + if (object->cache == cache) + goto object_already_extant; + } + spin_unlock(&cookie->lock); + + /* ask the cache to allocate an object (we may end up with duplicate + * objects at this stage, but we sort that out later) */ + object = cache->ops->alloc_object(cache, cookie); + if (IS_ERR(object)) { + fscache_stat(&fscache_n_object_no_alloc); + ret = PTR_ERR(object); + goto error; + } + + fscache_stat(&fscache_n_object_alloc); + + object->debug_id = atomic_inc_return(&fscache_object_debug_id); + + _debug("ALLOC OBJ%x: %s {%lx}", + object->debug_id, cookie->def->name, object->events); + + ret = fscache_alloc_object(cache, cookie->parent); + if (ret < 0) + goto error_put; + + /* only attach if we managed to allocate all we needed, otherwise + * discard the object we just allocated and instead use the one + * attached to the cookie */ + if (fscache_attach_object(cookie, object) < 0) + cache->ops->put_object(object); + + _leave(" = 0"); + return 0; + +object_already_extant: + ret = -ENOBUFS; + if (object->state >= FSCACHE_OBJECT_DYING) { + spin_unlock(&cookie->lock); + goto error; + } + spin_unlock(&cookie->lock); + _leave(" = 0 [found]"); + return 0; + +error_put: + cache->ops->put_object(object); +error: + _leave(" = %d", ret); + return ret; +} + +/* + * attach a cache object to a cookie + */ +static int fscache_attach_object(struct fscache_cookie *cookie, + struct fscache_object *object) +{ + struct fscache_object *p; + struct fscache_cache *cache = object->cache; + struct hlist_node *_n; + int ret; + + _enter("{%s},{OBJ%x}", cookie->def->name, object->debug_id); + + spin_lock(&cookie->lock); + + /* there may be multiple initial creations of this object, but we only + * want one */ + ret = -EEXIST; + hlist_for_each_entry(p, _n, &cookie->backing_objects, cookie_link) { + if (p->cache == object->cache) { + if (p->state >= FSCACHE_OBJECT_DYING) + ret = -ENOBUFS; + goto cant_attach_object; + } + } + + /* pin the parent object */ + spin_lock_nested(&cookie->parent->lock, 1); + hlist_for_each_entry(p, _n, &cookie->parent->backing_objects, + cookie_link) { + if (p->cache == object->cache) { + if (p->state >= FSCACHE_OBJECT_DYING) { + ret = -ENOBUFS; + spin_unlock(&cookie->parent->lock); + goto cant_attach_object; + } + object->parent = p; + spin_lock(&p->lock); + p->n_children++; + spin_unlock(&p->lock); + break; + } + } + 
spin_unlock(&cookie->parent->lock); + + /* attach to the cache's object list */ + if (list_empty(&object->cache_link)) { + spin_lock(&cache->object_list_lock); + list_add(&object->cache_link, &cache->object_list); + spin_unlock(&cache->object_list_lock); + } + + /* attach to the cookie */ + object->cookie = cookie; + atomic_inc(&cookie->usage); + hlist_add_head(&object->cookie_link, &cookie->backing_objects); + ret = 0; + +cant_attach_object: + spin_unlock(&cookie->lock); + _leave(" = %d", ret); + return ret; +} + +/* + * update the index entries backing a cookie + */ +void __fscache_update_cookie(struct fscache_cookie *cookie) +{ + struct fscache_object *object; + struct hlist_node *_p; + + fscache_stat(&fscache_n_updates); + + if (!cookie) { + fscache_stat(&fscache_n_updates_null); + _leave(" [no cookie]"); + return; + } + + _enter("{%s}", cookie->def->name); + + BUG_ON(!cookie->def->get_aux); + + spin_lock(&cookie->lock); + + /* update the index entry on disk in each cache backing this cookie */ + hlist_for_each_entry(object, _p, + &cookie->backing_objects, cookie_link) { + fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE); + } + + spin_unlock(&cookie->lock); + _leave(""); +} +EXPORT_SYMBOL(__fscache_update_cookie); + +/* + * release a cookie back to the cache + * - the object will be marked as recyclable on disk if retire is true + * - all dependents of this cookie must have already been unregistered + * (indices/files/pages) + */ +void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire) +{ + struct fscache_cache *cache; + struct fscache_object *object; + unsigned long event; + + fscache_stat(&fscache_n_relinquishes); + + if (!cookie) { + fscache_stat(&fscache_n_relinquishes_null); + _leave(" [no cookie]"); + return; + } + + _enter("%p{%s,%p},%d", + cookie, cookie->def->name, cookie->netfs_data, retire); + + if (atomic_read(&cookie->n_children) != 0) { + printk(KERN_ERR "FS-Cache: Cookie '%s' still has children\n", + cookie->def->name); + BUG(); + } + + /* wait for the cookie to finish being instantiated (or to fail) */ + if (test_bit(FSCACHE_COOKIE_CREATING, &cookie->flags)) { + fscache_stat(&fscache_n_relinquishes_waitcrt); + wait_on_bit(&cookie->flags, FSCACHE_COOKIE_CREATING, + fscache_wait_bit, TASK_UNINTERRUPTIBLE); + } + + event = retire ? 
FSCACHE_OBJECT_EV_RETIRE : FSCACHE_OBJECT_EV_RELEASE; + + /* detach pointers back to the netfs */ + spin_lock(&cookie->lock); + + cookie->netfs_data = NULL; + cookie->def = NULL; + + /* break links with all the active objects */ + while (!hlist_empty(&cookie->backing_objects)) { + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, + cookie_link); + + _debug("RELEASE OBJ%x", object->debug_id); + + /* detach each cache object from the object cookie */ + spin_lock(&object->lock); + hlist_del_init(&object->cookie_link); + + cache = object->cache; + object->cookie = NULL; + fscache_raise_event(object, event); + spin_unlock(&object->lock); + + if (atomic_dec_and_test(&cookie->usage)) + /* the cookie refcount shouldn't be reduced to 0 yet */ + BUG(); + } + + spin_unlock(&cookie->lock); + + if (cookie->parent) { + ASSERTCMP(atomic_read(&cookie->parent->usage), >, 0); + ASSERTCMP(atomic_read(&cookie->parent->n_children), >, 0); + atomic_dec(&cookie->parent->n_children); + } + + /* finally dispose of the cookie */ + ASSERTCMP(atomic_read(&cookie->usage), >, 0); + fscache_cookie_put(cookie); + + _leave(""); +} +EXPORT_SYMBOL(__fscache_relinquish_cookie); + /* * destroy a cookie */ diff --git a/include/linux/fscache.h b/include/linux/fscache.h index b195c2e1ef6a..245b48646efa 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -178,6 +178,13 @@ extern void __fscache_unregister_netfs(struct fscache_netfs *); extern struct fscache_cache_tag *__fscache_lookup_cache_tag(const char *); extern void __fscache_release_cache_tag(struct fscache_cache_tag *); +extern struct fscache_cookie *__fscache_acquire_cookie( + struct fscache_cookie *, + const struct fscache_cookie_def *, + void *); +extern void __fscache_relinquish_cookie(struct fscache_cookie *, int); +extern void __fscache_update_cookie(struct fscache_cookie *); + /** * fscache_register_netfs - Register a filesystem as desiring caching services * @netfs: The description of the filesystem @@ -269,7 +276,10 @@ struct fscache_cookie *fscache_acquire_cookie( const struct fscache_cookie_def *def, void *netfs_data) { - return NULL; + if (fscache_cookie_valid(parent)) + return __fscache_acquire_cookie(parent, def, netfs_data); + else + return NULL; } /** @@ -287,6 +297,8 @@ struct fscache_cookie *fscache_acquire_cookie( static inline void fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire) { + if (fscache_cookie_valid(cookie)) + __fscache_relinquish_cookie(cookie, retire); } /** @@ -302,6 +314,8 @@ void fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire) static inline void fscache_update_cookie(struct fscache_cookie *cookie) { + if (fscache_cookie_valid(cookie)) + __fscache_update_cookie(cookie); } /** -- cgit v1.2.3 From b510882281d56873e1194021643b7c325336f84f Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 3 Apr 2009 16:42:39 +0100 Subject: FS-Cache: Implement data I/O part of netfs API Implement the data I/O part of the FS-Cache netfs API. The documentation and API header file were added in a previous patch. This patch implements the following functions for the netfs to call: (*) fscache_attr_changed(). Indicate that the object has changed its attributes. The only attribute currently recorded is the file size. Only pages within the set file size will be stored in the cache. This operation is submitted for asynchronous processing, and will return immediately. 
It will return -ENOMEM if an out of memory error is encountered, -ENOBUFS if the object is not actually cached, or 0 if the operation is successfully queued. (*) fscache_read_or_alloc_page(). (*) fscache_read_or_alloc_pages(). Request data be fetched from the disk, and allocate internal metadata to track the netfs pages and reserve disk space for unknown pages. These operations perform semi-asynchronous data reads. Upon returning they will indicate which pages they think can be retrieved from disk, and will have set in progress attempts to retrieve those pages. These will return, in order of preference, -ENOMEM on memory allocation error, -ERESTARTSYS if a signal interrupted proceedings, -ENODATA if one or more requested pages are not yet cached, -ENOBUFS if the object is not actually cached or if there isn't space for future pages to be cached on this object, or 0 if successful. In the case of the multipage function, the pages for which reads are set in progress will be removed from the list and the page count decreased appropriately. If any read operations should fail, the completion function will be given an error, and will also be passed contextual information to allow the netfs to fall back to querying the server for the absent pages. For each successful read, the page completion function will also be called. Any pages subsequently tracked by the cache will have PG_fscache set upon them on return. fscache_uncache_page() must be called for such pages. If supplied by the netfs, the mark_pages_cached() cookie op will be invoked for any pages now tracked. (*) fscache_alloc_page(). Allocate internal metadata to track a netfs page and reserve disk space. This will return -ENOMEM on memory allocation error, -ERESTARTSYS on signal, -ENOBUFS if the object isn't cached, or there isn't enough space in the cache, or 0 if successful. Any pages subsequently tracked by the cache will have PG_fscache set upon them on return. fscache_uncache_page() must be called for such pages. If supplied by the netfs, the mark_pages_cached() cookie op will be invoked for any pages now tracked. (*) fscache_write_page(). Request data be stored to disk. This may only be called on pages that have been read or alloc'd by the above three functions and have not yet been uncached. This will return -ENOMEM on memory allocation error, -ERESTARTSYS on signal, -ENOBUFS if the object isn't cached, or there isn't immediately enough space in the cache, or 0 if successful. On a successful return, this operation will have queued the page for asynchronous writing to the cache. The page will be returned with PG_fscache_write set until the write completes one way or another. The caller will not be notified if the write fails due to an I/O error. If that happens, the object will become unavailable and all pending writes will be aborted. Note that the cache may batch up page writes, and so it may take a while to get around to writing them out. The caller must assume that until PG_fscache_write is cleared the page is in use by the cache. Any changes made to the page may be reflected on disk. The page may even be under DMA. (*) fscache_uncache_page(). Indicate that the cache should stop tracking a page previously read or alloc'd from the cache. If the page was alloc'd only, but unwritten, it will not appear on disk. 
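For orientation only, a netfs read path might combine these calls roughly as below. The completion callback follows the fscache_rw_complete_t shape described above, and the examplefs_* helpers (including the server fallback) are assumptions rather than code from this series:

#include <linux/fscache.h>
#include <linux/pagemap.h>

/* Assumed netfs fallback that reads the page from the server. */
extern int examplefs_readpage_from_server(struct page *page);

/* Called for each page the cache finishes reading. */
static void examplefs_read_from_cache_complete(struct page *page,
					       void *context, int error)
{
	if (!error)
		SetPageUptodate(page);
	unlock_page(page);
}

/* Try the cache first; any error (-ENODATA, -ENOBUFS, ...) means the
 * netfs must go to the server itself. */
static int examplefs_readpage(struct fscache_cookie *cookie, struct page *page)
{
	int ret;

	ret = fscache_read_or_alloc_page(cookie, page,
					 examplefs_read_from_cache_complete,
					 NULL /* context */, GFP_KERNEL);
	if (ret == 0)
		return 0;	/* read dispatched; callback completes the page */

	return examplefs_readpage_from_server(page);
}

/* After a successful server read, copy the data into the cache; if the
 * store cannot be queued, stop the cache tracking the page. */
static void examplefs_readpage_to_cache(struct fscache_cookie *cookie,
					struct page *page)
{
	if (PageFsCache(page) &&
	    fscache_write_page(cookie, page, GFP_KERNEL) != 0)
		fscache_uncache_page(cookie, page);
}

/* On releasepage/invalidatepage, wait out any store in flight and then
 * uncache the page. */
static void examplefs_release_page(struct fscache_cookie *cookie,
				   struct page *page)
{
	fscache_wait_on_page_write(cookie, page);
	fscache_uncache_page(cookie, page);
}
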
Signed-off-by: David Howells Acked-by: Steve Dickson Acked-by: Trond Myklebust Acked-by: Al Viro Tested-by: Daire Byrne --- fs/fscache/Makefile | 3 +- fs/fscache/internal.h | 21 ++ fs/fscache/page.c | 816 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/fscache.h | 52 ++- 4 files changed, 886 insertions(+), 6 deletions(-) create mode 100644 fs/fscache/page.c (limited to 'include/linux') diff --git a/fs/fscache/Makefile b/fs/fscache/Makefile index 6f82da2aa9d1..91571b95aacc 100644 --- a/fs/fscache/Makefile +++ b/fs/fscache/Makefile @@ -9,7 +9,8 @@ fscache-y := \ main.o \ netfs.o \ object.o \ - operation.o + operation.o \ + page.o fscache-$(CONFIG_PROC_FS) += proc.o fscache-$(CONFIG_FSCACHE_STATS) += stats.o diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index 014a830c8b37..e0cbd16f6dc9 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -229,6 +229,27 @@ static inline void fscache_cookie_put(struct fscache_cookie *cookie) __fscache_cookie_put(cookie); } +/* + * get an extra reference to a netfs retrieval context + */ +static inline +void *fscache_get_context(struct fscache_cookie *cookie, void *context) +{ + if (cookie->def->get_context) + cookie->def->get_context(cookie->netfs_data, context); + return context; +} + +/* + * release a reference to a netfs retrieval context + */ +static inline +void fscache_put_context(struct fscache_cookie *cookie, void *context) +{ + if (cookie->def->put_context) + cookie->def->put_context(cookie->netfs_data, context); +} + /*****************************************************************************/ /* * debug tracing diff --git a/fs/fscache/page.c b/fs/fscache/page.c new file mode 100644 index 000000000000..2568e0eb644f --- /dev/null +++ b/fs/fscache/page.c @@ -0,0 +1,816 @@ +/* Cache page management and data I/O routines + * + * Copyright (C) 2004-2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#define FSCACHE_DEBUG_LEVEL PAGE +#include +#include +#include +#include +#include "internal.h" + +/* + * check to see if a page is being written to the cache + */ +bool __fscache_check_page_write(struct fscache_cookie *cookie, struct page *page) +{ + void *val; + + rcu_read_lock(); + val = radix_tree_lookup(&cookie->stores, page->index); + rcu_read_unlock(); + + return val != NULL; +} +EXPORT_SYMBOL(__fscache_check_page_write); + +/* + * wait for a page to finish being written to the cache + */ +void __fscache_wait_on_page_write(struct fscache_cookie *cookie, struct page *page) +{ + wait_queue_head_t *wq = bit_waitqueue(&cookie->flags, 0); + + wait_event(*wq, !__fscache_check_page_write(cookie, page)); +} +EXPORT_SYMBOL(__fscache_wait_on_page_write); + +/* + * note that a page has finished being written to the cache + */ +static void fscache_end_page_write(struct fscache_cookie *cookie, struct page *page) +{ + struct page *xpage; + + spin_lock(&cookie->lock); + xpage = radix_tree_delete(&cookie->stores, page->index); + spin_unlock(&cookie->lock); + ASSERT(xpage != NULL); + + wake_up_bit(&cookie->flags, 0); +} + +/* + * actually apply the changed attributes to a cache object + */ +static void fscache_attr_changed_op(struct fscache_operation *op) +{ + struct fscache_object *object = op->object; + + _enter("{OBJ%x OP%x}", object->debug_id, op->debug_id); + + fscache_stat(&fscache_n_attr_changed_calls); + + if (fscache_object_is_active(object) && + object->cache->ops->attr_changed(object) < 0) + fscache_abort_object(object); + + _leave(""); +} + +/* + * notification that the attributes on an object have changed + */ +int __fscache_attr_changed(struct fscache_cookie *cookie) +{ + struct fscache_operation *op; + struct fscache_object *object; + + _enter("%p", cookie); + + ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); + + fscache_stat(&fscache_n_attr_changed); + + op = kzalloc(sizeof(*op), GFP_KERNEL); + if (!op) { + fscache_stat(&fscache_n_attr_changed_nomem); + _leave(" = -ENOMEM"); + return -ENOMEM; + } + + fscache_operation_init(op, NULL); + fscache_operation_init_slow(op, fscache_attr_changed_op); + op->flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_EXCLUSIVE); + + spin_lock(&cookie->lock); + + if (hlist_empty(&cookie->backing_objects)) + goto nobufs; + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + + if (fscache_submit_exclusive_op(object, op) < 0) + goto nobufs; + spin_unlock(&cookie->lock); + fscache_stat(&fscache_n_attr_changed_ok); + fscache_put_operation(op); + _leave(" = 0"); + return 0; + +nobufs: + spin_unlock(&cookie->lock); + kfree(op); + fscache_stat(&fscache_n_attr_changed_nobufs); + _leave(" = %d", -ENOBUFS); + return -ENOBUFS; +} +EXPORT_SYMBOL(__fscache_attr_changed); + +/* + * handle secondary execution given to a retrieval op on behalf of the + * cache + */ +static void fscache_retrieval_work(struct work_struct *work) +{ + struct fscache_retrieval *op = + container_of(work, struct fscache_retrieval, op.fast_work); + unsigned long start; + + _enter("{OP%x}", op->op.debug_id); + + start = jiffies; + op->op.processor(&op->op); + fscache_hist(fscache_ops_histogram, start); + fscache_put_operation(&op->op); +} + +/* + * release a retrieval op reference + */ +static void fscache_release_retrieval_op(struct fscache_operation *_op) +{ + struct fscache_retrieval *op = + container_of(_op, struct fscache_retrieval, op); + + _enter("{OP%x}", op->op.debug_id); + + fscache_hist(fscache_retrieval_histogram, op->start_time); 
+ if (op->context) + fscache_put_context(op->op.object->cookie, op->context); + + _leave(""); +} + +/* + * allocate a retrieval op + */ +static struct fscache_retrieval *fscache_alloc_retrieval( + struct address_space *mapping, + fscache_rw_complete_t end_io_func, + void *context) +{ + struct fscache_retrieval *op; + + /* allocate a retrieval operation and attempt to submit it */ + op = kzalloc(sizeof(*op), GFP_NOIO); + if (!op) { + fscache_stat(&fscache_n_retrievals_nomem); + return NULL; + } + + fscache_operation_init(&op->op, fscache_release_retrieval_op); + op->op.flags = FSCACHE_OP_MYTHREAD | (1 << FSCACHE_OP_WAITING); + op->mapping = mapping; + op->end_io_func = end_io_func; + op->context = context; + op->start_time = jiffies; + INIT_WORK(&op->op.fast_work, fscache_retrieval_work); + INIT_LIST_HEAD(&op->to_do); + return op; +} + +/* + * wait for a deferred lookup to complete + */ +static int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie) +{ + unsigned long jif; + + _enter(""); + + if (!test_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags)) { + _leave(" = 0 [imm]"); + return 0; + } + + fscache_stat(&fscache_n_retrievals_wait); + + jif = jiffies; + if (wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP, + fscache_wait_bit_interruptible, + TASK_INTERRUPTIBLE) != 0) { + fscache_stat(&fscache_n_retrievals_intr); + _leave(" = -ERESTARTSYS"); + return -ERESTARTSYS; + } + + ASSERT(!test_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags)); + + smp_rmb(); + fscache_hist(fscache_retrieval_delay_histogram, jif); + _leave(" = 0 [dly]"); + return 0; +} + +/* + * read a page from the cache or allocate a block in which to store it + * - we return: + * -ENOMEM - out of memory, nothing done + * -ERESTARTSYS - interrupted + * -ENOBUFS - no backing object available in which to cache the block + * -ENODATA - no data available in the backing object for this block + * 0 - dispatched a read - it'll call end_io_func() when finished + */ +int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, + struct page *page, + fscache_rw_complete_t end_io_func, + void *context, + gfp_t gfp) +{ + struct fscache_retrieval *op; + struct fscache_object *object; + int ret; + + _enter("%p,%p,,,", cookie, page); + + fscache_stat(&fscache_n_retrievals); + + if (hlist_empty(&cookie->backing_objects)) + goto nobufs; + + ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); + ASSERTCMP(page, !=, NULL); + + if (fscache_wait_for_deferred_lookup(cookie) < 0) + return -ERESTARTSYS; + + op = fscache_alloc_retrieval(page->mapping, end_io_func, context); + if (!op) { + _leave(" = -ENOMEM"); + return -ENOMEM; + } + + spin_lock(&cookie->lock); + + if (hlist_empty(&cookie->backing_objects)) + goto nobufs_unlock; + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + + ASSERTCMP(object->state, >, FSCACHE_OBJECT_LOOKING_UP); + + if (fscache_submit_op(object, &op->op) < 0) + goto nobufs_unlock; + spin_unlock(&cookie->lock); + + fscache_stat(&fscache_n_retrieval_ops); + + /* pin the netfs read context in case we need to do the actual netfs + * read because we've encountered a cache read failure */ + fscache_get_context(object->cookie, op->context); + + /* we wait for the operation to become active, and then process it + * *here*, in this thread, and not in the thread pool */ + if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) { + _debug(">>> WT"); + fscache_stat(&fscache_n_retrieval_op_waits); + wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, + fscache_wait_bit, 
TASK_UNINTERRUPTIBLE); + _debug("<<< GO"); + } + + /* ask the cache to honour the operation */ + if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags)) { + ret = object->cache->ops->allocate_page(op, page, gfp); + if (ret == 0) + ret = -ENODATA; + } else { + ret = object->cache->ops->read_or_alloc_page(op, page, gfp); + } + + if (ret == -ENOMEM) + fscache_stat(&fscache_n_retrievals_nomem); + else if (ret == -ERESTARTSYS) + fscache_stat(&fscache_n_retrievals_intr); + else if (ret == -ENODATA) + fscache_stat(&fscache_n_retrievals_nodata); + else if (ret < 0) + fscache_stat(&fscache_n_retrievals_nobufs); + else + fscache_stat(&fscache_n_retrievals_ok); + + fscache_put_retrieval(op); + _leave(" = %d", ret); + return ret; + +nobufs_unlock: + spin_unlock(&cookie->lock); + kfree(op); +nobufs: + fscache_stat(&fscache_n_retrievals_nobufs); + _leave(" = -ENOBUFS"); + return -ENOBUFS; +} +EXPORT_SYMBOL(__fscache_read_or_alloc_page); + +/* + * read a list of page from the cache or allocate a block in which to store + * them + * - we return: + * -ENOMEM - out of memory, some pages may be being read + * -ERESTARTSYS - interrupted, some pages may be being read + * -ENOBUFS - no backing object or space available in which to cache any + * pages not being read + * -ENODATA - no data available in the backing object for some or all of + * the pages + * 0 - dispatched a read on all pages + * + * end_io_func() will be called for each page read from the cache as it is + * finishes being read + * + * any pages for which a read is dispatched will be removed from pages and + * nr_pages + */ +int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages, + fscache_rw_complete_t end_io_func, + void *context, + gfp_t gfp) +{ + fscache_pages_retrieval_func_t func; + struct fscache_retrieval *op; + struct fscache_object *object; + int ret; + + _enter("%p,,%d,,,", cookie, *nr_pages); + + fscache_stat(&fscache_n_retrievals); + + if (hlist_empty(&cookie->backing_objects)) + goto nobufs; + + ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); + ASSERTCMP(*nr_pages, >, 0); + ASSERT(!list_empty(pages)); + + if (fscache_wait_for_deferred_lookup(cookie) < 0) + return -ERESTARTSYS; + + op = fscache_alloc_retrieval(mapping, end_io_func, context); + if (!op) + return -ENOMEM; + + spin_lock(&cookie->lock); + + if (hlist_empty(&cookie->backing_objects)) + goto nobufs_unlock; + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + + if (fscache_submit_op(object, &op->op) < 0) + goto nobufs_unlock; + spin_unlock(&cookie->lock); + + fscache_stat(&fscache_n_retrieval_ops); + + /* pin the netfs read context in case we need to do the actual netfs + * read because we've encountered a cache read failure */ + fscache_get_context(object->cookie, op->context); + + /* we wait for the operation to become active, and then process it + * *here*, in this thread, and not in the thread pool */ + if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) { + _debug(">>> WT"); + fscache_stat(&fscache_n_retrieval_op_waits); + wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, + fscache_wait_bit, TASK_UNINTERRUPTIBLE); + _debug("<<< GO"); + } + + /* ask the cache to honour the operation */ + if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags)) + func = object->cache->ops->allocate_pages; + else + func = object->cache->ops->read_or_alloc_pages; + ret = func(op, pages, nr_pages, gfp); + + if (ret == -ENOMEM) + 
fscache_stat(&fscache_n_retrievals_nomem); + else if (ret == -ERESTARTSYS) + fscache_stat(&fscache_n_retrievals_intr); + else if (ret == -ENODATA) + fscache_stat(&fscache_n_retrievals_nodata); + else if (ret < 0) + fscache_stat(&fscache_n_retrievals_nobufs); + else + fscache_stat(&fscache_n_retrievals_ok); + + fscache_put_retrieval(op); + _leave(" = %d", ret); + return ret; + +nobufs_unlock: + spin_unlock(&cookie->lock); + kfree(op); +nobufs: + fscache_stat(&fscache_n_retrievals_nobufs); + _leave(" = -ENOBUFS"); + return -ENOBUFS; +} +EXPORT_SYMBOL(__fscache_read_or_alloc_pages); + +/* + * allocate a block in the cache on which to store a page + * - we return: + * -ENOMEM - out of memory, nothing done + * -ERESTARTSYS - interrupted + * -ENOBUFS - no backing object available in which to cache the block + * 0 - block allocated + */ +int __fscache_alloc_page(struct fscache_cookie *cookie, + struct page *page, + gfp_t gfp) +{ + struct fscache_retrieval *op; + struct fscache_object *object; + int ret; + + _enter("%p,%p,,,", cookie, page); + + fscache_stat(&fscache_n_allocs); + + if (hlist_empty(&cookie->backing_objects)) + goto nobufs; + + ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); + ASSERTCMP(page, !=, NULL); + + if (fscache_wait_for_deferred_lookup(cookie) < 0) + return -ERESTARTSYS; + + op = fscache_alloc_retrieval(page->mapping, NULL, NULL); + if (!op) + return -ENOMEM; + + spin_lock(&cookie->lock); + + if (hlist_empty(&cookie->backing_objects)) + goto nobufs_unlock; + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + + if (fscache_submit_op(object, &op->op) < 0) + goto nobufs_unlock; + spin_unlock(&cookie->lock); + + fscache_stat(&fscache_n_alloc_ops); + + if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) { + _debug(">>> WT"); + fscache_stat(&fscache_n_alloc_op_waits); + wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, + fscache_wait_bit, TASK_UNINTERRUPTIBLE); + _debug("<<< GO"); + } + + /* ask the cache to honour the operation */ + ret = object->cache->ops->allocate_page(op, page, gfp); + + if (ret < 0) + fscache_stat(&fscache_n_allocs_nobufs); + else + fscache_stat(&fscache_n_allocs_ok); + + fscache_put_retrieval(op); + _leave(" = %d", ret); + return ret; + +nobufs_unlock: + spin_unlock(&cookie->lock); + kfree(op); +nobufs: + fscache_stat(&fscache_n_allocs_nobufs); + _leave(" = -ENOBUFS"); + return -ENOBUFS; +} +EXPORT_SYMBOL(__fscache_alloc_page); + +/* + * release a write op reference + */ +static void fscache_release_write_op(struct fscache_operation *_op) +{ + _enter("{OP%x}", _op->debug_id); +} + +/* + * perform the background storage of a page into the cache + */ +static void fscache_write_op(struct fscache_operation *_op) +{ + struct fscache_storage *op = + container_of(_op, struct fscache_storage, op); + struct fscache_object *object = op->op.object; + struct fscache_cookie *cookie = object->cookie; + struct page *page; + unsigned n; + void *results[1]; + int ret; + + _enter("{OP%x,%d}", op->op.debug_id, atomic_read(&op->op.usage)); + + spin_lock(&cookie->lock); + spin_lock(&object->lock); + + if (!fscache_object_is_active(object)) { + spin_unlock(&object->lock); + spin_unlock(&cookie->lock); + _leave(""); + return; + } + + fscache_stat(&fscache_n_store_calls); + + /* find a page to store */ + page = NULL; + n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0, 1, + FSCACHE_COOKIE_PENDING_TAG); + if (n != 1) + goto superseded; + page = results[0]; + _debug("gang %d [%lx]", n, page->index); + if (page->index > 
op->store_limit) + goto superseded; + + radix_tree_tag_clear(&cookie->stores, page->index, + FSCACHE_COOKIE_PENDING_TAG); + + spin_unlock(&object->lock); + spin_unlock(&cookie->lock); + + if (page) { + ret = object->cache->ops->write_page(op, page); + fscache_end_page_write(cookie, page); + page_cache_release(page); + if (ret < 0) + fscache_abort_object(object); + else + fscache_enqueue_operation(&op->op); + } + + _leave(""); + return; + +superseded: + /* this writer is going away and there aren't any more things to + * write */ + _debug("cease"); + clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); + spin_unlock(&object->lock); + spin_unlock(&cookie->lock); + _leave(""); +} + +/* + * request a page be stored in the cache + * - returns: + * -ENOMEM - out of memory, nothing done + * -ENOBUFS - no backing object available in which to cache the page + * 0 - dispatched a write - it'll call end_io_func() when finished + * + * if the cookie still has a backing object at this point, that object can be + * in one of a few states with respect to storage processing: + * + * (1) negative lookup, object not yet created (FSCACHE_COOKIE_CREATING is + * set) + * + * (a) no writes yet (set FSCACHE_COOKIE_PENDING_FILL and queue deferred + * fill op) + * + * (b) writes deferred till post-creation (mark page for writing and + * return immediately) + * + * (2) negative lookup, object created, initial fill being made from netfs + * (FSCACHE_COOKIE_INITIAL_FILL is set) + * + * (a) fill point not yet reached this page (mark page for writing and + * return) + * + * (b) fill point passed this page (queue op to store this page) + * + * (3) object extant (queue op to store this page) + * + * any other state is invalid + */ +int __fscache_write_page(struct fscache_cookie *cookie, + struct page *page, + gfp_t gfp) +{ + struct fscache_storage *op; + struct fscache_object *object; + int ret; + + _enter("%p,%x,", cookie, (u32) page->flags); + + ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); + ASSERT(PageFsCache(page)); + + fscache_stat(&fscache_n_stores); + + op = kzalloc(sizeof(*op), GFP_NOIO); + if (!op) + goto nomem; + + fscache_operation_init(&op->op, fscache_release_write_op); + fscache_operation_init_slow(&op->op, fscache_write_op); + op->op.flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_WAITING); + + ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM); + if (ret < 0) + goto nomem_free; + + ret = -ENOBUFS; + spin_lock(&cookie->lock); + + if (hlist_empty(&cookie->backing_objects)) + goto nobufs; + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + if (test_bit(FSCACHE_IOERROR, &object->cache->flags)) + goto nobufs; + + /* add the page to the pending-storage radix tree on the backing + * object */ + spin_lock(&object->lock); + + _debug("store limit %llx", (unsigned long long) object->store_limit); + + ret = radix_tree_insert(&cookie->stores, page->index, page); + if (ret < 0) { + if (ret == -EEXIST) + goto already_queued; + _debug("insert failed %d", ret); + goto nobufs_unlock_obj; + } + + radix_tree_tag_set(&cookie->stores, page->index, + FSCACHE_COOKIE_PENDING_TAG); + page_cache_get(page); + + /* we only want one writer at a time, but we do need to queue new + * writers after exclusive ops */ + if (test_and_set_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags)) + goto already_pending; + + spin_unlock(&object->lock); + + op->op.debug_id = atomic_inc_return(&fscache_op_debug_id); + op->store_limit = object->store_limit; + + if (fscache_submit_op(object, &op->op) < 
0) + goto submit_failed; + + spin_unlock(&cookie->lock); + radix_tree_preload_end(); + fscache_stat(&fscache_n_store_ops); + fscache_stat(&fscache_n_stores_ok); + + /* the slow work queue now carries its own ref on the object */ + fscache_put_operation(&op->op); + _leave(" = 0"); + return 0; + +already_queued: + fscache_stat(&fscache_n_stores_again); +already_pending: + spin_unlock(&object->lock); + spin_unlock(&cookie->lock); + radix_tree_preload_end(); + kfree(op); + fscache_stat(&fscache_n_stores_ok); + _leave(" = 0"); + return 0; + +submit_failed: + radix_tree_delete(&cookie->stores, page->index); + page_cache_release(page); + ret = -ENOBUFS; + goto nobufs; + +nobufs_unlock_obj: + spin_unlock(&object->lock); +nobufs: + spin_unlock(&cookie->lock); + radix_tree_preload_end(); + kfree(op); + fscache_stat(&fscache_n_stores_nobufs); + _leave(" = -ENOBUFS"); + return -ENOBUFS; + +nomem_free: + kfree(op); +nomem: + fscache_stat(&fscache_n_stores_oom); + _leave(" = -ENOMEM"); + return -ENOMEM; +} +EXPORT_SYMBOL(__fscache_write_page); + +/* + * remove a page from the cache + */ +void __fscache_uncache_page(struct fscache_cookie *cookie, struct page *page) +{ + struct fscache_object *object; + + _enter(",%p", page); + + ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); + ASSERTCMP(page, !=, NULL); + + fscache_stat(&fscache_n_uncaches); + + /* cache withdrawal may beat us to it */ + if (!PageFsCache(page)) + goto done; + + /* get the object */ + spin_lock(&cookie->lock); + + if (hlist_empty(&cookie->backing_objects)) { + ClearPageFsCache(page); + goto done_unlock; + } + + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + + /* there might now be stuff on disk we could read */ + clear_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); + + /* only invoke the cache backend if we managed to mark the page + * uncached here; this deals with synchronisation vs withdrawal */ + if (TestClearPageFsCache(page) && + object->cache->ops->uncache_page) { + /* the cache backend releases the cookie lock */ + object->cache->ops->uncache_page(object, page); + goto done; + } + +done_unlock: + spin_unlock(&cookie->lock); +done: + _leave(""); +} +EXPORT_SYMBOL(__fscache_uncache_page); + +/** + * fscache_mark_pages_cached - Mark pages as being cached + * @op: The retrieval op pages are being marked for + * @pagevec: The pages to be marked + * + * Mark a bunch of netfs pages as being cached. After this is called, + * the netfs must call fscache_uncache_page() to remove the mark. 
+ */ +void fscache_mark_pages_cached(struct fscache_retrieval *op, + struct pagevec *pagevec) +{ + struct fscache_cookie *cookie = op->op.object->cookie; + unsigned long loop; + +#ifdef CONFIG_FSCACHE_STATS + atomic_add(pagevec->nr, &fscache_n_marks); +#endif + + for (loop = 0; loop < pagevec->nr; loop++) { + struct page *page = pagevec->pages[loop]; + + _debug("- mark %p{%lx}", page, page->index); + if (TestSetPageFsCache(page)) { + static bool once_only; + if (!once_only) { + once_only = true; + printk(KERN_WARNING "FS-Cache:" + " Cookie type %s marked page %lx" + " multiple times\n", + cookie->def->name, page->index); + } + } + } + + if (cookie->def->mark_pages_cached) + cookie->def->mark_pages_cached(cookie->netfs_data, + op->mapping, pagevec); + pagevec_reinit(pagevec); +} +EXPORT_SYMBOL(fscache_mark_pages_cached); diff --git a/include/linux/fscache.h b/include/linux/fscache.h index 245b48646efa..6d8ee466e0a0 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -184,6 +184,24 @@ extern struct fscache_cookie *__fscache_acquire_cookie( void *); extern void __fscache_relinquish_cookie(struct fscache_cookie *, int); extern void __fscache_update_cookie(struct fscache_cookie *); +extern int __fscache_attr_changed(struct fscache_cookie *); +extern int __fscache_read_or_alloc_page(struct fscache_cookie *, + struct page *, + fscache_rw_complete_t, + void *, + gfp_t); +extern int __fscache_read_or_alloc_pages(struct fscache_cookie *, + struct address_space *, + struct list_head *, + unsigned *, + fscache_rw_complete_t, + void *, + gfp_t); +extern int __fscache_alloc_page(struct fscache_cookie *, struct page *, gfp_t); +extern int __fscache_write_page(struct fscache_cookie *, struct page *, gfp_t); +extern void __fscache_uncache_page(struct fscache_cookie *, struct page *); +extern bool __fscache_check_page_write(struct fscache_cookie *, struct page *); +extern void __fscache_wait_on_page_write(struct fscache_cookie *, struct page *); /** * fscache_register_netfs - Register a filesystem as desiring caching services @@ -361,7 +379,10 @@ void fscache_unpin_cookie(struct fscache_cookie *cookie) static inline int fscache_attr_changed(struct fscache_cookie *cookie) { - return -ENOBUFS; + if (fscache_cookie_valid(cookie)) + return __fscache_attr_changed(cookie); + else + return -ENOBUFS; } /** @@ -418,7 +439,11 @@ int fscache_read_or_alloc_page(struct fscache_cookie *cookie, void *context, gfp_t gfp) { - return -ENOBUFS; + if (fscache_cookie_valid(cookie)) + return __fscache_read_or_alloc_page(cookie, page, end_io_func, + context, gfp); + else + return -ENOBUFS; } /** @@ -464,7 +489,12 @@ int fscache_read_or_alloc_pages(struct fscache_cookie *cookie, void *context, gfp_t gfp) { - return -ENOBUFS; + if (fscache_cookie_valid(cookie)) + return __fscache_read_or_alloc_pages(cookie, mapping, pages, + nr_pages, end_io_func, + context, gfp); + else + return -ENOBUFS; } /** @@ -490,7 +520,10 @@ int fscache_alloc_page(struct fscache_cookie *cookie, struct page *page, gfp_t gfp) { - return -ENOBUFS; + if (fscache_cookie_valid(cookie)) + return __fscache_alloc_page(cookie, page, gfp); + else + return -ENOBUFS; } /** @@ -516,7 +549,10 @@ int fscache_write_page(struct fscache_cookie *cookie, struct page *page, gfp_t gfp) { - return -ENOBUFS; + if (fscache_cookie_valid(cookie)) + return __fscache_write_page(cookie, page, gfp); + else + return -ENOBUFS; } /** @@ -537,6 +573,8 @@ static inline void fscache_uncache_page(struct fscache_cookie *cookie, struct page *page) { + if 
(fscache_cookie_valid(cookie)) + __fscache_uncache_page(cookie, page); } /** @@ -553,6 +591,8 @@ static inline bool fscache_check_page_write(struct fscache_cookie *cookie, struct page *page) { + if (fscache_cookie_valid(cookie)) + return __fscache_check_page_write(cookie, page); return false; } @@ -571,6 +611,8 @@ static inline void fscache_wait_on_page_write(struct fscache_cookie *cookie, struct page *page) { + if (fscache_cookie_valid(cookie)) + __fscache_wait_on_page_write(cookie, page); } #endif /* _LINUX_FSCACHE_H */ -- cgit v1.2.3 From 385e1ca5f21c4680ad6a46a3aa2ea8af99e99c92 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 3 Apr 2009 16:42:39 +0100 Subject: CacheFiles: Permit the page lock state to be monitored Add a function to install a monitor on the page lock waitqueue for a particular page, thus allowing the page being unlocked to be detected. This is used by CacheFiles to detect read completion on a page in the backing filesystem so that it can then copy the data to the waiting netfs page. Signed-off-by: David Howells Acked-by: Steve Dickson Acked-by: Trond Myklebust Acked-by: Rik van Riel Acked-by: Al Viro Tested-by: Daire Byrne --- include/linux/pagemap.h | 5 +++++ mm/filemap.c | 18 ++++++++++++++++++ 2 files changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 076a7dc67c2b..34da5230faab 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -383,6 +383,11 @@ static inline void wait_on_page_writeback(struct page *page) extern void end_page_writeback(struct page *page); +/* + * Add an arbitrary waiter to a page's wait queue + */ +extern void add_page_wait_queue(struct page *page, wait_queue_t *waiter); + /* * Fault a userspace page into pagetables. Return non-zero on a fault. * diff --git a/mm/filemap.c b/mm/filemap.c index cbc5772e7171..fc11974f2bee 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -564,6 +564,24 @@ void wait_on_page_bit(struct page *page, int bit_nr) } EXPORT_SYMBOL(wait_on_page_bit); +/** + * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue + * @page - Page defining the wait queue of interest + * @waiter - Waiter to add to the queue + * + * Add an arbitrary @waiter to the wait queue for the nominated @page. + */ +void add_page_wait_queue(struct page *page, wait_queue_t *waiter) +{ + wait_queue_head_t *q = page_waitqueue(page); + unsigned long flags; + + spin_lock_irqsave(&q->lock, flags); + __add_wait_queue(q, waiter); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL_GPL(add_page_wait_queue); + /** * unlock_page - unlock a locked page * @page: the page -- cgit v1.2.3 From c6a6f19e22da0a3d74214ee010224c9a30a794c1 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 3 Apr 2009 16:42:42 +0100 Subject: NFS: Add FS-Cache option bit and debug bit Add FS-Cache option bit to nfs_server struct. This is set to indicate local on-disk caching is enabled for a particular superblock. Also add debug bit for local caching operations. 
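Purely as an illustration of how the two new bits are meant to be consumed (this helper is invented and not part of the patch): the option bit gates cache use per superblock, and the debug bit selects FS-Cache messages via the usual NFS dfprintk() scheme:

#include <linux/nfs_fs.h>
#include <linux/nfs_fs_sb.h>

/* Hypothetical helper: should this superblock talk to FS-Cache?
 * NFS_OPTION_FSCACHE would be set by mount option parsing (e.g. "fsc"). */
static int examplefs_nfs_caching_wanted(const struct nfs_server *server)
{
	if (!(server->options & NFS_OPTION_FSCACHE))
		return 0;

	/* Printed only when the NFSDBG_FSCACHE bit is set in nfs_debug. */
	dfprintk(FSCACHE, "NFS: local caching enabled for this superblock\n");
	return 1;
}
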
Signed-off-by: David Howells Acked-by: Steve Dickson Acked-by: Trond Myklebust Acked-by: Al Viro Tested-by: Daire Byrne --- include/linux/nfs_fs.h | 1 + include/linux/nfs_fs_sb.h | 2 ++ 2 files changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index bde2557c2a9c..fd3e7f9c6fd3 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -583,6 +583,7 @@ extern void * nfs_root_data(void); #define NFSDBG_CALLBACK 0x0100 #define NFSDBG_CLIENT 0x0200 #define NFSDBG_MOUNT 0x0400 +#define NFSDBG_FSCACHE 0x0800 #define NFSDBG_ALL 0xFFFF #ifdef __KERNEL__ diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 29b1e40dce99..a749f8564aa6 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -96,6 +96,8 @@ struct nfs_server { unsigned int acdirmin; unsigned int acdirmax; unsigned int namelen; + unsigned int options; /* extra options enabled by mount */ +#define NFS_OPTION_FSCACHE 0x00000001 /* - local caching enabled */ struct nfs_fsid fsid; __u64 maxfilesize; /* maximum file size */ -- cgit v1.2.3 From 147272813e043fb44bd112527951da70c1e663de Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 3 Apr 2009 16:42:42 +0100 Subject: NFS: Define and create server-level objects Define and create server-level cache index objects (as managed by nfs_client structs). Each server object is created in the NFS top-level index object and is itself an index into which superblock-level objects are inserted. Ideally there would be one superblock-level object per server, and the former would be folded into the latter; however, since the "nosharecache" option exists this isn't possible. The server object key is a sequence consisting of: (1) NFS version (2) Server address family (eg: AF_INET or AF_INET6) (3) Server port. (4) Server IP address. The key blob is of variable length, depending on the length of (4). The server object is given no coherency data to carry in the auxiliary data permitted by the cache. 
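(For illustration only; not part of the patch. A rough sketch of how the variable key length follows from the layout above: a fixed header covers (1)-(3), and only as much of (4) as the address family requires is appended. The struct mirrors nfs_server_key from the diff below; example_server_key_len() is a hypothetical helper.)

	struct example_server_key {
		uint16_t nfsversion;	/* (1) NFS version */
		uint16_t family;	/* (2) address family */
		uint16_t port;		/* (3) server port */
		union {
			struct in_addr ipv4_addr;	/* (4) IPv4 address... */
			struct in6_addr ipv6_addr;	/*     ...or IPv6 address */
		} addr[0];
	};

	static uint16_t example_server_key_len(sa_family_t family)
	{
		uint16_t len = sizeof(struct example_server_key);

		if (family == AF_INET)
			len += sizeof(struct in_addr);
		else if (family == AF_INET6)
			len += sizeof(struct in6_addr);
		return len;		/* length of the key blob */
	}
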
Signed-off-by: David Howells Acked-by: Steve Dickson Acked-by: Trond Myklebust Acked-by: Al Viro Tested-by: Daire Byrne --- fs/nfs/Makefile | 2 +- fs/nfs/client.c | 5 ++++ fs/nfs/fscache-index.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++ fs/nfs/fscache.c | 52 +++++++++++++++++++++++++++++++++++++ fs/nfs/fscache.h | 10 ++++++++ include/linux/nfs_fs_sb.h | 4 +++ 6 files changed, 137 insertions(+), 1 deletion(-) create mode 100644 fs/nfs/fscache.c (limited to 'include/linux') diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index 0e0bb6c17a86..845159814de2 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -15,4 +15,4 @@ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \ callback.o callback_xdr.o callback_proc.o \ nfs4namespace.o nfs-$(CONFIG_SYSCTL) += sysctl.o -nfs-$(CONFIG_NFS_FSCACHE) += fscache-index.o +nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o diff --git a/fs/nfs/client.c b/fs/nfs/client.c index aba38017bdef..aa04da8748a6 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -45,6 +45,7 @@ #include "delegation.h" #include "iostat.h" #include "internal.h" +#include "fscache.h" #define NFSDBG_FACILITY NFSDBG_CLIENT @@ -154,6 +155,8 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_ if (!IS_ERR(cred)) clp->cl_machine_cred = cred; + nfs_fscache_get_client_cookie(clp); + return clp; error_3: @@ -187,6 +190,8 @@ static void nfs_free_client(struct nfs_client *clp) nfs4_shutdown_client(clp); + nfs_fscache_release_client_cookie(clp); + /* -EIO all pending I/O */ if (!IS_ERR(clp->cl_rpcclient)) rpc_shutdown_client(clp->cl_rpcclient); diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c index 6d5bb5c69048..ff14b032459b 100644 --- a/fs/nfs/fscache-index.c +++ b/fs/nfs/fscache-index.c @@ -47,3 +47,68 @@ void nfs_fscache_unregister(void) { fscache_unregister_netfs(&nfs_fscache_netfs); } + +/* + * Layout of the key for an NFS server cache object. + */ +struct nfs_server_key { + uint16_t nfsversion; /* NFS protocol version */ + uint16_t family; /* address family */ + uint16_t port; /* IP port */ + union { + struct in_addr ipv4_addr; /* IPv4 address */ + struct in6_addr ipv6_addr; /* IPv6 address */ + } addr[0]; +}; + +/* + * Generate a key to describe a server in the main NFS index + * - We return the length of the key, or 0 if we can't generate one + */ +static uint16_t nfs_server_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct nfs_client *clp = cookie_netfs_data; + const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) &clp->cl_addr; + const struct sockaddr_in *sin = (struct sockaddr_in *) &clp->cl_addr; + struct nfs_server_key *key = buffer; + uint16_t len = sizeof(struct nfs_server_key); + + key->nfsversion = clp->rpc_ops->version; + key->family = clp->cl_addr.ss_family; + + memset(key, 0, len); + + switch (clp->cl_addr.ss_family) { + case AF_INET: + key->port = sin->sin_port; + key->addr[0].ipv4_addr = sin->sin_addr; + len += sizeof(key->addr[0].ipv4_addr); + break; + + case AF_INET6: + key->port = sin6->sin6_port; + key->addr[0].ipv6_addr = sin6->sin6_addr; + len += sizeof(key->addr[0].ipv6_addr); + break; + + default: + printk(KERN_WARNING "NFS: Unknown network family '%d'\n", + clp->cl_addr.ss_family); + len = 0; + break; + } + + return len; +} + +/* + * Define the server object for FS-Cache. This is used to describe a server + * object to fscache_acquire_cookie(). It is keyed by the NFS protocol and + * server address parameters. 
+ */ +const struct fscache_cookie_def nfs_fscache_server_index_def = { + .name = "NFS.server", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = nfs_server_get_key, +}; diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c new file mode 100644 index 000000000000..c3f056f89477 --- /dev/null +++ b/fs/nfs/fscache.c @@ -0,0 +1,52 @@ +/* NFS filesystem cache interface + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" +#include "fscache.h" + +#define NFSDBG_FACILITY NFSDBG_FSCACHE + +/* + * Get the per-client index cookie for an NFS client if the appropriate mount + * flag was set + * - We always try and get an index cookie for the client, but get filehandle + * cookies on a per-superblock basis, depending on the mount flags + */ +void nfs_fscache_get_client_cookie(struct nfs_client *clp) +{ + /* create a cache index for looking up filehandles */ + clp->fscache = fscache_acquire_cookie(nfs_fscache_netfs.primary_index, + &nfs_fscache_server_index_def, + clp); + dfprintk(FSCACHE, "NFS: get client cookie (0x%p/0x%p)\n", + clp, clp->fscache); +} + +/* + * Dispose of a per-client cookie + */ +void nfs_fscache_release_client_cookie(struct nfs_client *clp) +{ + dfprintk(FSCACHE, "NFS: releasing client cookie (0x%p/0x%p)\n", + clp, clp->fscache); + + fscache_relinquish_cookie(clp->fscache, 0); + clp->fscache = NULL; +} diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h index ccfcdc58066e..1d864bedf154 100644 --- a/fs/nfs/fscache.h +++ b/fs/nfs/fscache.h @@ -23,13 +23,23 @@ * fscache-index.c */ extern struct fscache_netfs nfs_fscache_netfs; +extern const struct fscache_cookie_def nfs_fscache_server_index_def; extern int nfs_fscache_register(void); extern void nfs_fscache_unregister(void); +/* + * fscache.c + */ +extern void nfs_fscache_get_client_cookie(struct nfs_client *); +extern void nfs_fscache_release_client_cookie(struct nfs_client *); + #else /* CONFIG_NFS_FSCACHE */ static inline int nfs_fscache_register(void) { return 0; } static inline void nfs_fscache_unregister(void) {} +static inline void nfs_fscache_get_client_cookie(struct nfs_client *clp) {} +static inline void nfs_fscache_release_client_cookie(struct nfs_client *clp) {} + #endif /* CONFIG_NFS_FSCACHE */ #endif /* _NFS_FSCACHE_H */ diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index a749f8564aa6..0a374b9c5093 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -64,6 +64,10 @@ struct nfs_client { char cl_ipaddr[48]; unsigned char cl_id_uniquifier; #endif + +#ifdef CONFIG_NFS_FSCACHE + struct fscache_cookie *fscache; /* client index cache cookie */ +#endif }; /* -- cgit v1.2.3 From 08734048b380103f0412f58b84c2f76a2c8b599f Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 3 Apr 2009 16:42:42 +0100 Subject: NFS: Define and create superblock-level objects Define and create superblock-level cache index objects (as managed by nfs_server structs). Each superblock object is created in a server level index object and is itself an index into which inode-level objects are inserted. 
Ideally there would be one superblock-level object per server, and the former would be folded into the latter; however, since the "nosharecache" option exists this isn't possible. The superblock object key is a sequence consisting of: (1) Certain superblock s_flags. (2) Various connection parameters that serve to distinguish superblocks for sget(). (3) The volume FSID. (4) The security flavour. (5) The uniquifier length. (6) The uniquifier text. This is normally an empty string, unless the fsc=xyz mount option was used to explicitly specify a uniquifier. The key blob is of variable length, depending on the length of (6). The superblock object is given no coherency data to carry in the auxiliary data permitted by the cache. It is assumed that the superblock is always coherent. This patch also adds uniquification handling such that two otherwise identical superblocks, at least one of which is marked "nosharecache", won't end up trying to share the on-disk cache. It will be possible to manually provide a uniquifier through a mount option with a later patch to avoid the error otherwise produced. Signed-off-by: David Howells Acked-by: Steve Dickson Acked-by: Trond Myklebust Acked-by: Al Viro Tested-by: Daire Byrne --- fs/nfs/fscache-index.c | 34 ++++++++++++++ fs/nfs/fscache.c | 116 ++++++++++++++++++++++++++++++++++++++++++++++ fs/nfs/fscache.h | 49 ++++++++++++++++++++ fs/nfs/internal.h | 3 ++ fs/nfs/super.c | 9 +++- include/linux/nfs_fs_sb.h | 5 ++ 6 files changed, 214 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c index ff14b032459b..a824050be807 100644 --- a/fs/nfs/fscache-index.c +++ b/fs/nfs/fscache-index.c @@ -112,3 +112,37 @@ const struct fscache_cookie_def nfs_fscache_server_index_def = { .type = FSCACHE_COOKIE_TYPE_INDEX, .get_key = nfs_server_get_key, }; + +/* + * Generate a key to describe a superblock key in the main NFS index + */ +static uint16_t nfs_super_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct nfs_fscache_key *key; + const struct nfs_server *nfss = cookie_netfs_data; + uint16_t len; + + key = nfss->fscache_key; + len = sizeof(key->key) + key->key.uniq_len; + if (len > bufmax) { + len = 0; + } else { + memcpy(buffer, &key->key, sizeof(key->key)); + memcpy(buffer + sizeof(key->key), + key->key.uniquifier, key->key.uniq_len); + } + + return len; +} + +/* + * Define the superblock object for FS-Cache. This is used to describe a + * superblock object to fscache_acquire_cookie(). It is keyed by all the NFS + * parameters that might cause a separate superblock. + */ +const struct fscache_cookie_def nfs_fscache_super_index_def = { + .name = "NFS.super", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = nfs_super_get_key, +}; diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index c3f056f89477..ab2de2c92b21 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -23,6 +23,9 @@ #define NFSDBG_FACILITY NFSDBG_FSCACHE +static struct rb_root nfs_fscache_keys = RB_ROOT; +static DEFINE_SPINLOCK(nfs_fscache_keys_lock); + /* * Get the per-client index cookie for an NFS client if the appropriate mount * flag was set @@ -50,3 +53,116 @@ void nfs_fscache_release_client_cookie(struct nfs_client *clp) fscache_relinquish_cookie(clp->fscache, 0); clp->fscache = NULL; } + +/* + * Get the cache cookie for an NFS superblock. We have to handle + * uniquification here because the cache doesn't do it for us. 
+ */ +void nfs_fscache_get_super_cookie(struct super_block *sb, + struct nfs_parsed_mount_data *data) +{ + struct nfs_fscache_key *key, *xkey; + struct nfs_server *nfss = NFS_SB(sb); + struct rb_node **p, *parent; + const char *uniq = data->fscache_uniq ?: ""; + int diff, ulen; + + ulen = strlen(uniq); + key = kzalloc(sizeof(*key) + ulen, GFP_KERNEL); + if (!key) + return; + + key->nfs_client = nfss->nfs_client; + key->key.super.s_flags = sb->s_flags & NFS_MS_MASK; + key->key.nfs_server.flags = nfss->flags; + key->key.nfs_server.rsize = nfss->rsize; + key->key.nfs_server.wsize = nfss->wsize; + key->key.nfs_server.acregmin = nfss->acregmin; + key->key.nfs_server.acregmax = nfss->acregmax; + key->key.nfs_server.acdirmin = nfss->acdirmin; + key->key.nfs_server.acdirmax = nfss->acdirmax; + key->key.nfs_server.fsid = nfss->fsid; + key->key.rpc_auth.au_flavor = nfss->client->cl_auth->au_flavor; + + key->key.uniq_len = ulen; + memcpy(key->key.uniquifier, uniq, ulen); + + spin_lock(&nfs_fscache_keys_lock); + p = &nfs_fscache_keys.rb_node; + parent = NULL; + while (*p) { + parent = *p; + xkey = rb_entry(parent, struct nfs_fscache_key, node); + + if (key->nfs_client < xkey->nfs_client) + goto go_left; + if (key->nfs_client > xkey->nfs_client) + goto go_right; + + diff = memcmp(&key->key, &xkey->key, sizeof(key->key)); + if (diff < 0) + goto go_left; + if (diff > 0) + goto go_right; + + if (key->key.uniq_len == 0) + goto non_unique; + diff = memcmp(key->key.uniquifier, + xkey->key.uniquifier, + key->key.uniq_len); + if (diff < 0) + goto go_left; + if (diff > 0) + goto go_right; + goto non_unique; + + go_left: + p = &(*p)->rb_left; + continue; + go_right: + p = &(*p)->rb_right; + } + + rb_link_node(&key->node, parent, p); + rb_insert_color(&key->node, &nfs_fscache_keys); + spin_unlock(&nfs_fscache_keys_lock); + nfss->fscache_key = key; + + /* create a cache index for looking up filehandles */ + nfss->fscache = fscache_acquire_cookie(nfss->nfs_client->fscache, + &nfs_fscache_super_index_def, + nfss); + dfprintk(FSCACHE, "NFS: get superblock cookie (0x%p/0x%p)\n", + nfss, nfss->fscache); + return; + +non_unique: + spin_unlock(&nfs_fscache_keys_lock); + kfree(key); + nfss->fscache_key = NULL; + nfss->fscache = NULL; + printk(KERN_WARNING "NFS:" + " Cache request denied due to non-unique superblock keys\n"); +} + +/* + * release a per-superblock cookie + */ +void nfs_fscache_release_super_cookie(struct super_block *sb) +{ + struct nfs_server *nfss = NFS_SB(sb); + + dfprintk(FSCACHE, "NFS: releasing superblock cookie (0x%p/0x%p)\n", + nfss, nfss->fscache); + + fscache_relinquish_cookie(nfss->fscache, 0); + nfss->fscache = NULL; + + if (nfss->fscache_key) { + spin_lock(&nfs_fscache_keys_lock); + rb_erase(&nfss->fscache_key->node, &nfs_fscache_keys); + spin_unlock(&nfs_fscache_keys_lock); + kfree(nfss->fscache_key); + nfss->fscache_key = NULL; + } +} diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h index 1d864bedf154..22b971e8b380 100644 --- a/fs/nfs/fscache.h +++ b/fs/nfs/fscache.h @@ -19,11 +19,49 @@ #ifdef CONFIG_NFS_FSCACHE +/* + * set of NFS FS-Cache objects that form a superblock key + */ +struct nfs_fscache_key { + struct rb_node node; + struct nfs_client *nfs_client; /* the server */ + + /* the elements of the unique key - as used by nfs_compare_super() and + * nfs_compare_mount_options() to distinguish superblocks */ + struct { + struct { + unsigned long s_flags; /* various flags + * (& NFS_MS_MASK) */ + } super; + + struct { + struct nfs_fsid fsid; + int flags; + unsigned int rsize; /* read size */ 
+ unsigned int wsize; /* write size */ + unsigned int acregmin; /* attr cache timeouts */ + unsigned int acregmax; + unsigned int acdirmin; + unsigned int acdirmax; + } nfs_server; + + struct { + rpc_authflavor_t au_flavor; + } rpc_auth; + + /* uniquifier - can be used if nfs_server.flags includes + * NFS_MOUNT_UNSHARED */ + u8 uniq_len; + char uniquifier[0]; + } key; +}; + /* * fscache-index.c */ extern struct fscache_netfs nfs_fscache_netfs; extern const struct fscache_cookie_def nfs_fscache_server_index_def; +extern const struct fscache_cookie_def nfs_fscache_super_index_def; extern int nfs_fscache_register(void); extern void nfs_fscache_unregister(void); @@ -34,6 +72,10 @@ extern void nfs_fscache_unregister(void); extern void nfs_fscache_get_client_cookie(struct nfs_client *); extern void nfs_fscache_release_client_cookie(struct nfs_client *); +extern void nfs_fscache_get_super_cookie(struct super_block *, + struct nfs_parsed_mount_data *); +extern void nfs_fscache_release_super_cookie(struct super_block *); + #else /* CONFIG_NFS_FSCACHE */ static inline int nfs_fscache_register(void) { return 0; } static inline void nfs_fscache_unregister(void) {} @@ -41,5 +83,12 @@ static inline void nfs_fscache_unregister(void) {} static inline void nfs_fscache_get_client_cookie(struct nfs_client *clp) {} static inline void nfs_fscache_release_client_cookie(struct nfs_client *clp) {} +static inline void nfs_fscache_get_super_cookie( + struct super_block *sb, + struct nfs_parsed_mount_data *data) +{ +} +static inline void nfs_fscache_release_super_cookie(struct super_block *sb) {} + #endif /* CONFIG_NFS_FSCACHE */ #endif /* _NFS_FSCACHE_H */ diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 2041f68ff1cc..013070000c38 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -5,6 +5,8 @@ #include #include +#define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS) + struct nfs_string; /* Maximum number of readahead requests @@ -41,6 +43,7 @@ struct nfs_parsed_mount_data { unsigned int auth_flavor_len; rpc_authflavor_t auth_flavors[1]; char *client_address; + char *fscache_uniq; struct { struct sockaddr_storage address; diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 0942fcbbad3c..87f65ae07f32 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -60,6 +60,7 @@ #include "delegation.h" #include "iostat.h" #include "internal.h" +#include "fscache.h" #define NFSDBG_FACILITY NFSDBG_VFS @@ -1870,8 +1871,6 @@ static void nfs_clone_super(struct super_block *sb, nfs_initialise_sb(sb); } -#define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS) - static int nfs_compare_mount_options(const struct super_block *s, const struct nfs_server *b, int flags) { const struct nfs_server *a = s->s_fs_info; @@ -2036,6 +2035,7 @@ static int nfs_get_sb(struct file_system_type *fs_type, if (!s->s_root) { /* initial superblock/root creation */ nfs_fill_super(s, data); + nfs_fscache_get_super_cookie(s, data); } mntroot = nfs_get_root(s, mntfh); @@ -2056,6 +2056,7 @@ static int nfs_get_sb(struct file_system_type *fs_type, out: kfree(data->nfs_server.hostname); kfree(data->mount_server.hostname); + kfree(data->fscache_uniq); security_free_mnt_opts(&data->lsm_opts); out_free_fh: kfree(mntfh); @@ -2083,6 +2084,7 @@ static void nfs_kill_super(struct super_block *s) bdi_unregister(&server->backing_dev_info); kill_anon_super(s); + nfs_fscache_release_super_cookie(s); nfs_free_server(server); } @@ -2390,6 +2392,7 @@ static int nfs4_get_sb(struct file_system_type *fs_type, if (!s->s_root) { /* 
initial superblock/root creation */ nfs4_fill_super(s); + nfs_fscache_get_super_cookie(s, data); } mntroot = nfs4_get_root(s, mntfh); @@ -2411,6 +2414,7 @@ out: kfree(data->client_address); kfree(data->nfs_server.export_path); kfree(data->nfs_server.hostname); + kfree(data->fscache_uniq); security_free_mnt_opts(&data->lsm_opts); out_free_fh: kfree(mntfh); @@ -2437,6 +2441,7 @@ static void nfs4_kill_super(struct super_block *sb) kill_anon_super(sb); nfs4_renewd_prepare_shutdown(server); + nfs_fscache_release_super_cookie(sb); nfs_free_server(server); } diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 0a374b9c5093..6ad75948cbf7 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -108,6 +108,11 @@ struct nfs_server { unsigned long mount_time; /* when this fs was mounted */ dev_t s_dev; /* superblock dev numbers */ +#ifdef CONFIG_NFS_FSCACHE + struct nfs_fscache_key *fscache_key; /* unique key for superblock */ + struct fscache_cookie *fscache; /* superblock cookie */ +#endif + #ifdef CONFIG_NFS_V4 u32 attr_bitmask[2];/* V4 bitmask representing the set of attributes supported on this -- cgit v1.2.3 From ef79c097bbe9724e13937271b3457df560e00370 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 3 Apr 2009 16:42:43 +0100 Subject: NFS: Use local disk inode cache Bind data storage objects in the local cache to NFS inodes. Signed-off-by: David Howells Acked-by: Steve Dickson Acked-by: Trond Myklebust Acked-by: Al Viro Tested-by: Daire Byrne --- fs/nfs/fscache.c | 162 +++++++++++++++++++++++++++++++++++++++++++++++++ fs/nfs/fscache.h | 13 ++++ fs/nfs/inode.c | 6 ++ include/linux/nfs_fs.h | 10 +++ 4 files changed, 191 insertions(+) (limited to 'include/linux') diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index ab2de2c92b21..e3816eb53fb8 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -166,3 +166,165 @@ void nfs_fscache_release_super_cookie(struct super_block *sb) nfss->fscache_key = NULL; } } + +/* + * Initialise the per-inode cache cookie pointer for an NFS inode. + */ +void nfs_fscache_init_inode_cookie(struct inode *inode) +{ + NFS_I(inode)->fscache = NULL; + if (S_ISREG(inode->i_mode)) + set_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags); +} + +/* + * Get the per-inode cache cookie for an NFS inode. + */ +static void nfs_fscache_enable_inode_cookie(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct nfs_inode *nfsi = NFS_I(inode); + + if (nfsi->fscache || !NFS_FSCACHE(inode)) + return; + + if ((NFS_SB(sb)->options & NFS_OPTION_FSCACHE)) { + nfsi->fscache = fscache_acquire_cookie( + NFS_SB(sb)->fscache, + &nfs_fscache_inode_object_def, + nfsi); + + dfprintk(FSCACHE, "NFS: get FH cookie (0x%p/0x%p/0x%p)\n", + sb, nfsi, nfsi->fscache); + } +} + +/* + * Release a per-inode cookie. + */ +void nfs_fscache_release_inode_cookie(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n", + nfsi, nfsi->fscache); + + fscache_relinquish_cookie(nfsi->fscache, 0); + nfsi->fscache = NULL; +} + +/* + * Retire a per-inode cookie, destroying the data attached to it. 
+ */ +void nfs_fscache_zap_inode_cookie(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + dfprintk(FSCACHE, "NFS: zapping cookie (0x%p/0x%p)\n", + nfsi, nfsi->fscache); + + fscache_relinquish_cookie(nfsi->fscache, 1); + nfsi->fscache = NULL; +} + +/* + * Turn off the cache with regard to a per-inode cookie if opened for writing, + * invalidating all the pages in the page cache relating to the associated + * inode to clear the per-page caching. + */ +static void nfs_fscache_disable_inode_cookie(struct inode *inode) +{ + clear_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags); + + if (NFS_I(inode)->fscache) { + dfprintk(FSCACHE, + "NFS: nfsi 0x%p turning cache off\n", NFS_I(inode)); + + /* Need to invalidate any mapped pages that were read in before + * turning off the cache. + */ + if (inode->i_mapping && inode->i_mapping->nrpages) + invalidate_inode_pages2(inode->i_mapping); + + nfs_fscache_zap_inode_cookie(inode); + } +} + +/* + * wait_on_bit() sleep function for uninterruptible waiting + */ +static int nfs_fscache_wait_bit(void *flags) +{ + schedule(); + return 0; +} + +/* + * Lock against someone else trying to also acquire or relinquish a cookie + */ +static inline void nfs_fscache_inode_lock(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + while (test_and_set_bit(NFS_INO_FSCACHE_LOCK, &nfsi->flags)) + wait_on_bit(&nfsi->flags, NFS_INO_FSCACHE_LOCK, + nfs_fscache_wait_bit, TASK_UNINTERRUPTIBLE); +} + +/* + * Unlock cookie management lock + */ +static inline void nfs_fscache_inode_unlock(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + smp_mb__before_clear_bit(); + clear_bit(NFS_INO_FSCACHE_LOCK, &nfsi->flags); + smp_mb__after_clear_bit(); + wake_up_bit(&nfsi->flags, NFS_INO_FSCACHE_LOCK); +} + +/* + * Decide if we should enable or disable local caching for this inode. + * - For now, with NFS, only regular files that are open read-only will be able + * to use the cache. + * - May be invoked multiple times in parallel by parallel nfs_open() functions. + */ +void nfs_fscache_set_inode_cookie(struct inode *inode, struct file *filp) +{ + if (NFS_FSCACHE(inode)) { + nfs_fscache_inode_lock(inode); + if ((filp->f_flags & O_ACCMODE) != O_RDONLY) + nfs_fscache_disable_inode_cookie(inode); + else + nfs_fscache_enable_inode_cookie(inode); + nfs_fscache_inode_unlock(inode); + } +} + +/* + * Replace a per-inode cookie due to revalidation detecting a file having + * changed on the server. 
+ */ +void nfs_fscache_reset_inode_cookie(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_server *nfss = NFS_SERVER(inode); + struct fscache_cookie *old = nfsi->fscache; + + nfs_fscache_inode_lock(inode); + if (nfsi->fscache) { + /* retire the current fscache cache and get a new one */ + fscache_relinquish_cookie(nfsi->fscache, 1); + + nfsi->fscache = fscache_acquire_cookie( + nfss->nfs_client->fscache, + &nfs_fscache_inode_object_def, + nfsi); + + dfprintk(FSCACHE, + "NFS: revalidation new cookie (0x%p/0x%p/0x%p/0x%p)\n", + nfss, nfsi, old, nfsi->fscache); + } + nfs_fscache_inode_unlock(inode); +} diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h index d21b5906ccf4..8b4299a0ad61 100644 --- a/fs/nfs/fscache.h +++ b/fs/nfs/fscache.h @@ -77,6 +77,12 @@ extern void nfs_fscache_get_super_cookie(struct super_block *, struct nfs_parsed_mount_data *); extern void nfs_fscache_release_super_cookie(struct super_block *); +extern void nfs_fscache_init_inode_cookie(struct inode *); +extern void nfs_fscache_release_inode_cookie(struct inode *); +extern void nfs_fscache_zap_inode_cookie(struct inode *); +extern void nfs_fscache_set_inode_cookie(struct inode *, struct file *); +extern void nfs_fscache_reset_inode_cookie(struct inode *); + #else /* CONFIG_NFS_FSCACHE */ static inline int nfs_fscache_register(void) { return 0; } static inline void nfs_fscache_unregister(void) {} @@ -91,5 +97,12 @@ static inline void nfs_fscache_get_super_cookie( } static inline void nfs_fscache_release_super_cookie(struct super_block *sb) {} +static inline void nfs_fscache_init_inode_cookie(struct inode *inode) {} +static inline void nfs_fscache_release_inode_cookie(struct inode *inode) {} +static inline void nfs_fscache_zap_inode_cookie(struct inode *inode) {} +static inline void nfs_fscache_set_inode_cookie(struct inode *inode, + struct file *filp) {} +static inline void nfs_fscache_reset_inode_cookie(struct inode *inode) {} + #endif /* CONFIG_NFS_FSCACHE */ #endif /* _NFS_FSCACHE_H */ diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index cd29f410e941..64f87194d390 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -122,6 +122,7 @@ void nfs_clear_inode(struct inode *inode) BUG_ON(!list_empty(&NFS_I(inode)->open_files)); nfs_zap_acl_cache(inode); nfs_access_zap_cache(inode); + nfs_fscache_release_inode_cookie(inode); } /** @@ -356,6 +357,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) nfsi->attrtimeo_timestamp = now; nfsi->access_cache = RB_ROOT; + nfs_fscache_init_inode_cookie(inode); + unlock_new_inode(inode); } else nfs_refresh_inode(inode, fattr); @@ -687,6 +690,7 @@ int nfs_open(struct inode *inode, struct file *filp) ctx->mode = filp->f_mode; nfs_file_set_open_context(filp, ctx); put_nfs_open_context(ctx); + nfs_fscache_set_inode_cookie(inode, filp); return 0; } @@ -787,6 +791,7 @@ static int nfs_invalidate_mapping_nolock(struct inode *inode, struct address_spa memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); spin_unlock(&inode->i_lock); nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE); + nfs_fscache_reset_inode_cookie(inode); dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n", inode->i_sb->s_id, (long long)NFS_FILEID(inode)); return 0; @@ -1031,6 +1036,7 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr) spin_lock(&inode->i_lock); status = nfs_refresh_inode_locked(inode, fattr); spin_unlock(&inode->i_lock); + return status; } diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index fd3e7f9c6fd3..8a99e79d5ea5 
100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -185,6 +185,9 @@ struct nfs_inode { fmode_t delegation_state; struct rw_semaphore rwsem; #endif /* CONFIG_NFS_V4*/ +#ifdef CONFIG_NFS_FSCACHE + struct fscache_cookie *fscache; +#endif struct inode vfs_inode; }; @@ -207,6 +210,8 @@ struct nfs_inode { #define NFS_INO_ACL_LRU_SET (2) /* Inode is on the LRU list */ #define NFS_INO_MOUNTPOINT (3) /* inode is remote mountpoint */ #define NFS_INO_FLUSHING (4) /* inode is flushing out data */ +#define NFS_INO_FSCACHE (5) /* inode can be cached by FS-Cache */ +#define NFS_INO_FSCACHE_LOCK (6) /* FS-Cache cookie management lock */ static inline struct nfs_inode *NFS_I(const struct inode *inode) { @@ -260,6 +265,11 @@ static inline int NFS_STALE(const struct inode *inode) return test_bit(NFS_INO_STALE, &NFS_I(inode)->flags); } +static inline int NFS_FSCACHE(const struct inode *inode) +{ + return test_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags); +} + static inline __u64 NFS_FILEID(const struct inode *inode) { return NFS_I(inode)->fileid; -- cgit v1.2.3 From 6a51091d0775cdc4a923f2172c61925ad416aa32 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 3 Apr 2009 16:42:43 +0100 Subject: NFS: Add some new I/O counters for FS-Cache doing things for NFS Add some new NFS I/O counters for FS-Cache doing things for NFS. A new line, prefixed "fsc:", is emitted into /proc/pid/mountstats if caching is enabled; it carries five counters: the number of pages read successfully from the cache, the number of failed page reads against the cache, the number of successful page writes to the cache, the number of failed page writes to the cache, and the number of NFS pages that have been disconnected from the cache. Signed-off-by: David Howells Acked-by: Steve Dickson Acked-by: Trond Myklebust Acked-by: Al Viro Tested-by: Daire Byrne --- fs/nfs/iostat.h | 18 ++++++++++++++++++ fs/nfs/super.c | 11 +++++++++++ include/linux/nfs_iostat.h | 12 ++++++++++++ 3 files changed, 41 insertions(+) (limited to 'include/linux') diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h index a36952810032..a2ab2529b5ca 100644 --- a/fs/nfs/iostat.h +++ b/fs/nfs/iostat.h @@ -16,6 +16,9 @@ struct nfs_iostats { unsigned long long bytes[__NFSIOS_BYTESMAX]; +#ifdef CONFIG_NFS_FSCACHE + unsigned long long fscache[__NFSIOS_FSCACHEMAX]; +#endif unsigned long events[__NFSIOS_COUNTSMAX]; } ____cacheline_aligned; @@ -57,6 +60,21 @@ static inline void nfs_add_stats(const struct inode *inode, nfs_add_server_stats(NFS_SERVER(inode), stat, addend); } +#ifdef CONFIG_NFS_FSCACHE +static inline void nfs_add_fscache_stats(struct inode *inode, + enum nfs_stat_fscachecounters stat, + unsigned long addend) +{ + struct nfs_iostats *iostats; + int cpu; + + cpu = get_cpu(); + iostats = per_cpu_ptr(NFS_SERVER(inode)->io_stats, cpu); + iostats->fscache[stat] += addend; + put_cpu_no_resched(); +} +#endif + static inline struct nfs_iostats *nfs_alloc_iostats(void) { return alloc_percpu(struct nfs_iostats); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 87f65ae07f32..b5fea776a0dc 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -642,6 +642,10 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt) totals.events[i] += stats->events[i]; for (i = 0; i < __NFSIOS_BYTESMAX; i++) totals.bytes[i] += stats->bytes[i]; +#ifdef CONFIG_NFS_FSCACHE + for (i = 0; i < __NFSIOS_FSCACHEMAX; i++) + totals.fscache[i] += stats->fscache[i]; +#endif preempt_enable(); } @@ -652,6 +656,13 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
seq_printf(m, "\n\tbytes:\t"); for (i = 0; i < __NFSIOS_BYTESMAX; i++) seq_printf(m, "%Lu ", totals.bytes[i]); +#ifdef CONFIG_NFS_FSCACHE + if (nfss->options & NFS_OPTION_FSCACHE) { + seq_printf(m, "\n\tfsc:\t"); + for (i = 0; i < __NFSIOS_FSCACHEMAX; i++) + seq_printf(m, "%Lu ", totals.bytes[i]); + } +#endif seq_printf(m, "\n"); rpc_print_iostats(m, nfss->client); diff --git a/include/linux/nfs_iostat.h b/include/linux/nfs_iostat.h index 1cb9a3fed2b3..68b10f5f8907 100644 --- a/include/linux/nfs_iostat.h +++ b/include/linux/nfs_iostat.h @@ -116,4 +116,16 @@ enum nfs_stat_eventcounters { __NFSIOS_COUNTSMAX, }; +/* + * NFS local caching servicing counters + */ +enum nfs_stat_fscachecounters { + NFSIOS_FSCACHE_PAGES_READ_OK, + NFSIOS_FSCACHE_PAGES_READ_FAIL, + NFSIOS_FSCACHE_PAGES_WRITTEN_OK, + NFSIOS_FSCACHE_PAGES_WRITTEN_FAIL, + NFSIOS_FSCACHE_PAGES_UNCACHED, + __NFSIOS_FSCACHEMAX, +}; + #endif /* _LINUX_NFS_IOSTAT */ -- cgit v1.2.3 From f42b293d6d5259043a8944b556eeab427c695d57 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 3 Apr 2009 16:42:44 +0100 Subject: NFS: nfs_readpage_async() needs to be accessible as a fallback for local caching nfs_readpage_async() needs to be non-static so that it can be used as a fallback for the local on-disk caching should an EIO crop up when reading the cache. Signed-off-by: David Howells Acked-by: Steve Dickson Acked-by: Trond Myklebust Acked-by: Al Viro Tested-by: Daire Byrne --- fs/nfs/read.c | 4 ++-- include/linux/nfs_fs.h | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/read.c b/fs/nfs/read.c index f856004bb7fa..98b74009c9d7 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -111,8 +111,8 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data) } } -static int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, - struct page *page) +int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, + struct page *page) { LIST_HEAD(one_request); struct nfs_page *new; diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 8a99e79d5ea5..fdffb413b192 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -516,6 +516,8 @@ extern int nfs_readpages(struct file *, struct address_space *, struct list_head *, unsigned); extern int nfs_readpage_result(struct rpc_task *, struct nfs_read_data *); extern void nfs_readdata_release(void *data); +extern int nfs_readpage_async(struct nfs_open_context *, struct inode *, + struct page *); /* * Allocate nfs_read_data structures -- cgit v1.2.3 From 3688e07f83d81941c4a8b20e29602c6d0c883539 Mon Sep 17 00:00:00 2001 From: Kumar Gala Date: Wed, 1 Apr 2009 23:38:49 -0500 Subject: Fix highmem PPC build failure Commit f4112de6b679d84bd9b9681c7504be7bdfb7c7d5 ("mm: introduce debug_kmap_atomic") broke PPC builds with CONFIG_HIGHMEM=y: CC init/main.o In file included from include/linux/highmem.h:25, from include/linux/pagemap.h:11, from include/linux/mempolicy.h:63, from init/main.c:53: arch/powerpc/include/asm/highmem.h: In function 'kmap_atomic_prot': arch/powerpc/include/asm/highmem.h:98: error: implicit declaration of function 'debug_kmap_atomic' In file included from include/linux/pagemap.h:11, from include/linux/mempolicy.h:63, from init/main.c:53: include/linux/highmem.h: At top level: include/linux/highmem.h:196: warning: conflicting types for 'debug_kmap_atomic' include/linux/highmem.h:196: error: static declaration of 'debug_kmap_atomic' follows non-static declaration 
include/asm/highmem.h:98: error: previous implicit declaration of 'debug_kmap_atomic' was here make[1]: *** [init/main.o] Error 1 make: *** [init] Error 2 Signed-off-by: Kumar Gala Acked-by: Akinobu Mita Signed-off-by: Linus Torvalds --- include/linux/highmem.h | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 7ff5c55f9b55..1fcb7126a01f 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -19,8 +19,21 @@ static inline void flush_kernel_dcache_page(struct page *page) } #endif -#ifdef CONFIG_HIGHMEM +#include + +#if defined(CONFIG_DEBUG_HIGHMEM) && defined(CONFIG_TRACE_IRQFLAGS_SUPPORT) + +void debug_kmap_atomic(enum km_type type); + +#else +static inline void debug_kmap_atomic(enum km_type type) +{ +} + +#endif + +#ifdef CONFIG_HIGHMEM #include /* declarations for linux/mm/highmem.c */ @@ -44,8 +57,6 @@ static inline void *kmap(struct page *page) #define kunmap(page) do { (void) (page); } while (0) -#include - static inline void *kmap_atomic(struct page *page, enum km_type idx) { pagefault_disable(); @@ -187,16 +198,4 @@ static inline void copy_highpage(struct page *to, struct page *from) kunmap_atomic(vto, KM_USER1); } -#if defined(CONFIG_DEBUG_HIGHMEM) && defined(CONFIG_TRACE_IRQFLAGS_SUPPORT) - -void debug_kmap_atomic(enum km_type type); - -#else - -static inline void debug_kmap_atomic(enum km_type type) -{ -} - -#endif - #endif /* _LINUX_HIGHMEM_H */ -- cgit v1.2.3 From 9756b15e1b58453a6fd54b85c1ad8515209e10bb Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 30 Mar 2009 20:37:20 -0700 Subject: irq: fix cpumask memory leak on offstack cpumask kernels Need to free the old cpumask for affinity and pending_mask. 
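(For illustration only; not part of the patch. A minimal sketch of the alloc/free pairing that the fix restores: with CONFIG_CPUMASK_OFFSTACK, cpumask_var_t is heap-allocated, so every alloc_cpumask_var() needs a matching free_cpumask_var(). example_use_cpumask() is a hypothetical function.)

	static int example_use_cpumask(void)
	{
		cpumask_var_t mask;

		if (!alloc_cpumask_var(&mask, GFP_KERNEL))
			return -ENOMEM;

		cpumask_copy(mask, cpu_online_mask);
		/* ... use mask ... */

		free_cpumask_var(mask);	/* omitting this leaks on offstack kernels */
		return 0;
	}
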
Signed-off-by: Yinghai Lu Acked-by: Rusty Russell LKML-Reference: <49D18FF0.50707@kernel.org> Signed-off-by: Ingo Molnar --- include/linux/irq.h | 14 ++++++++++++++ kernel/irq/numa_migrate.c | 1 + 2 files changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 974890b3c52f..99d147efe399 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -482,6 +482,16 @@ static inline void init_copy_desc_masks(struct irq_desc *old_desc, #endif } +static inline void free_desc_masks(struct irq_desc *old_desc, + struct irq_desc *new_desc) +{ + free_cpumask_var(old_desc->affinity); + +#ifdef CONFIG_GENERIC_PENDING_IRQ + free_cpumask_var(old_desc->pending_mask); +#endif +} + #else /* !CONFIG_SMP */ static inline bool init_alloc_desc_masks(struct irq_desc *desc, int cpu, @@ -495,6 +505,10 @@ static inline void init_copy_desc_masks(struct irq_desc *old_desc, { } +static inline void free_desc_masks(struct irq_desc *old_desc, + struct irq_desc *new_desc) +{ +} #endif /* CONFIG_SMP */ #endif /* _LINUX_IRQ_H */ diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c index 243d6121e50e..44bbdcbaf8d2 100644 --- a/kernel/irq/numa_migrate.c +++ b/kernel/irq/numa_migrate.c @@ -54,6 +54,7 @@ static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc, static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc) { free_kstat_irqs(old_desc, desc); + free_desc_masks(old_desc, desc); arch_free_chip_data(old_desc, desc); } -- cgit v1.2.3 From 8f912ba4d7cdaf7d31cf39fe5a9b7732308a256d Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Fri, 3 Apr 2009 15:19:32 +0100 Subject: intel-iommu: Add for_each_iommu() and for_each_active_iommu() macros Signed-off-by: David Woodhouse Acked-by: Ingo Molnar --- include/linux/dmar.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dmar.h b/include/linux/dmar.h index 2f3427468956..0b4aa8069985 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -44,6 +44,14 @@ extern struct list_head dmar_drhd_units; #define for_each_drhd_unit(drhd) \ list_for_each_entry(drhd, &dmar_drhd_units, list) +#define for_each_active_iommu(i, drhd) \ + list_for_each_entry(drhd, &dmar_drhd_units, list) \ + if (i=drhd->iommu, drhd->ignored) {} else + +#define for_each_iommu(i, drhd) \ + list_for_each_entry(drhd, &dmar_drhd_units, list) \ + if (i=drhd->iommu, 0) {} else + extern int dmar_table_init(void); extern int dmar_dev_scope_init(void); -- cgit v1.2.3 From f59c7b69bcba31cd355ababe067202b9895d6102 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Fri, 27 Mar 2009 14:22:42 -0700 Subject: Intel IOMMU Suspend/Resume Support - DMAR This patch implements the suspend and resume feature for Intel IOMMU DMAR. It hooks to kernel suspend and resume interface. When suspend happens, it saves necessary hardware registers. When resume happens, it restores the registers and restarts IOMMU by enabling translation, setting up root entry, and re-enabling queued invalidation. 
Signed-off-by: Fenghua Yu Acked-by: Ingo Molnar Signed-off-by: David Woodhouse --- drivers/pci/intel-iommu.c | 146 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/intel-iommu.h | 11 ++++ 2 files changed, 157 insertions(+) (limited to 'include/linux') diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 23e56a564e05..ef5795d0396b 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include "pci.h" @@ -2597,6 +2598,150 @@ static void __init init_no_remapping_devices(void) } } +#ifdef CONFIG_SUSPEND +static int init_iommu_hw(void) +{ + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu = NULL; + + for_each_active_iommu(iommu, drhd) + if (iommu->qi) + dmar_reenable_qi(iommu); + + for_each_active_iommu(iommu, drhd) { + iommu_flush_write_buffer(iommu); + + iommu_set_root_entry(iommu); + + iommu->flush.flush_context(iommu, 0, 0, 0, + DMA_CCMD_GLOBAL_INVL, 0); + iommu->flush.flush_iotlb(iommu, 0, 0, 0, + DMA_TLB_GLOBAL_FLUSH, 0); + iommu_disable_protect_mem_regions(iommu); + iommu_enable_translation(iommu); + } + + return 0; +} + +static void iommu_flush_all(void) +{ + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu; + + for_each_active_iommu(iommu, drhd) { + iommu->flush.flush_context(iommu, 0, 0, 0, + DMA_CCMD_GLOBAL_INVL, 0); + iommu->flush.flush_iotlb(iommu, 0, 0, 0, + DMA_TLB_GLOBAL_FLUSH, 0); + } +} + +static int iommu_suspend(struct sys_device *dev, pm_message_t state) +{ + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu = NULL; + unsigned long flag; + + for_each_active_iommu(iommu, drhd) { + iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS, + GFP_ATOMIC); + if (!iommu->iommu_state) + goto nomem; + } + + iommu_flush_all(); + + for_each_active_iommu(iommu, drhd) { + iommu_disable_translation(iommu); + + spin_lock_irqsave(&iommu->register_lock, flag); + + iommu->iommu_state[SR_DMAR_FECTL_REG] = + readl(iommu->reg + DMAR_FECTL_REG); + iommu->iommu_state[SR_DMAR_FEDATA_REG] = + readl(iommu->reg + DMAR_FEDATA_REG); + iommu->iommu_state[SR_DMAR_FEADDR_REG] = + readl(iommu->reg + DMAR_FEADDR_REG); + iommu->iommu_state[SR_DMAR_FEUADDR_REG] = + readl(iommu->reg + DMAR_FEUADDR_REG); + + spin_unlock_irqrestore(&iommu->register_lock, flag); + } + return 0; + +nomem: + for_each_active_iommu(iommu, drhd) + kfree(iommu->iommu_state); + + return -ENOMEM; +} + +static int iommu_resume(struct sys_device *dev) +{ + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu = NULL; + unsigned long flag; + + if (init_iommu_hw()) { + WARN(1, "IOMMU setup failed, DMAR can not resume!\n"); + return -EIO; + } + + for_each_active_iommu(iommu, drhd) { + + spin_lock_irqsave(&iommu->register_lock, flag); + + writel(iommu->iommu_state[SR_DMAR_FECTL_REG], + iommu->reg + DMAR_FECTL_REG); + writel(iommu->iommu_state[SR_DMAR_FEDATA_REG], + iommu->reg + DMAR_FEDATA_REG); + writel(iommu->iommu_state[SR_DMAR_FEADDR_REG], + iommu->reg + DMAR_FEADDR_REG); + writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG], + iommu->reg + DMAR_FEUADDR_REG); + + spin_unlock_irqrestore(&iommu->register_lock, flag); + } + + for_each_active_iommu(iommu, drhd) + kfree(iommu->iommu_state); + + return 0; +} + +static struct sysdev_class iommu_sysclass = { + .name = "iommu", + .resume = iommu_resume, + .suspend = iommu_suspend, +}; + +static struct sys_device device_iommu = { + .cls = &iommu_sysclass, +}; + +static int __init init_iommu_sysfs(void) +{ + int error; + + error = 
sysdev_class_register(&iommu_sysclass); + if (error) + return error; + + error = sysdev_register(&device_iommu); + if (error) + sysdev_class_unregister(&iommu_sysclass); + + return error; +} + +#else +static int __init init_iommu_sysfs(void) +{ + return 0; +} +#endif /* CONFIG_PM */ + int __init intel_iommu_init(void) { int ret = 0; @@ -2632,6 +2777,7 @@ int __init intel_iommu_init(void) init_timer(&unmap_timer); force_iommu = 1; dma_ops = &intel_dma_ops; + init_iommu_sysfs(); register_iommu(&intel_iommu_ops); diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 77214ead1a36..3771cd1f876e 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -284,6 +284,14 @@ struct iommu_flush { unsigned int size_order, u64 type, int non_present_entry_flush); }; +enum { + SR_DMAR_FECTL_REG, + SR_DMAR_FEDATA_REG, + SR_DMAR_FEADDR_REG, + SR_DMAR_FEUADDR_REG, + MAX_SR_DMAR_REGS +}; + struct intel_iommu { void __iomem *reg; /* Pointer to hardware regs, virtual addr */ u64 cap; @@ -304,6 +312,8 @@ struct intel_iommu { struct iommu_flush flush; #endif struct q_inval *qi; /* Queued invalidation info */ + u32 *iommu_state; /* Store iommu states between suspend and resume.*/ + #ifdef CONFIG_INTR_REMAP struct ir_table *ir_table; /* Interrupt remapping info */ #endif @@ -322,6 +332,7 @@ extern int alloc_iommu(struct dmar_drhd_unit *drhd); extern void free_iommu(struct intel_iommu *iommu); extern int dmar_enable_qi(struct intel_iommu *iommu); extern void dmar_disable_qi(struct intel_iommu *iommu); +extern int dmar_reenable_qi(struct intel_iommu *iommu); extern void qi_global_iec(struct intel_iommu *iommu); extern int qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, -- cgit v1.2.3 From b24696bc55f66fecc30715e003f10fc2555a9271 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Fri, 27 Mar 2009 14:22:44 -0700 Subject: Intel IOMMU Suspend/Resume Support - Interrupt Remapping This patch enables suspend/resume for interrupt remapping. During suspend, interrupt remapping is disabled. When resume, interrupt remapping is enabled again. Signed-off-by: Fenghua Yu Acked-by: Ingo Molnar Signed-off-by: David Woodhouse --- arch/x86/include/asm/apic.h | 4 ++ arch/x86/include/asm/io_apic.h | 11 ++-- arch/x86/kernel/apic/apic.c | 70 +++++++++++++++++++++++--- arch/x86/kernel/apic/io_apic.c | 111 ++++++++++++++++++++++++++--------------- drivers/pci/intr_remapping.c | 61 +++++++++++++++++++++- include/linux/dmar.h | 2 + 6 files changed, 204 insertions(+), 55 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index df8a300dfe6c..f9f0866ed6f8 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -108,6 +108,10 @@ extern void native_apic_icr_write(u32 low, u32 id); extern u64 native_apic_icr_read(void); #ifdef CONFIG_X86_X2APIC + +#define EIM_8BIT_APIC_ID 0 +#define EIM_32BIT_APIC_ID 1 + /* * Make previous memory operations globally visible before * sending the IPI through x2apic wrmsr. 
We need a serializing instruction or diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 373cc2bbcad2..9d826e436010 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -162,10 +162,13 @@ extern int (*ioapic_renumber_irq)(int ioapic, int irq); extern void ioapic_init_mappings(void); #ifdef CONFIG_X86_64 -extern int save_IO_APIC_setup(void); -extern void mask_IO_APIC_setup(void); -extern void restore_IO_APIC_setup(void); -extern void reinit_intr_remapped_IO_APIC(int); +extern struct IO_APIC_route_entry **alloc_ioapic_entries(void); +extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries); +extern int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); +extern void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); +extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); +extern void reinit_intr_remapped_IO_APIC(int intr_remapping, + struct IO_APIC_route_entry **ioapic_entries); #endif extern void probe_nr_irqs_gsi(void); diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 85eb8e100818..098ec84b8c00 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1304,6 +1304,7 @@ void __init enable_IR_x2apic(void) #ifdef CONFIG_INTR_REMAP int ret; unsigned long flags; + struct IO_APIC_route_entry **ioapic_entries = NULL; if (!cpu_has_x2apic) return; @@ -1334,17 +1335,23 @@ void __init enable_IR_x2apic(void) return; } - ret = save_IO_APIC_setup(); + ioapic_entries = alloc_ioapic_entries(); + if (!ioapic_entries) { + pr_info("Allocate ioapic_entries failed: %d\n", ret); + goto end; + } + + ret = save_IO_APIC_setup(ioapic_entries); if (ret) { pr_info("Saving IO-APIC state failed: %d\n", ret); goto end; } local_irq_save(flags); - mask_IO_APIC_setup(); + mask_IO_APIC_setup(ioapic_entries); mask_8259A(); - ret = enable_intr_remapping(1); + ret = enable_intr_remapping(EIM_32BIT_APIC_ID); if (ret && x2apic_preenabled) { local_irq_restore(flags); @@ -1364,9 +1371,9 @@ end_restore: /* * IR enabling failed */ - restore_IO_APIC_setup(); + restore_IO_APIC_setup(ioapic_entries); else - reinit_intr_remapped_IO_APIC(x2apic_preenabled); + reinit_intr_remapped_IO_APIC(x2apic_preenabled, ioapic_entries); unmask_8259A(); local_irq_restore(flags); @@ -1379,6 +1386,8 @@ end: pr_info("Enabled Interrupt-remapping\n"); } else pr_err("Failed to enable Interrupt-remapping and x2apic\n"); + if (ioapic_entries) + free_ioapic_entries(ioapic_entries); #else if (!cpu_has_x2apic) return; @@ -1954,6 +1963,10 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state) local_irq_save(flags); disable_local_APIC(); +#ifdef CONFIG_INTR_REMAP + if (intr_remapping_enabled) + disable_intr_remapping(); +#endif local_irq_restore(flags); return 0; } @@ -1964,15 +1977,41 @@ static int lapic_resume(struct sys_device *dev) unsigned long flags; int maxlvt; +#ifdef CONFIG_INTR_REMAP + int ret; + struct IO_APIC_route_entry **ioapic_entries = NULL; + if (!apic_pm_state.active) return 0; - maxlvt = lapic_get_maxlvt(); - local_irq_save(flags); + if (x2apic) { + ioapic_entries = alloc_ioapic_entries(); + if (!ioapic_entries) { + WARN(1, "Alloc ioapic_entries in lapic resume failed."); + return -ENOMEM; + } + + ret = save_IO_APIC_setup(ioapic_entries); + if (ret) { + WARN(1, "Saving IO-APIC state failed: %d\n", ret); + free_ioapic_entries(ioapic_entries); + return ret; + } + + mask_IO_APIC_setup(ioapic_entries); + mask_8259A(); + enable_x2apic(); + } +#else + if 
(!apic_pm_state.active) + return 0; + local_irq_save(flags); if (x2apic) enable_x2apic(); +#endif + else { /* * Make sure the APICBASE points to the right address @@ -1986,6 +2025,7 @@ static int lapic_resume(struct sys_device *dev) wrmsr(MSR_IA32_APICBASE, l, h); } + maxlvt = lapic_get_maxlvt(); apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); apic_write(APIC_ID, apic_pm_state.apic_id); apic_write(APIC_DFR, apic_pm_state.apic_dfr); @@ -2009,8 +2049,20 @@ static int lapic_resume(struct sys_device *dev) apic_write(APIC_ESR, 0); apic_read(APIC_ESR); +#ifdef CONFIG_INTR_REMAP + if (intr_remapping_enabled) + reenable_intr_remapping(EIM_32BIT_APIC_ID); + + if (x2apic) { + unmask_8259A(); + restore_IO_APIC_setup(ioapic_entries); + free_ioapic_entries(ioapic_entries); + } +#endif + local_irq_restore(flags); + return 0; } @@ -2048,7 +2100,9 @@ static int __init init_lapic_sysfs(void) error = sysdev_register(&device_lapic); return error; } -device_initcall(init_lapic_sysfs); + +/* local apic needs to resume before other devices access its registers. */ +core_initcall(init_lapic_sysfs); #else /* CONFIG_PM */ diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 1bb5c6cee3eb..0ad3fe77641a 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -851,63 +851,74 @@ __setup("pirq=", ioapic_pirq_setup); #endif /* CONFIG_X86_32 */ #ifdef CONFIG_INTR_REMAP -/* I/O APIC RTE contents at the OS boot up */ -static struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS]; +struct IO_APIC_route_entry **alloc_ioapic_entries(void) +{ + int apic; + struct IO_APIC_route_entry **ioapic_entries; + + ioapic_entries = kzalloc(sizeof(*ioapic_entries) * nr_ioapics, + GFP_ATOMIC); + if (!ioapic_entries) + return 0; + + for (apic = 0; apic < nr_ioapics; apic++) { + ioapic_entries[apic] = + kzalloc(sizeof(struct IO_APIC_route_entry) * + nr_ioapic_registers[apic], GFP_ATOMIC); + if (!ioapic_entries[apic]) + goto nomem; + } + + return ioapic_entries; + +nomem: + while (--apic >= 0) + kfree(ioapic_entries[apic]); + kfree(ioapic_entries); + + return 0; +} /* * Saves all the IO-APIC RTE's */ -int save_IO_APIC_setup(void) +int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries) { - union IO_APIC_reg_01 reg_01; - unsigned long flags; int apic, pin; - /* - * The number of IO-APIC IRQ registers (== #pins): - */ - for (apic = 0; apic < nr_ioapics; apic++) { - spin_lock_irqsave(&ioapic_lock, flags); - reg_01.raw = io_apic_read(apic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); - nr_ioapic_registers[apic] = reg_01.bits.entries+1; - } + if (!ioapic_entries) + return -ENOMEM; for (apic = 0; apic < nr_ioapics; apic++) { - early_ioapic_entries[apic] = - kzalloc(sizeof(struct IO_APIC_route_entry) * - nr_ioapic_registers[apic], GFP_KERNEL); - if (!early_ioapic_entries[apic]) - goto nomem; - } + if (!ioapic_entries[apic]) + return -ENOMEM; - for (apic = 0; apic < nr_ioapics; apic++) for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) - early_ioapic_entries[apic][pin] = + ioapic_entries[apic][pin] = ioapic_read_entry(apic, pin); + } return 0; - -nomem: - while (apic >= 0) - kfree(early_ioapic_entries[apic--]); - memset(early_ioapic_entries, 0, - ARRAY_SIZE(early_ioapic_entries)); - - return -ENOMEM; } -void mask_IO_APIC_setup(void) +/* + * Mask all IO APIC entries. 
+ */ +void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries) { int apic, pin; + if (!ioapic_entries) + return; + for (apic = 0; apic < nr_ioapics; apic++) { - if (!early_ioapic_entries[apic]) + if (!ioapic_entries[apic]) break; + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { struct IO_APIC_route_entry entry; - entry = early_ioapic_entries[apic][pin]; + entry = ioapic_entries[apic][pin]; if (!entry.mask) { entry.mask = 1; ioapic_write_entry(apic, pin, entry); @@ -916,22 +927,30 @@ void mask_IO_APIC_setup(void) } } -void restore_IO_APIC_setup(void) +/* + * Restore IO APIC entries which was saved in ioapic_entries. + */ +int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries) { int apic, pin; + if (!ioapic_entries) + return -ENOMEM; + for (apic = 0; apic < nr_ioapics; apic++) { - if (!early_ioapic_entries[apic]) - break; + if (!ioapic_entries[apic]) + return -ENOMEM; + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) ioapic_write_entry(apic, pin, - early_ioapic_entries[apic][pin]); - kfree(early_ioapic_entries[apic]); - early_ioapic_entries[apic] = NULL; + ioapic_entries[apic][pin]); } + return 0; } -void reinit_intr_remapped_IO_APIC(int intr_remapping) +void reinit_intr_remapped_IO_APIC(int intr_remapping, + struct IO_APIC_route_entry **ioapic_entries) + { /* * for now plain restore of previous settings. @@ -940,7 +959,17 @@ void reinit_intr_remapped_IO_APIC(int intr_remapping) * table entries. for now, do a plain restore, and wait for * the setup_IO_APIC_irqs() to do proper initialization. */ - restore_IO_APIC_setup(); + restore_IO_APIC_setup(ioapic_entries); +} + +void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries) +{ + int apic; + + for (apic = 0; apic < nr_ioapics; apic++) + kfree(ioapic_entries[apic]); + + kfree(ioapic_entries); } #endif diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c index b041a409f4a7..c26633d7e7da 100644 --- a/drivers/pci/intr_remapping.c +++ b/drivers/pci/intr_remapping.c @@ -470,7 +470,7 @@ static int setup_intr_remapping(struct intel_iommu *iommu, int mode) /* * Disable Interrupt Remapping. */ -static void disable_intr_remapping(struct intel_iommu *iommu) +static void iommu_disable_intr_remapping(struct intel_iommu *iommu) { unsigned long flags; u32 sts; @@ -478,6 +478,12 @@ static void disable_intr_remapping(struct intel_iommu *iommu) if (!ecap_ir_support(iommu->ecap)) return; + /* + * global invalidation of interrupt entry cache before disabling + * interrupt-remapping. + */ + qi_global_iec(iommu); + spin_lock_irqsave(&iommu->register_lock, flags); sts = dmar_readq(iommu->reg + DMAR_GSTS_REG); @@ -511,7 +517,7 @@ int __init enable_intr_remapping(int eim) * Disable intr remapping and queued invalidation, if already * enabled prior to OS handover. */ - disable_intr_remapping(iommu); + iommu_disable_intr_remapping(iommu); dmar_disable_qi(iommu); } @@ -639,3 +645,54 @@ int __init parse_ioapics_under_ir(void) return ir_supported; } + +void disable_intr_remapping(void) +{ + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu = NULL; + + /* + * Disable Interrupt-remapping for all the DRHD's now. 
+ */ + for_each_iommu(iommu, drhd) { + if (!ecap_ir_support(iommu->ecap)) + continue; + + iommu_disable_intr_remapping(iommu); + } +} + +int reenable_intr_remapping(int eim) +{ + struct dmar_drhd_unit *drhd; + int setup = 0; + struct intel_iommu *iommu = NULL; + + for_each_iommu(iommu, drhd) + if (iommu->qi) + dmar_reenable_qi(iommu); + + /* + * Setup Interrupt-remapping for all the DRHD's now. + */ + for_each_iommu(iommu, drhd) { + if (!ecap_ir_support(iommu->ecap)) + continue; + + /* Set up interrupt remapping for iommu.*/ + iommu_set_intr_remapping(iommu, eim); + setup = 1; + } + + if (!setup) + goto error; + + return 0; + +error: + /* + * handle error condition gracefully here! + */ + return -1; +} + diff --git a/include/linux/dmar.h b/include/linux/dmar.h index 0b4aa8069985..4a0ce6f27e1b 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -108,6 +108,8 @@ struct irte { #ifdef CONFIG_INTR_REMAP extern int intr_remapping_enabled; extern int enable_intr_remapping(int); +extern void disable_intr_remapping(void); +extern int reenable_intr_remapping(int); extern int get_irte(int irq, struct irte *entry); extern int modify_irte(int irq, struct irte *irte_modified); -- cgit v1.2.3 From 161fde083f3403e7aa178dc944bf43c339e18491 Mon Sep 17 00:00:00 2001 From: "Han, Weidong" Date: Fri, 3 Apr 2009 17:15:47 +0800 Subject: intel-iommu: set compatibility format interrupt When extended interrupt mode (x2apic mode) is not supported in a system, it must set compatibility format interrupt to bypass interrupt remapping, otherwise compatibility format interrupts will be blocked. This will be used when interrupt remapping is enabled while x2apic is not supported. Signed-off-by: Weidong Han Acked-by: Ingo Molnar Signed-off-by: David Woodhouse --- drivers/pci/intr_remapping.c | 15 +++++++++++++++ include/linux/intel-iommu.h | 2 ++ 2 files changed, 17 insertions(+) (limited to 'include/linux') diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c index c26633d7e7da..ef25caade54b 100644 --- a/drivers/pci/intr_remapping.c +++ b/drivers/pci/intr_remapping.c @@ -415,12 +415,27 @@ static void iommu_set_intr_remapping(struct intel_iommu *iommu, int mode) /* Set interrupt-remapping table pointer */ cmd = iommu->gcmd | DMA_GCMD_SIRTP; + iommu->gcmd |= DMA_GCMD_SIRTP; writel(cmd, iommu->reg + DMAR_GCMD_REG); IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, readl, (sts & DMA_GSTS_IRTPS), sts); spin_unlock_irqrestore(&iommu->register_lock, flags); + if (mode == 0) { + spin_lock_irqsave(&iommu->register_lock, flags); + + /* enable comaptiblity format interrupt pass through */ + cmd = iommu->gcmd | DMA_GCMD_CFI; + iommu->gcmd |= DMA_GCMD_CFI; + writel(cmd, iommu->reg + DMAR_GCMD_REG); + + IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, + readl, (sts & DMA_GSTS_CFIS), sts); + + spin_unlock_irqrestore(&iommu->register_lock, flags); + } + /* * global invalidation of interrupt entry cache before enabling * interrupt-remapping. 
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 3771cd1f876e..aa8c53171233 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -164,6 +164,7 @@ static inline void dmar_writeq(void __iomem *addr, u64 val) #define DMA_GCMD_QIE (((u32)1) << 26) #define DMA_GCMD_SIRTP (((u32)1) << 24) #define DMA_GCMD_IRE (((u32) 1) << 25) +#define DMA_GCMD_CFI (((u32) 1) << 23) /* GSTS_REG */ #define DMA_GSTS_TES (((u32)1) << 31) @@ -174,6 +175,7 @@ static inline void dmar_writeq(void __iomem *addr, u64 val) #define DMA_GSTS_QIES (((u32)1) << 26) #define DMA_GSTS_IRTPS (((u32)1) << 24) #define DMA_GSTS_IRES (((u32)1) << 25) +#define DMA_GSTS_CFIS (((u32)1) << 23) /* CCMD_REG */ #define DMA_CCMD_ICC (((u64)1) << 63) -- cgit v1.2.3 From ce0d9d7255a55628fd3732bf583c83e90150b699 Mon Sep 17 00:00:00 2001 From: Evgeniy Polyakov Date: Wed, 14 Jan 2009 02:05:27 +0300 Subject: Staging: dst: core files. This patch contains DST core files, which introduce block layer, connector and sysfs registration glue and main headers. Connector is used for the configuration of the node (its type, address, device name and so on). Sysfs provides bits of information about running devices in the following format: +/* + * DST sysfs tree for device called 'storage': + * + * /sys/bus/dst/devices/storage/ + * /sys/bus/dst/devices/storage/type : 192.168.4.80:1025 + * /sys/bus/dst/devices/storage/size : 800 + * /sys/bus/dst/devices/storage/name : storage + */ DST header contains structure definitions and protocol command description. Signed-off-by: Evgeniy Polyakov Signed-off-by: Greg Kroah-Hartman --- drivers/staging/dst/dcore.c | 972 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/connector.h | 4 +- include/linux/dst.h | 587 ++++++++++++++++++++++++++ 3 files changed, 1562 insertions(+), 1 deletion(-) create mode 100644 drivers/staging/dst/dcore.c create mode 100644 include/linux/dst.h (limited to 'include/linux') diff --git a/drivers/staging/dst/dcore.c b/drivers/staging/dst/dcore.c new file mode 100644 index 000000000000..c6e3cd1a5051 --- /dev/null +++ b/drivers/staging/dst/dcore.c @@ -0,0 +1,972 @@ +/* + * 2007+ Copyright (c) Evgeniy Polyakov + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +static int dst_major; + +static DEFINE_MUTEX(dst_hash_lock); +static struct list_head *dst_hashtable; +static unsigned int dst_hashtable_size = 128; +module_param(dst_hashtable_size, uint, 0644); + +static char dst_name[] = "Dementianting goldfish"; + +static DEFINE_IDR(dst_index_idr); +static struct cb_id cn_dst_id = { CN_DST_IDX, CN_DST_VAL }; + +/* + * DST sysfs tree for device called 'storage': + * + * /sys/bus/dst/devices/storage/ + * /sys/bus/dst/devices/storage/type : 192.168.4.80:1025 + * /sys/bus/dst/devices/storage/size : 800 + * /sys/bus/dst/devices/storage/name : storage + */ + +static int dst_dev_match(struct device *dev, struct device_driver *drv) +{ + return 1; +} + +static struct bus_type dst_dev_bus_type = { + .name = "dst", + .match = &dst_dev_match, +}; + +static void dst_node_release(struct device *dev) +{ + struct dst_info *info = container_of(dev, struct dst_info, device); + + kfree(info); +} + +static struct device dst_node_dev = { + .bus = &dst_dev_bus_type, + .release = &dst_node_release +}; + +/* + * Setting size of the node after it was changed. + */ +static void dst_node_set_size(struct dst_node *n) +{ + struct block_device *bdev; + + set_capacity(n->disk, n->size >> 9); + + bdev = bdget_disk(n->disk, 0); + if (bdev) { + mutex_lock(&bdev->bd_inode->i_mutex); + i_size_write(bdev->bd_inode, n->size); + mutex_unlock(&bdev->bd_inode->i_mutex); + bdput(bdev); + } +} + +/* + * Distributed storage request processing function. + */ +static int dst_request(struct request_queue *q, struct bio *bio) +{ + struct dst_node *n = q->queuedata; + + bio_get(bio); + + return dst_process_bio(n, bio); +} + +/* + * Open/close callbacks for appropriate block device. + */ +static int dst_bdev_open(struct block_device *bdev, fmode_t mode) +{ + struct dst_node *n = bdev->bd_disk->private_data; + + dst_node_get(n); + return 0; +} + +static int dst_bdev_release(struct gendisk *disk, fmode_t mode) +{ + struct dst_node *n = disk->private_data; + + dst_node_put(n); + return 0; +} + +static struct block_device_operations dst_blk_ops = { + .open = dst_bdev_open, + .release = dst_bdev_release, + .owner = THIS_MODULE, +}; + +/* + * Block layer binding - disk is created when array is fully configured + * by userspace request. 
+ */ +static int dst_node_create_disk(struct dst_node *n) +{ + int err = -ENOMEM; + u32 index = 0; + + n->queue = blk_init_queue(NULL, NULL); + if (!n->queue) + goto err_out_exit; + + n->queue->queuedata = n; + blk_queue_make_request(n->queue, dst_request); + blk_queue_max_phys_segments(n->queue, n->max_pages); + blk_queue_max_hw_segments(n->queue, n->max_pages); + + err = -ENOMEM; + n->disk = alloc_disk(1); + if (!n->disk) + goto err_out_free_queue; + + if (!(n->state->permissions & DST_PERM_WRITE)) { + printk(KERN_INFO "DST node %s attached read-only.\n", n->name); + set_disk_ro(n->disk, 1); + } + + if (!idr_pre_get(&dst_index_idr, GFP_KERNEL)) + goto err_out_put; + + mutex_lock(&dst_hash_lock); + err = idr_get_new(&dst_index_idr, NULL, &index); + mutex_unlock(&dst_hash_lock); + if (err) + goto err_out_put; + + n->disk->major = dst_major; + n->disk->first_minor = index; + n->disk->fops = &dst_blk_ops; + n->disk->queue = n->queue; + n->disk->private_data = n; + snprintf(n->disk->disk_name, sizeof(n->disk->disk_name), "dst-%s", n->name); + + return 0; + +err_out_put: + put_disk(n->disk); +err_out_free_queue: + blk_cleanup_queue(n->queue); +err_out_exit: + return err; +} + +/* + * Sysfs machinery: show device's size. + */ +static ssize_t dst_show_size(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dst_info *info = container_of(dev, struct dst_info, device); + + return sprintf(buf, "%llu\n", info->size); +} + +/* + * Show local exported device. + */ +static ssize_t dst_show_local(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dst_info *info = container_of(dev, struct dst_info, device); + + return sprintf(buf, "%s\n", info->local); +} + +/* + * Shows type of the remote node - device major/minor number + * for local nodes and address (af_inet ipv4/ipv6 only) for remote nodes. + */ +static ssize_t dst_show_type(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dst_info *info = container_of(dev, struct dst_info, device); + int family = info->net.addr.sa_family; + + if (family == AF_INET) { + struct sockaddr_in *sin = (struct sockaddr_in *)&info->net.addr; + return sprintf(buf, "%u.%u.%u.%u:%d\n", + NIPQUAD(sin->sin_addr.s_addr), ntohs(sin->sin_port)); + } else if (family == AF_INET6) { + struct sockaddr_in6 *sin = (struct sockaddr_in6 *)&info->net.addr; + return sprintf(buf, + "%pi6:%d\n", + &sin->sin6_addr, ntohs(sin->sin6_port)); + } else { + int i, sz = PAGE_SIZE - 2; /* 0 symbol and '\n' below */ + int size, addrlen = info->net.addr.sa_data_len; + unsigned char *a = (unsigned char *)&info->net.addr.sa_data; + char *buf_orig = buf; + + size = snprintf(buf, sz, "family: %d, addrlen: %u, addr: ", + family, addrlen); + sz -= size; + buf += size; + + for (i=0; iinfo->device, + &dst_node_attrs[i]); + if (err) + goto err_out_remove_all; + } + return 0; + +err_out_remove_all: + while (--i >= 0) + device_remove_file(&n->info->device, + &dst_node_attrs[i]); + + return err; +} + +static void dst_remove_node_attributes(struct dst_node *n) +{ + int i; + + for (i=0; iinfo->device, + &dst_node_attrs[i]); +} + +/* + * Sysfs cleanup and initialization. + * Shows number of useful parameters. 
+ */ +static void dst_node_sysfs_exit(struct dst_node *n) +{ + if (n->info) { + dst_remove_node_attributes(n); + device_unregister(&n->info->device); + n->info = NULL; + } +} + +static int dst_node_sysfs_init(struct dst_node *n) +{ + int err; + + n->info = kzalloc(sizeof(struct dst_info), GFP_KERNEL); + if (!n->info) + return -ENOMEM; + + memcpy(&n->info->device, &dst_node_dev, sizeof(struct device)); + n->info->size = n->size; + + snprintf(n->info->device.bus_id, sizeof(n->info->device.bus_id), "dst-%s", n->name); + err = device_register(&n->info->device); + if (err) { + dprintk(KERN_ERR "Failed to register node '%s', err: %d.\n", + n->name, err); + goto err_out_exit; + } + + dst_create_node_attributes(n); + + return 0; + +err_out_exit: + kfree(n->info); + n->info = NULL; + return err; +} + +/* + * DST node hash tables machinery. + */ +static inline unsigned int dst_hash(char *str, unsigned int size) +{ + return (jhash(str, size, 0) % dst_hashtable_size); +} + +static void dst_node_remove(struct dst_node *n) +{ + mutex_lock(&dst_hash_lock); + list_del_init(&n->node_entry); + mutex_unlock(&dst_hash_lock); +} + +static void dst_node_add(struct dst_node *n) +{ + unsigned hash = dst_hash(n->name, sizeof(n->name)); + + mutex_lock(&dst_hash_lock); + list_add_tail(&n->node_entry, &dst_hashtable[hash]); + mutex_unlock(&dst_hash_lock); +} + +/* + * Cleaning node when it is about to be freed. + * There are still users of the socket though, + * so connection cleanup should be protected. + */ +static void dst_node_cleanup(struct dst_node *n) +{ + struct dst_state *st = n->state; + + if (!st) + return; + + if (n->queue) { + blk_cleanup_queue(n->queue); + + mutex_lock(&dst_hash_lock); + idr_remove(&dst_index_idr, n->disk->first_minor); + mutex_unlock(&dst_hash_lock); + + put_disk(n->disk); + } + + if (n->bdev) { + sync_blockdev(n->bdev); + blkdev_put(n->bdev, FMODE_READ|FMODE_WRITE); + } + + dst_state_lock(st); + st->need_exit = 1; + dst_state_exit_connected(st); + dst_state_unlock(st); + + wake_up(&st->thread_wait); + + dst_state_put(st); + n->state = NULL; +} + +/* + * Free security attributes attached to given node. + */ +static void dst_security_exit(struct dst_node *n) +{ + struct dst_secure *s, *tmp; + + list_for_each_entry_safe(s, tmp, &n->security_list, sec_entry) { + list_del(&s->sec_entry); + kfree(s); + } +} + +/* + * Free node when there are no more users. + * Actually node has to be freed on behalf od userspace process, + * since there are number of threads, which are embedded in the + * node, so they can not exit and free node from there, that is + * why there is a wakeup if reference counter is not equal to zero. + */ +void dst_node_put(struct dst_node *n) +{ + if (unlikely(!n)) + return; + + dprintk("%s: n: %p, refcnt: %d.\n", + __func__, n, atomic_read(&n->refcnt)); + + if (atomic_dec_and_test(&n->refcnt)) { + dst_node_remove(n); + n->trans_scan_timeout = 0; + dst_node_cleanup(n); + thread_pool_destroy(n->pool); + dst_node_sysfs_exit(n); + dst_node_crypto_exit(n); + dst_security_exit(n); + dst_node_trans_exit(n); + + kfree(n); + + dprintk("%s: freed n: %p.\n", __func__, n); + } else { + wake_up(&n->wait); + } +} + +/* + * This function finds devices major/minor numbers for given pathname. 
+ */ +static int dst_lookup_device(const char *path, dev_t *dev) +{ + int err; + struct nameidata nd; + struct inode *inode; + + err = path_lookup(path, LOOKUP_FOLLOW, &nd); + if (err) + return err; + + inode = nd.path.dentry->d_inode; + if (!inode) { + err = -ENOENT; + goto out; + } + + if (!S_ISBLK(inode->i_mode)) { + err = -ENOTBLK; + goto out; + } + + *dev = inode->i_rdev; + +out: + path_put(&nd.path); + return err; +} + +/* + * Setting up export device: lookup by the name, get its size + * and setup listening socket, which will accept clients, which + * will submit IO for given storage. + */ +static int dst_setup_export(struct dst_node *n, struct dst_ctl *ctl, + struct dst_export_ctl *le) +{ + int err; + dev_t dev = 0; /* gcc likes to scream here */ + + snprintf(n->info->local, sizeof(n->info->local), "%s", le->device); + + err = dst_lookup_device(le->device, &dev); + if (err) + return err; + + n->bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); + if (!n->bdev) + return -ENODEV; + + if (n->size != 0) + n->size = min_t(loff_t, n->bdev->bd_inode->i_size, n->size); + else + n->size = n->bdev->bd_inode->i_size; + + n->info->size = n->size; + err = dst_node_init_listened(n, le); + if (err) + goto err_out_cleanup; + + return 0; + +err_out_cleanup: + blkdev_put(n->bdev, FMODE_READ|FMODE_WRITE); + n->bdev = NULL; + + return err; +} + +/* Empty thread pool callbacks for the network processing threads. */ +static inline void *dst_thread_network_init(void *data) +{ + dprintk("%s: data: %p.\n", __func__, data); + return data; +} + +static inline void dst_thread_network_cleanup(void *data) +{ + dprintk("%s: data: %p.\n", __func__, data); +} + +/* + * Allocate DST node and initialize some of its parameters. + */ +static struct dst_node *dst_alloc_node(struct dst_ctl *ctl, + int (*start)(struct dst_node *), + int num) +{ + struct dst_node *n; + int err; + + n = kzalloc(sizeof(struct dst_node), GFP_KERNEL); + if (!n) + return NULL; + + INIT_LIST_HEAD(&n->node_entry); + + INIT_LIST_HEAD(&n->security_list); + mutex_init(&n->security_lock); + + init_waitqueue_head(&n->wait); + + n->trans_scan_timeout = msecs_to_jiffies(ctl->trans_scan_timeout); + if (!n->trans_scan_timeout) + n->trans_scan_timeout = HZ; + + n->trans_max_retries = ctl->trans_max_retries; + if (!n->trans_max_retries) + n->trans_max_retries = 10; + + /* + * Pretty much arbitrary default numbers. + * 32 matches maximum number of pages in bio originated from ext3 (31). + */ + n->max_pages = ctl->max_pages; + if (!n->max_pages) + n->max_pages = 32; + + if (n->max_pages > 1024) + n->max_pages = 1024; + + n->start = start; + n->size = ctl->size; + + atomic_set(&n->refcnt, 1); + atomic_long_set(&n->gen, 0); + snprintf(n->name, sizeof(n->name), "%s", ctl->name); + + err = dst_node_sysfs_init(n); + if (err) + goto err_out_free; + + n->pool = thread_pool_create(num, n->name, dst_thread_network_init, + dst_thread_network_cleanup, n); + if (IS_ERR(n->pool)) { + err = PTR_ERR(n->pool); + goto err_out_sysfs_exit; + } + + dprintk("%s: n: %p, name: %s.\n", __func__, n, n->name); + + return n; + +err_out_sysfs_exit: + dst_node_sysfs_exit(n); +err_out_free: + kfree(n); + return NULL; +} + +/* + * Starting a node, connected to the remote server: + * register block device and initialize transaction mechanism. + * In revers order though. + * + * It will autonegotiate some parameters with the remote node + * and update local if needed. 
+ * + * Transaction initialization should be the last thing before + * starting the node, since transaction should include not only + * block IO, but also crypto related data (if any), which are + * initialized separately. + */ +static int dst_start_remote(struct dst_node *n) +{ + int err; + + err = dst_node_trans_init(n, sizeof(struct dst_trans)); + if (err) + return err; + + err = dst_node_create_disk(n); + if (err) + return err; + + dst_node_set_size(n); + add_disk(n->disk); + + dprintk("DST: started remote node '%s', minor: %d.\n", n->name, n->disk->first_minor); + + return 0; +} + +/* + * Adding remote node and initialize connection. + */ +static int dst_add_remote(struct dst_node *n, struct dst_ctl *ctl, + void *data, unsigned int size) +{ + int err; + struct dst_network_ctl *rctl = data; + + if (n) + return -EEXIST; + + if (size != sizeof(struct dst_network_ctl)) + return -EINVAL; + + n = dst_alloc_node(ctl, dst_start_remote, 1); + if (!n) + return -ENOMEM; + + memcpy(&n->info->net, rctl, sizeof(struct dst_network_ctl)); + err = dst_node_init_connected(n, rctl); + if (err) + goto err_out_free; + + dst_node_add(n); + + return 0; + +err_out_free: + dst_node_put(n); + return err; +} + +/* + * Adding export node: initializing block device and listening socket. + */ +static int dst_add_export(struct dst_node *n, struct dst_ctl *ctl, + void *data, unsigned int size) +{ + int err; + struct dst_export_ctl *le = data; + + if (n) + return -EEXIST; + + if (size != sizeof(struct dst_export_ctl)) + return -EINVAL; + + n = dst_alloc_node(ctl, dst_start_export, 2); + if (!n) + return -EINVAL; + + err = dst_setup_export(n, ctl, le); + if (err) + goto err_out_free; + + dst_node_add(n); + + return 0; + +err_out_free: + dst_node_put(n); + return err; +} + +static int dst_node_remove_unload(struct dst_node *n) +{ + printk(KERN_INFO "STOPPED name: '%s', size: %llu.\n", + n->name, n->size); + + if (n->disk) + del_gendisk(n->disk); + + dst_node_remove(n); + dst_node_sysfs_exit(n); + + /* + * This is not a hack. Really. + * Node's reference counter allows to implement fine grained + * node freeing, but since all transactions (which hold node's + * reference counter) are processed in the dedicated thread, + * it is possible that reference will hit zero in that thread, + * so we will not be able to exit thread and cleanup the node. + * + * So, we remove disk, so no new activity is possible, and + * wait until all pending transaction are completed (either + * in receiving thread or by timeout in workqueue), in this + * case reference counter will be less or equal to 2 (once set in + * dst_alloc_node() and then in connector message parser; + * or when we force module unloading, and connector message + * parser does not hold a reference, in this case reference + * counter will be equal to 1), + * and subsequent dst_node_put() calls will free the node. + */ + dprintk("%s: going to sleep with %d refcnt.\n", __func__, atomic_read(&n->refcnt)); + wait_event(n->wait, atomic_read(&n->refcnt) <= 2); + + dst_node_put(n); + return 0; +} + +/* + * Remove node from the hash table. + */ +static int dst_del_node(struct dst_node *n, struct dst_ctl *ctl, + void *data, unsigned int size) +{ + if (!n) + return -ENODEV; + + return dst_node_remove_unload(n); +} + +/* + * Initialize crypto processing for given node. 
+ */ +static int dst_crypto_init(struct dst_node *n, struct dst_ctl *ctl, + void *data, unsigned int size) +{ + struct dst_crypto_ctl *crypto = data; + + if (!n) + return -ENODEV; + + if (size != sizeof(struct dst_crypto_ctl) + crypto->hash_keysize + + crypto->cipher_keysize) + return -EINVAL; + + if (n->trans_cache) + return -EEXIST; + + return dst_node_crypto_init(n, crypto); +} + +/* + * Security attributes for given node. + */ +static int dst_security_init(struct dst_node *n, struct dst_ctl *ctl, + void *data, unsigned int size) +{ + struct dst_secure *s; + + if (!n) + return -ENODEV; + + if (size != sizeof(struct dst_secure_user)) + return -EINVAL; + + s = kmalloc(sizeof(struct dst_secure), GFP_KERNEL); + if (!s) + return -ENOMEM; + + memcpy(&s->sec, data, size); + + mutex_lock(&n->security_lock); + list_add_tail(&s->sec_entry, &n->security_list); + mutex_unlock(&n->security_lock); + + return 0; +} + +/* + * Kill'em all! + */ +static int dst_start_node(struct dst_node *n, struct dst_ctl *ctl, + void *data, unsigned int size) +{ + int err; + + if (!n) + return -ENODEV; + + if (n->trans_cache) + return 0; + + err = n->start(n); + if (err) + return err; + + printk(KERN_INFO "STARTED name: '%s', size: %llu.\n", n->name, n->size); + return 0; +} + +typedef int (*dst_command_func)(struct dst_node *n, struct dst_ctl *ctl, + void *data, unsigned int size); + +/* + * List of userspace commands. + */ +static dst_command_func dst_commands[] = { + [DST_ADD_REMOTE] = &dst_add_remote, + [DST_ADD_EXPORT] = &dst_add_export, + [DST_DEL_NODE] = &dst_del_node, + [DST_CRYPTO] = &dst_crypto_init, + [DST_SECURITY] = &dst_security_init, + [DST_START] = &dst_start_node, +}; + +/* + * Configuration parser. + */ +static void cn_dst_callback(void *data) +{ + struct dst_ctl *ctl; + struct cn_msg *msg = data; + int err; + struct dst_ctl_ack ack; + struct dst_node *n = NULL, *tmp; + unsigned int hash; + + if (msg->len < sizeof(struct dst_ctl)) { + err = -EBADMSG; + goto out; + } + + ctl = (struct dst_ctl *)msg->data; + + if (ctl->cmd >= DST_CMD_MAX) { + err = -EINVAL; + goto out; + } + hash = dst_hash(ctl->name, sizeof(ctl->name)); + + mutex_lock(&dst_hash_lock); + list_for_each_entry(tmp, &dst_hashtable[hash], node_entry) { + if (!memcmp(tmp->name, ctl->name, sizeof(tmp->name))) { + n = tmp; + dst_node_get(n); + break; + } + } + mutex_unlock(&dst_hash_lock); + + err = dst_commands[ctl->cmd](n, ctl, msg->data + sizeof(struct dst_ctl), + msg->len - sizeof(struct dst_ctl)); + + dst_node_put(n); +out: + memcpy(&ack.msg, msg, sizeof(struct cn_msg)); + + ack.msg.ack = msg->ack + 1; + ack.msg.len = sizeof(struct dst_ctl_ack) - sizeof(struct cn_msg); + + ack.error = err; + + cn_netlink_send(&ack.msg, 0, GFP_KERNEL); +} + +/* + * Global initialization: sysfs, hash table, block device registration, + * connector and various caches. 
+ */ +static int __init dst_sysfs_init(void) +{ + return bus_register(&dst_dev_bus_type); +} + +static void dst_sysfs_exit(void) +{ + bus_unregister(&dst_dev_bus_type); +} + +static int __init dst_hashtable_init(void) +{ + unsigned int i; + + dst_hashtable = kcalloc(dst_hashtable_size, sizeof(struct list_head), + GFP_KERNEL); + if (!dst_hashtable) + return -ENOMEM; + + for (i=0; i"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/connector.h b/include/linux/connector.h index fc65d219d88c..b9966e64604e 100644 --- a/include/linux/connector.h +++ b/include/linux/connector.h @@ -39,8 +39,10 @@ #define CN_IDX_V86D 0x4 #define CN_VAL_V86D_UVESAFB 0x1 #define CN_IDX_BB 0x5 /* BlackBoard, from the TSP GPL sampling framework */ +#define CN_DST_IDX 0x6 +#define CN_DST_VAL 0x1 -#define CN_NETLINK_USERS 6 +#define CN_NETLINK_USERS 7 /* * Maximum connector's message size. diff --git a/include/linux/dst.h b/include/linux/dst.h new file mode 100644 index 000000000000..e26fed84b1aa --- /dev/null +++ b/include/linux/dst.h @@ -0,0 +1,587 @@ +/* + * 2007+ Copyright (c) Evgeniy Polyakov + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef __DST_H +#define __DST_H + +#include +#include + +#define DST_NAMELEN 32 +#define DST_NAME "dst" + +enum { + /* Remove node with given id from storage */ + DST_DEL_NODE = 0, + /* Add remote node with given id to the storage */ + DST_ADD_REMOTE, + /* Add local node with given id to the storage to be exported and used by remote peers */ + DST_ADD_EXPORT, + /* Crypto initialization command (hash/cipher used to protect the connection) */ + DST_CRYPTO, + /* Security attributes for given connection (permissions for example) */ + DST_SECURITY, + /* Register given node in the block layer subsystem */ + DST_START, + DST_CMD_MAX +}; + +struct dst_ctl +{ + /* Storage name */ + char name[DST_NAMELEN]; + /* Command flags */ + __u32 flags; + /* Command itself (see above) */ + __u32 cmd; + /* Maximum number of pages per single request in this device */ + __u32 max_pages; + /* Stale/error transaction scanning timeout in milliseconds */ + __u32 trans_scan_timeout; + /* Maximum number of retry sends before completing transaction as broken */ + __u32 trans_max_retries; + /* Storage size */ + __u64 size; +}; + +/* Reply command carries completion status */ +struct dst_ctl_ack +{ + struct cn_msg msg; + int error; + int unused[3]; +}; + +/* + * Unfortunaltely socket address structure is not exported to userspace + * and is redefined there. + */ +#define SADDR_MAX_DATA 128 + +struct saddr { + /* address family, AF_xxx */ + unsigned short sa_family; + /* 14 bytes of protocol address */ + char sa_data[SADDR_MAX_DATA]; + /* Number of bytes used in sa_data */ + unsigned short sa_data_len; +}; + +/* Address structure */ +struct dst_network_ctl +{ + /* Socket type: datagram, stream...*/ + unsigned int type; + /* Let me guess, is it a Jupiter diameter? 
*/ + unsigned int proto; + /* Peer's address */ + struct saddr addr; +}; + +struct dst_crypto_ctl +{ + /* Cipher and hash names */ + char cipher_algo[DST_NAMELEN]; + char hash_algo[DST_NAMELEN]; + + /* Key sizes. Can be zero for digest for example */ + unsigned int cipher_keysize, hash_keysize; + /* Alignment. Calculated by the DST itself. */ + unsigned int crypto_attached_size; + /* Number of threads to perform crypto operations */ + int thread_num; +}; + +/* Export security attributes have this bits checked in when client connects */ +#define DST_PERM_READ (1<<0) +#define DST_PERM_WRITE (1<<1) + +/* + * Right now it is simple model, where each remote address + * is assigned to set of permissions it is allowed to perform. + * In real world block device does not know anything but + * reading and writing, so it should be more than enough. + */ +struct dst_secure_user +{ + unsigned int permissions; + struct saddr addr; +}; + +/* + * Export control command: device to export and network address to accept + * clients to work with given device + */ +struct dst_export_ctl +{ + char device[DST_NAMELEN]; + struct dst_network_ctl ctl; +}; + +enum { + DST_CFG = 1, /* Request remote configuration */ + DST_IO, /* IO command */ + DST_IO_RESPONSE, /* IO response */ + DST_PING, /* Keepalive message */ + DST_NCMD_MAX, +}; + +struct dst_cmd +{ + /* Network command itself, see above */ + __u32 cmd; + /* + * Size of the attached data + * (in most cases, for READ command it means how many bytes were requested) + */ + __u32 size; + /* Crypto size: number of attached bytes with digest/hmac */ + __u32 csize; + /* Here we can carry secret data */ + __u32 reserved; + /* Read/write bits, see how they are encoded in bio structure */ + __u64 rw; + /* BIO flags */ + __u64 flags; + /* Unique command id (like transaction ID) */ + __u64 id; + /* Sector to start IO from */ + __u64 sector; + /* Hash data is placed after this header */ + __u8 hash[0]; +}; + +/* + * Convert command to/from network byte order. + * We do not use hton*() functions, since there is + * no 64-bit implementation. + */ +static inline void dst_convert_cmd(struct dst_cmd *c) +{ + c->cmd = __cpu_to_be32(c->cmd); + c->csize = __cpu_to_be32(c->csize); + c->size = __cpu_to_be32(c->size); + c->sector = __cpu_to_be64(c->sector); + c->id = __cpu_to_be64(c->id); + c->flags = __cpu_to_be64(c->flags); + c->rw = __cpu_to_be64(c->rw); +} + +/* Transaction id */ +typedef __u64 dst_gen_t; + +#ifdef __KERNEL__ + +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_DST_DEBUG +#define dprintk(f, a...) printk(KERN_NOTICE f, ##a) +#else +static inline void __attribute__ ((format (printf, 1, 2))) + dprintk(const char *fmt, ...) {} +#endif + +struct dst_node; + +struct dst_trans +{ + /* DST node we are working with */ + struct dst_node *n; + + /* Entry inside transaction tree */ + struct rb_node trans_entry; + + /* Merlin kills this transaction when this memory cell equals zero */ + atomic_t refcnt; + + /* How this transaction should be processed by crypto engine */ + short enc; + /* How many times this transaction was resent */ + short retries; + /* Completion status */ + int error; + + /* When did we send it to the remote peer */ + long send_time; + + /* My name is... 
+ * Well, computers does not speak, they have unique id instead */ + dst_gen_t gen; + + /* Block IO we are working with */ + struct bio *bio; + + /* Network command for above block IO request */ + struct dst_cmd cmd; +}; + +struct dst_crypto_engine +{ + /* What should we do with all block requests */ + struct crypto_hash *hash; + struct crypto_ablkcipher *cipher; + + /* Pool of pages used to encrypt data into before sending */ + int page_num; + struct page **pages; + + /* What to do with current request */ + int enc; + /* Who we are and where do we go */ + struct scatterlist *src, *dst; + + /* Maximum timeout waiting for encryption to be completed */ + long timeout; + /* IV is a 64-bit sequential counter */ + u64 iv; + + /* Secret data */ + void *private; + + /* Cached temporary data lives here */ + int size; + void *data; +}; + +struct dst_state +{ + /* The main state protection */ + struct mutex state_lock; + + /* Polling machinery for sockets */ + wait_queue_t wait; + wait_queue_head_t *whead; + /* Most of events are being waited here */ + wait_queue_head_t thread_wait; + + /* Who owns this? */ + struct dst_node *node; + + /* Network address for this state */ + struct dst_network_ctl ctl; + + /* Permissions to work with: read-only or rw connection */ + u32 permissions; + + /* Called when we need to clean private data */ + void (* cleanup)(struct dst_state *st); + + /* Used by the server: BIO completion queues BIOs here */ + struct list_head request_list; + spinlock_t request_lock; + + /* Guess what? No, it is not number of planets */ + atomic_t refcnt; + + /* This flags is set when connection should be dropped */ + int need_exit; + + /* + * Socket to work with. Second pointer is used for + * lockless check if socket was changed before performing + * next action (like working with cached polling result) + */ + struct socket *socket, *read_socket; + + /* Cached preallocated data */ + void *data; + unsigned int size; + + /* Currently processed command */ + struct dst_cmd cmd; +}; + +struct dst_info +{ + /* Device size */ + u64 size; + + /* Local device name for export devices */ + char local[DST_NAMELEN]; + + /* Network setup */ + struct dst_network_ctl net; + + /* Sysfs bits use this */ + struct device device; +}; + +struct dst_node +{ + struct list_head node_entry; + + /* Hi, my name is stored here */ + char name[DST_NAMELEN]; + /* My cache name is stored here */ + char cache_name[DST_NAMELEN]; + + /* Block device attached to given node. + * Only valid for exporting nodes */ + struct block_device *bdev; + /* Network state machine for given peer */ + struct dst_state *state; + + /* Block IO machinery */ + struct request_queue *queue; + struct gendisk *disk; + + /* Number of threads in processing pool */ + int thread_num; + /* Maximum number of pages in single IO */ + int max_pages; + + /* I'm that big in bytes */ + loff_t size; + + /* Exported to userspace node information */ + struct dst_info *info; + + /* + * Security attribute list. + * Used only by exporting node currently. + */ + struct list_head security_list; + struct mutex security_lock; + + /* + * When this unerflows below zero, university collapses. + * But this will not happen, since node will be freed, + * when reference counter reaches zero. + */ + atomic_t refcnt; + + /* How precisely should I be started? 
*/ + int (*start)(struct dst_node *); + + /* Crypto capabilities */ + struct dst_crypto_ctl crypto; + u8 *hash_key; + u8 *cipher_key; + + /* Pool of processing thread */ + struct thread_pool *pool; + + /* Transaction IDs live here */ + atomic_long_t gen; + + /* + * How frequently and how many times transaction + * tree should be scanned to drop stale objects. + */ + long trans_scan_timeout; + int trans_max_retries; + + /* Small gnomes live here */ + struct rb_root trans_root; + struct mutex trans_lock; + + /* + * Transaction cache/memory pool. + * It is big enough to contain not only transaction + * itself, but additional crypto data (digest/hmac). + */ + struct kmem_cache *trans_cache; + mempool_t *trans_pool; + + /* This entity scans transaction tree */ + struct delayed_work trans_work; + + wait_queue_head_t wait; +}; + +/* Kernel representation of the security attribute */ +struct dst_secure +{ + struct list_head sec_entry; + struct dst_secure_user sec; +}; + +int dst_process_bio(struct dst_node *n, struct bio *bio); + +int dst_node_init_connected(struct dst_node *n, struct dst_network_ctl *r); +int dst_node_init_listened(struct dst_node *n, struct dst_export_ctl *le); + +static inline struct dst_state *dst_state_get(struct dst_state *st) +{ + BUG_ON(atomic_read(&st->refcnt) == 0); + atomic_inc(&st->refcnt); + return st; +} + +void dst_state_put(struct dst_state *st); + +struct dst_state *dst_state_alloc(struct dst_node *n); +int dst_state_socket_create(struct dst_state *st); +void dst_state_socket_release(struct dst_state *st); + +void dst_state_exit_connected(struct dst_state *st); + +int dst_state_schedule_receiver(struct dst_state *st); + +void dst_dump_addr(struct socket *sk, struct sockaddr *sa, char *str); + +static inline void dst_state_lock(struct dst_state *st) +{ + mutex_lock(&st->state_lock); +} + +static inline void dst_state_unlock(struct dst_state *st) +{ + mutex_unlock(&st->state_lock); +} + +void dst_poll_exit(struct dst_state *st); +int dst_poll_init(struct dst_state *st); + +static inline unsigned int dst_state_poll(struct dst_state *st) +{ + unsigned int revents = POLLHUP | POLLERR; + + dst_state_lock(st); + if (st->socket) + revents = st->socket->ops->poll(NULL, st->socket, NULL); + dst_state_unlock(st); + + return revents; +} + +static inline int dst_thread_setup(void *private, void *data) +{ + return 0; +} + +void dst_node_put(struct dst_node *n); + +static inline struct dst_node *dst_node_get(struct dst_node *n) +{ + atomic_inc(&n->refcnt); + return n; +} + +int dst_data_recv(struct dst_state *st, void *data, unsigned int size); +int dst_recv_cdata(struct dst_state *st, void *cdata); +int dst_data_send_header(struct socket *sock, + void *data, unsigned int size, int more); + +int dst_send_bio(struct dst_state *st, struct dst_cmd *cmd, struct bio *bio); + +int dst_process_io(struct dst_state *st); +int dst_export_crypto(struct dst_node *n, struct bio *bio); +int dst_export_send_bio(struct bio *bio); +int dst_start_export(struct dst_node *n); + +int __init dst_export_init(void); +void dst_export_exit(void); + +/* Private structure for export block IO requests */ +struct dst_export_priv +{ + struct list_head request_entry; + struct dst_state *state; + struct bio *bio; + struct dst_cmd cmd; +}; + +static inline void dst_trans_get(struct dst_trans *t) +{ + atomic_inc(&t->refcnt); +} + +struct dst_trans *dst_trans_search(struct dst_node *node, dst_gen_t gen); +int dst_trans_remove(struct dst_trans *t); +int dst_trans_remove_nolock(struct dst_trans *t); +void 
dst_trans_put(struct dst_trans *t); + +/* + * Convert bio into network command. + */ +static inline void dst_bio_to_cmd(struct bio *bio, struct dst_cmd *cmd, + u32 command, u64 id) +{ + cmd->cmd = command; + cmd->flags = (bio->bi_flags << BIO_POOL_BITS) >> BIO_POOL_BITS; + cmd->rw = bio->bi_rw; + cmd->size = bio->bi_size; + cmd->csize = 0; + cmd->id = id; + cmd->sector = bio->bi_sector; +}; + +int dst_trans_send(struct dst_trans *t); +int dst_trans_crypto(struct dst_trans *t); + +int dst_node_crypto_init(struct dst_node *n, struct dst_crypto_ctl *ctl); +void dst_node_crypto_exit(struct dst_node *n); + +static inline int dst_need_crypto(struct dst_node *n) +{ + struct dst_crypto_ctl *c = &n->crypto; + /* + * Logical OR is appropriate here, but boolean one produces + * more optimal code, so it is used instead. + */ + return (c->hash_algo[0] | c->cipher_algo[0]); +} + +int dst_node_trans_init(struct dst_node *n, unsigned int size); +void dst_node_trans_exit(struct dst_node *n); + +/* + * Pool of threads. + * Ready list contains threads currently free to be used, + * active one contains threads with some work scheduled for them. + * Caller can wait in given queue when thread is ready. + */ +struct thread_pool +{ + int thread_num; + struct mutex thread_lock; + struct list_head ready_list, active_list; + + wait_queue_head_t wait; +}; + +void thread_pool_del_worker(struct thread_pool *p); +void thread_pool_del_worker_id(struct thread_pool *p, unsigned int id); +int thread_pool_add_worker(struct thread_pool *p, + char *name, + unsigned int id, + void *(* init)(void *data), + void (* cleanup)(void *data), + void *data); + +void thread_pool_destroy(struct thread_pool *p); +struct thread_pool *thread_pool_create(int num, char *name, + void *(* init)(void *data), + void (* cleanup)(void *data), + void *data); + +int thread_pool_schedule(struct thread_pool *p, + int (* setup)(void *stored_private, void *setup_data), + int (* action)(void *stored_private, void *setup_data), + void *setup_data, long timeout); +int thread_pool_schedule_private(struct thread_pool *p, + int (* setup)(void *private, void *data), + int (* action)(void *private, void *data), + void *data, long timeout, void *id); + +#endif /* __KERNEL__ */ +#endif /* __DST_H */ -- cgit v1.2.3 From 7237d3de78ff89ec2e18eae5fe962d063024fef5 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 30 Mar 2009 13:55:30 -0800 Subject: x86, ACPI: add support for x2apic ACPI extensions All logical processors with APIC ID values of 255 and greater will have their APIC reported through Processor X2APIC structure (type-9 entry type) and all logical processors with APIC ID less than 255 will have their APIC reported through legacy Processor Local APIC (type-0 entry type) only. This is the same case even for NMI structure reporting. The Processor X2APIC Affinity structure provides the association between the X2APIC ID of a logical processor and the proximity domain to which the logical processor belongs. For OSPM, Processor IDs outside the 0-254 range are to be declared as Device() objects in the ACPI namespace.
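[Editor's note: the 255-and-above reporting rule described in this commit message can be illustrated with a small, self-contained user-space sketch; it is not part of the patch below. The entry type values (0 for Processor Local APIC, 9 for Processor Local x2APIC) are taken from the ACPI MADT definitions used in this series; the helper name madt_entry_for is hypothetical.]

/* Illustrative sketch only: choose the MADT processor entry type by APIC ID width. */
#include <stdint.h>
#include <stdio.h>

enum madt_cpu_entry { MADT_LOCAL_APIC = 0, MADT_LOCAL_X2APIC = 9 };

/*
 * APIC IDs of 255 and greater do not fit the legacy 8-bit Local APIC
 * structure, so they are reported via the 32-bit x2APIC structure instead.
 */
static enum madt_cpu_entry madt_entry_for(uint32_t apic_id)
{
	return (apic_id >= 255) ? MADT_LOCAL_X2APIC : MADT_LOCAL_APIC;
}

int main(void)
{
	const uint32_t ids[] = { 0, 254, 255, 1024 };
	unsigned int i;

	for (i = 0; i < sizeof(ids) / sizeof(ids[0]); i++)
		printf("APIC ID %u -> MADT entry type %d\n",
		       (unsigned int)ids[i], madt_entry_for(ids[i]));
	return 0;
}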
Signed-off-by: Suresh Siddha Signed-off-by: Len Brown --- arch/x86/kernel/acpi/boot.c | 63 ++++++++++++++++++++++++++++++++++++++++--- arch/x86/mm/srat_64.c | 30 +++++++++++++++++++++ drivers/acpi/numa.c | 46 ++++++++++++++++++++++++++++++- drivers/acpi/processor_core.c | 26 ++++++++++++++++++ drivers/acpi/tables.c | 30 +++++++++++++++++++++ include/linux/acpi.h | 1 + 6 files changed, 191 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 7678f10c4568..565e70c7ca93 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -259,6 +259,35 @@ static void __cpuinit acpi_register_lapic(int id, u8 enabled) generic_processor_info(id, ver); } +static int __init +acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end) +{ + struct acpi_madt_local_x2apic *processor = NULL; + + processor = (struct acpi_madt_local_x2apic *)header; + + if (BAD_MADT_ENTRY(processor, end)) + return -EINVAL; + + acpi_table_print_madt_entry(header); + +#ifdef CONFIG_X86_X2APIC + /* + * We need to register disabled CPU as well to permit + * counting disabled CPUs. This allows us to size + * cpus_possible_map more accurately, to permit + * to not preallocating memory for all NR_CPUS + * when we use CPU hotplug. + */ + acpi_register_lapic(processor->local_apic_id, /* APIC ID */ + processor->lapic_flags & ACPI_MADT_ENABLED); +#else + printk(KERN_WARNING PREFIX "x2apic entry ignored\n"); +#endif + + return 0; +} + static int __init acpi_parse_lapic(struct acpi_subtable_header * header, const unsigned long end) { @@ -318,6 +347,25 @@ acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header, return 0; } +static int __init +acpi_parse_x2apic_nmi(struct acpi_subtable_header *header, + const unsigned long end) +{ + struct acpi_madt_local_x2apic_nmi *x2apic_nmi = NULL; + + x2apic_nmi = (struct acpi_madt_local_x2apic_nmi *)header; + + if (BAD_MADT_ENTRY(x2apic_nmi, end)) + return -EINVAL; + + acpi_table_print_madt_entry(header); + + if (x2apic_nmi->lint != 1) + printk(KERN_WARNING PREFIX "NMI not connected to LINT 1!\n"); + + return 0; +} + static int __init acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long end) { @@ -823,6 +871,7 @@ static int __init early_acpi_parse_madt_lapic_addr_ovr(void) static int __init acpi_parse_madt_lapic_entries(void) { int count; + int x2count = 0; if (!cpu_has_apic) return -ENODEV; @@ -846,22 +895,28 @@ static int __init acpi_parse_madt_lapic_entries(void) count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_SAPIC, acpi_parse_sapic, MAX_APICS); - if (!count) + if (!count) { + x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC, + acpi_parse_x2apic, MAX_APICS); count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC, acpi_parse_lapic, MAX_APICS); - if (!count) { + } + if (!count && !x2count) { printk(KERN_ERR PREFIX "No LAPIC entries present\n"); /* TBD: Cleanup to allow fallback to MPS */ return -ENODEV; - } else if (count < 0) { + } else if (count < 0 || x2count < 0) { printk(KERN_ERR PREFIX "Error parsing LAPIC entry\n"); /* TBD: Cleanup to allow fallback to MPS */ return count; } + x2count = + acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC_NMI, + acpi_parse_x2apic_nmi, 0); count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_NMI, acpi_parse_lapic_nmi, 0); - if (count < 0) { + if (count < 0 || x2count < 0) { printk(KERN_ERR PREFIX "Error parsing LAPIC NMI entry\n"); /* TBD: Cleanup to allow fallback to MPS */ return count; diff 
--git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 09737c8af074..13d56f5b1349 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -115,6 +115,36 @@ void __init acpi_numa_slit_init(struct acpi_table_slit *slit) reserve_early(phys, phys + length, "ACPI SLIT"); } +/* Callback for Proximity Domain -> x2APIC mapping */ +void __init +acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) +{ + int pxm, node; + int apic_id; + + if (srat_disabled()) + return; + if (pa->header.length < sizeof(struct acpi_srat_x2apic_cpu_affinity)) { + bad_srat(); + return; + } + if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0) + return; + pxm = pa->proximity_domain; + node = setup_node(pxm); + if (node < 0) { + printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm); + bad_srat(); + return; + } + + apic_id = pa->apic_id; + apicid_to_node[apic_id] = node; + acpi_numa = 1; + printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n", + pxm, apic_id, node); +} + /* Callback for Proximity Domain -> LAPIC mapping */ void __init acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c index 3a0d8ef25c75..d440ccd27d91 100644 --- a/drivers/acpi/numa.c +++ b/drivers/acpi/numa.c @@ -131,6 +131,21 @@ acpi_table_print_srat_entry(struct acpi_subtable_header *header) #endif /* ACPI_DEBUG_OUTPUT */ break; + case ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY: +#ifdef ACPI_DEBUG_OUTPUT + { + struct acpi_srat_x2apic_cpu_affinity *p = + (struct acpi_srat_x2apic_cpu_affinity *)header; + ACPI_DEBUG_PRINT((ACPI_DB_INFO, + "SRAT Processor (x2apicid[0x%08x]) in" + " proximity domain %d %s\n", + p->apic_id, + p->proximity_domain, + (p->flags & ACPI_SRAT_CPU_ENABLED) ? + "enabled" : "disabled")); + } +#endif /* ACPI_DEBUG_OUTPUT */ + break; default: printk(KERN_WARNING PREFIX "Found unsupported SRAT entry (type = 0x%x)\n", @@ -180,8 +195,35 @@ static int __init acpi_parse_slit(struct acpi_table_header *table) return 0; } +void __init __attribute__ ((weak)) +acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) +{ + printk(KERN_WARNING PREFIX + "Found unsupported x2apic [0x%08x] SRAT entry\n", pa->apic_id); + return; +} + + +static int __init +acpi_parse_x2apic_affinity(struct acpi_subtable_header *header, + const unsigned long end) +{ + struct acpi_srat_x2apic_cpu_affinity *processor_affinity; + + processor_affinity = (struct acpi_srat_x2apic_cpu_affinity *)header; + if (!processor_affinity) + return -EINVAL; + + acpi_table_print_srat_entry(header); + + /* let architecture-dependent part to do it */ + acpi_numa_x2apic_affinity_init(processor_affinity); + + return 0; +} + static int __init -acpi_parse_processor_affinity(struct acpi_subtable_header * header, +acpi_parse_processor_affinity(struct acpi_subtable_header *header, const unsigned long end) { struct acpi_srat_cpu_affinity *processor_affinity; @@ -241,6 +283,8 @@ int __init acpi_numa_init(void) { /* SRAT: Static Resource Affinity Table */ if (!acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat)) { + acpi_table_parse_srat(ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY, + acpi_parse_x2apic_affinity, NR_CPUS); acpi_table_parse_srat(ACPI_SRAT_TYPE_CPU_AFFINITY, acpi_parse_processor_affinity, NR_CPUS); acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY, diff --git a/drivers/acpi/processor_core.c b/drivers/acpi/processor_core.c index 0cc2fd31e376..775324e34ffa 100644 --- a/drivers/acpi/processor_core.c +++ b/drivers/acpi/processor_core.c @@ -427,6 +427,29 @@ static int map_lapic_id(struct 
acpi_subtable_header *entry, return 0; } +static int map_x2apic_id(struct acpi_subtable_header *entry, + int device_declaration, u32 acpi_id, int *apic_id) +{ + struct acpi_madt_local_x2apic *apic = + (struct acpi_madt_local_x2apic *)entry; + u32 tmp = apic->local_apic_id; + + /* Only check enabled APICs*/ + if (!(apic->lapic_flags & ACPI_MADT_ENABLED)) + return 0; + + /* Device statement declaration type */ + if (device_declaration) { + if (apic->uid == acpi_id) + goto found; + } + + return 0; +found: + *apic_id = tmp; + return 1; +} + static int map_lsapic_id(struct acpi_subtable_header *entry, int device_declaration, u32 acpi_id, int *apic_id) { @@ -476,6 +499,9 @@ static int map_madt_entry(int type, u32 acpi_id) if (header->type == ACPI_MADT_TYPE_LOCAL_APIC) { if (map_lapic_id(header, acpi_id, &apic_id)) break; + } else if (header->type == ACPI_MADT_TYPE_LOCAL_X2APIC) { + if (map_x2apic_id(header, type, acpi_id, &apic_id)) + break; } else if (header->type == ACPI_MADT_TYPE_LOCAL_SAPIC) { if (map_lsapic_id(header, type, acpi_id, &apic_id)) break; diff --git a/drivers/acpi/tables.c b/drivers/acpi/tables.c index a8852952fac4..991c006a301b 100644 --- a/drivers/acpi/tables.c +++ b/drivers/acpi/tables.c @@ -62,6 +62,18 @@ void acpi_table_print_madt_entry(struct acpi_subtable_header *header) } break; + case ACPI_MADT_TYPE_LOCAL_X2APIC: + { + struct acpi_madt_local_x2apic *p = + (struct acpi_madt_local_x2apic *)header; + printk(KERN_INFO PREFIX + "X2APIC (apic_id[0x%02x] uid[0x%02x] %s)\n", + p->local_apic_id, p->uid, + (p->lapic_flags & ACPI_MADT_ENABLED) ? + "enabled" : "disabled"); + } + break; + case ACPI_MADT_TYPE_IO_APIC: { struct acpi_madt_io_apic *p = @@ -116,6 +128,24 @@ void acpi_table_print_madt_entry(struct acpi_subtable_header *header) } break; + case ACPI_MADT_TYPE_LOCAL_X2APIC_NMI: + { + u16 polarity, trigger; + struct acpi_madt_local_x2apic_nmi *p = + (struct acpi_madt_local_x2apic_nmi *)header; + + polarity = p->inti_flags & ACPI_MADT_POLARITY_MASK; + trigger = (p->inti_flags & ACPI_MADT_TRIGGER_MASK) >> 2; + + printk(KERN_INFO PREFIX + "X2APIC_NMI (uid[0x%02x] %s %s lint[0x%x])\n", + p->uid, + mps_inti_flags_polarity[polarity], + mps_inti_flags_trigger[trigger], + p->lint); + } + break; + case ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE: { struct acpi_madt_local_apic_override *p = diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 6fce2fc2d124..a6989e517549 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -96,6 +96,7 @@ void acpi_table_print_madt_entry (struct acpi_subtable_header *madt); /* the following four functions are architecture-dependent */ void acpi_numa_slit_init (struct acpi_table_slit *slit); void acpi_numa_processor_affinity_init (struct acpi_srat_cpu_affinity *pa); +void acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa); void acpi_numa_memory_affinity_init (struct acpi_srat_mem_affinity *ma); void acpi_numa_arch_fixup(void); -- cgit v1.2.3 From 2f425878b6a71571341dcd3f9e9d1a6f6355da9c Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Fri, 3 Apr 2009 08:27:32 +0300 Subject: nfsd: don't use the deferral service, return NFS4ERR_DELAY On an NFSv4.1 server cache miss that causes an upcall, NFS4ERR_DELAY will be returned. It is up to the NFSv4.1 client to resend only the operations that have not been processed. Initialize rq_usedeferral to 1 in svc_process(). It will be turned off in nfsd4_proc_compound() only when NFSv4.1 Sessions are used. Note: this isn't an adequate solution on its own.
It's acceptable as a way to get some minimal 4.1 up and working, but we're going to have to find a way to avoid returning DELAY in all common cases before 4.1 can really be considered ready. Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy [nfsd41: reverse rq_nodeferral negative logic] Signed-off-by: Benny Halevy [sunrpc: initialize rq_usedeferral] Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 8 ++++++++ include/linux/sunrpc/svc.h | 1 + net/sunrpc/svc.c | 2 ++ net/sunrpc/svc_xprt.c | 2 +- 4 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 249dad987a16..ded469ff08b3 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -854,6 +854,8 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, resp->cstate.replay_owner = NULL; fh_init(&resp->cstate.current_fh, NFS4_FHSIZE); fh_init(&resp->cstate.save_fh, NFS4_FHSIZE); + /* Use the deferral mechanism only for NFSv4.0 compounds */ + rqstp->rq_usedeferral = (args->minorversion == 0); /* * According to RFC3010, this takes precedence over all other errors. @@ -933,12 +935,18 @@ encode_op: nfsd4_increment_op_stats(op->opnum); } + if (!rqstp->rq_usedeferral && status == nfserr_dropit) { + dprintk("%s Dropit - send NFS4ERR_DELAY\n", __func__); + status = nfserr_jukebox; + } fh_put(&resp->cstate.current_fh); fh_put(&resp->cstate.save_fh); BUG_ON(resp->cstate.replay_owner); out: nfsd4_release_compoundargs(args); + /* Reset deferral mechanism for RPC deferrals */ + rqstp->rq_usedeferral = 1; dprintk("nfsv4 compound returned %d\n", ntohl(status)); return status; } diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 9f9f699dd469..815dd589d4db 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -230,6 +230,7 @@ struct svc_rqst { struct svc_cred rq_cred; /* auth info */ void * rq_xprt_ctxt; /* transport specific context ptr */ struct svc_deferred_req*rq_deferred; /* deferred request we are replaying */ + int rq_usedeferral; /* use deferral */ size_t rq_xprt_hlen; /* xprt header len */ struct xdr_buf rq_arg; diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index fff09a2d8960..45984cbe1bfa 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1023,6 +1023,8 @@ svc_process(struct svc_rqst *rqstp) rqstp->rq_res.tail[0].iov_len = 0; /* Will be turned off only in gss privacy case: */ rqstp->rq_splice_ok = 1; + /* Will be turned off only when NFSv4 Sessions are used */ + rqstp->rq_usedeferral = 1; /* Setup reply header */ rqstp->rq_xprt->xpt_ops->xpo_prep_reply_hdr(rqstp); diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 1e66f2491460..600d0918e3ae 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -974,7 +974,7 @@ static struct cache_deferred_req *svc_defer(struct cache_req *req) struct svc_rqst *rqstp = container_of(req, struct svc_rqst, rq_chandle); struct svc_deferred_req *dr; - if (rqstp->rq_arg.page_len) + if (rqstp->rq_arg.page_len || !rqstp->rq_usedeferral) return NULL; /* if more than a page, give up FIXME */ if (rqstp->rq_deferred) { dr = rqstp->rq_deferred; -- cgit v1.2.3 From 18df1884a872a2cc405a578cfd0d3adc8d227277 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Fri, 3 Apr 2009 08:27:36 +0300 Subject: nfs41: common protocol definitions Define all NFSv4.1 common operation and error code constants. Note that some of the definitions are used by both the nfs41 client and the server code. 
This patch is duplicated in the nfs41 and nfsd41 sessions patchset. Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy [nfs41: add exchange id flags] Signed-off-by: Mike Sager Signed-off-by: Benny Halevy [removed server-only hunk changing NFSERR_REPLAY_ME] Signed-off-by: Benny Halevy [nfs41: add SEQ4_XX to nfs41-common-protocol] Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy [nfs41: generic error code update] [nfs41: reverse EXCHGID4_INVAL_FLAG_MASK_{A,R}] Signed-off-by: Benny Halevy Signed-off-by: J. Bruce Fields --- include/linux/nfs4.h | 128 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 127 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index b912311a56b1..29ed600e193b 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -21,6 +21,7 @@ #define NFS4_FHSIZE 128 #define NFS4_MAXPATHLEN PATH_MAX #define NFS4_MAXNAMLEN NAME_MAX +#define NFS4_MAX_SESSIONID_LEN 16 #define NFS4_ACCESS_READ 0x0001 #define NFS4_ACCESS_LOOKUP 0x0002 @@ -38,6 +39,7 @@ #define NFS4_OPEN_RESULT_CONFIRM 0x0002 #define NFS4_OPEN_RESULT_LOCKTYPE_POSIX 0x0004 +#define NFS4_SHARE_ACCESS_MASK 0x000F #define NFS4_SHARE_ACCESS_READ 0x0001 #define NFS4_SHARE_ACCESS_WRITE 0x0002 #define NFS4_SHARE_ACCESS_BOTH 0x0003 @@ -45,6 +47,19 @@ #define NFS4_SHARE_DENY_WRITE 0x0002 #define NFS4_SHARE_DENY_BOTH 0x0003 +/* nfs41 */ +#define NFS4_SHARE_WANT_MASK 0xFF00 +#define NFS4_SHARE_WANT_NO_PREFERENCE 0x0000 +#define NFS4_SHARE_WANT_READ_DELEG 0x0100 +#define NFS4_SHARE_WANT_WRITE_DELEG 0x0200 +#define NFS4_SHARE_WANT_ANY_DELEG 0x0300 +#define NFS4_SHARE_WANT_NO_DELEG 0x0400 +#define NFS4_SHARE_WANT_CANCEL 0x0500 + +#define NFS4_SHARE_WHEN_MASK 0xF0000 +#define NFS4_SHARE_SIGNAL_DELEG_WHEN_RESRC_AVAIL 0x10000 +#define NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED 0x20000 + #define NFS4_SET_TO_SERVER_TIME 0 #define NFS4_SET_TO_CLIENT_TIME 1 @@ -88,6 +103,31 @@ #define NFS4_ACE_GENERIC_EXECUTE 0x001200A0 #define NFS4_ACE_MASK_ALL 0x001F01FF +#define EXCHGID4_FLAG_SUPP_MOVED_REFER 0x00000001 +#define EXCHGID4_FLAG_SUPP_MOVED_MIGR 0x00000002 +#define EXCHGID4_FLAG_USE_NON_PNFS 0x00010000 +#define EXCHGID4_FLAG_USE_PNFS_MDS 0x00020000 +#define EXCHGID4_FLAG_USE_PNFS_DS 0x00040000 +#define EXCHGID4_FLAG_UPD_CONFIRMED_REC_A 0x40000000 +#define EXCHGID4_FLAG_CONFIRMED_R 0x80000000 +/* + * Since the validity of these bits depends on whether + * they're set in the argument or response, have separate + * invalid flag masks for arg (_A) and resp (_R). 
+ */ +#define EXCHGID4_FLAG_MASK_A 0x40070003 +#define EXCHGID4_FLAG_MASK_R 0x80070003 + +#define SEQ4_STATUS_CB_PATH_DOWN 0x00000001 +#define SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRING 0x00000002 +#define SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRED 0x00000004 +#define SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED 0x00000008 +#define SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED 0x00000010 +#define SEQ4_STATUS_ADMIN_STATE_REVOKED 0x00000020 +#define SEQ4_STATUS_RECALLABLE_STATE_REVOKED 0x00000040 +#define SEQ4_STATUS_LEASE_MOVED 0x00000080 +#define SEQ4_STATUS_RESTART_RECLAIM_NEEDED 0x00000100 + #define NFS4_MAX_UINT64 (~(u64)0) enum nfs4_acl_whotype { @@ -154,6 +194,28 @@ enum nfs_opnum4 { OP_VERIFY = 37, OP_WRITE = 38, OP_RELEASE_LOCKOWNER = 39, + + /* nfs41 */ + OP_BACKCHANNEL_CTL = 40, + OP_BIND_CONN_TO_SESSION = 41, + OP_EXCHANGE_ID = 42, + OP_CREATE_SESSION = 43, + OP_DESTROY_SESSION = 44, + OP_FREE_STATEID = 45, + OP_GET_DIR_DELEGATION = 46, + OP_GETDEVICEINFO = 47, + OP_GETDEVICELIST = 48, + OP_LAYOUTCOMMIT = 49, + OP_LAYOUTGET = 50, + OP_LAYOUTRETURN = 51, + OP_SECINFO_NO_NAME = 52, + OP_SEQUENCE = 53, + OP_SET_SSV = 54, + OP_TEST_STATEID = 55, + OP_WANT_DELEGATION = 56, + OP_DESTROY_CLIENTID = 57, + OP_RECLAIM_COMPLETE = 58, + OP_ILLEGAL = 10044, }; @@ -230,7 +292,48 @@ enum nfsstat4 { NFS4ERR_DEADLOCK = 10045, NFS4ERR_FILE_OPEN = 10046, NFS4ERR_ADMIN_REVOKED = 10047, - NFS4ERR_CB_PATH_DOWN = 10048 + NFS4ERR_CB_PATH_DOWN = 10048, + + /* nfs41 */ + NFS4ERR_BADIOMODE = 10049, + NFS4ERR_BADLAYOUT = 10050, + NFS4ERR_BAD_SESSION_DIGEST = 10051, + NFS4ERR_BADSESSION = 10052, + NFS4ERR_BADSLOT = 10053, + NFS4ERR_COMPLETE_ALREADY = 10054, + NFS4ERR_CONN_NOT_BOUND_TO_SESSION = 10055, + NFS4ERR_DELEG_ALREADY_WANTED = 10056, + NFS4ERR_BACK_CHAN_BUSY = 10057, /* backchan reqs outstanding */ + NFS4ERR_LAYOUTTRYLATER = 10058, + NFS4ERR_LAYOUTUNAVAILABLE = 10059, + NFS4ERR_NOMATCHING_LAYOUT = 10060, + NFS4ERR_RECALLCONFLICT = 10061, + NFS4ERR_UNKNOWN_LAYOUTTYPE = 10062, + NFS4ERR_SEQ_MISORDERED = 10063, /* unexpected seq.id in req */ + NFS4ERR_SEQUENCE_POS = 10064, /* [CB_]SEQ. op not 1st op */ + NFS4ERR_REQ_TOO_BIG = 10065, /* request too big */ + NFS4ERR_REP_TOO_BIG = 10066, /* reply too big */ + NFS4ERR_REP_TOO_BIG_TO_CACHE = 10067, /* rep. not all cached */ + NFS4ERR_RETRY_UNCACHED_REP = 10068, /* retry & rep. uncached */ + NFS4ERR_UNSAFE_COMPOUND = 10069, /* retry/recovery too hard */ + NFS4ERR_TOO_MANY_OPS = 10070, /* too many ops in [CB_]COMP */ + NFS4ERR_OP_NOT_IN_SESSION = 10071, /* op needs [CB_]SEQ. op */ + NFS4ERR_HASH_ALG_UNSUPP = 10072, /* hash alg. not supp. */ + /* Error 10073 is unused. 
*/ + NFS4ERR_CLIENTID_BUSY = 10074, /* clientid has state */ + NFS4ERR_PNFS_IO_HOLE = 10075, /* IO to _SPARSE file hole */ + NFS4ERR_SEQ_FALSE_RETRY = 10076, /* retry not origional */ + NFS4ERR_BAD_HIGH_SLOT = 10077, /* sequence arg bad */ + NFS4ERR_DEADSESSION = 10078, /* persistent session dead */ + NFS4ERR_ENCR_ALG_UNSUPP = 10079, /* SSV alg mismatch */ + NFS4ERR_PNFS_NO_LAYOUT = 10080, /* direct I/O with no layout */ + NFS4ERR_NOT_ONLY_OP = 10081, /* bad compound */ + NFS4ERR_WRONG_CRED = 10082, /* permissions:state change */ + NFS4ERR_WRONG_TYPE = 10083, /* current operation mismatch */ + NFS4ERR_DIRDELEG_UNAVAIL = 10084, /* no directory delegation */ + NFS4ERR_REJECT_DELEG = 10085, /* on callback */ + NFS4ERR_RETURNCONFLICT = 10086, /* outstanding layoutreturn */ + NFS4ERR_DELEG_REVOKED = 10087, /* deleg./layout revoked */ }; /* @@ -391,6 +494,29 @@ enum { NFSPROC4_CLNT_GETACL, NFSPROC4_CLNT_SETACL, NFSPROC4_CLNT_FS_LOCATIONS, + + /* nfs41 */ + NFSPROC4_CLNT_EXCHANGE_ID, + NFSPROC4_CLNT_CREATE_SESSION, + NFSPROC4_CLNT_DESTROY_SESSION, + NFSPROC4_CLNT_SEQUENCE, + NFSPROC4_CLNT_GET_LEASE_TIME, +}; + +/* nfs41 types */ +struct nfs4_sessionid { + unsigned char data[NFS4_MAX_SESSIONID_LEN]; +}; + +/* Create Session Flags */ +#define SESSION4_PERSIST 0x001 +#define SESSION4_BACK_CHAN 0x002 +#define SESSION4_RDMA 0x004 + +enum state_protect_how4 { + SP4_NONE = 0, + SP4_MACH_CRED = 1, + SP4_SSV = 2 }; #endif -- cgit v1.2.3 From 10add806c38c022d18af48f3ec28c91b4eaf7bb3 Mon Sep 17 00:00:00 2001 From: Marc Eshel Date: Fri, 3 Apr 2009 08:27:40 +0300 Subject: nfsd41: define nfs41 error codes Define all error code present in http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion1-29. Signed-off-by: Benny Halevy [nfsd41: clean up error code definitions] [nfsd41: change NFSERR_REPLAY_ME] Signed-off-by: Benny Halevy Signed-off-by: J. 
Bruce Fields --- include/linux/nfs.h | 1 - include/linux/nfsd/nfsd.h | 43 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 42 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfs.h b/include/linux/nfs.h index 54af92c1c70b..214d499718f7 100644 --- a/include/linux/nfs.h +++ b/include/linux/nfs.h @@ -109,7 +109,6 @@ NFSERR_FILE_OPEN = 10046, /* v4 */ NFSERR_ADMIN_REVOKED = 10047, /* v4 */ NFSERR_CB_PATH_DOWN = 10048, /* v4 */ - NFSERR_REPLAY_ME = 10049 /* v4 */ }; /* NFSv2 file types - beware, these are not the same in NFSv3 */ diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h index 54beda12d26b..ab9616d09204 100644 --- a/include/linux/nfsd/nfsd.h +++ b/include/linux/nfsd/nfsd.h @@ -249,7 +249,44 @@ void nfsd_lockd_shutdown(void); #define nfserr_cb_path_down cpu_to_be32(NFSERR_CB_PATH_DOWN) #define nfserr_locked cpu_to_be32(NFSERR_LOCKED) #define nfserr_wrongsec cpu_to_be32(NFSERR_WRONGSEC) -#define nfserr_replay_me cpu_to_be32(NFSERR_REPLAY_ME) +#define nfserr_badiomode cpu_to_be32(NFS4ERR_BADIOMODE) +#define nfserr_badlayout cpu_to_be32(NFS4ERR_BADLAYOUT) +#define nfserr_bad_session_digest cpu_to_be32(NFS4ERR_BAD_SESSION_DIGEST) +#define nfserr_badsession cpu_to_be32(NFS4ERR_BADSESSION) +#define nfserr_badslot cpu_to_be32(NFS4ERR_BADSLOT) +#define nfserr_complete_already cpu_to_be32(NFS4ERR_COMPLETE_ALREADY) +#define nfserr_conn_not_bound_to_session cpu_to_be32(NFS4ERR_CONN_NOT_BOUND_TO_SESSION) +#define nfserr_deleg_already_wanted cpu_to_be32(NFS4ERR_DELEG_ALREADY_WANTED) +#define nfserr_back_chan_busy cpu_to_be32(NFS4ERR_BACK_CHAN_BUSY) +#define nfserr_layouttrylater cpu_to_be32(NFS4ERR_LAYOUTTRYLATER) +#define nfserr_layoutunavailable cpu_to_be32(NFS4ERR_LAYOUTUNAVAILABLE) +#define nfserr_nomatching_layout cpu_to_be32(NFS4ERR_NOMATCHING_LAYOUT) +#define nfserr_recallconflict cpu_to_be32(NFS4ERR_RECALLCONFLICT) +#define nfserr_unknown_layouttype cpu_to_be32(NFS4ERR_UNKNOWN_LAYOUTTYPE) +#define nfserr_seq_misordered cpu_to_be32(NFS4ERR_SEQ_MISORDERED) +#define nfserr_sequence_pos cpu_to_be32(NFS4ERR_SEQUENCE_POS) +#define nfserr_req_too_big cpu_to_be32(NFS4ERR_REQ_TOO_BIG) +#define nfserr_rep_too_big cpu_to_be32(NFS4ERR_REP_TOO_BIG) +#define nfserr_rep_too_big_to_cache cpu_to_be32(NFS4ERR_REP_TOO_BIG_TO_CACHE) +#define nfserr_retry_uncached_rep cpu_to_be32(NFS4ERR_RETRY_UNCACHED_REP) +#define nfserr_unsafe_compound cpu_to_be32(NFS4ERR_UNSAFE_COMPOUND) +#define nfserr_too_many_ops cpu_to_be32(NFS4ERR_TOO_MANY_OPS) +#define nfserr_op_not_in_session cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION) +#define nfserr_hash_alg_unsupp cpu_to_be32(NFS4ERR_HASH_ALG_UNSUPP) +#define nfserr_clientid_busy cpu_to_be32(NFS4ERR_CLIENTID_BUSY) +#define nfserr_pnfs_io_hole cpu_to_be32(NFS4ERR_PNFS_IO_HOLE) +#define nfserr_seq_false_retry cpu_to_be32(NFS4ERR_SEQ_FALSE_RETRY) +#define nfserr_bad_high_slot cpu_to_be32(NFS4ERR_BAD_HIGH_SLOT) +#define nfserr_deadsession cpu_to_be32(NFS4ERR_DEADSESSION) +#define nfserr_encr_alg_unsupp cpu_to_be32(NFS4ERR_ENCR_ALG_UNSUPP) +#define nfserr_pnfs_no_layout cpu_to_be32(NFS4ERR_PNFS_NO_LAYOUT) +#define nfserr_not_only_op cpu_to_be32(NFS4ERR_NOT_ONLY_OP) +#define nfserr_wrong_cred cpu_to_be32(NFS4ERR_WRONG_CRED) +#define nfserr_wrong_type cpu_to_be32(NFS4ERR_WRONG_TYPE) +#define nfserr_dirdeleg_unavail cpu_to_be32(NFS4ERR_DIRDELEG_UNAVAIL) +#define nfserr_reject_deleg cpu_to_be32(NFS4ERR_REJECT_DELEG) +#define nfserr_returnconflict cpu_to_be32(NFS4ERR_RETURNCONFLICT) +#define nfserr_deleg_revoked 
cpu_to_be32(NFS4ERR_DELEG_REVOKED) /* error codes for internal use */ /* if a request fails due to kmalloc failure, it gets dropped. @@ -258,6 +295,10 @@ void nfsd_lockd_shutdown(void); #define nfserr_dropit cpu_to_be32(30000) /* end-of-file indicator in readdir */ #define nfserr_eof cpu_to_be32(30001) +/* replay detected */ +#define nfserr_replay_me cpu_to_be32(11001) +/* nfs41 replay detected */ +#define nfserr_replay_cache cpu_to_be32(11002) /* Check for dir entries '.' and '..' */ #define isdotent(n, l) (l < 3 && n[0] == '.' && (l == 1 || n[1] == '.')) -- cgit v1.2.3 From 7116ed6b9973021ff43edeb10f4cb834db94000f Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Fri, 3 Apr 2009 08:27:43 +0300 Subject: nfsd41: sessions basic data types This patch provides basic data structures representing the nfs41 sessions and slots, plus helpers for keeping a reference count on the session and freeing it. Note that our server only support a headerpadsz of 0 and it ignores backchannel attributes at the moment. Signed-off-by: Benny Halevy [nfsd41: remove headerpadsz from channel attributes] [nfsd41: embed nfsd4_channel in nfsd4_session] Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy [nfsd41: use bool inuse for slot state] Signed-off-by: Benny Halevy [nfsd41 remove sl_session from nfsd4_slot] Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 18 ++++++++++++++++++ include/linux/nfsd/state.h | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 070e9e5c0452..8c70f12159b6 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -382,6 +382,24 @@ static void release_openowner(struct nfs4_stateowner *sop) nfs4_put_stateowner(sop); } +static void +release_session(struct nfsd4_session *ses) +{ + list_del(&ses->se_hash); + list_del(&ses->se_perclnt); + nfsd4_put_session(ses); +} + +void +free_session(struct kref *kref) +{ + struct nfsd4_session *ses; + + ses = container_of(kref, struct nfsd4_session, se_ref); + kfree(ses->se_slots); + kfree(ses); +} + static inline void renew_client(struct nfs4_client *clp) { diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h index a6e4a00fa392..baea7f1fdb4a 100644 --- a/include/linux/nfsd/state.h +++ b/include/linux/nfsd/state.h @@ -99,6 +99,39 @@ struct nfs4_callback { struct rpc_clnt * cb_client; }; +struct nfsd4_slot { + bool sl_inuse; + u32 sl_seqid; +}; + +struct nfsd4_session { + struct kref se_ref; + struct list_head se_hash; /* hash by sessionid */ + struct list_head se_perclnt; + u32 se_flags; + struct nfs4_client *se_client; /* for expire_client */ + struct nfs4_sessionid se_sessionid; + u32 se_fmaxreq_sz; + u32 se_fmaxresp_sz; + u32 se_fmaxresp_cached; + u32 se_fmaxops; + u32 se_fnumslots; + struct nfsd4_slot *se_slots; /* forward channel slots */ +}; + +static inline void +nfsd4_put_session(struct nfsd4_session *ses) +{ + extern void free_session(struct kref *kref); + kref_put(&ses->se_ref, free_session); +} + +static inline void +nfsd4_get_session(struct nfsd4_session *ses) +{ + kref_get(&ses->se_ref); +} + #define HEXDIR_LEN 33 /* hex version of 16 byte md5 of cl_name plus '\0' */ /* -- cgit v1.2.3 From 9fb870702d02c05f9410423bfff3f63e46e26180 Mon Sep 17 00:00:00 2001 From: Marc Eshel Date: Fri, 3 Apr 2009 08:27:46 +0300 Subject: nfsd41: introduce nfs4_client cl_sessions list [get rid of CONFIG_NFSD_V4_1] Signed-off-by: Benny Halevy Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4state.c | 1 + include/linux/nfsd/state.h | 3 +++ 2 files changed, 4 insertions(+) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 8c70f12159b6..b2611999600a 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -528,6 +528,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir) INIT_LIST_HEAD(&clp->cl_strhash); INIT_LIST_HEAD(&clp->cl_openowners); INIT_LIST_HEAD(&clp->cl_delegations); + INIT_LIST_HEAD(&clp->cl_sessions); INIT_LIST_HEAD(&clp->cl_lru); return clp; } diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h index baea7f1fdb4a..7faefc7c9d9a 100644 --- a/include/linux/nfsd/state.h +++ b/include/linux/nfsd/state.h @@ -163,6 +163,9 @@ struct nfs4_client { struct nfs4_callback cl_callback; /* callback info */ atomic_t cl_count; /* ref count */ u32 cl_firststate; /* recovery dir creation */ + + /* for nfs41 */ + struct list_head cl_sessions; }; /* struct nfs4_client_reset -- cgit v1.2.3 From 5282fd724b667b7d65f2e41e405a825e58a78813 Mon Sep 17 00:00:00 2001 From: Marc Eshel Date: Fri, 3 Apr 2009 08:27:52 +0300 Subject: nfsd41: sessionid hashing Simple sessionid hashing using its monotonically increasing sequence number. Locking considerations: sessionid_hashtbl access is controlled by the sessionid_lock spin lock. It must be taken for insert, delete, and lookup. nfsd4_sequence looks up the session id and if the session is found, it calls nfsd4_get_session (still under the sessionid_lock). nfsd4_destroy_session calls nfsd4_put_session after unhashing it, so when the session's kref reaches zero it's going to get freed. Signed-off-by: Benny Halevy [we don't use a prime for sessionid hash table size] [use sessionid_lock spin lock] Signed-off-by: Benny Halevy Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4state.c | 55 +++++++++++++++++++++++++++++++++++++++++++++- include/linux/nfsd/state.h | 7 ++++++ 2 files changed, 61 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index f23385b13065..fc20e1f38d75 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -382,11 +382,62 @@ static void release_openowner(struct nfs4_stateowner *sop) nfs4_put_stateowner(sop); } +static DEFINE_SPINLOCK(sessionid_lock); +#define SESSION_HASH_SIZE 512 +static struct list_head sessionid_hashtbl[SESSION_HASH_SIZE]; + +static inline int +hash_sessionid(struct nfs4_sessionid *sessionid) +{ + struct nfsd4_sessionid *sid = (struct nfsd4_sessionid *)sessionid; + + return sid->sequence % SESSION_HASH_SIZE; +} + +static inline void +dump_sessionid(const char *fn, struct nfs4_sessionid *sessionid) +{ + u32 *ptr = (u32 *)(&sessionid->data[0]); + dprintk("%s: %u:%u:%u:%u\n", fn, ptr[0], ptr[1], ptr[2], ptr[3]); +} + +/* caller must hold sessionid_lock */ +static struct nfsd4_session * +find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid) +{ + struct nfsd4_session *elem; + int idx; + + dump_sessionid(__func__, sessionid); + idx = hash_sessionid(sessionid); + dprintk("%s: idx is %d\n", __func__, idx); + /* Search in the appropriate list */ + list_for_each_entry(elem, &sessionid_hashtbl[idx], se_hash) { + dump_sessionid("list traversal", &elem->se_sessionid); + if (!memcmp(elem->se_sessionid.data, sessionid->data, + NFS4_MAX_SESSIONID_LEN)) { + return elem; + } + } + + dprintk("%s: session not found\n", __func__); + return NULL; +} + +/* caller must hold sessionid_lock */ static void -release_session(struct nfsd4_session *ses) +unhash_session(struct nfsd4_session *ses) { list_del(&ses->se_hash); list_del(&ses->se_perclnt); +} + +static void +release_session(struct nfsd4_session *ses) +{ + spin_lock(&sessionid_lock); + unhash_session(ses); + spin_unlock(&sessionid_lock); nfsd4_put_session(ses); } @@ -3205,6 +3256,8 @@ nfs4_state_init(void) INIT_LIST_HEAD(&unconf_str_hashtbl[i]); INIT_LIST_HEAD(&unconf_id_hashtbl[i]); } + for (i = 0; i < SESSION_HASH_SIZE; i++) + INIT_LIST_HEAD(&sessionid_hashtbl[i]); for (i = 0; i < FILE_HASH_SIZE; i++) { INIT_LIST_HEAD(&file_hashtbl[i]); } diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h index 7faefc7c9d9a..5b3a6660f3af 100644 --- a/include/linux/nfsd/state.h +++ b/include/linux/nfsd/state.h @@ -132,6 +132,13 @@ nfsd4_get_session(struct nfsd4_session *ses) kref_get(&ses->se_ref); } +/* formatted contents of nfs4_sessionid */ +struct nfsd4_sessionid { + clientid_t clientid; + u32 sequence; + u32 reserved; +}; + #define HEXDIR_LEN 33 /* hex version of 16 byte md5 of cl_name plus '\0' */ /* -- cgit v1.2.3 From 2db134eb3b39faefc7fbfb200156d175edba2f68 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Fri, 3 Apr 2009 08:27:55 +0300 Subject: nfsd41: xdr infrastructure Define nfsd41_dec_ops vector and add it to nfsd4_minorversion for minorversion 1. Note: nfsd4_enc_ops vector is shared for v4.0 and v4.1 since we don't need to filter out obsolete ops as this is done in the decoding phase. exchange_id, create_session, destroy_session, and sequence ops are implemented as stubs returning nfserr_opnotsupp at this stage. [was nfsd41: xdr stubs] [get rid of CONFIG_NFSD_V4_1] Signed-off-by: Benny Halevy Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4xdr.c | 148 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/nfsd/xdr4.h | 22 +++++++ 2 files changed, 170 insertions(+) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 76a0b2a8d69b..6973d61bedea 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -996,6 +996,34 @@ nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_rel DECODE_TAIL; } +static __be32 +nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp, + struct nfsd4_exchange_id *clid) +{ + return nfserr_opnotsupp; /* stub */ +} + +static __be32 +nfsd4_decode_create_session(struct nfsd4_compoundargs *argp, + struct nfsd4_create_session *sess) +{ + return nfserr_opnotsupp; /* stub */ +} + +static __be32 +nfsd4_decode_destroy_session(struct nfsd4_compoundargs *argp, + struct nfsd4_destroy_session *destroy_session) +{ + return nfserr_opnotsupp; /* stub */ +} + +static __be32 +nfsd4_decode_sequence(struct nfsd4_compoundargs *argp, + struct nfsd4_sequence *seq) +{ + return nfserr_opnotsupp; /* stub */ +} + static __be32 nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p) { @@ -1050,6 +1078,67 @@ static nfsd4_dec nfsd4_dec_ops[] = { [OP_RELEASE_LOCKOWNER] = (nfsd4_dec)nfsd4_decode_release_lockowner, }; +static nfsd4_dec nfsd41_dec_ops[] = { + [OP_ACCESS] (nfsd4_dec)nfsd4_decode_access, + [OP_CLOSE] (nfsd4_dec)nfsd4_decode_close, + [OP_COMMIT] (nfsd4_dec)nfsd4_decode_commit, + [OP_CREATE] (nfsd4_dec)nfsd4_decode_create, + [OP_DELEGPURGE] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_DELEGRETURN] (nfsd4_dec)nfsd4_decode_delegreturn, + [OP_GETATTR] (nfsd4_dec)nfsd4_decode_getattr, + [OP_GETFH] (nfsd4_dec)nfsd4_decode_noop, + [OP_LINK] (nfsd4_dec)nfsd4_decode_link, + [OP_LOCK] (nfsd4_dec)nfsd4_decode_lock, + [OP_LOCKT] (nfsd4_dec)nfsd4_decode_lockt, + [OP_LOCKU] (nfsd4_dec)nfsd4_decode_locku, + [OP_LOOKUP] (nfsd4_dec)nfsd4_decode_lookup, + [OP_LOOKUPP] (nfsd4_dec)nfsd4_decode_noop, + [OP_NVERIFY] (nfsd4_dec)nfsd4_decode_verify, + [OP_OPEN] (nfsd4_dec)nfsd4_decode_open, + [OP_OPENATTR] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_OPEN_CONFIRM] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_OPEN_DOWNGRADE] (nfsd4_dec)nfsd4_decode_open_downgrade, + [OP_PUTFH] (nfsd4_dec)nfsd4_decode_putfh, + [OP_PUTPUBFH] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_PUTROOTFH] (nfsd4_dec)nfsd4_decode_noop, + [OP_READ] (nfsd4_dec)nfsd4_decode_read, + [OP_READDIR] (nfsd4_dec)nfsd4_decode_readdir, + [OP_READLINK] (nfsd4_dec)nfsd4_decode_noop, + [OP_REMOVE] (nfsd4_dec)nfsd4_decode_remove, + [OP_RENAME] (nfsd4_dec)nfsd4_decode_rename, + [OP_RENEW] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_RESTOREFH] (nfsd4_dec)nfsd4_decode_noop, + [OP_SAVEFH] (nfsd4_dec)nfsd4_decode_noop, + [OP_SECINFO] (nfsd4_dec)nfsd4_decode_secinfo, + [OP_SETATTR] (nfsd4_dec)nfsd4_decode_setattr, + [OP_SETCLIENTID] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_SETCLIENTID_CONFIRM](nfsd4_dec)nfsd4_decode_notsupp, + [OP_VERIFY] (nfsd4_dec)nfsd4_decode_verify, + [OP_WRITE] (nfsd4_dec)nfsd4_decode_write, + [OP_RELEASE_LOCKOWNER] (nfsd4_dec)nfsd4_decode_notsupp, + + /* new operations for NFSv4.1 */ + [OP_BACKCHANNEL_CTL] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_BIND_CONN_TO_SESSION](nfsd4_dec)nfsd4_decode_notsupp, + [OP_EXCHANGE_ID] (nfsd4_dec)nfsd4_decode_exchange_id, + [OP_CREATE_SESSION] (nfsd4_dec)nfsd4_decode_create_session, + [OP_DESTROY_SESSION] (nfsd4_dec)nfsd4_decode_destroy_session, + [OP_FREE_STATEID] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_GET_DIR_DELEGATION] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_GETDEVICEINFO] 
(nfsd4_dec)nfsd4_decode_notsupp, + [OP_GETDEVICELIST] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTCOMMIT] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTGET] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTRETURN] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_SECINFO_NO_NAME] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_SEQUENCE] (nfsd4_dec)nfsd4_decode_sequence, + [OP_SET_SSV] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_TEST_STATEID] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_WANT_DELEGATION] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_DESTROY_CLIENTID] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_RECLAIM_COMPLETE] (nfsd4_dec)nfsd4_decode_notsupp, +}; + struct nfsd4_minorversion_ops { nfsd4_dec *decoders; int nops; @@ -1057,6 +1146,7 @@ struct nfsd4_minorversion_ops { static struct nfsd4_minorversion_ops nfsd4_minorversion[] = { [0] = { nfsd4_dec_ops, ARRAY_SIZE(nfsd4_dec_ops) }, + [1] = { nfsd41_dec_ops, ARRAY_SIZE(nfsd41_dec_ops) }, }; static __be32 @@ -2571,6 +2661,38 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_w return nfserr; } +static __be32 +nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, int nfserr, + struct nfsd4_exchange_id *exid) +{ + /* stub */ + return nfserr; +} + +static __be32 +nfsd4_encode_create_session(struct nfsd4_compoundres *resp, int nfserr, + struct nfsd4_create_session *sess) +{ + /* stub */ + return nfserr; +} + +static __be32 +nfsd4_encode_destroy_session(struct nfsd4_compoundres *resp, int nfserr, + struct nfsd4_destroy_session *destroy_session) +{ + /* stub */ + return nfserr; +} + +static __be32 +nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr, + struct nfsd4_sequence *seq) +{ + /* stub */ + return nfserr; +} + static __be32 nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p) { @@ -2579,6 +2701,11 @@ nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p) typedef __be32(* nfsd4_enc)(struct nfsd4_compoundres *, __be32, void *); +/* + * Note: nfsd4_enc_ops vector is shared for v4.0 and v4.1 + * since we don't need to filter out obsolete ops as this is + * done in the decoding phase. 
+ */ static nfsd4_enc nfsd4_enc_ops[] = { [OP_ACCESS] = (nfsd4_enc)nfsd4_encode_access, [OP_CLOSE] = (nfsd4_enc)nfsd4_encode_close, @@ -2617,6 +2744,27 @@ static nfsd4_enc nfsd4_enc_ops[] = { [OP_VERIFY] = (nfsd4_enc)nfsd4_encode_noop, [OP_WRITE] = (nfsd4_enc)nfsd4_encode_write, [OP_RELEASE_LOCKOWNER] = (nfsd4_enc)nfsd4_encode_noop, + + /* NFSv4.1 operations */ + [OP_BACKCHANNEL_CTL] = (nfsd4_enc)nfsd4_encode_noop, + [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_noop, + [OP_EXCHANGE_ID] = (nfsd4_enc)nfsd4_encode_exchange_id, + [OP_CREATE_SESSION] = (nfsd4_enc)nfsd4_encode_create_session, + [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_destroy_session, + [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop, + [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, + [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop, + [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop, + [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_noop, + [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence, + [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop, + [OP_TEST_STATEID] = (nfsd4_enc)nfsd4_encode_noop, + [OP_WANT_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, + [OP_DESTROY_CLIENTID] = (nfsd4_enc)nfsd4_encode_noop, + [OP_RECLAIM_COMPLETE] = (nfsd4_enc)nfsd4_encode_noop, }; void diff --git a/include/linux/nfsd/xdr4.h b/include/linux/nfsd/xdr4.h index fd15ddc3359d..28af925c2a3d 100644 --- a/include/linux/nfsd/xdr4.h +++ b/include/linux/nfsd/xdr4.h @@ -344,6 +344,22 @@ struct nfsd4_write { nfs4_verifier wr_verifier; /* response */ }; +struct nfsd4_exchange_id { + int foo; /* stub */ +}; + +struct nfsd4_create_session { + int foo; /* stub */ +}; + +struct nfsd4_sequence { + int foo; /* stub */ +}; + +struct nfsd4_destroy_session { + int foo; /* stub */ +}; + struct nfsd4_op { int opnum; __be32 status; @@ -378,6 +394,12 @@ struct nfsd4_op { struct nfsd4_verify verify; struct nfsd4_write write; struct nfsd4_release_lockowner release_lockowner; + + /* NFSv4.1 */ + struct nfsd4_exchange_id exchange_id; + struct nfsd4_create_session create_session; + struct nfsd4_destroy_session destroy_session; + struct nfsd4_sequence sequence; } u; struct nfs4_replay * replay; }; -- cgit v1.2.3 From 069b6ad4bb20abf175ea7875e82e8002154773af Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Fri, 3 Apr 2009 08:27:58 +0300 Subject: nfsd41: proc stubs Signed-off-by: Benny Halevy Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4proc.c | 22 ++++++++++++++++++++++ fs/nfsd/nfs4state.c | 32 ++++++++++++++++++++++++++++++++ include/linux/nfsd/xdr4.h | 12 ++++++++++++ 3 files changed, 66 insertions(+) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index ded469ff08b3..f7be7fabe62c 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1101,6 +1101,28 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, .op_name = "OP_RELEASE_LOCKOWNER", }, + + /* NFSv4.1 operations */ + [OP_EXCHANGE_ID] = { + .op_func = (nfsd4op_func)nfsd4_exchange_id, + .op_flags = ALLOWED_WITHOUT_FH, + .op_name = "OP_EXCHANGE_ID", + }, + [OP_CREATE_SESSION] = { + .op_func = (nfsd4op_func)nfsd4_create_session, + .op_flags = ALLOWED_WITHOUT_FH, + .op_name = "OP_CREATE_SESSION", + }, + [OP_DESTROY_SESSION] = { + .op_func = (nfsd4op_func)nfsd4_destroy_session, + .op_flags = ALLOWED_WITHOUT_FH, + .op_name = "OP_DESTROY_SESSION", + }, + [OP_SEQUENCE] = { + .op_func = (nfsd4op_func)nfsd4_sequence, + .op_flags = ALLOWED_WITHOUT_FH, + .op_name = "OP_SEQUENCE", + }, }; static const char *nfsd4_op_name(unsigned opnum) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index fc20e1f38d75..cfc01f415d15 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -832,6 +832,38 @@ out_err: return; } +__be32 +nfsd4_exchange_id(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_exchange_id *exid) +{ + return -1; /* stub */ +} + +__be32 +nfsd4_create_session(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_create_session *cr_ses) +{ + return -1; /* stub */ +} + +__be32 +nfsd4_destroy_session(struct svc_rqst *r, + struct nfsd4_compound_state *cstate, + struct nfsd4_destroy_session *sessionid) +{ + return -1; /* stub */ +} + +__be32 +nfsd4_sequence(struct svc_rqst *r, + struct nfsd4_compound_state *cstate, + struct nfsd4_sequence *seq) +{ + return -1; /* stub */ +} + __be32 nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_setclientid *setclid) diff --git a/include/linux/nfsd/xdr4.h b/include/linux/nfsd/xdr4.h index 28af925c2a3d..294940c58094 100644 --- a/include/linux/nfsd/xdr4.h +++ b/include/linux/nfsd/xdr4.h @@ -471,6 +471,18 @@ extern __be32 nfsd4_setclientid(struct svc_rqst *rqstp, extern __be32 nfsd4_setclientid_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *, struct nfsd4_setclientid_confirm *setclientid_confirm); +extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, +struct nfsd4_exchange_id *); + extern __be32 nfsd4_create_session(struct svc_rqst *, + struct nfsd4_compound_state *, + struct nfsd4_create_session *); +extern __be32 nfsd4_sequence(struct svc_rqst *, + struct nfsd4_compound_state *, + struct nfsd4_sequence *); +extern __be32 nfsd4_destroy_session(struct svc_rqst *, + struct nfsd4_compound_state *, + struct nfsd4_destroy_session *); extern __be32 nfsd4_process_open1(struct nfsd4_open *open); extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open); -- cgit v1.2.3 From 0733d21338747483985a5964e852af160d88e429 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Fri, 3 Apr 2009 08:28:01 +0300 Subject: nfsd41: exchange_id operation Implement the exchange_id operation confoming to http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion1-28 Based on the client provided name, hash a client id. 
If a confirmed one is found, compare the op's creds and verifier. If the creds match and the verifier is different then expire the old client (client re-incarnated), otherwise, if both match, assume it's a replay and ignore it. If an unconfirmed client is found, then copy the new creds and verifer if need update, otherwise assume replay. The client is moved to a confirmed state on create_session. In the nfs41 branch set the exchange_id flags to EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_SUPP_MOVED_REFER (pNFS is not supported, Referrals are supported, Migration is not.). Address various scenarios from section 18.35 of the spec: 1. Check for EXCHGID4_FLAG_UPD_CONFIRMED_REC_A and set EXCHGID4_FLAG_CONFIRMED_R as appropriate. 2. Return error codes per 18.35.4 scenarios. 3. Update client records or generate new client ids depending on scenario. Note: 18.35.4 case 3 probably still needs revisiting. The handling seems not quite right. Signed-off-by: Benny Halevy Signed-off-by: Andy Adamosn Signed-off-by: Benny Halevy [nfsd41: use utsname for major_id (and copy to server_scope)] [nfsd41: fix handling of various exchange id scenarios] Signed-off-by: Mike Sager Signed-off-by: Benny Halevy [nfsd41: reverse use of EXCHGID4_INVAL_FLAG_MASK_A] [simplify nfsd4_encode_exchange_id error handling] [nfsd41: embed an xdr_netobj in nfsd4_exchange_id] [nfsd41: return nfserr_serverfault for spa_how == SP4_MACH_CRED] Signed-off-by: Benny Halevy Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 142 ++++++++++++++++++++++++++++++++++++++++++- fs/nfsd/nfs4xdr.c | 147 +++++++++++++++++++++++++++++++++++++++++++-- include/linux/nfsd/state.h | 2 + include/linux/nfsd/xdr4.h | 7 ++- 4 files changed, 292 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index cfc01f415d15..4f963e902972 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -832,12 +832,152 @@ out_err: return; } +/* + * Set the exchange_id flags returned by the server. + */ +static void +nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid) +{ + /* pNFS is not supported */ + new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS; + + /* Referrals are supported, Migration is not. */ + new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER; + + /* set the wire flags to return to client. 
*/ + clid->flags = new->cl_exchange_flags; +} + __be32 nfsd4_exchange_id(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_exchange_id *exid) { - return -1; /* stub */ + struct nfs4_client *unconf, *conf, *new; + int status; + unsigned int strhashval; + char dname[HEXDIR_LEN]; + nfs4_verifier verf = exid->verifier; + u32 ip_addr = svc_addr_in(rqstp)->sin_addr.s_addr; + + dprintk("%s rqstp=%p exid=%p clname.len=%u clname.data=%p " + " ip_addr=%u flags %x, spa_how %d\n", + __func__, rqstp, exid, exid->clname.len, exid->clname.data, + ip_addr, exid->flags, exid->spa_how); + + if (!check_name(exid->clname) || (exid->flags & ~EXCHGID4_FLAG_MASK_A)) + return nfserr_inval; + + /* Currently only support SP4_NONE */ + switch (exid->spa_how) { + case SP4_NONE: + break; + case SP4_SSV: + return nfserr_encr_alg_unsupp; + default: + BUG(); /* checked by xdr code */ + case SP4_MACH_CRED: + return nfserr_serverfault; /* no excuse :-/ */ + } + + status = nfs4_make_rec_clidname(dname, &exid->clname); + + if (status) + goto error; + + strhashval = clientstr_hashval(dname); + + nfs4_lock_state(); + status = nfs_ok; + + conf = find_confirmed_client_by_str(dname, strhashval); + if (conf) { + if (!same_verf(&verf, &conf->cl_verifier)) { + /* 18.35.4 case 8 */ + if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) { + status = nfserr_not_same; + goto out; + } + /* Client reboot: destroy old state */ + expire_client(conf); + goto out_new; + } + if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) { + /* 18.35.4 case 9 */ + if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) { + status = nfserr_perm; + goto out; + } + expire_client(conf); + goto out_new; + } + if (ip_addr != conf->cl_addr && + !(exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A)) { + /* Client collision. 18.35.4 case 3 */ + status = nfserr_clid_inuse; + goto out; + } + /* + * Set bit when the owner id and verifier map to an already + * confirmed client id (18.35.3). + */ + exid->flags |= EXCHGID4_FLAG_CONFIRMED_R; + + /* + * Falling into 18.35.4 case 2, possible router replay. + * Leave confirmed record intact and return same result. + */ + copy_verf(conf, &verf); + new = conf; + goto out_copy; + } else { + /* 18.35.4 case 7 */ + if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) { + status = nfserr_noent; + goto out; + } + } + + unconf = find_unconfirmed_client_by_str(dname, strhashval); + if (unconf) { + /* + * Possible retry or client restart. Per 18.35.4 case 4, + * a new unconfirmed record should be generated regardless + * of whether any properties have changed. 
+ */ + expire_client(unconf); + } + +out_new: + /* Normal case */ + new = create_client(exid->clname, dname); + if (new == NULL) { + status = nfserr_resource; + goto out; + } + + copy_verf(new, &verf); + copy_cred(&new->cl_cred, &rqstp->rq_cred); + new->cl_addr = ip_addr; + gen_clid(new); + gen_confirm(new); + add_to_unconfirmed(new, strhashval); +out_copy: + exid->clientid.cl_boot = new->cl_clientid.cl_boot; + exid->clientid.cl_id = new->cl_clientid.cl_id; + + new->cl_seqid = exid->seqid = 1; + nfsd4_set_ex_flags(new, exid); + + dprintk("nfsd4_exchange_id seqid %d flags %x\n", + new->cl_seqid, new->cl_exchange_flags); + status = nfs_ok; + +out: + nfs4_unlock_state(); +error: + dprintk("nfsd4_exchange_id returns %d\n", ntohl(status)); + return status; } __be32 diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 6973d61bedea..bebf6d249069 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -998,9 +999,100 @@ nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_rel static __be32 nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp, - struct nfsd4_exchange_id *clid) + struct nfsd4_exchange_id *exid) { - return nfserr_opnotsupp; /* stub */ + int dummy; + DECODE_HEAD; + + READ_BUF(NFS4_VERIFIER_SIZE); + COPYMEM(exid->verifier.data, NFS4_VERIFIER_SIZE); + + READ_BUF(4); + READ32(exid->clname.len); + + READ_BUF(exid->clname.len); + SAVEMEM(exid->clname.data, exid->clname.len); + + READ_BUF(4); + READ32(exid->flags); + + /* Ignore state_protect4_a */ + READ_BUF(4); + READ32(exid->spa_how); + switch (exid->spa_how) { + case SP4_NONE: + break; + case SP4_MACH_CRED: + /* spo_must_enforce */ + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy * 4); + p += dummy; + + /* spo_must_allow */ + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy * 4); + p += dummy; + break; + case SP4_SSV: + /* ssp_ops */ + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy * 4); + p += dummy; + + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy * 4); + p += dummy; + + /* ssp_hash_algs<> */ + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy); + p += XDR_QUADLEN(dummy); + + /* ssp_encr_algs<> */ + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy); + p += XDR_QUADLEN(dummy); + + /* ssp_window and ssp_num_gss_handles */ + READ_BUF(8); + READ32(dummy); + READ32(dummy); + break; + default: + goto xdr_error; + } + + /* Ignore Implementation ID */ + READ_BUF(4); /* nfs_impl_id4 array length */ + READ32(dummy); + + if (dummy > 1) + goto xdr_error; + + if (dummy == 1) { + /* nii_domain */ + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy); + p += XDR_QUADLEN(dummy); + + /* nii_name */ + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy); + p += XDR_QUADLEN(dummy); + + /* nii_date */ + READ_BUF(12); + p += 3; + } + DECODE_TAIL; } static __be32 @@ -2665,8 +2757,55 @@ static __be32 nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_exchange_id *exid) { - /* stub */ - return nfserr; + ENCODE_HEAD; + char *major_id; + char *server_scope; + int major_id_sz; + int server_scope_sz; + uint64_t minor_id = 0; + + if (nfserr) + return nfserr; + + major_id = utsname()->nodename; + major_id_sz = strlen(major_id); + server_scope = utsname()->nodename; + server_scope_sz = strlen(server_scope); + + RESERVE_SPACE( + 8 /* eir_clientid */ + + 4 /* eir_sequenceid */ + + 4 /* eir_flags */ + + 4 /* spr_how (SP4_NONE) */ + + 8 /* so_minor_id */ + + 4 /* so_major_id.len */ + + (XDR_QUADLEN(major_id_sz) * 4) + + 4 /* 
eir_server_scope.len */ + + (XDR_QUADLEN(server_scope_sz) * 4) + + 4 /* eir_server_impl_id.count (0) */); + + WRITEMEM(&exid->clientid, 8); + WRITE32(exid->seqid); + WRITE32(exid->flags); + + /* state_protect4_r. Currently only support SP4_NONE */ + BUG_ON(exid->spa_how != SP4_NONE); + WRITE32(exid->spa_how); + + /* The server_owner struct */ + WRITE64(minor_id); /* Minor id */ + /* major id */ + WRITE32(major_id_sz); + WRITEMEM(major_id, major_id_sz); + + /* Server scope */ + WRITE32(server_scope_sz); + WRITEMEM(server_scope, server_scope_sz); + + /* Implementation id */ + WRITE32(0); /* zero length nfs_impl_id4 array */ + ADJUST_ARGS(); + return 0; } static __be32 diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h index 5b3a6660f3af..8d0b10167937 100644 --- a/include/linux/nfsd/state.h +++ b/include/linux/nfsd/state.h @@ -173,6 +173,8 @@ struct nfs4_client { /* for nfs41 */ struct list_head cl_sessions; + u32 cl_seqid; /* seqid for create_session */ + u32 cl_exchange_flags; }; /* struct nfs4_client_reset diff --git a/include/linux/nfsd/xdr4.h b/include/linux/nfsd/xdr4.h index 294940c58094..33ee71e988b4 100644 --- a/include/linux/nfsd/xdr4.h +++ b/include/linux/nfsd/xdr4.h @@ -345,7 +345,12 @@ struct nfsd4_write { }; struct nfsd4_exchange_id { - int foo; /* stub */ + nfs4_verifier verifier; + struct xdr_netobj clname; + u32 flags; + clientid_t clientid; + u32 seqid; + int spa_how; }; struct nfsd4_create_session { -- cgit v1.2.3 From a1bcecd29cdf1670df6908a620add4211c0abb7a Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Fri, 3 Apr 2009 08:28:05 +0300 Subject: nfsd41: match clientid establishment method We need to distinguish between client names provided by NFSv4.0 clients SETCLIENTID and those provided by NFSv4.1 via EXCHANGE_ID when looking up the clientid by string. Signed-off-by: Benny Halevy Signed-off-by: Andy Adamson [nfsd41: use boolean values for use_exchange_id argument] Signed-off-by: Benny Halevy [nfsd41: simplify match_clientid_establishment logic] Signed-off-by: Benny Halevy Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4recover.c | 3 ++- fs/nfsd/nfs4state.c | 42 +++++++++++++++++++++++++++++++----------- include/linux/nfsd/state.h | 2 +- 3 files changed, 34 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index b11cf8d34280..3444c0052a87 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -344,7 +344,8 @@ purge_old(struct dentry *parent, struct dentry *child) { int status; - if (nfs4_has_reclaimed_state(child->d_name.name)) + /* note: we currently use this path only for minorversion 0 */ + if (nfs4_has_reclaimed_state(child->d_name.name, false)) return 0; status = nfsd4_clear_clid_dir(parent, child); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 4f963e902972..dfc6d946cdfe 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -715,25 +715,45 @@ find_unconfirmed_client(clientid_t *clid) return NULL; } +/* + * Return 1 iff clp's clientid establishment method matches the use_exchange_id + * parameter. 
Matching is based on the fact the at least one of the + * EXCHGID4_FLAG_USE_{NON_PNFS,PNFS_MDS,PNFS_DS} flags must be set for v4.1 + * + * FIXME: we need to unify the clientid namespaces for nfsv4.x + * and correctly deal with client upgrade/downgrade in EXCHANGE_ID + * and SET_CLIENTID{,_CONFIRM} + */ +static inline int +match_clientid_establishment(struct nfs4_client *clp, bool use_exchange_id) +{ + bool has_exchange_flags = (clp->cl_exchange_flags != 0); + return use_exchange_id == has_exchange_flags; +} + static struct nfs4_client * -find_confirmed_client_by_str(const char *dname, unsigned int hashval) +find_confirmed_client_by_str(const char *dname, unsigned int hashval, + bool use_exchange_id) { struct nfs4_client *clp; list_for_each_entry(clp, &conf_str_hashtbl[hashval], cl_strhash) { - if (same_name(clp->cl_recdir, dname)) + if (same_name(clp->cl_recdir, dname) && + match_clientid_establishment(clp, use_exchange_id)) return clp; } return NULL; } static struct nfs4_client * -find_unconfirmed_client_by_str(const char *dname, unsigned int hashval) +find_unconfirmed_client_by_str(const char *dname, unsigned int hashval, + bool use_exchange_id) { struct nfs4_client *clp; list_for_each_entry(clp, &unconf_str_hashtbl[hashval], cl_strhash) { - if (same_name(clp->cl_recdir, dname)) + if (same_name(clp->cl_recdir, dname) && + match_clientid_establishment(clp, use_exchange_id)) return clp; } return NULL; @@ -890,7 +910,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, nfs4_lock_state(); status = nfs_ok; - conf = find_confirmed_client_by_str(dname, strhashval); + conf = find_confirmed_client_by_str(dname, strhashval, true); if (conf) { if (!same_verf(&verf, &conf->cl_verifier)) { /* 18.35.4 case 8 */ @@ -938,7 +958,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, } } - unconf = find_unconfirmed_client_by_str(dname, strhashval); + unconf = find_unconfirmed_client_by_str(dname, strhashval, true); if (unconf) { /* * Possible retry or client restart. Per 18.35.4 case 4, @@ -1035,7 +1055,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, strhashval = clientstr_hashval(dname); nfs4_lock_state(); - conf = find_confirmed_client_by_str(dname, strhashval); + conf = find_confirmed_client_by_str(dname, strhashval, false); if (conf) { /* RFC 3530 14.2.33 CASE 0: */ status = nfserr_clid_inuse; @@ -1050,7 +1070,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * has a description of SETCLIENTID request processing consisting * of 5 bullet points, labeled as CASE0 - CASE4 below. */ - unconf = find_unconfirmed_client_by_str(dname, strhashval); + unconf = find_unconfirmed_client_by_str(dname, strhashval, false); status = nfserr_resource; if (!conf) { /* @@ -1205,7 +1225,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, unsigned int hash = clientstr_hashval(unconf->cl_recdir); conf = find_confirmed_client_by_str(unconf->cl_recdir, - hash); + hash, false); if (conf) { nfsd4_remove_clid_dir(conf); expire_client(conf); @@ -3326,12 +3346,12 @@ alloc_reclaim(void) } int -nfs4_has_reclaimed_state(const char *name) +nfs4_has_reclaimed_state(const char *name, bool use_exchange_id) { unsigned int strhashval = clientstr_hashval(name); struct nfs4_client *clp; - clp = find_confirmed_client_by_str(name, strhashval); + clp = find_confirmed_client_by_str(name, strhashval, use_exchange_id); return clp ? 
1 : 0; } diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h index 8d0b10167937..90829db76861 100644 --- a/include/linux/nfsd/state.h +++ b/include/linux/nfsd/state.h @@ -330,7 +330,7 @@ extern void nfsd4_init_recdir(char *recdir_name); extern int nfsd4_recdir_load(void); extern void nfsd4_shutdown_recdir(void); extern int nfs4_client_to_reclaim(const char *name); -extern int nfs4_has_reclaimed_state(const char *name); +extern int nfs4_has_reclaimed_state(const char *name, bool use_exchange_id); extern void nfsd4_recdir_purge_old(void); extern int nfsd4_create_clid_dir(struct nfs4_client *clp); extern void nfsd4_remove_clid_dir(struct nfs4_client *clp); -- cgit v1.2.3 From b85d4c01b76f6969a085d07a767fa45225cb14be Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Fri, 3 Apr 2009 08:28:08 +0300 Subject: nfsd41: sequence operation Implement the sequence operation conforming to http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion1-26 Check for stale clientid (as derived from the sessionid). Enforce slotid range and exactly-once semantics using the slotid and seqid. If everything went well renew the client lease and mark the slot INPROGRESS. Add a struct nfsd4_slot pointer to struct nfsd4_compound_state. To be used for sessions DRC replay. [nfsd41: rename sequence catchthis to cachethis] Signed-off-by: Andy Adamson [pulled some code to set cstate->slot from "nfsd DRC logic"] [use sessionid_lock spin lock] [nfsd41: use bool inuse for slot state] Signed-off-by: Benny Halevy [nfsd: add a struct nfsd4_slot pointer to struct nfsd4_compound_state] Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy [nfsd41: add nfsd4_session pointer to nfsd4_compound_state] [nfsd41: set cstate session] [nfsd41: use cstate session in nfsd4_sequence] Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy [simplify nfsd4_encode_sequence error handling] Signed-off-by: Benny Halevy Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 73 +++++++++++++++++++++++++++++++++++++++++++++-- fs/nfsd/nfs4xdr.c | 33 +++++++++++++++++++-- include/linux/nfsd/xdr4.h | 13 ++++++++- 3 files changed, 113 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index dfc6d946cdfe..24b6e0593184 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1000,6 +1000,32 @@ error: return status; } +static int +check_slot_seqid(u32 seqid, struct nfsd4_slot *slot) +{ + dprintk("%s enter. seqid %d slot->sl_seqid %d\n", __func__, seqid, + slot->sl_seqid); + + /* The slot is in use, and no response has been sent. 
*/ + if (slot->sl_inuse) { + if (seqid == slot->sl_seqid) + return nfserr_jukebox; + else + return nfserr_seq_misordered; + } + /* Normal */ + if (likely(seqid == slot->sl_seqid + 1)) + return nfs_ok; + /* Replay */ + if (seqid == slot->sl_seqid) + return nfserr_replay_cache; + /* Wraparound */ + if (seqid == 1 && (slot->sl_seqid + 1) == 0) + return nfs_ok; + /* Misordered replay or misordered new request */ + return nfserr_seq_misordered; +} + __be32 nfsd4_create_session(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, @@ -1017,11 +1043,54 @@ nfsd4_destroy_session(struct svc_rqst *r, } __be32 -nfsd4_sequence(struct svc_rqst *r, +nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_sequence *seq) { - return -1; /* stub */ + struct nfsd4_session *session; + struct nfsd4_slot *slot; + int status; + + spin_lock(&sessionid_lock); + status = nfserr_badsession; + session = find_in_sessionid_hashtbl(&seq->sessionid); + if (!session) + goto out; + + status = nfserr_badslot; + if (seq->slotid >= session->se_fnumslots) + goto out; + + slot = &session->se_slots[seq->slotid]; + dprintk("%s: slotid %d\n", __func__, seq->slotid); + + status = check_slot_seqid(seq->seqid, slot); + if (status == nfserr_replay_cache) { + cstate->slot = slot; + cstate->session = session; + goto replay_cache; + } + if (status) + goto out; + + /* Success! bump slot seqid */ + slot->sl_inuse = true; + slot->sl_seqid = seq->seqid; + + cstate->slot = slot; + cstate->session = session; + +replay_cache: + /* Renew the clientid on success and on replay. + * Hold a session reference until done processing the compound: + * nfsd4_put_session called only if the cstate slot is set. + */ + renew_client(session->se_client); + nfsd4_get_session(session); +out: + spin_unlock(&sessionid_lock); + dprintk("%s: return %d\n", __func__, ntohl(status)); + return status; } __be32 diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index bebf6d249069..8556575480ce 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1113,7 +1113,16 @@ static __be32 nfsd4_decode_sequence(struct nfsd4_compoundargs *argp, struct nfsd4_sequence *seq) { - return nfserr_opnotsupp; /* stub */ + DECODE_HEAD; + + READ_BUF(NFS4_MAX_SESSIONID_LEN + 16); + COPYMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN); + READ32(seq->seqid); + READ32(seq->slotid); + READ32(seq->maxslots); + READ32(seq->cachethis); + + DECODE_TAIL; } static __be32 @@ -2828,8 +2837,26 @@ static __be32 nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_sequence *seq) { - /* stub */ - return nfserr; + ENCODE_HEAD; + + if (nfserr) + return nfserr; + + RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 20); + WRITEMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN); + WRITE32(seq->seqid); + WRITE32(seq->slotid); + WRITE32(seq->maxslots); + /* + * FIXME: for now: + * target_maxslots = maxslots + * status_flags = 0 + */ + WRITE32(seq->maxslots); + WRITE32(0); + + ADJUST_ARGS(); + return 0; } static __be32 diff --git a/include/linux/nfsd/xdr4.h b/include/linux/nfsd/xdr4.h index 33ee71e988b4..6e28a041008d 100644 --- a/include/linux/nfsd/xdr4.h +++ b/include/linux/nfsd/xdr4.h @@ -48,6 +48,9 @@ struct nfsd4_compound_state { struct svc_fh current_fh; struct svc_fh save_fh; struct nfs4_stateowner *replay_owner; + /* For sessions DRC */ + struct nfsd4_session *session; + struct nfsd4_slot *slot; }; struct nfsd4_change_info { @@ -358,7 +361,15 @@ struct nfsd4_create_session { }; struct nfsd4_sequence { - int foo; /* stub */ + struct nfs4_sessionid 
sessionid; /* request/response */ + u32 seqid; /* request/response */ + u32 slotid; /* request/response */ + u32 maxslots; /* request/response */ + u32 cachethis; /* request */ +#if 0 + u32 target_maxslots; /* response */ + u32 status_flags; /* response */ +#endif /* not yet */ }; struct nfsd4_destroy_session { -- cgit v1.2.3 From 074fe897536f095309c5aaffcf912952882ab2cb Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Fri, 3 Apr 2009 08:28:15 +0300 Subject: nfsd41: DRC save, restore, and clear functions Cache all the result pages, including the rpc header in rq_respages[0], for a request in the slot table cache entry. Cache the statp pointer from nfsd_dispatch which points into rq_respages[0] just past the rpc header. When setting a cache entry, calculate and save the length of the nfs data minus the rpc header for rq_respages[0]. When replaying a cache entry, replace the cached rpc header with the replayed request rpc result header, unless there is not enough room in the cached results first page. In that case, use the cached rpc header. The sessions fore channel maxresponse size cached is set to NFSD_PAGES_PER_SLOT * PAGE_SIZE. For compounds we are cacheing with operations such as READDIR that use the xdr_buf->pages to hold data, we choose to cache the extra page of data rather than copying data from xdr_buf->pages into the xdr_buf->head page. [nfsd41: limit cache to maxresponsesize_cached] [nfsd41: mv nfsd4_set_statp under CONFIG_NFSD_V4_1] [nfsd41: rename nfsd4_move_pages] [nfsd41: rename page_no variable] [nfsd41: rename nfsd4_set_cache_entry] [nfsd41: fix nfsd41_copy_replay_data comment] [nfsd41: add to nfsd4_set_cache_entry] Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 142 +++++++++++++++++++++++++++++++++++++++++++++ fs/nfsd/nfssvc.c | 4 ++ include/linux/nfsd/cache.h | 1 + include/linux/nfsd/state.h | 13 +++++ include/linux/nfsd/xdr4.h | 4 ++ 5 files changed, 164 insertions(+) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 9243dca3576c..a37b91dab1bf 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -852,6 +852,148 @@ out_err: return; } +void +nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp) +{ + struct nfsd4_compoundres *resp = rqstp->rq_resp; + + resp->cstate.statp = statp; +} + +/* + * Dereference the result pages. + */ +static void +nfsd4_release_respages(struct page **respages, short resused) +{ + int i; + + dprintk("--> %s\n", __func__); + for (i = 0; i < resused; i++) { + if (!respages[i]) + continue; + put_page(respages[i]); + respages[i] = NULL; + } +} + +static void +nfsd4_copy_pages(struct page **topages, struct page **frompages, short count) +{ + int i; + + for (i = 0; i < count; i++) { + topages[i] = frompages[i]; + if (!topages[i]) + continue; + get_page(topages[i]); + } +} + +/* + * Cache the reply pages up to NFSD_PAGES_PER_SLOT + 1, clearing the previous + * pages. We add a page to NFSD_PAGES_PER_SLOT for the case where the total + * length of the XDR response is less than se_fmaxresp_cached + * (NFSD_PAGES_PER_SLOT * PAGE_SIZE) but the xdr_buf pages is used for a + * of the reply (e.g. readdir). + * + * Store the base and length of the rq_req.head[0] page + * of the NFSv4.1 data, just past the rpc header. 
+ */ +void +nfsd4_store_cache_entry(struct nfsd4_compoundres *resp) +{ + struct nfsd4_cache_entry *entry = &resp->cstate.slot->sl_cache_entry; + struct svc_rqst *rqstp = resp->rqstp; + struct nfsd4_compoundargs *args = rqstp->rq_argp; + struct nfsd4_op *op = &args->ops[resp->opcnt]; + struct kvec *resv = &rqstp->rq_res.head[0]; + + dprintk("--> %s entry %p\n", __func__, entry); + + /* Don't cache a failed OP_SEQUENCE. */ + if (resp->opcnt == 1 && op->opnum == OP_SEQUENCE && resp->cstate.status) + return; + nfsd4_release_respages(entry->ce_respages, entry->ce_resused); + entry->ce_resused = rqstp->rq_resused; + if (entry->ce_resused > NFSD_PAGES_PER_SLOT + 1) + entry->ce_resused = NFSD_PAGES_PER_SLOT + 1; + nfsd4_copy_pages(entry->ce_respages, rqstp->rq_respages, + entry->ce_resused); + entry->ce_status = resp->cstate.status; + entry->ce_datav.iov_base = resp->cstate.statp; + entry->ce_datav.iov_len = resv->iov_len - ((char *)resp->cstate.statp - + (char *)page_address(rqstp->rq_respages[0])); + entry->ce_opcnt = resp->opcnt; + /* Current request rpc header length*/ + entry->ce_rpchdrlen = (char *)resp->cstate.statp - + (char *)page_address(rqstp->rq_respages[0]); +} + +/* + * We keep the rpc header, but take the nfs reply from the replycache. + */ +static int +nfsd41_copy_replay_data(struct nfsd4_compoundres *resp, + struct nfsd4_cache_entry *entry) +{ + struct svc_rqst *rqstp = resp->rqstp; + struct kvec *resv = &resp->rqstp->rq_res.head[0]; + int len; + + /* Current request rpc header length*/ + len = (char *)resp->cstate.statp - + (char *)page_address(rqstp->rq_respages[0]); + if (entry->ce_datav.iov_len + len > PAGE_SIZE) { + dprintk("%s v41 cached reply too large (%Zd).\n", __func__, + entry->ce_datav.iov_len); + return 0; + } + /* copy the cached reply nfsd data past the current rpc header */ + memcpy((char *)resv->iov_base + len, entry->ce_datav.iov_base, + entry->ce_datav.iov_len); + resv->iov_len = len + entry->ce_datav.iov_len; + return 1; +} + +/* + * Keep the first page of the replay. Copy the NFSv4.1 data from the first + * cached page. Replace any futher replay pages from the cache. + */ +__be32 +nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp) +{ + struct nfsd4_cache_entry *entry = &resp->cstate.slot->sl_cache_entry; + __be32 status; + + dprintk("--> %s entry %p\n", __func__, entry); + + + if (!nfsd41_copy_replay_data(resp, entry)) { + /* + * Not enough room to use the replay rpc header, send the + * cached header. Release all the allocated result pages. + */ + svc_free_res_pages(resp->rqstp); + nfsd4_copy_pages(resp->rqstp->rq_respages, entry->ce_respages, + entry->ce_resused); + } else { + /* Release all but the first allocated result page */ + + resp->rqstp->rq_resused--; + svc_free_res_pages(resp->rqstp); + + nfsd4_copy_pages(&resp->rqstp->rq_respages[1], + &entry->ce_respages[1], + entry->ce_resused - 1); + } + + resp->rqstp->rq_resused = entry->ce_resused; + status = entry->ce_status; + + return status; +} + /* * Set the exchange_id flags returned by the server. */ diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index ef0a3686639d..b5168d1898ec 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -515,6 +515,10 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) + rqstp->rq_res.head[0].iov_len; rqstp->rq_res.head[0].iov_len += sizeof(__be32); + /* NFSv4.1 DRC requires statp */ + if (rqstp->rq_vers == 4) + nfsd4_set_statp(rqstp, statp); + /* Now call the procedure handler, and encode NFS status. 
*/ nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); nfserr = map_new_errors(rqstp->rq_vers, nfserr); diff --git a/include/linux/nfsd/cache.h b/include/linux/nfsd/cache.h index 04b355c801d8..a59a2df6d079 100644 --- a/include/linux/nfsd/cache.h +++ b/include/linux/nfsd/cache.h @@ -75,5 +75,6 @@ int nfsd_reply_cache_init(void); void nfsd_reply_cache_shutdown(void); int nfsd_cache_lookup(struct svc_rqst *, int); void nfsd_cache_update(struct svc_rqst *, int, __be32 *); +void nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp); #endif /* NFSCACHE_H */ diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h index 90829db76861..f1edb1d98523 100644 --- a/include/linux/nfsd/state.h +++ b/include/linux/nfsd/state.h @@ -99,9 +99,22 @@ struct nfs4_callback { struct rpc_clnt * cb_client; }; +/* Maximum number of pages per slot cache entry */ +#define NFSD_PAGES_PER_SLOT 1 + +struct nfsd4_cache_entry { + __be32 ce_status; + struct kvec ce_datav; /* encoded NFSv4.1 data in rq_res.head[0] */ + struct page *ce_respages[NFSD_PAGES_PER_SLOT + 1]; + short ce_resused; + int ce_opcnt; + int ce_rpchdrlen; +}; + struct nfsd4_slot { bool sl_inuse; u32 sl_seqid; + struct nfsd4_cache_entry sl_cache_entry; }; struct nfsd4_session { diff --git a/include/linux/nfsd/xdr4.h b/include/linux/nfsd/xdr4.h index 6e28a041008d..d091684325af 100644 --- a/include/linux/nfsd/xdr4.h +++ b/include/linux/nfsd/xdr4.h @@ -51,6 +51,8 @@ struct nfsd4_compound_state { /* For sessions DRC */ struct nfsd4_session *session; struct nfsd4_slot *slot; + __be32 *statp; + u32 status; }; struct nfsd4_change_info { @@ -487,6 +489,8 @@ extern __be32 nfsd4_setclientid(struct svc_rqst *rqstp, extern __be32 nfsd4_setclientid_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *, struct nfsd4_setclientid_confirm *setclientid_confirm); +extern void nfsd4_store_cache_entry(struct nfsd4_compoundres *resp); +extern __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp); extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp, struct nfsd4_compound_state *, struct nfsd4_exchange_id *); -- cgit v1.2.3 From c3d06f9ce8544fecfe13e377d1e2c2e47fe18dbc Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Fri, 3 Apr 2009 08:28:18 +0300 Subject: nfsd41: hard page limit for DRC Use no more than 1/128th of the number of free pages at nfsd startup for the v4.1 DRC. This is an arbitrary default which should probably end up under the control of an administrator. Signed-off-by: Andy Adamson [moved added fields in struct svc_serv under CONFIG_NFSD_V4_1] Signed-off-by: Benny Halevy [fix set_max_drc calculation of sv_drc_max_pages] [moved NFSD_DRC_SIZE_SHIFT's declaration up in header file] Signed-off-by: Benny Halevy Signed-off-by: J. Bruce Fields --- fs/nfsd/nfssvc.c | 23 +++++++++++++++++++++++ include/linux/nfsd/nfsd.h | 3 +++ include/linux/sunrpc/svc.h | 2 ++ 3 files changed, 28 insertions(+) (limited to 'include/linux') diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index b5168d1898ec..b53a098e97a4 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -197,6 +198,26 @@ void nfsd_reset_versions(void) } } +/* + * Each session guarantees a negotiated per slot memory cache for replies + * which in turn consumes memory beyond the v2/v3/v4.0 server. A dedicated + * NFSv4.1 server might want to use more memory for a DRC than a machine + * with mutiple services. 
+ * + * Impose a hard limit on the number of pages for the DRC which varies + * according to the machines free pages. This is of course only a default. + * + * For now this is a #defined shift which could be under admin control + * in the future. + */ +static void set_max_drc(void) +{ + nfsd_serv->sv_drc_max_pages = nr_free_buffer_pages() + >> NFSD_DRC_SIZE_SHIFT; + nfsd_serv->sv_drc_pages_used = 0; + dprintk("%s svc_drc_max_pages %u\n", __func__, + nfsd_serv->sv_drc_max_pages); +} int nfsd_create_serv(void) { @@ -229,6 +250,8 @@ int nfsd_create_serv(void) nfsd_last_thread, nfsd, THIS_MODULE); if (nfsd_serv == NULL) err = -ENOMEM; + else + set_max_drc(); do_gettimeofday(&nfssvc_boot); /* record boot time */ return err; diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h index ab9616d09204..1f063d495159 100644 --- a/include/linux/nfsd/nfsd.h +++ b/include/linux/nfsd/nfsd.h @@ -331,6 +331,9 @@ extern struct timeval nfssvc_boot; #define NFSD_LEASE_TIME (nfs4_lease_time()) #define NFSD_LAUNDROMAT_MINTIMEOUT 10 /* seconds */ +/* The percent of nr_free_buffer_pages used by the V4.1 server DRC */ +#define NFSD_DRC_SIZE_SHIFT 7 + /* * The following attributes are currently not supported by the NFSv4 server: * ARCHIVE (deprecated anyway) diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 815dd589d4db..d209c630a4a1 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -95,6 +95,8 @@ struct svc_serv { struct module * sv_module; /* optional module to count when * adding threads */ svc_thread_fn sv_function; /* main function for threads */ + unsigned int sv_drc_max_pages; /* Total pages for DRC */ + unsigned int sv_drc_pages_used;/* DRC pages used */ }; /* -- cgit v1.2.3 From da3846a2866ddf239311766ff434a82e7b4ac701 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Fri, 3 Apr 2009 08:28:22 +0300 Subject: nfsd41: nfsd DRC logic Replay a request in nfsd4_sequence. Add a minorversion to struct nfsd4_compound_state. Pass the current slot to nfs4svc_encode_compound res via struct nfsd4_compoundres to set an NFSv4.1 DRC entry. Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy [nfsd41: use bool inuse for slot state] Signed-off-by: Benny Halevy [nfsd41: use cstate session in nfs4svc_encode_compoundres] [nfsd41 replace nfsd4_set_cache_entry] Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4proc.c | 7 +++++++ fs/nfsd/nfs4state.c | 6 ++++++ fs/nfsd/nfs4xdr.c | 11 +++++++++++ include/linux/nfsd/xdr4.h | 1 + 4 files changed, 25 insertions(+) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index ec51936d2ce2..9e2ee75e0f7c 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -936,6 +936,12 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, BUG_ON(op->status == nfs_ok); encode_op: + /* Only from SEQUENCE or CREATE_SESSION */ + if (resp->cstate.status == nfserr_replay_cache) { + dprintk("%s NFS4.1 replay from cache\n", __func__); + status = op->status; + goto out; + } if (op->status == nfserr_replay_me) { op->replay = &cstate->replay_owner->so_replay; nfsd4_encode_replay(resp, op); @@ -964,6 +970,7 @@ encode_op: status = nfserr_jukebox; } + resp->cstate.status = status; fh_put(&resp->cstate.current_fh); fh_put(&resp->cstate.save_fh); BUG_ON(resp->cstate.replay_owner); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index a37b91dab1bf..e20d4345040f 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -989,6 +989,8 @@ nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp) } resp->rqstp->rq_resused = entry->ce_resused; + resp->opcnt = entry->ce_opcnt; + resp->cstate.iovlen = entry->ce_datav.iov_len + entry->ce_rpchdrlen; status = entry->ce_status; return status; @@ -1214,6 +1216,10 @@ nfsd4_sequence(struct svc_rqst *rqstp, if (status == nfserr_replay_cache) { cstate->slot = slot; cstate->session = session; + /* Return the cached reply status and set cstate->status + * for nfsd4_svc_encode_compoundres processing*/ + status = nfsd4_replay_cache_entry(resp); + cstate->status = nfserr_replay_cache; goto replay_cache; } if (status) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 8556575480ce..09415bcf078e 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3049,6 +3049,17 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo iov = &rqstp->rq_res.head[0]; iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base; BUG_ON(iov->iov_len > PAGE_SIZE); + if (resp->cstate.slot != NULL) { + if (resp->cstate.status == nfserr_replay_cache) { + iov->iov_len = resp->cstate.iovlen; + } else { + nfsd4_store_cache_entry(resp); + dprintk("%s: SET SLOT STATE TO AVAILABLE\n", __func__); + resp->cstate.slot->sl_inuse = 0; + } + if (resp->cstate.session) + nfsd4_put_session(resp->cstate.session); + } return 1; } diff --git a/include/linux/nfsd/xdr4.h b/include/linux/nfsd/xdr4.h index d091684325af..69cb467cb720 100644 --- a/include/linux/nfsd/xdr4.h +++ b/include/linux/nfsd/xdr4.h @@ -52,6 +52,7 @@ struct nfsd4_compound_state { struct nfsd4_session *session; struct nfsd4_slot *slot; __be32 *statp; + size_t iovlen; u32 status; }; -- cgit v1.2.3 From ec6b5d7b5064fde27aee798b81107ea3a830de85 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Fri, 3 Apr 2009 08:28:28 +0300 Subject: nfsd41: create_session operation Implement the create_session operation conforming to http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion1-26 Look up the client id (generated by the server on exchange_id, given by the client on create_session). If neither a confirmed nor an unconfirmed client is found, the client id is stale. If a confirmed client is found (i.e. we already received create_session for it) then compare the sequence id to determine if it's a replay or possibly a mis-ordered rpc. If the seqid is in order, update the confirmed client seqid and proceed with updating the session parameters (the seqid decision is sketched below).
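The confirmed-client seqid rules read as a three-way decision: an equal seqid is a replay served from the cache, anything other than exactly one greater is misordered, and the in-order case bumps the stored seqid. A hedged sketch of that decision (illustrative only, not the server's check_slot_seqid helper used later in this series):

#include <stdio.h>

enum seq_result { SEQ_REPLAY, SEQ_MISORDERED, SEQ_IN_ORDER };

/* Sketch of the confirmed-client CREATE_SESSION seqid rules. */
static enum seq_result classify_seqid(unsigned int stored_seqid,
				      unsigned int request_seqid)
{
	if (request_seqid == stored_seqid)
		return SEQ_REPLAY;		/* serve the cached reply      */
	if (request_seqid != stored_seqid + 1)
		return SEQ_MISORDERED;		/* NFS4ERR_SEQ_MISORDERED      */
	return SEQ_IN_ORDER;			/* bump seqid, build a session */
}

int main(void)
{
	printf("%d %d %d\n",
	       classify_seqid(3, 3),	/* replay     */
	       classify_seqid(3, 5),	/* misordered */
	       classify_seqid(3, 4));	/* in order   */
	return 0;
}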
If an unconfirmed client_id is found then verify the creds and seqid. If both match move the client id to confirmed state and proceed with processing the create_session. Currently, we do not support persistent sessions, and RDMA. alloc_init_session generates a new sessionid and creates a session structure. NFSD_PAGES_PER_SLOT is used for the max response cached calculation, and for the counting of DRC pages using the hard limits set in struct srv_serv. A note on NFSD_PAGES_PER_SLOT: Other patches in this series allow for NFSD_PAGES_PER_SLOT + 1 pages to be cached in a DRC slot when the response size is less than NFSD_PAGES_PER_SLOT * PAGE_SIZE but xdr_buf pages are used. e.g. a READDIR operation will encode a small amount of data in the xdr_buf head, and then the READDIR in the xdr_buf pages. So, the hard limit calculation use of pages by a session is underestimated by the number of cached operations using the xdr_buf pages. Yet another patch caches no pages for the solo sequence operation, or any compound where cache_this is False. So the hard limit calculation use of pages by a session is overestimated by the number of these operations in the cache. TODO: improve resource pre-allocation and negotiate session parameters accordingly. Respect and possibly adjust backchannel attributes. Signed-off-by: Marc Eshel Signed-off-by: Dean Hildebrand [nfsd41: remove headerpadsz from channel attributes] Our client and server only support a headerpadsz of 0. [nfsd41: use DRC limits in fore channel init] [nfsd41: do not change CREATE_SESSION back channel attrs] Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy [use sessionid_lock spin lock] [nfsd41: use bool inuse for slot state] Signed-off-by: Benny Halevy [nfsd41 remove sl_session from alloc_init_session] Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy [simplify nfsd4_encode_create_session error handling] [nfsd41: fix comment style in init_forechannel_attrs] [nfsd41: allocate struct nfsd4_session and slot table in one piece] [nfsd41: no need to INIT_LIST_HEAD in alloc_init_session just prior to list_add] Signed-off-by: Benny Halevy Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 188 ++++++++++++++++++++++++++++++++++++++++++++- fs/nfsd/nfs4xdr.c | 148 ++++++++++++++++++++++++++++++++++- include/linux/nfsd/state.h | 7 +- include/linux/nfsd/xdr4.h | 21 ++++- 4 files changed, 358 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index f25a7d2a9fac..463ae39df148 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -68,6 +68,7 @@ static u32 current_delegid = 1; static u32 nfs4_init; static stateid_t zerostateid; /* bits all 0 */ static stateid_t onestateid; /* bits all 1 */ +static u64 current_sessionid = 1; #define ZERO_STATEID(stateid) (!memcmp((stateid), &zerostateid, sizeof(stateid_t))) #define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t))) @@ -401,6 +402,131 @@ dump_sessionid(const char *fn, struct nfs4_sessionid *sessionid) dprintk("%s: %u:%u:%u:%u\n", fn, ptr[0], ptr[1], ptr[2], ptr[3]); } +static void +gen_sessionid(struct nfsd4_session *ses) +{ + struct nfs4_client *clp = ses->se_client; + struct nfsd4_sessionid *sid; + + sid = (struct nfsd4_sessionid *)ses->se_sessionid.data; + sid->clientid = clp->cl_clientid; + sid->sequence = current_sessionid++; + sid->reserved = 0; +} + +/* + * Give the client the number of slots it requests bound by + * NFSD_MAX_SLOTS_PER_SESSION and by sv_drc_max_pages. 
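The accounting that follows converts the requested slot count into DRC pages, clamps it against what remains of the global budget, and converts back to slots; a stand-alone sketch of the same arithmetic with illustrative numbers (the constants mirror NFSD_MAX_SLOTS_PER_SESSION and NFSD_PAGES_PER_SLOT):

#include <stdio.h>

#define MAX_SLOTS_PER_SESSION 128
#define PAGES_PER_SLOT 1

int main(void)
{
	unsigned int drc_max_pages = 2048;	/* hypothetical global budget  */
	unsigned int drc_pages_used = 2000;	/* already handed to sessions  */
	unsigned int want_slots = 64;		/* client's requested maxreqs  */
	int np;

	if (want_slots > MAX_SLOTS_PER_SESSION)
		want_slots = MAX_SLOTS_PER_SESSION;

	np = want_slots * PAGES_PER_SLOT;
	if (np + drc_pages_used > drc_max_pages)
		np = drc_max_pages - drc_pages_used;	/* only 48 pages left */

	if (np <= 0)
		printf("no DRC pages left: fail session creation\n");
	else
		printf("granting %d slots\n", np / PAGES_PER_SLOT);	/* 48 */
	return 0;
}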
+ * + * If we run out of pages (sv_drc_pages_used == sv_drc_max_pages) we + * should (up to a point) re-negotiate active sessions and reduce their + * slot usage to make rooom for new connections. For now we just fail the + * create session. + */ +static int set_forechannel_maxreqs(struct nfsd4_channel_attrs *fchan) +{ + int status = 0, np = fchan->maxreqs * NFSD_PAGES_PER_SLOT; + + spin_lock(&nfsd_serv->sv_lock); + if (np + nfsd_serv->sv_drc_pages_used > nfsd_serv->sv_drc_max_pages) + np = nfsd_serv->sv_drc_max_pages - nfsd_serv->sv_drc_pages_used; + nfsd_serv->sv_drc_pages_used += np; + spin_unlock(&nfsd_serv->sv_lock); + + if (np <= 0) { + status = nfserr_resource; + fchan->maxreqs = 0; + } else + fchan->maxreqs = np / NFSD_PAGES_PER_SLOT; + + return status; +} + +/* + * fchan holds the client values on input, and the server values on output + */ +static int init_forechannel_attrs(struct svc_rqst *rqstp, + struct nfsd4_session *session, + struct nfsd4_channel_attrs *fchan) +{ + int status = 0; + __u32 maxcount = svc_max_payload(rqstp); + + /* headerpadsz set to zero in encode routine */ + + /* Use the client's max request and max response size if possible */ + if (fchan->maxreq_sz > maxcount) + fchan->maxreq_sz = maxcount; + session->se_fmaxreq_sz = fchan->maxreq_sz; + + if (fchan->maxresp_sz > maxcount) + fchan->maxresp_sz = maxcount; + session->se_fmaxresp_sz = fchan->maxresp_sz; + + /* Set the max response cached size our default which is + * a multiple of PAGE_SIZE and small */ + session->se_fmaxresp_cached = NFSD_PAGES_PER_SLOT * PAGE_SIZE; + fchan->maxresp_cached = session->se_fmaxresp_cached; + + /* Use the client's maxops if possible */ + if (fchan->maxops > NFSD_MAX_OPS_PER_COMPOUND) + fchan->maxops = NFSD_MAX_OPS_PER_COMPOUND; + session->se_fmaxops = fchan->maxops; + + /* try to use the client requested number of slots */ + if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION) + fchan->maxreqs = NFSD_MAX_SLOTS_PER_SESSION; + + /* FIXME: Error means no more DRC pages so the server should + * recover pages from existing sessions. For now fail session + * creation. + */ + status = set_forechannel_maxreqs(fchan); + + session->se_fnumslots = fchan->maxreqs; + return status; +} + +static int +alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp, + struct nfsd4_create_session *cses) +{ + struct nfsd4_session *new, tmp; + int idx, status = nfserr_resource, slotsize; + + memset(&tmp, 0, sizeof(tmp)); + + /* FIXME: For now, we just accept the client back channel attributes. 
*/ + status = init_forechannel_attrs(rqstp, &tmp, &cses->fore_channel); + if (status) + goto out; + + /* allocate struct nfsd4_session and slot table in one piece */ + slotsize = tmp.se_fnumslots * sizeof(struct nfsd4_slot); + new = kzalloc(sizeof(*new) + slotsize, GFP_KERNEL); + if (!new) + goto out; + + memcpy(new, &tmp, sizeof(*new)); + + new->se_client = clp; + gen_sessionid(new); + idx = hash_sessionid(&new->se_sessionid); + memcpy(clp->cl_sessionid.data, new->se_sessionid.data, + NFS4_MAX_SESSIONID_LEN); + + new->se_flags = cses->flags; + kref_init(&new->se_ref); + spin_lock(&sessionid_lock); + list_add(&new->se_hash, &sessionid_hashtbl[idx]); + list_add(&new->se_perclnt, &clp->cl_sessions); + spin_unlock(&sessionid_lock); + + status = nfs_ok; +out: + return status; +} + /* caller must hold sessionid_lock */ static struct nfsd4_session * find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid) @@ -1182,7 +1308,67 @@ nfsd4_create_session(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_create_session *cr_ses) { - return -1; /* stub */ + u32 ip_addr = svc_addr_in(rqstp)->sin_addr.s_addr; + struct nfs4_client *conf, *unconf; + int status = 0; + + nfs4_lock_state(); + unconf = find_unconfirmed_client(&cr_ses->clientid); + conf = find_confirmed_client(&cr_ses->clientid); + + if (conf) { + status = nfs_ok; + if (conf->cl_seqid == cr_ses->seqid) { + dprintk("Got a create_session replay! seqid= %d\n", + conf->cl_seqid); + goto out_replay; + } else if (cr_ses->seqid != conf->cl_seqid + 1) { + status = nfserr_seq_misordered; + dprintk("Sequence misordered!\n"); + dprintk("Expected seqid= %d but got seqid= %d\n", + conf->cl_seqid, cr_ses->seqid); + goto out; + } + conf->cl_seqid++; + } else if (unconf) { + if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) || + (ip_addr != unconf->cl_addr)) { + status = nfserr_clid_inuse; + goto out; + } + + if (unconf->cl_seqid != cr_ses->seqid) { + status = nfserr_seq_misordered; + goto out; + } + + move_to_confirmed(unconf); + + /* + * We do not support RDMA or persistent sessions + */ + cr_ses->flags &= ~SESSION4_PERSIST; + cr_ses->flags &= ~SESSION4_RDMA; + + conf = unconf; + } else { + status = nfserr_stale_clientid; + goto out; + } + + status = alloc_init_session(rqstp, conf, cr_ses); + if (status) + goto out; + +out_replay: + memcpy(cr_ses->sessionid.data, conf->cl_sessionid.data, + NFS4_MAX_SESSIONID_LEN); + cr_ses->seqid = conf->cl_seqid; + +out: + nfs4_unlock_state(); + dprintk("%s returns %d\n", __func__, ntohl(status)); + return status; } __be32 diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 09415bcf078e..671f9b96429b 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1099,7 +1099,108 @@ static __be32 nfsd4_decode_create_session(struct nfsd4_compoundargs *argp, struct nfsd4_create_session *sess) { - return nfserr_opnotsupp; /* stub */ + DECODE_HEAD; + + u32 dummy; + char *machine_name; + int i; + int nr_secflavs; + + READ_BUF(16); + COPYMEM(&sess->clientid, 8); + READ32(sess->seqid); + READ32(sess->flags); + + /* Fore channel attrs */ + READ_BUF(28); + READ32(dummy); /* headerpadsz is always 0 */ + READ32(sess->fore_channel.maxreq_sz); + READ32(sess->fore_channel.maxresp_sz); + READ32(sess->fore_channel.maxresp_cached); + READ32(sess->fore_channel.maxops); + READ32(sess->fore_channel.maxreqs); + READ32(sess->fore_channel.nr_rdma_attrs); + if (sess->fore_channel.nr_rdma_attrs == 1) { + READ_BUF(4); + READ32(sess->fore_channel.rdma_attrs); + } else if (sess->fore_channel.nr_rdma_attrs > 1) { + 
dprintk("Too many fore channel attr bitmaps!\n"); + goto xdr_error; + } + + /* Back channel attrs */ + READ_BUF(28); + READ32(dummy); /* headerpadsz is always 0 */ + READ32(sess->back_channel.maxreq_sz); + READ32(sess->back_channel.maxresp_sz); + READ32(sess->back_channel.maxresp_cached); + READ32(sess->back_channel.maxops); + READ32(sess->back_channel.maxreqs); + READ32(sess->back_channel.nr_rdma_attrs); + if (sess->back_channel.nr_rdma_attrs == 1) { + READ_BUF(4); + READ32(sess->back_channel.rdma_attrs); + } else if (sess->back_channel.nr_rdma_attrs > 1) { + dprintk("Too many back channel attr bitmaps!\n"); + goto xdr_error; + } + + READ_BUF(8); + READ32(sess->callback_prog); + + /* callback_sec_params4 */ + READ32(nr_secflavs); + for (i = 0; i < nr_secflavs; ++i) { + READ_BUF(4); + READ32(dummy); + switch (dummy) { + case RPC_AUTH_NULL: + /* Nothing to read */ + break; + case RPC_AUTH_UNIX: + READ_BUF(8); + /* stamp */ + READ32(dummy); + + /* machine name */ + READ32(dummy); + READ_BUF(dummy); + SAVEMEM(machine_name, dummy); + + /* uid, gid */ + READ_BUF(8); + READ32(sess->uid); + READ32(sess->gid); + + /* more gids */ + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy * 4); + for (i = 0; i < dummy; ++i) + READ32(dummy); + break; + case RPC_AUTH_GSS: + dprintk("RPC_AUTH_GSS callback secflavor " + "not supported!\n"); + READ_BUF(8); + /* gcbp_service */ + READ32(dummy); + /* gcbp_handle_from_server */ + READ32(dummy); + READ_BUF(dummy); + p += XDR_QUADLEN(dummy); + /* gcbp_handle_from_client */ + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy); + p += XDR_QUADLEN(dummy); + break; + default: + dprintk("Illegal callback secflavor\n"); + return nfserr_inval; + } + } + DECODE_TAIL; } static __be32 @@ -2821,8 +2922,49 @@ static __be32 nfsd4_encode_create_session(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_create_session *sess) { - /* stub */ - return nfserr; + ENCODE_HEAD; + + if (nfserr) + return nfserr; + + RESERVE_SPACE(24); + WRITEMEM(sess->sessionid.data, NFS4_MAX_SESSIONID_LEN); + WRITE32(sess->seqid); + WRITE32(sess->flags); + ADJUST_ARGS(); + + RESERVE_SPACE(28); + WRITE32(0); /* headerpadsz */ + WRITE32(sess->fore_channel.maxreq_sz); + WRITE32(sess->fore_channel.maxresp_sz); + WRITE32(sess->fore_channel.maxresp_cached); + WRITE32(sess->fore_channel.maxops); + WRITE32(sess->fore_channel.maxreqs); + WRITE32(sess->fore_channel.nr_rdma_attrs); + ADJUST_ARGS(); + + if (sess->fore_channel.nr_rdma_attrs) { + RESERVE_SPACE(4); + WRITE32(sess->fore_channel.rdma_attrs); + ADJUST_ARGS(); + } + + RESERVE_SPACE(28); + WRITE32(0); /* headerpadsz */ + WRITE32(sess->back_channel.maxreq_sz); + WRITE32(sess->back_channel.maxresp_sz); + WRITE32(sess->back_channel.maxresp_cached); + WRITE32(sess->back_channel.maxops); + WRITE32(sess->back_channel.maxreqs); + WRITE32(sess->back_channel.nr_rdma_attrs); + ADJUST_ARGS(); + + if (sess->back_channel.nr_rdma_attrs) { + RESERVE_SPACE(4); + WRITE32(sess->back_channel.rdma_attrs); + ADJUST_ARGS(); + } + return 0; } static __be32 diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h index f1edb1d98523..692edf4c2e7c 100644 --- a/include/linux/nfsd/state.h +++ b/include/linux/nfsd/state.h @@ -99,8 +99,12 @@ struct nfs4_callback { struct rpc_clnt * cb_client; }; +/* Maximum number of slots per session. 
128 is useful for long haul TCP */ +#define NFSD_MAX_SLOTS_PER_SESSION 128 /* Maximum number of pages per slot cache entry */ #define NFSD_PAGES_PER_SLOT 1 +/* Maximum number of operations per session compound */ +#define NFSD_MAX_OPS_PER_COMPOUND 16 struct nfsd4_cache_entry { __be32 ce_status; @@ -129,7 +133,7 @@ struct nfsd4_session { u32 se_fmaxresp_cached; u32 se_fmaxops; u32 se_fnumslots; - struct nfsd4_slot *se_slots; /* forward channel slots */ + struct nfsd4_slot se_slots[]; /* forward channel slots */ }; static inline void @@ -188,6 +192,7 @@ struct nfs4_client { struct list_head cl_sessions; u32 cl_seqid; /* seqid for create_session */ u32 cl_exchange_flags; + struct nfs4_sessionid cl_sessionid; }; /* struct nfs4_client_reset diff --git a/include/linux/nfsd/xdr4.h b/include/linux/nfsd/xdr4.h index 69cb467cb720..9468829adb70 100644 --- a/include/linux/nfsd/xdr4.h +++ b/include/linux/nfsd/xdr4.h @@ -359,8 +359,27 @@ struct nfsd4_exchange_id { int spa_how; }; +struct nfsd4_channel_attrs { + u32 headerpadsz; + u32 maxreq_sz; + u32 maxresp_sz; + u32 maxresp_cached; + u32 maxops; + u32 maxreqs; + u32 nr_rdma_attrs; + u32 rdma_attrs; +}; + struct nfsd4_create_session { - int foo; /* stub */ + clientid_t clientid; + struct nfs4_sessionid sessionid; + u32 seqid; + u32 flags; + struct nfsd4_channel_attrs fore_channel; + struct nfsd4_channel_attrs back_channel; + u32 callback_prog; + u32 uid; + u32 gid; }; struct nfsd4_sequence { -- cgit v1.2.3 From 38eb76a54d803e6792816623651b1a9cb85f8d01 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Fri, 3 Apr 2009 08:28:32 +0300 Subject: nfsd41: Add a create session replay cache Replace the nfs4_client cl_seqid field with a single struct nfs41_slot used for the create session replay cache. The CREATE_SESSION slot sets the sl_session pointer to NULL. Otherwise, the slot and its replay cache are used just like the session slots. Fix unconfirmed create_session replay response by initializing the create_session slot sequence id to 0. A future patch will set the CREATE_SESSION cache when a SEQUENCE operation precedes the CREATE_SESSION operation. This compound is currently only cached in the session slot table. Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy [nfsd41: use bool inuse for slot state] Signed-off-by: Benny Halevy [nfsd41: revert portion of nfsd4_set_cache_entry] Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy Signed-off-by: J.
Bruce Fields --- fs/nfsd/nfs4state.c | 39 +++++++++++++++++++++++++++------------ include/linux/nfsd/state.h | 2 +- 2 files changed, 28 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 463ae39df148..58f9797eb09e 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -649,6 +649,8 @@ static inline void free_client(struct nfs4_client *clp) { shutdown_callback_client(clp); + nfsd4_release_respages(clp->cl_slot.sl_cache_entry.ce_respages, + clp->cl_slot.sl_cache_entry.ce_resused); if (clp->cl_cred.cr_group_info) put_group_info(clp->cl_cred.cr_group_info); kfree(clp->cl_principal); @@ -1263,11 +1265,12 @@ out_copy: exid->clientid.cl_boot = new->cl_clientid.cl_boot; exid->clientid.cl_id = new->cl_clientid.cl_id; - new->cl_seqid = exid->seqid = 1; + new->cl_slot.sl_seqid = 0; + exid->seqid = 1; nfsd4_set_ex_flags(new, exid); dprintk("nfsd4_exchange_id seqid %d flags %x\n", - new->cl_seqid, new->cl_exchange_flags); + new->cl_slot.sl_seqid, new->cl_exchange_flags); status = nfs_ok; out: @@ -1309,7 +1312,9 @@ nfsd4_create_session(struct svc_rqst *rqstp, struct nfsd4_create_session *cr_ses) { u32 ip_addr = svc_addr_in(rqstp)->sin_addr.s_addr; + struct nfsd4_compoundres *resp = rqstp->rq_resp; struct nfs4_client *conf, *unconf; + struct nfsd4_slot *slot = NULL; int status = 0; nfs4_lock_state(); @@ -1317,19 +1322,24 @@ nfsd4_create_session(struct svc_rqst *rqstp, conf = find_confirmed_client(&cr_ses->clientid); if (conf) { - status = nfs_ok; - if (conf->cl_seqid == cr_ses->seqid) { + slot = &conf->cl_slot; + status = check_slot_seqid(cr_ses->seqid, slot); + if (status == nfserr_replay_cache) { dprintk("Got a create_session replay! seqid= %d\n", - conf->cl_seqid); - goto out_replay; - } else if (cr_ses->seqid != conf->cl_seqid + 1) { + slot->sl_seqid); + cstate->slot = slot; + cstate->status = status; + /* Return the cached reply status */ + status = nfsd4_replay_cache_entry(resp); + goto out; + } else if (cr_ses->seqid != conf->cl_slot.sl_seqid + 1) { status = nfserr_seq_misordered; dprintk("Sequence misordered!\n"); dprintk("Expected seqid= %d but got seqid= %d\n", - conf->cl_seqid, cr_ses->seqid); + slot->sl_seqid, cr_ses->seqid); goto out; } - conf->cl_seqid++; + conf->cl_slot.sl_seqid++; } else if (unconf) { if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) || (ip_addr != unconf->cl_addr)) { @@ -1337,11 +1347,15 @@ nfsd4_create_session(struct svc_rqst *rqstp, goto out; } - if (unconf->cl_seqid != cr_ses->seqid) { + slot = &unconf->cl_slot; + status = check_slot_seqid(cr_ses->seqid, slot); + if (status) { + /* an unconfirmed replay returns misordered */ status = nfserr_seq_misordered; goto out; } + slot->sl_seqid++; /* from 0 to 1 */ move_to_confirmed(unconf); /* @@ -1360,11 +1374,12 @@ nfsd4_create_session(struct svc_rqst *rqstp, if (status) goto out; -out_replay: memcpy(cr_ses->sessionid.data, conf->cl_sessionid.data, NFS4_MAX_SESSIONID_LEN); - cr_ses->seqid = conf->cl_seqid; + cr_ses->seqid = slot->sl_seqid; + slot->sl_inuse = true; + cstate->slot = slot; out: nfs4_unlock_state(); dprintk("%s returns %d\n", __func__, ntohl(status)); diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h index 692edf4c2e7c..f063df7ad134 100644 --- a/include/linux/nfsd/state.h +++ b/include/linux/nfsd/state.h @@ -190,7 +190,7 @@ struct nfs4_client { /* for nfs41 */ struct list_head cl_sessions; - u32 cl_seqid; /* seqid for create_session */ + struct nfsd4_slot cl_slot; /* create_session slot */ u32 cl_exchange_flags; 
struct nfs4_sessionid cl_sessionid; }; -- cgit v1.2.3 From bf864a31d50e3e94d6e76537b97d664913906ff8 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Fri, 3 Apr 2009 08:28:35 +0300 Subject: nfsd41: non-page DRC for solo sequence responses A session inactivity time compound (lease renewal) or a compound where the sequence operation has sa_cachethis set to FALSE do not require any pages to be held in the v4.1 DRC. This is because struct nfsd4_slot is already caching the session information. Add logic to the nfs41 server to not cache response pages for solo sequence responses. Return nfserr_replay_uncached_rep on the operation following the sequence operation when sa_cachethis is FALSE. Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy [nfsd41: use cstate session in nfsd4_replay_cache_entry] [nfsd41: rename nfsd4_no_page_in_cache] [nfsd41 rename nfsd4_enc_no_page_replay] [nfsd41 nfsd4_is_solo_sequence] [nfsd41 change nfsd4_not_cached return] Signed-off-by: Andy Adamson [changed return type to bool] Signed-off-by: Benny Halevy [nfsd41 drop parens in nfsd4_is_solo_sequence call] Signed-off-by: Andy Adamson [changed "== 0" to "!"] Signed-off-by: Benny Halevy Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 34 +++++++++++++++++++++++++++++++-- fs/nfsd/nfs4state.c | 47 ++++++++++++++++++++++++++++++++++++++++------ fs/nfsd/nfs4xdr.c | 5 +++-- include/linux/nfsd/state.h | 1 + include/linux/nfsd/xdr4.h | 15 ++++++++++++++- 5 files changed, 91 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 9e2ee75e0f7c..ae21a4efe36c 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -827,6 +827,34 @@ static struct nfsd4_operation nfsd4_ops[]; static const char *nfsd4_op_name(unsigned opnum); +/* + * This is a replay of a compound for which no cache entry pages + * were used. Encode the sequence operation, and if cachethis is FALSE + * encode the uncache rep error on the next operation. + */ +static __be32 +nfsd4_enc_uncached_replay(struct nfsd4_compoundargs *args, + struct nfsd4_compoundres *resp) +{ + struct nfsd4_op *op; + + dprintk("--> %s resp->opcnt %d ce_cachethis %u \n", __func__, + resp->opcnt, resp->cstate.slot->sl_cache_entry.ce_cachethis); + + /* Encode the replayed sequence operation */ + BUG_ON(resp->opcnt != 1); + op = &args->ops[resp->opcnt - 1]; + nfsd4_encode_operation(resp, op); + + /*return nfserr_retry_uncached_rep in next operation. */ + if (resp->cstate.slot->sl_cache_entry.ce_cachethis == 0) { + op = &args->ops[resp->opcnt++]; + op->status = nfserr_retry_uncached_rep; + nfsd4_encode_operation(resp, op); + } + return op->status; +} + /* * Enforce NFSv4.1 COMPOUND ordering rules. 
* @@ -895,7 +923,6 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, dprintk("nfsv4 compound op #%d/%d: %d (%s)\n", resp->opcnt, args->opcnt, op->opnum, nfsd4_op_name(op->opnum)); - /* * The XDR decode routines may have pre-set op->status; * for example, if there is a miscellaneous XDR error @@ -939,7 +966,10 @@ encode_op: /* Only from SEQUENCE or CREATE_SESSION */ if (resp->cstate.status == nfserr_replay_cache) { dprintk("%s NFS4.1 replay from cache\n", __func__); - status = op->status; + if (nfsd4_not_cached(resp)) + status = nfsd4_enc_uncached_replay(args, resp); + else + status = op->status; goto out; } if (op->status == nfserr_replay_me) { diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 58f9797eb09e..04a395fb5dce 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1049,17 +1049,31 @@ nfsd4_store_cache_entry(struct nfsd4_compoundres *resp) /* Don't cache a failed OP_SEQUENCE. */ if (resp->opcnt == 1 && op->opnum == OP_SEQUENCE && resp->cstate.status) return; + nfsd4_release_respages(entry->ce_respages, entry->ce_resused); + entry->ce_opcnt = resp->opcnt; + entry->ce_status = resp->cstate.status; + + /* + * Don't need a page to cache just the sequence operation - the slot + * does this for us! + */ + + if (nfsd4_not_cached(resp)) { + entry->ce_resused = 0; + entry->ce_rpchdrlen = 0; + dprintk("%s Just cache SEQUENCE. ce_cachethis %d\n", __func__, + resp->cstate.slot->sl_cache_entry.ce_cachethis); + return; + } entry->ce_resused = rqstp->rq_resused; if (entry->ce_resused > NFSD_PAGES_PER_SLOT + 1) entry->ce_resused = NFSD_PAGES_PER_SLOT + 1; nfsd4_copy_pages(entry->ce_respages, rqstp->rq_respages, entry->ce_resused); - entry->ce_status = resp->cstate.status; entry->ce_datav.iov_base = resp->cstate.statp; entry->ce_datav.iov_len = resv->iov_len - ((char *)resp->cstate.statp - (char *)page_address(rqstp->rq_respages[0])); - entry->ce_opcnt = resp->opcnt; /* Current request rpc header length*/ entry->ce_rpchdrlen = (char *)resp->cstate.statp - (char *)page_address(rqstp->rq_respages[0]); @@ -1096,13 +1110,28 @@ nfsd41_copy_replay_data(struct nfsd4_compoundres *resp, * cached page. Replace any futher replay pages from the cache. */ __be32 -nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp) +nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, + struct nfsd4_sequence *seq) { struct nfsd4_cache_entry *entry = &resp->cstate.slot->sl_cache_entry; __be32 status; dprintk("--> %s entry %p\n", __func__, entry); + /* + * If this is just the sequence operation, we did not keep + * a page in the cache entry because we can just use the + * slot info stored in struct nfsd4_sequence that was checked + * against the slot in nfsd4_sequence(). + * + * This occurs when seq->cachethis is FALSE, or when the client + * session inactivity timer fires and a solo sequence operation + * is sent (lease renewal). 
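Put as a predicate, the rule in the comment above is simply: skip page caching when the client cleared sa_cachethis or when the compound is a solo SEQUENCE. A hedged illustration (mirroring, not reproducing, the nfsd4_not_cached and nfsd4_is_solo_sequence helpers added later in this patch):

#include <stdbool.h>
#include <stdio.h>

/* A compound consisting only of SEQUENCE needs no cached pages. */
static bool is_solo_sequence(int opcnt)
{
	return opcnt == 1;
}

/* Cache reply pages only when the client asked for it and the
 * compound carries more than the SEQUENCE operation itself. */
static bool not_cached(bool cachethis, int opcnt)
{
	return !cachethis || is_solo_sequence(opcnt);
}

int main(void)
{
	printf("lease renewal (solo SEQUENCE): %d\n", not_cached(true, 1));  /* 1 */
	printf("cachethis=FALSE compound:      %d\n", not_cached(false, 3)); /* 1 */
	printf("normal cached compound:        %d\n", not_cached(true, 3));  /* 0 */
	return 0;
}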
+ */ + if (seq && nfsd4_not_cached(resp)) { + seq->maxslots = resp->cstate.session->se_fnumslots; + return nfs_ok; + } if (!nfsd41_copy_replay_data(resp, entry)) { /* @@ -1330,7 +1359,7 @@ nfsd4_create_session(struct svc_rqst *rqstp, cstate->slot = slot; cstate->status = status; /* Return the cached reply status */ - status = nfsd4_replay_cache_entry(resp); + status = nfsd4_replay_cache_entry(resp, NULL); goto out; } else if (cr_ses->seqid != conf->cl_slot.sl_seqid + 1) { status = nfserr_seq_misordered; @@ -1380,6 +1409,8 @@ nfsd4_create_session(struct svc_rqst *rqstp, slot->sl_inuse = true; cstate->slot = slot; + /* Ensure a page is used for the cache */ + slot->sl_cache_entry.ce_cachethis = 1; out: nfs4_unlock_state(); dprintk("%s returns %d\n", __func__, ntohl(status)); @@ -1425,8 +1456,8 @@ nfsd4_sequence(struct svc_rqst *rqstp, cstate->slot = slot; cstate->session = session; /* Return the cached reply status and set cstate->status - * for nfsd4_svc_encode_compoundres processing*/ - status = nfsd4_replay_cache_entry(resp); + * for nfsd4_svc_encode_compoundres processing */ + status = nfsd4_replay_cache_entry(resp, seq); cstate->status = nfserr_replay_cache; goto replay_cache; } @@ -1436,6 +1467,10 @@ nfsd4_sequence(struct svc_rqst *rqstp, /* Success! bump slot seqid */ slot->sl_inuse = true; slot->sl_seqid = seq->seqid; + slot->sl_cache_entry.ce_cachethis = seq->cachethis; + /* Always set the cache entry cachethis for solo sequence */ + if (nfsd4_is_solo_sequence(resp)) + slot->sl_cache_entry.ce_cachethis = 1; cstate->slot = slot; cstate->session = session; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 671f9b96429b..64bc2150a6fa 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2975,7 +2975,7 @@ nfsd4_encode_destroy_session(struct nfsd4_compoundres *resp, int nfserr, return nfserr; } -static __be32 +__be32 nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_sequence *seq) { @@ -3192,7 +3192,8 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base; BUG_ON(iov->iov_len > PAGE_SIZE); if (resp->cstate.slot != NULL) { - if (resp->cstate.status == nfserr_replay_cache) { + if (resp->cstate.status == nfserr_replay_cache && + !nfsd4_not_cached(resp)) { iov->iov_len = resp->cstate.iovlen; } else { nfsd4_store_cache_entry(resp); diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h index f063df7ad134..18dcffa57f77 100644 --- a/include/linux/nfsd/state.h +++ b/include/linux/nfsd/state.h @@ -110,6 +110,7 @@ struct nfsd4_cache_entry { __be32 ce_status; struct kvec ce_datav; /* encoded NFSv4.1 data in rq_res.head[0] */ struct page *ce_respages[NFSD_PAGES_PER_SLOT + 1]; + int ce_cachethis; short ce_resused; int ce_opcnt; int ce_rpchdrlen; diff --git a/include/linux/nfsd/xdr4.h b/include/linux/nfsd/xdr4.h index 9468829adb70..486188810a60 100644 --- a/include/linux/nfsd/xdr4.h +++ b/include/linux/nfsd/xdr4.h @@ -480,6 +480,18 @@ struct nfsd4_compoundres { struct nfsd4_compound_state cstate; }; +static inline bool nfsd4_is_solo_sequence(struct nfsd4_compoundres *resp) +{ + struct nfsd4_compoundargs *args = resp->rqstp->rq_argp; + return args->opcnt == 1; +} + +static inline bool nfsd4_not_cached(struct nfsd4_compoundres *resp) +{ + return !resp->cstate.slot->sl_cache_entry.ce_cachethis || + nfsd4_is_solo_sequence(resp); +} + #define NFS4_SVC_XDRSIZE sizeof(struct nfsd4_compoundargs) static inline void @@ -510,7 +522,8 @@ extern __be32 
nfsd4_setclientid_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *, struct nfsd4_setclientid_confirm *setclientid_confirm); extern void nfsd4_store_cache_entry(struct nfsd4_compoundres *resp); -extern __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp); +extern __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, + struct nfsd4_sequence *seq); extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp, struct nfsd4_compound_state *, struct nfsd4_exchange_id *); -- cgit v1.2.3 From e10e0cfc2f27364c73b28adbd3c8688d97049e73 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Fri, 3 Apr 2009 08:28:38 +0300 Subject: nfsd41: destroy_session operation Implement the destroy_session operation conforming to http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion1-26 [use sessionid_lock spin lock] Signed-off-by: Benny Halevy Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 29 ++++++++++++++++++++++++++++- fs/nfsd/nfs4xdr.c | 7 +++++-- include/linux/nfsd/xdr4.h | 2 +- 3 files changed, 34 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 04a395fb5dce..9192e5b35f42 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1422,7 +1422,34 @@ nfsd4_destroy_session(struct svc_rqst *r, struct nfsd4_compound_state *cstate, struct nfsd4_destroy_session *sessionid) { - return -1; /* stub */ + struct nfsd4_session *ses; + u32 status = nfserr_badsession; + + /* Notes: + * - The confirmed nfs4_client->cl_sessionid holds the destroyed sessionid + * - Should we return nfserr_back_chan_busy if waiting for + * callbacks on to-be-destroyed session? + * - Do we need to clear any callback info from previous session? + */ + + dump_sessionid(__func__, &sessionid->sessionid); + spin_lock(&sessionid_lock); + ses = find_in_sessionid_hashtbl(&sessionid->sessionid); + if (!ses) { + spin_unlock(&sessionid_lock); + goto out; + } + + unhash_session(ses); + spin_unlock(&sessionid_lock); + + /* wait for callbacks */ + shutdown_callback_client(ses->se_client); + nfsd4_put_session(ses); + status = nfs_ok; +out: + dprintk("%s returns %d\n", __func__, ntohl(status)); + return status; } __be32 diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 64bc2150a6fa..c6a726d04efd 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1207,7 +1207,11 @@ static __be32 nfsd4_decode_destroy_session(struct nfsd4_compoundargs *argp, struct nfsd4_destroy_session *destroy_session) { - return nfserr_opnotsupp; /* stub */ + DECODE_HEAD; + READ_BUF(NFS4_MAX_SESSIONID_LEN); + COPYMEM(destroy_session->sessionid.data, NFS4_MAX_SESSIONID_LEN); + + DECODE_TAIL; } static __be32 @@ -2971,7 +2975,6 @@ static __be32 nfsd4_encode_destroy_session(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_destroy_session *destroy_session) { - /* stub */ return nfserr; } diff --git a/include/linux/nfsd/xdr4.h b/include/linux/nfsd/xdr4.h index 486188810a60..a0a2e8317a3b 100644 --- a/include/linux/nfsd/xdr4.h +++ b/include/linux/nfsd/xdr4.h @@ -395,7 +395,7 @@ struct nfsd4_sequence { }; struct nfsd4_destroy_session { - int foo; /* stub */ + struct nfs4_sessionid sessionid; }; struct nfsd4_op { -- cgit v1.2.3 From dd453dfd70538cadc02cb47ff8d8cfd0cb8cf435 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Fri, 3 Apr 2009 08:28:41 +0300 Subject: nfsd: pass nfsd4_compound_state* to nfs4_preprocess_{state,seq}id_op Currently we only use cstate->current_fh; it will also be used by nfsd41 code. Signed-off-by: Benny Halevy Signed-off-by: J.
Bruce Fields --- fs/nfsd/nfs4proc.c | 10 ++++------ fs/nfsd/nfs4state.c | 22 ++++++++++++++-------- include/linux/nfsd/state.h | 4 +++- 3 files changed, 21 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index ae21a4efe36c..e6a0f314fdbe 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -513,9 +513,8 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfs4_lock_state(); /* check stateid */ - if ((status = nfs4_preprocess_stateid_op(&cstate->current_fh, - &read->rd_stateid, - RD_STATE, &read->rd_filp))) { + if ((status = nfs4_preprocess_stateid_op(cstate, &read->rd_stateid, + RD_STATE, &read->rd_filp))) { dprintk("NFSD: nfsd4_read: couldn't process stateid!\n"); goto out; } @@ -646,7 +645,7 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (setattr->sa_iattr.ia_valid & ATTR_SIZE) { nfs4_lock_state(); - status = nfs4_preprocess_stateid_op(&cstate->current_fh, + status = nfs4_preprocess_stateid_op(cstate, &setattr->sa_stateid, WR_STATE, NULL); nfs4_unlock_state(); if (status) { @@ -686,8 +685,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return nfserr_inval; nfs4_lock_state(); - status = nfs4_preprocess_stateid_op(&cstate->current_fh, stateid, - WR_STATE, &filp); + status = nfs4_preprocess_stateid_op(cstate, stateid, WR_STATE, &filp); if (filp) get_file(filp); nfs4_unlock_state(); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 9192e5b35f42..f49f305096ec 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2810,10 +2810,12 @@ static int is_delegation_stateid(stateid_t *stateid) * Checks for stateid operations */ __be32 -nfs4_preprocess_stateid_op(struct svc_fh *current_fh, stateid_t *stateid, int flags, struct file **filpp) +nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, + stateid_t *stateid, int flags, struct file **filpp) { struct nfs4_stateid *stp = NULL; struct nfs4_delegation *dp = NULL; + struct svc_fh *current_fh = &cstate->current_fh; struct inode *ino = current_fh->fh_dentry->d_inode; __be32 status; @@ -2878,10 +2880,14 @@ setlkflg (int type) * Checks for sequence id mutating operations. 
*/ static __be32 -nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *stateid, int flags, struct nfs4_stateowner **sopp, struct nfs4_stateid **stpp, struct nfsd4_lock *lock) +nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, + stateid_t *stateid, int flags, + struct nfs4_stateowner **sopp, + struct nfs4_stateid **stpp, struct nfsd4_lock *lock) { struct nfs4_stateid *stp; struct nfs4_stateowner *sop; + struct svc_fh *current_fh = &cstate->current_fh; __be32 status; dprintk("NFSD: preprocess_seqid_op: seqid=%d " @@ -3004,7 +3010,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfs4_lock_state(); - if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh, + if ((status = nfs4_preprocess_seqid_op(cstate, oc->oc_seqid, &oc->oc_req_stateid, CONFIRM | OPEN_STATE, &oc->oc_stateowner, &stp, NULL))) @@ -3074,7 +3080,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, return nfserr_inval; nfs4_lock_state(); - if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh, + if ((status = nfs4_preprocess_seqid_op(cstate, od->od_seqid, &od->od_stateid, OPEN_STATE, @@ -3127,7 +3133,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfs4_lock_state(); /* check close_lru for replay */ - if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh, + if ((status = nfs4_preprocess_seqid_op(cstate, close->cl_seqid, &close->cl_stateid, OPEN_STATE | CLOSE_STATE, @@ -3474,7 +3480,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; /* validate and update open stateid and open seqid */ - status = nfs4_preprocess_seqid_op(&cstate->current_fh, + status = nfs4_preprocess_seqid_op(cstate, lock->lk_new_open_seqid, &lock->lk_new_open_stateid, OPEN_STATE, @@ -3501,7 +3507,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; } else { /* lock (lock owner + lock stateid) already exists */ - status = nfs4_preprocess_seqid_op(&cstate->current_fh, + status = nfs4_preprocess_seqid_op(cstate, lock->lk_old_lock_seqid, &lock->lk_old_lock_stateid, LOCK_STATE, @@ -3697,7 +3703,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfs4_lock_state(); - if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh, + if ((status = nfs4_preprocess_seqid_op(cstate, locku->lu_seqid, &locku->lu_stateid, LOCK_STATE, diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h index 18dcffa57f77..836c0d6bbed8 100644 --- a/include/linux/nfsd/state.h +++ b/include/linux/nfsd/state.h @@ -333,7 +333,9 @@ struct nfs4_stateid { ((err) != nfserr_stale_stateid) && \ ((err) != nfserr_bad_stateid)) -extern __be32 nfs4_preprocess_stateid_op(struct svc_fh *current_fh, +struct nfsd4_compound_state; + +extern __be32 nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, stateid_t *stateid, int flags, struct file **filp); extern void nfs4_lock_state(void); extern void nfs4_unlock_state(void); -- cgit v1.2.3 From 6668958fac1d05f55420de702f3678d46c1e93a5 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Fri, 3 Apr 2009 08:28:45 +0300 Subject: nfsd41: stateid handling When sessions are used, stateful operation sequenceid and stateid handling are not used. When sessions are used, on the first open set the seqid to 1, mark state confirmed and skip seqid processing. When sessionas are used the stateid generation number is ignored when it is zero whereas without sessions bad_stateid or stale stateid is returned. 
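In code form the generation rule described above is a three-way comparison with a v4.1 escape hatch for a zero generation; a sketch of just that comparison (the real check_stateid_generation in this patch derives the session flag from the compound state, and the error constants below are illustrative):

#include <stdio.h>

#define NFS_OK		0
#define BAD_STATEID	10025	/* illustrative error numbers only */
#define OLD_STATEID	10024

static int check_generation(unsigned int in, unsigned int ref, int has_session)
{
	/* v4.1: a zero generation means "current", so skip the check */
	if (has_session && in == 0)
		return NFS_OK;
	if (in > ref)
		return BAD_STATEID;	/* stateid from the future: client bug */
	if (in < ref)
		return OLD_STATEID;	/* stale copy of an updated stateid    */
	return NFS_OK;
}

int main(void)
{
	printf("%d %d %d\n",
	       check_generation(0, 5, 1),	/* NFS_OK with a session   */
	       check_generation(0, 5, 0),	/* OLD_STATEID without one */
	       check_generation(5, 5, 1));	/* NFS_OK, exact match     */
	return 0;
}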
Add flags to propagate session use to all stateful ops and down to check_stateid_generation. Signed-off-by: Benny Halevy Signed-off-by: Andy Adamson [nfsd4_has_session should return a boolean, not u32] Signed-off-by: Benny Halevy [nfsd41: pass nfsd4_compoundres * to nfsd4_process_open1] [nfsd41: calculate HAS_SESSION in nfs4_preprocess_stateid_op] [nfsd41: calculate HAS_SESSION in nfs4_preprocess_seqid_op] Signed-off-by: Benny Halevy Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 5 ++++- fs/nfsd/nfs4state.c | 47 ++++++++++++++++++++++++++++++++++++++-------- fs/nfsd/nfs4xdr.c | 2 +- include/linux/nfsd/state.h | 1 + include/linux/nfsd/xdr4.h | 8 +++++++- 5 files changed, 52 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index e6a0f314fdbe..aeb4888bdfad 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -168,6 +168,8 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open) { __be32 status; + struct nfsd4_compoundres *resp; + dprintk("NFSD: nfsd4_open filename %.*s op_stateowner %p\n", (int)open->op_fname.len, open->op_fname.data, open->op_stateowner); @@ -179,7 +181,8 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfs4_lock_state(); /* check seqid for replay. set nfs4_owner */ - status = nfsd4_process_open1(open); + resp = rqstp->rq_resp; + status = nfsd4_process_open1(&resp->cstate, open); if (status == nfserr_replay_me) { struct nfs4_replay *rp = &open->op_stateowner->so_replay; fh_put(&cstate->current_fh); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index f49f305096ec..bbaf3c9bfe5a 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2183,7 +2183,8 @@ static struct lock_manager_operations nfsd_lease_mng_ops = { __be32 -nfsd4_process_open1(struct nfsd4_open *open) +nfsd4_process_open1(struct nfsd4_compound_state *cstate, + struct nfsd4_open *open) { clientid_t *clientid = &open->op_clientid; struct nfs4_client *clp = NULL; @@ -2206,6 +2207,9 @@ nfsd4_process_open1(struct nfsd4_open *open) return nfserr_expired; goto renew; } + /* When sessions are used, skip open sequenceid processing */ + if (nfsd4_has_session(cstate)) + goto renew; if (!sop->so_confirmed) { /* Replace unconfirmed owners without checking for replay. */ clp = sop->so_client; @@ -2483,6 +2487,7 @@ out: __be32 nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) { + struct nfsd4_compoundres *resp = rqstp->rq_resp; struct nfs4_file *fp = NULL; struct inode *ino = current_fh->fh_dentry->d_inode; struct nfs4_stateid *stp = NULL; @@ -2541,9 +2546,14 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf release_open_stateid(stp); goto out; } + if (nfsd4_has_session(&resp->cstate)) + update_stateid(&stp->st_stateid); } memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t)); + if (nfsd4_has_session(&resp->cstate)) + open->op_stateowner->so_confirmed = 1; + /* * Attempt to hand out a delegation. No error return, because the * OPEN succeeds even if we fail. @@ -2564,7 +2574,8 @@ out: * To finish the open response, we just need to set the rflags. 
*/ open->op_rflags = NFS4_OPEN_RESULT_LOCKTYPE_POSIX; - if (!open->op_stateowner->so_confirmed) + if (!open->op_stateowner->so_confirmed && + !nfsd4_has_session(&resp->cstate)) open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM; return status; @@ -2781,8 +2792,15 @@ grace_disallows_io(struct inode *inode) return locks_in_grace() && mandatory_lock(inode); } -static int check_stateid_generation(stateid_t *in, stateid_t *ref) +static int check_stateid_generation(stateid_t *in, stateid_t *ref, int flags) { + /* + * When sessions are used the stateid generation number is ignored + * when it is zero. + */ + if ((flags & HAS_SESSION) && in->si_generation == 0) + goto out; + /* If the client sends us a stateid from the future, it's buggy: */ if (in->si_generation > ref->si_generation) return nfserr_bad_stateid; @@ -2798,6 +2816,7 @@ static int check_stateid_generation(stateid_t *in, stateid_t *ref) */ if (in->si_generation < ref->si_generation) return nfserr_old_stateid; +out: return nfs_ok; } @@ -2825,6 +2844,9 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, if (grace_disallows_io(ino)) return nfserr_grace; + if (nfsd4_has_session(cstate)) + flags |= HAS_SESSION; + if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) return check_special_stateids(current_fh, stateid, flags); @@ -2837,7 +2859,8 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, dp = find_delegation_stateid(ino, stateid); if (!dp) goto out; - status = check_stateid_generation(stateid, &dp->dl_stateid); + status = check_stateid_generation(stateid, &dp->dl_stateid, + flags); if (status) goto out; status = nfs4_check_delegmode(dp, flags); @@ -2854,7 +2877,8 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, goto out; if (!stp->st_stateowner->so_confirmed) goto out; - status = check_stateid_generation(stateid, &stp->st_stateid); + status = check_stateid_generation(stateid, &stp->st_stateid, + flags); if (status) goto out; status = nfs4_check_openmode(stp, flags); @@ -2905,6 +2929,10 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, if (STALE_STATEID(stateid)) return nfserr_stale_stateid; + + if (nfsd4_has_session(cstate)) + flags |= HAS_SESSION; + /* * We return BAD_STATEID if filehandle doesn't match stateid, * the confirmed flag is incorrecly set, or the generation @@ -2961,7 +2989,7 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, * For the moment, we ignore the possibility of * generation number wraparound. 
*/ - if (seqid != sop->so_seqid) + if (!(flags & HAS_SESSION) && seqid != sop->so_seqid) goto check_replay; if (sop->so_confirmed && flags & CONFIRM) { @@ -2974,7 +3002,7 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, " confirmed yet!\n"); return nfserr_bad_stateid; } - status = check_stateid_generation(stateid, &stp->st_stateid); + status = check_stateid_generation(stateid, &stp->st_stateid, flags); if (status) return status; renew_client(sop->so_client); @@ -3169,11 +3197,14 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stateid_t *stateid = &dr->dr_stateid; struct inode *inode; __be32 status; + int flags = 0; if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) return status; inode = cstate->current_fh.fh_dentry->d_inode; + if (nfsd4_has_session(cstate)) + flags |= HAS_SESSION; nfs4_lock_state(); status = nfserr_bad_stateid; if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) @@ -3187,7 +3218,7 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, dp = find_delegation_stateid(inode, stateid); if (!dp) goto out; - status = check_stateid_generation(stateid, &dp->dl_stateid); + status = check_stateid_generation(stateid, &dp->dl_stateid, flags); if (status) goto out; renew_client(dp->dl_client); diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index c6a726d04efd..dd81ac1a1c6c 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3194,7 +3194,7 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo iov = &rqstp->rq_res.head[0]; iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base; BUG_ON(iov->iov_len > PAGE_SIZE); - if (resp->cstate.slot != NULL) { + if (nfsd4_has_session(&resp->cstate)) { if (resp->cstate.status == nfserr_replay_cache && !nfsd4_not_cached(resp)) { iov->iov_len = resp->cstate.iovlen; diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h index 836c0d6bbed8..4d61c873feed 100644 --- a/include/linux/nfsd/state.h +++ b/include/linux/nfsd/state.h @@ -320,6 +320,7 @@ struct nfs4_stateid { }; /* flags for preprocess_seqid_op() */ +#define HAS_SESSION 0x00000001 #define CONFIRM 0x00000002 #define OPEN_STATE 0x00000004 #define LOCK_STATE 0x00000008 diff --git a/include/linux/nfsd/xdr4.h b/include/linux/nfsd/xdr4.h index a0a2e8317a3b..b8b3dcba28cc 100644 --- a/include/linux/nfsd/xdr4.h +++ b/include/linux/nfsd/xdr4.h @@ -56,6 +56,11 @@ struct nfsd4_compound_state { u32 status; }; +static inline bool nfsd4_has_session(struct nfsd4_compound_state *cs) +{ + return cs->slot != NULL; +} + struct nfsd4_change_info { u32 atomic; u32 before_ctime_sec; @@ -536,7 +541,8 @@ extern __be32 nfsd4_sequence(struct svc_rqst *, extern __be32 nfsd4_destroy_session(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_destroy_session *); -extern __be32 nfsd4_process_open1(struct nfsd4_open *open); +extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *, + struct nfsd4_open *open); extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open); extern __be32 nfsd4_open_confirm(struct svc_rqst *rqstp, -- cgit v1.2.3 From d87a8ade95288f28c729e076cd74929f3f199b6c Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Fri, 3 Apr 2009 08:28:53 +0300 Subject: nfsd41: access_valid For nfs41, the open share flags are used also for delegation "wants" and "signals". Check that they are valid. Signed-off-by: Benny Halevy Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4proc.c | 1 + fs/nfsd/nfs4state.c | 20 +++++++++++++++----- include/linux/nfsd/xdr4.h | 1 + 3 files changed, 17 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 3fdcca53212a..db208dd8fdca 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -910,6 +910,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, resp->tag = args->tag; resp->opcnt = 0; resp->rqstp = rqstp; + resp->cstate.minorversion = args->minorversion; resp->cstate.replay_owner = NULL; fh_init(&resp->cstate.current_fh, NFS4_FHSIZE); fh_init(&resp->cstate.save_fh, NFS4_FHSIZE); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index d227f85b5ed2..374aaf3808e8 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1943,11 +1943,21 @@ find_file(struct inode *ino) return NULL; } -static inline int access_valid(u32 x) +static inline int access_valid(u32 x, u32 minorversion) { - if (x < NFS4_SHARE_ACCESS_READ) + if ((x & NFS4_SHARE_ACCESS_MASK) < NFS4_SHARE_ACCESS_READ) return 0; - if (x > NFS4_SHARE_ACCESS_BOTH) + if ((x & NFS4_SHARE_ACCESS_MASK) > NFS4_SHARE_ACCESS_BOTH) + return 0; + x &= ~NFS4_SHARE_ACCESS_MASK; + if (minorversion && x) { + if ((x & NFS4_SHARE_WANT_MASK) > NFS4_SHARE_WANT_CANCEL) + return 0; + if ((x & NFS4_SHARE_WHEN_MASK) > NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED) + return 0; + x &= ~(NFS4_SHARE_WANT_MASK | NFS4_SHARE_WHEN_MASK); + } + if (x) return 0; return 1; } @@ -2495,7 +2505,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf __be32 status; status = nfserr_inval; - if (!access_valid(open->op_share_access) + if (!access_valid(open->op_share_access, resp->cstate.minorversion) || !deny_valid(open->op_share_deny)) goto out; /* @@ -3104,7 +3114,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, (int)cstate->current_fh.fh_dentry->d_name.len, cstate->current_fh.fh_dentry->d_name.name); - if (!access_valid(od->od_share_access) + if (!access_valid(od->od_share_access, cstate->minorversion) || !deny_valid(od->od_share_deny)) return nfserr_inval; diff --git a/include/linux/nfsd/xdr4.h b/include/linux/nfsd/xdr4.h index b8b3dcba28cc..5e16935a1eaf 100644 --- a/include/linux/nfsd/xdr4.h +++ b/include/linux/nfsd/xdr4.h @@ -53,6 +53,7 @@ struct nfsd4_compound_state { struct nfsd4_slot *slot; __be32 *statp; size_t iovlen; + u32 minorversion; u32 status; }; -- cgit v1.2.3 From 8daf220a6a83c47b9648c28bb819c14c60bad7f9 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Fri, 3 Apr 2009 08:28:59 +0300 Subject: nfsd41: control nfsv4.1 svc via /proc/fs/nfsd/versions Support enabling and disabling nfsv4.1 via /proc/fs/nfsd/versions by writing the strings "+4.1" or "-4.1" correspondingly. Use user mode nfs-utils (rpc.nfsd option) to enable. This will allow us to get rid of CONFIG_NFSD_V4_1 [nfsd41: disable support for minorversion by default] Signed-off-by: Benny Halevy Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 2 +- fs/nfsd/nfsctl.c | 26 +++++++++++++++++++++++--- fs/nfsd/nfssvc.c | 24 ++++++++++++++++++++++++ include/linux/nfsd/nfsd.h | 2 ++ 4 files changed, 50 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index db208dd8fdca..52bb14debb30 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -921,7 +921,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, * According to RFC3010, this takes precedence over all other errors. 
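The widened access_valid above treats the v4.1 share_access word as three packed fields: the classic access bits plus delegation "want" and "when" bits that are only legal on minorversion 1. A user-space sketch of that validation; the mask values here are assumptions for illustration, the authoritative ones live in the NFSv4 headers:

#include <stdio.h>

/* Mask layout assumed for illustration; see the NFSv4 headers for the real values. */
#define SHARE_ACCESS_MASK	0x0000000F
#define SHARE_ACCESS_READ	0x00000001
#define SHARE_ACCESS_BOTH	0x00000003
#define SHARE_WANT_MASK		0x0000FF00
#define SHARE_WANT_CANCEL	0x00000500
#define SHARE_WHEN_MASK		0x000F0000
#define SHARE_PUSH_DELEG	0x00020000

static int access_valid_sketch(unsigned int x, unsigned int minorversion)
{
	if ((x & SHARE_ACCESS_MASK) < SHARE_ACCESS_READ)
		return 0;
	if ((x & SHARE_ACCESS_MASK) > SHARE_ACCESS_BOTH)
		return 0;
	x &= ~SHARE_ACCESS_MASK;
	if (minorversion && x) {
		/* v4.1 may also carry delegation "want" and "when" bits */
		if ((x & SHARE_WANT_MASK) > SHARE_WANT_CANCEL)
			return 0;
		if ((x & SHARE_WHEN_MASK) > SHARE_PUSH_DELEG)
			return 0;
		x &= ~(SHARE_WANT_MASK | SHARE_WHEN_MASK);
	}
	return x == 0;	/* any leftover bits are invalid */
}

int main(void)
{
	printf("%d %d\n",
	       access_valid_sketch(SHARE_ACCESS_BOTH, 1),		/* 1 */
	       access_valid_sketch(SHARE_ACCESS_READ | 0x100, 0));	/* 0: want bits need v4.1 */
	return 0;
}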
*/ status = nfserr_minor_vers_mismatch; - if (args->minorversion > NFSD_SUPPORTED_MINOR_VERSION) + if (args->minorversion > nfsd_supported_minorversion) goto out; if (!nfs41_op_ordering_ok(args)) { diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 4adebb6312c4..a9b8c75bf0bf 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -792,8 +792,9 @@ out_free: static ssize_t __write_versions(struct file *file, char *buf, size_t size) { char *mesg = buf; - char *vers, sign; + char *vers, *minorp, sign; int len, num; + unsigned minor; ssize_t tlen = 0; char *sep; @@ -814,9 +815,20 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) do { sign = *vers; if (sign == '+' || sign == '-') - num = simple_strtol((vers+1), NULL, 0); + num = simple_strtol((vers+1), &minorp, 0); else - num = simple_strtol(vers, NULL, 0); + num = simple_strtol(vers, &minorp, 0); + if (*minorp == '.') { + if (num < 4) + return -EINVAL; + minor = simple_strtoul(minorp+1, NULL, 0); + if (minor == 0) + return -EINVAL; + if (nfsd_minorversion(minor, sign == '-' ? + NFSD_CLEAR : NFSD_SET) < 0) + return -EINVAL; + goto next; + } switch(num) { case 2: case 3: @@ -826,6 +838,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) default: return -EINVAL; } + next: vers += len + 1; tlen += len; } while ((len = qword_get(&mesg, vers, size)) > 0); @@ -844,6 +857,13 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) num); sep = " "; } + if (nfsd_vers(4, NFSD_AVAIL)) + for (minor = 1; minor <= NFSD_SUPPORTED_MINOR_VERSION; minor++) + len += sprintf(buf+len, " %c4.%u", + (nfsd_vers(4, NFSD_TEST) && + nfsd_minorversion(minor, NFSD_TEST)) ? + '+' : '-', + minor); len += sprintf(buf+len, "\n"); return len; } diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index b53a098e97a4..e9d57734a348 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -121,6 +121,8 @@ struct svc_program nfsd_program = { }; +u32 nfsd_supported_minorversion; + int nfsd_vers(int vers, enum vers_op change) { if (vers < NFSD_MINVERS || vers >= NFSD_NRVERS) @@ -147,6 +149,28 @@ int nfsd_vers(int vers, enum vers_op change) } return 0; } + +int nfsd_minorversion(u32 minorversion, enum vers_op change) +{ + if (minorversion > NFSD_SUPPORTED_MINOR_VERSION) + return -1; + switch(change) { + case NFSD_SET: + nfsd_supported_minorversion = minorversion; + break; + case NFSD_CLEAR: + if (minorversion == 0) + return -1; + nfsd_supported_minorversion = minorversion - 1; + break; + case NFSD_TEST: + return minorversion <= nfsd_supported_minorversion; + case NFSD_AVAIL: + return minorversion <= NFSD_SUPPORTED_MINOR_VERSION; + } + return 0; +} + /* * Maximum number of nfsd processes */ diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h index 1f063d495159..70339b7b83c0 100644 --- a/include/linux/nfsd/nfsd.h +++ b/include/linux/nfsd/nfsd.h @@ -53,6 +53,7 @@ typedef int (*nfsd_dirop_t)(struct inode *, struct dentry *, int, int); extern struct svc_program nfsd_program; extern struct svc_version nfsd_version2, nfsd_version3, nfsd_version4; +extern u32 nfsd_supported_minorversion; extern struct mutex nfsd_mutex; extern struct svc_serv *nfsd_serv; @@ -149,6 +150,7 @@ int nfsd_set_posix_acl(struct svc_fh *, int, struct posix_acl *); enum vers_op {NFSD_SET, NFSD_CLEAR, NFSD_TEST, NFSD_AVAIL }; int nfsd_vers(int vers, enum vers_op change); +int nfsd_minorversion(u32 minorversion, enum vers_op change); void nfsd_reset_versions(void); int nfsd_create_serv(void); -- cgit v1.2.3 From 
f3ec22b5b0b4b220ee600bf1a8b6dc045b4e72bd Mon Sep 17 00:00:00 2001 From: Marc Eshel Date: Fri, 3 Apr 2009 08:29:02 +0300 Subject: nfsd41: provide support for minor version 1 at rpc level Signed-off-by: Benny Halevy Signed-off-by: J. Bruce Fields --- include/linux/nfsd/nfsd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h index 70339b7b83c0..5bed5096874d 100644 --- a/include/linux/nfsd/nfsd.h +++ b/include/linux/nfsd/nfsd.h @@ -23,7 +23,7 @@ /* * nfsd version */ -#define NFSD_SUPPORTED_MINOR_VERSION 0 +#define NFSD_SUPPORTED_MINOR_VERSION 1 /* * Flags for nfsd_permission -- cgit v1.2.3 From 7e70570647827345352cf6c17461c9fa166f570a Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Fri, 3 Apr 2009 08:29:11 +0300 Subject: nfsd41: support for 3-word long attribute bitmask Also, use client minorversion to generate supported attrs Signed-off-by: Benny Halevy Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 15 +++++++----- fs/nfsd/nfs4xdr.c | 59 +++++++++++++++++++++++++++++++++++++---------- include/linux/nfsd/nfsd.h | 34 +++++++++++++++++++++++++-- include/linux/nfsd/xdr4.h | 12 +++++----- 4 files changed, 94 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 937853f62834..e206053433e3 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -463,8 +463,9 @@ nfsd4_getattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (getattr->ga_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1) return nfserr_inval; - getattr->ga_bmval[0] &= NFSD_SUPPORTED_ATTRS_WORD0; - getattr->ga_bmval[1] &= NFSD_SUPPORTED_ATTRS_WORD1; + getattr->ga_bmval[0] &= nfsd_suppattrs0(cstate->minorversion); + getattr->ga_bmval[1] &= nfsd_suppattrs1(cstate->minorversion); + getattr->ga_bmval[2] &= nfsd_suppattrs2(cstate->minorversion); getattr->ga_fhp = &cstate->current_fh; return nfs_ok; @@ -555,8 +556,9 @@ nfsd4_readdir(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (readdir->rd_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1) return nfserr_inval; - readdir->rd_bmval[0] &= NFSD_SUPPORTED_ATTRS_WORD0; - readdir->rd_bmval[1] &= NFSD_SUPPORTED_ATTRS_WORD1; + readdir->rd_bmval[0] &= nfsd_suppattrs0(cstate->minorversion); + readdir->rd_bmval[1] &= nfsd_suppattrs1(cstate->minorversion); + readdir->rd_bmval[2] &= nfsd_suppattrs2(cstate->minorversion); if ((cookie > ~(u32)0) || (cookie == 1) || (cookie == 2) || (cookie == 0 && memcmp(readdir->rd_verf.data, zeroverf.data, NFS4_VERIFIER_SIZE))) @@ -746,8 +748,9 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (status) return status; - if ((verify->ve_bmval[0] & ~NFSD_SUPPORTED_ATTRS_WORD0) - || (verify->ve_bmval[1] & ~NFSD_SUPPORTED_ATTRS_WORD1)) + if ((verify->ve_bmval[0] & ~nfsd_suppattrs0(cstate->minorversion)) + || (verify->ve_bmval[1] & ~nfsd_suppattrs1(cstate->minorversion)) + || (verify->ve_bmval[2] & ~nfsd_suppattrs2(cstate->minorversion))) return nfserr_attrnotsupp; if ((verify->ve_bmval[0] & FATTR4_WORD0_RDATTR_ERROR) || (verify->ve_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1)) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 42f9fb661006..70296fedee41 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -236,6 +236,7 @@ nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval) bmval[0] = 0; bmval[1] = 0; + bmval[2] = 0; READ_BUF(4); READ32(bmlen); @@ -247,13 +248,16 @@ nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval) 
READ32(bmval[0]); if (bmlen > 1) READ32(bmval[1]); + if (bmlen > 2) + READ32(bmval[2]); DECODE_TAIL; } static u32 nfsd_attrmask[] = { NFSD_WRITEABLE_ATTRS_WORD0, - NFSD_WRITEABLE_ATTRS_WORD1 + NFSD_WRITEABLE_ATTRS_WORD1, + NFSD_WRITEABLE_ATTRS_WORD2 }; static __be32 @@ -274,9 +278,12 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, u32 *writable, * According to spec, unsupported attributes return ERR_ATTRNOTSUPP; * read-only attributes return ERR_INVAL. */ - if ((bmval[0] & ~NFSD_SUPPORTED_ATTRS_WORD0) || (bmval[1] & ~NFSD_SUPPORTED_ATTRS_WORD1)) + if ((bmval[0] & ~nfsd_suppattrs0(argp->minorversion)) || + (bmval[1] & ~nfsd_suppattrs1(argp->minorversion)) || + (bmval[2] & ~nfsd_suppattrs2(argp->minorversion))) return nfserr_attrnotsupp; - if ((bmval[0] & ~writable[0]) || (bmval[1] & ~writable[1])) + if ((bmval[0] & ~writable[0]) || (bmval[1] & ~writable[1]) || + (bmval[2] & ~writable[2])) return nfserr_inval; READ_BUF(4); @@ -411,6 +418,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, u32 *writable, goto xdr_error; } } + BUG_ON(bmval[2]); /* no such writeable attr supported yet */ if (len != expected_len) goto xdr_error; @@ -1726,6 +1734,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, { u32 bmval0 = bmval[0]; u32 bmval1 = bmval[1]; + u32 bmval2 = bmval[2]; struct kstat stat; struct svc_fh tempfh; struct kstatfs statfs; @@ -1739,12 +1748,16 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, int err; int aclsupport = 0; struct nfs4_acl *acl = NULL; + struct nfsd4_compoundres *resp = rqstp->rq_resp; + u32 minorversion = resp->cstate.minorversion; BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1); - BUG_ON(bmval0 & ~NFSD_SUPPORTED_ATTRS_WORD0); - BUG_ON(bmval1 & ~NFSD_SUPPORTED_ATTRS_WORD1); + BUG_ON(bmval0 & ~nfsd_suppattrs0(minorversion)); + BUG_ON(bmval1 & ~nfsd_suppattrs1(minorversion)); + BUG_ON(bmval2 & ~nfsd_suppattrs2(minorversion)); if (exp->ex_fslocs.migrated) { + BUG_ON(bmval[2]); status = fattr_handle_absent_fs(&bmval0, &bmval1, &rdattr_err); if (status) goto out; @@ -1790,22 +1803,42 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, if ((buflen -= 16) < 0) goto out_resource; - WRITE32(2); - WRITE32(bmval0); - WRITE32(bmval1); + if (unlikely(bmval2)) { + WRITE32(3); + WRITE32(bmval0); + WRITE32(bmval1); + WRITE32(bmval2); + } else if (likely(bmval1)) { + WRITE32(2); + WRITE32(bmval0); + WRITE32(bmval1); + } else { + WRITE32(1); + WRITE32(bmval0); + } attrlenp = p++; /* to be backfilled later */ if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) { - u32 word0 = NFSD_SUPPORTED_ATTRS_WORD0; + u32 word0 = nfsd_suppattrs0(minorversion); + u32 word1 = nfsd_suppattrs1(minorversion); + u32 word2 = nfsd_suppattrs2(minorversion); + if ((buflen -= 12) < 0) goto out_resource; if (!aclsupport) word0 &= ~FATTR4_WORD0_ACL; if (!exp->ex_fslocs.locations) word0 &= ~FATTR4_WORD0_FS_LOCATIONS; - WRITE32(2); - WRITE32(word0); - WRITE32(NFSD_SUPPORTED_ATTRS_WORD1); + if (!word2) { + WRITE32(2); + WRITE32(word0); + WRITE32(word1); + } else { + WRITE32(3); + WRITE32(word0); + WRITE32(word1); + WRITE32(word2); + } } if (bmval0 & FATTR4_WORD0_TYPE) { if ((buflen -= 4) < 0) @@ -2115,6 +2148,8 @@ out_acl: } WRITE64(stat.ino); } + BUG_ON(bmval2); /* FIXME: not implemented yet */ + *attrlenp = htonl((char *)p - (char *)attrlenp - 4); *countp = p - buffer; status = nfs_ok; diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h index 5bed5096874d..69ca788f8fc5 100644 --- a/include/linux/nfsd/nfsd.h +++ 
b/include/linux/nfsd/nfsd.h @@ -346,7 +346,7 @@ extern struct timeval nfssvc_boot; * TIME_BACKUP (unlikely to be supported any time soon) * TIME_CREATE (unlikely to be supported any time soon) */ -#define NFSD_SUPPORTED_ATTRS_WORD0 \ +#define NFSD4_SUPPORTED_ATTRS_WORD0 \ (FATTR4_WORD0_SUPPORTED_ATTRS | FATTR4_WORD0_TYPE | FATTR4_WORD0_FH_EXPIRE_TYPE \ | FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE | FATTR4_WORD0_LINK_SUPPORT \ | FATTR4_WORD0_SYMLINK_SUPPORT | FATTR4_WORD0_NAMED_ATTR | FATTR4_WORD0_FSID \ @@ -358,7 +358,7 @@ extern struct timeval nfssvc_boot; | FATTR4_WORD0_MAXFILESIZE | FATTR4_WORD0_MAXLINK | FATTR4_WORD0_MAXNAME \ | FATTR4_WORD0_MAXREAD | FATTR4_WORD0_MAXWRITE | FATTR4_WORD0_ACL) -#define NFSD_SUPPORTED_ATTRS_WORD1 \ +#define NFSD4_SUPPORTED_ATTRS_WORD1 \ (FATTR4_WORD1_MODE | FATTR4_WORD1_NO_TRUNC | FATTR4_WORD1_NUMLINKS \ | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP | FATTR4_WORD1_RAWDEV \ | FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE | FATTR4_WORD1_SPACE_TOTAL \ @@ -366,6 +366,35 @@ extern struct timeval nfssvc_boot; | FATTR4_WORD1_TIME_DELTA | FATTR4_WORD1_TIME_METADATA \ | FATTR4_WORD1_TIME_MODIFY | FATTR4_WORD1_TIME_MODIFY_SET | FATTR4_WORD1_MOUNTED_ON_FILEID) +#define NFSD4_SUPPORTED_ATTRS_WORD2 0 + +#define NFSD4_1_SUPPORTED_ATTRS_WORD0 \ + NFSD4_SUPPORTED_ATTRS_WORD0 + +#define NFSD4_1_SUPPORTED_ATTRS_WORD1 \ + NFSD4_SUPPORTED_ATTRS_WORD1 + +#define NFSD4_1_SUPPORTED_ATTRS_WORD2 \ + NFSD4_SUPPORTED_ATTRS_WORD2 + +static inline u32 nfsd_suppattrs0(u32 minorversion) +{ + return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD0 + : NFSD4_SUPPORTED_ATTRS_WORD0; +} + +static inline u32 nfsd_suppattrs1(u32 minorversion) +{ + return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD1 + : NFSD4_SUPPORTED_ATTRS_WORD1; +} + +static inline u32 nfsd_suppattrs2(u32 minorversion) +{ + return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD2 + : NFSD4_SUPPORTED_ATTRS_WORD2; +} + /* These will return ERR_INVAL if specified in GETATTR or READDIR. 
*/ #define NFSD_WRITEONLY_ATTRS_WORD1 \ (FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET) @@ -376,6 +405,7 @@ extern struct timeval nfssvc_boot; #define NFSD_WRITEABLE_ATTRS_WORD1 \ (FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \ | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET) +#define NFSD_WRITEABLE_ATTRS_WORD2 0 #endif /* CONFIG_NFSD_V4 */ diff --git a/include/linux/nfsd/xdr4.h b/include/linux/nfsd/xdr4.h index 5e16935a1eaf..c07d8fe7bb07 100644 --- a/include/linux/nfsd/xdr4.h +++ b/include/linux/nfsd/xdr4.h @@ -102,7 +102,7 @@ struct nfsd4_create { u32 specdata2; } dev; /* NF4BLK, NF4CHR */ } u; - u32 cr_bmval[2]; /* request */ + u32 cr_bmval[3]; /* request */ struct iattr cr_iattr; /* request */ struct nfsd4_change_info cr_cinfo; /* response */ struct nfs4_acl *cr_acl; @@ -117,7 +117,7 @@ struct nfsd4_delegreturn { }; struct nfsd4_getattr { - u32 ga_bmval[2]; /* request */ + u32 ga_bmval[3]; /* request */ struct svc_fh *ga_fhp; /* response */ }; @@ -218,7 +218,7 @@ struct nfsd4_open { stateid_t op_delegate_stateid; /* request - response */ u32 op_create; /* request */ u32 op_createmode; /* request */ - u32 op_bmval[2]; /* request */ + u32 op_bmval[3]; /* request */ union { /* request */ struct iattr iattr; /* UNCHECKED4,GUARDED4 */ nfs4_verifier verf; /* EXCLUSIVE4 */ @@ -271,7 +271,7 @@ struct nfsd4_readdir { nfs4_verifier rd_verf; /* request */ u32 rd_dircount; /* request */ u32 rd_maxcount; /* request */ - u32 rd_bmval[2]; /* request */ + u32 rd_bmval[3]; /* request */ struct svc_rqst *rd_rqstp; /* response */ struct svc_fh * rd_fhp; /* response */ @@ -313,7 +313,7 @@ struct nfsd4_secinfo { struct nfsd4_setattr { stateid_t sa_stateid; /* request */ - u32 sa_bmval[2]; /* request */ + u32 sa_bmval[3]; /* request */ struct iattr sa_iattr; /* request */ struct nfs4_acl *sa_acl; }; @@ -339,7 +339,7 @@ struct nfsd4_setclientid_confirm { /* also used for NVERIFY */ struct nfsd4_verify { - u32 ve_bmval[2]; /* request */ + u32 ve_bmval[3]; /* request */ u32 ve_attrlen; /* request */ char * ve_attrval; /* request */ }; -- cgit v1.2.3 From 8c18f2052e756e7d5dea712fc6e7ed70c00e8a39 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Fri, 3 Apr 2009 08:29:14 +0300 Subject: nfsd41: SUPPATTR_EXCLCREAT attribute Return bitmask for supported EXCLUSIVE4_1 create attributes. Signed-off-by: Benny Halevy Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4xdr.c | 7 ++++++- include/linux/nfs4.h | 2 ++ include/linux/nfsd/nfsd.h | 14 +++++++++++++- 3 files changed, 21 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 70296fedee41..533d14fec99e 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2148,7 +2148,12 @@ out_acl: } WRITE64(stat.ino); } - BUG_ON(bmval2); /* FIXME: not implemented yet */ + if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) { + WRITE32(3); + WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD0); + WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD1); + WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD2); + } *attrlenp = htonl((char *)p - (char *)attrlenp - 4); *countp = p - buffer; diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 29ed600e193b..ec3cd49b04f5 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -404,6 +404,8 @@ enum lock_type4 { #define FATTR4_WORD0_UNIQUE_HANDLES (1UL << 9) #define FATTR4_WORD0_LEASE_TIME (1UL << 10) #define FATTR4_WORD0_RDATTR_ERROR (1UL << 11) +/* Mandatory in NFSv4.1 */ +#define FATTR4_WORD2_SUPPATTR_EXCLCREAT (1UL << 11) /* Recommended Attributes */ #define FATTR4_WORD0_ACL (1UL << 12) diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h index 69ca788f8fc5..0ec4d142c503 100644 --- a/include/linux/nfsd/nfsd.h +++ b/include/linux/nfsd/nfsd.h @@ -375,7 +375,7 @@ extern struct timeval nfssvc_boot; NFSD4_SUPPORTED_ATTRS_WORD1 #define NFSD4_1_SUPPORTED_ATTRS_WORD2 \ - NFSD4_SUPPORTED_ATTRS_WORD2 + (NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT) static inline u32 nfsd_suppattrs0(u32 minorversion) { @@ -407,6 +407,18 @@ static inline u32 nfsd_suppattrs2(u32 minorversion) | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET) #define NFSD_WRITEABLE_ATTRS_WORD2 0 +#define NFSD_SUPPATTR_EXCLCREAT_WORD0 \ + NFSD_WRITEABLE_ATTRS_WORD0 +/* + * we currently store the exclusive create verifier in the v_{a,m}time + * attributes so the client can't set these at create time using EXCLUSIVE4_1 + */ +#define NFSD_SUPPATTR_EXCLCREAT_WORD1 \ + (NFSD_WRITEABLE_ATTRS_WORD1 & \ + ~(FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET)) +#define NFSD_SUPPATTR_EXCLCREAT_WORD2 \ + NFSD_WRITEABLE_ATTRS_WORD2 + #endif /* CONFIG_NFSD_V4 */ #endif /* LINUX_NFSD_NFSD_H */ -- cgit v1.2.3 From 79fb54abd285b442e1f30f851902f3ddf58e7704 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Fri, 3 Apr 2009 08:29:17 +0300 Subject: nfsd41: CREATE_EXCLUSIVE4_1 Implement the CREATE_EXCLUSIVE4_1 open mode conforming to http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion1-26 This mode allows the client to atomically create a file if it doesn't exist while setting some of its attributes. It must be implemented if the server supports persistent reply cache and/or pnfs. Signed-off-by: Benny Halevy Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 15 +++++++++++++++ fs/nfsd/nfs4xdr.c | 17 +++++++++++++++++ include/linux/nfs4.h | 8 +++++++- include/linux/nfsd/xdr4.h | 10 ++++------ 4 files changed, 43 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index e206053433e3..b2883e9c6381 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -93,6 +93,21 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o open->op_truncate = 0; if (open->op_create) { + /* FIXME: check session persistence and pnfs flags. 
+ * The nfsv4.1 spec requires the following semantics: + * + * Persistent | pNFS | Server REQUIRED | Client Allowed + * Reply Cache | server | | + * -------------+--------+-----------------+-------------------- + * no | no | EXCLUSIVE4_1 | EXCLUSIVE4_1 + * | | | (SHOULD) + * | | and EXCLUSIVE4 | or EXCLUSIVE4 + * | | | (SHOULD NOT) + * no | yes | EXCLUSIVE4_1 | EXCLUSIVE4_1 + * yes | no | GUARDED4 | GUARDED4 + * yes | yes | GUARDED4 | GUARDED4 + */ + /* * Note: create modes (UNCHECKED,GUARDED...) are the same * in NFSv4 as in v3. diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 533d14fec99e..b820c311931c 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -260,6 +260,12 @@ static u32 nfsd_attrmask[] = { NFSD_WRITEABLE_ATTRS_WORD2 }; +static u32 nfsd41_ex_attrmask[] = { + NFSD_SUPPATTR_EXCLCREAT_WORD0, + NFSD_SUPPATTR_EXCLCREAT_WORD1, + NFSD_SUPPATTR_EXCLCREAT_WORD2 +}; + static __be32 nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, u32 *writable, struct iattr *iattr, struct nfs4_acl **acl) @@ -684,6 +690,17 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) READ_BUF(8); COPYMEM(open->op_verf.data, 8); break; + case NFS4_CREATE_EXCLUSIVE4_1: + if (argp->minorversion < 1) + goto xdr_error; + READ_BUF(8); + COPYMEM(open->op_verf.data, 8); + status = nfsd4_decode_fattr(argp, open->op_bmval, + nfsd41_ex_attrmask, &open->op_iattr, + &open->op_acl); + if (status) + goto out; + break; default: goto xdr_error; } diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index ec3cd49b04f5..e3f0cbcbd0db 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -368,7 +368,13 @@ enum opentype4 { enum createmode4 { NFS4_CREATE_UNCHECKED = 0, NFS4_CREATE_GUARDED = 1, - NFS4_CREATE_EXCLUSIVE = 2 + NFS4_CREATE_EXCLUSIVE = 2, + /* + * New to NFSv4.1. If session is persistent, + * GUARDED4 MUST be used. Otherwise, use + * EXCLUSIVE4_1 instead of EXCLUSIVE4. + */ + NFS4_CREATE_EXCLUSIVE4_1 = 3 }; enum limit_by4 { diff --git a/include/linux/nfsd/xdr4.h b/include/linux/nfsd/xdr4.h index c07d8fe7bb07..f80d6013fdc3 100644 --- a/include/linux/nfsd/xdr4.h +++ b/include/linux/nfsd/xdr4.h @@ -219,10 +219,8 @@ struct nfsd4_open { u32 op_create; /* request */ u32 op_createmode; /* request */ u32 op_bmval[3]; /* request */ - union { /* request */ - struct iattr iattr; /* UNCHECKED4,GUARDED4 */ - nfs4_verifier verf; /* EXCLUSIVE4 */ - } u; + struct iattr iattr; /* UNCHECKED4, GUARDED4, EXCLUSIVE4_1 */ + nfs4_verifier verf; /* EXCLUSIVE4 */ clientid_t op_clientid; /* request */ struct xdr_netobj op_owner; /* request */ u32 op_seqid; /* request */ @@ -236,8 +234,8 @@ struct nfsd4_open { struct nfs4_stateowner *op_stateowner; /* used during processing */ struct nfs4_acl *op_acl; }; -#define op_iattr u.iattr -#define op_verf u.verf +#define op_iattr iattr +#define op_verf verf struct nfsd4_open_confirm { stateid_t oc_req_stateid /* request */; -- cgit v1.2.3 From 276dbf997043cbf38f0087624e0f9c51742c8885 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 4 Apr 2009 01:45:37 +0100 Subject: intel-iommu: Handle PCI domains appropriately. We were comparing {bus,devfn} and assuming that a match meant it was the same device. It doesn't -- the same {bus,devfn} can exist in multiple PCI domains. Include domain number in device identification (and call it 'segment' in most places, because there's already a lot of references to 'domain' which means something else, and this code is infected with ACPI thinking already). 
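To make the change concrete: the device identity the patch switches to is {segment, bus, devfn} rather than just {bus, devfn}. A minimal sketch of that comparison follows (the struct and helper names are invented for illustration and are not the driver's actual types; pci_domain_nr() is the existing kernel helper used in the diff below):

    struct pci_dev_id {                 /* illustrative only */
            int segment;                /* PCI domain, e.g. pci_domain_nr(pdev->bus) */
            u8  bus;
            u8  devfn;
    };

    static int same_pci_device(const struct pci_dev_id *a,
                               const struct pci_dev_id *b)
    {
            /* two devices in different segments may share bus/devfn,
             * so the segment must be part of the match */
            return a->segment == b->segment &&
                   a->bus == b->bus && a->devfn == b->devfn;
    }
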
Signed-off-by: David Woodhouse --- drivers/pci/dmar.c | 1 + drivers/pci/intel-iommu.c | 76 ++++++++++++++++++++++++++++++----------------- include/linux/dmar.h | 1 + 3 files changed, 50 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c index 3fbe6af7ad71..25a00ce4f24d 100644 --- a/drivers/pci/dmar.c +++ b/drivers/pci/dmar.c @@ -180,6 +180,7 @@ dmar_parse_one_drhd(struct acpi_dmar_header *header) dmaru->hdr = header; drhd = (struct acpi_dmar_hardware_unit *)header; dmaru->reg_base_addr = drhd->address; + dmaru->segment = drhd->segment; dmaru->include_all = drhd->flags & 0x1; /* BIT0: INCLUDE_ALL */ ret = alloc_iommu(dmaru); diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 6262c198e56e..fd7472f28674 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -248,7 +248,8 @@ struct dmar_domain { struct device_domain_info { struct list_head link; /* link to domain siblings */ struct list_head global; /* link to global list */ - u8 bus; /* PCI bus numer */ + int segment; /* PCI domain */ + u8 bus; /* PCI bus number */ u8 devfn; /* PCI devfn number */ struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */ struct dmar_domain *domain; /* pointer to domain */ @@ -468,7 +469,7 @@ static void domain_update_iommu_cap(struct dmar_domain *domain) domain_update_iommu_snooping(domain); } -static struct intel_iommu *device_to_iommu(u8 bus, u8 devfn) +static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn) { struct dmar_drhd_unit *drhd = NULL; int i; @@ -476,6 +477,8 @@ static struct intel_iommu *device_to_iommu(u8 bus, u8 devfn) for_each_drhd_unit(drhd) { if (drhd->ignored) continue; + if (segment != drhd->segment) + continue; for (i = 0; i < drhd->devices_cnt; i++) { if (drhd->devices[i] && @@ -1318,7 +1321,7 @@ static void domain_exit(struct dmar_domain *domain) } static int domain_context_mapping_one(struct dmar_domain *domain, - u8 bus, u8 devfn) + int segment, u8 bus, u8 devfn) { struct context_entry *context; unsigned long flags; @@ -1333,7 +1336,7 @@ static int domain_context_mapping_one(struct dmar_domain *domain, bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); BUG_ON(!domain->pgd); - iommu = device_to_iommu(bus, devfn); + iommu = device_to_iommu(segment, bus, devfn); if (!iommu) return -ENODEV; @@ -1423,8 +1426,8 @@ domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev) int ret; struct pci_dev *tmp, *parent; - ret = domain_context_mapping_one(domain, pdev->bus->number, - pdev->devfn); + ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus), + pdev->bus->number, pdev->devfn); if (ret) return ret; @@ -1435,18 +1438,23 @@ domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev) /* Secondary interface's bus number and devfn 0 */ parent = pdev->bus->self; while (parent != tmp) { - ret = domain_context_mapping_one(domain, parent->bus->number, - parent->devfn); + ret = domain_context_mapping_one(domain, + pci_domain_nr(parent->bus), + parent->bus->number, + parent->devfn); if (ret) return ret; parent = parent->bus->self; } if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */ return domain_context_mapping_one(domain, - tmp->subordinate->number, 0); + pci_domain_nr(tmp->subordinate), + tmp->subordinate->number, 0); else /* this is a legacy PCI bridge */ return domain_context_mapping_one(domain, - tmp->bus->number, tmp->devfn); + pci_domain_nr(tmp->bus), + tmp->bus->number, + tmp->devfn); } static int domain_context_mapped(struct pci_dev *pdev) @@ 
-1455,12 +1463,12 @@ static int domain_context_mapped(struct pci_dev *pdev) struct pci_dev *tmp, *parent; struct intel_iommu *iommu; - iommu = device_to_iommu(pdev->bus->number, pdev->devfn); + iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number, + pdev->devfn); if (!iommu) return -ENODEV; - ret = device_context_mapped(iommu, - pdev->bus->number, pdev->devfn); + ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn); if (!ret) return ret; /* dependent device mapping */ @@ -1471,17 +1479,17 @@ static int domain_context_mapped(struct pci_dev *pdev) parent = pdev->bus->self; while (parent != tmp) { ret = device_context_mapped(iommu, parent->bus->number, - parent->devfn); + parent->devfn); if (!ret) return ret; parent = parent->bus->self; } if (tmp->is_pcie) - return device_context_mapped(iommu, - tmp->subordinate->number, 0); + return device_context_mapped(iommu, tmp->subordinate->number, + 0); else - return device_context_mapped(iommu, - tmp->bus->number, tmp->devfn); + return device_context_mapped(iommu, tmp->bus->number, + tmp->devfn); } static int @@ -1548,7 +1556,7 @@ static void domain_remove_dev_info(struct dmar_domain *domain) info->dev->dev.archdata.iommu = NULL; spin_unlock_irqrestore(&device_domain_lock, flags); - iommu = device_to_iommu(info->bus, info->devfn); + iommu = device_to_iommu(info->segment, info->bus, info->devfn); iommu_detach_dev(iommu, info->bus, info->devfn); free_devinfo_mem(info); @@ -1583,11 +1591,14 @@ static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw) struct pci_dev *dev_tmp; unsigned long flags; int bus = 0, devfn = 0; + int segment; domain = find_domain(pdev); if (domain) return domain; + segment = pci_domain_nr(pdev->bus); + dev_tmp = pci_find_upstream_pcie_bridge(pdev); if (dev_tmp) { if (dev_tmp->is_pcie) { @@ -1599,7 +1610,8 @@ static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw) } spin_lock_irqsave(&device_domain_lock, flags); list_for_each_entry(info, &device_domain_list, global) { - if (info->bus == bus && info->devfn == devfn) { + if (info->segment == segment && + info->bus == bus && info->devfn == devfn) { found = info->domain; break; } @@ -1637,6 +1649,7 @@ static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw) domain_exit(domain); goto error; } + info->segment = segment; info->bus = bus; info->devfn = devfn; info->dev = NULL; @@ -1648,7 +1661,8 @@ static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw) found = NULL; spin_lock_irqsave(&device_domain_lock, flags); list_for_each_entry(tmp, &device_domain_list, global) { - if (tmp->bus == bus && tmp->devfn == devfn) { + if (tmp->segment == segment && + tmp->bus == bus && tmp->devfn == devfn) { found = tmp->domain; break; } @@ -1668,6 +1682,7 @@ found_domain: info = alloc_devinfo_mem(); if (!info) goto error; + info->segment = segment; info->bus = pdev->bus->number; info->devfn = pdev->devfn; info->dev = pdev; @@ -2808,6 +2823,7 @@ static int vm_domain_add_dev_info(struct dmar_domain *domain, if (!info) return -ENOMEM; + info->segment = pci_domain_nr(pdev->bus); info->bus = pdev->bus->number; info->devfn = pdev->devfn; info->dev = pdev; @@ -2837,15 +2853,15 @@ static void iommu_detach_dependent_devices(struct intel_iommu *iommu, parent = pdev->bus->self; while (parent != tmp) { iommu_detach_dev(iommu, parent->bus->number, - parent->devfn); + parent->devfn); parent = parent->bus->self; } if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */ iommu_detach_dev(iommu, 
tmp->subordinate->number, 0); else /* this is a legacy PCI bridge */ - iommu_detach_dev(iommu, - tmp->bus->number, tmp->devfn); + iommu_detach_dev(iommu, tmp->bus->number, + tmp->devfn); } } @@ -2858,13 +2874,15 @@ static void vm_domain_remove_one_dev_info(struct dmar_domain *domain, int found = 0; struct list_head *entry, *tmp; - iommu = device_to_iommu(pdev->bus->number, pdev->devfn); + iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number, + pdev->devfn); if (!iommu) return; spin_lock_irqsave(&device_domain_lock, flags); list_for_each_safe(entry, tmp, &domain->devices) { info = list_entry(entry, struct device_domain_info, link); + /* No need to compare PCI domain; it has to be the same */ if (info->bus == pdev->bus->number && info->devfn == pdev->devfn) { list_del(&info->link); @@ -2889,7 +2907,8 @@ static void vm_domain_remove_one_dev_info(struct dmar_domain *domain, * owned by this domain, clear this iommu in iommu_bmp * update iommu count and coherency */ - if (device_to_iommu(info->bus, info->devfn) == iommu) + if (iommu == device_to_iommu(info->segment, info->bus, + info->devfn)) found = 1; } @@ -2922,7 +2941,7 @@ static void vm_domain_remove_all_dev_info(struct dmar_domain *domain) spin_unlock_irqrestore(&device_domain_lock, flags1); - iommu = device_to_iommu(info->bus, info->devfn); + iommu = device_to_iommu(info->segment, info->bus, info->devfn); iommu_detach_dev(iommu, info->bus, info->devfn); iommu_detach_dependent_devices(iommu, info->dev); @@ -3110,7 +3129,8 @@ static int intel_iommu_attach_device(struct iommu_domain *domain, } } - iommu = device_to_iommu(pdev->bus->number, pdev->devfn); + iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number, + pdev->devfn); if (!iommu) return -ENODEV; diff --git a/include/linux/dmar.h b/include/linux/dmar.h index 4a0ce6f27e1b..e397dc342cda 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -34,6 +34,7 @@ struct dmar_drhd_unit { u64 reg_base_addr; /* register base address*/ struct pci_dev **devices; /* target device array */ int devices_cnt; /* target device count */ + u16 segment; /* PCI domain */ u8 ignored:1; /* ignore drhd */ u8 include_all:1; struct intel_iommu *iommu; -- cgit v1.2.3 From 1f24b5a8ecbb2a3c7080f418974d40e3ffedb221 Mon Sep 17 00:00:00 2001 From: David Brownell Date: Thu, 26 Mar 2009 00:42:41 -0700 Subject: [MTD] driver model updates Update driver model support in the MTD framework, so it fits better into the current udev-based hotplug framework: - Each mtd_info now has a device node. MTD drivers should set the dev.parent field to point to the physical device, before setting up partitions or otherwise declaring MTDs. - Those device nodes always map to /sys/class/mtdX device nodes, which no longer depend on MTD_CHARDEV. - Those mtdX sysfs nodes have a "starter set" of attributes; it's not yet sufficient to replace /proc/mtd. - Enabling MTD_CHARDEV provides /sys/class/mtdXro/ nodes and the /sys/class/mtd*/dev attributes (for udev, mdev, etc). - Include a MODULE_ALIAS_CHARDEV_MAJOR macro. It'll work with udev creating the /dev/mtd* nodes, not just a static rootfs. So the sysfs structure is pretty much what you'd expect, except that readonly chardev nodes are a bit quirky. 
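As a rough illustration of the first point above -- drivers setting dev.parent before registration -- a hypothetical chip driver's probe would now look something like the sketch below. my_flash_probe and my_mtd are invented names; add_mtd_device() is the existing registration call touched by this patch.

    static struct mtd_info my_mtd;      /* filled in by driver-specific setup */

    static int my_flash_probe(struct platform_device *pdev)
    {
            /* point the MTD at its physical device *before* registering,
             * so the new /sys/class/mtdX node gets the right parent */
            my_mtd.dev.parent = &pdev->dev;

            if (add_mtd_device(&my_mtd))
                    return -ENODEV;
            return 0;
    }
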
Signed-off-by: David Brownell Signed-off-by: David Woodhouse --- drivers/mtd/mtd_blkdevs.c | 1 + drivers/mtd/mtdchar.c | 47 +++---------------- drivers/mtd/mtdcore.c | 115 ++++++++++++++++++++++++++++++++++++++++++++++ drivers/mtd/mtdpart.c | 9 +++- include/linux/mtd/mtd.h | 7 +++ 5 files changed, 137 insertions(+), 42 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index 4109e0bb94f5..a49a9c8f2cb1 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -286,6 +286,7 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new) gd->private_data = new; new->blkcore_priv = gd; gd->queue = tr->blkcore_priv->rq; + gd->driverfs_dev = new->mtd->dev.parent; if (new->readonly) set_disk_ro(gd, 1); diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c index f478f1fc3949..763d3f0a1f42 100644 --- a/drivers/mtd/mtdchar.c +++ b/drivers/mtd/mtdchar.c @@ -20,33 +20,6 @@ #include -static struct class *mtd_class; - -static void mtd_notify_add(struct mtd_info* mtd) -{ - if (!mtd) - return; - - device_create(mtd_class, NULL, MKDEV(MTD_CHAR_MAJOR, mtd->index*2), - NULL, "mtd%d", mtd->index); - - device_create(mtd_class, NULL, MKDEV(MTD_CHAR_MAJOR, mtd->index*2+1), - NULL, "mtd%dro", mtd->index); -} - -static void mtd_notify_remove(struct mtd_info* mtd) -{ - if (!mtd) - return; - - device_destroy(mtd_class, MKDEV(MTD_CHAR_MAJOR, mtd->index*2)); - device_destroy(mtd_class, MKDEV(MTD_CHAR_MAJOR, mtd->index*2+1)); -} - -static struct mtd_notifier notifier = { - .add = mtd_notify_add, - .remove = mtd_notify_remove, -}; /* * Data structure to hold the pointer to the mtd device as well @@ -854,34 +827,26 @@ static const struct file_operations mtd_fops = { static int __init init_mtdchar(void) { - if (register_chrdev(MTD_CHAR_MAJOR, "mtd", &mtd_fops)) { + int status; + + status = register_chrdev(MTD_CHAR_MAJOR, "mtd", &mtd_fops); + if (status < 0) { printk(KERN_NOTICE "Can't allocate major number %d for Memory Technology Devices.\n", MTD_CHAR_MAJOR); - return -EAGAIN; - } - - mtd_class = class_create(THIS_MODULE, "mtd"); - - if (IS_ERR(mtd_class)) { - printk(KERN_ERR "Error creating mtd class.\n"); - unregister_chrdev(MTD_CHAR_MAJOR, "mtd"); - return PTR_ERR(mtd_class); } - register_mtd_user(¬ifier); - return 0; + return status; } static void __exit cleanup_mtdchar(void) { - unregister_mtd_user(¬ifier); - class_destroy(mtd_class); unregister_chrdev(MTD_CHAR_MAJOR, "mtd"); } module_init(init_mtdchar); module_exit(cleanup_mtdchar); +MODULE_ALIAS_CHARDEV_MAJOR(MTD_CHAR_MAJOR); MODULE_LICENSE("GPL"); MODULE_AUTHOR("David Woodhouse "); diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c index 65a7f3703881..a88f8bc9a534 100644 --- a/drivers/mtd/mtdcore.c +++ b/drivers/mtd/mtdcore.c @@ -23,6 +23,9 @@ #include "mtdcore.h" + +static struct class *mtd_class; + /* These are exported solely for the purpose of mtd_blkdevs.c. You should not use them for _anything_ else */ DEFINE_MUTEX(mtd_table_mutex); @@ -33,6 +36,82 @@ EXPORT_SYMBOL_GPL(mtd_table); static LIST_HEAD(mtd_notifiers); + +#if defined(CONFIG_MTD_CHAR) || defined(CONFIG_MTD_CHAR_MODULE) +#define MTD_DEVT(index) MKDEV(MTD_CHAR_MAJOR, (index)*2) +#else +#define MTD_DEVT(index) 0 +#endif + +/* REVISIT once MTD uses the driver model better, whoever allocates + * the mtd_info will probably want to use the release() hook... 
+ */ +static void mtd_release(struct device *dev) +{ + struct mtd_info *mtd = dev_to_mtd(dev); + + /* remove /dev/mtdXro node if needed */ + if (MTD_DEVT(mtd->index)) + device_destroy(mtd_class, MTD_DEVT(mtd->index) + 1); +} + +static ssize_t mtd_type_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct mtd_info *mtd = dev_to_mtd(dev); + char *type; + + switch (mtd->type) { + case MTD_ABSENT: + type = "absent"; + break; + case MTD_RAM: + type = "ram"; + break; + case MTD_ROM: + type = "rom"; + break; + case MTD_NORFLASH: + type = "nor"; + break; + case MTD_NANDFLASH: + type = "nand"; + break; + case MTD_DATAFLASH: + type = "dataflash"; + break; + case MTD_UBIVOLUME: + type = "ubi"; + break; + default: + type = "unknown"; + } + + return snprintf(buf, PAGE_SIZE, "%s\n", type); +} +static DEVICE_ATTR(mtd_type, S_IRUGO, mtd_type_show, NULL); + +static struct attribute *mtd_attrs[] = { + &dev_attr_mtd_type.attr, + /* FIXME provide a /proc/mtd superset */ + NULL, +}; + +struct attribute_group mtd_group = { + .attrs = mtd_attrs, +}; + +struct attribute_group *mtd_groups[] = { + &mtd_group, + NULL, +}; + +static struct device_type mtd_devtype = { + .name = "mtd", + .groups = mtd_groups, + .release = mtd_release, +}; + /** * add_mtd_device - register an MTD device * @mtd: pointer to new MTD device info structure @@ -41,6 +120,7 @@ static LIST_HEAD(mtd_notifiers); * notify each currently active MTD 'user' of its arrival. Returns * zero on success or 1 on failure, which currently will only happen * if the number of present devices exceeds MAX_MTD_DEVICES (i.e. 16) + * or there's a sysfs error. */ int add_mtd_device(struct mtd_info *mtd) @@ -95,6 +175,23 @@ int add_mtd_device(struct mtd_info *mtd) mtd->name); } + /* Caller should have set dev.parent to match the + * physical device. + */ + mtd->dev.type = &mtd_devtype; + mtd->dev.class = mtd_class; + mtd->dev.devt = MTD_DEVT(i); + dev_set_name(&mtd->dev, "mtd%d", i); + if (device_register(&mtd->dev) != 0) { + mtd_table[i] = NULL; + break; + } + + if (MTD_DEVT(i)) + device_create(mtd_class, mtd->dev.parent, + MTD_DEVT(i) + 1, + NULL, "mtd%dro", i); + DEBUG(0, "mtd: Giving out device %d to %s\n",i, mtd->name); /* No need to get a refcount on the module containing the notifier, since we hold the mtd_table_mutex */ @@ -358,6 +455,24 @@ EXPORT_SYMBOL_GPL(register_mtd_user); EXPORT_SYMBOL_GPL(unregister_mtd_user); EXPORT_SYMBOL_GPL(default_mtd_writev); +static int __init mtd_setup(void) +{ + mtd_class = class_create(THIS_MODULE, "mtd"); + + if (IS_ERR(mtd_class)) { + pr_err("Error creating mtd class.\n"); + return PTR_ERR(mtd_class); + } + return 0; +} +core_initcall(mtd_setup); + +static void __exit mtd_teardown(void) +{ + class_destroy(mtd_class); +} +__exitcall(mtd_teardown); + #ifdef CONFIG_PROC_FS /*====================================================================*/ diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c index 06d5b9d8853a..02ce38fb1fc3 100644 --- a/drivers/mtd/mtdpart.c +++ b/drivers/mtd/mtdpart.c @@ -356,6 +356,11 @@ static struct mtd_part *add_one_partition(struct mtd_info *master, slave->mtd.owner = master->owner; slave->mtd.backing_dev_info = master->backing_dev_info; + /* NOTE: we don't arrange MTDs as a tree; it'd be error-prone + * to have the same data be in two different partitions. 
+ */ + slave->mtd.dev.parent = master->dev.parent; + slave->mtd.read = part_read; slave->mtd.write = part_write; @@ -508,7 +513,9 @@ out_register: * This function, given a master MTD object and a partition table, creates * and registers slave MTD objects which are bound to the master according to * the partition definitions. - * (Q: should we register the master MTD object as well?) + * + * We don't register the master, or expect the caller to have done so, + * for reasons of data integrity. */ int add_mtd_partitions(struct mtd_info *master, diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index 0c079fd307a5..5675b63a0631 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -237,6 +238,7 @@ struct mtd_info { void *priv; struct module *owner; + struct device dev; int usecount; /* If the driver is something smart, like UBI, it may need to maintain @@ -247,6 +249,11 @@ struct mtd_info { void (*put_device) (struct mtd_info *mtd); }; +static inline struct mtd_info *dev_to_mtd(struct device *dev) +{ + return dev ? container_of(dev, struct mtd_info, dev) : NULL; +} + static inline uint32_t mtd_div_by_eb(uint64_t sz, struct mtd_info *mtd) { if (mtd->erasesize_shift) -- cgit v1.2.3 From 601cc11d054ae4b5e9b5babec3d8e4667a2cb9b5 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 3 Apr 2009 08:03:22 -0700 Subject: Make non-compat preadv/pwritev use native register size Instead of always splitting the file offset into 32-bit 'high' and 'low' parts, just split them into the largest natural word-size - which in C terms is 'unsigned long'. This allows 64-bit architectures to avoid the unnecessary 32-bit shifting and masking for native format (while the compat interfaces will obviously always have to do it). This also changes the order of 'high' and 'low' to be "low first". Why? Because when we have it like this, the 64-bit system calls now don't use the "pos_high" argument at all, and it makes more sense for the native system call to simply match the user-mode prototype. This results in a much more natural calling convention, and allows the compiler to generate much more straightforward code. On x86-64, we now generate testq %rcx, %rcx # pos_l js .L122 #, movq %rcx, -48(%rbp) # pos_l, pos from the C source loff_t pos = pos_from_hilo(pos_h, pos_l); ... if (pos < 0) return -EINVAL; and the 'pos_h' register isn't even touched. It used to generate code like mov %r8d, %r8d # pos_low, pos_low salq $32, %rcx #, tmp71 movq %r8, %rax # pos_low, pos.386 orq %rcx, %rax # tmp71, pos.386 js .L122 #, movq %rax, -48(%rbp) # pos.386, pos which isn't _that_ horrible, but it does show how the natural word size is just a more sensible interface (same arguments will hold in the user level glibc wrapper function, of course, so the kernel side is just half of the equation!) Note: in all cases the user code wrapper can again be the same. You can just do #define HALF_BITS (sizeof(unsigned long)*4) __syscall(PWRITEV, fd, iov, count, offset, (offset >> HALF_BITS) >> HALF_BITS); or something like that. That way the user mode wrapper will also be nicely passing in a zero (it won't actually have to do the shifts, the compiler will understand what is going on) for the last argument. And that is a good idea, even if nobody will necessarily ever care: if we ever do move to a 128-bit lloff_t, this particular system call might be left alone. 
Of course, that will be the least of our worries if we really ever need to care, so this may not be worth really caring about. [ Fixed for lost 'loff_t' cast noticed by Andrew Morton ] Acked-by: Gerd Hoffmann Cc: H. Peter Anvin Cc: Andrew Morton Cc: linux-api@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: Ingo Molnar Cc: Ralf Baechle > Cc: Al Viro Signed-off-by: Linus Torvalds --- fs/compat.c | 4 ++-- fs/read_write.c | 14 ++++++++++---- include/linux/compat.h | 4 ++-- include/linux/syscalls.h | 4 ++-- 4 files changed, 16 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/fs/compat.c b/fs/compat.c index 1c859dae758f..3f84d5f15889 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1236,7 +1236,7 @@ compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, asmlinkage ssize_t compat_sys_preadv(unsigned long fd, const struct compat_iovec __user *vec, - unsigned long vlen, u32 pos_high, u32 pos_low) + unsigned long vlen, u32 pos_low, u32 pos_high) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; struct file *file; @@ -1293,7 +1293,7 @@ compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, asmlinkage ssize_t compat_sys_pwritev(unsigned long fd, const struct compat_iovec __user *vec, - unsigned long vlen, u32 pos_high, u32 pos_low) + unsigned long vlen, u32 pos_low, u32 pos_high) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; struct file *file; diff --git a/fs/read_write.c b/fs/read_write.c index 6d5d8ff238aa..9d1e76bb9ee1 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -731,10 +731,16 @@ SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec, return ret; } +static inline loff_t pos_from_hilo(unsigned long high, unsigned long low) +{ +#define HALF_LONG_BITS (BITS_PER_LONG / 2) + return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low; +} + SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec, - unsigned long, vlen, u32, pos_high, u32, pos_low) + unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) { - loff_t pos = ((loff_t)pos_high << 32) | pos_low; + loff_t pos = pos_from_hilo(pos_h, pos_l); struct file *file; ssize_t ret = -EBADF; int fput_needed; @@ -757,9 +763,9 @@ SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec, } SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec, - unsigned long, vlen, u32, pos_high, u32, pos_low) + unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) { - loff_t pos = ((loff_t)pos_high << 32) | pos_low; + loff_t pos = pos_from_hilo(pos_h, pos_l); struct file *file; ssize_t ret = -EBADF; int fput_needed; diff --git a/include/linux/compat.h b/include/linux/compat.h index 9723edd6455c..f2ded21f9a3c 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -193,10 +193,10 @@ asmlinkage ssize_t compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, unsigned long vlen); asmlinkage ssize_t compat_sys_preadv(unsigned long fd, const struct compat_iovec __user *vec, - unsigned long vlen, u32 pos_high, u32 pos_low); + unsigned long vlen, u32 pos_low, u32 pos_high); asmlinkage ssize_t compat_sys_pwritev(unsigned long fd, const struct compat_iovec __user *vec, - unsigned long vlen, u32 pos_high, u32 pos_low); + unsigned long vlen, u32 pos_low, u32 pos_high); int compat_do_execve(char * filename, compat_uptr_t __user *argv, compat_uptr_t __user *envp, struct pt_regs * regs); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 
b299a82a05e7..18771cac2f85 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -462,9 +462,9 @@ asmlinkage long sys_pread64(unsigned int fd, char __user *buf, asmlinkage long sys_pwrite64(unsigned int fd, const char __user *buf, size_t count, loff_t pos); asmlinkage long sys_preadv(unsigned long fd, const struct iovec __user *vec, - unsigned long vlen, u32 pos_high, u32 pos_low); + unsigned long vlen, unsigned long pos_l, unsigned long pos_h); asmlinkage long sys_pwritev(unsigned long fd, const struct iovec __user *vec, - unsigned long vlen, u32 pos_high, u32 pos_low); + unsigned long vlen, unsigned long pos_l, unsigned long pos_h); asmlinkage long sys_getcwd(char __user *buf, unsigned long size); asmlinkage long sys_mkdir(const char __user *pathname, int mode); asmlinkage long sys_chdir(const char __user *filename); -- cgit v1.2.3 From 3206450355100eae8e033645318b95bb60f1faff Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 6 Feb 2009 15:27:13 +0100 Subject: mfd: Support active high IRQs on WM835x Signed-off-by: Mark Brown Signed-off-by: Samuel Ortiz --- drivers/mfd/wm8350-core.c | 16 +++++++++++++++- include/linux/mfd/wm8350/core.h | 2 ++ 2 files changed, 17 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/mfd/wm8350-core.c b/drivers/mfd/wm8350-core.c index f22b18b70796..a285cc0cc704 100644 --- a/drivers/mfd/wm8350-core.c +++ b/drivers/mfd/wm8350-core.c @@ -1438,7 +1438,21 @@ int wm8350_device_init(struct wm8350 *wm8350, int irq, mutex_init(&wm8350->irq_mutex); INIT_WORK(&wm8350->irq_work, wm8350_irq_worker); if (irq) { - ret = request_irq(irq, wm8350_irq, 0, + int flags = 0; + + if (pdata && pdata->irq_high) { + flags |= IRQF_TRIGGER_HIGH; + + wm8350_set_bits(wm8350, WM8350_SYSTEM_CONTROL_1, + WM8350_IRQ_POL); + } else { + flags |= IRQF_TRIGGER_LOW; + + wm8350_clear_bits(wm8350, WM8350_SYSTEM_CONTROL_1, + WM8350_IRQ_POL); + } + + ret = request_irq(irq, wm8350_irq, flags, "wm8350", wm8350); if (ret != 0) { dev_err(wm8350->dev, "Failed to request IRQ: %d\n", diff --git a/include/linux/mfd/wm8350/core.h b/include/linux/mfd/wm8350/core.h index 980669d50dca..42cca672f340 100644 --- a/include/linux/mfd/wm8350/core.h +++ b/include/linux/mfd/wm8350/core.h @@ -640,9 +640,11 @@ struct wm8350 { * * @init: Function called during driver initialisation. Should be * used by the platform to configure GPIO functions and similar. + * @irq_high: Set if WM8350 IRQ is active high. */ struct wm8350_platform_data { int (*init)(struct wm8350 *wm8350); + int irq_high; }; -- cgit v1.2.3 From a23a175795cdb202619ac176129b2f0c2a5c9456 Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Tue, 17 Feb 2009 10:06:41 +0100 Subject: mfd: convert DS1WM to use MFD core This patch converts the DS1WM driver into an MFD cell. It also calculates the bus_shift parameter from the memory resource size. 
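For context, bus_shift only describes how far apart the DS1WM registers sit in the host MFD's address space, so register accesses boil down to something like the sketch below (ds1wm_readb is an invented name, not the driver's accessor). That is why the shift can be derived from the size of the memory resource instead of being passed in by every platform.

    static u8 ds1wm_readb(void __iomem *map, int bus_shift, int reg)
    {
            /* register N lives at byte offset N << bus_shift, e.g. shift 2
             * when the registers sit on 4-byte boundaries */
            return __raw_readb(map + (reg << bus_shift));
    }
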
Signed-off-by: Philipp Zabel Signed-off-by: Samuel Ortiz --- drivers/w1/masters/ds1wm.c | 31 ++++++++++++++++++++----------- include/linux/ds1wm.h | 12 ------------ include/linux/mfd/ds1wm.h | 5 +++++ 3 files changed, 25 insertions(+), 23 deletions(-) delete mode 100644 include/linux/ds1wm.h create mode 100644 include/linux/mfd/ds1wm.h (limited to 'include/linux') diff --git a/drivers/w1/masters/ds1wm.c b/drivers/w1/masters/ds1wm.c index 29e144f81cbe..f1e6b3dd1e43 100644 --- a/drivers/w1/masters/ds1wm.c +++ b/drivers/w1/masters/ds1wm.c @@ -19,7 +19,8 @@ #include #include #include -#include +#include +#include #include @@ -89,7 +90,7 @@ struct ds1wm_data { void __iomem *map; int bus_shift; /* # of shifts to calc register offsets */ struct platform_device *pdev; - struct ds1wm_platform_data *pdata; + struct mfd_cell *cell; int irq; int active_high; struct clk *clk; @@ -217,8 +218,8 @@ static void ds1wm_up(struct ds1wm_data *ds1wm_data) { int gclk, divisor; - if (ds1wm_data->pdata->enable) - ds1wm_data->pdata->enable(ds1wm_data->pdev); + if (ds1wm_data->cell->enable) + ds1wm_data->cell->enable(ds1wm_data->pdev); gclk = clk_get_rate(ds1wm_data->clk); clk_enable(ds1wm_data->clk); @@ -244,8 +245,8 @@ static void ds1wm_down(struct ds1wm_data *ds1wm_data) ds1wm_write_register(ds1wm_data, DS1WM_INT_EN, ds1wm_data->active_high ? DS1WM_INTEN_IAS : 0); - if (ds1wm_data->pdata->disable) - ds1wm_data->pdata->disable(ds1wm_data->pdev); + if (ds1wm_data->cell->disable) + ds1wm_data->cell->disable(ds1wm_data->pdev); clk_disable(ds1wm_data->clk); } @@ -330,13 +331,18 @@ static struct w1_bus_master ds1wm_master = { static int ds1wm_probe(struct platform_device *pdev) { struct ds1wm_data *ds1wm_data; - struct ds1wm_platform_data *plat; + struct ds1wm_driver_data *plat; struct resource *res; + struct mfd_cell *cell; int ret; if (!pdev) return -ENODEV; + cell = pdev->dev.platform_data; + if (!cell) + return -ENODEV; + ds1wm_data = kzalloc(sizeof(*ds1wm_data), GFP_KERNEL); if (!ds1wm_data) return -ENOMEM; @@ -348,15 +354,18 @@ static int ds1wm_probe(struct platform_device *pdev) ret = -ENXIO; goto err0; } - ds1wm_data->map = ioremap(res->start, res->end - res->start + 1); + ds1wm_data->map = ioremap(res->start, resource_size(res)); if (!ds1wm_data->map) { ret = -ENOMEM; goto err0; } - plat = pdev->dev.platform_data; - ds1wm_data->bus_shift = plat->bus_shift; + plat = cell->driver_data; + + /* calculate bus shift from mem resource */ + ds1wm_data->bus_shift = resource_size(res) >> 3; + ds1wm_data->pdev = pdev; - ds1wm_data->pdata = plat; + ds1wm_data->cell = cell; res = platform_get_resource(pdev, IORESOURCE_IRQ, 0); if (!res) { diff --git a/include/linux/ds1wm.h b/include/linux/ds1wm.h deleted file mode 100644 index d3c65e48a2e7..000000000000 --- a/include/linux/ds1wm.h +++ /dev/null @@ -1,12 +0,0 @@ -/* platform data for the DS1WM driver */ - -struct ds1wm_platform_data { - int bus_shift; /* number of shifts needed to calculate the - * offset between DS1WM registers; - * e.g. 
on h5xxx and h2200 this is 2 - * (registers aligned to 4-byte boundaries), - * while on hx4700 this is 1 */ - int active_high; - void (*enable)(struct platform_device *pdev); - void (*disable)(struct platform_device *pdev); -}; diff --git a/include/linux/mfd/ds1wm.h b/include/linux/mfd/ds1wm.h new file mode 100644 index 000000000000..d4898ba18207 --- /dev/null +++ b/include/linux/mfd/ds1wm.h @@ -0,0 +1,5 @@ +/* MFD cell driver data for the DS1WM driver */ + +struct ds1wm_driver_data { + int active_high; +}; -- cgit v1.2.3 From b72019dbd126e60bb5f9f350f76127b1527facba Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Tue, 17 Feb 2009 10:06:52 +0100 Subject: mfd: remove unused PASIC3 bus_shift field Removes the now-unused bus_shift field from pasic3_platform_data. Signed-off-by: Philipp Zabel Signed-off-by: Samuel Ortiz --- include/linux/mfd/htc-pasic3.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mfd/htc-pasic3.h b/include/linux/mfd/htc-pasic3.h index b4294f12c4f8..3d3ed67bd969 100644 --- a/include/linux/mfd/htc-pasic3.h +++ b/include/linux/mfd/htc-pasic3.h @@ -48,7 +48,6 @@ struct pasic3_leds_machinfo { struct pasic3_platform_data { struct pasic3_leds_machinfo *led_pdata; - unsigned int bus_shift; unsigned int clock_rate; }; -- cgit v1.2.3 From 7d33ccbeecd8393cc690cf9a71008236cdd7cc2c Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Tue, 17 Feb 2009 10:09:19 +0100 Subject: mfd: remove DS1WM clock handling This driver requests a clock that usually is supplied by the MFD in which the DS1WM is contained. Currently, it is impossible for a MFD to register their clocks with the generic clock API due to different implementations across architectures. For now, this patch removes the clock handling from DS1WM altogether, trusting that the MFD enable/disable functions will switch the clock if needed. The clock rate is obtained from a new parameter in driver_data. 
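A sketch of what a containing MFD would now provide in its cell (the my_* names are examples; ds1wm_driver_data and the mfd_cell fields are the ones used in the diff below): the host switches the clock in its enable/disable hooks and simply tells DS1WM how fast that clock runs.

    static int my_ds1wm_enable(struct platform_device *pdev)
    {
            /* the MFD, not DS1WM, now gates the 1-wire clock */
            return 0;
    }

    static struct ds1wm_driver_data my_ds1wm_pdata = {
            .active_high = 0,
            .clock_rate  = 4000000,     /* Hz; whatever the MFD feeds DS1WM */
    };

    static struct mfd_cell my_cells[] = {
            {
                    .name        = "ds1wm",
                    .enable      = my_ds1wm_enable,
                    .driver_data = &my_ds1wm_pdata,
            },
    };
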
Signed-off-by: Philipp Zabel Signed-off-by: Samuel Ortiz --- drivers/w1/masters/ds1wm.c | 27 +++++++-------------------- include/linux/mfd/ds1wm.h | 1 + 2 files changed, 8 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/drivers/w1/masters/ds1wm.c b/drivers/w1/masters/ds1wm.c index f1e6b3dd1e43..37f08c850608 100644 --- a/drivers/w1/masters/ds1wm.c +++ b/drivers/w1/masters/ds1wm.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include @@ -93,7 +92,6 @@ struct ds1wm_data { struct mfd_cell *cell; int irq; int active_high; - struct clk *clk; int slave_present; void *reset_complete; void *read_complete; @@ -216,17 +214,17 @@ static int ds1wm_find_divisor(int gclk) static void ds1wm_up(struct ds1wm_data *ds1wm_data) { - int gclk, divisor; + int divisor; + struct ds1wm_driver_data *plat = ds1wm_data->cell->driver_data; if (ds1wm_data->cell->enable) ds1wm_data->cell->enable(ds1wm_data->pdev); - gclk = clk_get_rate(ds1wm_data->clk); - clk_enable(ds1wm_data->clk); - divisor = ds1wm_find_divisor(gclk); + divisor = ds1wm_find_divisor(plat->clock_rate); if (divisor == 0) { dev_err(&ds1wm_data->pdev->dev, - "no suitable divisor for %dHz clock\n", gclk); + "no suitable divisor for %dHz clock\n", + plat->clock_rate); return; } ds1wm_write_register(ds1wm_data, DS1WM_CLKDIV, divisor); @@ -247,8 +245,6 @@ static void ds1wm_down(struct ds1wm_data *ds1wm_data) if (ds1wm_data->cell->disable) ds1wm_data->cell->disable(ds1wm_data->pdev); - - clk_disable(ds1wm_data->clk); } /* --------------------------------------------------------------------- */ @@ -385,26 +381,18 @@ static int ds1wm_probe(struct platform_device *pdev) if (ret) goto err1; - ds1wm_data->clk = clk_get(&pdev->dev, "ds1wm"); - if (IS_ERR(ds1wm_data->clk)) { - ret = PTR_ERR(ds1wm_data->clk); - goto err2; - } - ds1wm_up(ds1wm_data); ds1wm_master.data = (void *)ds1wm_data; ret = w1_add_master_device(&ds1wm_master); if (ret) - goto err3; + goto err2; return 0; -err3: - ds1wm_down(ds1wm_data); - clk_put(ds1wm_data->clk); err2: + ds1wm_down(ds1wm_data); free_irq(ds1wm_data->irq, ds1wm_data); err1: iounmap(ds1wm_data->map); @@ -443,7 +431,6 @@ static int ds1wm_remove(struct platform_device *pdev) w1_remove_master_device(&ds1wm_master); ds1wm_down(ds1wm_data); - clk_put(ds1wm_data->clk); free_irq(ds1wm_data->irq, ds1wm_data); iounmap(ds1wm_data->map); kfree(ds1wm_data); diff --git a/include/linux/mfd/ds1wm.h b/include/linux/mfd/ds1wm.h index d4898ba18207..be469a357cbb 100644 --- a/include/linux/mfd/ds1wm.h +++ b/include/linux/mfd/ds1wm.h @@ -2,4 +2,5 @@ struct ds1wm_driver_data { int active_high; + int clock_rate; }; -- cgit v1.2.3 From 81ec5364a58c0545b694dee02fe65b9ae48f37b6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 12 Dec 2007 17:27:03 +0100 Subject: [MTD] [NAND] Add support for 4KiB pages. 
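The two limits matter because nand_base and the chip drivers size their scratch buffers from them; roughly (simplified from the nand_buffers definition in include/linux/mtd/nand.h of this era), the storage that has to grow along with the constants looks like:

    struct nand_buffers {
            uint8_t ecccalc[NAND_MAX_OOBSIZE];
            uint8_t ecccode[NAND_MAX_OOBSIZE];
            uint8_t databuf[NAND_MAX_PAGESIZE + NAND_MAX_OOBSIZE];
    };
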
Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Siewior Signed-off-by: David Woodhouse --- drivers/mtd/nand/nand_base.c | 18 ++++++++++++++++++ include/linux/mtd/nand.h | 4 ++-- 2 files changed, 20 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index 0793ca39cc88..facee262b4ba 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -82,6 +82,20 @@ static struct nand_ecclayout nand_oob_64 = { .length = 38}} }; +static struct nand_ecclayout nand_oob_128 = { + .eccbytes = 48, + .eccpos = { + 80, 81, 82, 83, 84, 85, 86, 87, + 88, 89, 90, 91, 92, 93, 94, 95, + 96, 97, 98, 99, 100, 101, 102, 103, + 104, 105, 106, 107, 108, 109, 110, 111, + 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127}, + .oobfree = { + {.offset = 2, + .length = 78}} +}; + static int nand_get_device(struct nand_chip *chip, struct mtd_info *mtd, int new_state); @@ -2638,6 +2652,9 @@ int nand_scan_tail(struct mtd_info *mtd) case 64: chip->ecc.layout = &nand_oob_64; break; + case 128: + chip->ecc.layout = &nand_oob_128; + break; default: printk(KERN_WARNING "No oob scheme defined for " "oobsize %d\n", mtd->oobsize); @@ -2767,6 +2784,7 @@ int nand_scan_tail(struct mtd_info *mtd) break; case 4: case 8: + case 16: mtd->subpage_sft = 2; break; } diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index db5b63da2a7e..7efb9be34662 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -43,8 +43,8 @@ extern void nand_wait_ready(struct mtd_info *mtd); * is supported now. If you add a chip with bigger oobsize/page * adjust this accordingly. */ -#define NAND_MAX_OOBSIZE 64 -#define NAND_MAX_PAGESIZE 2048 +#define NAND_MAX_OOBSIZE 128 +#define NAND_MAX_PAGESIZE 4096 /* * Constants for hardware specific CLE/ALE/NCE function -- cgit v1.2.3 From 1faa16d22877f4839bd433547d770c676d1d964c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 6 Apr 2009 14:48:01 +0200 Subject: block: change the request allocation/congestion logic to be sync/async based This makes sure that we never wait on async IO for sync requests, instead of doing the split on writes vs reads. 
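In other words, the request pools and congestion thresholds are now keyed on whether someone is actually waiting for the request, not on its data direction. Conceptually the rule is as sketched below; this is only an illustration, the kernel's own test is the rw_is_sync()/rq_is_sync() pair used in the diff that follows.

    static int request_is_sync(int is_write, int marked_sync)
    {
            /* every read has a waiter; writes only count as sync when the
             * submitter explicitly flagged them as synchronous */
            return !is_write || marked_sync;
    }
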
Signed-off-by: Jens Axboe Signed-off-by: Linus Torvalds --- block/blk-core.c | 70 ++++++++++++++++++++++----------------------- block/blk-sysfs.c | 40 +++++++++++++------------- block/elevator.c | 2 +- include/linux/backing-dev.h | 12 ++++---- include/linux/blkdev.h | 52 ++++++++++++++++++++++----------- mm/backing-dev.c | 10 +++---- 6 files changed, 102 insertions(+), 84 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index 996ed906d8ca..a32b571aaaa2 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -484,11 +484,11 @@ static int blk_init_free_list(struct request_queue *q) { struct request_list *rl = &q->rq; - rl->count[READ] = rl->count[WRITE] = 0; - rl->starved[READ] = rl->starved[WRITE] = 0; + rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0; + rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0; rl->elvpriv = 0; - init_waitqueue_head(&rl->wait[READ]); - init_waitqueue_head(&rl->wait[WRITE]); + init_waitqueue_head(&rl->wait[BLK_RW_SYNC]); + init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]); rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab, mempool_free_slab, request_cachep, q->node); @@ -699,18 +699,18 @@ static void ioc_set_batching(struct request_queue *q, struct io_context *ioc) ioc->last_waited = jiffies; } -static void __freed_request(struct request_queue *q, int rw) +static void __freed_request(struct request_queue *q, int sync) { struct request_list *rl = &q->rq; - if (rl->count[rw] < queue_congestion_off_threshold(q)) - blk_clear_queue_congested(q, rw); + if (rl->count[sync] < queue_congestion_off_threshold(q)) + blk_clear_queue_congested(q, sync); - if (rl->count[rw] + 1 <= q->nr_requests) { - if (waitqueue_active(&rl->wait[rw])) - wake_up(&rl->wait[rw]); + if (rl->count[sync] + 1 <= q->nr_requests) { + if (waitqueue_active(&rl->wait[sync])) + wake_up(&rl->wait[sync]); - blk_clear_queue_full(q, rw); + blk_clear_queue_full(q, sync); } } @@ -718,18 +718,18 @@ static void __freed_request(struct request_queue *q, int rw) * A request has just been released. Account for it, update the full and * congestion status, wake up any waiters. Called under q->queue_lock. */ -static void freed_request(struct request_queue *q, int rw, int priv) +static void freed_request(struct request_queue *q, int sync, int priv) { struct request_list *rl = &q->rq; - rl->count[rw]--; + rl->count[sync]--; if (priv) rl->elvpriv--; - __freed_request(q, rw); + __freed_request(q, sync); - if (unlikely(rl->starved[rw ^ 1])) - __freed_request(q, rw ^ 1); + if (unlikely(rl->starved[sync ^ 1])) + __freed_request(q, sync ^ 1); } /* @@ -743,15 +743,15 @@ static struct request *get_request(struct request_queue *q, int rw_flags, struct request *rq = NULL; struct request_list *rl = &q->rq; struct io_context *ioc = NULL; - const int rw = rw_flags & 0x01; + const bool is_sync = rw_is_sync(rw_flags) != 0; int may_queue, priv; may_queue = elv_may_queue(q, rw_flags); if (may_queue == ELV_MQUEUE_NO) goto rq_starved; - if (rl->count[rw]+1 >= queue_congestion_on_threshold(q)) { - if (rl->count[rw]+1 >= q->nr_requests) { + if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) { + if (rl->count[is_sync]+1 >= q->nr_requests) { ioc = current_io_context(GFP_ATOMIC, q->node); /* * The queue will fill after this allocation, so set @@ -759,9 +759,9 @@ static struct request *get_request(struct request_queue *q, int rw_flags, * This process will be allowed to complete a batch of * requests, others will be blocked. 
*/ - if (!blk_queue_full(q, rw)) { + if (!blk_queue_full(q, is_sync)) { ioc_set_batching(q, ioc); - blk_set_queue_full(q, rw); + blk_set_queue_full(q, is_sync); } else { if (may_queue != ELV_MQUEUE_MUST && !ioc_batching(q, ioc)) { @@ -774,7 +774,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags, } } } - blk_set_queue_congested(q, rw); + blk_set_queue_congested(q, is_sync); } /* @@ -782,11 +782,11 @@ static struct request *get_request(struct request_queue *q, int rw_flags, * limit of requests, otherwise we could have thousands of requests * allocated with any setting of ->nr_requests */ - if (rl->count[rw] >= (3 * q->nr_requests / 2)) + if (rl->count[is_sync] >= (3 * q->nr_requests / 2)) goto out; - rl->count[rw]++; - rl->starved[rw] = 0; + rl->count[is_sync]++; + rl->starved[is_sync] = 0; priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); if (priv) @@ -804,7 +804,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags, * wait queue, but this is pretty rare. */ spin_lock_irq(q->queue_lock); - freed_request(q, rw, priv); + freed_request(q, is_sync, priv); /* * in the very unlikely event that allocation failed and no @@ -814,8 +814,8 @@ static struct request *get_request(struct request_queue *q, int rw_flags, * rq mempool into READ and WRITE */ rq_starved: - if (unlikely(rl->count[rw] == 0)) - rl->starved[rw] = 1; + if (unlikely(rl->count[is_sync] == 0)) + rl->starved[is_sync] = 1; goto out; } @@ -829,7 +829,7 @@ rq_starved: if (ioc_batching(q, ioc)) ioc->nr_batch_requests--; - trace_block_getrq(q, bio, rw); + trace_block_getrq(q, bio, rw_flags & 1); out: return rq; } @@ -843,7 +843,7 @@ out: static struct request *get_request_wait(struct request_queue *q, int rw_flags, struct bio *bio) { - const int rw = rw_flags & 0x01; + const bool is_sync = rw_is_sync(rw_flags) != 0; struct request *rq; rq = get_request(q, rw_flags, bio, GFP_NOIO); @@ -852,10 +852,10 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags, struct io_context *ioc; struct request_list *rl = &q->rq; - prepare_to_wait_exclusive(&rl->wait[rw], &wait, + prepare_to_wait_exclusive(&rl->wait[is_sync], &wait, TASK_UNINTERRUPTIBLE); - trace_block_sleeprq(q, bio, rw); + trace_block_sleeprq(q, bio, rw_flags & 1); __generic_unplug_device(q); spin_unlock_irq(q->queue_lock); @@ -871,7 +871,7 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags, ioc_set_batching(q, ioc); spin_lock_irq(q->queue_lock); - finish_wait(&rl->wait[rw], &wait); + finish_wait(&rl->wait[is_sync], &wait); rq = get_request(q, rw_flags, bio, GFP_NOIO); }; @@ -1070,14 +1070,14 @@ void __blk_put_request(struct request_queue *q, struct request *req) * it didn't come out of our reserved rq pools */ if (req->cmd_flags & REQ_ALLOCED) { - int rw = rq_data_dir(req); + int is_sync = rq_is_sync(req) != 0; int priv = req->cmd_flags & REQ_ELVPRIV; BUG_ON(!list_empty(&req->queuelist)); BUG_ON(!hlist_unhashed(&req->hash)); blk_free_request(q, req); - freed_request(q, rw, priv); + freed_request(q, is_sync, priv); } } EXPORT_SYMBOL_GPL(__blk_put_request); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index e29ddfc73cf4..3ff9bba3379a 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -48,28 +48,28 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) q->nr_requests = nr; blk_queue_congestion_threshold(q); - if (rl->count[READ] >= queue_congestion_on_threshold(q)) - blk_set_queue_congested(q, READ); - else if (rl->count[READ] < 
queue_congestion_off_threshold(q)) - blk_clear_queue_congested(q, READ); - - if (rl->count[WRITE] >= queue_congestion_on_threshold(q)) - blk_set_queue_congested(q, WRITE); - else if (rl->count[WRITE] < queue_congestion_off_threshold(q)) - blk_clear_queue_congested(q, WRITE); - - if (rl->count[READ] >= q->nr_requests) { - blk_set_queue_full(q, READ); - } else if (rl->count[READ]+1 <= q->nr_requests) { - blk_clear_queue_full(q, READ); - wake_up(&rl->wait[READ]); + if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q)) + blk_set_queue_congested(q, BLK_RW_SYNC); + else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q)) + blk_clear_queue_congested(q, BLK_RW_SYNC); + + if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q)) + blk_set_queue_congested(q, BLK_RW_ASYNC); + else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q)) + blk_clear_queue_congested(q, BLK_RW_ASYNC); + + if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { + blk_set_queue_full(q, BLK_RW_SYNC); + } else if (rl->count[BLK_RW_SYNC]+1 <= q->nr_requests) { + blk_clear_queue_full(q, BLK_RW_SYNC); + wake_up(&rl->wait[BLK_RW_SYNC]); } - if (rl->count[WRITE] >= q->nr_requests) { - blk_set_queue_full(q, WRITE); - } else if (rl->count[WRITE]+1 <= q->nr_requests) { - blk_clear_queue_full(q, WRITE); - wake_up(&rl->wait[WRITE]); + if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) { + blk_set_queue_full(q, BLK_RW_ASYNC); + } else if (rl->count[BLK_RW_ASYNC]+1 <= q->nr_requests) { + blk_clear_queue_full(q, BLK_RW_ASYNC); + wake_up(&rl->wait[BLK_RW_ASYNC]); } spin_unlock_irq(q->queue_lock); return ret; diff --git a/block/elevator.c b/block/elevator.c index 98259eda0ef6..ca6788a0195a 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -677,7 +677,7 @@ void elv_insert(struct request_queue *q, struct request *rq, int where) } if (unplug_it && blk_queue_plugged(q)) { - int nrq = q->rq.count[READ] + q->rq.count[WRITE] + int nrq = q->rq.count[BLK_RW_SYNC] + q->rq.count[BLK_RW_ASYNC] - q->in_flight; if (nrq >= q->unplug_thresh) diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index bee52abb8a4d..0ec2c594868e 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -24,8 +24,8 @@ struct dentry; */ enum bdi_state { BDI_pdflush, /* A pdflush thread is working this device */ - BDI_write_congested, /* The write queue is getting full */ - BDI_read_congested, /* The read queue is getting full */ + BDI_async_congested, /* The async (write) queue is getting full */ + BDI_sync_congested, /* The sync queue is getting full */ BDI_unused, /* Available bits start here */ }; @@ -215,18 +215,18 @@ static inline int bdi_congested(struct backing_dev_info *bdi, int bdi_bits) static inline int bdi_read_congested(struct backing_dev_info *bdi) { - return bdi_congested(bdi, 1 << BDI_read_congested); + return bdi_congested(bdi, 1 << BDI_sync_congested); } static inline int bdi_write_congested(struct backing_dev_info *bdi) { - return bdi_congested(bdi, 1 << BDI_write_congested); + return bdi_congested(bdi, 1 << BDI_async_congested); } static inline int bdi_rw_congested(struct backing_dev_info *bdi) { - return bdi_congested(bdi, (1 << BDI_read_congested)| - (1 << BDI_write_congested)); + return bdi_congested(bdi, (1 << BDI_sync_congested) | + (1 << BDI_async_congested)); } void clear_bdi_congested(struct backing_dev_info *bdi, int rw); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 465d6babc847..67dae3bd881c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h 
@@ -38,6 +38,10 @@ struct request; typedef void (rq_end_io_fn)(struct request *, int); struct request_list { + /* + * count[], starved[], and wait[] are indexed by + * BLK_RW_SYNC/BLK_RW_ASYNC + */ int count[2]; int starved[2]; int elvpriv; @@ -66,6 +70,11 @@ enum rq_cmd_type_bits { REQ_TYPE_ATA_PC, }; +enum { + BLK_RW_ASYNC = 0, + BLK_RW_SYNC = 1, +}; + /* * For request of type REQ_TYPE_LINUX_BLOCK, rq->cmd[0] is the opcode being * sent down (similar to how REQ_TYPE_BLOCK_PC means that ->cmd[] holds a @@ -103,7 +112,7 @@ enum rq_flag_bits { __REQ_QUIET, /* don't worry about errors */ __REQ_PREEMPT, /* set for "ide_preempt" requests */ __REQ_ORDERED_COLOR, /* is before or after barrier */ - __REQ_RW_SYNC, /* request is sync (O_DIRECT) */ + __REQ_RW_SYNC, /* request is sync (sync write or read) */ __REQ_ALLOCED, /* request came from our alloc pool */ __REQ_RW_META, /* metadata io request */ __REQ_COPY_USER, /* contains copies of user pages */ @@ -438,8 +447,8 @@ struct request_queue #define QUEUE_FLAG_CLUSTER 0 /* cluster several segments into 1 */ #define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */ #define QUEUE_FLAG_STOPPED 2 /* queue is stopped */ -#define QUEUE_FLAG_READFULL 3 /* read queue has been filled */ -#define QUEUE_FLAG_WRITEFULL 4 /* write queue has been filled */ +#define QUEUE_FLAG_SYNCFULL 3 /* read queue has been filled */ +#define QUEUE_FLAG_ASYNCFULL 4 /* write queue has been filled */ #define QUEUE_FLAG_DEAD 5 /* queue being torn down */ #define QUEUE_FLAG_REENTER 6 /* Re-entrancy avoidance */ #define QUEUE_FLAG_PLUGGED 7 /* queue is plugged */ @@ -611,32 +620,41 @@ enum { #define rq_data_dir(rq) ((rq)->cmd_flags & 1) /* - * We regard a request as sync, if it's a READ or a SYNC write. + * We regard a request as sync, if either a read or a sync write */ -#define rq_is_sync(rq) (rq_data_dir((rq)) == READ || (rq)->cmd_flags & REQ_RW_SYNC) +static inline bool rw_is_sync(unsigned int rw_flags) +{ + return !(rw_flags & REQ_RW) || (rw_flags & REQ_RW_SYNC); +} + +static inline bool rq_is_sync(struct request *rq) +{ + return rw_is_sync(rq->cmd_flags); +} + #define rq_is_meta(rq) ((rq)->cmd_flags & REQ_RW_META) -static inline int blk_queue_full(struct request_queue *q, int rw) +static inline int blk_queue_full(struct request_queue *q, int sync) { - if (rw == READ) - return test_bit(QUEUE_FLAG_READFULL, &q->queue_flags); - return test_bit(QUEUE_FLAG_WRITEFULL, &q->queue_flags); + if (sync) + return test_bit(QUEUE_FLAG_SYNCFULL, &q->queue_flags); + return test_bit(QUEUE_FLAG_ASYNCFULL, &q->queue_flags); } -static inline void blk_set_queue_full(struct request_queue *q, int rw) +static inline void blk_set_queue_full(struct request_queue *q, int sync) { - if (rw == READ) - queue_flag_set(QUEUE_FLAG_READFULL, q); + if (sync) + queue_flag_set(QUEUE_FLAG_SYNCFULL, q); else - queue_flag_set(QUEUE_FLAG_WRITEFULL, q); + queue_flag_set(QUEUE_FLAG_ASYNCFULL, q); } -static inline void blk_clear_queue_full(struct request_queue *q, int rw) +static inline void blk_clear_queue_full(struct request_queue *q, int sync) { - if (rw == READ) - queue_flag_clear(QUEUE_FLAG_READFULL, q); + if (sync) + queue_flag_clear(QUEUE_FLAG_SYNCFULL, q); else - queue_flag_clear(QUEUE_FLAG_WRITEFULL, q); + queue_flag_clear(QUEUE_FLAG_ASYNCFULL, q); } diff --git a/mm/backing-dev.c b/mm/backing-dev.c index be68c956a660..493b468a5035 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -284,12 +284,12 @@ static wait_queue_head_t congestion_wqh[2] = { }; -void clear_bdi_congested(struct backing_dev_info *bdi, 
int rw) +void clear_bdi_congested(struct backing_dev_info *bdi, int sync) { enum bdi_state bit; - wait_queue_head_t *wqh = &congestion_wqh[rw]; + wait_queue_head_t *wqh = &congestion_wqh[sync]; - bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested; + bit = sync ? BDI_sync_congested : BDI_async_congested; clear_bit(bit, &bdi->state); smp_mb__after_clear_bit(); if (waitqueue_active(wqh)) @@ -297,11 +297,11 @@ void clear_bdi_congested(struct backing_dev_info *bdi, int rw) } EXPORT_SYMBOL(clear_bdi_congested); -void set_bdi_congested(struct backing_dev_info *bdi, int rw) +void set_bdi_congested(struct backing_dev_info *bdi, int sync) { enum bdi_state bit; - bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested; + bit = sync ? BDI_sync_congested : BDI_async_congested; set_bit(bit, &bdi->state); } EXPORT_SYMBOL(set_bdi_congested); -- cgit v1.2.3 From a1f242524c3c1f5d40f1c9c343427e34d1aadd6e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 6 Apr 2009 14:48:02 +0200 Subject: Add WRITE_SYNC_PLUG and SWRITE_SYNC_PLUG (S)WRITE_SYNC always unplugs the device right after IO submission. Sometimes we want to build up a queue before doing so, so add variants that explicitly DON'T unplug the queue. The caller must then do that after submitting all the IO. Signed-off-by: Jens Axboe Signed-off-by: Linus Torvalds --- include/linux/fs.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index a09e17c8f5fd..ea0510978f76 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -96,7 +96,10 @@ struct inodes_stat_t { #define READ_SYNC (READ | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG)) #define READ_META (READ | (1 << BIO_RW_META)) #define WRITE_SYNC (WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG)) +#define WRITE_SYNC_PLUG (WRITE | (1 << BIO_RW_SYNCIO)) #define SWRITE_SYNC (SWRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG)) +#define SWRITE_SYNC_PLUG \ + (SWRITE | (1 << BIO_RW_SYNCIO)) #define WRITE_BARRIER (WRITE | (1 << BIO_RW_BARRIER)) #define DISCARD_NOBARRIER (1 << BIO_RW_DISCARD) #define DISCARD_BARRIER ((1 << BIO_RW_DISCARD) | (1 << BIO_RW_BARRIER)) -- cgit v1.2.3 From aeb6fafb8fa53266d70ca7474fcda2bdaf96524a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 6 Apr 2009 14:48:07 +0200 Subject: block: Add flag for telling the IO schedulers NOT to anticipate more IO By default, CFQ will anticipate more IO from a given io context if the previously completed IO was sync. This used to be fine, since the only sync IO was reads and O_DIRECT writes. But with more "normal" sync writes being used now, we don't want to anticipate for those. Add a bio/request flag that informs the IO scheduler that this is a sync request that we should not idle for. Introduce WRITE_ODIRECT specifically for O_DIRECT writes, and make sure that the other sync writes set this flag. 
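As a rough illustration (not part of the patch), a caller writing buffer heads would now choose between the write variants as sketched here: WRITE_SYNC and WRITE_SYNC_PLUG carry the new no-idle hint, while WRITE_ODIRECT keeps the old behaviour for O_DIRECT, where idling is still useful. The concrete flag definitions are in the fs.h hunk below.

#include <linux/fs.h>
#include <linux/buffer_head.h>

/*
 * Sketch, not from the patch: submit one buffer as a sync write.  When
 * the caller intends to queue more IO and unplug the device itself, it
 * uses the plugged variant; either way BIO_RW_NOIDLE tells CFQ not to
 * idle after completion.
 */
static void write_one_buffer(struct buffer_head *bh, int more_to_come)
{
	lock_buffer(bh);
	get_bh(bh);
	bh->b_end_io = end_buffer_write_sync;
	submit_bh(more_to_come ? WRITE_SYNC_PLUG : WRITE_SYNC, bh);
}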
Signed-off-by: Jens Axboe Signed-off-by: Linus Torvalds --- block/blk-core.c | 2 ++ block/cfq-iosched.c | 4 +++- fs/direct-io.c | 2 +- include/linux/bio.h | 19 +++++++++++-------- include/linux/blkdev.h | 3 +++ include/linux/fs.h | 9 +++++---- 6 files changed, 25 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index c4198f083e5b..25572802dac2 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1128,6 +1128,8 @@ void init_request_from_bio(struct request *req, struct bio *bio) req->cmd_flags |= REQ_UNPLUG; if (bio_rw_meta(bio)) req->cmd_flags |= REQ_RW_META; + if (bio_noidle(bio)) + req->cmd_flags |= REQ_NOIDLE; req->errors = 0; req->hard_sector = req->sector = bio->bi_sector; diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 664ebfd092ec..9e809345f71a 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1992,8 +1992,10 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) } if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq)) cfq_slice_expired(cfqd, 1); - else if (sync && RB_EMPTY_ROOT(&cfqq->sort_list)) + else if (sync && !rq_noidle(rq) && + RB_EMPTY_ROOT(&cfqq->sort_list)) { cfq_arm_slice_timer(cfqd); + } } if (!cfqd->rq_in_driver) diff --git a/fs/direct-io.c b/fs/direct-io.c index b6d43908ff7a..da258e7249cc 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1126,7 +1126,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, int acquire_i_mutex = 0; if (rw & WRITE) - rw = WRITE_SYNC; + rw = WRITE_ODIRECT; if (bdev) bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev)); diff --git a/include/linux/bio.h b/include/linux/bio.h index b05b1d4d17d2..b900d2c67d29 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -145,20 +145,21 @@ struct bio { * bit 2 -- barrier * Insert a serialization point in the IO queue, forcing previously * submitted IO to be completed before this one is issued. - * bit 3 -- synchronous I/O hint: the block layer will unplug immediately - * Note that this does NOT indicate that the IO itself is sync, just - * that the block layer will not postpone issue of this IO by plugging. - * bit 4 -- metadata request + * bit 3 -- synchronous I/O hint. + * bit 4 -- Unplug the device immediately after submitting this bio. + * bit 5 -- metadata request * Used for tracing to differentiate metadata and data IO. May also * get some preferential treatment in the IO scheduler - * bit 5 -- discard sectors + * bit 6 -- discard sectors * Informs the lower level device that this range of sectors is no longer * used by the file system and may thus be freed by the device. Used * for flash based storage. - * bit 6 -- fail fast device errors - * bit 7 -- fail fast transport errors - * bit 8 -- fail fast driver errors + * bit 7 -- fail fast device errors + * bit 8 -- fail fast transport errors + * bit 9 -- fail fast driver errors * Don't want driver retries for any fast fail whatever the reason. + * bit 10 -- Tell the IO scheduler not to wait for more requests after this + one has been submitted, even if it is a SYNC request. 
*/ #define BIO_RW 0 /* Must match RW in req flags (blkdev.h) */ #define BIO_RW_AHEAD 1 /* Must match FAILFAST in req flags */ @@ -170,6 +171,7 @@ struct bio { #define BIO_RW_FAILFAST_DEV 7 #define BIO_RW_FAILFAST_TRANSPORT 8 #define BIO_RW_FAILFAST_DRIVER 9 +#define BIO_RW_NOIDLE 10 #define bio_rw_flagged(bio, flag) ((bio)->bi_rw & (1 << (flag))) @@ -188,6 +190,7 @@ struct bio { #define bio_rw_ahead(bio) bio_rw_flagged(bio, BIO_RW_AHEAD) #define bio_rw_meta(bio) bio_rw_flagged(bio, BIO_RW_META) #define bio_discard(bio) bio_rw_flagged(bio, BIO_RW_DISCARD) +#define bio_noidle(bio) bio_rw_flagged(bio, BIO_RW_NOIDLE) /* * upper 16 bits of bi_rw define the io priority of this bio diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 67dae3bd881c..e03660964e02 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -118,6 +118,7 @@ enum rq_flag_bits { __REQ_COPY_USER, /* contains copies of user pages */ __REQ_INTEGRITY, /* integrity metadata has been remapped */ __REQ_UNPLUG, /* unplug queue on submission */ + __REQ_NOIDLE, /* Don't anticipate more IO after this one */ __REQ_NR_BITS, /* stops here */ }; @@ -145,6 +146,7 @@ enum rq_flag_bits { #define REQ_COPY_USER (1 << __REQ_COPY_USER) #define REQ_INTEGRITY (1 << __REQ_INTEGRITY) #define REQ_UNPLUG (1 << __REQ_UNPLUG) +#define REQ_NOIDLE (1 << __REQ_NOIDLE) #define BLK_MAX_CDB 16 @@ -633,6 +635,7 @@ static inline bool rq_is_sync(struct request *rq) } #define rq_is_meta(rq) ((rq)->cmd_flags & REQ_RW_META) +#define rq_noidle(rq) ((rq)->cmd_flags & REQ_NOIDLE) static inline int blk_queue_full(struct request_queue *q, int sync) { diff --git a/include/linux/fs.h b/include/linux/fs.h index ea0510978f76..cae5720f431c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -95,11 +95,12 @@ struct inodes_stat_t { #define SWRITE 3 /* for ll_rw_block() - wait for buffer lock */ #define READ_SYNC (READ | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG)) #define READ_META (READ | (1 << BIO_RW_META)) -#define WRITE_SYNC (WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG)) -#define WRITE_SYNC_PLUG (WRITE | (1 << BIO_RW_SYNCIO)) -#define SWRITE_SYNC (SWRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG)) +#define WRITE_SYNC_PLUG (WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE)) +#define WRITE_SYNC (WRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG)) +#define WRITE_ODIRECT (WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG)) #define SWRITE_SYNC_PLUG \ - (SWRITE | (1 << BIO_RW_SYNCIO)) + (SWRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE)) +#define SWRITE_SYNC (SWRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG)) #define WRITE_BARRIER (WRITE | (1 << BIO_RW_BARRIER)) #define DISCARD_NOBARRIER (1 << BIO_RW_DISCARD) #define DISCARD_BARRIER ((1 << BIO_RW_DISCARD) | (1 << BIO_RW_BARRIER)) -- cgit v1.2.3 From 1bd465e6b0e2b559db47420fea686507a01cfab0 Mon Sep 17 00:00:00 2001 From: Guennadi Liakhovetski Date: Sat, 10 Jan 2009 18:54:39 +0000 Subject: leds: allow led-drivers to use a variable range of brightness values This patch allows drivers to override the default maximum brightness value of 255. We take care to preserve backwards-compatibility as much as possible, so that user-space ABI doesn't change for existing drivers. LED trigger code has also been updated to use the per-LED maximum. 
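As a hedged sketch (not part of the patch; mydev_write_level() and the "mydev" names are hypothetical), a driver whose hardware offers 16 brightness levels now only needs to fill in max_brightness before registration. The core helper led_set_brightness() in the drivers/leds/leds.h hunk below clamps values to that maximum, and the new read-only max_brightness sysfs attribute exports it to user space:

#include <linux/leds.h>

/* Hypothetical hardware accessor, for illustration only. */
extern void mydev_write_level(unsigned int level);

static void mydev_led_set(struct led_classdev *cdev,
			  enum led_brightness value)
{
	/* value has already been clamped to cdev->max_brightness */
	mydev_write_level(value);
}

static struct led_classdev mydev_led = {
	.name		= "mydev::status",
	.max_brightness	= 15,	/* drivers that leave this 0 still get LED_FULL */
	.brightness_set	= mydev_led_set,
};

/* In probe: led_classdev_register(&pdev->dev, &mydev_led); */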
Signed-off-by: Guennadi Liakhovetski Signed-off-by: Richard Purdie --- drivers/leds/led-class.c | 21 ++++++++++++++++++++- drivers/leds/leds.h | 4 ++-- drivers/leds/ledtrig-default-on.c | 2 +- drivers/leds/ledtrig-heartbeat.c | 4 ++-- drivers/leds/ledtrig-ide-disk.c | 3 ++- drivers/leds/ledtrig-timer.c | 2 +- include/linux/leds.h | 1 + 7 files changed, 29 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/leds/led-class.c b/drivers/leds/led-class.c index 52f82e3ea13a..f2cc13d76810 100644 --- a/drivers/leds/led-class.c +++ b/drivers/leds/led-class.c @@ -64,7 +64,16 @@ static ssize_t led_brightness_store(struct device *dev, return ret; } +static ssize_t led_max_brightness_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct led_classdev *led_cdev = dev_get_drvdata(dev); + + return sprintf(buf, "%u\n", led_cdev->max_brightness); +} + static DEVICE_ATTR(brightness, 0644, led_brightness_show, led_brightness_store); +static DEVICE_ATTR(max_brightness, 0444, led_max_brightness_show, NULL); #ifdef CONFIG_LEDS_TRIGGERS static DEVICE_ATTR(trigger, 0644, led_trigger_show, led_trigger_store); #endif @@ -138,6 +147,13 @@ int led_classdev_register(struct device *parent, struct led_classdev *led_cdev) list_add_tail(&led_cdev->node, &leds_list); up_write(&leds_list_lock); + if (!led_cdev->max_brightness) + led_cdev->max_brightness = LED_FULL; + + rc = device_create_file(led_cdev->dev, &dev_attr_max_brightness); + if (rc) + goto err_out_attr_max; + led_update_brightness(led_cdev); #ifdef CONFIG_LEDS_TRIGGERS @@ -155,9 +171,11 @@ int led_classdev_register(struct device *parent, struct led_classdev *led_cdev) #ifdef CONFIG_LEDS_TRIGGERS err_out_led_list: + device_remove_file(led_cdev->dev, &dev_attr_max_brightness); +#endif +err_out_attr_max: device_remove_file(led_cdev->dev, &dev_attr_brightness); list_del(&led_cdev->node); -#endif err_out: device_unregister(led_cdev->dev); return rc; @@ -172,6 +190,7 @@ EXPORT_SYMBOL_GPL(led_classdev_register); */ void led_classdev_unregister(struct led_classdev *led_cdev) { + device_remove_file(led_cdev->dev, &dev_attr_max_brightness); device_remove_file(led_cdev->dev, &dev_attr_brightness); #ifdef CONFIG_LEDS_TRIGGERS device_remove_file(led_cdev->dev, &dev_attr_trigger); diff --git a/drivers/leds/leds.h b/drivers/leds/leds.h index 5edbf52c4fa7..2dd8ecbfdc31 100644 --- a/drivers/leds/leds.h +++ b/drivers/leds/leds.h @@ -20,8 +20,8 @@ static inline void led_set_brightness(struct led_classdev *led_cdev, enum led_brightness value) { - if (value > LED_FULL) - value = LED_FULL; + if (value > led_cdev->max_brightness) + value = led_cdev->max_brightness; led_cdev->brightness = value; if (!(led_cdev->flags & LED_SUSPENDED)) led_cdev->brightness_set(led_cdev, value); diff --git a/drivers/leds/ledtrig-default-on.c b/drivers/leds/ledtrig-default-on.c index 92995e40cfa4..a4ef54b9d508 100644 --- a/drivers/leds/ledtrig-default-on.c +++ b/drivers/leds/ledtrig-default-on.c @@ -19,7 +19,7 @@ static void defon_trig_activate(struct led_classdev *led_cdev) { - led_set_brightness(led_cdev, LED_FULL); + led_set_brightness(led_cdev, led_cdev->max_brightness); } static struct led_trigger defon_led_trigger = { diff --git a/drivers/leds/ledtrig-heartbeat.c b/drivers/leds/ledtrig-heartbeat.c index 4bf8cec8b8c1..c1c1ea6f817b 100644 --- a/drivers/leds/ledtrig-heartbeat.c +++ b/drivers/leds/ledtrig-heartbeat.c @@ -47,7 +47,7 @@ static void led_heartbeat_function(unsigned long data) msecs_to_jiffies(heartbeat_data->period); delay = 
msecs_to_jiffies(70); heartbeat_data->phase++; - brightness = LED_FULL; + brightness = led_cdev->max_brightness; break; case 1: delay = heartbeat_data->period / 4 - msecs_to_jiffies(70); @@ -56,7 +56,7 @@ static void led_heartbeat_function(unsigned long data) case 2: delay = msecs_to_jiffies(70); heartbeat_data->phase++; - brightness = LED_FULL; + brightness = led_cdev->max_brightness; break; default: delay = heartbeat_data->period - heartbeat_data->period / 4 - diff --git a/drivers/leds/ledtrig-ide-disk.c b/drivers/leds/ledtrig-ide-disk.c index 883a577b1b97..ec099fcbcb00 100644 --- a/drivers/leds/ledtrig-ide-disk.c +++ b/drivers/leds/ledtrig-ide-disk.c @@ -37,7 +37,8 @@ static void ledtrig_ide_timerfunc(unsigned long data) { if (ide_lastactivity != ide_activity) { ide_lastactivity = ide_activity; - led_trigger_event(ledtrig_ide, LED_FULL); + /* INT_MAX will set each LED to its maximum brightness */ + led_trigger_event(ledtrig_ide, INT_MAX); mod_timer(&ledtrig_ide_timer, jiffies + msecs_to_jiffies(10)); } else { led_trigger_event(ledtrig_ide, LED_OFF); diff --git a/drivers/leds/ledtrig-timer.c b/drivers/leds/ledtrig-timer.c index 3d6531396dda..3b83406de752 100644 --- a/drivers/leds/ledtrig-timer.c +++ b/drivers/leds/ledtrig-timer.c @@ -166,7 +166,7 @@ static void timer_trig_activate(struct led_classdev *led_cdev) timer_data->brightness_on = led_get_brightness(led_cdev); if (timer_data->brightness_on == LED_OFF) - timer_data->brightness_on = LED_FULL; + timer_data->brightness_on = led_cdev->max_brightness; led_cdev->trigger_data = timer_data; init_timer(&timer_data->timer); diff --git a/include/linux/leds.h b/include/linux/leds.h index 24489da701e3..17d277e0c4a6 100644 --- a/include/linux/leds.h +++ b/include/linux/leds.h @@ -30,6 +30,7 @@ enum led_brightness { struct led_classdev { const char *name; int brightness; + int max_brightness; int flags; /* Lower 16 bits reflect status */ -- cgit v1.2.3 From 41c42ff5dbe29b7b826e6736f960959c76e7acf0 Mon Sep 17 00:00:00 2001 From: Luotao Fu Date: Wed, 11 Feb 2009 13:24:40 -0800 Subject: leds: simple driver for pwm driven LEDs Add a simple driver for pwm driver LEDs. pwm_id and period can be defined in board file. It is developed for pxa, however it is probably generic enough to be used on other platforms with pwm. Signed-off-by: Luotao Fu Signed-off-by: Andrew Morton Signed-off-by: Richard Purdie --- drivers/leds/Kconfig | 6 ++ drivers/leds/Makefile | 1 + drivers/leds/leds-pwm.c | 153 +++++++++++++++++++++++++++++++++++++++++++++++ include/linux/leds_pwm.h | 21 +++++++ 4 files changed, 181 insertions(+) create mode 100644 drivers/leds/leds-pwm.c create mode 100644 include/linux/leds_pwm.h (limited to 'include/linux') diff --git a/drivers/leds/Kconfig b/drivers/leds/Kconfig index 16a94088beee..89ea7ef39fe4 100644 --- a/drivers/leds/Kconfig +++ b/drivers/leds/Kconfig @@ -197,6 +197,12 @@ config LEDS_DAC124S085 This option enables support for DAC124S085 SPI DAC from NatSemi, which can be used to control up to four LEDs. 
+config LEDS_PWM + tristate "PWM driven LED Support" + depends on LEDS_CLASS && HAVE_PWM + help + This option enables support for pwm driven LEDs + comment "LED Triggers" config LEDS_TRIGGERS diff --git a/drivers/leds/Makefile b/drivers/leds/Makefile index 4157e86b9747..584a3f6c2530 100644 --- a/drivers/leds/Makefile +++ b/drivers/leds/Makefile @@ -24,6 +24,7 @@ obj-$(CONFIG_LEDS_FSG) += leds-fsg.o obj-$(CONFIG_LEDS_PCA955X) += leds-pca955x.o obj-$(CONFIG_LEDS_DA903X) += leds-da903x.o obj-$(CONFIG_LEDS_WM8350) += leds-wm8350.o +obj-$(CONFIG_LEDS_PWM) += leds-pwm.o # LED SPI Drivers obj-$(CONFIG_LEDS_DAC124S085) += leds-dac124s085.o diff --git a/drivers/leds/leds-pwm.c b/drivers/leds/leds-pwm.c new file mode 100644 index 000000000000..cdfdc8714e10 --- /dev/null +++ b/drivers/leds/leds-pwm.c @@ -0,0 +1,153 @@ +/* + * linux/drivers/leds-pwm.c + * + * simple PWM based LED control + * + * Copyright 2009 Luotao Fu @ Pengutronix (l.fu@pengutronix.de) + * + * based on leds-gpio.c by Raphael Assenat + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct led_pwm_data { + struct led_classdev cdev; + struct pwm_device *pwm; + unsigned int active_low; + unsigned int period; + unsigned int max_brightness; +}; + +static void led_pwm_set(struct led_classdev *led_cdev, + enum led_brightness brightness) +{ + struct led_pwm_data *led_dat = + container_of(led_cdev, struct led_pwm_data, cdev); + unsigned int max = led_dat->max_brightness; + unsigned int period = led_dat->period; + + if (brightness == 0) { + pwm_config(led_dat->pwm, 0, period); + pwm_disable(led_dat->pwm); + } else { + pwm_config(led_dat->pwm, brightness * period / max, period); + pwm_enable(led_dat->pwm); + } +} + +static int led_pwm_probe(struct platform_device *pdev) +{ + struct led_pwm_platform_data *pdata = pdev->dev.platform_data; + struct led_pwm *cur_led; + struct led_pwm_data *leds_data, *led_dat; + int i, ret = 0; + + if (!pdata) + return -EBUSY; + + leds_data = kzalloc(sizeof(struct led_pwm_data) * pdata->num_leds, + GFP_KERNEL); + if (!leds_data) + return -ENOMEM; + + for (i = 0; i < pdata->num_leds; i++) { + cur_led = &pdata->leds[i]; + led_dat = &leds_data[i]; + + led_dat->pwm = pwm_request(cur_led->pwm_id, + cur_led->name); + if (IS_ERR(led_dat->pwm)) { + dev_err(&pdev->dev, "unable to request PWM %d\n", + cur_led->pwm_id); + goto err; + } + + led_dat->cdev.name = cur_led->name; + led_dat->cdev.default_trigger = cur_led->default_trigger; + led_dat->active_low = cur_led->active_low; + led_dat->max_brightness = cur_led->max_brightness; + led_dat->period = cur_led->pwm_period_ns; + led_dat->cdev.brightness_set = led_pwm_set; + led_dat->cdev.brightness = LED_OFF; + led_dat->cdev.flags |= LED_CORE_SUSPENDRESUME; + + ret = led_classdev_register(&pdev->dev, &led_dat->cdev); + if (ret < 0) { + pwm_free(led_dat->pwm); + goto err; + } + } + + platform_set_drvdata(pdev, leds_data); + + return 0; + +err: + if (i > 0) { + for (i = i - 1; i >= 0; i--) { + led_classdev_unregister(&leds_data[i].cdev); + pwm_free(leds_data[i].pwm); + } + } + + kfree(leds_data); + + return ret; +} + +static int __devexit led_pwm_remove(struct platform_device *pdev) +{ + int i; + struct led_pwm_platform_data *pdata = pdev->dev.platform_data; + struct led_pwm_data *leds_data; + + leds_data = platform_get_drvdata(pdev); + + 
for (i = 0; i < pdata->num_leds; i++) { + led_classdev_unregister(&leds_data[i].cdev); + pwm_free(leds_data[i].pwm); + } + + kfree(leds_data); + + return 0; +} + +static struct platform_driver led_pwm_driver = { + .probe = led_pwm_probe, + .remove = __devexit_p(led_pwm_remove), + .driver = { + .name = "leds_pwm", + .owner = THIS_MODULE, + }, +}; + +static int __init led_pwm_init(void) +{ + return platform_driver_register(&led_pwm_driver); +} + +static void __exit led_pwm_exit(void) +{ + platform_driver_unregister(&led_pwm_driver); +} + +module_init(led_pwm_init); +module_exit(led_pwm_exit); + +MODULE_AUTHOR("Luotao Fu "); +MODULE_DESCRIPTION("PWM LED driver for PXA"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:leds-pwm"); diff --git a/include/linux/leds_pwm.h b/include/linux/leds_pwm.h new file mode 100644 index 000000000000..33a071167489 --- /dev/null +++ b/include/linux/leds_pwm.h @@ -0,0 +1,21 @@ +/* + * PWM LED driver data - see drivers/leds/leds-pwm.c + */ +#ifndef __LINUX_LEDS_PWM_H +#define __LINUX_LEDS_PWM_H + +struct led_pwm { + const char *name; + const char *default_trigger; + unsigned pwm_id; + u8 active_low; + unsigned max_brightness; + unsigned pwm_period_ns; +}; + +struct led_pwm_platform_data { + int num_leds; + struct led_pwm *leds; +}; + +#endif -- cgit v1.2.3 From defb512d2576992c63ba1c18c24eea31cfeaa26e Mon Sep 17 00:00:00 2001 From: Richard Purdie Date: Tue, 17 Feb 2009 15:04:07 +0000 Subject: leds: Add suspend/resume state flags to leds-gpio Add an option to preserve LED state when suspending/resuming to the LED gpio driver. Based on a suggestion from Robert Jarzmik. Tested-by: Robert Jarzmik Signed-off-by: Richard Purdie --- drivers/leds/leds-gpio.c | 3 ++- include/linux/leds.h | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/leds/leds-gpio.c b/drivers/leds/leds-gpio.c index 0daa2d21cbde..8fa352ac20f8 100644 --- a/drivers/leds/leds-gpio.c +++ b/drivers/leds/leds-gpio.c @@ -93,7 +93,8 @@ static int __devinit create_gpio_led(const struct gpio_led *template, } led_dat->cdev.brightness_set = gpio_led_set; led_dat->cdev.brightness = LED_OFF; - led_dat->cdev.flags |= LED_CORE_SUSPENDRESUME; + if (!template->retain_state_suspended) + led_dat->cdev.flags |= LED_CORE_SUSPENDRESUME; ret = gpio_direction_output(led_dat->gpio, led_dat->active_low); if (ret < 0) diff --git a/include/linux/leds.h b/include/linux/leds.h index 17d277e0c4a6..376fe07732ea 100644 --- a/include/linux/leds.h +++ b/include/linux/leds.h @@ -141,7 +141,8 @@ struct gpio_led { const char *name; const char *default_trigger; unsigned gpio; - u8 active_low; + u8 active_low : 1; + u8 retain_state_suspended : 1; }; struct gpio_led_platform_data { -- cgit v1.2.3 From 0b56129be72c38179697b7441aacbe133d515ff9 Mon Sep 17 00:00:00 2001 From: Kim Kyuwon Date: Wed, 4 Mar 2009 11:59:29 -0800 Subject: leds: add BD2802GU LED driver ROHM BD2802GU is a RGB LED controller attached to i2c bus and specifically engineered for decoration purposes. This RGB controller incorporates lighting patterns and illuminates. This driver is designed to minimize power consumption, so when there is no emitting LED, it enters to reset state. And because the BD2802GU has lots of features that can't be covered by the current LED framework, it provides Advanced Configuration Function(ADF) mode, so that user applications can set registers of BD2802GU directly. 
Here are basic usage examples : ; to turn on LED (not blink) $ echo 1 > /sys/class/leds/led1_R/brightness ; to blink LED $ echo timer > /sys/class/leds/led1_R/trigger $ echo 1 > /sys/class/leds/led1_R/delay_on $ echo 1 > /sys/class/leds/led1_R/delay_off ; to turn off LED $ echo 0 > /sys/class/leds/led1_R/brightness [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Kim Kyuwon Signed-off-by: Andrew Morton Signed-off-by: Richard Purdie --- drivers/leds/Kconfig | 7 + drivers/leds/Makefile | 1 + drivers/leds/leds-bd2802.c | 765 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/leds-bd2802.h | 26 ++ 4 files changed, 799 insertions(+) create mode 100644 drivers/leds/leds-bd2802.c create mode 100644 include/linux/leds-bd2802.h (limited to 'include/linux') diff --git a/drivers/leds/Kconfig b/drivers/leds/Kconfig index db84d8f616dc..b77baecd50ca 100644 --- a/drivers/leds/Kconfig +++ b/drivers/leds/Kconfig @@ -210,6 +210,13 @@ config LEDS_PWM help This option enables support for pwm driven LEDs +config LEDS_BD2802 + tristate "LED driver for BD2802 RGB LED" + depends on LEDS_CLASS && I2C + help + This option enables support for BD2802GU RGB LED driver chips + accessed via the I2C bus. + comment "LED Triggers" config LEDS_TRIGGERS diff --git a/drivers/leds/Makefile b/drivers/leds/Makefile index 291aea22bf0c..2d41c4dcf92f 100644 --- a/drivers/leds/Makefile +++ b/drivers/leds/Makefile @@ -6,6 +6,7 @@ obj-$(CONFIG_LEDS_TRIGGERS) += led-triggers.o # LED Platform Drivers obj-$(CONFIG_LEDS_ATMEL_PWM) += leds-atmel-pwm.o +obj-$(CONFIG_LEDS_BD2802) += leds-bd2802.o obj-$(CONFIG_LEDS_LOCOMO) += leds-locomo.o obj-$(CONFIG_LEDS_MIKROTIK_RB532) += leds-rb532.o obj-$(CONFIG_LEDS_S3C24XX) += leds-s3c24xx.o diff --git a/drivers/leds/leds-bd2802.c b/drivers/leds/leds-bd2802.c new file mode 100644 index 000000000000..4149ecb3a9b2 --- /dev/null +++ b/drivers/leds/leds-bd2802.c @@ -0,0 +1,765 @@ +/* + * leds-bd2802.c - RGB LED Driver + * + * Copyright (C) 2009 Samsung Electronics + * Kim Kyuwon + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Datasheet: http://www.rohm.com/products/databook/driver/pdf/bd2802gu-e.pdf + * + */ + +#include +#include +#include +#include +#include +#include + + +#define LED_CTL(rgb2en, rgb1en) ((rgb2en) << 4 | ((rgb1en) << 0)) + +#define BD2802_LED_OFFSET 0xa +#define BD2802_COLOR_OFFSET 0x3 + +#define BD2802_REG_CLKSETUP 0x00 +#define BD2802_REG_CONTROL 0x01 +#define BD2802_REG_HOURSETUP 0x02 +#define BD2802_REG_CURRENT1SETUP 0x03 +#define BD2802_REG_CURRENT2SETUP 0x04 +#define BD2802_REG_WAVEPATTERN 0x05 + +#define BD2802_CURRENT_032 0x10 /* 3.2mA */ +#define BD2802_CURRENT_000 0x00 /* 0.0mA */ + +#define BD2802_PATTERN_FULL 0x07 +#define BD2802_PATTERN_HALF 0x03 + +enum led_ids { + LED1, + LED2, + LED_NUM, +}; + +enum led_colors { + RED, + GREEN, + BLUE, +}; + +enum led_bits { + BD2802_OFF, + BD2802_BLINK, + BD2802_ON, +}; + +/* + * State '0' : 'off' + * State '1' : 'blink' + * State '2' : 'on'. + */ +struct led_state { + unsigned r:2; + unsigned g:2; + unsigned b:2; +}; + +struct bd2802_led { + struct bd2802_led_platform_data *pdata; + struct i2c_client *client; + struct rw_semaphore rwsem; + struct work_struct work; + + struct led_state led[2]; + + /* + * Making led_classdev as array is not recommended, because array + * members prevent using 'container_of' macro. So repetitive works + * are needed. 
+ */ + struct led_classdev cdev_led1r; + struct led_classdev cdev_led1g; + struct led_classdev cdev_led1b; + struct led_classdev cdev_led2r; + struct led_classdev cdev_led2g; + struct led_classdev cdev_led2b; + + /* + * Advanced Configuration Function(ADF) mode: + * In ADF mode, user can set registers of BD2802GU directly, + * therefore BD2802GU doesn't enter reset state. + */ + int adf_on; + + enum led_ids led_id; + enum led_colors color; + enum led_bits state; +}; + + +/*--------------------------------------------------------------*/ +/* BD2802GU helper functions */ +/*--------------------------------------------------------------*/ + +static inline int bd2802_is_rgb_off(struct bd2802_led *led, enum led_ids id, + enum led_colors color) +{ + switch (color) { + case RED: + return !led->led[id].r; + case GREEN: + return !led->led[id].g; + case BLUE: + return !led->led[id].b; + default: + dev_err(&led->client->dev, "%s: Invalid color\n", __func__); + return -EINVAL; + } +} + +static inline int bd2802_is_led_off(struct bd2802_led *led, enum led_ids id) +{ + if (led->led[id].r || led->led[id].g || led->led[id].b) + return 0; + + return 1; +} + +static inline int bd2802_is_all_off(struct bd2802_led *led) +{ + int i; + + for (i = 0; i < LED_NUM; i++) + if (!bd2802_is_led_off(led, i)) + return 0; + + return 1; +} + +static inline u8 bd2802_get_base_offset(enum led_ids id, enum led_colors color) +{ + return id * BD2802_LED_OFFSET + color * BD2802_COLOR_OFFSET; +} + +static inline u8 bd2802_get_reg_addr(enum led_ids id, enum led_colors color, + u8 reg_offset) +{ + return reg_offset + bd2802_get_base_offset(id, color); +} + + +/*--------------------------------------------------------------*/ +/* BD2802GU core functions */ +/*--------------------------------------------------------------*/ + +static int bd2802_write_byte(struct i2c_client *client, u8 reg, u8 val) +{ + int ret = i2c_smbus_write_byte_data(client, reg, val); + if (ret >= 0) + return 0; + + dev_err(&client->dev, "%s: reg 0x%x, val 0x%x, err %d\n", + __func__, reg, val, ret); + + return ret; +} + +static void bd2802_update_state(struct bd2802_led *led, enum led_ids id, + enum led_colors color, enum led_bits led_bit) +{ + int i; + u8 value; + + for (i = 0; i < LED_NUM; i++) { + if (i == id) { + switch (color) { + case RED: + led->led[i].r = led_bit; + break; + case GREEN: + led->led[i].g = led_bit; + break; + case BLUE: + led->led[i].b = led_bit; + break; + default: + dev_err(&led->client->dev, + "%s: Invalid color\n", __func__); + return; + } + } + } + + if (led_bit == BD2802_BLINK || led_bit == BD2802_ON) + return; + + if (!bd2802_is_led_off(led, id)) + return; + + if (bd2802_is_all_off(led) && !led->adf_on) { + gpio_set_value(led->pdata->reset_gpio, 0); + return; + } + + /* + * In this case, other led is turned on, and current led is turned + * off. So set RGB LED Control register to stop the current RGB LED + */ + value = (id == LED1) ? 
LED_CTL(1, 0) : LED_CTL(0, 1); + bd2802_write_byte(led->client, BD2802_REG_CONTROL, value); +} + +static void bd2802_configure(struct bd2802_led *led) +{ + struct bd2802_led_platform_data *pdata = led->pdata; + u8 reg; + + reg = bd2802_get_reg_addr(LED1, RED, BD2802_REG_HOURSETUP); + bd2802_write_byte(led->client, reg, pdata->rgb_time); + + reg = bd2802_get_reg_addr(LED2, RED, BD2802_REG_HOURSETUP); + bd2802_write_byte(led->client, reg, pdata->rgb_time); +} + +static void bd2802_reset_cancel(struct bd2802_led *led) +{ + gpio_set_value(led->pdata->reset_gpio, 1); + udelay(100); + bd2802_configure(led); +} + +static void bd2802_enable(struct bd2802_led *led, enum led_ids id) +{ + enum led_ids other_led = (id == LED1) ? LED2 : LED1; + u8 value, other_led_on; + + other_led_on = !bd2802_is_led_off(led, other_led); + if (id == LED1) + value = LED_CTL(other_led_on, 1); + else + value = LED_CTL(1 , other_led_on); + + bd2802_write_byte(led->client, BD2802_REG_CONTROL, value); +} + +static void bd2802_set_on(struct bd2802_led *led, enum led_ids id, + enum led_colors color) +{ + u8 reg; + + if (bd2802_is_all_off(led) && !led->adf_on) + bd2802_reset_cancel(led); + + reg = bd2802_get_reg_addr(id, color, BD2802_REG_CURRENT1SETUP); + bd2802_write_byte(led->client, reg, BD2802_CURRENT_032); + reg = bd2802_get_reg_addr(id, color, BD2802_REG_CURRENT2SETUP); + bd2802_write_byte(led->client, reg, BD2802_CURRENT_000); + reg = bd2802_get_reg_addr(id, color, BD2802_REG_WAVEPATTERN); + bd2802_write_byte(led->client, reg, BD2802_PATTERN_FULL); + + bd2802_enable(led, id); + bd2802_update_state(led, id, color, BD2802_ON); +} + +static void bd2802_set_blink(struct bd2802_led *led, enum led_ids id, + enum led_colors color) +{ + u8 reg; + + if (bd2802_is_all_off(led) && !led->adf_on) + bd2802_reset_cancel(led); + + reg = bd2802_get_reg_addr(id, color, BD2802_REG_CURRENT1SETUP); + bd2802_write_byte(led->client, reg, BD2802_CURRENT_000); + reg = bd2802_get_reg_addr(id, color, BD2802_REG_CURRENT2SETUP); + bd2802_write_byte(led->client, reg, BD2802_CURRENT_032); + reg = bd2802_get_reg_addr(id, color, BD2802_REG_WAVEPATTERN); + bd2802_write_byte(led->client, reg, BD2802_PATTERN_HALF); + + bd2802_enable(led, id); + bd2802_update_state(led, id, color, BD2802_BLINK); +} + +static void bd2802_turn_on(struct bd2802_led *led, enum led_ids id, + enum led_colors color, enum led_bits led_bit) +{ + if (led_bit == BD2802_OFF) { + dev_err(&led->client->dev, + "Only 'blink' and 'on' are allowed\n"); + return; + } + + if (led_bit == BD2802_BLINK) + bd2802_set_blink(led, id, color); + else + bd2802_set_on(led, id, color); +} + +static void bd2802_turn_off(struct bd2802_led *led, enum led_ids id, + enum led_colors color) +{ + u8 reg; + + if (bd2802_is_rgb_off(led, id, color)) + return; + + reg = bd2802_get_reg_addr(id, color, BD2802_REG_CURRENT1SETUP); + bd2802_write_byte(led->client, reg, BD2802_CURRENT_000); + reg = bd2802_get_reg_addr(id, color, BD2802_REG_CURRENT2SETUP); + bd2802_write_byte(led->client, reg, BD2802_CURRENT_000); + + bd2802_update_state(led, id, color, BD2802_OFF); +} + +static void bd2802_restore_state(struct bd2802_led *led) +{ + int i; + + for (i = 0; i < LED_NUM; i++) { + if (led->led[i].r) + bd2802_turn_on(led, i, RED, led->led[i].r); + if (led->led[i].g) + bd2802_turn_on(led, i, GREEN, led->led[i].g); + if (led->led[i].b) + bd2802_turn_on(led, i, BLUE, led->led[i].b); + } +} + +#define BD2802_SET_REGISTER(reg_addr, reg_name) \ +static ssize_t bd2802_store_reg##reg_addr(struct device *dev, \ + struct 
device_attribute *attr, const char *buf, size_t count) \ +{ \ + struct bd2802_led *led = i2c_get_clientdata(to_i2c_client(dev));\ + unsigned long val; \ + int ret; \ + if (!count) \ + return -EINVAL; \ + ret = strict_strtoul(buf, 16, &val); \ + if (ret) \ + return ret; \ + down_write(&led->rwsem); \ + bd2802_write_byte(led->client, reg_addr, (u8) val); \ + up_write(&led->rwsem); \ + return count; \ +} \ +static struct device_attribute bd2802_reg##reg_addr##_attr = { \ + .attr = {.name = reg_name, .mode = 0644, .owner = THIS_MODULE}, \ + .store = bd2802_store_reg##reg_addr, \ +}; + +BD2802_SET_REGISTER(0x00, "0x00"); +BD2802_SET_REGISTER(0x01, "0x01"); +BD2802_SET_REGISTER(0x02, "0x02"); +BD2802_SET_REGISTER(0x03, "0x03"); +BD2802_SET_REGISTER(0x04, "0x04"); +BD2802_SET_REGISTER(0x05, "0x05"); +BD2802_SET_REGISTER(0x06, "0x06"); +BD2802_SET_REGISTER(0x07, "0x07"); +BD2802_SET_REGISTER(0x08, "0x08"); +BD2802_SET_REGISTER(0x09, "0x09"); +BD2802_SET_REGISTER(0x0a, "0x0a"); +BD2802_SET_REGISTER(0x0b, "0x0b"); +BD2802_SET_REGISTER(0x0c, "0x0c"); +BD2802_SET_REGISTER(0x0d, "0x0d"); +BD2802_SET_REGISTER(0x0e, "0x0e"); +BD2802_SET_REGISTER(0x0f, "0x0f"); +BD2802_SET_REGISTER(0x10, "0x10"); +BD2802_SET_REGISTER(0x11, "0x11"); +BD2802_SET_REGISTER(0x12, "0x12"); +BD2802_SET_REGISTER(0x13, "0x13"); +BD2802_SET_REGISTER(0x14, "0x14"); +BD2802_SET_REGISTER(0x15, "0x15"); + +static struct device_attribute *bd2802_addr_attributes[] = { + &bd2802_reg0x00_attr, + &bd2802_reg0x01_attr, + &bd2802_reg0x02_attr, + &bd2802_reg0x03_attr, + &bd2802_reg0x04_attr, + &bd2802_reg0x05_attr, + &bd2802_reg0x06_attr, + &bd2802_reg0x07_attr, + &bd2802_reg0x08_attr, + &bd2802_reg0x09_attr, + &bd2802_reg0x0a_attr, + &bd2802_reg0x0b_attr, + &bd2802_reg0x0c_attr, + &bd2802_reg0x0d_attr, + &bd2802_reg0x0e_attr, + &bd2802_reg0x0f_attr, + &bd2802_reg0x10_attr, + &bd2802_reg0x11_attr, + &bd2802_reg0x12_attr, + &bd2802_reg0x13_attr, + &bd2802_reg0x14_attr, + &bd2802_reg0x15_attr, +}; + +static void bd2802_enable_adv_conf(struct bd2802_led *led) +{ + int i, ret; + + for (i = 0; i < ARRAY_SIZE(bd2802_addr_attributes); i++) { + ret = device_create_file(&led->client->dev, + bd2802_addr_attributes[i]); + if (ret) { + dev_err(&led->client->dev, "failed to sysfs file %s\n", + bd2802_addr_attributes[i]->attr.name); + goto failed_remove_files; + } + } + + if (bd2802_is_all_off(led)) + bd2802_reset_cancel(led); + + led->adf_on = 1; + + return; + +failed_remove_files: + for (i--; i >= 0; i--) + device_remove_file(&led->client->dev, + bd2802_addr_attributes[i]); +} + +static void bd2802_disable_adv_conf(struct bd2802_led *led) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(bd2802_addr_attributes); i++) + device_remove_file(&led->client->dev, + bd2802_addr_attributes[i]); + + if (bd2802_is_all_off(led)) + gpio_set_value(led->pdata->reset_gpio, 0); + + led->adf_on = 0; +} + +static ssize_t bd2802_show_adv_conf(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct bd2802_led *led = i2c_get_clientdata(to_i2c_client(dev)); + ssize_t ret; + + down_read(&led->rwsem); + if (led->adf_on) + ret = sprintf(buf, "on\n"); + else + ret = sprintf(buf, "off\n"); + up_read(&led->rwsem); + + return ret; +} + +static ssize_t bd2802_store_adv_conf(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct bd2802_led *led = i2c_get_clientdata(to_i2c_client(dev)); + + if (!count) + return -EINVAL; + + down_write(&led->rwsem); + if (!led->adf_on && !strncmp(buf, "on", 2)) + bd2802_enable_adv_conf(led); + else if 
(led->adf_on && !strncmp(buf, "off", 3)) + bd2802_disable_adv_conf(led); + up_write(&led->rwsem); + + return count; +} + +static struct device_attribute bd2802_adv_conf_attr = { + .attr = { + .name = "advanced_configuration", + .mode = 0644, + .owner = THIS_MODULE + }, + .show = bd2802_show_adv_conf, + .store = bd2802_store_adv_conf, +}; + +static void bd2802_led_work(struct work_struct *work) +{ + struct bd2802_led *led = container_of(work, struct bd2802_led, work); + + if (led->state) + bd2802_turn_on(led, led->led_id, led->color, led->state); + else + bd2802_turn_off(led, led->led_id, led->color); +} + +#define BD2802_CONTROL_RGBS(name, id, clr) \ +static void bd2802_set_##name##_brightness(struct led_classdev *led_cdev,\ + enum led_brightness value) \ +{ \ + struct bd2802_led *led = \ + container_of(led_cdev, struct bd2802_led, cdev_##name); \ + led->led_id = id; \ + led->color = clr; \ + if (value == LED_OFF) \ + led->state = BD2802_OFF; \ + else \ + led->state = BD2802_ON; \ + schedule_work(&led->work); \ +} \ +static int bd2802_set_##name##_blink(struct led_classdev *led_cdev, \ + unsigned long *delay_on, unsigned long *delay_off) \ +{ \ + struct bd2802_led *led = \ + container_of(led_cdev, struct bd2802_led, cdev_##name); \ + if (*delay_on == 0 || *delay_off == 0) \ + return -EINVAL; \ + led->led_id = id; \ + led->color = clr; \ + led->state = BD2802_BLINK; \ + schedule_work(&led->work); \ + return 0; \ +} + +BD2802_CONTROL_RGBS(led1r, LED1, RED); +BD2802_CONTROL_RGBS(led1g, LED1, GREEN); +BD2802_CONTROL_RGBS(led1b, LED1, BLUE); +BD2802_CONTROL_RGBS(led2r, LED2, RED); +BD2802_CONTROL_RGBS(led2g, LED2, GREEN); +BD2802_CONTROL_RGBS(led2b, LED2, BLUE); + +static int bd2802_register_led_classdev(struct bd2802_led *led) +{ + int ret; + + INIT_WORK(&led->work, bd2802_led_work); + + led->cdev_led1r.name = "led1_R"; + led->cdev_led1r.brightness = LED_OFF; + led->cdev_led1r.brightness_set = bd2802_set_led1r_brightness; + led->cdev_led1r.blink_set = bd2802_set_led1r_blink; + led->cdev_led1r.flags |= LED_CORE_SUSPENDRESUME; + + ret = led_classdev_register(&led->client->dev, &led->cdev_led1r); + if (ret < 0) { + dev_err(&led->client->dev, "couldn't register LED %s\n", + led->cdev_led1r.name); + goto failed_unregister_led1_R; + } + + led->cdev_led1g.name = "led1_G"; + led->cdev_led1g.brightness = LED_OFF; + led->cdev_led1g.brightness_set = bd2802_set_led1g_brightness; + led->cdev_led1g.blink_set = bd2802_set_led1g_blink; + led->cdev_led1g.flags |= LED_CORE_SUSPENDRESUME; + + ret = led_classdev_register(&led->client->dev, &led->cdev_led1g); + if (ret < 0) { + dev_err(&led->client->dev, "couldn't register LED %s\n", + led->cdev_led1g.name); + goto failed_unregister_led1_G; + } + + led->cdev_led1b.name = "led1_B"; + led->cdev_led1b.brightness = LED_OFF; + led->cdev_led1b.brightness_set = bd2802_set_led1b_brightness; + led->cdev_led1b.blink_set = bd2802_set_led1b_blink; + led->cdev_led1b.flags |= LED_CORE_SUSPENDRESUME; + + ret = led_classdev_register(&led->client->dev, &led->cdev_led1b); + if (ret < 0) { + dev_err(&led->client->dev, "couldn't register LED %s\n", + led->cdev_led1b.name); + goto failed_unregister_led1_B; + } + + led->cdev_led2r.name = "led2_R"; + led->cdev_led2r.brightness = LED_OFF; + led->cdev_led2r.brightness_set = bd2802_set_led2r_brightness; + led->cdev_led2r.blink_set = bd2802_set_led2r_blink; + led->cdev_led2r.flags |= LED_CORE_SUSPENDRESUME; + + ret = led_classdev_register(&led->client->dev, &led->cdev_led2r); + if (ret < 0) { + dev_err(&led->client->dev, "couldn't register 
LED %s\n", + led->cdev_led2r.name); + goto failed_unregister_led2_R; + } + + led->cdev_led2g.name = "led2_G"; + led->cdev_led2g.brightness = LED_OFF; + led->cdev_led2g.brightness_set = bd2802_set_led2g_brightness; + led->cdev_led2g.blink_set = bd2802_set_led2g_blink; + led->cdev_led2g.flags |= LED_CORE_SUSPENDRESUME; + + ret = led_classdev_register(&led->client->dev, &led->cdev_led2g); + if (ret < 0) { + dev_err(&led->client->dev, "couldn't register LED %s\n", + led->cdev_led2g.name); + goto failed_unregister_led2_G; + } + + led->cdev_led2b.name = "led2_B"; + led->cdev_led2b.brightness = LED_OFF; + led->cdev_led2b.brightness_set = bd2802_set_led2b_brightness; + led->cdev_led2b.blink_set = bd2802_set_led2b_blink; + led->cdev_led2b.flags |= LED_CORE_SUSPENDRESUME; + + ret = led_classdev_register(&led->client->dev, &led->cdev_led2b); + if (ret < 0) { + dev_err(&led->client->dev, "couldn't register LED %s\n", + led->cdev_led2b.name); + goto failed_unregister_led2_B; + } + + return 0; + +failed_unregister_led2_B: + led_classdev_unregister(&led->cdev_led2g); +failed_unregister_led2_G: + led_classdev_unregister(&led->cdev_led2r); +failed_unregister_led2_R: + led_classdev_unregister(&led->cdev_led1b); +failed_unregister_led1_B: + led_classdev_unregister(&led->cdev_led1g); +failed_unregister_led1_G: + led_classdev_unregister(&led->cdev_led1r); +failed_unregister_led1_R: + + return ret; +} + +static void bd2802_unregister_led_classdev(struct bd2802_led *led) +{ + cancel_work_sync(&led->work); + led_classdev_unregister(&led->cdev_led1r); +} + +static int __devinit bd2802_probe(struct i2c_client *client, + const struct i2c_device_id *id) +{ + struct bd2802_led *led; + struct bd2802_led_platform_data *pdata; + int ret; + + led = kzalloc(sizeof(struct bd2802_led), GFP_KERNEL); + if (!led) { + dev_err(&client->dev, "failed to allocate driver data\n"); + return -ENOMEM; + } + + led->client = client; + pdata = led->pdata = client->dev.platform_data; + i2c_set_clientdata(client, led); + + /* Configure RESET GPIO (L: RESET, H: RESET cancel) */ + gpio_request(pdata->reset_gpio, "RGB_RESETB"); + gpio_direction_output(pdata->reset_gpio, 1); + + /* Tacss = min 0.1ms */ + udelay(100); + + /* Detect BD2802GU */ + ret = bd2802_write_byte(client, BD2802_REG_CLKSETUP, 0x00); + if (ret < 0) { + dev_err(&client->dev, "failed to detect device\n"); + goto failed_free; + } else + dev_info(&client->dev, "return 0x%02x\n", ret); + + /* To save the power, reset BD2802 after detecting */ + gpio_set_value(led->pdata->reset_gpio, 0); + + init_rwsem(&led->rwsem); + + ret = device_create_file(&client->dev, &bd2802_adv_conf_attr); + if (ret) { + dev_err(&client->dev, "failed to create sysfs file %s\n", + bd2802_adv_conf_attr.attr.name); + goto failed_free; + } + + ret = bd2802_register_led_classdev(led); + if (ret < 0) + goto failed_unregister_dev_file; + + return 0; + +failed_unregister_dev_file: + device_remove_file(&client->dev, &bd2802_adv_conf_attr); +failed_free: + i2c_set_clientdata(client, NULL); + kfree(led); + + return ret; +} + +static int __exit bd2802_remove(struct i2c_client *client) +{ + struct bd2802_led *led = i2c_get_clientdata(client); + + bd2802_unregister_led_classdev(led); + gpio_set_value(led->pdata->reset_gpio, 0); + if (led->adf_on) + bd2802_disable_adv_conf(led); + device_remove_file(&client->dev, &bd2802_adv_conf_attr); + i2c_set_clientdata(client, NULL); + kfree(led); + + return 0; +} + +static int bd2802_suspend(struct i2c_client *client, pm_message_t mesg) +{ + struct bd2802_led *led = 
i2c_get_clientdata(client); + + gpio_set_value(led->pdata->reset_gpio, 0); + + return 0; +} + +static int bd2802_resume(struct i2c_client *client) +{ + struct bd2802_led *led = i2c_get_clientdata(client); + + if (!bd2802_is_all_off(led) || led->adf_on) { + gpio_set_value(led->pdata->reset_gpio, 1); + udelay(100); + bd2802_restore_state(led); + } + + return 0; +} + +static const struct i2c_device_id bd2802_id[] = { + { "BD2802", 0 }, + { } +}; +MODULE_DEVICE_TABLE(i2c, bd2802_id); + +static struct i2c_driver bd2802_i2c_driver = { + .driver = { + .name = "BD2802", + }, + .probe = bd2802_probe, + .remove = __exit_p(bd2802_remove), + .suspend = bd2802_suspend, + .resume = bd2802_resume, + .id_table = bd2802_id, +}; + +static int __init bd2802_init(void) +{ + return i2c_add_driver(&bd2802_i2c_driver); +} +module_init(bd2802_init); + +static void __exit bd2802_exit(void) +{ + i2c_del_driver(&bd2802_i2c_driver); +} +module_exit(bd2802_exit); + +MODULE_AUTHOR("Kim Kyuwon "); +MODULE_DESCRIPTION("BD2802 LED driver"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/leds-bd2802.h b/include/linux/leds-bd2802.h new file mode 100644 index 000000000000..42f854a1a199 --- /dev/null +++ b/include/linux/leds-bd2802.h @@ -0,0 +1,26 @@ +/* + * leds-bd2802.h - RGB LED Driver + * + * Copyright (C) 2009 Samsung Electronics + * Kim Kyuwon + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Datasheet: http://www.rohm.com/products/databook/driver/pdf/bd2802gu-e.pdf + * + */ +#ifndef _LEDS_BD2802_H_ +#define _LEDS_BD2802_H_ + +struct bd2802_led_platform_data{ + int reset_gpio; + u8 rgb_time; +}; + +#define RGB_TIME(slopedown, slopeup, waveform) \ + ((slopedown) << 6 | (slopeup) << 4 | (waveform)) + +#endif /* _LEDS_BD2802_H_ */ + -- cgit v1.2.3 From e3ee703366da3a4ce80f24b47a5df56cd816c5d6 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Mon, 6 Apr 2009 18:12:24 +0200 Subject: i2c: Deprecate client_register and client_unregister methods The new i2c binding model makes the client_register and client_unregister methods of struct i2c_adapter useless, so we can remove them with the rest of the legacy model. Signed-off-by: Jean Delvare --- Documentation/feature-removal-schedule.txt | 3 ++- include/linux/i2c.h | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 39246fc11257..7e2af10e8264 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -354,7 +354,8 @@ Who: Krzysztof Piotr Oledzki --------------------------- -What: i2c_attach_client(), i2c_detach_client(), i2c_driver->detach_client() +What: i2c_attach_client(), i2c_detach_client(), i2c_driver->detach_client(), + i2c_adapter->client_register(), i2c_adapter->client_unregister When: 2.6.30 Check: i2c_attach_client i2c_detach_client Why: Deprecated by the new (standard) device driver binding model. Use diff --git a/include/linux/i2c.h b/include/linux/i2c.h index c86c3b07604c..00ee11eb9092 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -353,8 +353,8 @@ struct i2c_adapter { void *algo_data; /* --- administration stuff. 
*/ - int (*client_register)(struct i2c_client *); - int (*client_unregister)(struct i2c_client *); + int (*client_register)(struct i2c_client *) __deprecated; + int (*client_unregister)(struct i2c_client *) __deprecated; /* data fields that are valid for all devices */ u8 level; /* nesting level for lockdep */ -- cgit v1.2.3 From 7c8ad4aff0699197469327d0e50d1e48f2ccb39b Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Mon, 6 Apr 2009 18:12:25 +0200 Subject: i2c: Delete many unused driver IDs Delete many unused I2C driver IDs. We should be able to get rid of i2c_driver.id pretty soon now. Signed-off-by: Jean Delvare --- include/linux/i2c-id.h | 37 ------------------------------------- 1 file changed, 37 deletions(-) (limited to 'include/linux') diff --git a/include/linux/i2c-id.h b/include/linux/i2c-id.h index f27604af8378..ee9fbc172405 100644 --- a/include/linux/i2c-id.h +++ b/include/linux/i2c-id.h @@ -33,47 +33,10 @@ #define I2C_DRIVERID_MSP3400 1 #define I2C_DRIVERID_TUNER 2 -#define I2C_DRIVERID_TEA6420 5 /* audio matrix switch */ -#define I2C_DRIVERID_TEA6415C 6 /* video matrix switch */ -#define I2C_DRIVERID_TDA9840 7 /* stereo sound processor */ -#define I2C_DRIVERID_SAA7111A 8 /* video input processor */ -#define I2C_DRIVERID_SAA7185B 13 /* video encoder */ -#define I2C_DRIVERID_SAA7110 22 /* video decoder */ -#define I2C_DRIVERID_SAA5249 24 /* SAA5249 and compatibles */ #define I2C_DRIVERID_TDA7432 27 /* Stereo sound processor */ #define I2C_DRIVERID_TVAUDIO 29 /* Generic TV sound driver */ -#define I2C_DRIVERID_TDA9875 32 /* TV sound decoder chip */ -#define I2C_DRIVERID_BT819 40 /* video decoder */ -#define I2C_DRIVERID_BT856 41 /* video encoder */ -#define I2C_DRIVERID_VPX3220 42 /* video decoder+vbi/vtxt */ -#define I2C_DRIVERID_ADV7175 48 /* ADV 7175/7176 video encoder */ -#define I2C_DRIVERID_SAA7114 49 /* video decoder */ -#define I2C_DRIVERID_ADV7170 54 /* video encoder */ -#define I2C_DRIVERID_SAA7191 57 /* video decoder */ -#define I2C_DRIVERID_INDYCAM 58 /* SGI IndyCam */ -#define I2C_DRIVERID_OVCAMCHIP 61 /* OmniVision CMOS image sens. 
*/ -#define I2C_DRIVERID_SAA6752HS 67 /* MPEG2 encoder */ -#define I2C_DRIVERID_TVEEPROM 68 /* TV EEPROM */ -#define I2C_DRIVERID_WM8775 69 /* wm8775 audio processor */ -#define I2C_DRIVERID_CS53L32A 70 /* cs53l32a audio processor */ -#define I2C_DRIVERID_CX25840 71 /* cx2584x video encoder */ -#define I2C_DRIVERID_SAA7127 72 /* saa7127 video encoder */ #define I2C_DRIVERID_SAA711X 73 /* saa711x video encoders */ #define I2C_DRIVERID_INFRARED 75 /* I2C InfraRed on Video boards */ -#define I2C_DRIVERID_TVP5150 76 /* TVP5150 video decoder */ -#define I2C_DRIVERID_WM8739 77 /* wm8739 audio processor */ -#define I2C_DRIVERID_UPD64083 78 /* upd64083 video processor */ -#define I2C_DRIVERID_UPD64031A 79 /* upd64031a video processor */ -#define I2C_DRIVERID_SAA717X 80 /* saa717x video encoder */ -#define I2C_DRIVERID_BT866 85 /* Conexant bt866 video encoder */ -#define I2C_DRIVERID_KS0127 86 /* Samsung ks0127 video decoder */ -#define I2C_DRIVERID_TLV320AIC23B 87 /* TI TLV320AIC23B audio codec */ -#define I2C_DRIVERID_VP27SMPX 93 /* Panasonic VP27s tuner internal MPX */ -#define I2C_DRIVERID_M52790 95 /* Mitsubishi M52790SP/FP AV switch */ -#define I2C_DRIVERID_CS5345 96 /* cs5345 audio processor */ -#define I2C_DRIVERID_AU8522 97 /* Auvitek au8522 */ - -#define I2C_DRIVERID_OV7670 1048 /* Omnivision 7670 camera */ /* * ---- Adapter types ---------------------------------------------------- -- cgit v1.2.3 From abe213d7f6fb87f48f4324320733f666db1bc11b Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Mon, 6 Apr 2009 18:12:25 +0200 Subject: i2c: Delete unused i2c-algo-sgi helper module The i2c-algo-sgi code was merged into the vino driver, so we can delete it now. Signed-off-by: Jean Delvare --- drivers/i2c/algos/Kconfig | 4 - drivers/i2c/algos/Makefile | 1 - drivers/i2c/algos/i2c-algo-sgi.c | 179 --------------------------------------- include/linux/i2c-algo-sgi.h | 26 ------ 4 files changed, 210 deletions(-) delete mode 100644 drivers/i2c/algos/i2c-algo-sgi.c delete mode 100644 include/linux/i2c-algo-sgi.h (limited to 'include/linux') diff --git a/drivers/i2c/algos/Kconfig b/drivers/i2c/algos/Kconfig index b788579b8227..7b2ce4a08524 100644 --- a/drivers/i2c/algos/Kconfig +++ b/drivers/i2c/algos/Kconfig @@ -14,8 +14,4 @@ config I2C_ALGOPCF config I2C_ALGOPCA tristate "I2C PCA 9564 interfaces" -config I2C_ALGO_SGI - tristate - depends on SGI_IP22 || SGI_IP32 || X86_VISWS - endmenu diff --git a/drivers/i2c/algos/Makefile b/drivers/i2c/algos/Makefile index cac1051bd4f1..18b3e962ec09 100644 --- a/drivers/i2c/algos/Makefile +++ b/drivers/i2c/algos/Makefile @@ -5,7 +5,6 @@ obj-$(CONFIG_I2C_ALGOBIT) += i2c-algo-bit.o obj-$(CONFIG_I2C_ALGOPCF) += i2c-algo-pcf.o obj-$(CONFIG_I2C_ALGOPCA) += i2c-algo-pca.o -obj-$(CONFIG_I2C_ALGO_SGI) += i2c-algo-sgi.o ifeq ($(CONFIG_I2C_DEBUG_ALGO),y) EXTRA_CFLAGS += -DDEBUG diff --git a/drivers/i2c/algos/i2c-algo-sgi.c b/drivers/i2c/algos/i2c-algo-sgi.c deleted file mode 100644 index 6eaf145e1ada..000000000000 --- a/drivers/i2c/algos/i2c-algo-sgi.c +++ /dev/null @@ -1,179 +0,0 @@ -/* - * i2c-algo-sgi.c: i2c driver algorithm used by the VINO (SGI Indy) and - * MACE (SGI O2) chips. - * - * This file is subject to the terms and conditions of the GNU General Public - * License version 2 as published by the Free Software Foundation. 
- * - * Copyright (C) 2003 Ladislav Michl - */ - -#include -#include -#include -#include -#include - -#include -#include - - -#define SGI_I2C_FORCE_IDLE (0 << 0) -#define SGI_I2C_NOT_IDLE (1 << 0) -#define SGI_I2C_WRITE (0 << 1) -#define SGI_I2C_READ (1 << 1) -#define SGI_I2C_RELEASE_BUS (0 << 2) -#define SGI_I2C_HOLD_BUS (1 << 2) -#define SGI_I2C_XFER_DONE (0 << 4) -#define SGI_I2C_XFER_BUSY (1 << 4) -#define SGI_I2C_ACK (0 << 5) -#define SGI_I2C_NACK (1 << 5) -#define SGI_I2C_BUS_OK (0 << 7) -#define SGI_I2C_BUS_ERR (1 << 7) - -#define get_control() adap->getctrl(adap->data) -#define set_control(val) adap->setctrl(adap->data, val) -#define read_data() adap->rdata(adap->data) -#define write_data(val) adap->wdata(adap->data, val) - - -static int wait_xfer_done(struct i2c_algo_sgi_data *adap) -{ - int i; - - for (i = 0; i < adap->xfer_timeout; i++) { - if ((get_control() & SGI_I2C_XFER_BUSY) == 0) - return 0; - udelay(1); - } - - return -ETIMEDOUT; -} - -static int wait_ack(struct i2c_algo_sgi_data *adap) -{ - int i; - - if (wait_xfer_done(adap)) - return -ETIMEDOUT; - for (i = 0; i < adap->ack_timeout; i++) { - if ((get_control() & SGI_I2C_NACK) == 0) - return 0; - udelay(1); - } - - return -ETIMEDOUT; -} - -static int force_idle(struct i2c_algo_sgi_data *adap) -{ - int i; - - set_control(SGI_I2C_FORCE_IDLE); - for (i = 0; i < adap->xfer_timeout; i++) { - if ((get_control() & SGI_I2C_NOT_IDLE) == 0) - goto out; - udelay(1); - } - return -ETIMEDOUT; -out: - if (get_control() & SGI_I2C_BUS_ERR) - return -EIO; - return 0; -} - -static int do_address(struct i2c_algo_sgi_data *adap, unsigned int addr, - int rd) -{ - if (rd) - set_control(SGI_I2C_NOT_IDLE); - /* Check if bus is idle, eventually force it to do so */ - if (get_control() & SGI_I2C_NOT_IDLE) - if (force_idle(adap)) - return -EIO; - /* Write out the i2c chip address and specify operation */ - set_control(SGI_I2C_HOLD_BUS | SGI_I2C_WRITE | SGI_I2C_NOT_IDLE); - if (rd) - addr |= 1; - write_data(addr); - if (wait_ack(adap)) - return -EIO; - return 0; -} - -static int i2c_read(struct i2c_algo_sgi_data *adap, unsigned char *buf, - unsigned int len) -{ - int i; - - set_control(SGI_I2C_HOLD_BUS | SGI_I2C_READ | SGI_I2C_NOT_IDLE); - for (i = 0; i < len; i++) { - if (wait_xfer_done(adap)) - return -EIO; - buf[i] = read_data(); - } - set_control(SGI_I2C_RELEASE_BUS | SGI_I2C_FORCE_IDLE); - - return 0; - -} - -static int i2c_write(struct i2c_algo_sgi_data *adap, unsigned char *buf, - unsigned int len) -{ - int i; - - /* We are already in write state */ - for (i = 0; i < len; i++) { - write_data(buf[i]); - if (wait_ack(adap)) - return -EIO; - } - return 0; -} - -static int sgi_xfer(struct i2c_adapter *i2c_adap, struct i2c_msg *msgs, - int num) -{ - struct i2c_algo_sgi_data *adap = i2c_adap->algo_data; - struct i2c_msg *p; - int i, err = 0; - - for (i = 0; !err && i < num; i++) { - p = &msgs[i]; - err = do_address(adap, p->addr, p->flags & I2C_M_RD); - if (err || !p->len) - continue; - if (p->flags & I2C_M_RD) - err = i2c_read(adap, p->buf, p->len); - else - err = i2c_write(adap, p->buf, p->len); - } - - return (err < 0) ? 
err : i; -} - -static u32 sgi_func(struct i2c_adapter *adap) -{ - return I2C_FUNC_SMBUS_EMUL; -} - -static const struct i2c_algorithm sgi_algo = { - .master_xfer = sgi_xfer, - .functionality = sgi_func, -}; - -/* - * registering functions to load algorithms at runtime - */ -int i2c_sgi_add_bus(struct i2c_adapter *adap) -{ - adap->algo = &sgi_algo; - - return i2c_add_adapter(adap); -} -EXPORT_SYMBOL(i2c_sgi_add_bus); - -MODULE_AUTHOR("Ladislav Michl "); -MODULE_DESCRIPTION("I2C-Bus SGI algorithm"); -MODULE_LICENSE("GPL"); diff --git a/include/linux/i2c-algo-sgi.h b/include/linux/i2c-algo-sgi.h deleted file mode 100644 index 3b7715024e69..000000000000 --- a/include/linux/i2c-algo-sgi.h +++ /dev/null @@ -1,26 +0,0 @@ -/* - * This file is subject to the terms and conditions of the GNU General Public - * License version 2 as published by the Free Software Foundation. - * - * Copyright (C) 2003 Ladislav Michl - */ - -#ifndef I2C_ALGO_SGI_H -#define I2C_ALGO_SGI_H 1 - -#include - -struct i2c_algo_sgi_data { - void *data; /* private data for lowlevel routines */ - unsigned (*getctrl)(void *data); - void (*setctrl)(void *data, unsigned val); - unsigned (*rdata)(void *data); - void (*wdata)(void *data, unsigned val); - - int xfer_timeout; - int ack_timeout; -}; - -int i2c_sgi_add_bus(struct i2c_adapter *); - -#endif /* I2C_ALGO_SGI_H */ -- cgit v1.2.3 From f0ad670d7061efad138df19aefe569263c4ea37b Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Mon, 6 Apr 2009 12:00:36 +0300 Subject: nfsd41: define NFSD_DRC_SIZE_SHIFT in set_max_drc Fixes the following compiler error: fs/nfsd/nfssvc.c: In function 'set_max_drc': fs/nfsd/nfssvc.c:240: error: 'NFSD_DRC_SIZE_SHIFT' undeclared CONFIG_NFSD_V4 is not set Reported-by: Alexander Beregalov Signed-off-by: Benny Halevy Signed-off-by: J. Bruce Fields --- fs/nfsd/nfssvc.c | 2 ++ include/linux/nfsd/nfsd.h | 3 --- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index e9d57734a348..469c931cca95 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -236,6 +236,8 @@ void nfsd_reset_versions(void) */ static void set_max_drc(void) { + /* The percent of nr_free_buffer_pages used by the V4.1 server DRC */ + #define NFSD_DRC_SIZE_SHIFT 7 nfsd_serv->sv_drc_max_pages = nr_free_buffer_pages() >> NFSD_DRC_SIZE_SHIFT; nfsd_serv->sv_drc_pages_used = 0; diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h index 0ec4d142c503..2b49d676d0c9 100644 --- a/include/linux/nfsd/nfsd.h +++ b/include/linux/nfsd/nfsd.h @@ -333,9 +333,6 @@ extern struct timeval nfssvc_boot; #define NFSD_LEASE_TIME (nfs4_lease_time()) #define NFSD_LAUNDROMAT_MINTIMEOUT 10 /* seconds */ -/* The percent of nr_free_buffer_pages used by the V4.1 server DRC */ -#define NFSD_DRC_SIZE_SHIFT 7 - /* * The following attributes are currently not supported by the NFSv4 server: * ARCHIVE (deprecated anyway) -- cgit v1.2.3 From 04826f43d4f0a4d56423eb8abb9f2ec9987df5b5 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Mon, 6 Apr 2009 12:00:49 +0300 Subject: nfsd41: define nfsd4_set_statp as noop for !CONFIG_NFSD_V4 Fixes following modpost error: ERROR: "nfsd4_set_statp" [fs/nfsd/nfsd.ko] undefined! Signed-off-by: Benny Halevy Signed-off-by: J. 
Bruce Fields --- include/linux/nfsd/cache.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nfsd/cache.h b/include/linux/nfsd/cache.h index a59a2df6d079..5bccaab81056 100644 --- a/include/linux/nfsd/cache.h +++ b/include/linux/nfsd/cache.h @@ -75,6 +75,13 @@ int nfsd_reply_cache_init(void); void nfsd_reply_cache_shutdown(void); int nfsd_cache_lookup(struct svc_rqst *, int); void nfsd_cache_update(struct svc_rqst *, int, __be32 *); + +#ifdef CONFIG_NFSD_V4 void nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp); +#else /* CONFIG_NFSD_V4 */ +static inline void nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp) +{ +} +#endif /* CONFIG_NFSD_V4 */ #endif /* NFSCACHE_H */ -- cgit v1.2.3 From 296ccb086dfb89b5b8d73ef08c795ffdff12a597 Mon Sep 17 00:00:00 2001 From: Yuji Shimada Date: Fri, 3 Apr 2009 16:41:46 +0900 Subject: PCI: Setup disabled bridges even if buses are added This patch sets up disabled bridges even if buses have already been added. pci_assign_unassigned_resources is called after buses are added. pci_assign_unassigned_resources calls pci_bus_assign_resources. pci_bus_assign_resources calls pci_setup_bridge to configure BARs of bridges. Currently pci_setup_bridge returns immediately if the bus has already been added. So pci_assign_unassigned_resources can't configure BARs of bridges that were added in a disabled state; this patch fixes the issue. On logical hot-add, we need to prevent the kernel from re-initializing bridges that have already been initialized. To achieve this, pci_setup_bridge returns immediately if the bridge has already been enabled. We don't need to check whether the specified bus is a root bus or not. pci_setup_bridge is not called on a root bus, because a root bus does not have a bridge. The patch adds a new helper function, pci_is_enabled. I made the function name similar to pci_is_managed. The code that uses enable_cnt directly is changed to use pci_is_enabled.
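
The conversion itself is mechanical: every open-coded test of enable_cnt becomes a call to the new helper, as the hunks below show. As a rough sketch only (the function name example_enable_disabled_bridges is made up here, modelled on the pci_enable_bridges hunk in the diff), a caller that wants to bring up only the bridges that are still disabled would now read:

#include <linux/pci.h>

/*
 * Illustrative helper, not part of the patch: walk one bus and enable
 * only the bridges that are still disabled.  pci_is_enabled() hides the
 * enable_cnt atomic behind a readable test, so callers no longer touch
 * dev->enable_cnt directly.
 */
static void example_enable_disabled_bridges(struct pci_bus *bus)
{
	struct pci_dev *dev;

	list_for_each_entry(dev, &bus->devices, bus_list) {
		if (!dev->subordinate)		/* not a bridge */
			continue;
		if (pci_is_enabled(dev))	/* already set up, skip it */
			continue;
		if (pci_enable_device(dev) == 0)
			pci_set_master(dev);
	}
}
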
Acked-by: Alex Chiang Signed-off-by: Yuji Shimada Signed-off-by: Jesse Barnes --- drivers/pci/bus.c | 2 +- drivers/pci/pci-sysfs.c | 2 +- drivers/pci/pci.c | 4 ++-- drivers/pci/setup-bus.c | 2 +- include/linux/pci.h | 5 +++++ 5 files changed, 10 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c index 68f91a252595..97a8194063b5 100644 --- a/drivers/pci/bus.c +++ b/drivers/pci/bus.c @@ -184,7 +184,7 @@ void pci_enable_bridges(struct pci_bus *bus) list_for_each_entry(dev, &bus->devices, bus_list) { if (dev->subordinate) { - if (atomic_read(&dev->enable_cnt) == 0) { + if (!pci_is_enabled(dev)) { retval = pci_enable_device(dev); pci_set_master(dev); } diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index e9a8706a6401..cd8e682c04aa 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -148,7 +148,7 @@ static ssize_t is_enabled_store(struct device *dev, return -EPERM; if (!val) { - if (atomic_read(&pdev->enable_cnt) != 0) + if (pci_is_enabled(pdev)) pci_disable_device(pdev); else result = -EIO; diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 445fb6f7ea3f..16fd0d4c3166 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -844,7 +844,7 @@ static int do_pci_enable_device(struct pci_dev *dev, int bars) */ int pci_reenable_device(struct pci_dev *dev) { - if (atomic_read(&dev->enable_cnt)) + if (pci_is_enabled(dev)) return do_pci_enable_device(dev, (1 << PCI_NUM_RESOURCES) - 1); return 0; } @@ -1042,7 +1042,7 @@ static void do_pci_disable_device(struct pci_dev *dev) */ void pci_disable_enabled_device(struct pci_dev *dev) { - if (atomic_read(&dev->enable_cnt)) + if (pci_is_enabled(dev)) do_pci_disable_device(dev); } diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 334285a8e237..8d9da9d30a61 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -144,7 +144,7 @@ static void pci_setup_bridge(struct pci_bus *bus) struct pci_bus_region region; u32 l, bu, lu, io_upper16; - if (!pci_is_root_bus(bus) && bus->is_added) + if (pci_is_enabled(bridge)) return; dev_info(&bridge->dev, "PCI bridge, secondary bus %04x:%02x\n", diff --git a/include/linux/pci.h b/include/linux/pci.h index a7fe4bbd7ff1..72698d89e767 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -674,6 +674,11 @@ int __must_check pci_reenable_device(struct pci_dev *); int __must_check pcim_enable_device(struct pci_dev *pdev); void pcim_pin_device(struct pci_dev *pdev); +static inline int pci_is_enabled(struct pci_dev *pdev) +{ + return (atomic_read(&pdev->enable_cnt) > 0); +} + static inline int pci_is_managed(struct pci_dev *pdev) { return pdev->is_managed; -- cgit v1.2.3 From 46a0fac9438764533245928b78d35fbaa5d7adf4 Mon Sep 17 00:00:00 2001 From: Shawn Bohrer Date: Mon, 6 Apr 2009 17:32:07 +0100 Subject: 8250_pci: add support for National Instruments 843x RS232 devices This implements basic support for all 843x RS232 devices, but does not add DMA support. This means that sustained data transfers at high baud rates may not be possible on multiple ports simultaneously. 
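
For reference, base_baud in the board entries added below is the UART input clock divided by 16, so the usual 16550 divisor is simply base_baud / baud. The small sketch below is illustrative only and not part of the patch; it assumes the 3686400 base_baud declared by the pbn_ni8430_* entries and just prints the divisors common rates end up with.

#include <stdio.h>

/*
 * Illustrative only: the pbn_ni8430_* boards below declare
 * base_baud = 3686400, i.e. a 58.9824 MHz UART clock pre-divided
 * by 16.  The 16550 divisor latch value is then base_baud / baud;
 * rates that do not divide evenly accumulate rate error.
 */
int main(void)
{
	static const unsigned int base_baud = 3686400;
	static const unsigned int rates[] = { 9600, 115200, 921600, 3686400 };
	unsigned int i;

	for (i = 0; i < sizeof(rates) / sizeof(rates[0]); i++)
		printf("baud %7u -> divisor %u\n",
		       rates[i], base_baud / rates[i]);
	return 0;
}
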
Signed-off-by: Shawn Bohrer Signed-off-by: Alan Cox Signed-off-by: Linus Torvalds --- drivers/serial/8250_pci.c | 193 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/pci_ids.h | 14 ++++ 2 files changed, 207 insertions(+) (limited to 'include/linux') diff --git a/drivers/serial/8250_pci.c b/drivers/serial/8250_pci.c index 533f82025adf..2f570c761faf 100644 --- a/drivers/serial/8250_pci.c +++ b/drivers/serial/8250_pci.c @@ -306,6 +306,36 @@ static void __devexit pci_plx9050_exit(struct pci_dev *dev) } } +/* MITE registers */ +#define MITE_IOWBSR1 0xc4 +#define MITE_IOWCR1 0xf4 +#define MITE_LCIMR1 0x08 +#define MITE_LCIMR2 0x10 + +#define MITE_LCIMR2_CLR_CPU_IE (1 << 30) + +static void __devexit pci_ni8430_exit(struct pci_dev *dev) +{ + void __iomem *p; + unsigned long base, len; + unsigned int bar = 0; + + if ((pci_resource_flags(dev, bar) & IORESOURCE_MEM) == 0) { + moan_device("no memory in bar", dev); + return; + } + + base = pci_resource_start(dev, bar); + len = pci_resource_len(dev, bar); + p = ioremap_nocache(base, len); + if (p == NULL) + return; + + /* Disable the CPU Interrupt */ + writel(MITE_LCIMR2_CLR_CPU_IE, p + MITE_LCIMR2); + iounmap(p); +} + /* SBS Technologies Inc. PMC-OCTPRO and P-OCTAL cards */ static int sbs_setup(struct serial_private *priv, const struct pciserial_board *board, @@ -597,6 +627,82 @@ static int pci_xircom_init(struct pci_dev *dev) return 0; } +#define MITE_IOWBSR1_WSIZE 0xa +#define MITE_IOWBSR1_WIN_OFFSET 0x800 +#define MITE_IOWBSR1_WENAB (1 << 7) +#define MITE_LCIMR1_IO_IE_0 (1 << 24) +#define MITE_LCIMR2_SET_CPU_IE (1 << 31) +#define MITE_IOWCR1_RAMSEL_MASK 0xfffffffe + +static int pci_ni8430_init(struct pci_dev *dev) +{ + void __iomem *p; + unsigned long base, len; + u32 device_window; + unsigned int bar = 0; + + if ((pci_resource_flags(dev, bar) & IORESOURCE_MEM) == 0) { + moan_device("no memory in bar", dev); + return 0; + } + + base = pci_resource_start(dev, bar); + len = pci_resource_len(dev, bar); + p = ioremap_nocache(base, len); + if (p == NULL) + return -ENOMEM; + + /* Set device window address and size in BAR0 */ + device_window = ((base + MITE_IOWBSR1_WIN_OFFSET) & 0xffffff00) + | MITE_IOWBSR1_WENAB | MITE_IOWBSR1_WSIZE; + writel(device_window, p + MITE_IOWBSR1); + + /* Set window access to go to RAMSEL IO address space */ + writel((readl(p + MITE_IOWCR1) & MITE_IOWCR1_RAMSEL_MASK), + p + MITE_IOWCR1); + + /* Enable IO Bus Interrupt 0 */ + writel(MITE_LCIMR1_IO_IE_0, p + MITE_LCIMR1); + + /* Enable CPU Interrupt */ + writel(MITE_LCIMR2_SET_CPU_IE, p + MITE_LCIMR2); + + iounmap(p); + return 0; +} + +/* UART Port Control Register */ +#define NI8430_PORTCON 0x0f +#define NI8430_PORTCON_TXVR_ENABLE (1 << 3) + +static int +pci_ni8430_setup(struct serial_private *priv, struct pciserial_board *board, + struct uart_port *port, int idx) +{ + void __iomem *p; + unsigned long base, len; + unsigned int bar, offset = board->first_offset; + + if (idx >= board->num_ports) + return 1; + + bar = FL_GET_BASE(board->flags); + offset += idx * board->uart_offset; + + base = pci_resource_start(priv->dev, bar); + len = pci_resource_len(priv->dev, bar); + p = ioremap_nocache(base, len); + + /* enable the transciever */ + writeb(readb(p + offset + NI8430_PORTCON) | NI8430_PORTCON_TXVR_ENABLE, + p + offset + NI8430_PORTCON); + + iounmap(p); + + return setup_port(priv, port, bar, offset, board->reg_shift); +} + + static int pci_netmos_init(struct pci_dev *dev) { /* subdevice 0x00PS means
parallel, serial */ @@ -912,6 +1018,18 @@ static struct pci_serial_quirk pci_serial_quirks[] __refdata = { .setup = pci_default_setup, .exit = __devexit_p(pci_ite887x_exit), }, + /* + * National Instruments + */ + { + .vendor = PCI_VENDOR_ID_NI, + .device = PCI_ANY_ID, + .subvendor = PCI_ANY_ID, + .subdevice = PCI_ANY_ID, + .init = pci_ni8430_init, + .setup = pci_ni8430_setup, + .exit = __devexit_p(pci_ni8430_exit), + }, /* * Panacom */ @@ -1280,6 +1398,10 @@ enum pci_board_num_t { pbn_exar_XR17C154, pbn_exar_XR17C158, pbn_pasemi_1682M, + pbn_ni8430_2, + pbn_ni8430_4, + pbn_ni8430_8, + pbn_ni8430_16, }; /* @@ -1850,6 +1972,37 @@ static struct pciserial_board pci_boards[] __devinitdata = { .num_ports = 1, .base_baud = 8333333, }, + /* + * National Instruments 843x + */ + [pbn_ni8430_16] = { + .flags = FL_BASE0, + .num_ports = 16, + .base_baud = 3686400, + .uart_offset = 0x10, + .first_offset = 0x800, + }, + [pbn_ni8430_8] = { + .flags = FL_BASE0, + .num_ports = 8, + .base_baud = 3686400, + .uart_offset = 0x10, + .first_offset = 0x800, + }, + [pbn_ni8430_4] = { + .flags = FL_BASE0, + .num_ports = 4, + .base_baud = 3686400, + .uart_offset = 0x10, + .first_offset = 0x800, + }, + [pbn_ni8430_2] = { + .flags = FL_BASE0, + .num_ports = 2, + .base_baud = 3686400, + .uart_offset = 0x10, + .first_offset = 0x800, + }, }; static const struct pci_device_id softmodem_blacklist[] = { @@ -3051,6 +3204,46 @@ static struct pci_device_id serial_pci_tbl[] = { PCI_ANY_ID, PCI_ANY_ID, 0, 0, pbn_pasemi_1682M }, + /* + * National Instruments + */ + { PCI_VENDOR_ID_NI, PCI_DEVICE_ID_NI_PXI8430_2322, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, + pbn_ni8430_2 }, + { PCI_VENDOR_ID_NI, PCI_DEVICE_ID_NI_PCI8430_2322, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, + pbn_ni8430_2 }, + { PCI_VENDOR_ID_NI, PCI_DEVICE_ID_NI_PXI8430_2324, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, + pbn_ni8430_4 }, + { PCI_VENDOR_ID_NI, PCI_DEVICE_ID_NI_PCI8430_2324, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, + pbn_ni8430_4 }, + { PCI_VENDOR_ID_NI, PCI_DEVICE_ID_NI_PXI8430_2328, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, + pbn_ni8430_8 }, + { PCI_VENDOR_ID_NI, PCI_DEVICE_ID_NI_PCI8430_2328, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, + pbn_ni8430_8 }, + { PCI_VENDOR_ID_NI, PCI_DEVICE_ID_NI_PXI8430_23216, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, + pbn_ni8430_16 }, + { PCI_VENDOR_ID_NI, PCI_DEVICE_ID_NI_PCI8430_23216, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, + pbn_ni8430_16 }, + { PCI_VENDOR_ID_NI, PCI_DEVICE_ID_NI_PXI8432_2322, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, + pbn_ni8430_2 }, + { PCI_VENDOR_ID_NI, PCI_DEVICE_ID_NI_PCI8432_2322, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, + pbn_ni8430_2 }, + { PCI_VENDOR_ID_NI, PCI_DEVICE_ID_NI_PXI8432_2324, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, + pbn_ni8430_4 }, + { PCI_VENDOR_ID_NI, PCI_DEVICE_ID_NI_PCI8432_2324, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, + pbn_ni8430_4 }, + /* * ADDI-DATA GmbH communication cards */ diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 170f8b1f22db..449e7d9d549f 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -944,6 +944,20 @@ #define PCI_DEVICE_ID_SUN_TOMATILLO 0xa801 #define PCI_DEVICE_ID_SUN_CASSINI 0xabba +#define PCI_VENDOR_ID_NI 0x1093 +#define PCI_DEVICE_ID_NI_PXI8430_2322 0x7080 +#define PCI_DEVICE_ID_NI_PCI8430_2322 0x70db +#define PCI_DEVICE_ID_NI_PXI8430_2324 0x70dd +#define PCI_DEVICE_ID_NI_PCI8430_2324 0x70df +#define PCI_DEVICE_ID_NI_PXI8430_2328 0x70e2 +#define PCI_DEVICE_ID_NI_PCI8430_2328 0x70e4 +#define PCI_DEVICE_ID_NI_PXI8430_23216 0x70e6 +#define PCI_DEVICE_ID_NI_PCI8430_23216 0x70e7 +#define PCI_DEVICE_ID_NI_PXI8432_2322 0x70e8 
+#define PCI_DEVICE_ID_NI_PCI8432_2322 0x70ea +#define PCI_DEVICE_ID_NI_PXI8432_2324 0x70ec +#define PCI_DEVICE_ID_NI_PCI8432_2324 0x70ee + #define PCI_VENDOR_ID_CMD 0x1095 #define PCI_DEVICE_ID_CMD_643 0x0643 #define PCI_DEVICE_ID_CMD_646 0x0646 -- cgit v1.2.3 From 04bf7e745b841619d2f14f2f8b6f2c97f1c6757e Mon Sep 17 00:00:00 2001 From: Will Page Date: Mon, 6 Apr 2009 17:32:15 +0100 Subject: 8250_pci: add support for National Instruments legacy 8420 RS232 boards Signed-off-by: Will Page Signed-off-by: Shawn Bohrer Signed-off-by: Alan Cox Signed-off-by: Linus Torvalds --- drivers/serial/8250_pci.c | 218 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/pci_ids.h | 12 +++ 2 files changed, 230 insertions(+) (limited to 'include/linux') diff --git a/drivers/serial/8250_pci.c b/drivers/serial/8250_pci.c index 2f570c761faf..5b0e80da1c44 100644 --- a/drivers/serial/8250_pci.c +++ b/drivers/serial/8250_pci.c @@ -306,6 +306,33 @@ static void __devexit pci_plx9050_exit(struct pci_dev *dev) } } +#define NI8420_INT_ENABLE_REG 0x38 +#define NI8420_INT_ENABLE_BIT 0x2000 + +static void __devexit pci_ni8420_exit(struct pci_dev *dev) +{ + void __iomem *p; + unsigned long base, len; + unsigned int bar = 0; + + if ((pci_resource_flags(dev, bar) & IORESOURCE_MEM) == 0) { + moan_device("no memory in bar", dev); + return; + } + + base = pci_resource_start(dev, bar); + len = pci_resource_len(dev, bar); + p = ioremap_nocache(base, len); + if (p == NULL) + return; + + /* Disable the CPU Interrupt */ + writel(readl(p + NI8420_INT_ENABLE_REG) & ~(NI8420_INT_ENABLE_BIT), + p + NI8420_INT_ENABLE_REG); + iounmap(p); +} + + /* MITE registers */ #define MITE_IOWBSR1 0xc4 #define MITE_IOWCR1 0xf4 @@ -627,6 +654,31 @@ static int pci_xircom_init(struct pci_dev *dev) return 0; } +static int pci_ni8420_init(struct pci_dev *dev) +{ + void __iomem *p; + unsigned long base, len; + unsigned int bar = 0; + + if ((pci_resource_flags(dev, bar) & IORESOURCE_MEM) == 0) { + moan_device("no memory in bar", dev); + return 0; + } + + base = pci_resource_start(dev, bar); + len = pci_resource_len(dev, bar); + p = ioremap_nocache(base, len); + if (p == NULL) + return -ENOMEM; + + /* Enable CPU Interrupt */ + writel(readl(p + NI8420_INT_ENABLE_REG) | NI8420_INT_ENABLE_BIT, + p + NI8420_INT_ENABLE_REG); + + iounmap(p); + return 0; +} + #define MITE_IOWBSR1_WSIZE 0xa #define MITE_IOWBSR1_WIN_OFFSET 0x800 #define MITE_IOWBSR1_WENAB (1 << 7) @@ -1021,6 +1073,114 @@ static struct pci_serial_quirk pci_serial_quirks[] __refdata = { /* * National Instruments */ + { + .vendor = PCI_VENDOR_ID_NI, + .device = PCI_DEVICE_ID_NI_PCI23216, + .subvendor = PCI_ANY_ID, + .subdevice = PCI_ANY_ID, + .init = pci_ni8420_init, + .setup = pci_default_setup, + .exit = __devexit_p(pci_ni8420_exit), + }, + { + .vendor = PCI_VENDOR_ID_NI, + .device = PCI_DEVICE_ID_NI_PCI2328, + .subvendor = PCI_ANY_ID, + .subdevice = PCI_ANY_ID, + .init = pci_ni8420_init, + .setup = pci_default_setup, + .exit = __devexit_p(pci_ni8420_exit), + }, + { + .vendor = PCI_VENDOR_ID_NI, + .device = PCI_DEVICE_ID_NI_PCI2324, + .subvendor = PCI_ANY_ID, + .subdevice = PCI_ANY_ID, + .init = pci_ni8420_init, + .setup = pci_default_setup, + .exit = __devexit_p(pci_ni8420_exit), + }, + { + .vendor = PCI_VENDOR_ID_NI, + .device = PCI_DEVICE_ID_NI_PCI2322, + .subvendor = PCI_ANY_ID, + .subdevice = PCI_ANY_ID, + .init = pci_ni8420_init, + .setup = pci_default_setup, + .exit = __devexit_p(pci_ni8420_exit), + }, + { + .vendor = PCI_VENDOR_ID_NI, + .device = PCI_DEVICE_ID_NI_PCI2324I, + 
.subvendor = PCI_ANY_ID, + .subdevice = PCI_ANY_ID, + .init = pci_ni8420_init, + .setup = pci_default_setup, + .exit = __devexit_p(pci_ni8420_exit), + }, + { + .vendor = PCI_VENDOR_ID_NI, + .device = PCI_DEVICE_ID_NI_PCI2322I, + .subvendor = PCI_ANY_ID, + .subdevice = PCI_ANY_ID, + .init = pci_ni8420_init, + .setup = pci_default_setup, + .exit = __devexit_p(pci_ni8420_exit), + }, + { + .vendor = PCI_VENDOR_ID_NI, + .device = PCI_DEVICE_ID_NI_PXI8420_23216, + .subvendor = PCI_ANY_ID, + .subdevice = PCI_ANY_ID, + .init = pci_ni8420_init, + .setup = pci_default_setup, + .exit = __devexit_p(pci_ni8420_exit), + }, + { + .vendor = PCI_VENDOR_ID_NI, + .device = PCI_DEVICE_ID_NI_PXI8420_2328, + .subvendor = PCI_ANY_ID, + .subdevice = PCI_ANY_ID, + .init = pci_ni8420_init, + .setup = pci_default_setup, + .exit = __devexit_p(pci_ni8420_exit), + }, + { + .vendor = PCI_VENDOR_ID_NI, + .device = PCI_DEVICE_ID_NI_PXI8420_2324, + .subvendor = PCI_ANY_ID, + .subdevice = PCI_ANY_ID, + .init = pci_ni8420_init, + .setup = pci_default_setup, + .exit = __devexit_p(pci_ni8420_exit), + }, + { + .vendor = PCI_VENDOR_ID_NI, + .device = PCI_DEVICE_ID_NI_PXI8420_2322, + .subvendor = PCI_ANY_ID, + .subdevice = PCI_ANY_ID, + .init = pci_ni8420_init, + .setup = pci_default_setup, + .exit = __devexit_p(pci_ni8420_exit), + }, + { + .vendor = PCI_VENDOR_ID_NI, + .device = PCI_DEVICE_ID_NI_PXI8422_2324, + .subvendor = PCI_ANY_ID, + .subdevice = PCI_ANY_ID, + .init = pci_ni8420_init, + .setup = pci_default_setup, + .exit = __devexit_p(pci_ni8420_exit), + }, + { + .vendor = PCI_VENDOR_ID_NI, + .device = PCI_DEVICE_ID_NI_PXI8422_2322, + .subvendor = PCI_ANY_ID, + .subdevice = PCI_ANY_ID, + .init = pci_ni8420_init, + .setup = pci_default_setup, + .exit = __devexit_p(pci_ni8420_exit), + }, { .vendor = PCI_VENDOR_ID_NI, .device = PCI_ANY_ID, @@ -1334,6 +1494,7 @@ enum pci_board_num_t { pbn_b1_2_115200, pbn_b1_4_115200, pbn_b1_8_115200, + pbn_b1_16_115200, pbn_b1_1_921600, pbn_b1_2_921600, @@ -1343,6 +1504,9 @@ enum pci_board_num_t { pbn_b1_2_1250000, pbn_b1_bt_1_115200, + pbn_b1_bt_2_115200, + pbn_b1_bt_4_115200, + pbn_b1_bt_2_921600, pbn_b1_1_1382400, @@ -1609,6 +1773,12 @@ static struct pciserial_board pci_boards[] __devinitdata = { .base_baud = 115200, .uart_offset = 8, }, + [pbn_b1_16_115200] = { + .flags = FL_BASE1, + .num_ports = 16, + .base_baud = 115200, + .uart_offset = 8, + }, [pbn_b1_1_921600] = { .flags = FL_BASE1, @@ -1647,6 +1817,18 @@ static struct pciserial_board pci_boards[] __devinitdata = { .base_baud = 115200, .uart_offset = 8, }, + [pbn_b1_bt_2_115200] = { + .flags = FL_BASE1|FL_BASE_BARS, + .num_ports = 2, + .base_baud = 115200, + .uart_offset = 8, + }, + [pbn_b1_bt_4_115200] = { + .flags = FL_BASE1|FL_BASE_BARS, + .num_ports = 4, + .base_baud = 115200, + .uart_offset = 8, + }, [pbn_b1_bt_2_921600] = { .flags = FL_BASE1|FL_BASE_BARS, @@ -3207,6 +3389,42 @@ static struct pci_device_id serial_pci_tbl[] = { /* * National Instruments */ + { PCI_VENDOR_ID_NI, PCI_DEVICE_ID_NI_PCI23216, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, + pbn_b1_16_115200 }, + { PCI_VENDOR_ID_NI, PCI_DEVICE_ID_NI_PCI2328, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, + pbn_b1_8_115200 }, + { PCI_VENDOR_ID_NI, PCI_DEVICE_ID_NI_PCI2324, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, + pbn_b1_bt_4_115200 }, + { PCI_VENDOR_ID_NI, PCI_DEVICE_ID_NI_PCI2322, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, + pbn_b1_bt_2_115200 }, + { PCI_VENDOR_ID_NI, PCI_DEVICE_ID_NI_PCI2324I, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, + pbn_b1_bt_4_115200 }, + { PCI_VENDOR_ID_NI, PCI_DEVICE_ID_NI_PCI2322I, + PCI_ANY_ID, 
PCI_ANY_ID, 0, 0, + pbn_b1_bt_2_115200 }, + { PCI_VENDOR_ID_NI, PCI_DEVICE_ID_NI_PXI8420_23216, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, + pbn_b1_16_115200 }, + { PCI_VENDOR_ID_NI, PCI_DEVICE_ID_NI_PXI8420_2328, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, + pbn_b1_8_115200 }, + { PCI_VENDOR_ID_NI, PCI_DEVICE_ID_NI_PXI8420_2324, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, + pbn_b1_bt_4_115200 }, + { PCI_VENDOR_ID_NI, PCI_DEVICE_ID_NI_PXI8420_2322, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, + pbn_b1_bt_2_115200 }, + { PCI_VENDOR_ID_NI, PCI_DEVICE_ID_NI_PXI8422_2324, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, + pbn_b1_bt_4_115200 }, + { PCI_VENDOR_ID_NI, PCI_DEVICE_ID_NI_PXI8422_2322, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, + pbn_b1_bt_2_115200 }, { PCI_VENDOR_ID_NI, PCI_DEVICE_ID_NI_PXI8430_2322, PCI_ANY_ID, PCI_ANY_ID, 0, 0, pbn_ni8430_2 }, diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 449e7d9d549f..ee98cd570885 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -945,6 +945,18 @@ #define PCI_DEVICE_ID_SUN_CASSINI 0xabba #define PCI_VENDOR_ID_NI 0x1093 +#define PCI_DEVICE_ID_NI_PCI2322 0xd130 +#define PCI_DEVICE_ID_NI_PCI2324 0xd140 +#define PCI_DEVICE_ID_NI_PCI2328 0xd150 +#define PCI_DEVICE_ID_NI_PXI8422_2322 0xd190 +#define PCI_DEVICE_ID_NI_PXI8422_2324 0xd1a0 +#define PCI_DEVICE_ID_NI_PXI8420_2322 0xd1d0 +#define PCI_DEVICE_ID_NI_PXI8420_2324 0xd1e0 +#define PCI_DEVICE_ID_NI_PXI8420_2328 0xd1f0 +#define PCI_DEVICE_ID_NI_PXI8420_23216 0xd1f1 +#define PCI_DEVICE_ID_NI_PCI2322I 0xd250 +#define PCI_DEVICE_ID_NI_PCI2324I 0xd270 +#define PCI_DEVICE_ID_NI_PCI23216 0xd2b0 #define PCI_DEVICE_ID_NI_PXI8430_2322 0x7080 #define PCI_DEVICE_ID_NI_PCI8430_2322 0x70db #define PCI_DEVICE_ID_NI_PXI8430_2324 0x70dd -- cgit v1.2.3 From 06e82df015afad2d96d030f76f5e4d13e6dcdfa4 Mon Sep 17 00:00:00 2001 From: Alexander Beregalov Date: Mon, 6 Apr 2009 17:34:34 +0100 Subject: mux: fix build problem Fixes: In file included from drivers/serial/mux.c:37: include/linux/serial_core.h: In function 'uart_handle_sysrq_char': include/linux/serial_core.h:467: error: 'struct uart_port' has no member named 'sysrq' include/linux/serial_core.h:468: error: 'struct uart_port' has no member named 'sysrq' Signed-off-by: Alexander Beregalov Signed-off-by: Alan Cox Signed-off-by: Linus Torvalds --- include/linux/serial_core.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index df9245c7bd3b..83e4b3ff9cda 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -277,7 +277,7 @@ struct uart_port { struct uart_icount icount; /* statistics */ struct console *cons; /* struct console, if any */ -#ifdef CONFIG_SERIAL_CORE_CONSOLE +#if defined(CONFIG_SERIAL_CORE_CONSOLE) || defined(SUPPORT_SYSRQ) unsigned long sysrq; /* sysrq timeout */ #endif -- cgit v1.2.3 From 0c659b82d11eaf5e1bee1f2bcb9994b9d09d175c Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 2 Apr 2009 10:37:25 -0400 Subject: ata: Add TRIM infrastructure This is common code shared between the IDE and libata implementations Signed-off-by: David Woodhouse Signed-off-by: Matthew Wilcox Signed-off-by: Jeff Garzik --- include/linux/ata.h | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ata.h b/include/linux/ata.h index 6617c9f8f2ca..cb79b7a208e1 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -29,6 +29,8 @@ #ifndef __LINUX_ATA_H__ #define __LINUX_ATA_H__ +#include +#include 
#include #include @@ -91,6 +93,7 @@ enum { ATA_ID_CFA_POWER = 160, ATA_ID_CFA_KEY_MGMT = 162, ATA_ID_CFA_MODES = 163, + ATA_ID_DATA_SET_MGMT = 169, ATA_ID_ROT_SPEED = 217, ATA_ID_PIO4 = (1 << 1), @@ -248,6 +251,7 @@ enum { ATA_CMD_SMART = 0xB0, ATA_CMD_MEDIA_LOCK = 0xDE, ATA_CMD_MEDIA_UNLOCK = 0xDF, + ATA_CMD_DSM = 0x06, /* marked obsolete in the ATA/ATAPI-7 spec */ ATA_CMD_RESTORE = 0x10, @@ -321,6 +325,9 @@ enum { ATA_SMART_READ_VALUES = 0xD0, ATA_SMART_READ_THRESHOLDS = 0xD1, + /* feature values for Data Set Management */ + ATA_DSM_TRIM = 0x01, + /* password used in LBA Mid / LBA High for executing SMART commands */ ATA_SMART_LBAM_PASS = 0x4F, ATA_SMART_LBAH_PASS = 0xC2, @@ -723,6 +730,14 @@ static inline int ata_id_has_unload(const u16 *id) return 0; } +static inline int ata_id_has_trim(const u16 *id) +{ + if (ata_id_major_version(id) >= 7 && + (id[ATA_ID_DATA_SET_MGMT] & 1)) + return 1; + return 0; +} + static inline int ata_id_current_chs_valid(const u16 *id) { /* For ATA-1 devices, if the INITIALIZE DEVICE PARAMETERS command @@ -863,6 +878,32 @@ static inline void ata_id_to_hd_driveid(u16 *id) #endif } +/* + * Write up to 'max' LBA Range Entries to the buffer that will cover the + * extent from sector to sector + count. This is used for TRIM and for + * ADD LBA(S) TO NV CACHE PINNED SET. + */ +static inline unsigned ata_set_lba_range_entries(void *_buffer, unsigned max, + u64 sector, unsigned long count) +{ + __le64 *buffer = _buffer; + unsigned i = 0; + + while (i < max) { + u64 entry = sector | + ((u64)(count > 0xffff ? 0xffff : count) << 48); + buffer[i++] = __cpu_to_le64(entry); + if (count <= 0xffff) + break; + count -= 0xffff; + sector += 0xffff; + } + + max = ALIGN(i * 8, 512); + memset(buffer + i, 0, max - i * 8); + return max; +} + static inline int is_multi_taskfile(struct ata_taskfile *tf) { return (tf->command == ATA_CMD_READ_MULTI) || -- cgit v1.2.3 From e0d3bafd02586cfde286c320f56906fd9fa8d256 Mon Sep 17 00:00:00 2001 From: Sri Deevi Date: Tue, 3 Mar 2009 14:37:50 -0300 Subject: V4L/DVB (10954): Add cx231xx USB driver Signed-off-by: Srinivasa Deevi [mchehab@redhat.com: Remove the Kconfig changes, to avoid git breakages] Signed-off-by: Mauro Carvalho Chehab --- drivers/media/video/Makefile | 1 + drivers/media/video/cx231xx/Kconfig | 35 + drivers/media/video/cx231xx/Makefile | 15 + drivers/media/video/cx231xx/cx231xx-audio.c | 581 ++++++ drivers/media/video/cx231xx/cx231xx-avcore.c | 2289 ++++++++++++++++++++++ drivers/media/video/cx231xx/cx231xx-cards.c | 935 +++++++++ drivers/media/video/cx231xx/cx231xx-conf-reg.h | 491 +++++ drivers/media/video/cx231xx/cx231xx-core.c | 1167 ++++++++++++ drivers/media/video/cx231xx/cx231xx-dvb.c | 565 ++++++ drivers/media/video/cx231xx/cx231xx-i2c.c | 577 ++++++ drivers/media/video/cx231xx/cx231xx-input.c | 250 +++ drivers/media/video/cx231xx/cx231xx-reg.h | 1574 +++++++++++++++ drivers/media/video/cx231xx/cx231xx-vbi.c | 693 +++++++ drivers/media/video/cx231xx/cx231xx-vbi.h | 61 + drivers/media/video/cx231xx/cx231xx-video.c | 2440 ++++++++++++++++++++++++ drivers/media/video/cx231xx/cx231xx.h | 762 ++++++++ include/linux/i2c-id.h | 1 + 17 files changed, 12437 insertions(+) create mode 100644 drivers/media/video/cx231xx/Kconfig create mode 100644 drivers/media/video/cx231xx/Makefile create mode 100644 drivers/media/video/cx231xx/cx231xx-audio.c create mode 100644 drivers/media/video/cx231xx/cx231xx-avcore.c create mode 100644 drivers/media/video/cx231xx/cx231xx-cards.c create mode 100644 drivers/media/video/cx231xx/cx231xx-conf-reg.h 
create mode 100644 drivers/media/video/cx231xx/cx231xx-core.c create mode 100644 drivers/media/video/cx231xx/cx231xx-dvb.c create mode 100644 drivers/media/video/cx231xx/cx231xx-i2c.c create mode 100644 drivers/media/video/cx231xx/cx231xx-input.c create mode 100644 drivers/media/video/cx231xx/cx231xx-reg.h create mode 100644 drivers/media/video/cx231xx/cx231xx-vbi.c create mode 100644 drivers/media/video/cx231xx/cx231xx-vbi.h create mode 100644 drivers/media/video/cx231xx/cx231xx-video.c create mode 100644 drivers/media/video/cx231xx/cx231xx.h (limited to 'include/linux') diff --git a/drivers/media/video/Makefile b/drivers/media/video/Makefile index ba02977adf10..7c0bd6e78312 100644 --- a/drivers/media/video/Makefile +++ b/drivers/media/video/Makefile @@ -67,6 +67,7 @@ obj-$(CONFIG_VIDEO_MEYE) += meye.o obj-$(CONFIG_VIDEO_SAA7134) += saa7134/ obj-$(CONFIG_VIDEO_CX88) += cx88/ obj-$(CONFIG_VIDEO_EM28XX) += em28xx/ +obj-$(CONFIG_VIDEO_CX231XX) += cx231xx/ obj-$(CONFIG_VIDEO_USBVISION) += usbvision/ obj-$(CONFIG_VIDEO_TVP5150) += tvp5150.o obj-$(CONFIG_VIDEO_TVP514X) += tvp514x.o diff --git a/drivers/media/video/cx231xx/Kconfig b/drivers/media/video/cx231xx/Kconfig new file mode 100644 index 000000000000..0f0e2b9d9853 --- /dev/null +++ b/drivers/media/video/cx231xx/Kconfig @@ -0,0 +1,35 @@ +config VIDEO_CX231XX + tristate "Conexant cx231xx USB video capture support" + depends on VIDEO_DEV && I2C && INPUT + select VIDEO_TUNER + select VIDEO_TVEEPROM + select VIDEO_IR + select VIDEOBUF_VMALLOC + select VIDEO_CX25840 + select VIDEO_CX231XX_ALSA + + ---help--- + This is a video4linux driver for Conexant 231xx USB based TV cards. + + To compile this driver as a module, choose M here: the + module will be called cx231xx + +config VIDEO_CX231XX_ALSA + tristate "Conexant Cx231xx ALSA audio module" + depends on VIDEO_CX231XX && SND + select SND_PCM + + ---help--- + This is an ALSA driver for Cx231xx USB based TV cards. + + To compile this driver as a module, choose M here: the + module will be called cx231xx-alsa + +config VIDEO_CX231XX_DVB + tristate "DVB/ATSC Support for Cx231xx based TV cards" + depends on VIDEO_CX231XX && DVB_CORE + select VIDEOBUF_DVB + select MEDIA_TUNER_XC5000 if !DVB_FE_CUSTOMIZE + ---help--- + This adds support for DVB cards based on the + Conexant cx231xx chips. 
diff --git a/drivers/media/video/cx231xx/Makefile b/drivers/media/video/cx231xx/Makefile new file mode 100644 index 000000000000..2590a09f3442 --- /dev/null +++ b/drivers/media/video/cx231xx/Makefile @@ -0,0 +1,15 @@ +cx231xx-objs := cx231xx-video.o cx231xx-i2c.o cx231xx-cards.o cx231xx-core.o \ + cx231xx-avcore.o cx231xx-pcb-config.o cx231xx-vbi.o + +cx231xx-alsa-objs := cx231xx-audio.o + + +obj-$(CONFIG_VIDEO_CX231XX) += cx231xx.o +obj-$(CONFIG_VIDEO_CX231XX_ALSA) += cx231xx-alsa.o +obj-$(CONFIG_VIDEO_CX231XX_DVB) += cx231xx-dvb.o + +EXTRA_CFLAGS += -Idrivers/media/video +EXTRA_CFLAGS += -Idrivers/media/common/tuners +EXTRA_CFLAGS += -Idrivers/media/dvb/dvb-core +EXTRA_CFLAGS += -Idrivers/media/dvb/frontends + diff --git a/drivers/media/video/cx231xx/cx231xx-audio.c b/drivers/media/video/cx231xx/cx231xx-audio.c new file mode 100644 index 000000000000..e4335e2a4103 --- /dev/null +++ b/drivers/media/video/cx231xx/cx231xx-audio.c @@ -0,0 +1,581 @@ +/* + * Conexant Cx231xx audio extension + * + * Copyright (C) 2008 + * Based on em28xx driver + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cx231xx.h" +#include "cx231xx-pcb-config.h" + +static int debug; +module_param(debug, int, 0644); +MODULE_PARM_DESC(debug, "activates debug info"); + +#define dprintk(fmt, arg...) 
do { \ + if (debug) \ + printk(KERN_INFO "cx231xx-audio %s: " fmt, \ + __func__, ##arg); \ + } while (0) + +static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX; + +static int cx231xx_isoc_audio_deinit(struct cx231xx *dev) +{ + int i; + + dprintk("Stopping isoc\n"); + + + for (i = 0; i < CX231XX_AUDIO_BUFS; i++) { + if(dev->adev.urb[i]) { + if (!irqs_disabled()) + usb_kill_urb(dev->adev.urb[i]); + else + usb_unlink_urb(dev->adev.urb[i]); + + usb_free_urb(dev->adev.urb[i]); + dev->adev.urb[i] = NULL; + + kfree(dev->adev.transfer_buffer[i]); + dev->adev.transfer_buffer[i] = NULL; + + } + } + + return 0; +} + +static void cx231xx_audio_isocirq(struct urb *urb) +{ + struct cx231xx *dev = urb->context; + int i; + unsigned int oldptr; + int period_elapsed = 0; + int status; + unsigned char *cp; + unsigned int stride; + struct snd_pcm_substream *substream; + struct snd_pcm_runtime *runtime; + + switch (urb->status) { + case 0: /* success */ + case -ETIMEDOUT: /* NAK */ + break; + case -ECONNRESET: /* kill */ + case -ENOENT: + case -ESHUTDOWN: + return; + default: /* error */ + dprintk("urb completition error %d.\n", urb->status); + break; + } + + if (dev->adev.capture_pcm_substream) { + substream = dev->adev.capture_pcm_substream; + runtime = substream->runtime; + stride = runtime->frame_bits >> 3; + + for (i = 0; i < urb->number_of_packets; i++) { + int length = + urb->iso_frame_desc[i].actual_length / stride; + cp = (unsigned char *)urb->transfer_buffer + + urb->iso_frame_desc[i].offset; + + if (!length) + continue; + + oldptr = dev->adev.hwptr_done_capture; + if (oldptr + length >= runtime->buffer_size) { + unsigned int cnt = + runtime->buffer_size - oldptr; + memcpy(runtime->dma_area + oldptr * stride, cp, + cnt * stride); + memcpy(runtime->dma_area, cp + cnt * stride, + length * stride - cnt * stride); + } else { + memcpy(runtime->dma_area + oldptr * stride, cp, + length * stride); + } + + snd_pcm_stream_lock(substream); + + dev->adev.hwptr_done_capture += length; + if (dev->adev.hwptr_done_capture >= + runtime->buffer_size) + dev->adev.hwptr_done_capture -= + runtime->buffer_size; + + dev->adev.capture_transfer_done += length; + if (dev->adev.capture_transfer_done >= + runtime->period_size) { + dev->adev.capture_transfer_done -= + runtime->period_size; + period_elapsed = 1; + } + + snd_pcm_stream_unlock(substream); + } + if (period_elapsed) + snd_pcm_period_elapsed(substream); + } + urb->status = 0; + + status = usb_submit_urb(urb, GFP_ATOMIC); + if (status < 0) { + cx231xx_errdev("resubmit of audio urb failed (error=%i)\n", + status); + } + return; +} + +static int cx231xx_init_audio_isoc(struct cx231xx *dev) +{ + int i, errCode; + int sb_size; + + cx231xx_info("%s: Starting AUDIO transfers\n",__func__); + + sb_size = CX231XX_NUM_AUDIO_PACKETS * dev->adev.max_pkt_size; + + for (i = 0; i < CX231XX_AUDIO_BUFS; i++) { + struct urb *urb; + int j, k; + + dev->adev.transfer_buffer[i] = kmalloc(sb_size, GFP_ATOMIC); + if (!dev->adev.transfer_buffer[i]) + return -ENOMEM; + + memset(dev->adev.transfer_buffer[i], 0x80, sb_size); + urb = usb_alloc_urb(CX231XX_NUM_AUDIO_PACKETS, GFP_ATOMIC); + if (!urb) { + cx231xx_errdev("usb_alloc_urb failed!\n"); + for (j = 0; j < i; j++) { + usb_free_urb(dev->adev.urb[j]); + kfree(dev->adev.transfer_buffer[j]); + } + return -ENOMEM; + } + + urb->dev = dev->udev; + urb->context = dev; + urb->pipe = usb_rcvisocpipe(dev->udev, dev->adev.end_point_addr); + urb->transfer_flags = URB_ISO_ASAP; + urb->transfer_buffer = dev->adev.transfer_buffer[i]; + urb->interval = 1; + 
urb->complete = cx231xx_audio_isocirq; + urb->number_of_packets = CX231XX_NUM_AUDIO_PACKETS; + urb->transfer_buffer_length = sb_size; + + for (j = k = 0; j < CX231XX_NUM_AUDIO_PACKETS; + j++, k += dev->adev.max_pkt_size) { + urb->iso_frame_desc[j].offset = k; + urb->iso_frame_desc[j].length = + dev->adev.max_pkt_size; + } + dev->adev.urb[i] = urb; + } + + for (i = 0; i < CX231XX_AUDIO_BUFS; i++) { + errCode = usb_submit_urb(dev->adev.urb[i], GFP_ATOMIC); + if (errCode < 0) { + cx231xx_isoc_audio_deinit(dev); + return errCode; + } + } + + return errCode; +} + +static int cx231xx_cmd(struct cx231xx *dev, int cmd, int arg) +{ + dprintk("%s transfer\n", (dev->adev.capture_stream == STREAM_ON)? + "stop" : "start"); + + switch (cmd) { + case CX231XX_CAPTURE_STREAM_EN: + if (dev->adev.capture_stream == STREAM_OFF && arg == 1) { + dev->adev.capture_stream = STREAM_ON; + cx231xx_init_audio_isoc(dev); + } else if (dev->adev.capture_stream == STREAM_ON && arg == 0) { + dev->adev.capture_stream = STREAM_OFF; + cx231xx_isoc_audio_deinit(dev); + } else { + cx231xx_errdev( "An underrun very likely occurred. " + "Ignoring it.\n"); + } + return 0; + default: + return -EINVAL; + } +} + +static int snd_pcm_alloc_vmalloc_buffer(struct snd_pcm_substream *subs, + size_t size) +{ + struct snd_pcm_runtime *runtime = subs->runtime; + + dprintk("Allocating vbuffer\n"); + if (runtime->dma_area) { + if (runtime->dma_bytes > size) + return 0; + + vfree(runtime->dma_area); + } + runtime->dma_area = vmalloc(size); + if (!runtime->dma_area) + return -ENOMEM; + + runtime->dma_bytes = size; + + return 0; +} + +static struct snd_pcm_hardware snd_cx231xx_hw_capture = { + .info = SNDRV_PCM_INFO_BLOCK_TRANSFER | + SNDRV_PCM_INFO_MMAP | + SNDRV_PCM_INFO_INTERLEAVED | + SNDRV_PCM_INFO_MMAP_VALID, + + .formats = SNDRV_PCM_FMTBIT_S16_LE, + + .rates = SNDRV_PCM_RATE_CONTINUOUS | SNDRV_PCM_RATE_KNOT, + + .rate_min = 48000, + .rate_max = 48000, + .channels_min = 2, + .channels_max = 2, + .buffer_bytes_max = 62720 * 8, /* just about the value in usbaudio.c */ + .period_bytes_min = 64, /* 12544/2, */ + .period_bytes_max = 12544, + .periods_min = 2, + .periods_max = 98, /* 12544, */ +}; + +static int snd_cx231xx_capture_open(struct snd_pcm_substream *substream) +{ + struct cx231xx *dev = snd_pcm_substream_chip(substream); + struct snd_pcm_runtime *runtime = substream->runtime; + int ret = 0; + + dprintk("opening device and trying to acquire exclusive lock\n"); + + if (!dev) { + cx231xx_errdev("BUG: cx231xx can't find device struct." 
+ " Can't proceed with open\n"); + return -ENODEV; + } + + /* Sets volume, mute, etc */ + dev->mute = 0; + + /* set alternate setting for audio interface */ + ret = cx231xx_set_alt_setting(dev, INDEX_AUDIO, 1); /* 1 - 48000 samples per sec */ + if (ret < 0) { + cx231xx_errdev("failed to set alternate setting !\n"); + + return ret; + } + + /* inform hardware to start streaming */ + ret = cx231xx_capture_start(dev, 1, Audio); + + runtime->hw = snd_cx231xx_hw_capture; + + mutex_lock(&dev->lock); + dev->adev.users++; + mutex_unlock(&dev->lock); + + snd_pcm_hw_constraint_integer(runtime, SNDRV_PCM_HW_PARAM_PERIODS); + dev->adev.capture_pcm_substream = substream; + runtime->private_data = dev; + + return 0; +} + +static int snd_cx231xx_pcm_close(struct snd_pcm_substream *substream) +{ + int ret; + struct cx231xx *dev = snd_pcm_substream_chip(substream); + + + dprintk("closing device\n"); + + /* set alternate setting for audio interface */ + ret = cx231xx_set_alt_setting(dev, INDEX_AUDIO, 0); /* 1 - 48000 samples per sec */ + if (ret < 0) { + cx231xx_errdev("failed to set alternate setting !\n"); + + return ret; + } + + /* inform hardware to start streaming */ + ret = cx231xx_capture_start(dev, 0, Audio); + + dev->mute = 1; + mutex_lock(&dev->lock); + dev->adev.users--; + mutex_unlock(&dev->lock); + + if (dev->adev.users == 0 && dev->adev.shutdown == 1) { + dprintk("audio users: %d\n", dev->adev.users); + dprintk("disabling audio stream!\n"); + dev->adev.shutdown = 0; + dprintk("released lock\n"); + cx231xx_cmd(dev, CX231XX_CAPTURE_STREAM_EN, 0); + } + return 0; +} + +static int snd_cx231xx_hw_capture_params(struct snd_pcm_substream *substream, + struct snd_pcm_hw_params *hw_params) +{ + unsigned int channels, rate, format; + int ret; + + dprintk("Setting capture parameters\n"); + + ret = snd_pcm_alloc_vmalloc_buffer(substream, + params_buffer_bytes(hw_params)); + format = params_format(hw_params); + rate = params_rate(hw_params); + channels = params_channels(hw_params); + + /* TODO: set up cx231xx audio chip to deliver the correct audio format, + current default is 48000hz multiplexed => 96000hz mono + which shouldn't matter since analogue TV only supports mono */ + return 0; +} + +static int snd_cx231xx_hw_capture_free(struct snd_pcm_substream *substream) +{ + struct cx231xx *dev = snd_pcm_substream_chip(substream); + + dprintk("Stop capture, if needed\n"); + + if (dev->adev.capture_stream == STREAM_ON) + cx231xx_cmd(dev, CX231XX_CAPTURE_STREAM_EN, CX231XX_STOP_AUDIO); + + return 0; +} + +static int snd_cx231xx_prepare(struct snd_pcm_substream *substream) +{ + return 0; +} + +static int snd_cx231xx_capture_trigger(struct snd_pcm_substream *substream, + int cmd) +{ + struct cx231xx *dev = snd_pcm_substream_chip(substream); + int retval; + + + dprintk("Should %s capture\n", (cmd == SNDRV_PCM_TRIGGER_START)? 
+ "start": "stop"); + + spin_lock(&dev->adev.slock); + switch (cmd) { + case SNDRV_PCM_TRIGGER_START: + cx231xx_cmd(dev, CX231XX_CAPTURE_STREAM_EN, CX231XX_START_AUDIO); + retval = 0; + break; + case SNDRV_PCM_TRIGGER_STOP: + cx231xx_cmd(dev, CX231XX_CAPTURE_STREAM_EN, CX231XX_STOP_AUDIO); + retval = 0; + break; + default: + retval = -EINVAL; + } + + spin_unlock(&dev->adev.slock); + return retval; +} + +static snd_pcm_uframes_t snd_cx231xx_capture_pointer(struct snd_pcm_substream + *substream) +{ + struct cx231xx *dev; + unsigned long flags; + snd_pcm_uframes_t hwptr_done; + + dev = snd_pcm_substream_chip(substream); + + spin_lock_irqsave(&dev->adev.slock, flags); + hwptr_done = dev->adev.hwptr_done_capture; + spin_unlock_irqrestore(&dev->adev.slock, flags); + + return hwptr_done; +} + +static struct page *snd_pcm_get_vmalloc_page(struct snd_pcm_substream *subs, + unsigned long offset) +{ + void *pageptr = subs->runtime->dma_area + offset; + + return vmalloc_to_page(pageptr); +} + +static struct snd_pcm_ops snd_cx231xx_pcm_capture = { + .open = snd_cx231xx_capture_open, + .close = snd_cx231xx_pcm_close, + .ioctl = snd_pcm_lib_ioctl, + .hw_params = snd_cx231xx_hw_capture_params, + .hw_free = snd_cx231xx_hw_capture_free, + .prepare = snd_cx231xx_prepare, + .trigger = snd_cx231xx_capture_trigger, + .pointer = snd_cx231xx_capture_pointer, + .page = snd_pcm_get_vmalloc_page, +}; + +static int cx231xx_audio_init(struct cx231xx *dev) +{ + struct cx231xx_audio *adev = &dev->adev; + struct snd_pcm *pcm; + struct snd_card *card; + static int devnr; + int err; + struct usb_interface *uif; + int i, isoc_pipe = 0; + + if (dev->has_alsa_audio != 1) { + /* This device does not support the extension (in this case + the device is expecting the snd-usb-audio module or + doesn't have analog audio support at all) */ + return 0; + } + + cx231xx_info("cx231xx-audio.c: probing for cx231xx " + "non standard usbaudio\n"); + + card = snd_card_new(index[devnr], "Cx231xx Audio", THIS_MODULE, 0); + if (card == NULL) { + return -ENOMEM; + } + + spin_lock_init(&adev->slock); + err = snd_pcm_new(card, "Cx231xx Audio", 0, 0, 1, &pcm); + if (err < 0) { + snd_card_free(card); + return err; + } + + snd_pcm_set_ops(pcm, SNDRV_PCM_STREAM_CAPTURE, &snd_cx231xx_pcm_capture); + pcm->info_flags = 0; + pcm->private_data = dev; + strcpy(pcm->name, "Conexant cx231xx Capture"); + strcpy(card->driver, "Conexant cx231xx Audio"); + strcpy(card->shortname, "Cx231xx Audio"); + strcpy(card->longname, "Conexant cx231xx Audio"); + + err = snd_card_register(card); + if (err < 0) { + snd_card_free(card); + return err; + } + adev->sndcard = card; + adev->udev = dev->udev; + + /* compute alternate max packet sizes for Audio */ + uif = dev->udev->actconfig->interface[dev->current_pcb_config.hs_config_info[0].interface_info.audio_index+1]; + + adev->end_point_addr = le16_to_cpu(uif->altsetting[0].endpoint[isoc_pipe].desc.bEndpointAddress); + + adev->num_alt = uif->num_altsetting; + cx231xx_info(": EndPoint Addr 0x%x, Alternate settings: %i\n", adev->end_point_addr, + adev->num_alt); + adev->alt_max_pkt_size = kmalloc(32 * adev->num_alt, GFP_KERNEL); + + if (adev->alt_max_pkt_size == NULL) { + cx231xx_errdev("out of memory!\n"); + return -ENOMEM; + } + + for (i = 0; i < adev->num_alt ; i++) { + u16 tmp = le16_to_cpu(uif->altsetting[i].endpoint[isoc_pipe].desc. 
+ wMaxPacketSize); + adev->alt_max_pkt_size[i] = + (tmp & 0x07ff) * (((tmp & 0x1800) >> 11) + 1); + cx231xx_info("Alternate setting %i, max size= %i\n", i, + adev->alt_max_pkt_size[i]); + } + + return 0; +} + +static int cx231xx_audio_fini(struct cx231xx *dev) +{ + if (dev == NULL) + return 0; + + if (dev->has_alsa_audio != 1) { + /* This device does not support the extension (in this case + the device is expecting the snd-usb-audio module or + doesn't have analog audio support at all) */ + return 0; + } + + if (dev->adev.sndcard) { + snd_card_free(dev->adev.sndcard); + kfree(dev->adev.alt_max_pkt_size); + dev->adev.sndcard = NULL; + } + + return 0; +} + +static struct cx231xx_ops audio_ops = { + .id = CX231XX_AUDIO, + .name = "Cx231xx Audio Extension", + .init = cx231xx_audio_init, + .fini = cx231xx_audio_fini, +}; + +static int __init cx231xx_alsa_register(void) +{ + return cx231xx_register_extension(&audio_ops); +} + +static void __exit cx231xx_alsa_unregister(void) +{ + cx231xx_unregister_extension(&audio_ops); +} + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Srinivasa Deevi "); +MODULE_DESCRIPTION("Cx231xx Audio driver"); + +module_init(cx231xx_alsa_register); +module_exit(cx231xx_alsa_unregister); diff --git a/drivers/media/video/cx231xx/cx231xx-avcore.c b/drivers/media/video/cx231xx/cx231xx-avcore.c new file mode 100644 index 000000000000..b5597337966f --- /dev/null +++ b/drivers/media/video/cx231xx/cx231xx-avcore.c @@ -0,0 +1,2289 @@ +/* + cx231xx_avcore.c - driver for Conexant Cx23100/101/102 USB video capture devices + + Copyright (C) 2008 + + This program contains the specific code to control the avdecoder chip and + other related usb control functions for cx231xx based chipset. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "cx231xx.h" + + +/************************************************************************************* + * C O L I B R I - B L O C K C O N T R O L functions * + *************************************************************************************/ +int cx231xx_colibri_init_super_block(struct cx231xx *dev, u32 ref_count) +{ + int status = 0; + u8 temp = 0; + u32 colibri_power_status = 0; + int i = 0; + + /* super block initialize */ + temp = (u8)(ref_count & 0xff); + status = cx231xx_write_i2c_data(dev, Colibri_DEVICE_ADDRESS, SUP_BLK_TUNE2, 2, temp, 1); + + status = cx231xx_read_i2c_data(dev, Colibri_DEVICE_ADDRESS, SUP_BLK_TUNE2, 2, &colibri_power_status, 1); + + temp = (u8)((ref_count & 0x300) >> 8); + temp |= 0x40; + status = cx231xx_write_i2c_data(dev, Colibri_DEVICE_ADDRESS, SUP_BLK_TUNE1, 2, temp, 1); + status = cx231xx_write_i2c_data(dev, Colibri_DEVICE_ADDRESS, SUP_BLK_PLL2, 2, 0x0f, 1); + + /* enable pll */ + while(colibri_power_status != 0x18) + { + status = cx231xx_write_i2c_data(dev, Colibri_DEVICE_ADDRESS, SUP_BLK_PWRDN, 2, 0x18, 1); + status = cx231xx_read_i2c_data(dev, Colibri_DEVICE_ADDRESS, SUP_BLK_PWRDN, 2, &colibri_power_status, 1); + colibri_power_status &= 0xff; + if(status < 0) { + cx231xx_info(": Init Super Block failed in sending/receiving cmds\n"); + break; + } + i++; + if( i == 10) { + cx231xx_info(": Init Super Block force break in loop !!!!\n"); + status = -1; + break; + } + } + + if(status < 0 ) + return status; + + /* start tuning filter */ + status = cx231xx_write_i2c_data(dev, Colibri_DEVICE_ADDRESS, SUP_BLK_TUNE3, 2, 0x40, 1); + msleep(5); + + /* exit tuning */ + status = cx231xx_write_i2c_data(dev, Colibri_DEVICE_ADDRESS, SUP_BLK_TUNE3, 2, 0x00, 1); + + return status; +} + +int cx231xx_colibri_init_channels(struct cx231xx *dev) +{ + int status = 0; + + /* power up all 3 channels, clear pd_buffer */ + status = cx231xx_write_i2c_data(dev, Colibri_DEVICE_ADDRESS, ADC_PWRDN_CLAMP_CH1, 2, 0x00, 1); + status = cx231xx_write_i2c_data(dev, Colibri_DEVICE_ADDRESS, ADC_PWRDN_CLAMP_CH2, 2, 0x00, 1); + status = cx231xx_write_i2c_data(dev, Colibri_DEVICE_ADDRESS, ADC_PWRDN_CLAMP_CH3, 2, 0x00, 1); + + /* Enable quantizer calibration */ + status = cx231xx_write_i2c_data(dev, Colibri_DEVICE_ADDRESS, ADC_COM_QUANT, 2, 0x02, 1); + + /* channel initialize, force modulator (fb) reset */ + status = cx231xx_write_i2c_data(dev, Colibri_DEVICE_ADDRESS, ADC_FB_FRCRST_CH1, 2, 0x17, 1); + status = cx231xx_write_i2c_data(dev, Colibri_DEVICE_ADDRESS, ADC_FB_FRCRST_CH2, 2, 0x17, 1); + status = cx231xx_write_i2c_data(dev, Colibri_DEVICE_ADDRESS, ADC_FB_FRCRST_CH3, 2, 0x17, 1); + + /* start quantilizer calibration */ + status = cx231xx_write_i2c_data(dev, Colibri_DEVICE_ADDRESS, ADC_CAL_ATEST_CH1, 2, 0x10, 1); + status = cx231xx_write_i2c_data(dev, Colibri_DEVICE_ADDRESS, ADC_CAL_ATEST_CH2, 2, 0x10, 1); + status = cx231xx_write_i2c_data(dev, Colibri_DEVICE_ADDRESS, ADC_CAL_ATEST_CH3, 2, 0x10, 1); + msleep(5); + + /* exit modulator (fb) reset */ + status = cx231xx_write_i2c_data(dev, Colibri_DEVICE_ADDRESS, ADC_FB_FRCRST_CH1, 2, 0x07, 1); + status = cx231xx_write_i2c_data(dev, Colibri_DEVICE_ADDRESS, ADC_FB_FRCRST_CH2, 2, 0x07, 1); + status = cx231xx_write_i2c_data(dev, Colibri_DEVICE_ADDRESS, ADC_FB_FRCRST_CH3, 2, 0x07, 1); + + /* enable the pre_clamp in each channel for single-ended input */ + status = cx231xx_write_i2c_data(dev, 
Colibri_DEVICE_ADDRESS, ADC_NTF_PRECLMP_EN_CH1, 2, 0xf0, 1);
+	status = cx231xx_write_i2c_data(dev, Colibri_DEVICE_ADDRESS, ADC_NTF_PRECLMP_EN_CH2, 2, 0xf0, 1);
+	status = cx231xx_write_i2c_data(dev, Colibri_DEVICE_ADDRESS, ADC_NTF_PRECLMP_EN_CH3, 2, 0xf0, 1);
+
+	/* use diode instead of resistor, so set term_en to 0, res_en to 0 */
+	status = cx231xx_reg_mask_write(dev, Colibri_DEVICE_ADDRESS, 8, ADC_QGAIN_RES_TRM_CH1, 3, 7, 0x00);
+	status = cx231xx_reg_mask_write(dev, Colibri_DEVICE_ADDRESS, 8, ADC_QGAIN_RES_TRM_CH2, 3, 7, 0x00);
+	status = cx231xx_reg_mask_write(dev, Colibri_DEVICE_ADDRESS, 8, ADC_QGAIN_RES_TRM_CH3, 3, 7, 0x00);
+
+	/* dynamic element matching off */
+	status = cx231xx_write_i2c_data(dev, Colibri_DEVICE_ADDRESS, ADC_DCSERVO_DEM_CH1, 2, 0x03, 1);
+	status = cx231xx_write_i2c_data(dev, Colibri_DEVICE_ADDRESS, ADC_DCSERVO_DEM_CH2, 2, 0x03, 1);
+	status = cx231xx_write_i2c_data(dev, Colibri_DEVICE_ADDRESS, ADC_DCSERVO_DEM_CH3, 2, 0x03, 1);
+
+	return status;
+}
+
+int cx231xx_colibri_setup_AFE_for_baseband(struct cx231xx *dev)
+{
+	u32 c_value = 0;
+	int status = 0;
+
+	status = cx231xx_read_i2c_data(dev, Colibri_DEVICE_ADDRESS, ADC_PWRDN_CLAMP_CH2, 2, &c_value, 1);
+	c_value &= (~(0x50));
+	status = cx231xx_write_i2c_data(dev, Colibri_DEVICE_ADDRESS, ADC_PWRDN_CLAMP_CH2, 2, c_value, 1);
+
+	return status;
+}
+
+/*
+	we have 3 channels
+	channel 1 ----- pin 1 to pin4(in reg is 1-4)
+	channel 2 ----- pin 5 to pin8(in reg is 5-8)
+	channel 3 ----- pin 9 to pin 12(in reg is 9-11)
+*/
+int cx231xx_colibri_set_input_mux(struct cx231xx *dev, u32 input_mux)
+{
+	u8 ch1_setting = (u8)input_mux;
+	u8 ch2_setting = (u8)(input_mux >> 8);
+	u8 ch3_setting = (u8)(input_mux >> 16);
+	int status = 0;
+	u32 value = 0;
+
+	if(ch1_setting != 0)
+	{
+		status = cx231xx_read_i2c_data(dev, Colibri_DEVICE_ADDRESS, ADC_INPUT_CH1, 2, &value, 1);
+		value &= (~INPUT_SEL_MASK);
+		value |= (ch1_setting-1)<<4;
+		value &= 0xff;
+		status = cx231xx_write_i2c_data(dev, Colibri_DEVICE_ADDRESS, ADC_INPUT_CH1, 2, value, 1);
+	}
+
+	if(ch2_setting != 0)
+	{
+		status = cx231xx_read_i2c_data(dev, Colibri_DEVICE_ADDRESS, ADC_INPUT_CH2, 2, &value, 1);
+		value &= (~INPUT_SEL_MASK);
+		value |= (ch2_setting-1)<<4;
+		value &= 0xff;
+		status = cx231xx_write_i2c_data(dev, Colibri_DEVICE_ADDRESS, ADC_INPUT_CH2, 2, value, 1);
+	}
+
+	/* For ch3_setting, the value to put in the register is 7 less than the input number */
+	if(ch3_setting != 0)
+	{
+		status = cx231xx_read_i2c_data(dev, Colibri_DEVICE_ADDRESS, ADC_INPUT_CH3, 2, &value, 1);
+		value &= (~INPUT_SEL_MASK);
+		value |= (ch3_setting-1)<<4;
+		value &= 0xff;
+		status = cx231xx_write_i2c_data(dev, Colibri_DEVICE_ADDRESS, ADC_INPUT_CH3, 2, value, 1);
+	}
+
+	return status;
+}
+
+int cx231xx_colibri_set_mode(struct cx231xx *dev, enum AFE_MODE mode)
+{
+	int status = 0;
+
+	switch(mode) {
+	case AFE_MODE_LOW_IF:
+		/* SetupAFEforLowIF(); */
+		break;
+	case AFE_MODE_BASEBAND:
+		status = cx231xx_colibri_setup_AFE_for_baseband(dev);
+		break;
+	case AFE_MODE_EU_HI_IF:
+		/* SetupAFEforEuHiIF(); */
+		break;
+	case AFE_MODE_US_HI_IF:
+		/* SetupAFEforUsHiIF(); */
+		break;
+	case AFE_MODE_JAPAN_HI_IF:
+		/* SetupAFEforJapanHiIF(); */
+		break;
+	}
+
+	if((mode != dev->colibri_mode) && (dev->video_input == CX231XX_VMUX_TELEVISION)) {
+		status = cx231xx_colibri_adjust_ref_count(dev, CX231XX_VMUX_TELEVISION);
+	}
+
+	dev->colibri_mode = mode;
+
+	return status;
+}
+
+/* For power saving in the EVK */
+int cx231xx_colibri_update_power_control(struct cx231xx *dev, AV_MODE avmode)
+{
+	u32
colibri_power_status = 0; + int status = 0; + + switch (dev->model) { + case CX231XX_BOARD_CNXT_RDE_250: + case CX231XX_BOARD_CNXT_RDU_250: + + if(avmode==POLARIS_AVMODE_ANALOGT_TV) + { + while(colibri_power_status != 0x18) { + status = cx231xx_write_i2c_data(dev, Colibri_DEVICE_ADDRESS, + SUP_BLK_PWRDN, 2, 0x18, 1); + status = cx231xx_read_i2c_data(dev, Colibri_DEVICE_ADDRESS, + SUP_BLK_PWRDN, 2, &colibri_power_status, 1); + if(status < 0 ) + break; + } + + status = cx231xx_write_i2c_data(dev, Colibri_DEVICE_ADDRESS, + ADC_PWRDN_CLAMP_CH1, 2, 0x00, 1); + status = cx231xx_write_i2c_data(dev, Colibri_DEVICE_ADDRESS, + ADC_PWRDN_CLAMP_CH2, 2, 0x00, 1); + status = cx231xx_write_i2c_data(dev, Colibri_DEVICE_ADDRESS, + ADC_PWRDN_CLAMP_CH3, 2, 0x00, 1); + } + else if(avmode==POLARIS_AVMODE_DIGITAL) { + status = cx231xx_write_i2c_data(dev, Colibri_DEVICE_ADDRESS, + ADC_PWRDN_CLAMP_CH1, 2, 0x70, 1); + status = cx231xx_write_i2c_data(dev, Colibri_DEVICE_ADDRESS, + ADC_PWRDN_CLAMP_CH2, 2, 0x70, 1); + status = cx231xx_write_i2c_data(dev, Colibri_DEVICE_ADDRESS, + ADC_PWRDN_CLAMP_CH3, 2, 0x70, 1); + + status = cx231xx_read_i2c_data(dev, Colibri_DEVICE_ADDRESS, + SUP_BLK_PWRDN, 2, &colibri_power_status, 1); + colibri_power_status |=0x07; + status = cx231xx_write_i2c_data(dev, Colibri_DEVICE_ADDRESS, + SUP_BLK_PWRDN, 2, colibri_power_status, 1); + } + else if(avmode==POLARIS_AVMODE_ENXTERNAL_AV) { + + while(colibri_power_status != 0x18) { + status = cx231xx_write_i2c_data(dev, Colibri_DEVICE_ADDRESS, + SUP_BLK_PWRDN, 2, 0x18, 1); + status = cx231xx_read_i2c_data(dev, Colibri_DEVICE_ADDRESS, + SUP_BLK_PWRDN, 2, &colibri_power_status, 1); + if(status < 0 ) + break; + } + + status = cx231xx_write_i2c_data(dev, Colibri_DEVICE_ADDRESS, + ADC_PWRDN_CLAMP_CH1, 2, 0x00, 1); + status = cx231xx_write_i2c_data(dev, Colibri_DEVICE_ADDRESS, + ADC_PWRDN_CLAMP_CH2, 2, 0x00, 1); + status = cx231xx_write_i2c_data(dev, Colibri_DEVICE_ADDRESS, + ADC_PWRDN_CLAMP_CH3, 2, 0x00, 1); + } + else { + cx231xx_info("Invalid AV mode input\n"); + status = -1; + } + break; + default: + if(avmode==POLARIS_AVMODE_ANALOGT_TV) + { + while(colibri_power_status != 0x18) { + status = cx231xx_write_i2c_data(dev, Colibri_DEVICE_ADDRESS, + SUP_BLK_PWRDN, 2, 0x18, 1); + status = cx231xx_read_i2c_data(dev, Colibri_DEVICE_ADDRESS, + SUP_BLK_PWRDN, 2, &colibri_power_status, 1); + if(status < 0 ) + break; + } + + status = cx231xx_write_i2c_data(dev, Colibri_DEVICE_ADDRESS, + ADC_PWRDN_CLAMP_CH1, 2, 0x40, 1); + status = cx231xx_write_i2c_data(dev, Colibri_DEVICE_ADDRESS, + ADC_PWRDN_CLAMP_CH2, 2, 0x40, 1); + status = cx231xx_write_i2c_data(dev, Colibri_DEVICE_ADDRESS, + ADC_PWRDN_CLAMP_CH3, 2, 0x00, 1); + } + else if(avmode==POLARIS_AVMODE_DIGITAL) { + status = cx231xx_write_i2c_data(dev, Colibri_DEVICE_ADDRESS, + ADC_PWRDN_CLAMP_CH1, 2, 0x70, 1); + status = cx231xx_write_i2c_data(dev, Colibri_DEVICE_ADDRESS, + ADC_PWRDN_CLAMP_CH2, 2, 0x70, 1); + status = cx231xx_write_i2c_data(dev, Colibri_DEVICE_ADDRESS, + ADC_PWRDN_CLAMP_CH3, 2, 0x70, 1); + + status = cx231xx_read_i2c_data(dev, Colibri_DEVICE_ADDRESS, + SUP_BLK_PWRDN, 2, &colibri_power_status, 1); + colibri_power_status |=0x07; + status = cx231xx_write_i2c_data(dev, Colibri_DEVICE_ADDRESS, + SUP_BLK_PWRDN, 2, colibri_power_status, 1); + } + else if(avmode==POLARIS_AVMODE_ENXTERNAL_AV) { + while(colibri_power_status != 0x18) { + status = cx231xx_write_i2c_data(dev, Colibri_DEVICE_ADDRESS, + SUP_BLK_PWRDN, 2, 0x18, 1); + status = cx231xx_read_i2c_data(dev, Colibri_DEVICE_ADDRESS, + SUP_BLK_PWRDN, 
2, &colibri_power_status, 1); + if(status < 0 ) + break; + } + + status = cx231xx_write_i2c_data(dev, Colibri_DEVICE_ADDRESS, + ADC_PWRDN_CLAMP_CH1, 2, 0x00, 1); + status = cx231xx_write_i2c_data(dev, Colibri_DEVICE_ADDRESS, + ADC_PWRDN_CLAMP_CH2, 2, 0x00, 1); + status = cx231xx_write_i2c_data(dev, Colibri_DEVICE_ADDRESS, + ADC_PWRDN_CLAMP_CH3, 2, 0x40, 1); + } + else { + cx231xx_info("Invalid AV mode input\n"); + status = -1; + } + } /* switch */ + + return status; +} + +int cx231xx_colibri_adjust_ref_count(struct cx231xx *dev, u32 video_input) +{ + u32 input_mode = 0; + u32 ntf_mode = 0; + int status = 0; + + dev->video_input = video_input; + + if(video_input == CX231XX_VMUX_TELEVISION) { + status = cx231xx_read_i2c_data(dev, Colibri_DEVICE_ADDRESS, ADC_INPUT_CH3, 2, &input_mode, 1); + status = cx231xx_read_i2c_data(dev, Colibri_DEVICE_ADDRESS, + ADC_NTF_PRECLMP_EN_CH3, 2, &ntf_mode, 1); + } + else { + status = cx231xx_read_i2c_data(dev, Colibri_DEVICE_ADDRESS, ADC_INPUT_CH1, 2, &input_mode, 1); + status = cx231xx_read_i2c_data(dev, Colibri_DEVICE_ADDRESS, + ADC_NTF_PRECLMP_EN_CH1, 2, &ntf_mode, 1); + } + + input_mode = (ntf_mode & 0x3) | ((input_mode & 0x6) << 1); + + switch(input_mode) + { + case SINGLE_ENDED: + dev->colibri_ref_count = 0x23C; + break; + case LOW_IF: + dev->colibri_ref_count = 0x24C; + break; + case EU_IF: + dev->colibri_ref_count = 0x258; + break; + case US_IF: + dev->colibri_ref_count = 0x260; + break; + default: + break; + } + + status = cx231xx_colibri_init_super_block(dev, dev->colibri_ref_count); + + return status; +} + + + +/************************************************************************************* + * V I D E O / A U D I O D E C O D E R C O N T R O L functions * + *************************************************************************************/ +int cx231xx_set_video_input_mux(struct cx231xx *dev, u8 input) +{ + int status = 0; + + switch(INPUT(input)->type) { + case CX231XX_VMUX_COMPOSITE1: + case CX231XX_VMUX_SVIDEO: + if((dev->current_pcb_config.type == USB_BUS_POWER) && + (dev->power_mode != POLARIS_AVMODE_ENXTERNAL_AV)) { + status = cx231xx_set_power_mode(dev, POLARIS_AVMODE_ENXTERNAL_AV); /* External AV */ + if (status < 0) { + cx231xx_errdev("%s: cx231xx_set_power_mode : Failed to set Power - errCode [%d]!\n", + __func__, status); + return status; + } + } + status = cx231xx_set_decoder_video_input(dev, INPUT(input)->type, INPUT(input)->vmux); + break; + case CX231XX_VMUX_TELEVISION: + case CX231XX_VMUX_CABLE: + if((dev->current_pcb_config.type == USB_BUS_POWER) && + (dev->power_mode != POLARIS_AVMODE_ANALOGT_TV)) { + status = cx231xx_set_power_mode(dev, POLARIS_AVMODE_ANALOGT_TV); /* Tuner */ + if (status < 0) { + cx231xx_errdev("%s: cx231xx_set_power_mode : Failed to set Power - errCode [%d]!\n", + __func__, status); + return status; + } + } + status = cx231xx_set_decoder_video_input(dev, CX231XX_VMUX_COMPOSITE1, INPUT(input)->vmux); + break; + default: + cx231xx_errdev("%s: cx231xx_set_power_mode : Unknown Input %d !\n", + __func__, INPUT(input)->type); + break; + } + + /* save the selection */ + dev->video_input = input; + + return status; +} + +int cx231xx_set_decoder_video_input(struct cx231xx *dev, u8 pin_type, u8 input) +{ + int status = 0; + u32 value = 0; + + if(pin_type != dev->video_input) { + status = cx231xx_colibri_adjust_ref_count(dev, pin_type); + if(status < 0 ) { + cx231xx_errdev("%s: cx231xx_colibri_adjust_ref_count :Failed to set Colibri input mux - errCode [%d]!\n", + __func__, status); + return status; + } + } + + /* 
call colibri block to set video inputs */ + status = cx231xx_colibri_set_input_mux(dev, input); + if(status < 0 ) { + cx231xx_errdev("%s: cx231xx_colibri_set_input_mux :Failed to set Colibri input mux - errCode [%d]!\n", + __func__, status); + return status; + } + + switch(pin_type) { + case CX231XX_VMUX_COMPOSITE1: + { + status = cx231xx_read_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, AFE_CTRL, 2, &value, 4); + value |= (0<<13)|(1<<4); + value &= ~(1<<5); + + value &= (~(0x1FF8000)); /* set [24:23] [22:15] to 0 */ + value |= 0x1000000; /* set FUNC_MODE[24:23] = 2 IF_MOD[22:15] = 0 */ + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, AFE_CTRL, 2, value, 4); + + status = cx231xx_read_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, OUT_CTRL1, 2, &value, 4); + value |= (1<<7); + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, OUT_CTRL1, 2, value, 4); + + /* Set vip 1.1 output mode */ + status = cx231xx_read_modify_write_i2c_dword(dev, HAMMERHEAD_I2C_ADDRESS, + OUT_CTRL1, FLD_OUT_MODE, OUT_MODE_VIP11); + + /* Tell DIF object to go to baseband mode */ + status = cx231xx_dif_set_standard(dev, DIF_USE_BASEBAND); + if (status < 0) { + cx231xx_errdev("%s: cx231xx_dif set to By pass mode - errCode [%d]!\n", + __func__, status); + return status; + } + + /* Read the DFE_CTRL1 register */ + status = cx231xx_read_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, DFE_CTRL1, 2, &value, 4); + + /* enable the VBI_GATE_EN */ + value |= FLD_VBI_GATE_EN; + + /* Enable the auto-VGA enable */ + value |= FLD_VGA_AUTO_EN; + + /* Write it back */ + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, DFE_CTRL1, 2, value, 4); + + /* Disable auto config of registers */ + status = cx231xx_read_modify_write_i2c_dword(dev, HAMMERHEAD_I2C_ADDRESS, + MODE_CTRL, FLD_ACFG_DIS, cx231xx_set_field(FLD_ACFG_DIS, 1)); + + /* Set CVBS input mode */ + status = cx231xx_read_modify_write_i2c_dword(dev, HAMMERHEAD_I2C_ADDRESS, + MODE_CTRL, FLD_INPUT_MODE, + cx231xx_set_field(FLD_INPUT_MODE, INPUT_MODE_CVBS_0)); + } + break; + case CX231XX_VMUX_SVIDEO: + { + /* Disable the use of DIF */ + + status = cx231xx_read_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, AFE_CTRL, 2, &value, 4); + + value &= (~(0x1FF8000)); /* set [24:23] [22:15] to 0 */ + value |= 0x1000010; /* set FUNC_MODE[24:23] = 2 + IF_MOD[22:15] = 0 DCR_BYP_CH2[4:4] = 1; */ + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, AFE_CTRL, 2, value, 4); + + /* Tell DIF object to go to baseband mode */ + status = cx231xx_dif_set_standard(dev, DIF_USE_BASEBAND); + if (status < 0) { + cx231xx_errdev("%s: cx231xx_dif set to By pass mode - errCode [%d]!\n", + __func__, status); + return status; + } + + /* Read the DFE_CTRL1 register */ + status = cx231xx_read_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, DFE_CTRL1, 2, &value, 4); + + /* enable the VBI_GATE_EN */ + value |= FLD_VBI_GATE_EN; + + /* Enable the auto-VGA enable */ + value |= FLD_VGA_AUTO_EN; + + /* Write it back */ + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, DFE_CTRL1, 2, value, 4); + + /* Disable auto config of registers */ + status = cx231xx_read_modify_write_i2c_dword(dev, HAMMERHEAD_I2C_ADDRESS, + MODE_CTRL, FLD_ACFG_DIS, cx231xx_set_field(FLD_ACFG_DIS, 1)); + + /* Set YC input mode */ + status = cx231xx_read_modify_write_i2c_dword(dev, HAMMERHEAD_I2C_ADDRESS, + MODE_CTRL, FLD_INPUT_MODE, + cx231xx_set_field(FLD_INPUT_MODE, INPUT_MODE_YC_1)); + + /* Chroma to ADC2 */ + status = cx231xx_read_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, AFE_CTRL, 2, &value, 4); + value |= FLD_CHROMA_IN_SEL; /* set the 
chroma in select */ + + /* Clear VGA_SEL_CH2 and VGA_SEL_CH3 (bits 7 and 8) This sets them to use video + rather than audio. Only one of the two will be in use. */ + value &= ~(FLD_VGA_SEL_CH2 | FLD_VGA_SEL_CH3); + + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, AFE_CTRL, 2, value, 4); + + status = cx231xx_colibri_set_mode(dev, AFE_MODE_BASEBAND); + } + break; + case CX231XX_VMUX_TELEVISION: + case CX231XX_VMUX_CABLE: + default: + { + switch(dev->model) { + case CX231XX_BOARD_CNXT_RDE_250: + case CX231XX_BOARD_CNXT_RDU_250: + { + /* Disable the use of DIF */ + + status = cx231xx_read_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, AFE_CTRL, 2, &value, 4); + value |= (0<<13)|(1<<4); + value &= ~(1<<5); + + value &= (~(0x1FF8000)); /* set [24:23] [22:15] to 0 */ + value |= 0x1000000; /* set FUNC_MODE[24:23] = 2 IF_MOD[22:15] = 0 */ + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, AFE_CTRL, 2, value, 4); + + status = cx231xx_read_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, OUT_CTRL1, 2, &value, 4); + value |= (1<<7); + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, OUT_CTRL1, 2, value, 4); + + /* Set vip 1.1 output mode */ + status = cx231xx_read_modify_write_i2c_dword(dev, HAMMERHEAD_I2C_ADDRESS, + OUT_CTRL1, FLD_OUT_MODE, OUT_MODE_VIP11); + + /* Tell DIF object to go to baseband mode */ + status = cx231xx_dif_set_standard(dev, DIF_USE_BASEBAND); + if (status < 0) { + cx231xx_errdev("%s: cx231xx_dif set to By pass mode - errCode [%d]!\n", + __func__, status); + return status; + } + + /* Read the DFE_CTRL1 register */ + status = cx231xx_read_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, DFE_CTRL1, 2, &value, 4); + + /* enable the VBI_GATE_EN */ + value |= FLD_VBI_GATE_EN; + + /* Enable the auto-VGA enable */ + value |= FLD_VGA_AUTO_EN; + + /* Write it back */ + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, DFE_CTRL1, 2, value, 4); + + /* Disable auto config of registers */ + status = cx231xx_read_modify_write_i2c_dword(dev, HAMMERHEAD_I2C_ADDRESS, + MODE_CTRL, FLD_ACFG_DIS, cx231xx_set_field(FLD_ACFG_DIS, 1)); + + /* Set CVBS input mode */ + status = cx231xx_read_modify_write_i2c_dword(dev, HAMMERHEAD_I2C_ADDRESS, + MODE_CTRL, FLD_INPUT_MODE, + cx231xx_set_field(FLD_INPUT_MODE, INPUT_MODE_CVBS_0)); + } + break; + default: + { + /* Enable the DIF for the tuner */ + + /* Reinitialize the DIF */ + status = cx231xx_dif_set_standard(dev, dev->norm); + if (status < 0) { + cx231xx_errdev("%s: cx231xx_dif set to By pass mode - errCode [%d]!\n", + __func__, status); + return status; + } + + /* Make sure bypass is cleared */ + status = cx231xx_read_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, DIF_MISC_CTRL, 2, &value, 4); + + /* Clear the bypass bit */ + value &= ~FLD_DIF_DIF_BYPASS; + + /* Enable the use of the DIF block */ + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, DIF_MISC_CTRL, 2, value, 4); + + /* Read the DFE_CTRL1 register */ + status = cx231xx_read_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, DFE_CTRL1, 2, &value, 4); + + /* Disable the VBI_GATE_EN */ + value &= ~FLD_VBI_GATE_EN; + + /* Enable the auto-VGA enable, AGC, and set the skip count to 2 */ + value |= FLD_VGA_AUTO_EN | FLD_AGC_AUTO_EN | 0x00200000; + + /* Write it back */ + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, DFE_CTRL1, 2, value, 4); + + /* Wait 15 ms */ + msleep(1); + + /* Disable the auto-VGA enable AGC */ + value &= ~(FLD_VGA_AUTO_EN); + + /* Write it back */ + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, DFE_CTRL1, 2, value, 4); + + /* Enable Polaris 
B0 AGC output */ + status = cx231xx_read_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, PIN_CTRL, 2, &value, 4); + value |=(FLD_OEF_AGC_RF)|(FLD_OEF_AGC_IFVGA)|(FLD_OEF_AGC_IF); + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, PIN_CTRL, 2, value, 4); + + /* Set vip 1.1 output mode */ + status = cx231xx_read_modify_write_i2c_dword(dev, HAMMERHEAD_I2C_ADDRESS, + OUT_CTRL1, FLD_OUT_MODE, OUT_MODE_VIP11); + + /* Disable auto config of registers */ + status = cx231xx_read_modify_write_i2c_dword(dev, HAMMERHEAD_I2C_ADDRESS, + MODE_CTRL, FLD_ACFG_DIS, cx231xx_set_field(FLD_ACFG_DIS, 1)); + + /* Set CVBS input mode */ + status = cx231xx_read_modify_write_i2c_dword(dev, HAMMERHEAD_I2C_ADDRESS, + MODE_CTRL, FLD_INPUT_MODE, + cx231xx_set_field(FLD_INPUT_MODE, INPUT_MODE_CVBS_0)); + + /* Set some bits in AFE_CTRL so that channel 2 or 3 is ready to receive audio */ + /* Clear clamp for channels 2 and 3 (bit 16-17) */ + /* Clear droop comp (bit 19-20) */ + /* Set VGA_SEL (for audio control) (bit 7-8) */ + status = cx231xx_read_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, AFE_CTRL, 2, &value, 4); + + value |= FLD_VGA_SEL_CH3 | FLD_VGA_SEL_CH2; + + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, AFE_CTRL, 2, value, 4); + } + break; + + } + } + break; + } + + /* Set raw VBI mode */ + status = cx231xx_read_modify_write_i2c_dword(dev, HAMMERHEAD_I2C_ADDRESS, + OUT_CTRL1, FLD_VBIHACTRAW_EN, + cx231xx_set_field(FLD_VBIHACTRAW_EN, 1)); + + status = cx231xx_read_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, OUT_CTRL1, 2, &value, 4); + if(value & 0x02) { + value |=(1<<19); + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, OUT_CTRL1, 2, value, 4); + } + + return status; +} + +/* + * Handle any video-mode specific overrides that are different on a per video standards + * basis after touching the MODE_CTRL register which resets many values for autodetect + */ +int cx231xx_do_mode_ctrl_overrides(struct cx231xx *dev) +{ + int status = 0; + + cx231xx_info("do_mode_ctrl_overrides : 0x%x\n", (unsigned int)dev->norm); + + /* Change the DFE_CTRL3 bp_percent to fix flagging */ + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, DFE_CTRL3, 2, 0xCD3F0280, 4); + + if( dev->norm & (V4L2_STD_NTSC_M | V4L2_STD_NTSC_M_JP | V4L2_STD_PAL_M) ) { + cx231xx_info("do_mode_ctrl_overrides NTSC\n"); + + /* Move the close caption lines out of active video, adjust the active video start point */ + status = cx231xx_read_modify_write_i2c_dword(dev, HAMMERHEAD_I2C_ADDRESS, + VERT_TIM_CTRL, FLD_VBLANK_CNT,0x18); + status = cx231xx_read_modify_write_i2c_dword(dev, HAMMERHEAD_I2C_ADDRESS, + VERT_TIM_CTRL, FLD_VACTIVE_CNT,0x1E6000); + status = cx231xx_read_modify_write_i2c_dword(dev, HAMMERHEAD_I2C_ADDRESS, + VERT_TIM_CTRL, FLD_V656BLANK_CNT,0x1E000000); + + status = cx231xx_read_modify_write_i2c_dword(dev, HAMMERHEAD_I2C_ADDRESS, + HORIZ_TIM_CTRL, FLD_HBLANK_CNT, + cx231xx_set_field(FLD_HBLANK_CNT, 0x79)); + } else if ( dev->norm & ( V4L2_STD_PAL_B | V4L2_STD_PAL_G | V4L2_STD_PAL_D | + V4L2_STD_PAL_I | V4L2_STD_PAL_N | V4L2_STD_PAL_Nc) ) { + cx231xx_info("do_mode_ctrl_overrides PAL\n"); + status = cx231xx_read_modify_write_i2c_dword(dev, HAMMERHEAD_I2C_ADDRESS, + VERT_TIM_CTRL, FLD_VBLANK_CNT,0x24); + /* Adjust the active video horizontal start point */ + status = cx231xx_read_modify_write_i2c_dword(dev, HAMMERHEAD_I2C_ADDRESS, + HORIZ_TIM_CTRL, FLD_HBLANK_CNT, + cx231xx_set_field(FLD_HBLANK_CNT, 0x85)); + } else if (dev->norm & ( V4L2_STD_SECAM_B | V4L2_STD_SECAM_D | V4L2_STD_SECAM_G | + V4L2_STD_SECAM_K | 
V4L2_STD_SECAM_K1 | V4L2_STD_SECAM_L | + V4L2_STD_SECAM_LC) ) { + cx231xx_info("do_mode_ctrl_overrides SECAM\n"); + status = cx231xx_read_modify_write_i2c_dword(dev, HAMMERHEAD_I2C_ADDRESS, + VERT_TIM_CTRL, FLD_VBLANK_CNT,0x24); + /* Adjust the active video horizontal start point */ + status = cx231xx_read_modify_write_i2c_dword(dev, HAMMERHEAD_I2C_ADDRESS, + HORIZ_TIM_CTRL, FLD_HBLANK_CNT, + cx231xx_set_field(FLD_HBLANK_CNT, 0x85)); + } + + return status; +} + +int cx231xx_set_audio_input(struct cx231xx *dev, u8 input) +{ + int status = 0; + enum AUDIO_INPUT ainput = AUDIO_INPUT_LINE; + + switch(INPUT(input)->amux) { + case CX231XX_AMUX_VIDEO: + ainput = AUDIO_INPUT_TUNER_TV; + break; + case CX231XX_AMUX_LINE_IN: + status = cx231xx_flatiron_set_audio_input(dev, input); + ainput = AUDIO_INPUT_LINE; + break; + default: + break; + } + + status = cx231xx_set_audio_decoder_input(dev, ainput); + + return status; +} + +int cx231xx_set_audio_decoder_input(struct cx231xx *dev, enum AUDIO_INPUT audio_input) +{ + u32 dwval; + int status; + u32 gen_ctrl; + u32 value = 0; + + /* Put it in soft reset */ + status = cx231xx_read_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, GENERAL_CTL, 2, &gen_ctrl, 1); + gen_ctrl |= 1; + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, GENERAL_CTL, 2, gen_ctrl, 1); + + switch(audio_input) + { + case AUDIO_INPUT_LINE: + + /* setup AUD_IO control from Merlin paralle output */ + value = cx231xx_set_field(FLD_AUD_CHAN1_SRC, AUD_CHAN_SRC_PARALLEL); + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, AUD_IO_CTRL, 2, value, 4); + + /* setup input to Merlin, SRC2 connect to AC97 + bypass upsample-by-2, slave mode, sony mode, left justify + adr 091c, dat 01000000 */ + status = cx231xx_read_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, AC97_CTL, 2, &dwval, 4); + + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, AC97_CTL, 2, (dwval | FLD_AC97_UP2X_BYPASS), 4); + + /* select the parallel1 and SRC3 */ + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, BAND_OUT_SEL, 2, + cx231xx_set_field(FLD_SRC3_IN_SEL, 0x0)| + cx231xx_set_field(FLD_SRC3_CLK_SEL, 0x0)| + cx231xx_set_field(FLD_PARALLEL1_SRC_SEL, 0x0), 4); + + /* unmute all, AC97 in, independence mode + adr 08d0, data 0x00063073 */ + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, PATH1_CTL1, 2, 0x00063073, 4); + + /* set AVC maximum threshold, adr 08d4, dat ffff0024 */ + status = cx231xx_read_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, PATH1_VOL_CTL, 2, &dwval, 4); + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, PATH1_VOL_CTL, 2, + (dwval | FLD_PATH1_AVC_THRESHOLD), 4); + + /* set SC maximum threshold, adr 08ec, dat ffffb3a3 */ + status = cx231xx_read_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, PATH1_SC_CTL, 2, &dwval, 4); + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, PATH1_SC_CTL, 2, + (dwval | FLD_PATH1_SC_THRESHOLD), 4); + break; + + case AUDIO_INPUT_TUNER_TV: + default: + + /* Setup SRC sources and clocks */ + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, BAND_OUT_SEL, 2, + cx231xx_set_field(FLD_SRC6_IN_SEL, 0x00)| + cx231xx_set_field(FLD_SRC6_CLK_SEL, 0x01)| + cx231xx_set_field(FLD_SRC5_IN_SEL, 0x00)| + cx231xx_set_field(FLD_SRC5_CLK_SEL, 0x02)| + cx231xx_set_field(FLD_SRC4_IN_SEL, 0x02)| + cx231xx_set_field(FLD_SRC4_CLK_SEL, 0x03)| + cx231xx_set_field(FLD_SRC3_IN_SEL, 0x00)| + cx231xx_set_field(FLD_SRC3_CLK_SEL, 0x00)| + cx231xx_set_field(FLD_BASEBAND_BYPASS_CTL, 0x00)| + cx231xx_set_field(FLD_AC97_SRC_SEL, 0x03)| + 
cx231xx_set_field(FLD_I2S_SRC_SEL, 0x00)| + cx231xx_set_field(FLD_PARALLEL2_SRC_SEL, 0x02)| + cx231xx_set_field(FLD_PARALLEL1_SRC_SEL, 0x01) , 4); + + /* Setup the AUD_IO control */ + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, AUD_IO_CTRL, 2, + cx231xx_set_field(FLD_I2S_PORT_DIR, 0x00)| + cx231xx_set_field(FLD_I2S_OUT_SRC, 0x00)| + cx231xx_set_field(FLD_AUD_CHAN3_SRC,0x00)| + cx231xx_set_field(FLD_AUD_CHAN2_SRC, 0x00)| + cx231xx_set_field(FLD_AUD_CHAN1_SRC,0x03 ), 4); + + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, PATH1_CTL1, 2, 0x1F063870, 4); + + /* setAudioStandard(_audio_standard); */ + + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, PATH1_CTL1, 2, 0x00063870, 4); + switch(dev->model) + { + case CX231XX_BOARD_CNXT_RDE_250: + case CX231XX_BOARD_CNXT_RDU_250: + status = cx231xx_read_modify_write_i2c_dword(dev, HAMMERHEAD_I2C_ADDRESS, + CHIP_CTRL, FLD_SIF_EN, + cx231xx_set_field(FLD_SIF_EN, 1)); + break; + default: + break; + } + break; + + case AUDIO_INPUT_TUNER_FM: + /* use SIF for FM radio + setupFM(); + setAudioStandard(_audio_standard); + */ + break; + + case AUDIO_INPUT_MUTE: + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, PATH1_CTL1, 2, 0x1F011012, 4); + break; + } + + /* Take it out of soft reset */ + status = cx231xx_read_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, GENERAL_CTL, 2, &gen_ctrl, 1); + gen_ctrl &= ~1; + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, GENERAL_CTL, 2, gen_ctrl, 1); + + return status; +} + + + +/* Set resolution of the video */ +int cx231xx_resolution_set(struct cx231xx *dev) +{ + int width, height; + u32 hscale, vscale; + int status = 0; + + width = dev->width; + height = dev->height; + + get_scale(dev,width, height,&hscale, &vscale); + + /* set horzontal scale */ + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, HSCALE_CTRL, 2, hscale, 4); + + /* set vertical scale */ + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, VSCALE_CTRL, 2, vscale, 4); + + return status; +} + +/************************************************************************************* + * C H I P Specific C O N T R O L functions * + *************************************************************************************/ +int cx231xx_init_ctrl_pin_status(struct cx231xx *dev) +{ + u32 value; + int status = 0; + + status = cx231xx_read_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, PIN_CTRL, 2, &value, 4); + value |=(~dev->board.ctl_pin_status_mask); + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, PIN_CTRL, 2, value, 4); + + return status; +} + +int cx231xx_set_agc_analog_digital_mux_select(struct cx231xx *dev, u8 analog_or_digital) +{ + int status = 0; + + /* first set the direction to output */ + status = cx231xx_set_gpio_direction(dev, dev->board.agc_analog_digital_select_gpio, 1); + + /* 0 - demod ; 1 - Analog mode */ + status = cx231xx_set_gpio_value(dev, dev->board.agc_analog_digital_select_gpio, + analog_or_digital); + + return status; +} + +int cx231xx_enable_i2c_for_tuner(struct cx231xx *dev, u8 I2CIndex) +{ + u8 value[4] ={0,0,0,0}; + int status = 0; + + cx231xx_info("Changing the i2c port for tuner to %d\n",I2CIndex); + + status = cx231xx_read_ctrl_reg(dev,VRT_GET_REGISTER, PWR_CTL_EN, value, 4); + if(status < 0) + return status; + + if(I2CIndex==I2C_1) { + if(value[0] & I2C_DEMOD_EN) { + value[0] &= ~I2C_DEMOD_EN; + status = cx231xx_write_ctrl_reg(dev,VRT_SET_REGISTER, PWR_CTL_EN,value,4); + } + } else { + if(!(value[0] & I2C_DEMOD_EN)) { + value[0] |= I2C_DEMOD_EN; + 
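+			/* write PWR_CTL_EN back so the new routing bit takes
+			 * effect; cx231xx_set_power_mode() below toggles the
+			 * same I2C_DEMOD_EN bit, which presumably selects
+			 * whether the tuner is reached through the demod-side
+			 * I2C master */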
status = cx231xx_write_ctrl_reg(dev,VRT_SET_REGISTER, PWR_CTL_EN,value,4); + } + } + + return status; + +} + + +/************************************************************************************* + * D I F - B L O C K C O N T R O L functions * + *************************************************************************************/ +int cx231xx_dif_configure_C2HH_for_low_IF(struct cx231xx *dev, u32 mode, + u32 function_mode, u32 standard) +{ + int status = 0; + + if(mode == V4L2_TUNER_RADIO) { + /* C2HH */ + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, + AFE_CTRL_C2HH_SRC_CTRL, 30, 31, 0x1); /* lo if big signal */ + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, + AFE_CTRL_C2HH_SRC_CTRL, 23, 24, function_mode); /* FUNC_MODE = DIF */ + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, + AFE_CTRL_C2HH_SRC_CTRL, 15, 22, 0xFF); /* IF_MODE */ + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, + AFE_CTRL_C2HH_SRC_CTRL, 9, 9, 0x1); /* no inv */ + } + else { + switch(standard) { + case V4L2_STD_NTSC_M: /* 75 IRE Setup */ + case V4L2_STD_NTSC_M_JP: /* Japan, 0 IRE Setup */ + case V4L2_STD_PAL_M: + case V4L2_STD_PAL_N: + case V4L2_STD_PAL_Nc: + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, + AFE_CTRL_C2HH_SRC_CTRL, 30, 31, 0x1); /* lo if big signal */ + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, + AFE_CTRL_C2HH_SRC_CTRL, 23, 24, function_mode); /* FUNC_MODE = DIF */ + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, + AFE_CTRL_C2HH_SRC_CTRL, 15, 22, 0xb); /* IF_MODE */ + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, + AFE_CTRL_C2HH_SRC_CTRL, 9, 9, 0x1); /* no inv */ + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, + AUD_IO_CTRL, 0, 31, 0x00000003); /* 0x124, AUD_CHAN1_SRC = 0x3 */ + break; + + case V4L2_STD_PAL_B: + case V4L2_STD_PAL_G: + /* C2HH setup */ + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, + AFE_CTRL_C2HH_SRC_CTRL, 30, 31, 0x1); /* lo if big signal */ + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, + AFE_CTRL_C2HH_SRC_CTRL, 23, 24, function_mode); /* FUNC_MODE = DIF */ + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, + AFE_CTRL_C2HH_SRC_CTRL, 15, 22, 0xE); /* IF_MODE */ + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, + AFE_CTRL_C2HH_SRC_CTRL, 9, 9, 0x1); /* no inv */ + break; + + case V4L2_STD_PAL_D: + case V4L2_STD_PAL_I: + case V4L2_STD_SECAM_L: + case V4L2_STD_SECAM_LC: + case V4L2_STD_SECAM_B: + case V4L2_STD_SECAM_D: + case V4L2_STD_SECAM_G: + case V4L2_STD_SECAM_K: + case V4L2_STD_SECAM_K1: + /* C2HH setup */ + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, + AFE_CTRL_C2HH_SRC_CTRL, 30, 31, 0x1); /* lo if big signal */ + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, + AFE_CTRL_C2HH_SRC_CTRL, 23, 24, function_mode); /* FUNC_MODE = DIF */ + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, + AFE_CTRL_C2HH_SRC_CTRL, 15, 22, 0xF); /* IF_MODE */ + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, + AFE_CTRL_C2HH_SRC_CTRL, 9, 9, 0x1); /* no inv */ + break; + + case DIF_USE_BASEBAND: + default: + /* do nothing to config C2HH for baseband */ + break; + } + } + + return status; +} + +int cx231xx_dif_set_standard(struct cx231xx *dev, u32 standard) +{ + int status = 0; + u32 dif_misc_ctrl_value = 0; + u32 func_mode = 0; + + cx231xx_info("%s: setStandard to %x\n",__func__,standard); + + 
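+	/* read the current DIF_MISC_CTRL up front: most of the
+	 * standard-specific branches below keep only the FLD_DIF_SPEC_INV
+	 * bit from this value before OR-ing in their own settings, and the
+	 * final value is written back at the end of this function */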
status = cx231xx_read_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, + DIF_MISC_CTRL, 2, &dif_misc_ctrl_value, 4); + if(standard != DIF_USE_BASEBAND ) + dev->norm = standard; + + switch (dev->model) { + case CX231XX_BOARD_CNXT_RDE_250: + case CX231XX_BOARD_CNXT_RDU_250: + func_mode=0x03; + break; + default: + func_mode=0x01; + } + + status = cx231xx_dif_configure_C2HH_for_low_IF(dev, dev->active_mode, func_mode, standard); + + + if(standard == DIF_USE_BASEBAND ) { /* base band */ + + /* There is a different SRC_PHASE_INC value for baseband vs. DIF */ + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, + DIF_SRC_PHASE_INC, 2, 0xDF7DF83, 4); + status = cx231xx_read_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, + DIF_MISC_CTRL, 2, &dif_misc_ctrl_value, 4); + dif_misc_ctrl_value |= FLD_DIF_DIF_BYPASS; + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, + DIF_MISC_CTRL, 2, dif_misc_ctrl_value, 4); + + } else if ( standard & (V4L2_STD_PAL_B | V4L2_STD_PAL_G) ) { + + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_PLL_CTRL, 0, 31, 0x6503bc0c); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_PLL_CTRL1, 0, 31, 0xbd038c85); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_PLL_CTRL2, 0, 31, 0x1db4640a); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_PLL_CTRL3, 0, 31, 0x00008800); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_AGC_IF_REF, 0, 31, 0x444C1380); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_AGC_CTRL_IF, 0, 31, 0xDA302600); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_AGC_CTRL_INT, 0, 31, 0xDA261700); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_AGC_CTRL_RF, 0, 31, 0xDA262600); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_AGC_IF_INT_CURRENT, 0, 31, 0x26001700); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_AGC_RF_CURRENT, 0, 31, 0x00002660); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_VIDEO_AGC_CTRL, 0, 31, 0x72500800); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_VID_AUD_OVERRIDE, 0, 31, 0x27000100); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_AV_SEP_CTRL, 0, 31, 0x3F3530EC); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_COMP_FLT_CTRL, 0, 31, 0x00A653A8); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_SRC_PHASE_INC, 0, 31, 0x1befbf06); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_SRC_GAIN_CONTROL, 0, 31, 0x000035e8); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_RPT_VARIANCE, 0, 31, 0x00000000); + /* Save the Spec Inversion value */ + dif_misc_ctrl_value &= FLD_DIF_SPEC_INV; + dif_misc_ctrl_value |=0x3a013F11; + + } else if( standard & V4L2_STD_PAL_D ) { + + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_PLL_CTRL, 0, 31, 0x6503bc0c); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_PLL_CTRL1, 0, 31, 0xbd038c85); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_PLL_CTRL2, 0, 31, 0x1db4640a); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_PLL_CTRL3, 0, 31, 0x00008800); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_AGC_IF_REF, 0, 31, 0x444C1380); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_AGC_CTRL_IF, 0, 31, 0xDA302600); + status = 
cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_AGC_CTRL_INT, 0, 31, 0xDA261700); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_AGC_CTRL_RF, 0, 31, 0xDA262600); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_AGC_IF_INT_CURRENT, 0, 31, 0x26001700); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_AGC_RF_CURRENT, 0, 31, 0x00002660); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_VIDEO_AGC_CTRL, 0, 31, 0x72500800); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_VID_AUD_OVERRIDE, 0, 31, 0x27000100); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_AV_SEP_CTRL, 0, 31, 0x3F3934EA); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_COMP_FLT_CTRL, 0, 31, 0x00000000); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_SRC_PHASE_INC, 0, 31, 0x1befbf06); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_SRC_GAIN_CONTROL, 0, 31, 0x000035e8); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_RPT_VARIANCE, 0, 31, 0x00000000); + /* Save the Spec Inversion value */ + dif_misc_ctrl_value &= FLD_DIF_SPEC_INV; + dif_misc_ctrl_value |=0x3a023F11; + + } else if( standard & V4L2_STD_PAL_I ) { + + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_PLL_CTRL, 0, 31, 0x6503bc0c); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_PLL_CTRL1, 0, 31, 0xbd038c85); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_PLL_CTRL2, 0, 31, 0x1db4640a); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_PLL_CTRL3, 0, 31, 0x00008800); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_AGC_IF_REF, 0, 31, 0x444C1380); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_AGC_CTRL_IF, 0, 31, 0xDA302600); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_AGC_CTRL_INT, 0, 31, 0xDA261700); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_AGC_CTRL_RF, 0, 31, 0xDA262600); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_AGC_IF_INT_CURRENT, 0, 31, 0x26001700); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_AGC_RF_CURRENT, 0, 31, 0x00002660); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_VIDEO_AGC_CTRL, 0, 31, 0x72500800); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_VID_AUD_OVERRIDE, 0, 31, 0x27000100); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_AV_SEP_CTRL, 0, 31, 0x5F39A934); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_COMP_FLT_CTRL, 0, 31, 0x00000000); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_SRC_PHASE_INC, 0, 31, 0x1befbf06); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_SRC_GAIN_CONTROL, 0, 31, 0x000035e8); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_RPT_VARIANCE, 0, 31, 0x00000000); + /* Save the Spec Inversion value */ + dif_misc_ctrl_value &= FLD_DIF_SPEC_INV; + dif_misc_ctrl_value |=0x3a033F11; + + } else if( standard & V4L2_STD_PAL_M ) { + + /* improved Low Frequency Phase Noise */ + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, + DIF_PLL_CTRL, 2, 0xFF01FF0C, 4); + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, + DIF_PLL_CTRL1, 2, 0xbd038c85, 4); + status = 
cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, + DIF_PLL_CTRL2, 2, 0x1db4640a, 4); + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, + DIF_PLL_CTRL3, 2, 0x00008800, 4); + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, + DIF_AGC_IF_REF, 2, 0x444C1380, 4); + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, + DIF_AGC_IF_INT_CURRENT, 2, 0x26001700, 4); + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, + DIF_AGC_RF_CURRENT, 2, 0x00002660, 4); + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, + DIF_VIDEO_AGC_CTRL, 2, 0x72500800, 4); + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, + DIF_VID_AUD_OVERRIDE, 2, 0x27000100, 4); + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, + DIF_AV_SEP_CTRL, 2, 0x012c405d, 4); + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, + DIF_COMP_FLT_CTRL, 2, 0x009f50c1, 4); + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, + DIF_SRC_PHASE_INC, 2, 0x1befbf06, 4); + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, + DIF_SRC_GAIN_CONTROL, 2, 0x000035e8, 4); + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, + DIF_SOFT_RST_CTRL_REVB, 2, 0x00000000, 4); + + /* Save the Spec Inversion value */ + dif_misc_ctrl_value &= FLD_DIF_SPEC_INV; + dif_misc_ctrl_value |= 0x3A0A3F10; + + } else if( standard & (V4L2_STD_PAL_N | V4L2_STD_PAL_Nc) ) { + + /* improved Low Frequency Phase Noise */ + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, DIF_PLL_CTRL, 2, 0xFF01FF0C, 4); + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, DIF_PLL_CTRL1, 2, 0xbd038c85, 4); + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, DIF_PLL_CTRL2, 2, 0x1db4640a, 4); + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, DIF_PLL_CTRL3, 2, 0x00008800, 4); + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, DIF_AGC_IF_REF, 2, 0x444C1380, 4); + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, DIF_AGC_IF_INT_CURRENT, 2, 0x26001700, 4); + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, DIF_AGC_RF_CURRENT, 2, 0x00002660, 4); + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, DIF_VIDEO_AGC_CTRL, 2, 0x72500800, 4); + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, DIF_VID_AUD_OVERRIDE, 2, 0x27000100, 4); + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, DIF_AV_SEP_CTRL, 2, 0x012c405d, 4); + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, DIF_COMP_FLT_CTRL, 2, 0x009f50c1, 4); + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, DIF_SRC_PHASE_INC, 2, 0x1befbf06, 4); + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, DIF_SRC_GAIN_CONTROL, 2, 0x000035e8, 4); + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, DIF_SOFT_RST_CTRL_REVB, 2, 0x00000000, 4); + + /* Save the Spec Inversion value */ + dif_misc_ctrl_value &= FLD_DIF_SPEC_INV; + dif_misc_ctrl_value = 0x3A093F10; + + } else if( standard & ( V4L2_STD_SECAM_B | V4L2_STD_SECAM_D | V4L2_STD_SECAM_G | + V4L2_STD_SECAM_K | V4L2_STD_SECAM_K1) ) { + + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_PLL_CTRL, 0, 31, 0x6503bc0c); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_PLL_CTRL1, 0, 31, 0xbd038c85); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_PLL_CTRL2, 0, 31, 0x1db4640a); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_PLL_CTRL3, 0, 31, 0x00008800); + status = 
cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_AGC_IF_REF, 0, 31, 0x888C0380); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_AGC_CTRL_IF, 0, 31, 0xe0262600); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_AGC_CTRL_INT, 0, 31, 0xc2171700); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_AGC_CTRL_RF, 0, 31, 0xc2262600); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_AGC_IF_INT_CURRENT, 0, 31, 0x26001700); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_AGC_RF_CURRENT, 0, 31, 0x00002660); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_VID_AUD_OVERRIDE, 0, 31, 0x27000100); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_AV_SEP_CTRL, 0, 31, 0x3F3530ec); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_COMP_FLT_CTRL, 0, 31, 0x00000000); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_SRC_PHASE_INC, 0, 31, 0x1befbf06); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_SRC_GAIN_CONTROL, 0, 31, 0x000035e8); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_RPT_VARIANCE, 0, 31, 0x00000000); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_VIDEO_AGC_CTRL, 0, 31, 0xf4000000); + + /* Save the Spec Inversion value */ + dif_misc_ctrl_value &= FLD_DIF_SPEC_INV; + dif_misc_ctrl_value |=0x3a023F11; + + } else if( standard & (V4L2_STD_SECAM_L | V4L2_STD_SECAM_LC) ) { + + /* Is it SECAM_L1? */ + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_PLL_CTRL, 0, 31, 0x6503bc0c); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_PLL_CTRL1, 0, 31, 0xbd038c85); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_PLL_CTRL2, 0, 31, 0x1db4640a); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_PLL_CTRL3, 0, 31, 0x00008800); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_AGC_IF_REF, 0, 31, 0x888C0380); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_AGC_CTRL_IF, 0, 31, 0xe0262600); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_AGC_CTRL_INT, 0, 31, 0xc2171700); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_AGC_CTRL_RF, 0, 31, 0xc2262600); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_AGC_IF_INT_CURRENT, 0, 31, 0x26001700); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_AGC_RF_CURRENT, 0, 31, 0x00002660); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_VID_AUD_OVERRIDE, 0, 31, 0x27000100); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_AV_SEP_CTRL, 0, 31, 0x3F3530ec); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_COMP_FLT_CTRL, 0, 31, 0x00000000); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_SRC_PHASE_INC, 0, 31, 0x1befbf06); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_SRC_GAIN_CONTROL, 0, 31, 0x000035e8); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_RPT_VARIANCE, 0, 31, 0x00000000); + status = cx231xx_reg_mask_write(dev, HAMMERHEAD_I2C_ADDRESS, 32, DIF_VIDEO_AGC_CTRL, 0, 31, 0xf2560000); + + /* Save the Spec Inversion value */ + dif_misc_ctrl_value &= FLD_DIF_SPEC_INV; + dif_misc_ctrl_value |=0x3a023F11; + + } else { /* V4L2_STD_NTSC_M (75 IRE Setup) Or 
V4L2_STD_NTSC_M_JP (Japan, 0 IRE Setup) */ + + /* For NTSC the centre frequency of video coming out of sidewinder is + around 7.1MHz or 3.6MHz depending on the spectral inversion. + so for a non spectrally inverted channel the pll freq word is 0x03420c49 + */ + + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, DIF_PLL_CTRL, 2, 0x6503BC0C, 4); + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, DIF_PLL_CTRL1, 2, 0xBD038C85, 4); + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, DIF_PLL_CTRL2, 2, 0x1DB4640A, 4); + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, DIF_PLL_CTRL3, 2, 0x00008800, 4); + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, DIF_AGC_IF_REF, 2, 0x444C0380, 4); + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, DIF_AGC_IF_INT_CURRENT, 2, 0x26001700, 4); + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, DIF_AGC_RF_CURRENT, 2, 0x00002660, 4); + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, DIF_VIDEO_AGC_CTRL, 2, 0x04000800, 4); + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, DIF_VID_AUD_OVERRIDE, 2, 0x27000100, 4); + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, DIF_AV_SEP_CTRL, 2, 0x01296e1f, 4); + + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, DIF_COMP_FLT_CTRL, 2, 0x009f50c1, 4); + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, DIF_SRC_PHASE_INC, 2, 0x1befbf06, 4); + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, DIF_SRC_GAIN_CONTROL, 2, 0x000035e8, 4); + + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, DIF_AGC_CTRL_IF, 2, 0xC2262600, 4); + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, DIF_AGC_CTRL_INT, 2, 0xC2262600, 4); + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, DIF_AGC_CTRL_RF, 2, 0xC2262600, 4); + + /* Save the Spec Inversion value */ + dif_misc_ctrl_value &= FLD_DIF_SPEC_INV; + dif_misc_ctrl_value |= 0x3a003F10; + + } + + /* The AGC values should be the same for all standards, + AUD_SRC_SEL[19] should always be disabled */ + dif_misc_ctrl_value &= ~FLD_DIF_AUD_SRC_SEL; + + /* It is still possible to get Set Standard calls even when we are in FM mode + This is done to override the value for FM. 
*/ + if (dev->active_mode == V4L2_TUNER_RADIO) + dif_misc_ctrl_value = 0x7a080000; + + /* Write the calculated value for misc ontrol register */ + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, DIF_MISC_CTRL, 2, dif_misc_ctrl_value, 4); + + return status; +} + +int cx231xx_tuner_pre_channel_change(struct cx231xx *dev) +{ + int status = 0; + u32 dwval; + + /* Set the RF and IF k_agc values to 3 */ + status = cx231xx_read_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, + DIF_AGC_IF_REF, 2, &dwval, 4); + dwval &= ~(FLD_DIF_K_AGC_RF | FLD_DIF_K_AGC_IF); + dwval |= 0x33000000; + + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, + DIF_AGC_IF_REF, 2, dwval, 4); + + return status; +} + +int cx231xx_tuner_post_channel_change(struct cx231xx *dev) +{ + int status = 0; + u32 dwval; + + /* Set the RF and IF k_agc values to 4 for PAL/NTSC and 8 for SECAM */ + status = cx231xx_read_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, + DIF_AGC_IF_REF, 2, &dwval, 4); + dwval &= ~(FLD_DIF_K_AGC_RF | FLD_DIF_K_AGC_IF); + + if(dev->norm & ( V4L2_STD_SECAM_L | V4L2_STD_SECAM_B | V4L2_STD_SECAM_D) ) { + dwval |= 0x88000000; + } else { + dwval |= 0x44000000; + } + + status = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, + DIF_AGC_IF_REF, 2, dwval, 4); + + return status; +} + + + +/************************************************************************************* + * F L A T I R O N - B L O C K C O N T R O L functions * + *************************************************************************************/ +int cx231xx_flatiron_initialize(struct cx231xx *dev) +{ + int status = 0; + u32 value; + + status = cx231xx_read_i2c_data(dev, Flatrion_DEVICE_ADDRESS, CH_PWR_CTRL1, 1, &value, 1); + /* enables clock to delta-sigma and decimation filter */ + value |= 0x80; + status = cx231xx_write_i2c_data(dev, Flatrion_DEVICE_ADDRESS, + CH_PWR_CTRL1, 1, value, 1); + /* power up all channel */ + status = cx231xx_write_i2c_data(dev, Flatrion_DEVICE_ADDRESS, + CH_PWR_CTRL2, 1, 0x00, 1); + + return status; +} + +int cx231xx_flatiron_update_power_control(struct cx231xx *dev, AV_MODE avmode) +{ + int status = 0; + u32 value=0; + + if(avmode!=POLARIS_AVMODE_ENXTERNAL_AV) { + status = cx231xx_read_i2c_data(dev, Flatrion_DEVICE_ADDRESS, CH_PWR_CTRL2, 1, &value, 1); + value |= 0xfe; + status = cx231xx_write_i2c_data(dev, Flatrion_DEVICE_ADDRESS, + CH_PWR_CTRL2, 1, value, 1); + } + else { + status = cx231xx_write_i2c_data(dev, Flatrion_DEVICE_ADDRESS, + CH_PWR_CTRL2, 1, 0x00, 1); + } + + return status; +} + +/* set flatiron for audio input types */ +int cx231xx_flatiron_set_audio_input(struct cx231xx *dev, u8 audio_input) +{ + int status = 0; + + switch(audio_input) { + case CX231XX_AMUX_LINE_IN: + + status = cx231xx_write_i2c_data(dev, Flatrion_DEVICE_ADDRESS, + CH_PWR_CTRL2, 1, 0x00, 1); + status = cx231xx_write_i2c_data(dev, Flatrion_DEVICE_ADDRESS, + CH_PWR_CTRL1, 1, 0x80, 1); + break; + case CX231XX_AMUX_VIDEO: + default: + break; + } + + dev->ctl_ainput = audio_input; + + return status; +} + +/************************************************************************************* + * P O W E R C O N T R O L functions * + *************************************************************************************/ +int cx231xx_set_power_mode(struct cx231xx *dev, AV_MODE mode) +{ + u8 value[4] ={0,0,0,0}; + u32 tmp = 0; + int status = 0; + + if(dev->power_mode != mode) + dev->power_mode = mode; + else { + cx231xx_info(" setPowerMode::mode = %d, No Change req.\n",mode); + return 0; + } + + cx231xx_info(" setPowerMode::mode = 
%d\n",mode); + + status = cx231xx_read_ctrl_reg(dev,VRT_GET_REGISTER, PWR_CTL_EN, value, 4); + if(status < 0) + return status; + + tmp = *((u32 *)value); + + switch(mode) { + case POLARIS_AVMODE_ENXTERNAL_AV: + + tmp &= (~PWR_MODE_MASK); + + tmp |= PWR_AV_EN; + value[0]=(u8)tmp; + value[1]=(u8)(tmp>>8); + value[2]=(u8)(tmp>>16); + value[3]=(u8)(tmp>>24); + status = cx231xx_write_ctrl_reg(dev,VRT_SET_REGISTER, PWR_CTL_EN,value,4); + msleep(PWR_SLEEP_INTERVAL); + + tmp |= PWR_ISO_EN; + value[0]=(u8)tmp; + value[1]=(u8)(tmp>>8); + value[2]=(u8)(tmp>>16); + value[3]=(u8)(tmp>>24); + status = cx231xx_write_ctrl_reg(dev,VRT_SET_REGISTER, PWR_CTL_EN,value,4); + msleep(PWR_SLEEP_INTERVAL); + + tmp |=POLARIS_AVMODE_ENXTERNAL_AV; + value[0]=(u8)tmp; + value[1]=(u8)(tmp>>8); + value[2]=(u8)(tmp>>16); + value[3]=(u8)(tmp>>24); + status = cx231xx_write_ctrl_reg(dev,VRT_SET_REGISTER, PWR_CTL_EN,value,4); + + dev->xc_fw_load_done = 0; /* reset state of xceive tuner */ + break; + + case POLARIS_AVMODE_ANALOGT_TV: + + tmp &= (~PWR_DEMOD_EN); + tmp |= (I2C_DEMOD_EN); + value[0]=(u8)tmp; + value[1]=(u8)(tmp>>8); + value[2]=(u8)(tmp>>16); + value[3]=(u8)(tmp>>24); + status = cx231xx_write_ctrl_reg(dev,VRT_SET_REGISTER, PWR_CTL_EN,value,4); + msleep(PWR_SLEEP_INTERVAL); + + if(!(tmp & PWR_TUNER_EN)) { + tmp |= (PWR_TUNER_EN); + value[0]=(u8)tmp; + value[1]=(u8)(tmp>>8); + value[2]=(u8)(tmp>>16); + value[3]=(u8)(tmp>>24); + status = cx231xx_write_ctrl_reg(dev,VRT_SET_REGISTER, PWR_CTL_EN,value,4); + msleep(PWR_SLEEP_INTERVAL); + } + + if(!(tmp & PWR_AV_EN)) { + tmp |= PWR_AV_EN; + value[0]=(u8)tmp; + value[1]=(u8)(tmp>>8); + value[2]=(u8)(tmp>>16); + value[3]=(u8)(tmp>>24); + status = cx231xx_write_ctrl_reg(dev,VRT_SET_REGISTER, PWR_CTL_EN,value,4); + msleep(PWR_SLEEP_INTERVAL); + } + if(!(tmp & PWR_ISO_EN )) { + tmp |= PWR_ISO_EN; + value[0]=(u8)tmp; + value[1]=(u8)(tmp>>8); + value[2]=(u8)(tmp>>16); + value[3]=(u8)(tmp>>24); + status = cx231xx_write_ctrl_reg(dev,VRT_SET_REGISTER, PWR_CTL_EN,value,4); + msleep(PWR_SLEEP_INTERVAL); + } + + if(!(tmp & POLARIS_AVMODE_ANALOGT_TV )) { + tmp |= POLARIS_AVMODE_ANALOGT_TV; + value[0]=(u8)tmp; + value[1]=(u8)(tmp>>8); + value[2]=(u8)(tmp>>16); + value[3]=(u8)(tmp>>24); + status = cx231xx_write_ctrl_reg(dev,VRT_SET_REGISTER, PWR_CTL_EN,value,4); + msleep(PWR_SLEEP_INTERVAL); + } + + if( (dev->model == CX231XX_BOARD_CNXT_RDE_250) || + (dev->model == CX231XX_BOARD_CNXT_RDU_250)) { + + /* tuner path to channel 1 from port 3 */ + cx231xx_enable_i2c_for_tuner(dev, I2C_3); + + if(dev->cx231xx_reset_analog_tuner) + dev->cx231xx_reset_analog_tuner(dev); + } + break; + + case POLARIS_AVMODE_DIGITAL: + + if(!(tmp & PWR_TUNER_EN)) { + tmp |= (PWR_TUNER_EN); + value[0]=(u8)tmp; + value[1]=(u8)(tmp>>8); + value[2]=(u8)(tmp>>16); + value[3]=(u8)(tmp>>24); + status = cx231xx_write_ctrl_reg(dev,VRT_SET_REGISTER, PWR_CTL_EN,value,4); + msleep(PWR_SLEEP_INTERVAL); + } + if(!(tmp & PWR_AV_EN)) { + tmp |= PWR_AV_EN; + value[0]=(u8)tmp; + value[1]=(u8)(tmp>>8); + value[2]=(u8)(tmp>>16); + value[3]=(u8)(tmp>>24); + status = cx231xx_write_ctrl_reg(dev,VRT_SET_REGISTER, PWR_CTL_EN,value,4); + msleep(PWR_SLEEP_INTERVAL); + } + if(!(tmp & PWR_ISO_EN)) { + tmp |= PWR_ISO_EN; + value[0]=(u8)tmp; + value[1]=(u8)(tmp>>8); + value[2]=(u8)(tmp>>16); + value[3]=(u8)(tmp>>24); + status = cx231xx_write_ctrl_reg(dev,VRT_SET_REGISTER, PWR_CTL_EN,value,4); + msleep(PWR_SLEEP_INTERVAL); + } + + tmp |= POLARIS_AVMODE_DIGITAL|I2C_DEMOD_EN; + value[0]=(u8)tmp; + value[1]=(u8)(tmp>>8); + value[2]=(u8)(tmp>>16); 
+ value[3]=(u8)(tmp>>24); + status = cx231xx_write_ctrl_reg(dev,VRT_SET_REGISTER, PWR_CTL_EN,value,4); + msleep(PWR_SLEEP_INTERVAL); + + if(!(tmp & PWR_DEMOD_EN)) { + tmp |= PWR_DEMOD_EN; + value[0]=(u8)tmp; + value[1]=(u8)(tmp>>8); + value[2]=(u8)(tmp>>16); + value[3]=(u8)(tmp>>24); + status = cx231xx_write_ctrl_reg(dev,VRT_SET_REGISTER, PWR_CTL_EN,value,4); + msleep(PWR_SLEEP_INTERVAL); + } + + if( (dev->model == CX231XX_BOARD_CNXT_RDE_250) || + (dev->model == CX231XX_BOARD_CNXT_RDU_250)) { + + /* tuner path to channel 1 from port 3 */ + cx231xx_enable_i2c_for_tuner(dev, I2C_3); + + if(dev->cx231xx_reset_analog_tuner) + dev->cx231xx_reset_analog_tuner(dev); + } + break; + + default: + break; + } + + msleep(PWR_SLEEP_INTERVAL); + + /* For power saving, only enable Pwr_resetout_n when digital TV is selected. */ + if(mode == POLARIS_AVMODE_DIGITAL) { + tmp |= PWR_RESETOUT_EN; + value[0]=(u8)tmp; + value[1]=(u8)(tmp>>8); + value[2]=(u8)(tmp>>16); + value[3]=(u8)(tmp>>24); + status = cx231xx_write_ctrl_reg(dev,VRT_SET_REGISTER, PWR_CTL_EN,value,4); + msleep(PWR_SLEEP_INTERVAL); + } + + /* update power control for colibri */ + status = cx231xx_colibri_update_power_control(dev, mode); + + /* update power control for flatiron */ + status = cx231xx_flatiron_update_power_control(dev, mode); + + status = cx231xx_read_ctrl_reg(dev,VRT_GET_REGISTER, PWR_CTL_EN, value, 4); + cx231xx_info(" The data of PWR_CTL_EN register 0x74=0x%0x,0x%0x,0x%0x,0x%0x\n",value[0],value[1],value[2],value[3]); + + return status; +} + +int cx231xx_power_suspend(struct cx231xx *dev) +{ + u8 value[4] ={0,0,0,0}; + u32 tmp = 0; + int status = 0; + + status = cx231xx_read_ctrl_reg(dev,VRT_GET_REGISTER, PWR_CTL_EN, value, 4); + if(status > 0) + return status; + + tmp = *((u32 *)value); + tmp &= (~PWR_MODE_MASK); + + value[0]=(u8)tmp; + value[1]=(u8)(tmp>>8); + value[2]=(u8)(tmp>>16); + value[3]=(u8)(tmp>>24); + status = cx231xx_write_ctrl_reg(dev,VRT_SET_REGISTER, PWR_CTL_EN,value,4); + + return status; +} + + +/************************************************************************************* + * S T R E A M C O N T R O L functions * + *************************************************************************************/ +int cx231xx_start_stream(struct cx231xx *dev, u32 ep_mask) +{ + u8 value[4] = {0x0, 0x0, 0x0, 0x0}; + u32 tmp =0; + int status = 0; + + cx231xx_info("cx231xx_start_stream():: ep_mask = %x\n", ep_mask); + status = cx231xx_read_ctrl_reg(dev,VRT_GET_REGISTER, EP_MODE_SET,value,4); + if(status < 0) + return status; + + tmp = *((u32 *)value); + tmp |= ep_mask; + value[0]=(u8) tmp; + value[1]=(u8)(tmp>>8); + value[2]=(u8)(tmp>>16); + value[3]=(u8)(tmp>>24); + + status = cx231xx_write_ctrl_reg(dev,VRT_SET_REGISTER, EP_MODE_SET,value,4); + + return status; +} + +int cx231xx_stop_stream(struct cx231xx *dev, u32 ep_mask) +{ + u8 value[4] = {0x0, 0x0, 0x0, 0x0}; + u32 tmp =0; + int status = 0; + + cx231xx_info("cx231xx_stop_stream():: ep_mask = %x\n", ep_mask); + status = cx231xx_read_ctrl_reg(dev,VRT_GET_REGISTER, EP_MODE_SET,value,4); + if(status < 0) + return status; + + tmp = *((u32 *)value); + tmp&= (~ep_mask); + value[0]=(u8) tmp; + value[1]=(u8)(tmp>>8); + value[2]=(u8)(tmp>>16); + value[3]=(u8)(tmp>>24); + + status = cx231xx_write_ctrl_reg(dev,VRT_SET_REGISTER, EP_MODE_SET,value,4); + + return status; +} + +int cx231xx_initialize_stream_xfer(struct cx231xx *dev, u32 media_type) +{ + int status = 0; + + if(dev->udev->speed == USB_SPEED_HIGH) + { + switch(media_type) + { + case 81: /* audio */ + 
cx231xx_info("%s: Audio enter HANC\n",__func__); + status = cx231xx_mode_register(dev, TS_MODE_REG, 0x9300); + break; + + case 2: /* vbi */ + cx231xx_info("%s: set vanc registers\n",__func__); + status = cx231xx_mode_register(dev, TS_MODE_REG, 0x300); + break; + + case 3: /* sliced cc */ + cx231xx_info("%s: set hanc registers\n",__func__); + status = cx231xx_mode_register(dev, TS_MODE_REG, 0x1300); + break; + + case 0: /* video */ + cx231xx_info("%s: set video registers\n",__func__); + status = cx231xx_mode_register(dev, TS_MODE_REG, 0x100); + break; + + case 4: /* ts1 */ + cx231xx_info("%s: set ts1 registers\n",__func__); + status = cx231xx_mode_register(dev, TS_MODE_REG, 0x101); + status = cx231xx_mode_register(dev, TS1_CFG_REG, 0x400); + break; + case 6: /* ts1 parallel mode */ + cx231xx_info("%s: set ts1 parrallel mode registers\n",__func__); + status = cx231xx_mode_register(dev, TS_MODE_REG, 0x100); + status = cx231xx_mode_register(dev, TS1_CFG_REG, 0x400); + break; + } + } + else + { + status = cx231xx_mode_register(dev, TS_MODE_REG, 0x101); + } + + return status; +} + + + + +int cx231xx_capture_start(struct cx231xx *dev, int start, u8 media_type) +{ + int rc; + u32 ep_mask = -1; + PPCB_CONFIG pcb_config; + + /* get EP for media type */ + pcb_config = &dev->current_pcb_config; + + if(pcb_config->config_num==1) + { + switch (media_type) + { + case 0: /* Video */ + ep_mask =ENABLE_EP4; /* ep4 [00:1000] */ + break; + case 1: /* Audio */ + ep_mask =ENABLE_EP3; /* ep3 [00:0100] */ + break; + case 2: /* Vbi */ + ep_mask = ENABLE_EP5; /* ep5 [01:0000] */ + break; + case 3: /* Sliced_cc */ + ep_mask = ENABLE_EP6; /* ep6 [10:0000] */ + break; + case 4: /* ts1 */ + case 6: /* ts1 parallel mode */ + ep_mask = ENABLE_EP1; /* ep1 [00:0001] */ + break; + case 5: /* ts2 */ + ep_mask = ENABLE_EP2; /* ep2 [00:0010] */ + break; + } + + } + else if(pcb_config->config_num>1) + { + switch (media_type) + { + case 0: /* Video */ + ep_mask = ENABLE_EP4; /* ep4 [00:1000] */ + break; + case 1: /* Audio */ + ep_mask = ENABLE_EP3; /* ep3 [00:0100] */ + break; + case 2: /* Vbi */ + ep_mask = ENABLE_EP5; /* ep5 [01:0000] */ + break; + case 3: /* Sliced_cc */ + ep_mask = ENABLE_EP6; /* ep6 [10:0000] */ + break; + case 4: /* ts1 */ + case 6: /* ts1 parallel mode */ + ep_mask = ENABLE_EP1; /* ep1 [00:0001] */ + break; + case 5: /* ts2 */ + ep_mask = ENABLE_EP2; /* ep2 [00:0010] */ + break; + } + + } + + if(start) { + rc = cx231xx_initialize_stream_xfer(dev, media_type); + + if(rc < 0) { + return rc; + } + + /* enable video capture */ + if(ep_mask > 0 ) + rc = cx231xx_start_stream(dev, ep_mask); + } + else { + /* disable video capture */ + if(ep_mask > 0 ) + rc = cx231xx_stop_stream(dev, ep_mask); + } + + if (dev->mode == CX231XX_ANALOG_MODE){ + /* do any in Analog mode */ + } + else { + /* do any in digital mode */ + } + + return rc; +} +EXPORT_SYMBOL_GPL(cx231xx_capture_start); + + +/************************************************************************************ +* G P I O B I T control functions * +*************************************************************************************/ +int cx231xx_set_gpio_bit(struct cx231xx *dev, u32 gpio_bit, u8* gpio_val) +{ + int status = 0; + + status = cx231xx_send_gpio_cmd(dev, gpio_bit, gpio_val, 4, 0, 0); + + return status; +} + +int cx231xx_get_gpio_bit(struct cx231xx *dev, u32 gpio_bit, u8* gpio_val) +{ + int status = 0; + + status = cx231xx_send_gpio_cmd(dev, gpio_bit, gpio_val, 4, 0, 1); + + return status; +} + +/* +* cx231xx_set_gpio_direction +* Sets the 
direction of the GPIO pin to input or output +* +* Parameters : +* pin_number : The GPIO Pin number to program the direction for +* from 0 to 31 +* pin_value : The Direction of the GPIO Pin under reference. +* 0 = Input direction +* 1 = Output direction +*/ +int cx231xx_set_gpio_direction(struct cx231xx *dev, + int pin_number, + int pin_value) +{ + int status = 0; + u32 value = 0; + + /* Check for valid pin_number - if 32 , bail out */ + if (pin_number >= 32) { + return -EINVAL; + } + + if (pin_value == 0) { /* input */ + value = dev->gpio_dir &(~(1<gpio_dir | (1<gpio_val); + + /* cache the value for future */ + dev->gpio_dir = value; + + return status; +} + + +/* +* SetGpioPinLogicValue +* Sets the value of the GPIO pin to Logic high or low. The Pin under +* reference should ALREADY BE SET IN OUTPUT MODE !!!!!!!!! +* +* Parameters : +* pin_number : The GPIO Pin number to program the direction for +* pin_value : The value of the GPIO Pin under reference. +* 0 = set it to 0 +* 1 = set it to 1 +*/ +int cx231xx_set_gpio_value(struct cx231xx *dev, + int pin_number, + int pin_value) +{ + int status = 0; + u32 value = 0; + + /* Check for valid pin_number - if 0xFF , bail out */ + if (pin_number >= 32) + return -EINVAL; + + /* first do a sanity check - if the Pin is not output, make it output */ + if ((dev->gpio_dir & (1<gpio_dir | (1<gpio_dir = value; + status = cx231xx_set_gpio_bit(dev, dev->gpio_dir, (u8*) &dev->gpio_val); + value = 0; + } + + if (pin_value == 0) { + value = dev->gpio_val & (~(1<gpio_val | (1<gpio_val=value; + + /* toggle bit0 of GP_IO */ + status = cx231xx_set_gpio_bit(dev, dev->gpio_dir, (u8*) &dev->gpio_val); + + return status; +} + + +/************************************************************************************ +* G P I O I2C related functions * +*************************************************************************************/ +int cx231xx_gpio_i2c_start(struct cx231xx *dev) +{ + int status = 0; + + /* set SCL to output 1 ; set SDA to output 1 */ + dev->gpio_dir |= 1<< dev->board.tuner_scl_gpio; + dev->gpio_dir |= 1<board.tuner_sda_gpio; + dev->gpio_val |= 1<board.tuner_scl_gpio; + dev->gpio_val |= 1<board.tuner_sda_gpio; + + status = cx231xx_set_gpio_bit(dev, dev->gpio_dir, (u8*) &dev->gpio_val); + if(status < 0){ + return -EINVAL; + } + + /* set SCL to output 1; set SDA to output 0 */ + dev->gpio_val |= 1<board.tuner_scl_gpio; + dev->gpio_val &= ~(1<board.tuner_sda_gpio); + + status = cx231xx_set_gpio_bit(dev, dev->gpio_dir, (u8*) &dev->gpio_val); + if(status < 0){ + return -EINVAL; + } + + /* set SCL to output 0; set SDA to output 0 */ + dev->gpio_val &= ~(1<board.tuner_scl_gpio); + dev->gpio_val &= ~(1<board.tuner_sda_gpio); + + status = cx231xx_set_gpio_bit(dev, dev->gpio_dir, (u8*) &dev->gpio_val); + if(status < 0){ + return -EINVAL; + } + + return status; +} + + +int cx231xx_gpio_i2c_end(struct cx231xx *dev) +{ + int status = 0; + + /* set SCL to output 0; set SDA to output 0 */ + dev->gpio_dir |= 1<board.tuner_scl_gpio; + dev->gpio_dir |= 1<board.tuner_sda_gpio; + + dev->gpio_val &= ~(1<board.tuner_scl_gpio); + dev->gpio_val &= ~(1<board.tuner_sda_gpio); + + status = cx231xx_set_gpio_bit(dev, dev->gpio_dir, (u8*) &dev->gpio_val); + if(status < 0){ + return -EINVAL; + } + + /* set SCL to output 1; set SDA to output 0 */ + dev->gpio_val |= 1<board.tuner_scl_gpio; + dev->gpio_val &= ~(1<board.tuner_sda_gpio); + + status = cx231xx_set_gpio_bit(dev, dev->gpio_dir, (u8*) &dev->gpio_val); + if(status < 0){ + return -EINVAL; + } + + /* set SCL to input 
,release SCL cable control + set SDA to input ,release SDA cable control */ + dev->gpio_dir &= ~(1<board.tuner_scl_gpio); + dev->gpio_dir &= ~(1<board.tuner_sda_gpio); + + status = cx231xx_set_gpio_bit(dev, dev->gpio_dir, (u8*) &dev->gpio_val); + if(status < 0){ + return -EINVAL; + } + return status; +} + + +int cx231xx_gpio_i2c_write_byte(struct cx231xx *dev, u8 data) +{ + int status = 0; + u8 i; + + /* set SCL to output ; set SDA to output */ + dev->gpio_dir |= 1<board.tuner_scl_gpio; + dev->gpio_dir |= 1<board.tuner_sda_gpio; + + for(i = 0;i<8;i++) { + if(((data<gpio_val &= ~(1<board.tuner_scl_gpio); + dev->gpio_val &= ~(1<board.tuner_sda_gpio); + status = cx231xx_set_gpio_bit(dev, dev->gpio_dir, (u8*) &dev->gpio_val); + + /* set SCL to output 1; set SDA to output 0 */ + dev->gpio_val |= 1<board.tuner_scl_gpio; + status = cx231xx_set_gpio_bit(dev, dev->gpio_dir, (u8*) &dev->gpio_val); + + /* set SCL to output 0; set SDA to output 0 */ + dev->gpio_val &= ~(1<board.tuner_scl_gpio); + status = cx231xx_set_gpio_bit(dev, dev->gpio_dir, (u8*) &dev->gpio_val); + } else { + /* set SCL to output 0; set SDA to output 1 */ + dev->gpio_val &= ~(1<board.tuner_scl_gpio); + dev->gpio_val |= 1<board.tuner_sda_gpio; + status = cx231xx_set_gpio_bit(dev, dev->gpio_dir, (u8*) &dev->gpio_val); + + /* set SCL to output 1; set SDA to output 1 */ + dev->gpio_val |= 1<board.tuner_scl_gpio; + status = cx231xx_set_gpio_bit(dev, dev->gpio_dir, (u8*) &dev->gpio_val); + + /* set SCL to output 0; set SDA to output 1 */ + dev->gpio_val &= ~(1<board.tuner_scl_gpio); + status = cx231xx_set_gpio_bit(dev, dev->gpio_dir, (u8*) &dev->gpio_val); + } + } + return status; +} + +int cx231xx_gpio_i2c_read_byte(struct cx231xx *dev, u8 *buf) +{ + u8 value = 0; + int status = 0; + u32 gpio_logic_value =0; + u8 i; + + /* read byte */ + for(i=0;i<8;i++) { /* send write I2c addr */ + + /* set SCL to output 0; set SDA to input */ + dev->gpio_val &= ~(1<board.tuner_scl_gpio); + status = cx231xx_set_gpio_bit(dev, dev->gpio_dir, (u8*) &dev->gpio_val); + + /* set SCL to output 1; set SDA to input */ + dev->gpio_val |= 1<board.tuner_scl_gpio; + status = cx231xx_set_gpio_bit(dev, dev->gpio_dir, (u8*) &dev->gpio_val); + + /* get SDA data bit */ + gpio_logic_value = dev->gpio_val; + status = cx231xx_get_gpio_bit(dev, dev->gpio_dir, (u8*) &dev->gpio_val); + if((dev->gpio_val & (1<board.tuner_sda_gpio)) != 0) { + value |= (1<<(8-i-1)); + } + + dev->gpio_val = gpio_logic_value; + } + + /* set SCL to output 0,finish the read latest SCL signal. + !!!set SDA to input,never to modify SDA direction at the same times */ + dev->gpio_val &= ~(1<board.tuner_scl_gpio); + status = cx231xx_set_gpio_bit(dev, dev->gpio_dir, (u8*) &dev->gpio_val); + + /* store the value */ + *buf = value & 0xff; + + return status; +} + +int cx231xx_gpio_i2c_read_ack(struct cx231xx *dev) +{ + int status = 0; + u32 gpio_logic_value = 0; + int nCnt=10; + int nInit=nCnt; + + /* clock stretch; set SCL to input; set SDA to input; get SCL value till SCL = 1 */ + dev->gpio_dir &= ~(1<board.tuner_sda_gpio); + dev->gpio_dir &= ~(1<board.tuner_scl_gpio); + + gpio_logic_value = dev->gpio_val; + status = cx231xx_set_gpio_bit(dev, dev->gpio_dir, (u8*) &dev->gpio_val); + + do{ + msleep(2); + status = cx231xx_get_gpio_bit(dev, dev->gpio_dir, (u8*) &dev->gpio_val); + nCnt--; + }while(((dev->gpio_val & (1<board.tuner_scl_gpio)) == 0) && (nCnt>0)); + + if(nCnt==0) { + cx231xx_info("No ACK after %d msec for clock stretch. 
GPIO I2C operation failed!",nInit*10); + } + + /* readAck + throuth clock stretch ,slave has given a SCL signal,so the SDA data can be directly read. */ + status = cx231xx_get_gpio_bit(dev, dev->gpio_dir, (u8*) &dev->gpio_val); + + if((dev->gpio_val & 1<< dev->board.tuner_sda_gpio) == 0){ + dev->gpio_val = gpio_logic_value; + dev->gpio_val &= ~(1<< dev->board.tuner_sda_gpio); + status = 0; + } else { + dev->gpio_val = gpio_logic_value; + dev->gpio_val |= (1<< dev->board.tuner_sda_gpio); + } + + /* read SDA end, set the SCL to output 0, after this operation, SDA direction can be changed. */ + dev->gpio_val = gpio_logic_value; + dev->gpio_dir |= (1<board.tuner_scl_gpio); + dev->gpio_val &= ~(1<board.tuner_scl_gpio); + status = cx231xx_set_gpio_bit(dev, dev->gpio_dir, (u8*) &dev->gpio_val); + + return status; +} + + +int cx231xx_gpio_i2c_write_ack(struct cx231xx *dev) +{ + int status = 0; + + /* set SDA to ouput */ + dev->gpio_dir |= 1<board.tuner_sda_gpio; + status = cx231xx_set_gpio_bit(dev, dev->gpio_dir, (u8*) &dev->gpio_val); + + /* set SCL = 0 (output); set SDA = 0 (output) */ + dev->gpio_val &= ~(1<board.tuner_sda_gpio); + dev->gpio_val &= ~(1<board.tuner_scl_gpio); + status = cx231xx_set_gpio_bit(dev, dev->gpio_dir, (u8*) &dev->gpio_val); + + /* set SCL = 1 (output); set SDA = 0 (output) */ + dev->gpio_val |= 1<board.tuner_scl_gpio; + status = cx231xx_set_gpio_bit(dev, dev->gpio_dir, (u8*) &dev->gpio_val); + + /* set SCL = 0 (output); set SDA = 0 (output) */ + dev->gpio_val &= ~(1<board.tuner_scl_gpio); + status = cx231xx_set_gpio_bit(dev, dev->gpio_dir, (u8*) &dev->gpio_val); + + /* set SDA to input,and then the slave will read data from SDA. */ + dev->gpio_dir &= ~(1<board.tuner_sda_gpio); + status = cx231xx_set_gpio_bit(dev, dev->gpio_dir, (u8*) &dev->gpio_val); + + return status; +} + +int cx231xx_gpio_i2c_write_nak(struct cx231xx *dev) +{ + int status = 0; + + /* set scl to output ; set sda to input */ + dev->gpio_dir |= 1<board.tuner_scl_gpio; + dev->gpio_dir &= ~(1<board.tuner_sda_gpio); + status = cx231xx_set_gpio_bit(dev, dev->gpio_dir, (u8*) &dev->gpio_val); + + /* set scl to output 0; set sda to input */ + dev->gpio_val &= ~(1<board.tuner_scl_gpio); + status = cx231xx_set_gpio_bit(dev, dev->gpio_dir, (u8*) &dev->gpio_val); + + /* set scl to output 1; set sda to input */ + dev->gpio_val |= 1<board.tuner_scl_gpio; + status = cx231xx_set_gpio_bit(dev, dev->gpio_dir, (u8*) &dev->gpio_val); + + return status; +} + + + +/************************************************************************************ +* G P I O I2C related functions * +*************************************************************************************/ +/* cx231xx_gpio_i2c_read + * Function to read data from gpio based I2C interface + */ +int cx231xx_gpio_i2c_read(struct cx231xx *dev, u8 dev_addr, u8 *buf ,u8 len) +{ + int status = 0; + int i = 0; + + /* get the lock */ + mutex_lock(&dev->gpio_i2c_lock); + + /* start */ + status = cx231xx_gpio_i2c_start(dev); + + /* write dev_addr */ + status = cx231xx_gpio_i2c_write_byte(dev, (dev_addr << 1) +1); + + /* readAck */ + status = cx231xx_gpio_i2c_read_ack(dev); + + /* read data */ + for(i = 0; i < len; i++ ) { + /* read data */ + buf[i] = 0; + status = cx231xx_gpio_i2c_read_byte(dev, & buf[i]); + + if( (i+1) != len) { + /* only do write ack if we more length */ + status = cx231xx_gpio_i2c_write_ack(dev); + } + } + + /* write NAK - inform reads are complete */ + status = cx231xx_gpio_i2c_write_nak(dev); + + /* write end */ + status = cx231xx_gpio_i2c_end(dev); + 
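	/*
	 * Editorial note, not part of the original submission: at this point
	 * the bit-banged read transaction is complete.  The calls above
	 * implement the standard GPIO I2C read sequence: START, the device
	 * address shifted left with the read bit set ((dev_addr << 1) + 1),
	 * an ACK check, then "len" data bytes where the master ACKs every
	 * byte except the last, which is answered with a NAK before
	 * cx231xx_gpio_i2c_end() raises the STOP condition.
	 */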
+ /* release the lock */ + mutex_unlock(&dev->gpio_i2c_lock); + + return status; +} + + +/* cx231xx_gpio_i2c_write + * Function to write data to gpio based I2C interface + */ +int cx231xx_gpio_i2c_write(struct cx231xx *dev, u8 dev_addr, u8 *buf ,u8 len) +{ + int status = 0; + int i=0; + + /* get the lock */ + mutex_lock(&dev->gpio_i2c_lock); + + /* start */ + status = cx231xx_gpio_i2c_start(dev); + + /* write dev_addr */ + status = cx231xx_gpio_i2c_write_byte(dev, dev_addr << 1); + + /* read Ack */ + status = cx231xx_gpio_i2c_read_ack(dev); + + for(i = 0; i < len; i++ ) { + /* Write data */ + status = cx231xx_gpio_i2c_write_byte(dev, buf[i]); + + /* read Ack */ + status = cx231xx_gpio_i2c_read_ack(dev); + } + + /* write End */ + status = cx231xx_gpio_i2c_end(dev); + + /* release the lock */ + mutex_unlock(&dev->gpio_i2c_lock); + + return 0; +} + diff --git a/drivers/media/video/cx231xx/cx231xx-cards.c b/drivers/media/video/cx231xx/cx231xx-cards.c new file mode 100644 index 000000000000..c567e5a9eec8 --- /dev/null +++ b/drivers/media/video/cx231xx/cx231xx-cards.c @@ -0,0 +1,935 @@ +/* + cx231xx-cards.c - driver for Conexant Cx23100/101/102 USB video capture devices + + Copyright (C) 2008 + Based on em28xx driver + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "xc5000.h" + +#include "cx231xx.h" + +static int tuner = -1; +module_param(tuner, int, 0444); +MODULE_PARM_DESC(tuner, "tuner type"); + +static unsigned int disable_ir; +module_param(disable_ir, int, 0444); +MODULE_PARM_DESC(disable_ir, "disable infrared remote support"); + +/* Bitmask marking allocated devices from 0 to CX231XX_MAXBOARDS */ +static unsigned long cx231xx_devused; + +/* + * Reset sequences for analog/digital modes + */ + +static struct cx231xx_reg_seq RDE250_XCV_TUNER[] = { + { 0x03, 0x01, 10 }, + { 0x03, 0x00, 30 }, + { 0x03, 0x01, 10 }, + { -1, -1, -1 }, +}; + + + +/* + * Board definitions + */ +struct cx231xx_board cx231xx_boards[] = { + + [CX231XX_BOARD_UNKNOWN] = { + .name = "Unknown CX231xx video grabber", + .tuner_type = TUNER_ABSENT, + .input = { { + .type = CX231XX_VMUX_TELEVISION, + .vmux = CX231XX_VIN_3_1, + .amux = CX231XX_AMUX_VIDEO, + .gpio = 0, + }, { + .type = CX231XX_VMUX_COMPOSITE1, + .vmux = CX231XX_VIN_2_1, + .amux = CX231XX_AMUX_LINE_IN, + .gpio = 0, + }, { + .type = CX231XX_VMUX_SVIDEO, + .vmux = CX231XX_VIN_1_1 | (CX231XX_VIN_1_2 << 8 ) | + CX25840_SVIDEO_ON, + .amux = CX231XX_AMUX_LINE_IN, + .gpio = 0, + } }, + }, + + [CX231XX_BOARD_CNXT_RDE_250] = { + .name = "Conexant Hybrid TV - RDE250", + .valid = CX231XX_BOARD_VALIDATED, + .tuner_type = TUNER_XC5000, + .tuner_addr = 0x61, + .tuner_gpio = RDE250_XCV_TUNER, + .tuner_sif_gpio = 0x05, + .tuner_scl_gpio = 0x1a, + .tuner_sda_gpio = 0x1b, + .decoder = CX231XX_AVDECODER, + .demod_xfer_mode = 0, + .ctl_pin_status_mask = 0xFFFFFFC4, + .agc_analog_digital_select_gpio = 0x0c, + .gpio_pin_status_mask = 0x4001000, + .tuner_i2c_master = 1, + .demod_i2c_master = 2, + .has_dvb = 1, + .demod_addr = 0x02, + .norm = V4L2_STD_PAL, + + .input = { { + .type = CX231XX_VMUX_TELEVISION, + .vmux = CX231XX_VIN_3_1, + .amux = CX231XX_AMUX_VIDEO, + .gpio = 0, + }, { + .type = CX231XX_VMUX_COMPOSITE1, + .vmux = CX231XX_VIN_2_1, + .amux = CX231XX_AMUX_LINE_IN, + .gpio = 0, + }, { + .type = CX231XX_VMUX_SVIDEO, + .vmux = CX231XX_VIN_1_1 | (CX231XX_VIN_1_2 << 8 ) | + CX25840_SVIDEO_ON, + .amux = CX231XX_AMUX_LINE_IN, + .gpio = 0, + } }, + }, + + [CX231XX_BOARD_CNXT_RDU_250] = { + .name = "Conexant Hybrid TV - RDU250", + .valid = CX231XX_BOARD_VALIDATED, + .tuner_type = TUNER_XC5000, + .tuner_addr = 0x61, + .tuner_gpio = RDE250_XCV_TUNER, + .tuner_sif_gpio = 0x05, + .tuner_scl_gpio = 0x1a, + .tuner_sda_gpio = 0x1b, + .decoder = CX231XX_AVDECODER, + .demod_xfer_mode = 0, + .ctl_pin_status_mask = 0xFFFFFFC4, + .agc_analog_digital_select_gpio = 0x0c, + .gpio_pin_status_mask = 0x4001000, + .tuner_i2c_master = 1, + .demod_i2c_master = 2, + .has_dvb = 1, + .demod_addr = 0x32, + .norm = V4L2_STD_NTSC, + + .input = { { + .type = CX231XX_VMUX_TELEVISION, + .vmux = CX231XX_VIN_3_1, + .amux = CX231XX_AMUX_VIDEO, + .gpio = 0, + }, { + .type = CX231XX_VMUX_COMPOSITE1, + .vmux = CX231XX_VIN_2_1, + .amux = CX231XX_AMUX_LINE_IN, + .gpio = 0, + }, { + .type = CX231XX_VMUX_SVIDEO, + .vmux = CX231XX_VIN_1_1 | (CX231XX_VIN_1_2 << 8 ) | + CX25840_SVIDEO_ON, + .amux = CX231XX_AMUX_LINE_IN, + .gpio = 0, + } }, + }, +}; +const unsigned int cx231xx_bcount = ARRAY_SIZE(cx231xx_boards); + +/* table of devices that work with this driver */ +struct usb_device_id cx231xx_id_table [] = { + { USB_DEVICE(0x0572, 0x58A0), + .driver_info = CX231XX_BOARD_UNKNOWN }, + { USB_DEVICE(0x0572, 0x58A2), + .driver_info = CX231XX_BOARD_CNXT_RDE_250 }, + { 
USB_DEVICE(0x0572, 0x5A3C), + .driver_info = CX231XX_BOARD_CNXT_RDU_250 }, + { }, +}; +MODULE_DEVICE_TABLE(usb, cx231xx_id_table); + +/* cx231xx_tuner_callback + * will be used to reset XC5000 tuner using GPIO pin + */ + +int cx231xx_tuner_callback(void *ptr, int component, int command, int arg) +{ + int rc = 0; + struct cx231xx *dev = ptr; + + if (dev->tuner_type == TUNER_XC5000) { + if (command == XC5000_TUNER_RESET) { + cx231xx_info("Tuner Call back : RESET : command %d : tuner type %d \n", + command, dev->tuner_type); + + cx231xx_set_gpio_value(dev, dev->board.tuner_gpio->bit,1); + msleep(10); + cx231xx_set_gpio_value(dev, dev->board.tuner_gpio->bit,0); + msleep(330); + cx231xx_set_gpio_value(dev, dev->board.tuner_gpio->bit,1); + msleep(10); + } + } + return rc; +} +EXPORT_SYMBOL_GPL(cx231xx_tuner_callback); + +static void inline cx231xx_set_model(struct cx231xx *dev) +{ + memcpy(&dev->board, &cx231xx_boards[dev->model], sizeof(dev->board)); +} + +/* Since cx231xx_pre_card_setup() requires a proper dev->model, + * this won't work for boards with generic PCI IDs + */ +void cx231xx_pre_card_setup(struct cx231xx *dev) +{ + + cx231xx_set_model(dev); + + cx231xx_info("Identified as %s (card=%d)\n", + dev->board.name, dev->model); + + /* Do card specific if any */ + switch (dev->model) { + case CX231XX_BOARD_CNXT_RDE_250: + /* do card specific GPIO settings if required */ + cx231xx_info("Precard: Board is Conexnat RDE 250\n"); + /* set the direction for GPIO pins */ + cx231xx_set_gpio_direction(dev, dev->board.tuner_gpio->bit,1); + cx231xx_set_gpio_value(dev, dev->board.tuner_gpio->bit,1); + cx231xx_set_gpio_direction(dev, dev->board.tuner_sif_gpio,1); + break; + case CX231XX_BOARD_CNXT_RDU_250: + /* do card specific GPIO settings if required */ + cx231xx_info("Precard: Board is Conexnat RDU 250\n"); + /* set the direction for GPIO pins */ + cx231xx_set_gpio_direction(dev, dev->board.tuner_gpio->bit,1); + cx231xx_set_gpio_value(dev, dev->board.tuner_gpio->bit,1); + cx231xx_set_gpio_direction(dev, dev->board.tuner_sif_gpio,1); + break; + } + + /* request some modules if any required */ + + /* reset the Tuner */ + cx231xx_gpio_set(dev, dev->board.tuner_gpio); + + /* set the mode to Analog mode initially */ + cx231xx_set_mode(dev, CX231XX_ANALOG_MODE); + + /* Unlock device */ + /* cx231xx_set_mode(dev, CX231XX_SUSPEND); */ + +} + +#if 0 + +static void cx231xx_config_tuner(struct cx231xx *dev) +{ + struct tuner_setup tun_setup; + struct v4l2_frequency f; + + if (dev->tuner_type == TUNER_ABSENT) + return; + + tun_setup.mode_mask = T_ANALOG_TV | T_RADIO; + tun_setup.type = dev->tuner_type; + tun_setup.addr = dev->tuner_addr; + tun_setup.tuner_callback = cx231xx_tuner_callback; + + cx231xx_i2c_call_clients(&dev->i2c_bus[1], TUNER_SET_TYPE_ADDR, &tun_setup); +#if 0 + if (tun_setup.type == TUNER_XC5000) { + static struct xc2028_ctrl ctrl = { + .fname = XC5000_DEFAULT_FIRMWARE, + .max_len = 64, + .demod = 0; + }; + struct v4l2_priv_tun_config cfg = { + .tuner = dev->tuner_type, + .priv = &ctrl, + }; + cx231xx_i2c_call_clients(&dev->i2c_bus[1], TUNER_SET_CONFIG, &cfg); + } +#endif + + /* configure tuner */ + f.tuner = 0; + f.type = V4L2_TUNER_ANALOG_TV; + f.frequency = 9076; /* just a magic number */ + dev->ctl_freq = f.frequency; + cx231xx_i2c_call_clients(&dev->i2c_bus[1], VIDIOC_S_FREQUENCY, &f); +} + +#endif + +/* ----------------------------------------------------------------------- */ +void cx231xx_set_ir(struct cx231xx *dev, struct IR_i2c *ir) +{ + if (disable_ir) { + ir->get_key = NULL; + 
return ; + } + + /* detect & configure */ + switch (dev->model) { + + case CX231XX_BOARD_CNXT_RDE_250: + break; + case CX231XX_BOARD_CNXT_RDU_250: + break; + default: + break; + } +} + +void cx231xx_card_setup(struct cx231xx *dev) +{ + cx231xx_set_model(dev); + + dev->tuner_type = cx231xx_boards[dev->model].tuner_type; + if (cx231xx_boards[dev->model].tuner_addr) + dev->tuner_addr = cx231xx_boards[dev->model].tuner_addr; + + cx231xx_info(": tuner type %d, tuner address %d \n", + dev->tuner_type, dev->tuner_addr); + + /* Do card specific if any */ + switch (dev->model) { + case CX231XX_BOARD_CNXT_RDE_250: + /* do card specific GPIO settings if required */ + cx231xx_info("Board is Conexnat RDE 250\n"); + break; + case CX231XX_BOARD_CNXT_RDU_250: + /* do card specific GPIO settings if required */ + cx231xx_info("Board is Conexnat RDU 250\n"); + break; + } + + if (dev->board.valid == CX231XX_BOARD_NOT_VALIDATED) { + cx231xx_errdev("\n\n"); + cx231xx_errdev("The support for this board weren't " + "valid yet.\n"); + cx231xx_errdev("Please send a report of having this working\n"); + cx231xx_errdev("not to V4L mailing list (and/or to other " + "addresses)\n\n"); + } + + + /* request some modules */ + if (dev->board.decoder == CX231XX_AVDECODER) { + cx231xx_info(": Requesting cx25840 module\n"); + request_module("cx25840"); + } +#if 0 + if (dev->board.tuner_type != TUNER_ABSENT) { + cx231xx_info(": Requesting Tuner module\n"); + request_module("tuner"); + } + + cx231xx_config_tuner(dev); + + /* TBD IR will be added later */ + cx231xx_ir_init(dev); +#endif +} + + + +/* + * cx231xx_config() + * inits registers with sane defaults + */ +int cx231xx_config(struct cx231xx *dev) +{ + /* TBD need to add cx231xx specific code */ + dev->mute = 1; /* maybe not the right place... 
*/ + dev->volume = 0x1f; + + return 0; +} + +/* + * cx231xx_config_i2c() + * configure i2c attached devices + */ +void cx231xx_config_i2c(struct cx231xx *dev) +{ + struct v4l2_routing route; + + route.input = INPUT(dev->video_input)->vmux; + route.output = 0; + + cx231xx_i2c_call_clients(&dev->i2c_bus[0], VIDIOC_STREAMON, NULL); +} + +/* + * cx231xx_realease_resources() + * unregisters the v4l2,i2c and usb devices + * called when the device gets disconected or at module unload +*/ +void cx231xx_release_resources(struct cx231xx *dev) +{ + +#if 0 /* TBD IR related */ + if (dev->ir) + cx231xx_ir_fini(dev); +#endif + + cx231xx_release_analog_resources(dev); + + cx231xx_remove_from_devlist(dev); + + cx231xx_dev_uninit(dev); + + usb_put_dev(dev->udev); + + /* Mark device as unused */ + cx231xx_devused &= ~(1<devno); +} + + +/* + * cx231xx_init_dev() + * allocates and inits the device structs, registers i2c bus and v4l device + */ +static int cx231xx_init_dev(struct cx231xx **devhandle, struct usb_device *udev, + int minor) +{ + struct cx231xx *dev = *devhandle; + int retval = -ENOMEM; + int errCode; + unsigned int maxh, maxw; + + dev->udev = udev; + mutex_init(&dev->lock); + mutex_init(&dev->ctrl_urb_lock); + mutex_init(&dev->gpio_i2c_lock); + + spin_lock_init(&dev->video_mode.slock); + spin_lock_init(&dev->vbi_mode.slock); + spin_lock_init(&dev->sliced_cc_mode.slock); + + init_waitqueue_head(&dev->open); + init_waitqueue_head(&dev->wait_frame); + init_waitqueue_head(&dev->wait_stream); + + dev->cx231xx_read_ctrl_reg = cx231xx_read_ctrl_reg; + dev->cx231xx_write_ctrl_reg = cx231xx_write_ctrl_reg; + dev->cx231xx_send_usb_command = cx231xx_send_usb_command; + dev->cx231xx_gpio_i2c_read = cx231xx_gpio_i2c_read; + dev->cx231xx_gpio_i2c_write = cx231xx_gpio_i2c_write; + + /* Query cx231xx to find what pcb config it is related to */ + initialize_cx231xx(dev); + + /* Cx231xx pre card setup */ + cx231xx_pre_card_setup(dev); + + errCode = cx231xx_config(dev); + if (errCode) { + cx231xx_errdev("error configuring device\n"); + return -ENOMEM; + } + + /* set default norm */ + dev->norm = dev->board.norm; + + /* register i2c bus */ + errCode = cx231xx_dev_init(dev); + if (errCode < 0) { + cx231xx_errdev("%s: cx231xx_i2c_register - errCode [%d]!\n", + __func__, errCode); + return errCode; + } + + /* Do board specific init */ + cx231xx_card_setup(dev); + + /* configure the device */ + cx231xx_config_i2c(dev); + + maxw = norm_maxw(dev); + maxh = norm_maxh(dev); + + /* set default image size */ + dev->width = maxw; + dev->height = maxh; + dev->interlaced = 0; + dev->hscale = 0; + dev->vscale = 0; + dev->video_input = 0; + + errCode = cx231xx_config(dev); + if (errCode < 0) { + cx231xx_errdev("%s: cx231xx_config - errCode [%d]!\n", + __func__, errCode); + return errCode; + } + + /* init video dma queues */ + INIT_LIST_HEAD(&dev->video_mode.vidq.active); + INIT_LIST_HEAD(&dev->video_mode.vidq.queued); + + /* init vbi dma queues */ + INIT_LIST_HEAD(&dev->vbi_mode.vidq.active); + INIT_LIST_HEAD(&dev->vbi_mode.vidq.queued); + + /* Reset other chips required if they are tied up with GPIO pins */ + + cx231xx_add_into_devlist(dev); + + retval = cx231xx_register_analog_devices(dev); + if (retval < 0) { + cx231xx_release_resources(dev); + goto fail_reg_devices; + } + + cx231xx_init_extension(dev); + + return 0; + +fail_reg_devices: + mutex_unlock(&dev->lock); + return retval; +} + +#if defined(CONFIG_MODULES) && defined(MODULE) +static void request_module_async(struct work_struct *work) +{ + struct cx231xx *dev = 
container_of(work, + struct cx231xx, request_module_wk); + + + if (dev->has_alsa_audio) + request_module("cx231xx-alsa"); + + if (dev->board.has_dvb) + request_module("cx231xx-dvb"); + +} + +static void request_modules(struct cx231xx *dev) +{ + INIT_WORK(&dev->request_module_wk, request_module_async); + schedule_work(&dev->request_module_wk); +} +#else +#define request_modules(dev) +#endif /* CONFIG_MODULES */ + + + +/* + * cx231xx_usb_probe() + * checks for supported devices + */ +static int cx231xx_usb_probe(struct usb_interface *interface, + const struct usb_device_id *id) +{ + struct usb_device *udev; + struct usb_interface *uif; + struct cx231xx *dev = NULL; + int retval = -ENODEV; + int nr, ifnum; + int i, isoc_pipe = 0; + char *speed; + char descr[255] = ""; + struct usb_interface *lif = NULL; + int skip_interface = 0; + struct usb_interface_assoc_descriptor *assoc_desc; + + udev = usb_get_dev(interface_to_usbdev(interface)); + ifnum = interface->altsetting[0].desc.bInterfaceNumber; + + cx231xx_info(": Interface Number %d\n", ifnum); + + /* Interface number 0 - IR interface */ + if(ifnum == 0 ){ + /* Check to see next free device and mark as used */ + nr = find_first_zero_bit(&cx231xx_devused, CX231XX_MAXBOARDS); + cx231xx_devused |= 1<= CX231XX_MAXBOARDS) { + cx231xx_info(": Supports only %i cx231xx boards.\n", + CX231XX_MAXBOARDS); + cx231xx_devused &= ~(1<name, 29, "cx231xx #%d", nr); + dev->devno = nr; + dev->model = id->driver_info; + dev->video_mode.alt = -1; + dev->interface_count++; + + /* reset gpio dir and value */ + dev->gpio_dir = 0; + dev->gpio_val = 0; + dev->xc_fw_load_done = 0; + dev->has_alsa_audio = 1; + dev->power_mode = -1; + + dev->vbi_or_sliced_cc_mode = 0; /* 0 - vbi ; 1 -sliced cc mode */ + + /* get maximum no.of IAD interfaces */ + assoc_desc = udev->actconfig->intf_assoc[0]; + dev->max_iad_interface_count = assoc_desc->bInterfaceCount; + cx231xx_info(": Found IAD interface count %d\n", dev->max_iad_interface_count); + + /* init CIR module TBD */ + + /* store the current interface */ + lif = interface; + + } + else if(ifnum == 1 ){ + + /* Get dev structure first */ + dev = usb_get_intfdata(udev->actconfig->interface[0]); + if(dev == NULL){ + cx231xx_err(DRIVER_NAME ": out of first interface!\n"); + return -ENODEV; + } + + /* store the interface 0 back */ + lif = udev->actconfig->interface[0]; + + /* increment interface count */ + dev->interface_count++; + + /* get device number */ + nr = dev->devno; + + assoc_desc = udev->actconfig->intf_assoc[0]; + if(assoc_desc->bFirstInterface == ifnum){ + cx231xx_info(": Found IAD interface match: AV Descriptor Start!! \n"); + } else { + cx231xx_err(DRIVER_NAME " Not found matching interface\n"); + return -ENODEV; + } + + } + else if(ifnum >= 2) { + /* Get dev structure first */ + dev = usb_get_intfdata(udev->actconfig->interface[0]); + if(dev == NULL){ + cx231xx_err(DRIVER_NAME ": out of first interface!\n"); + return -ENODEV; + } + + /* store the interface 0 back */ + lif = udev->actconfig->interface[0]; + + /* increment interface count */ + dev->interface_count++; + + /* get device number */ + nr = dev->devno; + + /* set skip interface */ + if((dev->interface_count -1) != dev->max_iad_interface_count ) + skip_interface = 1; /* set skipping */ + else{ + cx231xx_info(": Found IAD interface number match with AV Device number!! 
\n"); + } + } + + switch (udev->speed) { + case USB_SPEED_LOW: + speed = "1.5"; + break; + case USB_SPEED_UNKNOWN: + case USB_SPEED_FULL: + speed = "12"; + break; + case USB_SPEED_HIGH: + speed = "480"; + break; + default: + speed = "unknown"; + } + + if (udev->manufacturer) + strlcpy(descr, udev->manufacturer, sizeof(descr)); + + if (udev->product) { + if (*descr) + strlcat(descr, " ", sizeof(descr)); + strlcat(descr, udev->product, sizeof(descr)); + } + if (*descr) + strlcat(descr, " ", sizeof(descr)); + + cx231xx_info("New device %s@ %s Mbps " + "(%04x:%04x, interface %d, class %d)\n", + descr, + speed, + le16_to_cpu(udev->descriptor.idVendor), + le16_to_cpu(udev->descriptor.idProduct), + ifnum, + interface->altsetting->desc.bInterfaceNumber); + + /* AV device initialization */ + if((dev->interface_count -1) == dev->max_iad_interface_count ) { + cx231xx_info(" Calling init_dev\n"); + /* allocate device struct */ + retval = cx231xx_init_dev(&dev, udev, nr); + if (retval) { + cx231xx_devused &= ~(1<devno); + kfree(dev); + + return retval; + } + + /* compute alternate max packet sizes for video */ + uif = udev->actconfig->interface[dev->current_pcb_config.hs_config_info[0].interface_info.video_index+1]; + + dev->video_mode.end_point_addr = le16_to_cpu(uif->altsetting[0].endpoint[isoc_pipe].desc.bEndpointAddress); + + dev->video_mode.num_alt = uif->num_altsetting; + cx231xx_info(": EndPoint Addr 0x%x, Alternate settings: %i\n", dev->video_mode.end_point_addr, + dev->video_mode.num_alt); + dev->video_mode.alt_max_pkt_size = kmalloc(32 * dev->video_mode.num_alt, GFP_KERNEL); + + if (dev->video_mode.alt_max_pkt_size == NULL) { + cx231xx_errdev("out of memory!\n"); + cx231xx_devused &= ~(1<video_mode.num_alt ; i++) { + u16 tmp = le16_to_cpu(uif->altsetting[i].endpoint[isoc_pipe].desc. + wMaxPacketSize); + dev->video_mode.alt_max_pkt_size[i] = + (tmp & 0x07ff) * (((tmp & 0x1800) >> 11) + 1); + cx231xx_info("Alternate setting %i, max size= %i\n", i, + dev->video_mode.alt_max_pkt_size[i]); + } + + + /* compute alternate max packet sizes for vbi */ + uif = udev->actconfig->interface[dev->current_pcb_config.hs_config_info[0].interface_info.vanc_index+1]; + + dev->vbi_mode.end_point_addr = + le16_to_cpu(uif->altsetting[0].endpoint[isoc_pipe].desc.bEndpointAddress); + + dev->vbi_mode.num_alt = uif->num_altsetting; + cx231xx_info(": EndPoint Addr 0x%x, Alternate settings: %i\n", dev->vbi_mode.end_point_addr, + dev->vbi_mode.num_alt); + dev->vbi_mode.alt_max_pkt_size = kmalloc(32 * dev->vbi_mode.num_alt, GFP_KERNEL); + + if (dev->vbi_mode.alt_max_pkt_size == NULL) { + cx231xx_errdev("out of memory!\n"); + cx231xx_devused &= ~(1<vbi_mode.num_alt ; i++) { + u16 tmp = le16_to_cpu(uif->altsetting[i].endpoint[isoc_pipe].desc. 
+ wMaxPacketSize); + dev->vbi_mode.alt_max_pkt_size[i] = + (tmp & 0x07ff) * (((tmp & 0x1800) >> 11) + 1); + cx231xx_info("Alternate setting %i, max size= %i\n", i, + dev->vbi_mode.alt_max_pkt_size[i]); + } + + /* compute alternate max packet sizes for sliced CC */ + uif = udev->actconfig->interface[dev->current_pcb_config.hs_config_info[0].interface_info.hanc_index+1]; + + dev->sliced_cc_mode.end_point_addr = + le16_to_cpu(uif->altsetting[0].endpoint[isoc_pipe].desc.bEndpointAddress); + + dev->sliced_cc_mode.num_alt = uif->num_altsetting; + cx231xx_info(": EndPoint Addr 0x%x, Alternate settings: %i\n", dev->sliced_cc_mode.end_point_addr, + dev->sliced_cc_mode.num_alt); + dev->sliced_cc_mode.alt_max_pkt_size = kmalloc(32 * dev->sliced_cc_mode.num_alt, GFP_KERNEL); + + if (dev->sliced_cc_mode.alt_max_pkt_size == NULL) { + cx231xx_errdev("out of memory!\n"); + cx231xx_devused &= ~(1<sliced_cc_mode.num_alt ; i++) { + u16 tmp = le16_to_cpu(uif->altsetting[i].endpoint[isoc_pipe].desc. + wMaxPacketSize); + dev->sliced_cc_mode.alt_max_pkt_size[i] = + (tmp & 0x07ff) * (((tmp & 0x1800) >> 11) + 1); + cx231xx_info("Alternate setting %i, max size= %i\n", i, + dev->sliced_cc_mode.alt_max_pkt_size[i]); + } + + if(dev->current_pcb_config.ts1_source != 0xff ) { + + /* compute alternate max packet sizes for TS1 */ + uif = udev->actconfig->interface[dev->current_pcb_config.hs_config_info[0].interface_info.ts1_index+1]; + + dev->ts1_mode.end_point_addr = + le16_to_cpu(uif->altsetting[0].endpoint[isoc_pipe].desc.bEndpointAddress); + + dev->ts1_mode.num_alt = uif->num_altsetting; + cx231xx_info(": EndPoint Addr 0x%x, Alternate settings: %i\n", dev->ts1_mode.end_point_addr, + dev->ts1_mode.num_alt); + dev->ts1_mode.alt_max_pkt_size = kmalloc(32 * dev->ts1_mode.num_alt, GFP_KERNEL); + + if (dev->ts1_mode.alt_max_pkt_size == NULL) { + cx231xx_errdev("out of memory!\n"); + cx231xx_devused &= ~(1<ts1_mode.num_alt ; i++) { + u16 tmp = le16_to_cpu(uif->altsetting[i].endpoint[isoc_pipe].desc. + wMaxPacketSize); + dev->ts1_mode.alt_max_pkt_size[i] = + (tmp & 0x07ff) * (((tmp & 0x1800) >> 11) + 1); + cx231xx_info("Alternate setting %i, max size= %i\n", i, + dev->ts1_mode.alt_max_pkt_size[i]); + } + } + + } + + /* save our data pointer in this interface device */ + usb_set_intfdata(lif, dev); + + /* load other modules required */ + if((dev->interface_count -1) == dev->max_iad_interface_count ) + { + cx231xx_info("Calling request modules\n"); + request_modules(dev); + } + + if(skip_interface ) { + cx231xx_info("Skipping the interface\n"); + return -ENODEV; + } + + return 0; +} + +/* + * cx231xx_usb_disconnect() + * called when the device gets diconencted + * video device will be unregistered on v4l2_close in case it is still open + */ +static void cx231xx_usb_disconnect(struct usb_interface *interface) +{ + struct cx231xx *dev; + + dev = usb_get_intfdata(interface); + usb_set_intfdata(interface, NULL); + + if (!dev) + return; + + /* wait until all current v4l2 io is finished then deallocate + resources */ + mutex_lock(&dev->lock); + + wake_up_interruptible_all(&dev->open); + + if (dev->users) { + cx231xx_warn + ("device /dev/video%d is open! 
Deregistration and memory " + "deallocation are deferred on close.\n", + dev->vdev->num); + + dev->state |= DEV_MISCONFIGURED; + cx231xx_uninit_isoc(dev); + dev->state |= DEV_DISCONNECTED; + wake_up_interruptible(&dev->wait_frame); + wake_up_interruptible(&dev->wait_stream); + } else { + dev->state |= DEV_DISCONNECTED; + cx231xx_release_resources(dev); + } + + cx231xx_close_extension(dev); + + mutex_unlock(&dev->lock); + + if (!dev->users) { + kfree(dev->video_mode.alt_max_pkt_size); + kfree(dev->vbi_mode.alt_max_pkt_size); + kfree(dev->sliced_cc_mode.alt_max_pkt_size); + kfree(dev->ts1_mode.alt_max_pkt_size); + kfree(dev); + } +} + +static struct usb_driver cx231xx_usb_driver = { + .name = "cx231xx", + .probe = cx231xx_usb_probe, + .disconnect = cx231xx_usb_disconnect, + .id_table = cx231xx_id_table, +}; + +static int __init cx231xx_module_init(void) +{ + int result; + + printk(KERN_INFO DRIVER_NAME " v4l2 driver version %d.%d.%d loaded\n", + (CX231XX_VERSION_CODE >> 16) & 0xff, + (CX231XX_VERSION_CODE >> 8) & 0xff, CX231XX_VERSION_CODE & 0xff); + + /* register this driver with the USB subsystem */ + result = usb_register(&cx231xx_usb_driver); + if (result) + cx231xx_err(DRIVER_NAME + " usb_register failed. Error number %d.\n", result); + + return result; +} + +static void __exit cx231xx_module_exit(void) +{ + /* deregister this driver with the USB subsystem */ + usb_deregister(&cx231xx_usb_driver); +} + +module_init(cx231xx_module_init); +module_exit(cx231xx_module_exit); diff --git a/drivers/media/video/cx231xx/cx231xx-conf-reg.h b/drivers/media/video/cx231xx/cx231xx-conf-reg.h new file mode 100644 index 000000000000..5ccf6bdfe579 --- /dev/null +++ b/drivers/media/video/cx231xx/cx231xx-conf-reg.h @@ -0,0 +1,491 @@ +/* + cx231xx_conf-reg.h - driver for Conexant Cx23100/101/102 USB + video capture devices + + Copyright (C) 2008 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + + +#ifndef _POLARIS_REG_H_ +#define _POLARIS_REG_H_ + +#define BOARD_CFG_STAT 0x0 +#define TS_MODE_REG 0x4 +#define TS1_CFG_REG 0x8 +#define TS1_LENGTH_REG 0xc +#define TS2_CFG_REG 0x10 +#define TS2_LENGTH_REG 0x14 +#define EP_MODE_SET 0x18 +#define CIR_PWR_PTN1 0x1c +#define CIR_PWR_PTN2 0x20 +#define CIR_PWR_PTN3 0x24 +#define CIR_PWR_MASK0 0x28 +#define CIR_PWR_MASK1 0x2c +#define CIR_PWR_MASK2 0x30 +#define CIR_GAIN 0x34 +#define CIR_CAR_REG 0x38 +#define CIR_OT_CFG1 0x40 +#define CIR_OT_CFG2 0x44 +#define PWR_CTL_EN 0x74 + +/* Polaris Endpoints capture mask for register EP_MODE_SET */ +#define ENABLE_EP1 0x01 /* Bit[0]=1 */ +#define ENABLE_EP2 0x02 /* Bit[1]=1 */ +#define ENABLE_EP3 0x04 /* Bit[2]=1 */ +#define ENABLE_EP4 0x08 /* Bit[3]=1 */ +#define ENABLE_EP5 0x10 /* Bit[4]=1 */ +#define ENABLE_EP6 0x20 /* Bit[5]=1 */ + +/* Bit definition for register PWR_CTL_EN */ +#define PWR_MODE_MASK 0x17f +#define PWR_AV_EN 0x08 /* bit3 */ +#define PWR_ISO_EN 0x40 /* bit6 */ +#define PWR_AV_MODE 0x30 /* bit4,5 */ +#define PWR_TUNER_EN 0x04 /* bit2 */ +#define PWR_DEMOD_EN 0x02 /* bit1 */ +#define I2C_DEMOD_EN 0x01 /* bit0 */ +#define PWR_RESETOUT_EN 0x100 /* bit8 */ + +typedef enum{ + POLARIS_AVMODE_DEFAULT = 0, + POLARIS_AVMODE_DIGITAL = 0x10, + POLARIS_AVMODE_ANALOGT_TV = 0x20, + POLARIS_AVMODE_ENXTERNAL_AV = 0x30, + +}AV_MODE; + +/* Colibri Registers */ + +#define SINGLE_ENDED 0x0 +#define LOW_IF 0x4 +#define EU_IF 0x9 +#define US_IF 0xa + + + +#define SUP_BLK_TUNE1 0x00 +#define SUP_BLK_TUNE2 0x01 +#define SUP_BLK_TUNE3 0x02 +#define SUP_BLK_XTAL 0x03 +#define SUP_BLK_PLL1 0x04 +#define SUP_BLK_PLL2 0x05 +#define SUP_BLK_PLL3 0x06 +#define SUP_BLK_REF 0x07 +#define SUP_BLK_PWRDN 0x08 +#define SUP_BLK_TESTPAD 0x09 +#define ADC_COM_INT5_STAB_REF 0x0a +#define ADC_COM_QUANT 0x0b +#define ADC_COM_BIAS1 0x0c +#define ADC_COM_BIAS2 0x0d +#define ADC_COM_BIAS3 0x0e +#define TESTBUS_CTRL 0x12 + +#define ADC_STATUS_CH1 0x20 +#define ADC_STATUS_CH2 0x40 +#define ADC_STATUS_CH3 0x60 + +#define ADC_STATUS2_CH1 0x21 +#define ADC_STATUS2_CH2 0x41 +#define ADC_STATUS2_CH3 0x61 + +#define ADC_CAL_ATEST_CH1 0x22 +#define ADC_CAL_ATEST_CH2 0x42 +#define ADC_CAL_ATEST_CH3 0x62 + +#define ADC_PWRDN_CLAMP_CH1 0x23 +#define ADC_PWRDN_CLAMP_CH2 0x43 +#define ADC_PWRDN_CLAMP_CH3 0x63 + +#define ADC_CTRL_DAC23_CH1 0x24 +#define ADC_CTRL_DAC23_CH2 0x44 +#define ADC_CTRL_DAC23_CH3 0x64 + +#define ADC_CTRL_DAC1_CH1 0x25 +#define ADC_CTRL_DAC1_CH2 0x45 +#define ADC_CTRL_DAC1_CH3 0x65 + +#define ADC_DCSERVO_DEM_CH1 0x26 +#define ADC_DCSERVO_DEM_CH2 0x46 +#define ADC_DCSERVO_DEM_CH3 0x66 + +#define ADC_FB_FRCRST_CH1 0x27 +#define ADC_FB_FRCRST_CH2 0x47 +#define ADC_FB_FRCRST_CH3 0x67 + +#define ADC_INPUT_CH1 0x28 +#define ADC_INPUT_CH2 0x48 +#define ADC_INPUT_CH3 0x68 +#define INPUT_SEL_MASK 0x30 /* [5:4] in_sel */ + +#define ADC_NTF_PRECLMP_EN_CH1 0x29 +#define ADC_NTF_PRECLMP_EN_CH2 0x49 +#define ADC_NTF_PRECLMP_EN_CH3 0x69 + +#define ADC_QGAIN_RES_TRM_CH1 0x2a +#define ADC_QGAIN_RES_TRM_CH2 0x4a +#define ADC_QGAIN_RES_TRM_CH3 0x6a + +#define ADC_SOC_PRECLMP_TERM_CH1 0x2b +#define ADC_SOC_PRECLMP_TERM_CH2 0x4b +#define ADC_SOC_PRECLMP_TERM_CH3 0x6b + +#define TESTBUS_CTRL_CH1 0x32 +#define TESTBUS_CTRL_CH2 0x52 +#define TESTBUS_CTRL_CH3 0x72 + +/****************************************************************************** + * DIF registers * + ******************************************************************************/ +#define DIRECT_IF_REVB_BASE 0x00300 + 
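Editorial aside: the PWR_CTL_EN and EP_MODE_SET registers declared in this header are written through cx231xx_write_ctrl_reg() as four little-endian bytes, and the core/avcore code earlier in this patch open-codes that byte split before every write.  The helper below is purely illustrative, the name is made up and nothing in the patch uses it; the kernel's put_unaligned_le32() would achieve the same thing.

        /* Sketch only: same little-endian split the driver repeats before
         * each cx231xx_write_ctrl_reg(dev, VRT_SET_REGISTER, ...) call. */
        static void cx231xx_le32_to_buf(u32 val, u8 buf[4])
        {
                buf[0] = (u8) val;
                buf[1] = (u8) (val >> 8);
                buf[2] = (u8) (val >> 16);
                buf[3] = (u8) (val >> 24);
        }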
+/*****************************************************************************/ +#define DIF_PLL_FREQ_WORD (DIRECT_IF_REVB_BASE + 0x00000000) /* Reg Size 32 */ +/*****************************************************************************/ +#define FLD_DIF_PLL_LOCK 0x80000000 +/* Reserved [30:29] */ +#define FLD_DIF_PLL_FREE_RUN 0x10000000 +#define FLD_DIF_PLL_FREQ 0x0FFFFFFF + +/*****************************************************************************/ +#define DIF_PLL_CTRL (DIRECT_IF_REVB_BASE + 0x00000004) /* Reg Size 32 */ +/*****************************************************************************/ +#define FLD_DIF_KD_PD 0xFF000000 +/* Reserved [23:20] */ +#define FLD_DIF_KDS_PD 0x000F0000 +#define FLD_DIF_KI_PD 0x0000FF00 +/* Reserved [7:4] */ +#define FLD_DIF_KIS_PD 0x0000000F + +/*****************************************************************************/ +#define DIF_PLL_CTRL1 (DIRECT_IF_REVB_BASE + 0x00000008) /* Reg Size 32 */ +/*****************************************************************************/ +#define FLD_DIF_KD_FD 0xFF000000 +/* Reserved [23:20] */ +#define FLD_DIF_KDS_FD 0x000F0000 +#define FLD_DIF_KI_FD 0x0000FF00 +#define FLD_DIF_SIG_PROP_SZ 0x000000F0 +#define FLD_DIF_KIS_FD 0x0000000F + +/*****************************************************************************/ +#define DIF_PLL_CTRL2 (DIRECT_IF_REVB_BASE + 0x0000000C) /* Reg Size 32 */ +/*****************************************************************************/ +#define FLD_DIF_PLL_AGC_REF 0xFFF00000 +#define FLD_DIF_PLL_AGC_KI 0x000F0000 +/* Reserved [15] */ +#define FLD_DIF_FREQ_LIMIT 0x00007000 +#define FLD_DIF_K_FD 0x00000F00 +#define FLD_DIF_DOWNSMPL_FD 0x000000FF + +/*****************************************************************************/ +#define DIF_PLL_CTRL3 (DIRECT_IF_REVB_BASE + 0x00000010) /* Reg Size 32 */ +/*****************************************************************************/ +/* Reserved [31:16] */ +#define FLD_DIF_PLL_AGC_EN 0x00008000 +/* Reserved [14:12] */ +#define FLD_DIF_PLL_MAN_GAIN 0x00000FFF + +/*****************************************************************************/ +#define DIF_AGC_IF_REF (DIRECT_IF_REVB_BASE + 0x00000014) /* Reg Size 32 */ +/*****************************************************************************/ +#define FLD_DIF_K_AGC_RF 0xF0000000 +#define FLD_DIF_K_AGC_IF 0x0F000000 +#define FLD_DIF_K_AGC_INT 0x00F00000 +/* Reserved [19:12] */ +#define FLD_DIF_IF_REF 0x00000FFF + +/*****************************************************************************/ +#define DIF_AGC_CTRL_IF (DIRECT_IF_REVB_BASE + 0x00000018) /* Reg Size 32 */ +/*****************************************************************************/ +#define FLD_DIF_IF_MAX 0xFF000000 +#define FLD_DIF_IF_MIN 0x00FF0000 +#define FLD_DIF_IF_AGC 0x0000FFFF + +/*****************************************************************************/ +#define DIF_AGC_CTRL_INT (DIRECT_IF_REVB_BASE + 0x0000001C) /* Reg Size 32 */ +/*****************************************************************************/ +#define FLD_DIF_INT_MAX 0xFF000000 +#define FLD_DIF_INT_MIN 0x00FF0000 +#define FLD_DIF_INT_AGC 0x0000FFFF + +/*****************************************************************************/ +#define DIF_AGC_CTRL_RF (DIRECT_IF_REVB_BASE + 0x00000020) /* Reg Size 32 */ +/*****************************************************************************/ +#define FLD_DIF_RF_MAX 0xFF000000 +#define FLD_DIF_RF_MIN 0x00FF0000 +#define FLD_DIF_RF_AGC 0x0000FFFF + 
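Editorial aside: the FLD_DIF_* values in this header are plain contiguous bit masks inside 32-bit registers.  The pair of helpers below is not part of the patch; it only sketches the usual mask-and-shift idiom for reading or updating such a field in a cached register value, using ffs() from the kernel's bitops to find the shift of the lowest set mask bit.

        /* Illustrative only: generic field accessors for the FLD_DIF_* masks. */
        static inline u32 dif_field_get(u32 reg, u32 mask)
        {
                return (reg & mask) >> (ffs(mask) - 1);
        }

        static inline u32 dif_field_set(u32 reg, u32 mask, u32 val)
        {
                return (reg & ~mask) | ((val << (ffs(mask) - 1)) & mask);
        }

For example, dif_field_get(v, FLD_DIF_IF_AGC) would return the 16-bit IF AGC setting held in the low half of DIF_AGC_CTRL_IF.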
+/*****************************************************************************/ +#define DIF_AGC_IF_INT_CURRENT (DIRECT_IF_REVB_BASE + 0x00000024) /* Reg Size 32 */ +/*****************************************************************************/ +#define FLD_DIF_IF_AGC_IN 0xFFFF0000 +#define FLD_DIF_INT_AGC_IN 0x0000FFFF + +/*****************************************************************************/ +#define DIF_AGC_RF_CURRENT (DIRECT_IF_REVB_BASE + 0x00000028) /* Reg Size 32 */ +/*****************************************************************************/ +/* Reserved [31:16] */ +#define FLD_DIF_RF_AGC_IN 0x0000FFFF + +/*****************************************************************************/ +#define DIF_VIDEO_AGC_CTRL (DIRECT_IF_REVB_BASE + 0x0000002C) /* Reg Size 32 */ +/*****************************************************************************/ +#define FLD_DIF_AFD 0xC0000000 +#define FLD_DIF_K_VID_AGC 0x30000000 +#define FLD_DIF_LINE_LENGTH 0x0FFF0000 +#define FLD_DIF_AGC_GAIN 0x0000FFFF + +/*****************************************************************************/ +#define DIF_VID_AUD_OVERRIDE (DIRECT_IF_REVB_BASE + 0x00000030) /* Reg Size 32 */ +/*****************************************************************************/ +#define FLD_DIF_AUDIO_AGC_OVERRIDE 0x80000000 +/* Reserved [30:30] */ +#define FLD_DIF_AUDIO_MAN_GAIN 0x3F000000 +/* Reserved [23:17] */ +#define FLD_DIF_VID_AGC_OVERRIDE 0x00010000 +#define FLD_DIF_VID_MAN_GAIN 0x0000FFFF + +/*****************************************************************************/ +#define DIF_AV_SEP_CTRL (DIRECT_IF_REVB_BASE + 0x00000034) /* Reg Size 32 */ +/*****************************************************************************/ +#define FLD_DIF_LPF_FREQ 0xC0000000 +#define FLD_DIF_AV_PHASE_INC 0x3F000000 +#define FLD_DIF_AUDIO_FREQ 0x00FFFFFF + +/*****************************************************************************/ +#define DIF_COMP_FLT_CTRL (DIRECT_IF_REVB_BASE + 0x00000038) /* Reg Size 32 */ +/*****************************************************************************/ +/* Reserved [31:24] */ +#define FLD_DIF_IIR23_R2 0x00FF0000 +#define FLD_DIF_IIR23_R1 0x0000FF00 +#define FLD_DIF_IIR1_R1 0x000000FF + +/*****************************************************************************/ +#define DIF_MISC_CTRL (DIRECT_IF_REVB_BASE + 0x0000003C) /* Reg Size 32 */ +/*****************************************************************************/ +#define FLD_DIF_DIF_BYPASS 0x80000000 +#define FLD_DIF_FM_NYQ_GAIN 0x40000000 +#define FLD_DIF_RF_AGC_ENA 0x20000000 +#define FLD_DIF_INT_AGC_ENA 0x10000000 +#define FLD_DIF_IF_AGC_ENA 0x08000000 +#define FLD_DIF_FORCE_RF_IF_LOCK 0x04000000 +#define FLD_DIF_VIDEO_AGC_ENA 0x02000000 +#define FLD_DIF_RF_AGC_INV 0x01000000 +#define FLD_DIF_INT_AGC_INV 0x00800000 +#define FLD_DIF_IF_AGC_INV 0x00400000 +#define FLD_DIF_SPEC_INV 0x00200000 +#define FLD_DIF_AUD_FULL_BW 0x00100000 +#define FLD_DIF_AUD_SRC_SEL 0x00080000 +/* Reserved [18] */ +#define FLD_DIF_IF_FREQ 0x00030000 +/* Reserved [15:14] */ +#define FLD_DIF_TIP_OFFSET 0x00003F00 +/* Reserved [7:5] */ +#define FLD_DIF_DITHER_ENA 0x00000010 +/* Reserved [3:1] */ +#define FLD_DIF_RF_IF_LOCK 0x00000001 + +/*****************************************************************************/ +#define DIF_SRC_PHASE_INC (DIRECT_IF_REVB_BASE + 0x00000040) /* Reg Size 32 */ +/*****************************************************************************/ +/* Reserved [31:29] */ +#define FLD_DIF_PHASE_INC 0x1FFFFFFF + 
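Editorial aside: DIF_MISC_CTRL, defined a few entries above, gathers independent enable and invert bits for the RF, internal and IF AGC loops.  The patch itself does not program these registers yet; the sketch below only shows the usual read-modify-write pattern for such enable bits, with dif_read()/dif_write() standing in as hypothetical 32-bit register accessors that this patch does not provide.

        /* Sketch, assuming hypothetical 32-bit DIF register accessors. */
        static int dif_agc_enable(struct cx231xx *dev)
        {
                u32 val;
                int status;

                status = dif_read(dev, DIF_MISC_CTRL, &val);    /* hypothetical */
                if (status < 0)
                        return status;

                val |= FLD_DIF_RF_AGC_ENA | FLD_DIF_INT_AGC_ENA |
                       FLD_DIF_IF_AGC_ENA;

                return dif_write(dev, DIF_MISC_CTRL, val);      /* hypothetical */
        }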
+/*****************************************************************************/ +#define DIF_SRC_GAIN_CONTROL (DIRECT_IF_REVB_BASE + 0x00000044) /* Reg Size 32 */ +/*****************************************************************************/ +/* Reserved [31:16] */ +#define FLD_DIF_SRC_KI 0x0000FF00 +#define FLD_DIF_SRC_KD 0x000000FF + +/*****************************************************************************/ +#define DIF_BPF_COEFF01 (DIRECT_IF_REVB_BASE + 0x00000048) /* Reg Size 32 */ +/*****************************************************************************/ +/* Reserved [31:19] */ +#define FLD_DIF_BPF_COEFF_0 0x00070000 +/* Reserved [15:4] */ +#define FLD_DIF_BPF_COEFF_1 0x0000000F + +/*****************************************************************************/ +#define DIF_BPF_COEFF23 (DIRECT_IF_REVB_BASE + 0x0000004c) /* Reg Size 32 */ +/*****************************************************************************/ +/* Reserved [31:22] */ +#define FLD_DIF_BPF_COEFF_2 0x003F0000 +/* Reserved [15:7] */ +#define FLD_DIF_BPF_COEFF_3 0x0000007F + +/*****************************************************************************/ +#define DIF_BPF_COEFF45 (DIRECT_IF_REVB_BASE + 0x00000050) /* Reg Size 32 */ +/*****************************************************************************/ +/* Reserved [31:24] */ +#define FLD_DIF_BPF_COEFF_4 0x00FF0000 +/* Reserved [15:8] */ +#define FLD_DIF_BPF_COEFF_5 0x000000FF + +/*****************************************************************************/ +#define DIF_BPF_COEFF67 (DIRECT_IF_REVB_BASE + 0x00000054) /* Reg Size 32 */ +/*****************************************************************************/ +/* Reserved [31:25] */ +#define FLD_DIF_BPF_COEFF_6 0x01FF0000 +/* Reserved [15:9] */ +#define FLD_DIF_BPF_COEFF_7 0x000001FF + +/*****************************************************************************/ +#define DIF_BPF_COEFF89 (DIRECT_IF_REVB_BASE + 0x00000058) /* Reg Size 32 */ +/*****************************************************************************/ +/* Reserved [31:26] */ +#define FLD_DIF_BPF_COEFF_8 0x03FF0000 +/* Reserved [15:10] */ +#define FLD_DIF_BPF_COEFF_9 0x000003FF + +/*****************************************************************************/ +#define DIF_BPF_COEFF1011 (DIRECT_IF_REVB_BASE + 0x0000005C) /* Reg Size 32 */ +/*****************************************************************************/ +/* Reserved [31:27] */ +#define FLD_DIF_BPF_COEFF_10 0x07FF0000 +/* Reserved [15:11] */ +#define FLD_DIF_BPF_COEFF_11 0x000007FF + +/*****************************************************************************/ +#define DIF_BPF_COEFF1213 (DIRECT_IF_REVB_BASE + 0x00000060) /* Reg Size 32 */ +/*****************************************************************************/ +/* Reserved [31:27] */ +#define FLD_DIF_BPF_COEFF_12 0x07FF0000 +/* Reserved [15:12] */ +#define FLD_DIF_BPF_COEFF_13 0x00000FFF + +/*****************************************************************************/ +#define DIF_BPF_COEFF1415 (DIRECT_IF_REVB_BASE + 0x00000064) /* Reg Size 32 */ +/*****************************************************************************/ +/* Reserved [31:28] */ +#define FLD_DIF_BPF_COEFF_14 0x0FFF0000 +/* Reserved [15:12] */ +#define FLD_DIF_BPF_COEFF_15 0x00000FFF + +/*****************************************************************************/ +#define DIF_BPF_COEFF1617 (DIRECT_IF_REVB_BASE + 0x00000068) /* Reg Size 32 */ +/*****************************************************************************/ +/* 
Reserved [31:29] */ +#define FLD_DIF_BPF_COEFF_16 0x1FFF0000 +/* Reserved [15:13] */ +#define FLD_DIF_BPF_COEFF_17 0x00001FFF + +/*****************************************************************************/ +#define DIF_BPF_COEFF1819 (DIRECT_IF_REVB_BASE + 0x0000006C) /* Reg Size 32 */ +/*****************************************************************************/ +/* Reserved [31:29] */ +#define FLD_DIF_BPF_COEFF_18 0x1FFF0000 +/* Reserved [15:13] */ +#define FLD_DIF_BPF_COEFF_19 0x00001FFF + +/*****************************************************************************/ +#define DIF_BPF_COEFF2021 (DIRECT_IF_REVB_BASE + 0x00000070) /* Reg Size 32 */ +/*****************************************************************************/ +/* Reserved [31:29] */ +#define FLD_DIF_BPF_COEFF_20 0x1FFF0000 +/* Reserved [15:14] */ +#define FLD_DIF_BPF_COEFF_21 0x00003FFF + +/*****************************************************************************/ +#define DIF_BPF_COEFF2223 (DIRECT_IF_REVB_BASE + 0x00000074) /* Reg Size 32 */ +/*****************************************************************************/ +/* Reserved [31:30] */ +#define FLD_DIF_BPF_COEFF_22 0x3FFF0000 +/* Reserved [15:14] */ +#define FLD_DIF_BPF_COEFF_23 0x00003FFF + +/*****************************************************************************/ +#define DIF_BPF_COEFF2425 (DIRECT_IF_REVB_BASE + 0x00000078) /* Reg Size 32 */ +/*****************************************************************************/ +/* Reserved [31:30] */ +#define FLD_DIF_BPF_COEFF_24 0x3FFF0000 +/* Reserved [15:14] */ +#define FLD_DIF_BPF_COEFF_25 0x00003FFF + +/*****************************************************************************/ +#define DIF_BPF_COEFF2627 (DIRECT_IF_REVB_BASE + 0x0000007C) /* Reg Size 32 */ +/*****************************************************************************/ +/* Reserved [31:30] */ +#define FLD_DIF_BPF_COEFF_26 0x3FFF0000 +/* Reserved [15:14] */ +#define FLD_DIF_BPF_COEFF_27 0x00003FFF + +/*****************************************************************************/ +#define DIF_BPF_COEFF2829 (DIRECT_IF_REVB_BASE + 0x00000080) /* Reg Size 32 */ +/*****************************************************************************/ +/* Reserved [31:30] */ +#define FLD_DIF_BPF_COEFF_28 0x3FFF0000 +/* Reserved [15:14] */ +#define FLD_DIF_BPF_COEFF_29 0x00003FFF + +/*****************************************************************************/ +#define DIF_BPF_COEFF3031 (DIRECT_IF_REVB_BASE + 0x00000084) /* Reg Size 32 */ +/*****************************************************************************/ +/* Reserved [31:30] */ +#define FLD_DIF_BPF_COEFF_30 0x3FFF0000 +/* Reserved [15:14] */ +#define FLD_DIF_BPF_COEFF_31 0x00003FFF + +/*****************************************************************************/ +#define DIF_BPF_COEFF3233 (DIRECT_IF_REVB_BASE + 0x00000088) /* Reg Size 32 */ +/*****************************************************************************/ +/* Reserved [31:30] */ +#define FLD_DIF_BPF_COEFF_32 0x3FFF0000 +/* Reserved [15:14] */ +#define FLD_DIF_BPF_COEFF_33 0x00003FFF + +/*****************************************************************************/ +#define DIF_BPF_COEFF3435 (DIRECT_IF_REVB_BASE + 0x0000008C) /* Reg Size 32 */ +/*****************************************************************************/ +/* Reserved [31:30] */ +#define FLD_DIF_BPF_COEFF_34 0x3FFF0000 +/* Reserved [15:14] */ +#define FLD_DIF_BPF_COEFF_35 0x00003FFF + 
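Each DIF_BPF_COEFFxxyy register above packs a pair of band-pass filter coefficients into one 32-bit word: the even-numbered coefficient sits in the upper half-word and the odd-numbered one in the lower half-word, with the FLD_ masks marking the valid bits. A minimal sketch (illustrative only, not part of this patch, using a hypothetical helper name) of how coefficients 34 and 35 could be packed for DIF_BPF_COEFF3435:

/* Hypothetical helper, for illustration only: build the DIF_BPF_COEFF3435
 * register value from two coefficients using the field masks defined above. */
static inline u32 dif_bpf_coeff3435_pack(u16 coeff34, u16 coeff35)
{
	return (((u32) coeff34 << 16) & FLD_DIF_BPF_COEFF_34) |
	       ((u32) coeff35 & FLD_DIF_BPF_COEFF_35);
}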
+/*****************************************************************************/ +#define DIF_BPF_COEFF36 (DIRECT_IF_REVB_BASE + 0x00000090) /* Reg Size 32 */ +/*****************************************************************************/ +/* Reserved [31:30] */ +#define FLD_DIF_BPF_COEFF_36 0x3FFF0000 +/* Reserved [15:0] */ + +/*****************************************************************************/ +#define DIF_RPT_VARIANCE (DIRECT_IF_REVB_BASE + 0x00000094) /* Reg Size 32 */ +/*****************************************************************************/ +/* Reserved [31:20] */ +#define FLD_DIF_RPT_VARIANCE 0x000FFFFF + +/*****************************************************************************/ +#define DIF_SOFT_RST_CTRL_REVB (DIRECT_IF_REVB_BASE + 0x00000098) /* Reg Size 32 */ +/*****************************************************************************/ +/* Reserved [31:8] */ +#define FLD_DIF_DIF_SOFT_RST 0x00000080 +#define FLD_DIF_DIF_REG_RST_MSK 0x00000040 +#define FLD_DIF_AGC_RST_MSK 0x00000020 +#define FLD_DIF_CMP_RST_MSK 0x00000010 +#define FLD_DIF_AVS_RST_MSK 0x00000008 +#define FLD_DIF_NYQ_RST_MSK 0x00000004 +#define FLD_DIF_DIF_SRC_RST_MSK 0x00000002 +#define FLD_DIF_PLL_RST_MSK 0x00000001 + +/*****************************************************************************/ +#define DIF_PLL_FREQ_ERR (DIRECT_IF_REVB_BASE + 0x0000009C) /* Reg Size 32 */ +/*****************************************************************************/ +/* Reserved [31:25] */ +#define FLD_DIF_CTL_IP 0x01FFFFFF + + +#endif diff --git a/drivers/media/video/cx231xx/cx231xx-core.c b/drivers/media/video/cx231xx/cx231xx-core.c new file mode 100644 index 000000000000..efe0c666043a --- /dev/null +++ b/drivers/media/video/cx231xx/cx231xx-core.c @@ -0,0 +1,1167 @@ +/* + cx231xx-core.c - driver for Conexant Cx23100/101/102 USB video capture devices + + Copyright (C) 2008 + Based on em28xx driver + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include +#include +#include +#include +#include + +#include "cx231xx.h" +#include "cx231xx-reg.h" + +/* #define ENABLE_DEBUG_ISOC_FRAMES */ + +static unsigned int core_debug; +module_param(core_debug,int,0644); +MODULE_PARM_DESC(core_debug,"enable debug messages [core]"); + +#define cx231xx_coredbg(fmt, arg...) do {\ + if (core_debug) \ + printk(KERN_INFO "%s %s :"fmt, \ + dev->name, __func__ , ##arg); } while (0) + +static unsigned int reg_debug; +module_param(reg_debug,int,0644); +MODULE_PARM_DESC(reg_debug,"enable debug messages [URB reg]"); + +#define cx231xx_regdbg(fmt, arg...) do {\ + if (reg_debug) \ + printk(KERN_INFO "%s %s :"fmt, \ + dev->name, __func__ , ##arg); } while (0) + +static int alt = CX231XX_PINOUT; +module_param(alt, int, 0644); +MODULE_PARM_DESC(alt, "alternate setting to use for video endpoint"); + +/* FIXME */ +#define cx231xx_isocdbg(fmt, arg...) 
do {\ + if (core_debug) \ + printk(KERN_INFO "%s %s :"fmt, \ + dev->name, __func__ , ##arg); } while (0) + + + +/************************************************************************************ +* Device control list functions * +*************************************************************************************/ + +static LIST_HEAD(cx231xx_devlist); +static DEFINE_MUTEX(cx231xx_devlist_mutex); + +struct cx231xx *cx231xx_get_device(int minor, + enum v4l2_buf_type *fh_type, + int *has_radio) +{ + struct cx231xx *h, *dev = NULL; + + *fh_type = V4L2_BUF_TYPE_VIDEO_CAPTURE; + *has_radio = 0; + + mutex_lock(&cx231xx_devlist_mutex); + list_for_each_entry(h, &cx231xx_devlist, devlist) { + if (h->vdev->minor == minor) + dev = h; + if (h->vbi_dev->minor == minor) { + dev = h; + *fh_type = V4L2_BUF_TYPE_VBI_CAPTURE; + } + if (h->radio_dev && + h->radio_dev->minor == minor) { + dev = h; + *has_radio = 1; + } + } + mutex_unlock(&cx231xx_devlist_mutex); + + return dev; +} + +/* + * cx231xx_realease_resources() + * unregisters the v4l2,i2c and usb devices + * called when the device gets disconected or at module unload +*/ +void cx231xx_remove_from_devlist(struct cx231xx *dev) +{ + mutex_lock(&cx231xx_devlist_mutex); + list_del(&dev->devlist); + mutex_unlock(&cx231xx_devlist_mutex); +}; + +void cx231xx_add_into_devlist(struct cx231xx *dev) +{ + mutex_lock(&cx231xx_devlist_mutex); + list_add_tail(&dev->devlist, &cx231xx_devlist); + mutex_unlock(&cx231xx_devlist_mutex); +}; + + + + +static LIST_HEAD(cx231xx_extension_devlist); +static DEFINE_MUTEX(cx231xx_extension_devlist_lock); + +int cx231xx_register_extension(struct cx231xx_ops *ops) +{ + struct cx231xx *dev = NULL; + + mutex_lock(&cx231xx_devlist_mutex); + mutex_lock(&cx231xx_extension_devlist_lock); + list_add_tail(&ops->next, &cx231xx_extension_devlist); + list_for_each_entry(dev, &cx231xx_devlist, devlist) { + if (dev) + ops->init(dev); + } + cx231xx_info("Cx231xx: Initialized (%s) extension\n", ops->name); + mutex_unlock(&cx231xx_extension_devlist_lock); + mutex_unlock(&cx231xx_devlist_mutex); + return 0; +} +EXPORT_SYMBOL(cx231xx_register_extension); + +void cx231xx_unregister_extension(struct cx231xx_ops *ops) +{ + struct cx231xx *dev = NULL; + + mutex_lock(&cx231xx_devlist_mutex); + list_for_each_entry(dev, &cx231xx_devlist, devlist) { + if (dev) + ops->fini(dev); + } + + mutex_lock(&cx231xx_extension_devlist_lock); + cx231xx_info("Cx231xx: Removed (%s) extension\n", ops->name); + list_del(&ops->next); + mutex_unlock(&cx231xx_extension_devlist_lock); + mutex_unlock(&cx231xx_devlist_mutex); +} +EXPORT_SYMBOL(cx231xx_unregister_extension); + + +void cx231xx_init_extension(struct cx231xx *dev) +{ + struct cx231xx_ops *ops = NULL; + + mutex_lock(&cx231xx_extension_devlist_lock); + if (!list_empty(&cx231xx_extension_devlist)) { + list_for_each_entry(ops, &cx231xx_extension_devlist, next) { + if (ops->init) + ops->init(dev); + } + } + mutex_unlock(&cx231xx_extension_devlist_lock); +} + +void cx231xx_close_extension(struct cx231xx *dev) +{ + struct cx231xx_ops *ops = NULL; + + mutex_lock(&cx231xx_extension_devlist_lock); + if (!list_empty(&cx231xx_extension_devlist)) { + list_for_each_entry(ops, &cx231xx_extension_devlist, next) { + if (ops->fini) + ops->fini(dev); + } + } + mutex_unlock(&cx231xx_extension_devlist_lock); +} + +/************************************************************************************ +* U S B related functions * +*************************************************************************************/ +int 
cx231xx_send_usb_command(struct cx231xx_i2c *i2c_bus, + struct cx231xx_i2c_xfer_data *req_data) +{ + int status = 0; + struct cx231xx *dev = i2c_bus->dev; + VENDOR_REQUEST_IN ven_req; + + u8 saddr_len = 0; + u8 _i2c_period = 0; + u8 _i2c_nostop = 0; + u8 _i2c_reserve = 0; + + /* Get the I2C period, nostop and reserve parameters */ + _i2c_period = i2c_bus->i2c_period; + _i2c_nostop = i2c_bus->i2c_nostop; + _i2c_reserve = i2c_bus->i2c_reserve; + + saddr_len = req_data->saddr_len; + + /* Set wValue */ + if(saddr_len == 1) /* need check saddr_len == 0 */ + ven_req.wValue = req_data->dev_addr<<9|_i2c_period<<4|saddr_len<<2| + _i2c_nostop<<1|I2C_SYNC|_i2c_reserve<<6; + else + ven_req.wValue = req_data->dev_addr<<9|_i2c_period<<4|saddr_len<<2| + _i2c_nostop<<1|I2C_SYNC|_i2c_reserve<<6; + + /* set channel number */ + if(req_data->direction & I2C_M_RD) + ven_req.bRequest = i2c_bus->nr + 4; /* channel number, for read, + spec required channel_num +4 */ + else + ven_req.bRequest = i2c_bus->nr; /* channel number, */ + + /* set index value */ + switch(saddr_len){ + case 0: + ven_req.wIndex = 0; /* need check */ + break; + case 1: + ven_req.wIndex = (req_data->saddr_dat & 0xff); + break; + case 2: + ven_req.wIndex = req_data->saddr_dat; + break; + } + + /* set wLength value */ + ven_req.wLength = req_data->buf_size; + + /* set bData value */ + ven_req.bData = 0; + + /* set the direction */ + if(req_data->direction){ + ven_req.direction = USB_DIR_IN; + memset(req_data->p_buffer, 0x00, ven_req.wLength); + } + else + ven_req.direction = USB_DIR_OUT; + + /* set the buffer for read / write */ + ven_req.pBuff = req_data->p_buffer; + + + + /* call common vendor command request */ + status = cx231xx_send_vendor_cmd(dev, &ven_req); + if (status < 0) { + cx231xx_info("UsbInterface::sendCommand, output buffer failed with status -%d\n", status); + } + + return status; +} + +EXPORT_SYMBOL_GPL(cx231xx_send_usb_command); +/* + * cx231xx_read_ctrl_reg() + * reads data from the usb device specifying bRequest and wValue + */ +int cx231xx_read_ctrl_reg(struct cx231xx *dev, u8 req, u16 reg, + char *buf, int len) +{ + u8 val = 0; + int ret; + int pipe = usb_rcvctrlpipe(dev->udev, 0); + + if (dev->state & DEV_DISCONNECTED) + return -ENODEV; + + if (len > URB_MAX_CTRL_SIZE) + return -EINVAL; + + switch(len) + { + case 1: + val = ENABLE_ONE_BYTE; + break; + case 2: + val = ENABLE_TWE_BYTE; + break; + case 3: + val = ENABLE_THREE_BYTE; + break; + case 4: + val = ENABLE_FOUR_BYTE; + break; + default: + val = 0xFF; /* invalid option */ + } + + if(val == 0xFF) + return -EINVAL; + + if (reg_debug) { + cx231xx_isocdbg("(pipe 0x%08x): " + "IN: %02x %02x %02x %02x %02x %02x %02x %02x ", + pipe, + USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE, + req, 0, val, + reg & 0xff, reg >> 8, + len & 0xff, len >> 8); + } + + /* mutex_lock(&dev->ctrl_urb_lock); */ + ret = usb_control_msg(dev->udev, pipe, req, + USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE, + val, reg, dev->urb_buf, len, HZ); + if (ret < 0) { + cx231xx_isocdbg(" failed!\n"); + /* mutex_unlock(&dev->ctrl_urb_lock); */ + return ret; + } + + if (len) + memcpy(buf, dev->urb_buf, len); + + /* mutex_unlock(&dev->ctrl_urb_lock); */ + + if (reg_debug) { + int byte; + + cx231xx_isocdbg("<<<"); + for (byte = 0; byte < len; byte++) + cx231xx_isocdbg(" %02x", (unsigned char)buf[byte]); + cx231xx_isocdbg("\n"); + } + + return ret; +} + + +int cx231xx_send_vendor_cmd(struct cx231xx *dev, VENDOR_REQUEST_IN *ven_req) +{ + int ret; + int pipe = 0; + + if (dev->state & DEV_DISCONNECTED) + 
return -ENODEV; + + if ((ven_req->wLength > URB_MAX_CTRL_SIZE)) + return -EINVAL; + + if(ven_req->direction) + pipe = usb_rcvctrlpipe(dev->udev, 0); + else + pipe = usb_sndctrlpipe(dev->udev, 0); + + + if (reg_debug) { + int byte; + + cx231xx_isocdbg("(pipe 0x%08x): " + "OUT: %02x %02x %02x %04x %04x %04x >>>", + pipe, + ven_req->direction | USB_TYPE_VENDOR | USB_RECIP_DEVICE, + ven_req->bRequest, 0, ven_req->wValue, + ven_req->wIndex, + ven_req->wLength); + + for (byte = 0; byte < ven_req->wLength; byte++) + cx231xx_isocdbg(" %02x", (unsigned char)ven_req->pBuff[byte]); + cx231xx_isocdbg("\n"); + } + + /* mutex_lock(&dev->ctrl_urb_lock); */ + ret = usb_control_msg(dev->udev, pipe, ven_req->bRequest, + ven_req->direction | USB_TYPE_VENDOR | USB_RECIP_DEVICE, + ven_req->wValue, ven_req->wIndex, ven_req->pBuff, ven_req->wLength, HZ); + /* mutex_unlock(&dev->ctrl_urb_lock); */ + + return ret; +} + +/* + * cx231xx_write_ctrl_reg() + * sends data to the usb device, specifying bRequest + */ +int cx231xx_write_ctrl_reg(struct cx231xx *dev, u8 req, u16 reg, char *buf, + int len) +{ + u8 val = 0; + int ret; + int pipe = usb_sndctrlpipe(dev->udev, 0); + + if (dev->state & DEV_DISCONNECTED) + return -ENODEV; + + if ((len < 1) || (len > URB_MAX_CTRL_SIZE)) + return -EINVAL; + + switch(len) + { + case 1: + val = ENABLE_ONE_BYTE; + break; + case 2: + val = ENABLE_TWE_BYTE; + break; + case 3: + val = ENABLE_THREE_BYTE; + break; + case 4: + val = ENABLE_FOUR_BYTE; + break; + default: + val = 0xFF; /* invalid option */ + } + + if(val == 0xFF) + return -EINVAL; + + if (reg_debug) { + int byte; + + cx231xx_isocdbg("(pipe 0x%08x): " + "OUT: %02x %02x %02x %02x %02x %02x %02x %02x >>>", + pipe, + USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE, + req, 0, val, + reg & 0xff, reg >> 8, + len & 0xff, len >> 8); + + for (byte = 0; byte < len; byte++) + cx231xx_isocdbg(" %02x", (unsigned char)buf[byte]); + cx231xx_isocdbg("\n"); + } + + /* mutex_lock(&dev->ctrl_urb_lock); */ + memcpy(dev->urb_buf, buf, len); + ret = usb_control_msg(dev->udev, pipe, req, + USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE, + val, reg, dev->urb_buf, len, HZ); + /* mutex_unlock(&dev->ctrl_urb_lock); */ + + return ret; +} + + +/************************************************************************************ +* USB Alternate Setting functions * +*************************************************************************************/ + +int cx231xx_set_video_alternate(struct cx231xx *dev) +{ + int errCode, prev_alt = dev->video_mode.alt; + unsigned int min_pkt_size = dev->width * 2 + 4; + u32 usb_interface_index = 0; + + /* When image size is bigger than a certain value, + the frame size should be increased, otherwise, only + green screen will be received. 
+ */ + if (dev->width * 2 * dev->height > 720 * 240 * 2) + min_pkt_size *= 2; + + if(dev->width > 360) { + /* resolutions: 720,704,640 */ + dev->video_mode.alt = 3; + } else if(dev->width > 180) { + /* resolutions: 360,352,320,240 */ + dev->video_mode.alt = 2; + } else if(dev->width > 0) { + /* resolutions: 180,176,160,128,88 */ + dev->video_mode.alt = 1; + } else { + /* Change to alt0 BULK to release USB bandwidth */ + dev->video_mode.alt = 0; + } + + /* Get the correct video interface Index */ + usb_interface_index = dev->current_pcb_config.hs_config_info[0].interface_info.video_index+1; + + if (dev->video_mode.alt != prev_alt) { + cx231xx_coredbg("minimum isoc packet size: %u (alt=%d)\n", + min_pkt_size, dev->video_mode.alt); + dev->video_mode.max_pkt_size = dev->video_mode.alt_max_pkt_size[dev->video_mode.alt]; + cx231xx_coredbg("setting alternate %d with wMaxPacketSize=%u\n", + dev->video_mode.alt, dev->video_mode.max_pkt_size); + cx231xx_info(" setting alternate %d with wMaxPacketSize=%u , Interface = %d\n", + dev->video_mode.alt, dev->video_mode.max_pkt_size, usb_interface_index); + errCode = usb_set_interface(dev->udev, usb_interface_index, dev->video_mode.alt); + if (errCode < 0) { + cx231xx_errdev("cannot change alternate number to %d (error=%i)\n", + dev->video_mode.alt, errCode); + return errCode; + } + } + return 0; +} + +int cx231xx_set_alt_setting(struct cx231xx *dev, u8 index, u8 alt) +{ + int status = 0; + u32 usb_interface_index = 0; + u32 max_pkt_size = 0; + + switch(index) { + case INDEX_TS1: + usb_interface_index = dev->current_pcb_config.hs_config_info[0].interface_info.ts1_index+1; + dev->video_mode.alt = alt; + if(dev->ts1_mode.alt_max_pkt_size != NULL) + max_pkt_size = dev->ts1_mode.max_pkt_size = dev->ts1_mode.alt_max_pkt_size[dev->ts1_mode.alt]; + break; + case INDEX_TS2: + usb_interface_index = dev->current_pcb_config.hs_config_info[0].interface_info.ts2_index+1; + break; + case INDEX_AUDIO: + usb_interface_index = dev->current_pcb_config.hs_config_info[0].interface_info.audio_index+1; + dev->adev.alt = alt; + if( dev->adev.alt_max_pkt_size != NULL) + max_pkt_size = dev->adev.max_pkt_size = dev->adev.alt_max_pkt_size[dev->adev.alt]; + break; + case INDEX_VIDEO: + usb_interface_index = dev->current_pcb_config.hs_config_info[0].interface_info.video_index+1; + dev->video_mode.alt = alt; + if(dev->video_mode.alt_max_pkt_size != NULL) + max_pkt_size = dev->video_mode.max_pkt_size = dev->video_mode.alt_max_pkt_size[dev->video_mode.alt]; + break; + case INDEX_VANC: + usb_interface_index = dev->current_pcb_config.hs_config_info[0].interface_info.vanc_index+1; + dev->vbi_mode.alt = alt; + if(dev->vbi_mode.alt_max_pkt_size != NULL) + max_pkt_size = dev->vbi_mode.max_pkt_size = dev->vbi_mode.alt_max_pkt_size[dev->vbi_mode.alt]; + break; + case INDEX_HANC: + usb_interface_index = dev->current_pcb_config.hs_config_info[0].interface_info.hanc_index+1; + dev->sliced_cc_mode.alt = alt; + if(dev->sliced_cc_mode.alt_max_pkt_size != NULL) + max_pkt_size = dev->sliced_cc_mode.max_pkt_size = dev->sliced_cc_mode.alt_max_pkt_size[dev->sliced_cc_mode.alt]; + break; + default: + break; + } + + if(alt > 0 && max_pkt_size == 0 ) { + cx231xx_errdev("cannot change interface %d alternate number to %d : Max. 
Pkt size is ZERO\n", + usb_interface_index, alt); + return -1; + } + + cx231xx_info(" setting alternate %d with wMaxPacketSize=%u , Interface = %d\n", + alt, max_pkt_size, usb_interface_index); + + if(usb_interface_index > 0 ) { + status = usb_set_interface(dev->udev, usb_interface_index, alt); + if (status < 0) { + cx231xx_errdev("cannot change interface %d alternate number to %d (error=%i)\n", + usb_interface_index, alt, status); + return status; + } + } + + return status; +} +EXPORT_SYMBOL_GPL(cx231xx_set_alt_setting); + +int cx231xx_gpio_set(struct cx231xx *dev, struct cx231xx_reg_seq *gpio) +{ + int rc = 0; + + if (!gpio) + return rc; + + /* Send GPIO reset sequences specified at board entry */ + while (gpio->sleep >= 0) { + rc = cx231xx_set_gpio_value(dev, gpio->bit, + gpio->val); + if (rc < 0) + return rc; + + if (gpio->sleep > 0) + msleep(gpio->sleep); + + gpio++; + } + return rc; +} + +int cx231xx_set_mode(struct cx231xx *dev, enum cx231xx_mode set_mode) +{ + if (dev->mode == set_mode) + return 0; + + if (set_mode == CX231XX_SUSPEND) { + /* Set the chip in power saving mode */ + dev->mode = set_mode; + } + + /* Resource is locked */ + if (dev->mode != CX231XX_SUSPEND) + return -EINVAL; + + dev->mode = set_mode; + + if (dev->mode == CX231XX_DIGITAL_MODE) { + /* Set Digital power mode */ + } else { + /* Set Analog Power mode*/ + } + return 0; +} +EXPORT_SYMBOL_GPL(cx231xx_set_mode); + +/************************************************************************************ +* URB Streaming functions * +*************************************************************************************/ + +/* + * IRQ callback, called by URB callback + */ +static void cx231xx_irq_callback(struct urb *urb) +{ + struct cx231xx_dmaqueue *dma_q = urb->context; + struct cx231xx_video_mode *vmode = container_of(dma_q, struct cx231xx_video_mode, vidq); + struct cx231xx *dev = container_of(vmode, struct cx231xx, video_mode); + int rc, i; + + + switch (urb->status) { + case 0: /* success */ + case -ETIMEDOUT: /* NAK */ + break; + case -ECONNRESET: /* kill */ + case -ENOENT: + case -ESHUTDOWN: + return; + default: /* error */ + cx231xx_isocdbg("urb completition error %d.\n", urb->status); + break; + } + + /* Copy data from URB */ + spin_lock(&dev->video_mode.slock); + rc = dev->video_mode.isoc_ctl.isoc_copy(dev, urb); + spin_unlock(&dev->video_mode.slock); + + /* Reset urb buffers */ + for (i = 0; i < urb->number_of_packets; i++) { + urb->iso_frame_desc[i].status = 0; + urb->iso_frame_desc[i].actual_length = 0; + } + urb->status = 0; + + urb->status = usb_submit_urb(urb, GFP_ATOMIC); + if (urb->status) { + cx231xx_isocdbg("urb resubmit failed (error=%i)\n", + urb->status); + } +} + +/* + * Stop and Deallocate URBs + */ +void cx231xx_uninit_isoc(struct cx231xx *dev) +{ + struct urb *urb; + int i; + + cx231xx_isocdbg("cx231xx: called cx231xx_uninit_isoc\n"); + + dev->video_mode.isoc_ctl.nfields = -1; + for (i = 0; i < dev->video_mode.isoc_ctl.num_bufs; i++) { + urb = dev->video_mode.isoc_ctl.urb[i]; + if (urb) { + if (!irqs_disabled()) + usb_kill_urb(urb); + else + usb_unlink_urb(urb); + + if (dev->video_mode.isoc_ctl.transfer_buffer[i]) { + usb_buffer_free(dev->udev, + urb->transfer_buffer_length, + dev->video_mode.isoc_ctl.transfer_buffer[i], + urb->transfer_dma); + } + usb_free_urb(urb); + dev->video_mode.isoc_ctl.urb[i] = NULL; + } + dev->video_mode.isoc_ctl.transfer_buffer[i] = NULL; + } + + kfree(dev->video_mode.isoc_ctl.urb); + kfree(dev->video_mode.isoc_ctl.transfer_buffer); + + 
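+	/* reset the bookkeeping so a later cx231xx_init_isoc() starts from a clean state */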
dev->video_mode.isoc_ctl.urb = NULL; + dev->video_mode.isoc_ctl.transfer_buffer = NULL; + dev->video_mode.isoc_ctl.num_bufs = 0; + + cx231xx_capture_start(dev, 0, Raw_Video); +} +EXPORT_SYMBOL_GPL(cx231xx_uninit_isoc); + +/* + * Allocate URBs and start IRQ + */ +int cx231xx_init_isoc(struct cx231xx *dev, int max_packets, + int num_bufs, int max_pkt_size, + int (*isoc_copy) (struct cx231xx *dev, struct urb *urb)) +{ + struct cx231xx_dmaqueue *dma_q = &dev->video_mode.vidq; + int i; + int sb_size, pipe; + struct urb *urb; + int j, k; + int rc; + + cx231xx_isocdbg("cx231xx: called cx231xx_prepare_isoc\n"); + + dev->video_input = dev->video_input > 2?2:dev->video_input; + + cx231xx_info("Setting Video mux to %d\n",dev->video_input); + video_mux(dev, dev->video_input); + + + /* De-allocates all pending stuff */ + cx231xx_uninit_isoc(dev); + + dev->video_mode.isoc_ctl.isoc_copy = isoc_copy; + dev->video_mode.isoc_ctl.num_bufs = num_bufs; + dma_q->pos = 0; + dma_q->is_partial_line = 0; + dma_q->last_sav = 0; + dma_q->current_field = -1; + dma_q->field1_done = 0; + dma_q->lines_per_field = dev->height/2; + dma_q->bytes_left_in_line = dev->width << 1; + dma_q->lines_completed = 0; + for(i = 0; i < 8 ; i++) + dma_q->partial_buf[i] = 0; + + dev->video_mode.isoc_ctl.urb = kzalloc(sizeof(void *)*num_bufs, GFP_KERNEL); + if (!dev->video_mode.isoc_ctl.urb) { + cx231xx_errdev("cannot alloc memory for usb buffers\n"); + return -ENOMEM; + } + + dev->video_mode.isoc_ctl.transfer_buffer = kzalloc(sizeof(void *)*num_bufs, + GFP_KERNEL); + if (!dev->video_mode.isoc_ctl.transfer_buffer) { + cx231xx_errdev("cannot allocate memory for usbtransfer\n"); + kfree(dev->video_mode.isoc_ctl.urb); + return -ENOMEM; + } + + dev->video_mode.isoc_ctl.max_pkt_size = max_pkt_size; + dev->video_mode.isoc_ctl.buf = NULL; + + sb_size = max_packets * dev->video_mode.isoc_ctl.max_pkt_size; + + /* allocate urbs and transfer buffers */ + for (i = 0; i < dev->video_mode.isoc_ctl.num_bufs; i++) { + urb = usb_alloc_urb(max_packets, GFP_KERNEL); + if (!urb) { + cx231xx_err("cannot alloc isoc_ctl.urb %i\n", i); + cx231xx_uninit_isoc(dev); + return -ENOMEM; + } + dev->video_mode.isoc_ctl.urb[i] = urb; + + dev->video_mode.isoc_ctl.transfer_buffer[i] = usb_buffer_alloc(dev->udev, + sb_size, GFP_KERNEL, &urb->transfer_dma); + if (!dev->video_mode.isoc_ctl.transfer_buffer[i]) { + cx231xx_err("unable to allocate %i bytes for transfer" + " buffer %i%s\n", + sb_size, i, + in_interrupt()?" 
while in int":""); + cx231xx_uninit_isoc(dev); + return -ENOMEM; + } + memset(dev->video_mode.isoc_ctl.transfer_buffer[i], 0, sb_size); + + pipe = usb_rcvisocpipe(dev->udev, dev->video_mode.end_point_addr); + + usb_fill_int_urb(urb, dev->udev, pipe, + dev->video_mode.isoc_ctl.transfer_buffer[i], sb_size, + cx231xx_irq_callback, dma_q, 1); + + urb->number_of_packets = max_packets; + urb->transfer_flags = URB_ISO_ASAP; + + k = 0; + for (j = 0; j < max_packets; j++) { + urb->iso_frame_desc[j].offset = k; + urb->iso_frame_desc[j].length = + dev->video_mode.isoc_ctl.max_pkt_size; + k += dev->video_mode.isoc_ctl.max_pkt_size; + } + } + + init_waitqueue_head(&dma_q->wq); + + + /* submit urbs and enables IRQ */ + for (i = 0; i < dev->video_mode.isoc_ctl.num_bufs; i++) { + rc = usb_submit_urb(dev->video_mode.isoc_ctl.urb[i], GFP_ATOMIC); + if (rc) { + cx231xx_err("submit of urb %i failed (error=%i)\n", i, + rc); + cx231xx_uninit_isoc(dev); + return rc; + } + } + + cx231xx_capture_start(dev, 1, Raw_Video); + + return 0; +} +EXPORT_SYMBOL_GPL(cx231xx_init_isoc); + +/************************************************************************************ +* Device Init/UnInit functions * +*************************************************************************************/ +int cx231xx_dev_init(struct cx231xx *dev) +{ + int errCode = 0; + + /* Initialize I2C bus */ + + /* External Master 1 Bus */ + dev->i2c_bus[0].nr = 0; + dev->i2c_bus[0].dev = dev; + dev->i2c_bus[0].i2c_period = I2C_SPEED_1M; /* 1MHz */ + dev->i2c_bus[0].i2c_nostop = 0; + dev->i2c_bus[0].i2c_reserve = 0; + + /* External Master 2 Bus */ + dev->i2c_bus[1].nr = 1; + dev->i2c_bus[1].dev = dev; + dev->i2c_bus[1].i2c_period = I2C_SPEED_1M; /* 1MHz */ + dev->i2c_bus[1].i2c_nostop = 0; + dev->i2c_bus[1].i2c_reserve = 0; + + /* Internal Master 3 Bus */ + dev->i2c_bus[2].nr = 2; + dev->i2c_bus[2].dev = dev; + dev->i2c_bus[2].i2c_period = I2C_SPEED_400K; /* 400kHz */ + dev->i2c_bus[2].i2c_nostop = 0; + dev->i2c_bus[2].i2c_reserve = 0; + + /* register I2C buses */ + cx231xx_i2c_register(&dev->i2c_bus[0]); + cx231xx_i2c_register(&dev->i2c_bus[1]); + cx231xx_i2c_register(&dev->i2c_bus[2]); + + /* init hardware */ + /* Note : with out calling set power mode function, colibri can not be set up correctly */ + errCode = cx231xx_set_power_mode(dev, POLARIS_AVMODE_ANALOGT_TV); + if (errCode < 0) { + cx231xx_errdev("%s: cx231xx_set_power_mode : Failed to set Power - errCode [%d]!\n", + __func__, errCode); + return errCode; + } + + /* initialize Colibri block */ + errCode = cx231xx_colibri_init_super_block(dev, 0x23c); + if (errCode < 0) { + cx231xx_errdev("%s: cx231xx_colibri init super block - errCode [%d]!\n", + __func__, errCode); + return errCode; + } + errCode = cx231xx_colibri_init_channels(dev); + if (errCode < 0) { + cx231xx_errdev("%s: cx231xx_colibri init channels - errCode [%d]!\n", + __func__, errCode); + return errCode; + } + + /* Set DIF in By pass mode */ + errCode = cx231xx_dif_set_standard(dev, DIF_USE_BASEBAND); + if (errCode < 0) { + cx231xx_errdev("%s: cx231xx_dif set to By pass mode - errCode [%d]!\n", + __func__, errCode); + return errCode; + } + + /* flatiron related functions */ + errCode = cx231xx_flatiron_initialize(dev); + if (errCode < 0) { + cx231xx_errdev("%s: cx231xx_flatiron initialize - errCode [%d]!\n", + __func__, errCode); + return errCode; + } + + /* init control pins */ + errCode = cx231xx_init_ctrl_pin_status(dev); + if (errCode < 0) { + cx231xx_errdev("%s: cx231xx_init ctrl pins - errCode [%d]!\n", + __func__, 
errCode); + return errCode; + } + + /* set AGC mode to Analog */ + errCode = cx231xx_set_agc_analog_digital_mux_select(dev, 1); + if (errCode < 0) { + cx231xx_errdev("%s: cx231xx_AGC mode to Analog - errCode [%d]!\n", + __func__, errCode); + return errCode; + } + + /* set all alternate settings to zero initially */ + cx231xx_set_alt_setting(dev, INDEX_VIDEO, 0); + cx231xx_set_alt_setting(dev, INDEX_VANC, 0); + cx231xx_set_alt_setting(dev, INDEX_HANC, 0); + if(dev->board.has_dvb) + cx231xx_set_alt_setting(dev, INDEX_TS1, 0); + + /* set the I2C master port to 3 on channel 1 */ + errCode = cx231xx_enable_i2c_for_tuner(dev, I2C_3); + + return errCode; +} +EXPORT_SYMBOL_GPL(cx231xx_dev_init); + +void cx231xx_dev_uninit(struct cx231xx *dev) +{ + /* Un Initialize I2C bus */ + cx231xx_i2c_unregister(&dev->i2c_bus[2]); + cx231xx_i2c_unregister(&dev->i2c_bus[1]); + cx231xx_i2c_unregister(&dev->i2c_bus[0]); +} +EXPORT_SYMBOL_GPL(cx231xx_dev_uninit); + + +/************************************************************************************ +* G P I O related functions * +*************************************************************************************/ +int cx231xx_send_gpio_cmd(struct cx231xx *dev, u32 gpio_bit, u8* gpio_val, + u8 len, u8 request, u8 direction) +{ + int status = 0; + VENDOR_REQUEST_IN ven_req; + + /* Set wValue */ + ven_req.wValue = (u16)(gpio_bit>>16 & 0xffff); + + /* set request */ + if(!request){ + if(direction) + ven_req.bRequest = VRT_GET_GPIO; /* 0x8 gpio */ + else + ven_req.bRequest = VRT_SET_GPIO; /* 0x9 gpio */ + } + else { + if(direction) + ven_req.bRequest = VRT_GET_GPIE; /* 0xa gpie */ + else + ven_req.bRequest = VRT_SET_GPIE; /* 0xb gpie */ + } + + /* set index value */ + ven_req.wIndex = (u16)(gpio_bit & 0xffff); + + /* set wLength value */ + ven_req.wLength = len; + + /* set bData value */ + ven_req.bData = 0; + + /* set the buffer for read / write */ + ven_req.pBuff = gpio_val; + + /* set the direction */ + if(direction){ + ven_req.direction = USB_DIR_IN; + memset(ven_req.pBuff, 0x00, ven_req.wLength); + } + else + ven_req.direction = USB_DIR_OUT; + + + + /* call common vendor command request */ + status = cx231xx_send_vendor_cmd(dev, &ven_req); + if (status < 0) + { + cx231xx_info("UsbInterface::sendCommand, output buffer failed with status -%d\n", status); + } + + return status; +} + +EXPORT_SYMBOL_GPL(cx231xx_send_gpio_cmd); + +/************************************************************************************* + * C O N T R O L - Register R E A D / W R I T E functions * + *************************************************************************************/ +int cx231xx_mode_register(struct cx231xx *dev, u16 address, u32 mode) +{ + u8 value[4] = {0x0, 0x0, 0x0, 0x0}; + u32 tmp =0; + int status = 0; + + status = cx231xx_read_ctrl_reg(dev,VRT_GET_REGISTER, address,value,4); + if(status < 0) + return status; + + tmp = *((u32 *)value); + tmp |= mode; + + value[0]=(u8) tmp; + value[1]=(u8)(tmp>>8); + value[2]=(u8)(tmp>>16); + value[3]=(u8)(tmp>>24); + + status = cx231xx_write_ctrl_reg(dev,VRT_SET_REGISTER, address,value,4); + + return status; +} + +/************************************************************************************* + * I 2 C Internal C O N T R O L functions * + *************************************************************************************/ +int cx231xx_read_i2c_data(struct cx231xx *dev, u8 dev_addr, u16 saddr, + u8 saddr_len, u32 *data, u8 data_len) +{ + int status = 0; + struct cx231xx_i2c_xfer_data req_data; + u8 value[4] ={0,0,0,0}; + 
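+	/* saddr_len selects how many sub-address bytes go out on the I2C bus (0, 1 or 2) */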
+ if(saddr_len == 0) + saddr = 0; + else if(saddr_len == 0) + saddr &= 0xff; + + /* prepare xfer_data struct */ + req_data.dev_addr = dev_addr >> 1; + req_data.direction = I2C_M_RD; + req_data.saddr_len = saddr_len; + req_data.saddr_dat = saddr; + req_data.buf_size = data_len; + req_data.p_buffer = (u8*)value; + + /* usb send command */ + status = dev->cx231xx_send_usb_command(&dev->i2c_bus[0], &req_data); + + if(status >= 0) + { + /* Copy the data read back to main buffer */ + if(data_len == 1) + *data = value[0]; + else + *data = value[0] | value[1] << 8 | value[2] << 16 | value[3] << 24; + } + + return status; +} + +int cx231xx_write_i2c_data(struct cx231xx *dev, u8 dev_addr, u16 saddr, + u8 saddr_len, u32 data, u8 data_len) +{ + int status = 0; + u8 value[4] ={0,0,0,0}; + struct cx231xx_i2c_xfer_data req_data; + + value[0]=(u8)data; + value[1]=(u8)(data>>8); + value[2]=(u8)(data>>16); + value[3]=(u8)(data>>24); + + if(saddr_len == 0) + saddr = 0; + else if(saddr_len == 0) + saddr &= 0xff; + + /* prepare xfer_data struct */ + req_data.dev_addr = dev_addr >> 1; + req_data.direction = 0; + req_data.saddr_len = saddr_len; + req_data.saddr_dat = saddr; + req_data.buf_size = data_len; + req_data.p_buffer = value; + + /* usb send command */ + status = dev->cx231xx_send_usb_command(&dev->i2c_bus[0], &req_data); + + return status; +} + +int cx231xx_reg_mask_write(struct cx231xx *dev, u8 dev_addr, u8 size, u16 register_address, + u8 bit_start,u8 bit_end, u32 value) +{ + int status = 0; + u32 tmp; + u32 mask = 0; + int i; + + if (bit_start>(size-1) || bit_end>(size-1)) { + return -1; + } + + if (size==8){ + status = cx231xx_read_i2c_data(dev, dev_addr, register_address, 2, &tmp, 1); + } else { + status = cx231xx_read_i2c_data(dev, dev_addr, register_address, 2, &tmp, 4); + } + + if (status < 0) { + return status; + } + + mask = 1<bit_start&&i>0; i--) { + mask = mask + (1<<(i-1)); + } + + value <<= bit_start; + + if (size==8) + { + tmp &= ~mask; + tmp |= value; + tmp &= 0xff; + status = cx231xx_write_i2c_data(dev, dev_addr, register_address, 2, tmp, 1); + } + else + { + tmp &= ~mask; + tmp |= value; + status = cx231xx_write_i2c_data(dev, dev_addr, register_address, 2, tmp, 4); + } + + return status; +} + + + +int cx231xx_read_modify_write_i2c_dword(struct cx231xx *dev, u8 dev_addr, + u16 saddr, u32 mask, u32 value) +{ + u32 temp; + int status = 0; + + status = cx231xx_read_i2c_data(dev, dev_addr, saddr, 2, &temp, 4); + + if(status < 0) + return status; + + temp &= ~mask; + temp |= value; + + status = cx231xx_write_i2c_data(dev, dev_addr, saddr, 2, temp, 4); + + return status; +} + +u32 cx231xx_set_field(u32 field_mask, u32 data) +{ + u32 temp; + + for (temp = field_mask; (temp & 1) == 0; temp >>= 1) { + data <<= 1; + } + + return data; +} diff --git a/drivers/media/video/cx231xx/cx231xx-dvb.c b/drivers/media/video/cx231xx/cx231xx-dvb.c new file mode 100644 index 000000000000..46bdcecb4055 --- /dev/null +++ b/drivers/media/video/cx231xx/cx231xx-dvb.c @@ -0,0 +1,565 @@ +/* + DVB device driver for cx231xx + + Copyright (C) 2008 + Based on em28xx driver + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include + +#include "cx231xx.h" +#include +#include + +#include "xc5000.h" +#include "dvb_dummy_fe.h" + + +MODULE_DESCRIPTION("driver for cx231xx based DVB cards"); +MODULE_AUTHOR("Srinivasa Deevi "); +MODULE_LICENSE("GPL"); + +static unsigned int debug; +module_param(debug, int, 0644); +MODULE_PARM_DESC(debug, "enable debug messages [dvb]"); + +DVB_DEFINE_MOD_OPT_ADAPTER_NR(adapter_nr); + +#define dprintk(level, fmt, arg...) do { \ +if (debug >= level) \ + printk(KERN_DEBUG "%s/2-dvb: " fmt, dev->name, ## arg); \ +} while (0) + +#define CX231XX_DVB_NUM_BUFS 5 +#define CX231XX_DVB_MAX_PACKETSIZE 564 +#define CX231XX_DVB_MAX_PACKETS 64 + +struct cx231xx_dvb { + struct dvb_frontend *frontend; + + /* feed count management */ + struct mutex lock; + int nfeeds; + + /* general boilerplate stuff */ + struct dvb_adapter adapter; + struct dvb_demux demux; + struct dmxdev dmxdev; + struct dmx_frontend fe_hw; + struct dmx_frontend fe_mem; + struct dvb_net net; +}; + + +static inline void print_err_status(struct cx231xx *dev, + int packet, int status) +{ + char *errmsg = "Unknown"; + + switch (status) { + case -ENOENT: + errmsg = "unlinked synchronuously"; + break; + case -ECONNRESET: + errmsg = "unlinked asynchronuously"; + break; + case -ENOSR: + errmsg = "Buffer error (overrun)"; + break; + case -EPIPE: + errmsg = "Stalled (device not responding)"; + break; + case -EOVERFLOW: + errmsg = "Babble (bad cable?)"; + break; + case -EPROTO: + errmsg = "Bit-stuff error (bad cable?)"; + break; + case -EILSEQ: + errmsg = "CRC/Timeout (could be anything)"; + break; + case -ETIME: + errmsg = "Device does not respond"; + break; + } + if (packet < 0) { + dprintk(1, "URB status %d [%s].\n", status, errmsg); + } else { + dprintk(1, "URB packet %d, status %d [%s].\n", + packet, status, errmsg); + } +} + +static inline int dvb_isoc_copy(struct cx231xx *dev, struct urb *urb) +{ + int i; + + if (!dev) + return 0; + + if ((dev->state & DEV_DISCONNECTED) || (dev->state & DEV_MISCONFIGURED)) + return 0; + + if (urb->status < 0) { + print_err_status(dev, -1, urb->status); + if (urb->status == -ENOENT) + return 0; + } + + for (i = 0; i < urb->number_of_packets; i++) { + int status = urb->iso_frame_desc[i].status; + + if (status < 0) { + print_err_status(dev, i, status); + if (urb->iso_frame_desc[i].status != -EPROTO) + continue; + } + + dvb_dmx_swfilter(&dev->dvb->demux, urb->transfer_buffer + + urb->iso_frame_desc[i].offset, + urb->iso_frame_desc[i].actual_length); + } + + return 0; +} + +static int start_streaming(struct cx231xx_dvb *dvb) +{ + int rc; + struct cx231xx *dev = dvb->adapter.priv; + + usb_set_interface(dev->udev, 0, 1); + rc = cx231xx_set_mode(dev, CX231XX_DIGITAL_MODE); + if (rc < 0) + return rc; + + return cx231xx_init_isoc(dev, CX231XX_DVB_MAX_PACKETS, + CX231XX_DVB_NUM_BUFS, CX231XX_DVB_MAX_PACKETSIZE, + dvb_isoc_copy); +} + +static int stop_streaming(struct cx231xx_dvb *dvb) +{ + struct cx231xx *dev = dvb->adapter.priv; + + cx231xx_uninit_isoc(dev); + + cx231xx_set_mode(dev, CX231XX_SUSPEND); + + return 0; +} + +static int start_feed(struct dvb_demux_feed *feed) +{ + struct dvb_demux *demux = feed->demux; + struct cx231xx_dvb *dvb = demux->priv; + int rc, ret; + + if (!demux->dmx.frontend) + return -EINVAL; + + mutex_lock(&dvb->lock); + 
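+	/* only the first active feed starts the isoc transfer; later feeds just bump the count */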
dvb->nfeeds++; + rc = dvb->nfeeds; + + if (dvb->nfeeds == 1) { + ret = start_streaming(dvb); + if (ret < 0) + rc = ret; + } + + mutex_unlock(&dvb->lock); + return rc; +} + +static int stop_feed(struct dvb_demux_feed *feed) +{ + struct dvb_demux *demux = feed->demux; + struct cx231xx_dvb *dvb = demux->priv; + int err = 0; + + mutex_lock(&dvb->lock); + dvb->nfeeds--; + + if (0 == dvb->nfeeds) + err = stop_streaming(dvb); + + mutex_unlock(&dvb->lock); + return err; +} + + + +/* ------------------------------------------------------------------ */ +static int cx231xx_dvb_bus_ctrl(struct dvb_frontend *fe, int acquire) +{ + struct cx231xx *dev = fe->dvb->priv; + + if (acquire) + return cx231xx_set_mode(dev, CX231XX_DIGITAL_MODE); + else + return cx231xx_set_mode(dev, CX231XX_SUSPEND); +} + +/* ------------------------------------------------------------------ */ + + +static struct xc5000_config cnxt_rde250_tunerconfig = { + .i2c_address = 0x61, + .if_khz = 5380, +}; + + +/* ------------------------------------------------------------------ */ +#if 0 +static int attach_xc5000(u8 addr, struct cx231xx *dev) +{ + + struct dvb_frontend *fe; + struct xc5000_config cfg; + + memset(&cfg, 0, sizeof(cfg)); + cfg.i2c_adap = &dev->i2c_bus[1].i2c_adap; + cfg.i2c_addr = addr; + + if (!dev->dvb->frontend) { + printk(KERN_ERR "%s/2: dvb frontend not attached. " + "Can't attach xc5000\n", + dev->name); + return -EINVAL; + } + + fe = dvb_attach(xc5000_attach, dev->dvb->frontend, &cfg); + if (!fe) { + printk(KERN_ERR "%s/2: xc5000 attach failed\n", dev->name); + dvb_frontend_detach(dev->dvb->frontend); + dev->dvb->frontend = NULL; + return -EINVAL; + } + + printk(KERN_INFO "%s/2: xc5000 attached\n", dev->name); + + return 0; +} +#endif + +int cx231xx_set_analog_freq(struct cx231xx *dev, u32 freq ) +{ + int status = 0; + + if( (dev->dvb != NULL) && (dev->dvb->frontend != NULL) ){ + + struct dvb_tuner_ops *dops = &dev->dvb->frontend->ops.tuner_ops; + + if(dops->set_analog_params != NULL) { + struct analog_parameters params; + + params.frequency = freq; + params.std = dev->norm; + params.mode = 0 ; /* 0- Air; 1 - cable */ + /*params.audmode = ; */ + + /* Set the analog parameters to set the frequency */ + cx231xx_info("Setting Frequency for XC5000\n"); + dops->set_analog_params(dev->dvb->frontend, ¶ms); + } + + } + + return status; +} + +int cx231xx_reset_analog_tuner(struct cx231xx *dev) +{ + int status = 0; + + if( (dev->dvb != NULL) && (dev->dvb->frontend != NULL) ){ + + struct dvb_tuner_ops *dops = &dev->dvb->frontend->ops.tuner_ops; + + if(dops->init != NULL && !dev->xc_fw_load_done) { + + cx231xx_info("Reloading firmware for XC5000\n"); + status = dops->init(dev->dvb->frontend); + if(status == 0 ) { + dev->xc_fw_load_done = 1; + cx231xx_info("XC5000 firmware download completed\n"); + } else { + dev->xc_fw_load_done = 0; + cx231xx_info("XC5000 firmware download failed !!!\n"); + } + } + + } + + return status; +} + + +/* ------------------------------------------------------------------ */ + +static int register_dvb(struct cx231xx_dvb *dvb, + struct module *module, + struct cx231xx *dev, + struct device *device) +{ + int result; + + mutex_init(&dvb->lock); + + /* register adapter */ + result = dvb_register_adapter(&dvb->adapter, dev->name, module, device, + adapter_nr); + if (result < 0) { + printk(KERN_WARNING "%s: dvb_register_adapter failed (errno = %d)\n", + dev->name, result); + goto fail_adapter; + } + + /* Ensure all frontends negotiate bus access */ + dvb->frontend->ops.ts_bus_ctrl = cx231xx_dvb_bus_ctrl; 
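+	/* stash the cx231xx device in the adapter so bus_ctrl and the demux callbacks can reach it */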
+ + dvb->adapter.priv = dev; + + /* register frontend */ + result = dvb_register_frontend(&dvb->adapter, dvb->frontend); + if (result < 0) { + printk(KERN_WARNING "%s: dvb_register_frontend failed (errno = %d)\n", + dev->name, result); + goto fail_frontend; + } + + /* register demux stuff */ + dvb->demux.dmx.capabilities = + DMX_TS_FILTERING | DMX_SECTION_FILTERING | + DMX_MEMORY_BASED_FILTERING; + dvb->demux.priv = dvb; + dvb->demux.filternum = 256; + dvb->demux.feednum = 256; + dvb->demux.start_feed = start_feed; + dvb->demux.stop_feed = stop_feed; + + result = dvb_dmx_init(&dvb->demux); + if (result < 0) { + printk(KERN_WARNING "%s: dvb_dmx_init failed (errno = %d)\n", + dev->name, result); + goto fail_dmx; + } + + dvb->dmxdev.filternum = 256; + dvb->dmxdev.demux = &dvb->demux.dmx; + dvb->dmxdev.capabilities = 0; + result = dvb_dmxdev_init(&dvb->dmxdev, &dvb->adapter); + if (result < 0) { + printk(KERN_WARNING "%s: dvb_dmxdev_init failed (errno = %d)\n", + dev->name, result); + goto fail_dmxdev; + } + + dvb->fe_hw.source = DMX_FRONTEND_0; + result = dvb->demux.dmx.add_frontend(&dvb->demux.dmx, &dvb->fe_hw); + if (result < 0) { + printk(KERN_WARNING "%s: add_frontend failed (DMX_FRONTEND_0, errno = %d)\n", + dev->name, result); + goto fail_fe_hw; + } + + dvb->fe_mem.source = DMX_MEMORY_FE; + result = dvb->demux.dmx.add_frontend(&dvb->demux.dmx, &dvb->fe_mem); + if (result < 0) { + printk(KERN_WARNING "%s: add_frontend failed (DMX_MEMORY_FE, errno = %d)\n", + dev->name, result); + goto fail_fe_mem; + } + + result = dvb->demux.dmx.connect_frontend(&dvb->demux.dmx, &dvb->fe_hw); + if (result < 0) { + printk(KERN_WARNING "%s: connect_frontend failed (errno = %d)\n", + dev->name, result); + goto fail_fe_conn; + } + + /* register network adapter */ + dvb_net_init(&dvb->adapter, &dvb->net, &dvb->demux.dmx); + return 0; + +fail_fe_conn: + dvb->demux.dmx.remove_frontend(&dvb->demux.dmx, &dvb->fe_mem); +fail_fe_mem: + dvb->demux.dmx.remove_frontend(&dvb->demux.dmx, &dvb->fe_hw); +fail_fe_hw: + dvb_dmxdev_release(&dvb->dmxdev); +fail_dmxdev: + dvb_dmx_release(&dvb->demux); +fail_dmx: + dvb_unregister_frontend(dvb->frontend); +fail_frontend: + dvb_frontend_detach(dvb->frontend); + dvb_unregister_adapter(&dvb->adapter); +fail_adapter: + return result; +} + +static void unregister_dvb(struct cx231xx_dvb *dvb) +{ + dvb_net_release(&dvb->net); + dvb->demux.dmx.remove_frontend(&dvb->demux.dmx, &dvb->fe_mem); + dvb->demux.dmx.remove_frontend(&dvb->demux.dmx, &dvb->fe_hw); + dvb_dmxdev_release(&dvb->dmxdev); + dvb_dmx_release(&dvb->demux); + dvb_unregister_frontend(dvb->frontend); + dvb_frontend_detach(dvb->frontend); + dvb_unregister_adapter(&dvb->adapter); +} + + +static int dvb_init(struct cx231xx *dev) +{ + int result = 0; + struct cx231xx_dvb *dvb; + + if (!dev->board.has_dvb) { + /* This device does not support the extension */ + return 0; + } + + dvb = kzalloc(sizeof(struct cx231xx_dvb), GFP_KERNEL); + + if (dvb == NULL) { + printk(KERN_INFO "cx231xx_dvb: memory allocation failed\n"); + return -ENOMEM; + } + dev->dvb = dvb; + dev->cx231xx_set_analog_freq = cx231xx_set_analog_freq; + dev->cx231xx_reset_analog_tuner = cx231xx_reset_analog_tuner; + + cx231xx_set_mode(dev, CX231XX_DIGITAL_MODE); + /* init frontend */ + switch (dev->model) { + case CX231XX_BOARD_CNXT_RDE_250: + + /* dev->dvb->frontend = dvb_attach(s5h1411_attach, + &dvico_s5h1411_config, + &dev->i2c_bus[1].i2c_adap);*/ + dev->dvb->frontend = dvb_attach(dvb_dummy_fe_ofdm_attach); + + if(dev->dvb->frontend == NULL) { + printk(DRIVER_NAME 
": Failed to attach dummy front end\n"); + result = -EINVAL; + goto out_free; + } + + /* define general-purpose callback pointer */ + dvb->frontend->callback = cx231xx_tuner_callback; + + if(dvb_attach(xc5000_attach, dev->dvb->frontend, + &dev->i2c_bus[1].i2c_adap, + &cnxt_rde250_tunerconfig) < 0) { + result = -EINVAL; + goto out_free; + } + + break; + case CX231XX_BOARD_CNXT_RDU_250: + + dev->dvb->frontend = dvb_attach(dvb_dummy_fe_ofdm_attach); + + if(dev->dvb->frontend == NULL) { + printk(DRIVER_NAME ": Failed to attach dummy front end\n"); + result = -EINVAL; + goto out_free; + } + + /* define general-purpose callback pointer */ + dvb->frontend->callback = cx231xx_tuner_callback; + + if(dvb_attach(xc5000_attach, dev->dvb->frontend, + &dev->i2c_bus[1].i2c_adap, + &cnxt_rde250_tunerconfig) < 0) { + result = -EINVAL; + goto out_free; + } + break; + + default: + printk(KERN_ERR "%s/2: The frontend of your DVB/ATSC card" + " isn't supported yet\n", + dev->name); + break; + } + if (NULL == dvb->frontend) { + printk(KERN_ERR + "%s/2: frontend initialization failed\n", + dev->name); + result = -EINVAL; + goto out_free; + } + + + /* register everything */ + result = register_dvb(dvb, THIS_MODULE, dev, &dev->udev->dev); + + if (result < 0) + goto out_free; + + cx231xx_set_mode(dev, CX231XX_SUSPEND); + printk(KERN_INFO "Successfully loaded cx231xx-dvb\n"); + return 0; + +out_free: + cx231xx_set_mode(dev, CX231XX_SUSPEND); + kfree(dvb); + dev->dvb = NULL; + return result; +} + +static int dvb_fini(struct cx231xx *dev) +{ + if (!dev->board.has_dvb) { + /* This device does not support the extension */ + return 0; + } + + if (dev->dvb) { + unregister_dvb(dev->dvb); + dev->dvb = NULL; + } + + return 0; +} + +static struct cx231xx_ops dvb_ops = { + .id = CX231XX_DVB, + .name = "Cx231xx dvb Extension", + .init = dvb_init, + .fini = dvb_fini, +}; + +static int __init cx231xx_dvb_register(void) +{ + return cx231xx_register_extension(&dvb_ops); +} + +static void __exit cx231xx_dvb_unregister(void) +{ + cx231xx_unregister_extension(&dvb_ops); +} + +module_init(cx231xx_dvb_register); +module_exit(cx231xx_dvb_unregister); + diff --git a/drivers/media/video/cx231xx/cx231xx-i2c.c b/drivers/media/video/cx231xx/cx231xx-i2c.c new file mode 100644 index 000000000000..d75ed6c3c8d7 --- /dev/null +++ b/drivers/media/video/cx231xx/cx231xx-i2c.c @@ -0,0 +1,577 @@ +/* + cx231xx-i2c.c - driver for Conexant Cx23100/101/102 USB video capture devices + + Copyright (C) 2008 + Based on em28xx driver + Based on Cx23885 driver + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include "cx231xx.h" + + +/* ----------------------------------------------------------- */ + +static unsigned int i2c_scan; +module_param(i2c_scan, int, 0444); +MODULE_PARM_DESC(i2c_scan, "scan i2c bus at insmod time"); + +static unsigned int i2c_debug; +module_param(i2c_debug, int, 0644); +MODULE_PARM_DESC(i2c_debug, "enable debug messages [i2c]"); + + +#define dprintk1(lvl, fmt, args...) \ +do { \ + if (i2c_debug >= lvl) { \ + printk(fmt, ##args); \ + } \ +} while (0) + +#define dprintk2(lvl, fmt, args...) \ +do { \ + if (i2c_debug >= lvl) { \ + printk(KERN_DEBUG "%s at %s: " fmt, \ + dev->name, __func__ , ##args); \ + } \ +} while (0) + + +/* + * cx231xx_i2c_send_bytes() + */ +int cx231xx_i2c_send_bytes(struct i2c_adapter *i2c_adap, + const struct i2c_msg *msg) +{ + struct cx231xx_i2c *bus = i2c_adap->algo_data; + struct cx231xx *dev = bus->dev; + struct cx231xx_i2c_xfer_data req_data; + int status = 0; + u16 size = 0; + u8 loop = 0; + u8 saddr_len = 1; + u8 *buf_ptr = NULL; + u16 saddr = 0; + u8 need_gpio = 0; + + + if( (bus->nr ==1) && (msg->addr == 0x61) && (dev->tuner_type == TUNER_XC5000) ) { + + size = msg->len; + + if( size == 2 ) { /* register write sub addr*/ + + /* Just writing sub address will cause problem to XC5000 + So ignore the request */ + return 0; + + } else if( size == 4 ) { /* register write with sub addr*/ + + if(msg->len >= 2 ) + saddr = msg->buf[0] << 8 | msg->buf[1]; + else if ( msg->len == 1 ) + saddr = msg->buf[0]; + + switch(saddr) { + case 0x0000: /* start tuner calibration mode */ + need_gpio = 1; + dev->xc_fw_load_done = 1; /* FW Loading is done */ + break; + case 0x000D: /* Set signal source */ + case 0x0001: /* Set TV standard - Video */ + case 0x0002: /* Set TV standard - Audio */ + case 0x0003: /* Set RF Frequency */ + need_gpio = 1; + break; + default: + if(dev->xc_fw_load_done) + need_gpio = 1; + break; + } + + if(need_gpio ) { + dprintk1(1, " GPIO W R I T E : addr 0x%x, len %d, saddr 0x%x\n", + msg->addr, msg->len, saddr); + + return dev->cx231xx_gpio_i2c_write(dev, msg->addr, msg->buf, msg->len); + } + + } + + /* special case for Xc5000 tuner case */ + saddr_len = 1; + + /* adjust the length to correct length */ + size -= saddr_len; + buf_ptr = (u8*) (msg->buf + 1 ); + + do { + /* prepare xfer_data struct */ + req_data.dev_addr = msg->addr; + req_data.direction = msg->flags; + req_data.saddr_len = saddr_len; + req_data.saddr_dat = msg->buf[0]; + req_data.buf_size = size > 16 ? 16: size; + req_data.p_buffer = (u8*)(buf_ptr + loop * 16); + + bus->i2c_nostop = (size > 16) ? 1: 0; + bus->i2c_reserve = (loop == 0) ? 0: 1; + + /* usb send command */ + status = dev->cx231xx_send_usb_command(bus, &req_data); + loop++; + + if( size >= 16 ) + size -= 16; + else + size = 0; + + }while( size > 0 ); + + bus->i2c_nostop = 0; + bus->i2c_reserve = 0; + + } else { /* regular case */ + + /* prepare xfer_data struct */ + req_data.dev_addr = msg->addr; + req_data.direction = msg->flags; + req_data.saddr_len = 0; + req_data.saddr_dat = 0; + req_data.buf_size = msg->len; + req_data.p_buffer = msg->buf; + + /* usb send command */ + status = dev->cx231xx_send_usb_command(bus, &req_data); + } + + return status < 0 ? 
status: 0; +} + +/* + * cx231xx_i2c_recv_bytes() + * read a byte from the i2c device + */ +static int cx231xx_i2c_recv_bytes(struct i2c_adapter *i2c_adap, + const struct i2c_msg *msg) +{ + struct cx231xx_i2c *bus = i2c_adap->algo_data; + struct cx231xx *dev = bus->dev; + struct cx231xx_i2c_xfer_data req_data; + int status = 0; + u16 saddr = 0; + u8 need_gpio = 0; + + if((bus->nr ==1) && (msg->addr == 0x61) && dev->tuner_type == TUNER_XC5000) { + + if(msg->len == 2 ) + saddr = msg->buf[0] << 8 | msg->buf[1]; + else if ( msg->len == 1 ) + saddr = msg->buf[0]; + + if( dev->xc_fw_load_done) { + + switch(saddr) { + case 0x0009: /* BUSY check */ + dprintk1(1, " GPIO R E A D : Special case BUSY check \n"); + /* Try to read BUSY register, just set it to zero */ + msg->buf[0] = 0; + if(msg->len == 2 ) + msg->buf[1] = 0; + return 0; + case 0x0004: /* read Lock status */ + need_gpio = 1; + break; + + } + + if(need_gpio) { + /* this is a special case to handle Xceive tuner clock stretch issue + with gpio based I2C interface */ + dprintk1(1, " GPIO R E A D : addr 0x%x, len %d, saddr 0x%x\n", + msg->addr, msg->len, msg->buf[0] << 8| msg->buf[1]); + status = dev->cx231xx_gpio_i2c_write(dev, msg->addr, msg->buf, msg->len); + status = dev->cx231xx_gpio_i2c_read(dev, msg->addr, msg->buf, msg->len); + return status; + } + } + + /* prepare xfer_data struct */ + req_data.dev_addr = msg->addr; + req_data.direction = msg->flags; + req_data.saddr_len = msg->len; + req_data.saddr_dat = msg->buf[0] << 8 | msg->buf[1]; + req_data.buf_size = msg->len; + req_data.p_buffer = msg->buf; + + /* usb send command */ + status = dev->cx231xx_send_usb_command(bus, &req_data); + + } else { + + /* prepare xfer_data struct */ + req_data.dev_addr = msg->addr; + req_data.direction = msg->flags; + req_data.saddr_len = 0; + req_data.saddr_dat = 0; + req_data.buf_size = msg->len; + req_data.p_buffer = msg->buf; + + /* usb send command */ + status = dev->cx231xx_send_usb_command(bus, &req_data); + } + + return status < 0 ? status: 0; +} + +/* + * cx231xx_i2c_recv_bytes_with_saddr() + * read a byte from the i2c device + */ +static int cx231xx_i2c_recv_bytes_with_saddr(struct i2c_adapter *i2c_adap, + const struct i2c_msg *msg1, const struct i2c_msg *msg2) +{ + struct cx231xx_i2c *bus = i2c_adap->algo_data; + struct cx231xx *dev = bus->dev; + struct cx231xx_i2c_xfer_data req_data; + int status = 0; + u16 saddr = 0; + u8 need_gpio = 0; + + if(msg1->len == 2 ) + saddr = msg1->buf[0] << 8 | msg1->buf[1]; + else if ( msg1->len == 1 ) + saddr = msg1->buf[0]; + + if ( (bus->nr ==1) && (msg2->addr == 0x61) && dev->tuner_type == TUNER_XC5000) { + + if( (msg2->len < 16) ) { + + dprintk1(1, " i2c_read : addr 0x%x, len %d, subaddr 0x%x, leng %d\n", + msg2->addr, msg2->len, saddr, msg1->len); + + switch(saddr) { + case 0x0008: /* read FW load status */ + need_gpio = 1; + break; + case 0x0004: /* read Lock status */ + need_gpio = 1; + break; + } + + if(need_gpio ) { + status = dev->cx231xx_gpio_i2c_write(dev, msg1->addr, msg1->buf, msg1->len); + status = dev->cx231xx_gpio_i2c_read(dev, msg2->addr, msg2->buf, msg2->len); + return status; + } + } + } + + /* prepare xfer_data struct */ + req_data.dev_addr = msg2->addr; + req_data.direction = msg2->flags; + req_data.saddr_len = msg1->len; + req_data.saddr_dat = saddr; + req_data.buf_size = msg2->len; + req_data.p_buffer = msg2->buf; + + /* usb send command */ + status = dev->cx231xx_send_usb_command(bus, &req_data); + + return status < 0 ? 
status: 0; +} + +/* + * cx231xx_i2c_check_for_device() + * check if there is a i2c_device at the supplied address + */ +static int cx231xx_i2c_check_for_device(struct i2c_adapter *i2c_adap, + const struct i2c_msg *msg) +{ + struct cx231xx_i2c *bus = i2c_adap->algo_data; + struct cx231xx *dev = bus->dev; + struct cx231xx_i2c_xfer_data req_data; + int status = 0; + + /* prepare xfer_data struct */ + req_data.dev_addr = msg->addr; + req_data.direction = msg->flags; + req_data.saddr_len = 0; + req_data.saddr_dat = 0; + req_data.buf_size = 0; + req_data.p_buffer = NULL; + + /* usb send command */ + status = dev->cx231xx_send_usb_command(bus, &req_data); + + return status < 0 ? status: 0; +} + +/* + * cx231xx_i2c_xfer() + * the main i2c transfer function + */ +static int cx231xx_i2c_xfer(struct i2c_adapter *i2c_adap, + struct i2c_msg msgs[], int num) +{ + struct cx231xx_i2c *bus = i2c_adap->algo_data; + struct cx231xx *dev = bus->dev; + int addr, rc, i, byte; + + if (num <= 0) + return 0; + + for (i = 0; i < num; i++) { + + addr = msgs[i].addr >> 1; + + dprintk2(2, "%s %s addr=%x len=%d:", + (msgs[i].flags & I2C_M_RD) ? "read" : "write", + i == num - 1 ? "stop" : "nonstop", addr, msgs[i].len); + if (!msgs[i].len) { /* no len: check only for device presence */ + rc = cx231xx_i2c_check_for_device(i2c_adap, &msgs[i]); + if (rc < 0) { + dprintk2(2, " no device\n"); + return rc; + } + + } else if (msgs[i].flags & I2C_M_RD) { + /* read bytes */ + rc = cx231xx_i2c_recv_bytes(i2c_adap, &msgs[i]); + if (i2c_debug >= 2) { + for (byte = 0; byte < msgs[i].len; byte++) + printk(" %02x", msgs[i].buf[byte]); + } + } else if (i + 1 < num && (msgs[i + 1].flags & I2C_M_RD) && + msgs[i].addr == msgs[i + 1].addr && (msgs[i].len <= 2) && (bus->nr < 2)) { + /* read bytes */ + rc = cx231xx_i2c_recv_bytes_with_saddr(i2c_adap, &msgs[i], &msgs[i+1]); + if (i2c_debug >= 2) { + for (byte = 0; byte < msgs[i].len; byte++) + printk(" %02x", msgs[i].buf[byte]); + } + i++; + } else { + /* write bytes */ + if (i2c_debug >= 2) { + for (byte = 0; byte < msgs[i].len; byte++) + printk(" %02x", msgs[i].buf[byte]); + } + rc = cx231xx_i2c_send_bytes(i2c_adap,&msgs[i]); + } + if (rc < 0) + goto err; + if (i2c_debug >= 2) + printk("\n"); + } + + return num; +err: + dprintk2(2, " ERROR: %i\n", rc); + return rc; +} + +/* ----------------------------------------------------------- */ + +/* + * functionality() + */ +static u32 functionality(struct i2c_adapter *adap) +{ + return I2C_FUNC_SMBUS_EMUL | I2C_FUNC_I2C; +} + +/* + * attach_inform() + * gets called when a device attaches to the i2c bus + * does some basic configuration + */ +static int attach_inform(struct i2c_client *client) +{ + struct cx231xx_i2c *bus = i2c_get_adapdata(client->adapter); + struct cx231xx *dev = bus->dev; + + switch (client->addr << 1) { + case 0x32: + dprintk1(1, "attach_inform: Geminit III detected.\n"); + break; + case 0x02: + dprintk1(1, "attach_inform: Acquarius detected.\n"); + break; + case 0xa0: + dprintk1(1, "attach_inform: eeprom detected.\n"); + break; + case 0x60: + dprintk1(1, "attach_inform: Colibri detected.\n"); + break; + case 0x8e: + { + struct IR_i2c *ir = i2c_get_clientdata(client); + dprintk1(1, "attach_inform: IR detected (%s).\n", + ir->phys); + cx231xx_set_ir(dev, ir); + break; + } + case 0x80: + case 0x88: + dprintk1(1, "attach_inform: Hammerhead detected.\n"); + break; + + default: + if (!dev->tuner_addr) + dev->tuner_addr = client->addr; + + dprintk1(1, "attach inform: detected I2C address %x\n", + client->addr << 1); + } + + return 0; 
+} + +static int detach_inform(struct i2c_client *client) +{ + dprintk1(1, "i2c detach [client=%s]\n", client->name); + return 0; +} + + +static struct i2c_algorithm cx231xx_algo = { + .master_xfer = cx231xx_i2c_xfer, + .functionality = functionality, +}; + +static struct i2c_adapter cx231xx_adap_template = { + .owner = THIS_MODULE, + .class = I2C_CLASS_TV_ANALOG, + .name = "cx231xx", + .id = I2C_HW_B_CX231XX, + .algo = &cx231xx_algo, + .client_register = attach_inform, + .client_unregister = detach_inform, +}; + +static struct i2c_client cx231xx_client_template = { + .name = "cx231xx internal", +}; + +/* ----------------------------------------------------------- */ + +/* + * i2c_devs + * incomplete list of known devices + */ +static char *i2c_devs[128] = { + [0x60 >> 1] = "colibri", + [0x88 >> 1] = "hammerhead", + [0x8e >> 1] = "CIR", + [0x32 >> 1] = "GeminiIII", + [0x02 >> 1] = "Aquarius", + [0xa0 >> 1] = "eeprom", + [0xc0 >> 1] = "tuner/XC3028", + [0xc2 >> 1] = "tuner/XC5000", +}; + +/* + * cx231xx_do_i2c_scan() + * check i2c address range for devices + */ +void cx231xx_do_i2c_scan(struct cx231xx *dev, struct i2c_client *c) +{ + unsigned char buf; + int i, rc; + + cx231xx_info(": Checking for I2C devices ..\n"); + for (i = 0; i < 128; i++) { + c->addr = i; + rc = i2c_master_recv(c, &buf, 0); + if (rc < 0) + continue; + cx231xx_info("%s: i2c scan: found device @ 0x%x [%s]\n", + dev->name, i << 1, i2c_devs[i] ? i2c_devs[i] : "???"); + } + cx231xx_info(": Completed Checking for I2C devices.\n"); +} + +/* + * cx231xx_i2c_call_clients() + * send commands to all attached i2c devices + */ +void cx231xx_i2c_call_clients(struct cx231xx_i2c *bus, unsigned int cmd, void *arg) +{ + /* struct cx231xx *dev = bus->dev; */ + + BUG_ON(NULL == bus->i2c_adap.algo_data); + i2c_clients_command(&bus->i2c_adap, cmd, arg); +} + +/* + * cx231xx_i2c_register() + * register i2c bus + */ +int cx231xx_i2c_register(struct cx231xx_i2c *bus) +{ + struct cx231xx *dev = bus->dev; + + BUG_ON(!dev->cx231xx_send_usb_command); + + cx231xx_info("%s(bus = %d)\n", __func__, bus->nr); + + memcpy(&bus->i2c_adap, &cx231xx_adap_template, + sizeof(bus->i2c_adap)); + memcpy(&bus->i2c_algo, &cx231xx_algo, + sizeof(bus->i2c_algo)); + memcpy(&bus->i2c_client, &cx231xx_client_template, + sizeof(bus->i2c_client)); + + bus->i2c_adap.dev.parent = &dev->udev->dev; + + strlcpy(bus->i2c_adap.name, bus->dev->name, + sizeof(bus->i2c_adap.name)); + + bus->i2c_algo.data = bus; + bus->i2c_adap.algo_data = bus; + i2c_set_adapdata(&bus->i2c_adap, bus); + i2c_add_adapter(&bus->i2c_adap); + + bus->i2c_client.adapter = &bus->i2c_adap; + + if (0 == bus->i2c_rc) { + cx231xx_info("%s: i2c bus %d registered\n", dev->name, bus->nr); + if (i2c_scan) + cx231xx_do_i2c_scan(dev, &bus->i2c_client); + } else + cx231xx_warn("%s: i2c bus %d register FAILED\n", + dev->name, bus->nr); + + return bus->i2c_rc; +} + +/* + * cx231xx_i2c_unregister() + * unregister i2c_bus + */ +int cx231xx_i2c_unregister(struct cx231xx_i2c *bus) +{ + i2c_del_adapter(&bus->i2c_adap); + return 0; +} diff --git a/drivers/media/video/cx231xx/cx231xx-input.c b/drivers/media/video/cx231xx/cx231xx-input.c new file mode 100644 index 000000000000..fdc37a838d10 --- /dev/null +++ b/drivers/media/video/cx231xx/cx231xx-input.c @@ -0,0 +1,250 @@ +/* + handle cx231xx IR remotes via linux kernel input layer. 
+ + Copyright (C) 2008 + Based on em28xx driver + + < This is a place holder for IR now.> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include + +#include "cx231xx.h" + + +static unsigned int ir_debug; +module_param(ir_debug, int, 0644); +MODULE_PARM_DESC(ir_debug, "enable debug messages [IR]"); + +#define i2cdprintk(fmt, arg...) \ + if (ir_debug) { \ + printk(KERN_DEBUG "%s/ir: " fmt, ir->c.name , ## arg); \ + } + +#define dprintk(fmt, arg...) \ + if (ir_debug) { \ + printk(KERN_DEBUG "%s/ir: " fmt, ir->name , ## arg); \ + } + +/********************************************************** + Polling structure used by cx231xx IR's + **********************************************************/ + +struct cx231xx_ir_poll_result { + unsigned int toggle_bit:1; + unsigned int read_count:7; + u8 rc_address; + u8 rc_data[4]; +}; + +struct cx231xx_IR { + struct cx231xx *dev; + struct input_dev *input; + struct ir_input_state ir; + char name[32]; + char phys[32]; + + /* poll external decoder */ + int polling; + struct work_struct work; + struct timer_list timer; + unsigned int last_toggle:1; + unsigned int last_readcount; + unsigned int repeat_interval; + + int (*get_key)(struct cx231xx_IR *, struct cx231xx_ir_poll_result *); +}; + + + +/********************************************************** + Polling code for cx231xx + **********************************************************/ + +static void cx231xx_ir_handle_key(struct cx231xx_IR *ir) +{ + int result; + int do_sendkey = 0; + struct cx231xx_ir_poll_result poll_result; + + /* read the registers containing the IR status */ + result = ir->get_key(ir, &poll_result); + if (result < 0) { + dprintk("ir->get_key() failed %d\n", result); + return; + } + + dprintk("ir->get_key result tb=%02x rc=%02x lr=%02x data=%02x\n", + poll_result.toggle_bit, poll_result.read_count, + ir->last_readcount, poll_result.rc_data[0]); + + if (ir->dev->chip_id == CHIP_ID_EM2874) { + /* The em2874 clears the readcount field every time the + register is read. The em2860/2880 datasheet says that it + is supposed to clear the readcount, but it doesn't. 
So with + the em2874, we are looking for a non-zero read count as + opposed to a readcount that is incrementing */ + ir->last_readcount = 0; + } + + if (poll_result.read_count == 0) { + /* The button has not been pressed since the last read */ + } else if (ir->last_toggle != poll_result.toggle_bit) { + /* A button has been pressed */ + dprintk("button has been pressed\n"); + ir->last_toggle = poll_result.toggle_bit; + ir->repeat_interval = 0; + do_sendkey = 1; + } else if (poll_result.toggle_bit == ir->last_toggle && + poll_result.read_count > 0 && + poll_result.read_count != ir->last_readcount) { + /* The button is still being held down */ + dprintk("button being held down\n"); + + /* Debouncer for first keypress */ + if (ir->repeat_interval++ > 9) { + /* Start repeating after 1 second */ + do_sendkey = 1; + } + } + + if (do_sendkey) { + dprintk("sending keypress\n"); + ir_input_keydown(ir->input, &ir->ir, poll_result.rc_data[0], + poll_result.rc_data[0]); + ir_input_nokey(ir->input, &ir->ir); + } + + ir->last_readcount = poll_result.read_count; + return; +} + +static void ir_timer(unsigned long data) +{ + struct cx231xx_IR *ir = (struct cx231xx_IR *)data; + + schedule_work(&ir->work); +} + +static void cx231xx_ir_work(struct work_struct *work) +{ + struct cx231xx_IR *ir = container_of(work, struct cx231xx_IR, work); + + cx231xx_ir_handle_key(ir); + mod_timer(&ir->timer, jiffies + msecs_to_jiffies(ir->polling)); +} + +void cx231xx_ir_start(struct cx231xx_IR *ir) +{ + setup_timer(&ir->timer, ir_timer, (unsigned long)ir); + INIT_WORK(&ir->work, cx231xx_ir_work); + schedule_work(&ir->work); +} + +static void cx231xx_ir_stop(struct cx231xx_IR *ir) +{ + del_timer_sync(&ir->timer); + flush_scheduled_work(); +} + +int cx231xx_ir_init(struct cx231xx *dev) +{ + struct cx231xx_IR *ir; + struct input_dev *input_dev; + u8 ir_config; + int err = -ENOMEM; + + if (dev->board.ir_codes == NULL) { + /* No remote control support */ + return 0; + } + + ir = kzalloc(sizeof(*ir), GFP_KERNEL); + input_dev = input_allocate_device(); + if (!ir || !input_dev) + goto err_out_free; + + ir->input = input_dev; + + /* Setup the proper handler based on the chip */ + switch (dev->chip_id) { + default: + printk("Unrecognized cx231xx chip id: IR not supported\n"); + goto err_out_free; + } + + /* This is how often we ask the chip for IR information */ + ir->polling = 100; /* ms */ + + /* init input device */ + snprintf(ir->name, sizeof(ir->name), "cx231xx IR (%s)", + dev->name); + + usb_make_path(dev->udev, ir->phys, sizeof(ir->phys)); + strlcat(ir->phys, "/input0", sizeof(ir->phys)); + + ir_input_init(input_dev, &ir->ir, IR_TYPE_OTHER, dev->board.ir_codes); + input_dev->name = ir->name; + input_dev->phys = ir->phys; + input_dev->id.bustype = BUS_USB; + input_dev->id.version = 1; + input_dev->id.vendor = le16_to_cpu(dev->udev->descriptor.idVendor); + input_dev->id.product = le16_to_cpu(dev->udev->descriptor.idProduct); + + input_dev->dev.parent = &dev->udev->dev; + /* record handles to ourself */ + ir->dev = dev; + dev->ir = ir; + + cx231xx_ir_start(ir); + + /* all done */ + err = input_register_device(ir->input); + if (err) + goto err_out_stop; + + return 0; + err_out_stop: + cx231xx_ir_stop(ir); + dev->ir = NULL; + err_out_free: + input_free_device(input_dev); + kfree(ir); + return err; +} + +int cx231xx_ir_fini(struct cx231xx *dev) +{ + struct cx231xx_IR *ir = dev->ir; + + /* skip detach on non attached boards */ + if (!ir) + return 0; + + cx231xx_ir_stop(ir); + input_unregister_device(ir->input); + kfree(ir); + + /* 
done */ + dev->ir = NULL; + return 0; +} diff --git a/drivers/media/video/cx231xx/cx231xx-reg.h b/drivers/media/video/cx231xx/cx231xx-reg.h new file mode 100644 index 000000000000..ef24781418e4 --- /dev/null +++ b/drivers/media/video/cx231xx/cx231xx-reg.h @@ -0,0 +1,1574 @@ +/* + cx231xx-reg.h - driver for Conexant Cx23100/101/102 USB video capture devices + + Copyright (C) 2008 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#ifndef _CX231XX_REG_H +#define _CX231XX_REG_H + +/***************************************************************************** + * VBI codes * +*****************************************************************************/ + +#define SAV_ACTIVE_VIDEO_FIELD1 0x80 +#define EAV_ACTIVE_VIDEO_FIELD1 0x90 + +#define SAV_ACTIVE_VIDEO_FIELD2 0xC0 +#define EAV_ACTIVE_VIDEO_FIELD2 0xD0 + +#define SAV_VBLANK_FIELD1 0xA0 +#define EAV_VBLANK_FIELD1 0xB0 + +#define SAV_VBLANK_FIELD2 0xE0 +#define EAV_VBLANK_FIELD2 0xF0 + +#define SAV_VBI_FIELD1 0x20 +#define EAV_VBI_FIELD1 0x30 + +#define SAV_VBI_FIELD2 0x60 +#define EAV_VBI_FIELD2 0x70 + +/*****************************************************************************/ +/* Audio ADC Registers */ +#define CH_PWR_CTRL1 0x0000000E +#define CH_PWR_CTRL2 0x0000000F +/*****************************************************************************/ + +#define HOST_REG1 0x000 +#define FLD_FORCE_CHIP_SEL 0x80 +#define FLD_AUTO_INC_DIS 0x20 +#define FLD_PREFETCH_EN 0x10 +/* Reserved [2:3] */ +#define FLD_DIGITAL_PWR_DN 0x02 +#define FLD_SLEEP 0x01 + +/*****************************************************************************/ +#define HOST_REG2 0x001 + + +/*****************************************************************************/ +#define HOST_REG3 0x002 + +/*****************************************************************************/ +/* added for polaris */ +#define GPIO_PIN_CTL0 0x3 +#define GPIO_PIN_CTL1 0x4 +#define GPIO_PIN_CTL2 0x5 +#define GPIO_PIN_CTL3 0x6 +#define TS1_PIN_CTL0 0x7 +#define TS1_PIN_CTL1 0x8 +/*****************************************************************************/ + +#define FLD_CLK_IN_EN 0x80 +#define FLD_XTAL_CTRL 0x70 +#define FLD_BB_CLK_MODE 0x0C +#define FLD_REF_DIV_PLL 0x02 +#define FLD_REF_SEL_PLL1 0x01 + +/*****************************************************************************/ +#define CHIP_CTRL 0x100 +/* Reserved [27] */ +/* Reserved [31:21] */ +#define FLD_CHIP_ACFG_DIS 0x00100000 +/* Reserved [19] */ +#define FLD_DUAL_MODE_ADC2 0x00040000 +#define FLD_SIF_EN 0x00020000 +#define FLD_SOFT_RST 0x00010000 +#define FLD_DEVICE_ID 0x0000FFFF + +/*****************************************************************************/ +#define AFE_CTRL 0x104 +#define AFE_CTRL_C2HH_SRC_CTRL 0x104 +#define FLD_DIF_OUT_SEL 0xC0000000 +#define FLD_AUX_PLL_CLK_ALT_SEL 0x3C000000 +#define FLD_UV_ORDER_MODE 0x02000000 +#define FLD_FUNC_MODE 0x01800000 +#define FLD_ROT1_PHASE_CTL 
0x007F8000 +#define FLD_AUD_IN_SEL 0x00004000 +#define FLD_LUMA_IN_SEL 0x00002000 +#define FLD_CHROMA_IN_SEL 0x00001000 +/* reserve [11:10] */ +#define FLD_INV_SPEC_DIS 0x00000200 +#define FLD_VGA_SEL_CH3 0x00000100 +#define FLD_VGA_SEL_CH2 0x00000080 +#define FLD_VGA_SEL_CH1 0x00000040 +#define FLD_DCR_BYP_CH1 0x00000020 +#define FLD_DCR_BYP_CH2 0x00000010 +#define FLD_DCR_BYP_CH3 0x00000008 +#define FLD_EN_12DB_CH3 0x00000004 +#define FLD_EN_12DB_CH2 0x00000002 +#define FLD_EN_12DB_CH1 0x00000001 + +/* redefine in Cx231xx */ +/*****************************************************************************/ +#define DC_CTRL1 0x108 +/* reserve [31:30] */ +#define FLD_CLAMP_LVL_CH1 0x3FFF8000 +#define FLD_CLAMP_LVL_CH2 0x00007FFF +/*****************************************************************************/ + +/*****************************************************************************/ +#define DC_CTRL2 0x10c +/* reserve [31:28] */ +#define FLD_CLAMP_LVL_CH3 0x00FFFE00 +#define FLD_CLAMP_WIND_LENTH 0x000001E0 +#define FLD_C2HH_SAT_MIN 0x0000001E +#define FLD_FLT_BYP_SEL 0x00000001 +/*****************************************************************************/ + +/*****************************************************************************/ +#define DC_CTRL3 0x110 +/* reserve [31:16] */ +#define FLD_ERR_GAIN_CTL 0x00070000 +#define FLD_LPF_MIN 0x0000FFFF +/*****************************************************************************/ + +/*****************************************************************************/ +#define DC_CTRL4 0x114 +/* reserve [31:31] */ +#define FLD_INTG_CH1 0x7FFFFFFF +/*****************************************************************************/ + +/*****************************************************************************/ +#define DC_CTRL5 0x118 +/* reserve [31:31] */ +#define FLD_INTG_CH2 0x7FFFFFFF +/*****************************************************************************/ + +/*****************************************************************************/ +#define DC_CTRL6 0x11c +/* reserve [31:31] */ +#define FLD_INTG_CH3 0x7FFFFFFF +/*****************************************************************************/ + +/*****************************************************************************/ +#define PIN_CTRL 0x120 +#define FLD_OEF_AGC_RF 0x00000001 +#define FLD_OEF_AGC_IFVGA 0x00000002 +#define FLD_OEF_AGC_IF 0x00000004 +#define FLD_REG_BO_PUD 0x80000000 +#define FLD_IR_IRQ_STAT 0x40000000 +#define FLD_AUD_IRQ_STAT 0x20000000 +#define FLD_VID_IRQ_STAT 0x10000000 +/* Reserved [27:26] */ +#define FLD_IRQ_N_OUT_EN 0x02000000 +#define FLD_IRQ_N_POLAR 0x01000000 +/* Reserved [23:6] */ +#define FLD_OE_AUX_PLL_CLK 0x00000020 +#define FLD_OE_I2S_BCLK 0x00000010 +#define FLD_OE_I2S_WCLK 0x00000008 +#define FLD_OE_AGC_IF 0x00000004 +#define FLD_OE_AGC_IFVGA 0x00000002 +#define FLD_OE_AGC_RF 0x00000001 + +/*****************************************************************************/ +#define AUD_IO_CTRL 0x124 +/* Reserved [31:8] */ +#define FLD_I2S_PORT_DIR 0x00000080 +#define FLD_I2S_OUT_SRC 0x00000040 +#define FLD_AUD_CHAN3_SRC 0x00000030 +#define FLD_AUD_CHAN2_SRC 0x0000000C +#define FLD_AUD_CHAN1_SRC 0x00000003 + +/*****************************************************************************/ +#define AUD_LOCK1 0x128 +#define FLD_AUD_LOCK_KI_SHIFT 0xC0000000 +#define FLD_AUD_LOCK_KD_SHIFT 0x30000000 +/* Reserved [27:25] */ +#define FLD_EN_AV_LOCK 0x01000000 +#define FLD_VID_COUNT 0x00FFFFFF + 
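The FLD_* constants throughout this header are contiguous bit masks inside 32-bit registers, not bit numbers, so a driver is expected to read-modify-write the whole register and shift values into place by the mask's lowest set bit. A minimal sketch of that pattern follows; field_get(), field_set() and the reg_read()/reg_write() accessors are illustrative placeholders and are not defined by this patch.

#include <linux/bitops.h>       /* __ffs() */
#include <linux/types.h>

/* Extract a field selected by a contiguous mask, e.g. FLD_VID_COUNT. */
static inline u32 field_get(u32 reg_val, u32 mask)
{
        return (reg_val & mask) >> __ffs(mask);
}

/* Replace only the bits covered by the mask, leaving the others intact. */
static inline u32 field_set(u32 reg_val, u32 mask, u32 field_val)
{
        return (reg_val & ~mask) | ((field_val << __ffs(mask)) & mask);
}

/* Hypothetical use: update FLD_VID_COUNT in AUD_LOCK1 without disturbing
 * the KI/KD shift fields that share the register. */
static void set_vid_count(struct cx231xx *dev, u32 count)
{
        u32 val = reg_read(dev, AUD_LOCK1);     /* placeholder accessor */

        val = field_set(val, FLD_VID_COUNT, count);
        reg_write(dev, AUD_LOCK1, val);         /* placeholder accessor */
}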
+/*****************************************************************************/ +#define AUD_LOCK2 0x12C +#define FLD_AUD_LOCK_KI_MULT 0xF0000000 +#define FLD_AUD_LOCK_KD_MULT 0x0F000000 +/* Reserved [23:22] */ +#define FLD_AUD_LOCK_FREQ_SHIFT 0x00300000 +#define FLD_AUD_COUNT 0x000FFFFF + +/*****************************************************************************/ +#define AFE_DIAG_CTRL1 0x134 +/* Reserved [31:16] */ +#define FLD_CUV_DLY_LENGTH 0x0000FF00 +#define FLD_YC_DLY_LENGTH 0x000000FF + +/*****************************************************************************/ +/* Poalris redefine */ +#define AFE_DIAG_CTRL3 0x138 +/* Reserved [31:26] */ +#define FLD_AUD_DUAL_FLAG_POL 0x02000000 +#define FLD_VID_DUAL_FLAG_POL 0x01000000 +/* Reserved [23:23] */ +#define FLD_COL_CLAMP_DIS_CH1 0x00400000 +#define FLD_COL_CLAMP_DIS_CH2 0x00200000 +#define FLD_COL_CLAMP_DIS_CH3 0x00100000 + +#define TEST_CTRL1 0x144 +/* Reserved [31:29] */ +#define FLD_LBIST_EN 0x10000000 +/* Reserved [27:10] */ +#define FLD_FI_BIST_INTR_R 0x0000200 +#define FLD_FI_BIST_INTR_L 0x0000100 +#define FLD_BIST_FAIL_AUD_PLL 0x0000080 +#define FLD_BIST_INTR_AUD_PLL 0x0000040 +#define FLD_BIST_FAIL_VID_PLL 0x0000020 +#define FLD_BIST_INTR_VID_PLL 0x0000010 +/* Reserved [3:1] */ +#define FLD_CIR_TEST_DIS 0x00000001 + + +/*****************************************************************************/ +#define TEST_CTRL2 0x148 +#define FLD_TSXCLK_POL_CTL 0x80000000 +#define FLD_ISO_CTL_SEL 0x40000000 +#define FLD_ISO_CTL_EN 0x20000000 +#define FLD_BIST_DEBUGZ 0x10000000 +#define FLD_AUD_BIST_TEST_H 0x0F000000 +/* Reserved [23:22] */ +#define FLD_FLTRN_BIST_TEST_H 0x00020000 +#define FLD_VID_BIST_TEST_H 0x00010000 +/* Reserved [19:17] */ +#define FLD_BIST_TEST_H 0x00010000 +/* Reserved [15:13] */ +#define FLD_TAB_EN 0x00001000 +/* Reserved [11:0] */ + +/*****************************************************************************/ +#define BIST_STAT 0x14C +#define FLD_AUD_BIST_FAIL_H 0xFFF00000 +#define FLD_FLTRN_BIST_FAIL_H 0x00180000 +#define FLD_VID_BIST_FAIL_H 0x00070000 +#define FLD_AUD_BIST_TST_DONE 0x0000FFF0 +#define FLD_FLTRN_BIST_TST_DONE 0x00000008 +#define FLD_VID_BIST_TST_DONE 0x00000007 + + +/*****************************************************************************/ +/* DirectIF registers definition have been moved to DIF_reg.h */ +/*****************************************************************************/ +#define MODE_CTRL 0x400 +#define FLD_AFD_PAL60_DIS 0x20000000 +#define FLD_AFD_FORCE_SECAM 0x10000000 +#define FLD_AFD_FORCE_PALNC 0x08000000 +#define FLD_AFD_FORCE_PAL 0x04000000 +#define FLD_AFD_PALM_SEL 0x03000000 +#define FLD_CKILL_MODE 0x00300000 +#define FLD_COMB_NOTCH_MODE 0x00c00000 /* bit[19:18] */ +#define FLD_CLR_LOCK_STAT 0x00020000 +#define FLD_FAST_LOCK_MD 0x00010000 +#define FLD_WCEN 0x00008000 +#define FLD_CAGCEN 0x00004000 +#define FLD_CKILLEN 0x00002000 +#define FLD_AUTO_SC_LOCK 0x00001000 +#define FLD_MAN_SC_FAST_LOCK 0x00000800 +#define FLD_INPUT_MODE 0x00000600 +#define FLD_AFD_ACQUIRE 0x00000100 +#define FLD_AFD_NTSC_SEL 0x00000080 +#define FLD_AFD_PAL_SEL 0x00000040 +#define FLD_ACFG_DIS 0x00000020 +#define FLD_SQ_PIXEL 0x00000010 +#define FLD_VID_FMT_SEL 0x0000000F + +/*****************************************************************************/ +#define OUT_CTRL1 0x404 +#define FLD_POLAR 0x7F000000 +/* Reserved [23] */ +#define FLD_RND_MODE 0x00600000 +#define FLD_VIPCLAMP_EN 0x00100000 +#define FLD_VIPBLANK_EN 0x00080000 +#define FLD_VIP_OPT_AL 0x00040000 +#define 
FLD_IDID0_SOURCE 0x00020000 +#define FLD_DCMODE 0x00010000 +#define FLD_CLK_GATING 0x0000C000 +#define FLD_CLK_INVERT 0x00002000 +#define FLD_HSFMT 0x00001000 +#define FLD_VALIDFMT 0x00000800 +#define FLD_ACTFMT 0x00000400 +#define FLD_SWAPRAW 0x00000200 +#define FLD_CLAMPRAW_EN 0x00000100 +#define FLD_BLUE_FIELD_EN 0x00000080 +#define FLD_BLUE_FIELD_ACT 0x00000040 +#define FLD_TASKBIT_VAL 0x00000020 +#define FLD_ANC_DATA_EN 0x00000010 +#define FLD_VBIHACTRAW_EN 0x00000008 +#define FLD_MODE10B 0x00000004 +#define FLD_OUT_MODE 0x00000003 + +/*****************************************************************************/ +#define OUT_CTRL2 0x408 +#define FLD_AUD_GRP 0xC0000000 +#define FLD_SAMPLE_RATE 0x30000000 +#define FLD_AUD_ANC_EN 0x08000000 +#define FLD_EN_C 0x04000000 +#define FLD_EN_B 0x02000000 +#define FLD_EN_A 0x01000000 +/* Reserved [23:20] */ +#define FLD_IDID1_LSB 0x000C0000 +#define FLD_IDID0_LSB 0x00030000 +#define FLD_IDID1_MSB 0x0000FF00 +#define FLD_IDID0_MSB 0x000000FF + +/*****************************************************************************/ +#define GEN_STAT 0x40C +#define FLD_VCR_DETECT 0x00800000 +#define FLD_SPECIAL_PLAY_N 0x00400000 +#define FLD_VPRES 0x00200000 +#define FLD_AGC_LOCK 0x00100000 +#define FLD_CSC_LOCK 0x00080000 +#define FLD_VLOCK 0x00040000 +#define FLD_SRC_LOCK 0x00020000 +#define FLD_HLOCK 0x00010000 +#define FLD_VSYNC_N 0x00008000 +#define FLD_SRC_FIFO_UFLOW 0x00004000 +#define FLD_SRC_FIFO_OFLOW 0x00002000 +#define FLD_FIELD 0x00001000 +#define FLD_AFD_FMT_STAT 0x00000F00 +#define FLD_MV_TYPE2_PAIR 0x00000080 +#define FLD_MV_T3CS 0x00000040 +#define FLD_MV_CS 0x00000020 +#define FLD_MV_PSP 0x00000010 +/* Reserved [3] */ +#define FLD_MV_CDAT 0x00000003 + +/*****************************************************************************/ +#define INT_STAT_MASK 0x410 +#define FLD_COMB_3D_FIFO_MSK 0x80000000 +#define FLD_WSS_DAT_AVAIL_MSK 0x40000000 +#define FLD_GS2_DAT_AVAIL_MSK 0x20000000 +#define FLD_GS1_DAT_AVAIL_MSK 0x10000000 +#define FLD_CC_DAT_AVAIL_MSK 0x08000000 +#define FLD_VPRES_CHANGE_MSK 0x04000000 +#define FLD_MV_CHANGE_MSK 0x02000000 +#define FLD_END_VBI_EVEN_MSK 0x01000000 +#define FLD_END_VBI_ODD_MSK 0x00800000 +#define FLD_FMT_CHANGE_MSK 0x00400000 +#define FLD_VSYNC_TRAIL_MSK 0x00200000 +#define FLD_HLOCK_CHANGE_MSK 0x00100000 +#define FLD_VLOCK_CHANGE_MSK 0x00080000 +#define FLD_CSC_LOCK_CHANGE_MSK 0x00040000 +#define FLD_SRC_FIFO_UFLOW_MSK 0x00020000 +#define FLD_SRC_FIFO_OFLOW_MSK 0x00010000 +#define FLD_COMB_3D_FIFO_STAT 0x00008000 +#define FLD_WSS_DAT_AVAIL_STAT 0x00004000 +#define FLD_GS2_DAT_AVAIL_STAT 0x00002000 +#define FLD_GS1_DAT_AVAIL_STAT 0x00001000 +#define FLD_CC_DAT_AVAIL_STAT 0x00000800 +#define FLD_VPRES_CHANGE_STAT 0x00000400 +#define FLD_MV_CHANGE_STAT 0x00000200 +#define FLD_END_VBI_EVEN_STAT 0x00000100 +#define FLD_END_VBI_ODD_STAT 0x00000080 +#define FLD_FMT_CHANGE_STAT 0x00000040 +#define FLD_VSYNC_TRAIL_STAT 0x00000020 +#define FLD_HLOCK_CHANGE_STAT 0x00000010 +#define FLD_VLOCK_CHANGE_STAT 0x00000008 +#define FLD_CSC_LOCK_CHANGE_STAT 0x00000004 +#define FLD_SRC_FIFO_UFLOW_STAT 0x00000002 +#define FLD_SRC_FIFO_OFLOW_STAT 0x00000001 + +/*****************************************************************************/ +#define LUMA_CTRL 0x414 +#define BRIGHTNESS_CTRL_BYTE 0x414 +#define CONTRAST_CTRL_BYTE 0x415 +#define LUMA_CTRL_BYTE_3 0x416 +#define FLD_LUMA_CORE_SEL 0x00C00000 +#define FLD_RANGE 0x00300000 +/* Reserved [19] */ +#define FLD_PEAK_EN 0x00040000 +#define FLD_PEAK_SEL 0x00030000 +#define 
FLD_CNTRST 0x0000FF00 +#define FLD_BRITE 0x000000FF + +/*****************************************************************************/ +#define HSCALE_CTRL 0x418 +#define FLD_HFILT 0x03000000 +#define FLD_HSCALE 0x00FFFFFF + +/*****************************************************************************/ +#define VSCALE_CTRL 0x41C +#define FLD_LINE_AVG_DIS 0x01000000 +/* Reserved [23:20] */ +#define FLD_VS_INTRLACE 0x00080000 +#define FLD_VFILT 0x00070000 +/* Reserved [15:13] */ +#define FLD_VSCALE 0x00001FFF + +/*****************************************************************************/ +#define CHROMA_CTRL 0x420 +#define USAT_CTRL_BYTE 0x420 +#define VSAT_CTRL_BYTE 0x421 +#define HUE_CTRL_BYTE 0x422 +#define FLD_C_LPF_EN 0x20000000 +#define FLD_CHR_DELAY 0x1C000000 +#define FLD_C_CORE_SEL 0x03000000 +#define FLD_HUE 0x00FF0000 +#define FLD_VSAT 0x0000FF00 +#define FLD_USAT 0x000000FF + +/*****************************************************************************/ +#define VBI_LINE_CTRL1 0x424 +#define FLD_VBI_MD_LINE4 0xFF000000 +#define FLD_VBI_MD_LINE3 0x00FF0000 +#define FLD_VBI_MD_LINE2 0x0000FF00 +#define FLD_VBI_MD_LINE1 0x000000FF + +/*****************************************************************************/ +#define VBI_LINE_CTRL2 0x428 +#define FLD_VBI_MD_LINE8 0xFF000000 +#define FLD_VBI_MD_LINE7 0x00FF0000 +#define FLD_VBI_MD_LINE6 0x0000FF00 +#define FLD_VBI_MD_LINE5 0x000000FF + +/*****************************************************************************/ +#define VBI_LINE_CTRL3 0x42C +#define FLD_VBI_MD_LINE12 0xFF000000 +#define FLD_VBI_MD_LINE11 0x00FF0000 +#define FLD_VBI_MD_LINE10 0x0000FF00 +#define FLD_VBI_MD_LINE9 0x000000FF + +/*****************************************************************************/ +#define VBI_LINE_CTRL4 0x430 +#define FLD_VBI_MD_LINE16 0xFF000000 +#define FLD_VBI_MD_LINE15 0x00FF0000 +#define FLD_VBI_MD_LINE14 0x0000FF00 +#define FLD_VBI_MD_LINE13 0x000000FF + +/*****************************************************************************/ +#define VBI_LINE_CTRL5 0x434 +#define FLD_VBI_MD_LINE17 0x000000FF + +/*****************************************************************************/ +#define VBI_FC_CFG 0x438 +#define FLD_FC_ALT2 0xFF000000 +#define FLD_FC_ALT1 0x00FF0000 +#define FLD_FC_ALT2_TYPE 0x0000F000 +#define FLD_FC_ALT1_TYPE 0x00000F00 +/* Reserved [7:1] */ +#define FLD_FC_SEARCH_MODE 0x00000001 + +/*****************************************************************************/ +#define VBI_MISC_CFG1 0x43C +#define FLD_TTX_PKTADRU 0xFFF00000 +#define FLD_TTX_PKTADRL 0x000FFF00 +/* Reserved [7:6] */ +#define FLD_MOJI_PACK_DIS 0x00000020 +#define FLD_VPS_DEC_DIS 0x00000010 +#define FLD_CRI_MARG_SCALE 0x0000000C +#define FLD_EDGE_RESYNC_EN 0x00000002 +#define FLD_ADAPT_SLICE_DIS 0x00000001 + +/*****************************************************************************/ +#define VBI_MISC_CFG2 0x440 +#define FLD_HAMMING_TYPE 0x0F000000 +/* Reserved [23:20] */ +#define FLD_WSS_FIFO_RST 0x00080000 +#define FLD_GS2_FIFO_RST 0x00040000 +#define FLD_GS1_FIFO_RST 0x00020000 +#define FLD_CC_FIFO_RST 0x00010000 +/* Reserved [15:12] */ +#define FLD_VBI3_SDID 0x00000F00 +#define FLD_VBI2_SDID 0x000000F0 +#define FLD_VBI1_SDID 0x0000000F + +/*****************************************************************************/ +#define VBI_PAY1 0x444 +#define FLD_GS1_FIFO_DAT 0xFF000000 +#define FLD_GS1_STAT 0x00FF0000 +#define FLD_CC_FIFO_DAT 0x0000FF00 +#define FLD_CC_STAT 0x000000FF + 
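The VBI_LINE_CTRL1..VBI_LINE_CTRL5 registers above pack one mode byte per sliced-VBI line, four lines per 32-bit register, with line 17 alone in the low byte of VBI_LINE_CTRL5. The register address and byte lane for a given line number therefore follow directly from the layout; a small sketch is below, where reg_read()/reg_write() are hypothetical placeholder accessors rather than anything defined by this patch.

#include <linux/errno.h>
#include <linux/types.h>

/* Program the VBI mode byte for 1-based lines 1..17, following the
 * FLD_VBI_MD_LINEn layout above.  Accessors are placeholders. */
static int set_vbi_line_mode(struct cx231xx *dev, unsigned int line, u8 mode)
{
        u32 reg, shift, val;

        if (line < 1 || line > 17)
                return -EINVAL;

        reg   = VBI_LINE_CTRL1 + 4 * ((line - 1) / 4);  /* 0x424..0x434 */
        shift = 8 * ((line - 1) % 4);                   /* byte lane */

        val = reg_read(dev, reg);                       /* placeholder */
        val = (val & ~(0xFFu << shift)) | ((u32)mode << shift);
        reg_write(dev, reg, val);                       /* placeholder */

        return 0;
}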
+/*****************************************************************************/ +#define VBI_PAY2 0x448 +#define FLD_WSS_FIFO_DAT 0xFF000000 +#define FLD_WSS_STAT 0x00FF0000 +#define FLD_GS2_FIFO_DAT 0x0000FF00 +#define FLD_GS2_STAT 0x000000FF + +/*****************************************************************************/ +#define VBI_CUST1_CFG1 0x44C +/* Reserved [31] */ +#define FLD_VBI1_CRIWIN 0x7F000000 +#define FLD_VBI1_SLICE_DIST 0x00F00000 +#define FLD_VBI1_BITINC 0x000FFF00 +#define FLD_VBI1_HDELAY 0x000000FF + +/*****************************************************************************/ +#define VBI_CUST1_CFG2 0x450 +#define FLD_VBI1_FC_LENGTH 0x1F000000 +#define FLD_VBI1_FRAME_CODE 0x00FFFFFF + +/*****************************************************************************/ +#define VBI_CUST1_CFG3 0x454 +#define FLD_VBI1_HAM_EN 0x80000000 +#define FLD_VBI1_FIFO_MODE 0x70000000 +#define FLD_VBI1_FORMAT_TYPE 0x0F000000 +#define FLD_VBI1_PAYLD_LENGTH 0x00FF0000 +#define FLD_VBI1_CRI_LENGTH 0x0000F000 +#define FLD_VBI1_CRI_MARGIN 0x00000F00 +#define FLD_VBI1_CRI_TIME 0x000000FF + +/*****************************************************************************/ +#define VBI_CUST2_CFG1 0x458 +/* Reserved [31] */ +#define FLD_VBI2_CRIWIN 0x7F000000 +#define FLD_VBI2_SLICE_DIST 0x00F00000 +#define FLD_VBI2_BITINC 0x000FFF00 +#define FLD_VBI2_HDELAY 0x000000FF + +/*****************************************************************************/ +#define VBI_CUST2_CFG2 0x45C +#define FLD_VBI2_FC_LENGTH 0x1F000000 +#define FLD_VBI2_FRAME_CODE 0x00FFFFFF + +/*****************************************************************************/ +#define VBI_CUST2_CFG3 0x460 +#define FLD_VBI2_HAM_EN 0x80000000 +#define FLD_VBI2_FIFO_MODE 0x70000000 +#define FLD_VBI2_FORMAT_TYPE 0x0F000000 +#define FLD_VBI2_PAYLD_LENGTH 0x00FF0000 +#define FLD_VBI2_CRI_LENGTH 0x0000F000 +#define FLD_VBI2_CRI_MARGIN 0x00000F00 +#define FLD_VBI2_CRI_TIME 0x000000FF + +/*****************************************************************************/ +#define VBI_CUST3_CFG1 0x464 +/* Reserved [31] */ +#define FLD_VBI3_CRIWIN 0x7F000000 +#define FLD_VBI3_SLICE_DIST 0x00F00000 +#define FLD_VBI3_BITINC 0x000FFF00 +#define FLD_VBI3_HDELAY 0x000000FF + +/*****************************************************************************/ +#define VBI_CUST3_CFG2 0x468 +#define FLD_VBI3_FC_LENGTH 0x1F000000 +#define FLD_VBI3_FRAME_CODE 0x00FFFFFF + +/*****************************************************************************/ +#define VBI_CUST3_CFG3 0x46C +#define FLD_VBI3_HAM_EN 0x80000000 +#define FLD_VBI3_FIFO_MODE 0x70000000 +#define FLD_VBI3_FORMAT_TYPE 0x0F000000 +#define FLD_VBI3_PAYLD_LENGTH 0x00FF0000 +#define FLD_VBI3_CRI_LENGTH 0x0000F000 +#define FLD_VBI3_CRI_MARGIN 0x00000F00 +#define FLD_VBI3_CRI_TIME 0x000000FF + +/*****************************************************************************/ +#define HORIZ_TIM_CTRL 0x470 +#define FLD_BGDEL_CNT 0xFF000000 +/* Reserved [23:22] */ +#define FLD_HACTIVE_CNT 0x003FF000 +/* Reserved [11:10] */ +#define FLD_HBLANK_CNT 0x000003FF + +/*****************************************************************************/ +#define VERT_TIM_CTRL 0x474 +#define FLD_V656BLANK_CNT 0xFF000000 +/* Reserved [23:22] */ +#define FLD_VACTIVE_CNT 0x003FF000 +/* Reserved [11:10] */ +#define FLD_VBLANK_CNT 0x000003FF + +/*****************************************************************************/ +#define SRC_COMB_CFG 0x478 +#define FLD_CCOMB_2LN_CHECK 0x80000000 +#define FLD_CCOMB_3LN_EN 
0x40000000 +#define FLD_CCOMB_2LN_EN 0x20000000 +#define FLD_CCOMB_3D_EN 0x10000000 +/* Reserved [27] */ +#define FLD_LCOMB_3LN_EN 0x04000000 +#define FLD_LCOMB_2LN_EN 0x02000000 +#define FLD_LCOMB_3D_EN 0x01000000 +#define FLD_LUMA_LPF_SEL 0x00C00000 +#define FLD_UV_LPF_SEL 0x00300000 +#define FLD_BLEND_SLOPE 0x000F0000 +#define FLD_CCOMB_REDUCE_EN 0x00008000 +/* Reserved [14:10] */ +#define FLD_SRC_DECIM_RATIO 0x000003FF + +/*****************************************************************************/ +#define CHROMA_VBIOFF_CFG 0x47C +#define FLD_VBI_VOFFSET 0x1F000000 +/* Reserved [23:20] */ +#define FLD_SC_STEP 0x000FFFFF + +/*****************************************************************************/ +#define FIELD_COUNT 0x480 +#define FLD_FIELD_COUNT_FLD 0x000003FF + +/*****************************************************************************/ +#define MISC_TIM_CTRL 0x484 +#define FLD_DEBOUNCE_COUNT 0xC0000000 +#define FLD_VT_LINE_CNT_HYST 0x30000000 +/* Reserved [27] */ +#define FLD_AFD_STAT 0x07FF0000 +#define FLD_VPRES_VERT_EN 0x00008000 +/* Reserved [14:12] */ +#define FLD_HR32 0x00000800 +#define FLD_TDALGN 0x00000400 +#define FLD_TDFIELD 0x00000200 +/* Reserved [8:6] */ +#define FLD_TEMPDEC 0x0000003F + +/*****************************************************************************/ +#define DFE_CTRL1 0x488 +#define FLD_CLAMP_AUTO_EN 0x80000000 +#define FLD_AGC_AUTO_EN 0x40000000 +#define FLD_VGA_CRUSH_EN 0x20000000 +#define FLD_VGA_AUTO_EN 0x10000000 +#define FLD_VBI_GATE_EN 0x08000000 +#define FLD_CLAMP_LEVEL 0x07000000 +/* Reserved [23:22] */ +#define FLD_CLAMP_SKIP_CNT 0x00300000 +#define FLD_AGC_GAIN 0x000FFF00 +/* Reserved [7:6] */ +#define FLD_VGA_GAIN 0x0000003F + +/*****************************************************************************/ +#define DFE_CTRL2 0x48C +#define FLD_VGA_ACQUIRE_RANGE 0x00FF0000 +#define FLD_VGA_TRACK_RANGE 0x0000FF00 +#define FLD_VGA_SYNC 0x000000FF + +/*****************************************************************************/ +#define DFE_CTRL3 0x490 +#define FLD_BP_PERCENT 0xFF000000 +#define FLD_DFT_THRESHOLD 0x00FF0000 +/* Reserved [15:12] */ +#define FLD_SYNC_WIDTH_SEL 0x00000600 +#define FLD_BP_LOOP_GAIN 0x00000300 +#define FLD_SYNC_LOOP_GAIN 0x000000C0 +/* Reserved [5:4] */ +#define FLD_AGC_LOOP_GAIN 0x0000000C +#define FLD_DCC_LOOP_GAIN 0x00000003 + +/*****************************************************************************/ +#define PLL_CTRL 0x494 +#define FLD_PLL_KD 0xFF000000 +#define FLD_PLL_KI 0x00FF0000 +#define FLD_PLL_MAX_OFFSET 0x0000FFFF + + +/*****************************************************************************/ +#define HTL_CTRL 0x498 +/* Reserved [31:24] */ +#define FLD_AUTO_LOCK_SPD 0x00080000 +#define FLD_MAN_FAST_LOCK 0x00040000 +#define FLD_HTL_15K_EN 0x00020000 +#define FLD_HTL_500K_EN 0x00010000 +#define FLD_HTL_KD 0x0000FF00 +#define FLD_HTL_KI 0x000000FF + +/*****************************************************************************/ +#define COMB_CTRL 0x49C +#define FLD_COMB_PHASE_LIMIT 0xFF000000 +#define FLD_CCOMB_ERR_LIMIT 0x00FF0000 +#define FLD_LUMA_THRESHOLD 0x0000FF00 +#define FLD_LCOMB_ERR_LIMIT 0x000000FF + +/*****************************************************************************/ +#define CRUSH_CTRL 0x4A0 +#define FLD_WTW_EN 0x00400000 +#define FLD_CRUSH_FREQ 0x00200000 +#define FLD_MAJ_SEL_EN 0x00100000 +#define FLD_MAJ_SEL 0x000C0000 +/* Reserved [17:15] */ +#define FLD_SYNC_TIP_REDUCE 0x00007E00 +/* Reserved [8:6] */ +#define FLD_SYNC_TIP_INC 0x0000003F + 
+/*****************************************************************************/ +#define SOFT_RST_CTRL 0x4A4 +#define FLD_VD_SOFT_RST 0x00008000 +/* Reserved [14:12] */ +#define FLD_REG_RST_MSK 0x00000800 +#define FLD_VOF_RST_MSK 0x00000400 +#define FLD_MVDET_RST_MSK 0x00000200 +#define FLD_VBI_RST_MSK 0x00000100 +#define FLD_SCALE_RST_MSK 0x00000080 +#define FLD_CHROMA_RST_MSK 0x00000040 +#define FLD_LUMA_RST_MSK 0x00000020 +#define FLD_VTG_RST_MSK 0x00000010 +#define FLD_YCSEP_RST_MSK 0x00000008 +#define FLD_SRC_RST_MSK 0x00000004 +#define FLD_DFE_RST_MSK 0x00000002 +/* Reserved [0] */ + +/*****************************************************************************/ +#define MV_DT_CTRL1 0x4A8 +/* Reserved [31:29] */ +#define FLD_PSP_STOP_LINE 0x1F000000 +/* Reserved [23:21] */ +#define FLD_PSP_STRT_LINE 0x001F0000 +/* Reserved [15] */ +#define FLD_PSP_LLIMW 0x00007F00 +/* Reserved [7] */ +#define FLD_PSP_ULIMW 0x0000007F + +/*****************************************************************************/ +#define MV_DT_CTRL2 0x4AC +#define FLD_CS_STOPWIN 0xFF000000 +#define FLD_CS_STRTWIN 0x00FF0000 +#define FLD_CS_WIDTH 0x0000FF00 +#define FLD_PSP_SPEC_VAL 0x000000FF + +/*****************************************************************************/ +#define MV_DT_CTRL3 0x4B0 +#define FLD_AUTO_RATE_DIS 0x80000000 +#define FLD_HLOCK_DIS 0x40000000 +#define FLD_SEL_FIELD_CNT 0x20000000 +#define FLD_CS_TYPE2_SEL 0x10000000 +#define FLD_CS_LINE_THRSH_SEL 0x08000000 +#define FLD_CS_ATHRESH_SEL 0x04000000 +#define FLD_PSP_SPEC_SEL 0x02000000 +#define FLD_PSP_LINES_SEL 0x01000000 +#define FLD_FIELD_CNT 0x00F00000 +#define FLD_CS_TYPE2_CNT 0x000FC000 +#define FLD_CS_LINE_CNT 0x00003F00 +#define FLD_CS_ATHRESH_LEV 0x000000FF + +/*****************************************************************************/ +#define CHIP_VERSION 0x4B4 +/* Cx231xx redefine */ +#define VERSION 0x4B4 +#define FLD_REV_ID 0x000000FF + +/*****************************************************************************/ +#define MISC_DIAG_CTRL 0x4B8 +/* Reserved [31:24] */ +#define FLD_SC_CONVERGE_THRESH 0x00FF0000 +#define FLD_CCOMB_ERR_LIMIT_3D 0x0000FF00 +#define FLD_LCOMB_ERR_LIMIT_3D 0x000000FF + +/*****************************************************************************/ +#define VBI_PASS_CTRL 0x4BC +#define FLD_VBI_PASS_MD 0x00200000 +#define FLD_VBI_SETUP_DIS 0x00100000 +#define FLD_PASS_LINE_CTRL 0x000FFFFF + +/*****************************************************************************/ +/* Cx231xx redefine */ +#define VCR_DET_CTRL 0x4c0 +#define FLD_EN_FIELD_PHASE_DET 0x80000000 +#define FLD_EN_HEAD_SW_DET 0x40000000 +#define FLD_FIELD_PHASE_LENGTH 0x01FF0000 +/* Reserved [29:25] */ +#define FLD_FIELD_PHASE_DELAY 0x0000FF00 +#define FLD_FIELD_PHASE_LIMIT 0x000000F0 +#define FLD_HEAD_SW_DET_LIMIT 0x0000000F + + +/*****************************************************************************/ +#define DL_CTL 0x800 +#define DL_CTL_ADDRESS_LOW 0x800 /* Byte 1 in DL_CTL */ +#define DL_CTL_ADDRESS_HIGH 0x801 /* Byte 2 in DL_CTL */ +#define DL_CTL_DATA 0x802 /* Byte 3 in DL_CTL */ +#define DL_CTL_CONTROL 0x803 /* Byte 4 in DL_CTL */ +/* Reserved [31:5] */ +#define FLD_START_8051 0x10000000 +#define FLD_DL_ENABLE 0x08000000 +#define FLD_DL_AUTO_INC 0x04000000 +#define FLD_DL_MAP 0x03000000 + +/*****************************************************************************/ +#define STD_DET_STATUS 0x804 +#define FLD_SPARE_STATUS1 0xFF000000 +#define FLD_SPARE_STATUS0 0x00FF0000 +#define FLD_MOD_DET_STATUS1 0x0000FF00 
+#define FLD_MOD_DET_STATUS0 0x000000FF + +/*****************************************************************************/ +#define AUD_BUILD_NUM 0x806 +#define AUD_VER_NUM 0x807 +#define STD_DET_CTL 0x808 +#define STD_DET_CTL_AUD_CTL 0x808 /* Byte 1 in STD_DET_CTL */ +#define STD_DET_CTL_PREF_MODE 0x809 /* Byte 2 in STD_DET_CTL */ +#define FLD_SPARE_CTL0 0xFF000000 +#define FLD_DIS_DBX 0x00800000 +#define FLD_DIS_BTSC 0x00400000 +#define FLD_DIS_NICAM_A2 0x00200000 +#define FLD_VIDEO_PRESENT 0x00100000 +#define FLD_DW8051_VIDEO_FORMAT 0x000F0000 +#define FLD_PREF_DEC_MODE 0x0000FF00 +#define FLD_AUD_CONFIG 0x000000FF + +/*****************************************************************************/ +#define DW8051_INT 0x80C +#define FLD_VIDEO_PRESENT_CHANGE 0x80000000 +#define FLD_VIDEO_CHANGE 0x40000000 +#define FLD_RDS_READY 0x20000000 +#define FLD_AC97_INT 0x10000000 +#define FLD_NICAM_BIT_ERROR_TOO_HIGH 0x08000000 +#define FLD_NICAM_LOCK 0x04000000 +#define FLD_NICAM_UNLOCK 0x02000000 +#define FLD_DFT4_TH_CMP 0x01000000 +/* Reserved [23:22] */ +#define FLD_LOCK_IND_INT 0x00200000 +#define FLD_DFT3_TH_CMP 0x00100000 +#define FLD_DFT2_TH_CMP 0x00080000 +#define FLD_DFT1_TH_CMP 0x00040000 +#define FLD_FM2_DFT_TH_CMP 0x00020000 +#define FLD_FM1_DFT_TH_CMP 0x00010000 +#define FLD_VIDEO_PRESENT_EN 0x00008000 +#define FLD_VIDEO_CHANGE_EN 0x00004000 +#define FLD_RDS_READY_EN 0x00002000 +#define FLD_AC97_INT_EN 0x00001000 +#define FLD_NICAM_BIT_ERROR_TOO_HIGH_EN 0x00000800 +#define FLD_NICAM_LOCK_EN 0x00000400 +#define FLD_NICAM_UNLOCK_EN 0x00000200 +#define FLD_DFT4_TH_CMP_EN 0x00000100 +/* Reserved [7] */ +#define FLD_DW8051_INT6_CTL1 0x00000040 +#define FLD_DW8051_INT5_CTL1 0x00000020 +#define FLD_DW8051_INT4_CTL1 0x00000010 +#define FLD_DW8051_INT3_CTL1 0x00000008 +#define FLD_DW8051_INT2_CTL1 0x00000004 +#define FLD_DW8051_INT1_CTL1 0x00000002 +#define FLD_DW8051_INT0_CTL1 0x00000001 + +/*****************************************************************************/ +#define GENERAL_CTL 0x810 +#define FLD_RDS_INT 0x80000000 +#define FLD_NBER_INT 0x40000000 +#define FLD_NLL_INT 0x20000000 +#define FLD_IFL_INT 0x10000000 +#define FLD_FDL_INT 0x08000000 +#define FLD_AFC_INT 0x04000000 +#define FLD_AMC_INT 0x02000000 +#define FLD_AC97_INT_CTL 0x01000000 +#define FLD_RDS_INT_DIS 0x00800000 +#define FLD_NBER_INT_DIS 0x00400000 +#define FLD_NLL_INT_DIS 0x00200000 +#define FLD_IFL_INT_DIS 0x00100000 +#define FLD_FDL_INT_DIS 0x00080000 +#define FLD_FC_INT_DIS 0x00040000 +#define FLD_AMC_INT_DIS 0x00020000 +#define FLD_AC97_INT_DIS 0x00010000 +#define FLD_REV_NUM 0x0000FF00 +/* Reserved [7:5] */ +#define FLD_DBX_SOFT_RESET_REG 0x00000010 +#define FLD_AD_SOFT_RESET_REG 0x00000008 +#define FLD_SRC_SOFT_RESET_REG 0x00000004 +#define FLD_CDMOD_SOFT_RESET 0x00000002 +#define FLD_8051_SOFT_RESET 0x00000001 + +/*****************************************************************************/ +#define AAGC_CTL 0x814 +#define FLD_AFE_12DB_EN 0x80000000 +#define FLD_AAGC_DEFAULT_EN 0x40000000 +#define FLD_AAGC_DEFAULT 0x3F000000 +/* Reserved [23] */ +#define FLD_AAGC_GAIN 0x00600000 +#define FLD_AAGC_TH 0x001F0000 +/* Reserved [15:14] */ +#define FLD_AAGC_HYST2 0x00003F00 +/* Reserved [7:6] */ +#define FLD_AAGC_HYST1 0x0000003F + +/*****************************************************************************/ +#define IF_SRC_CTL 0x818 +#define FLD_DBX_BYPASS 0x80000000 +/* Reserved [30:25] */ +#define FLD_IF_SRC_MODE 0x01000000 +/* Reserved [23:18] */ +#define FLD_IF_SRC_PHASE_INC 0x0001FFFF + 
+/*****************************************************************************/ +#define ANALOG_DEMOD_CTL 0x81C +#define FLD_ROT1_PHACC_PROG 0xFFFF0000 +/* Reserved [15] */ +#define FLD_FM1_DELAY_FIX 0x00007000 +#define FLD_PDF4_SHIFT 0x00000C00 +#define FLD_PDF3_SHIFT 0x00000300 +#define FLD_PDF2_SHIFT 0x000000C0 +#define FLD_PDF1_SHIFT 0x00000030 +#define FLD_FMBYPASS_MODE2 0x00000008 +#define FLD_FMBYPASS_MODE1 0x00000004 +#define FLD_NICAM_MODE 0x00000002 +#define FLD_BTSC_FMRADIO_MODE 0x00000001 + +/*****************************************************************************/ +#define ROT_FREQ_CTL 0x820 +#define FLD_ROT3_PHACC_PROG 0xFFFF0000 +#define FLD_ROT2_PHACC_PROG 0x0000FFFF + +/*****************************************************************************/ +#define FM_CTL 0x824 +#define FLD_FM2_DC_FB_SHIFT 0xF0000000 +#define FLD_FM2_DC_INT_SHIFT 0x0F000000 +#define FLD_FM2_AFC_RESET 0x00800000 +#define FLD_FM2_DC_PASS_IN 0x00400000 +#define FLD_FM2_DAGC_SHIFT 0x00380000 +#define FLD_FM2_CORDIC_SHIFT 0x00070000 +#define FLD_FM1_DC_FB_SHIFT 0x0000F000 +#define FLD_FM1_DC_INT_SHIFT 0x00000F00 +#define FLD_FM1_AFC_RESET 0x00000080 +#define FLD_FM1_DC_PASS_IN 0x00000040 +#define FLD_FM1_DAGC_SHIFT 0x00000038 +#define FLD_FM1_CORDIC_SHIFT 0x00000007 + +/*****************************************************************************/ +#define LPF_PDF_CTL 0x828 +/* Reserved [31:30] */ +#define FLD_LPF32_SHIFT1 0x30000000 +#define FLD_LPF32_SHIFT2 0x0C000000 +#define FLD_LPF160_SHIFTA 0x03000000 +#define FLD_LPF160_SHIFTB 0x00C00000 +#define FLD_LPF160_SHIFTC 0x00300000 +#define FLD_LPF32_COEF_SEL2 0x000C0000 +#define FLD_LPF32_COEF_SEL1 0x00030000 +#define FLD_LPF160_COEF_SELC 0x0000C000 +#define FLD_LPF160_COEF_SELB 0x00003000 +#define FLD_LPF160_COEF_SELA 0x00000C00 +#define FLD_LPF160_IN_EN_REG 0x00000300 +#define FLD_PDF4_PDF_SEL 0x000000C0 +#define FLD_PDF3_PDF_SEL 0x00000030 +#define FLD_PDF2_PDF_SEL 0x0000000C +#define FLD_PDF1_PDF_SEL 0x00000003 + +/*****************************************************************************/ +#define DFT1_CTL1 0x82C +#define FLD_DFT1_DWELL 0xFFFF0000 +#define FLD_DFT1_FREQ 0x0000FFFF + +/*****************************************************************************/ +#define DFT1_CTL2 0x830 +#define FLD_DFT1_THRESHOLD 0xFFFFFF00 +#define FLD_DFT1_CMP_CTL 0x00000080 +#define FLD_DFT1_AVG 0x00000070 +/* Reserved [3:1] */ +#define FLD_DFT1_START 0x00000001 + +/*****************************************************************************/ +#define DFT1_STATUS 0x834 +#define FLD_DFT1_DONE 0x80000000 +#define FLD_DFT1_TH_CMP_STAT 0x40000000 +#define FLD_DFT1_RESULT 0x3FFFFFFF + +/*****************************************************************************/ +#define DFT2_CTL1 0x838 +#define FLD_DFT2_DWELL 0xFFFF0000 +#define FLD_DFT2_FREQ 0x0000FFFF + +/*****************************************************************************/ +#define DFT2_CTL2 0x83C +#define FLD_DFT2_THRESHOLD 0xFFFFFF00 +#define FLD_DFT2_CMP_CTL 0x00000080 +#define FLD_DFT2_AVG 0x00000070 +/* Reserved [3:1] */ +#define FLD_DFT2_START 0x00000001 + +/*****************************************************************************/ +#define DFT2_STATUS 0x840 +#define FLD_DFT2_DONE 0x80000000 +#define FLD_DFT2_TH_CMP_STAT 0x40000000 +#define FLD_DFT2_RESULT 0x3FFFFFFF + +/*****************************************************************************/ +#define DFT3_CTL1 0x844 +#define FLD_DFT3_DWELL 0xFFFF0000 +#define FLD_DFT3_FREQ 0x0000FFFF + 
+/*****************************************************************************/ +#define DFT3_CTL2 0x848 +#define FLD_DFT3_THRESHOLD 0xFFFFFF00 +#define FLD_DFT3_CMP_CTL 0x00000080 +#define FLD_DFT3_AVG 0x00000070 +/* Reserved [3:1] */ +#define FLD_DFT3_START 0x00000001 + +/*****************************************************************************/ +#define DFT3_STATUS 0x84C +#define FLD_DFT3_DONE 0x80000000 +#define FLD_DFT3_TH_CMP_STAT 0x40000000 +#define FLD_DFT3_RESULT 0x3FFFFFFF + +/*****************************************************************************/ +#define DFT4_CTL1 0x850 +#define FLD_DFT4_DWELL 0xFFFF0000 +#define FLD_DFT4_FREQ 0x0000FFFF + +/*****************************************************************************/ +#define DFT4_CTL2 0x854 +#define FLD_DFT4_THRESHOLD 0xFFFFFF00 +#define FLD_DFT4_CMP_CTL 0x00000080 +#define FLD_DFT4_AVG 0x00000070 +/* Reserved [3:1] */ +#define FLD_DFT4_START 0x00000001 + +/*****************************************************************************/ +#define DFT4_STATUS 0x858 +#define FLD_DFT4_DONE 0x80000000 +#define FLD_DFT4_TH_CMP_STAT 0x40000000 +#define FLD_DFT4_RESULT 0x3FFFFFFF + +/*****************************************************************************/ +#define AM_MTS_DET 0x85C +#define FLD_AM_MTS_MODE 0x80000000 +/* Reserved [30:26] */ +#define FLD_AM_SUB 0x02000000 +#define FLD_AM_GAIN_EN 0x01000000 +/* Reserved [23:16] */ +#define FLD_AMMTS_GAIN_SCALE 0x0000E000 +#define FLD_MTS_PDF_SHIFT 0x00001800 +#define FLD_AM_REG_GAIN 0x00000700 +#define FLD_AGC_REF 0x000000FF + +/*****************************************************************************/ +#define ANALOG_MUX_CTL 0x860 +/* Reserved [31:29] */ +#define FLD_MUX21_SEL 0x10000000 +#define FLD_MUX20_SEL 0x08000000 +#define FLD_MUX19_SEL 0x04000000 +#define FLD_MUX18_SEL 0x02000000 +#define FLD_MUX17_SEL 0x01000000 +#define FLD_MUX16_SEL 0x00800000 +#define FLD_MUX15_SEL 0x00400000 +#define FLD_MUX14_SEL 0x00300000 +#define FLD_MUX13_SEL 0x000C0000 +#define FLD_MUX12_SEL 0x00020000 +#define FLD_MUX11_SEL 0x00018000 +#define FLD_MUX10_SEL 0x00004000 +#define FLD_MUX9_SEL 0x00002000 +#define FLD_MUX8_SEL 0x00001000 +#define FLD_MUX7_SEL 0x00000800 +#define FLD_MUX6_SEL 0x00000600 +#define FLD_MUX5_SEL 0x00000100 +#define FLD_MUX4_SEL 0x000000C0 +#define FLD_MUX3_SEL 0x00000030 +#define FLD_MUX2_SEL 0x0000000C +#define FLD_MUX1_SEL 0x00000003 + +/*****************************************************************************/ +/* Cx231xx redefine */ +#define DPLL_CTRL1 0x864 +#define DIG_PLL_CTL1 0x864 + +#define FLD_PLL_STATUS 0x07000000 +#define FLD_BANDWIDTH_SELECT 0x00030000 +#define FLD_PLL_SHIFT_REG 0x00007000 +#define FLD_PHASE_SHIFT 0x000007FF + +/*****************************************************************************/ +/* Cx231xx redefine */ +#define DPLL_CTRL2 0x868 +#define DIG_PLL_CTL2 0x868 +#define FLD_PLL_UNLOCK_THR 0xFF000000 +#define FLD_PLL_LOCK_THR 0x00FF0000 +/* Reserved [15:8] */ +#define FLD_AM_PDF_SEL2 0x000000C0 +#define FLD_AM_PDF_SEL1 0x00000030 +#define FLD_DPLL_FSM_CTRL 0x0000000C +/* Reserved [1] */ +#define FLD_PLL_PILOT_DET 0x00000001 + +/*****************************************************************************/ +/* Cx231xx redefine */ +#define DPLL_CTRL3 0x86C +#define DIG_PLL_CTL3 0x86C +#define FLD_DISABLE_LOOP 0x01000000 +#define FLD_A1_DS1_SEL 0x000C0000 +#define FLD_A1_DS2_SEL 0x00030000 +#define FLD_A1_KI 0x0000FF00 +#define FLD_A1_KD 0x000000FF + 
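The four DFT blocks above share one register layout: frequency and dwell go in DFTn_CTL1, threshold/averaging and a START bit in DFTn_CTL2, and DFTn_STATUS reports a DONE flag, a threshold-compare flag and the 30-bit RESULT. Assuming the START bit kicks off a single measurement and DONE signals completion (an inference from the field names, not something this patch states), a rough polling sketch for DFT1 could look like the following, again with placeholder reg_read()/reg_write() accessors and an arbitrary poll budget.

#include <linux/delay.h>        /* msleep() */
#include <linux/errno.h>
#include <linux/types.h>

/* Hypothetical single-shot DFT1 measurement; the 10-iteration poll limit
 * and the accessors are illustrative assumptions only. */
static int run_dft1(struct cx231xx *dev, u16 freq, u16 dwell, u32 *result)
{
        u32 status;
        int i;

        reg_write(dev, DFT1_CTL1, ((u32)dwell << 16) | freq);   /* DWELL | FREQ */
        reg_write(dev, DFT1_CTL2, FLD_DFT1_START);              /* start measurement */

        for (i = 0; i < 10; i++) {
                status = reg_read(dev, DFT1_STATUS);
                if (status & FLD_DFT1_DONE) {
                        *result = status & FLD_DFT1_RESULT;
                        return 0;
                }
                msleep(1);
        }

        return -ETIMEDOUT;
}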
+/*****************************************************************************/ +/* Cx231xx redefine */ +#define DPLL_CTRL4 0x870 +#define DIG_PLL_CTL4 0x870 +#define FLD_A2_DS1_SEL 0x000C0000 +#define FLD_A2_DS2_SEL 0x00030000 +#define FLD_A2_KI 0x0000FF00 +#define FLD_A2_KD 0x000000FF + +/*****************************************************************************/ +/* Cx231xx redefine */ +#define DPLL_CTRL5 0x874 +#define DIG_PLL_CTL5 0x874 +#define FLD_TRK_DS1_SEL 0x000C0000 +#define FLD_TRK_DS2_SEL 0x00030000 +#define FLD_TRK_KI 0x0000FF00 +#define FLD_TRK_KD 0x000000FF + +/*****************************************************************************/ +#define DEEMPH_GAIN_CTL 0x878 +#define FLD_DEEMPH2_GAIN 0xFFFF0000 +#define FLD_DEEMPH1_GAIN 0x0000FFFF + +/*****************************************************************************/ +/* Cx231xx redefine */ +#define DEEMPH_COEFF1 0x87C +#define DEEMPH_COEF1 0x87C +#define FLD_DEEMPH_B0 0xFFFF0000 +#define FLD_DEEMPH_A0 0x0000FFFF + +/*****************************************************************************/ +/* Cx231xx redefine */ +#define DEEMPH_COEFF2 0x880 +#define DEEMPH_COEF2 0x880 +#define FLD_DEEMPH_B1 0xFFFF0000 +#define FLD_DEEMPH_A1 0x0000FFFF + +/*****************************************************************************/ +#define DBX1_CTL1 0x884 +#define FLD_DBX1_WBE_GAIN 0xFFFF0000 +#define FLD_DBX1_IN_GAIN 0x0000FFFF + +/*****************************************************************************/ +#define DBX1_CTL2 0x888 +#define FLD_DBX1_SE_BYPASS 0xFFFF0000 +#define FLD_DBX1_SE_GAIN 0x0000FFFF + +/*****************************************************************************/ +#define DBX1_RMS_SE 0x88C +#define FLD_DBX1_RMS_WBE 0xFFFF0000 +#define FLD_DBX1_RMS_SE_FLD 0x0000FFFF + +/*****************************************************************************/ +#define DBX2_CTL1 0x890 +#define FLD_DBX2_WBE_GAIN 0xFFFF0000 +#define FLD_DBX2_IN_GAIN 0x0000FFFF + +/*****************************************************************************/ +#define DBX2_CTL2 0x894 +#define FLD_DBX2_SE_BYPASS 0xFFFF0000 +#define FLD_DBX2_SE_GAIN 0x0000FFFF + +/*****************************************************************************/ +#define DBX2_RMS_SE 0x898 +#define FLD_DBX2_RMS_WBE 0xFFFF0000 +#define FLD_DBX2_RMS_SE_FLD 0x0000FFFF + +/*****************************************************************************/ +#define AM_FM_DIFF 0x89C +/* Reserved [31] */ +#define FLD_FM_DIFF_OUT 0x7FFF0000 +/* Reserved [15] */ +#define FLD_AM_DIFF_OUT 0x00007FFF + +/*****************************************************************************/ +#define NICAM_FAW 0x8A0 +#define FLD_FAWDETWINEND 0xFC000000 +#define FLD_FAWDETWINSTR 0x03FF0000 +/* Reserved [15:12] */ +#define FLD_FAWDETTHRSHLD3 0x00000F00 +#define FLD_FAWDETTHRSHLD2 0x000000F0 +#define FLD_FAWDETTHRSHLD1 0x0000000F + +/*****************************************************************************/ +/* Cx231xx redefine */ +#define DEEMPH_GAIN 0x8A4 +#define NICAM_DEEMPHGAIN 0x8A4 +/* Reserved [31:18] */ +#define FLD_DEEMPHGAIN 0x0003FFFF + +/*****************************************************************************/ +/* Cx231xx redefine */ +#define DEEMPH_NUMER1 0x8A8 +#define NICAM_DEEMPHNUMER1 0x8A8 +/* Reserved [31:18] */ +#define FLD_DEEMPHNUMER1 0x0003FFFF + +/*****************************************************************************/ +/* Cx231xx redefine */ +#define DEEMPH_NUMER2 0x8AC +#define NICAM_DEEMPHNUMER2 0x8AC +/* Reserved [31:18] */ +#define 
FLD_DEEMPHNUMER2 0x0003FFFF + +/*****************************************************************************/ +/* Cx231xx redefine */ +#define DEEMPH_DENOM1 0x8B0 +#define NICAM_DEEMPHDENOM1 0x8B0 +/* Reserved [31:18] */ +#define FLD_DEEMPHDENOM1 0x0003FFFF + +/*****************************************************************************/ +/* Cx231xx redefine */ +#define DEEMPH_DENOM2 0x8B4 +#define NICAM_DEEMPHDENOM2 0x8B4 +/* Reserved [31:18] */ +#define FLD_DEEMPHDENOM2 0x0003FFFF + +/*****************************************************************************/ +#define NICAM_ERRLOG_CTL1 0x8B8 +/* Reserved [31:28] */ +#define FLD_ERRINTRPTTHSHLD1 0x0FFF0000 +/* Reserved [15:12] */ +#define FLD_ERRLOGPERIOD 0x00000FFF + +/*****************************************************************************/ +#define NICAM_ERRLOG_CTL2 0x8BC +/* Reserved [31:28] */ +#define FLD_ERRINTRPTTHSHLD3 0x0FFF0000 +/* Reserved [15:12] */ +#define FLD_ERRINTRPTTHSHLD2 0x00000FFF + +/*****************************************************************************/ +#define NICAM_ERRLOG_STS1 0x8C0 +/* Reserved [31:28] */ +#define FLD_ERRLOG2 0x0FFF0000 +/* Reserved [15:12] */ +#define FLD_ERRLOG1 0x00000FFF + +/*****************************************************************************/ +#define NICAM_ERRLOG_STS2 0x8C4 +/* Reserved [31:12] */ +#define FLD_ERRLOG3 0x00000FFF + +/*****************************************************************************/ +#define NICAM_STATUS 0x8C8 +/* Reserved [31:20] */ +#define FLD_NICAM_CIB 0x000C0000 +#define FLD_NICAM_LOCK_STAT 0x00020000 +#define FLD_NICAM_MUTE 0x00010000 +#define FLD_NICAMADDIT_DATA 0x0000FFE0 +#define FLD_NICAMCNTRL 0x0000001F + +/*****************************************************************************/ +#define DEMATRIX_CTL 0x8CC +#define FLD_AC97_IN_SHIFT 0xF0000000 +#define FLD_I2S_IN_SHIFT 0x0F000000 +#define FLD_DEMATRIX_SEL_CTL 0x00FF0000 +/* Reserved [15:11] */ +#define FLD_DMTRX_BYPASS 0x00000400 +#define FLD_DEMATRIX_MODE 0x00000300 +/* Reserved [7:6] */ +#define FLD_PH_DBX_SEL 0x00000020 +#define FLD_PH_CH_SEL 0x00000010 +#define FLD_PHASE_FIX 0x0000000F + +/*****************************************************************************/ +#define PATH1_CTL1 0x8D0 +/* Reserved [31:29] */ +#define FLD_PATH1_MUTE_CTL 0x1F000000 +/* Reserved [23:22] */ +#define FLD_PATH1_AVC_CG 0x00300000 +#define FLD_PATH1_AVC_RT 0x000F0000 +#define FLD_PATH1_AVC_AT 0x0000F000 +#define FLD_PATH1_AVC_STEREO 0x00000800 +#define FLD_PATH1_AVC_CR 0x00000700 +#define FLD_PATH1_AVC_RMS_CON 0x000000F0 +#define FLD_PATH1_SEL_CTL 0x0000000F + +/*****************************************************************************/ +#define PATH1_VOL_CTL 0x8D4 +#define FLD_PATH1_AVC_THRESHOLD 0x7FFF0000 +#define FLD_PATH1_BAL_LEFT 0x00008000 +#define FLD_PATH1_BAL_LEVEL 0x00007F00 +#define FLD_PATH1_VOLUME 0x000000FF + +/*****************************************************************************/ +#define PATH1_EQ_CTL 0x8D8 +/* Reserved [31:30] */ +#define FLD_PATH1_EQ_TREBLE_VOL 0x3F000000 +/* Reserved [23:22] */ +#define FLD_PATH1_EQ_MID_VOL 0x003F0000 +/* Reserved [15:14] */ +#define FLD_PATH1_EQ_BASS_VOL 0x00003F00 +/* Reserved [7:1] */ +#define FLD_PATH1_EQ_BAND_SEL 0x00000001 + +/*****************************************************************************/ +#define PATH1_SC_CTL 0x8DC +#define FLD_PATH1_SC_THRESHOLD 0x7FFF0000 +#define FLD_PATH1_SC_RT 0x0000F000 +#define FLD_PATH1_SC_AT 0x00000F00 +#define FLD_PATH1_SC_STEREO 0x00000080 +#define FLD_PATH1_SC_CR 
0x00000070 +#define FLD_PATH1_SC_RMS_CON 0x0000000F + +/*****************************************************************************/ +#define PATH2_CTL1 0x8E0 +/* Reserved [31:26] */ +#define FLD_PATH2_MUTE_CTL 0x03000000 +/* Reserved [23:22] */ +#define FLD_PATH2_AVC_CG 0x00300000 +#define FLD_PATH2_AVC_RT 0x000F0000 +#define FLD_PATH2_AVC_AT 0x0000F000 +#define FLD_PATH2_AVC_STEREO 0x00000800 +#define FLD_PATH2_AVC_CR 0x00000700 +#define FLD_PATH2_AVC_RMS_CON 0x000000F0 +#define FLD_PATH2_SEL_CTL 0x0000000F + +/*****************************************************************************/ +#define PATH2_VOL_CTL 0x8E4 +#define FLD_PATH2_AVC_THRESHOLD 0xFFFF0000 +#define FLD_PATH2_BAL_LEFT 0x00008000 +#define FLD_PATH2_BAL_LEVEL 0x00007F00 +#define FLD_PATH2_VOLUME 0x000000FF + +/*****************************************************************************/ +#define PATH2_EQ_CTL 0x8E8 +/* Reserved [31:30] */ +#define FLD_PATH2_EQ_TREBLE_VOL 0x3F000000 +/* Reserved [23:22] */ +#define FLD_PATH2_EQ_MID_VOL 0x003F0000 +/* Reserved [15:14] */ +#define FLD_PATH2_EQ_BASS_VOL 0x00003F00 +/* Reserved [7:1] */ +#define FLD_PATH2_EQ_BAND_SEL 0x00000001 + +/*****************************************************************************/ +#define PATH2_SC_CTL 0x8EC +#define FLD_PATH2_SC_THRESHOLD 0xFFFF0000 +#define FLD_PATH2_SC_RT 0x0000F000 +#define FLD_PATH2_SC_AT 0x00000F00 +#define FLD_PATH2_SC_STEREO 0x00000080 +#define FLD_PATH2_SC_CR 0x00000070 +#define FLD_PATH2_SC_RMS_CON 0x0000000F + +/*****************************************************************************/ +#define SRC_CTL 0x8F0 +#define FLD_SRC_STATUS 0xFFFFFF00 +#define FLD_FIFO_LF_EN 0x000000FC +#define FLD_BYPASS_LI 0x00000002 +#define FLD_BYPASS_PF 0x00000001 + +/*****************************************************************************/ +#define SRC_LF_COEF 0x8F4 +#define FLD_LOOP_FILTER_COEF2 0xFFFF0000 +#define FLD_LOOP_FILTER_COEF1 0x0000FFFF + +/*****************************************************************************/ +#define SRC1_CTL 0x8F8 +/* Reserved [31:28] */ +#define FLD_SRC1_FIFO_RD_TH 0x0F000000 +/* Reserved [23:18] */ +#define FLD_SRC1_PHASE_INC 0x0003FFFF + +/*****************************************************************************/ +#define SRC2_CTL 0x8FC +/* Reserved [31:28] */ +#define FLD_SRC2_FIFO_RD_TH 0x0F000000 +/* Reserved [23:18] */ +#define FLD_SRC2_PHASE_INC 0x0003FFFF + +/*****************************************************************************/ +#define SRC3_CTL 0x900 +/* Reserved [31:28] */ +#define FLD_SRC3_FIFO_RD_TH 0x0F000000 +/* Reserved [23:18] */ +#define FLD_SRC3_PHASE_INC 0x0003FFFF + +/*****************************************************************************/ +#define SRC4_CTL 0x904 +/* Reserved [31:28] */ +#define FLD_SRC4_FIFO_RD_TH 0x0F000000 +/* Reserved [23:18] */ +#define FLD_SRC4_PHASE_INC 0x0003FFFF + +/*****************************************************************************/ +#define SRC5_CTL 0x908 +/* Reserved [31:28] */ +#define FLD_SRC5_FIFO_RD_TH 0x0F000000 +/* Reserved [23:18] */ +#define FLD_SRC5_PHASE_INC 0x0003FFFF + +/*****************************************************************************/ +#define SRC6_CTL 0x90C +/* Reserved [31:28] */ +#define FLD_SRC6_FIFO_RD_TH 0x0F000000 +/* Reserved [23:18] */ +#define FLD_SRC6_PHASE_INC 0x0003FFFF + +/*****************************************************************************/ +#define BAND_OUT_SEL 0x910 +#define FLD_SRC6_IN_SEL 0xC0000000 +#define FLD_SRC6_CLK_SEL 0x30000000 +#define 
FLD_SRC5_IN_SEL 0x0C000000 +#define FLD_SRC5_CLK_SEL 0x03000000 +#define FLD_SRC4_IN_SEL 0x00C00000 +#define FLD_SRC4_CLK_SEL 0x00300000 +#define FLD_SRC3_IN_SEL 0x000C0000 +#define FLD_SRC3_CLK_SEL 0x00030000 +#define FLD_BASEBAND_BYPASS_CTL 0x0000FF00 +#define FLD_AC97_SRC_SEL 0x000000C0 +#define FLD_I2S_SRC_SEL 0x00000030 +#define FLD_PARALLEL2_SRC_SEL 0x0000000C +#define FLD_PARALLEL1_SRC_SEL 0x00000003 + +/*****************************************************************************/ +#define I2S_IN_CTL 0x914 +/* Reserved [31:11] */ +#define FLD_I2S_UP2X_BW20K 0x00000400 +#define FLD_I2S_UP2X_BYPASS 0x00000200 +#define FLD_I2S_IN_MASTER_MODE 0x00000100 +#define FLD_I2S_IN_SONY_MODE 0x00000080 +#define FLD_I2S_IN_RIGHT_JUST 0x00000040 +#define FLD_I2S_IN_WS_SEL 0x00000020 +#define FLD_I2S_IN_BCN_DEL 0x0000001F + +/*****************************************************************************/ +#define I2S_OUT_CTL 0x918 +/* Reserved [31:17] */ +#define FLD_I2S_OUT_SOFT_RESET_EN 0x00010000 +/* Reserved [15:9] */ +#define FLD_I2S_OUT_MASTER_MODE 0x00000100 +#define FLD_I2S_OUT_SONY_MODE 0x00000080 +#define FLD_I2S_OUT_RIGHT_JUST 0x00000040 +#define FLD_I2S_OUT_WS_SEL 0x00000020 +#define FLD_I2S_OUT_BCN_DEL 0x0000001F + + +/*****************************************************************************/ +#define AC97_CTL 0x91C +/* Reserved [31:26] */ +#define FLD_AC97_UP2X_BW20K 0x02000000 +#define FLD_AC97_UP2X_BYPASS 0x01000000 +/* Reserved [23:17] */ +#define FLD_AC97_RST_ACL 0x00010000 +/* Reserved [15:9] */ +#define FLD_AC97_WAKE_UP_SYNC 0x00000100 +/* Reserved [7:1] */ +#define FLD_AC97_SHUTDOWN 0x00000001 + + +/* Cx231xx redefine */ +#define QPSK_IAGC_CTL1 0x94c +#define QPSK_IAGC_CTL2 0x950 +#define QPSK_FEPR_FREQ 0x954 +#define QPSK_BTL_CTL1 0x958 +#define QPSK_BTL_CTL2 0x95c +#define QPSK_CTL_CTL1 0x960 +#define QPSK_CTL_CTL2 0x964 +#define QPSK_MF_FAGC_CTL 0x968 +#define QPSK_EQ_CTL 0x96c +#define QPSK_LOCK_CTL 0x970 + + +/*****************************************************************************/ +#define FM1_DFT_CTL 0x9A8 +#define FLD_FM1_DFT_THRESHOLD 0xFFFF0000 +/* Reserved [15:8] */ +#define FLD_FM1_DFT_CMP_CTL 0x00000080 +#define FLD_FM1_DFT_AVG 0x00000070 +/* Reserved [3:1] */ +#define FLD_FM1_DFT_START 0x00000001 + +/*****************************************************************************/ +#define FM1_DFT_STATUS 0x9AC +#define FLD_FM1_DFT_DONE 0x80000000 +/* Reserved [30:19] */ +#define FLD_FM_DFT_TH_CMP 0x00040000 +#define FLD_FM1_DFT 0x0003FFFF + +/*****************************************************************************/ +#define FM2_DFT_CTL 0x9B0 +#define FLD_FM2_DFT_THRESHOLD 0xFFFF0000 +/* Reserved [15:8] */ +#define FLD_FM2_DFT_CMP_CTL 0x00000080 +#define FLD_FM2_DFT_AVG 0x00000070 +/* Reserved [3:1] */ +#define FLD_FM2_DFT_START 0x00000001 + +/*****************************************************************************/ +#define FM2_DFT_STATUS 0x9B4 +#define FLD_FM2_DFT_DONE 0x80000000 +/* Reserved [30:19] */ +#define FLD_FM2_DFT_TH_CMP_STAT 0x00040000 +#define FLD_FM2_DFT 0x0003FFFF + +/*****************************************************************************/ +/* Cx231xx redefine */ +#define AAGC_STATUS_REG 0x9B8 +#define AAGC_STATUS 0x9B8 +/* Reserved [31:27] */ +#define FLD_FM2_DAGC_OUT 0x07000000 +/* Reserved [23:19] */ +#define FLD_FM1_DAGC_OUT 0x00070000 +/* Reserved [15:6] */ +#define FLD_AFE_VGA_OUT 0x0000003F + + + +/*****************************************************************************/ +#define MTS_GAIN_STATUS 0x9BC +/* Reserved 
[31:14] */ +#define FLD_MTS_GAIN 0x00003FFF + +#define RDS_OUT 0x9C0 +#define FLD_RDS_Q 0xFFFF0000 +#define FLD_RDS_I 0x0000FFFF + +/*****************************************************************************/ +#define AUTOCONFIG_REG 0x9C4 +/* Reserved [31:4] */ +#define FLD_AUTOCONFIG_MODE 0x0000000F + +#define FM_AFC 0x9C8 +#define FLD_FM2_AFC 0xFFFF0000 +#define FLD_FM1_AFC 0x0000FFFF + +/*****************************************************************************/ +/* Cx231xx redefine */ +#define NEW_SPARE 0x9CC +#define NEW_SPARE_REG 0x9CC + +/*****************************************************************************/ +#define DBX_ADJ 0x9D0 +/* Reserved [31:28] */ +#define FLD_DBX2_ADJ 0x0FFF0000 +/* Reserved [15:12] */ +#define FLD_DBX1_ADJ 0x00000FFF + +#define VID_FMT_AUTO 0 +#define VID_FMT_NTSC_M 1 +#define VID_FMT_NTSC_J 2 +#define VID_FMT_NTSC_443 3 +#define VID_FMT_PAL_BDGHI 4 +#define VID_FMT_PAL_M 5 +#define VID_FMT_PAL_N 6 +#define VID_FMT_PAL_NC 7 +#define VID_FMT_PAL_60 8 +#define VID_FMT_SECAM 12 +#define VID_FMT_SECAM_60 13 + +#define INPUT_MODE_CVBS_0 0 /* INPUT_MODE_VALUE(0) */ +#define INPUT_MODE_YC_1 1 /* INPUT_MODE_VALUE(1) */ +#define INPUT_MODE_YC2_2 2 /* INPUT_MODE_VALUE(2) */ +#define INPUT_MODE_YUV_3 3 /* INPUT_MODE_VALUE(3) */ + + +#define LUMA_LPF_LOW_BANDPASS 0 /* 0.6Mhz lowpass filter bandwidth */ +#define LUMA_LPF_MEDIUM_BANDPASS 1 /* 1.0Mhz lowpass filter bandwidth */ +#define LUMA_LPF_HIGH_BANDPASS 2 /* 1.5Mhz lowpass filter bandwidth */ + +#define UV_LPF_LOW_BANDPASS 0 /* 0.6Mhz lowpass filter bandwidth */ +#define UV_LPF_MEDIUM_BANDPASS 1 /* 1.0Mhz lowpass filter bandwidth */ +#define UV_LPF_HIGH_BANDPASS 2 /* 1.5Mhz lowpass filter bandwidth */ + +#define TWO_TAP_FILT 0 +#define THREE_TAP_FILT 1 +#define FOUR_TAP_FILT 2 +#define FIVE_TAP_FILT 3 + +#define AUD_CHAN_SRC_PARALLEL 0 +#define AUD_CHAN_SRC_I2S_INPUT 1 +#define AUD_CHAN_SRC_FLATIRON 2 +#define AUD_CHAN_SRC_PARALLEL3 3 + +#define OUT_MODE_601 0 +#define OUT_MODE_656 1 +#define OUT_MODE_VIP11 2 +#define OUT_MODE_VIP20 3 + +#define PHASE_INC_49MHZ 0x0DF22 +#define PHASE_INC_56MHZ 0x0FA5B +#define PHASE_INC_28MHZ 0x010000 + +#endif diff --git a/drivers/media/video/cx231xx/cx231xx-vbi.c b/drivers/media/video/cx231xx/cx231xx-vbi.c new file mode 100644 index 000000000000..e370160973f4 --- /dev/null +++ b/drivers/media/video/cx231xx/cx231xx-vbi.c @@ -0,0 +1,693 @@ +/* + cx231xx_vbi.c - driver for Conexant Cx23100/101/102 USB video capture devices + + Copyright (C) 2008 + Based on cx88 driver + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "cx231xx.h" +#include "cx231xx-vbi.h" + +static inline void print_err_status(struct cx231xx *dev, + int packet, int status) +{ + char *errmsg = "Unknown"; + + switch (status) { + case -ENOENT: + errmsg = "unlinked synchronuously"; + break; + case -ECONNRESET: + errmsg = "unlinked asynchronuously"; + break; + case -ENOSR: + errmsg = "Buffer error (overrun)"; + break; + case -EPIPE: + errmsg = "Stalled (device not responding)"; + break; + case -EOVERFLOW: + errmsg = "Babble (bad cable?)"; + break; + case -EPROTO: + errmsg = "Bit-stuff error (bad cable?)"; + break; + case -EILSEQ: + errmsg = "CRC/Timeout (could be anything)"; + break; + case -ETIME: + errmsg = "Device does not respond"; + break; + } + if (packet < 0) { + cx231xx_err(DRIVER_NAME "URB status %d [%s].\n", status, errmsg); + } else { + cx231xx_err(DRIVER_NAME "URB packet %d, status %d [%s].\n", + packet, status, errmsg); + } +} + +/* + * Controls the isoc copy of each urb packet + */ +static inline int cx231xx_isoc_vbi_copy(struct cx231xx *dev, struct urb *urb) +{ + struct cx231xx_buffer *buf; + struct cx231xx_dmaqueue *dma_q = urb->context; + int rc = 1; + unsigned char *p_buffer; + u32 bytes_parsed = 0, buffer_size = 0; + u8 sav_eav = 0; + + if (!dev) + return 0; + + if ((dev->state & DEV_DISCONNECTED) || (dev->state & DEV_MISCONFIGURED)) + return 0; + + if (urb->status < 0) { + print_err_status(dev, -1, urb->status); + if (urb->status == -ENOENT) + return 0; + } + + buf = dev->vbi_mode.isoc_ctl.buf; + + /* get buffer pointer and length */ + p_buffer = urb->transfer_buffer; + buffer_size = urb->actual_length; + + if (buffer_size > 0) { + + bytes_parsed = 0; + + if(dma_q->is_partial_line) { + /* Handle the case where we were working on a partial line */ + sav_eav = dma_q->last_sav; + } else { + /* Check for a SAV/EAV overlapping the buffer boundary */ + sav_eav = cx231xx_find_boundary_SAV_EAV(p_buffer, dma_q->partial_buf, &bytes_parsed); + } + + sav_eav &= 0xF0; + /* Get the first line if we have some portion of an SAV/EAV from the last buffer + or a partial line */ + if(sav_eav) { + bytes_parsed += cx231xx_get_vbi_line(dev, dma_q, + sav_eav, /* SAV/EAV */ + p_buffer + bytes_parsed, /* p_buffer */ + buffer_size - bytes_parsed); /* buffer size */ + } + + /* Now parse data that is completely in this buffer */ + dma_q->is_partial_line = 0; + + while(bytes_parsed < buffer_size) + { + u32 bytes_used = 0; + + sav_eav = cx231xx_find_next_SAV_EAV( + p_buffer + bytes_parsed, /* p_buffer */ + buffer_size - bytes_parsed, /* buffer size */ + &bytes_used); /* Receives bytes used to get SAV/EAV */ + + bytes_parsed += bytes_used; + + sav_eav &= 0xF0; + if(sav_eav && (bytes_parsed < buffer_size)) + { + bytes_parsed += cx231xx_get_vbi_line(dev, dma_q, + sav_eav, /* SAV/EAV */ + p_buffer + bytes_parsed, /* p_buffer */ + buffer_size - bytes_parsed); /* buffer size */ + } + } + + /* Save the last four bytes of the buffer so we can check the buffer boundary + condition next time */ + memcpy(dma_q->partial_buf, p_buffer + buffer_size - 4, 4); + bytes_parsed = 0; + } + + return rc; +} + +/* ------------------------------------------------------------------ + Vbi buf operations + ------------------------------------------------------------------*/ + +static int +vbi_buffer_setup(struct videobuf_queue *vq, unsigned int *count, unsigned int *size) +{ + struct cx231xx_fh *fh = 
vq->priv_data; + struct cx231xx *dev = fh->dev; + u32 height = 0; + + height = ((dev->norm & V4L2_STD_625_50) ? + PAL_VBI_LINES : NTSC_VBI_LINES) ; + + *size = ( dev->width * height * 2); + if (0 == *count) + *count = CX231XX_DEF_VBI_BUF; + + if (*count < CX231XX_MIN_BUF) + *count = CX231XX_MIN_BUF; + + /* call VBI setup if required */ + /* cx231xx_i2c_call_clients(&dev->i2c_bus[1], VIDIOC_S_FREQUENCY, &f); + */ + + return 0; +} + +/* This is called *without* dev->slock held; please keep it that way */ +static void free_buffer(struct videobuf_queue *vq, struct cx231xx_buffer *buf) +{ + struct cx231xx_fh *fh = vq->priv_data; + struct cx231xx *dev = fh->dev; + unsigned long flags = 0; + if (in_interrupt()) + BUG(); + + /* We used to wait for the buffer to finish here, but this didn't work + because, as we were keeping the state as VIDEOBUF_QUEUED, + videobuf_queue_cancel marked it as finished for us. + (Also, it could wedge forever if the hardware was misconfigured.) + + This should be safe; by the time we get here, the buffer isn't + queued anymore. If we ever start marking the buffers as + VIDEOBUF_ACTIVE, it won't be, though. + */ + spin_lock_irqsave(&dev->vbi_mode.slock, flags); + if (dev->vbi_mode.isoc_ctl.buf == buf) + dev->vbi_mode.isoc_ctl.buf = NULL; + spin_unlock_irqrestore(&dev->vbi_mode.slock, flags); + + videobuf_vmalloc_free(&buf->vb); + buf->vb.state = VIDEOBUF_NEEDS_INIT; +} + +static int +vbi_buffer_prepare(struct videobuf_queue *vq, struct videobuf_buffer *vb, + enum v4l2_field field) +{ + struct cx231xx_fh *fh = vq->priv_data; + struct cx231xx_buffer *buf = container_of(vb, struct cx231xx_buffer, vb); + struct cx231xx *dev = fh->dev; + int rc = 0, urb_init = 0; + u32 height = 0; + + height = ((dev->norm & V4L2_STD_625_50) ? + PAL_VBI_LINES : NTSC_VBI_LINES) ; + buf->vb.size = ( (dev->width << 1) * height ); + + if (0 != buf->vb.baddr && buf->vb.bsize < buf->vb.size) + return -EINVAL; + + buf->vb.width = dev->width; + buf->vb.height = height; + buf->vb.field = field; + buf->vb.field = V4L2_FIELD_SEQ_TB; + + if (VIDEOBUF_NEEDS_INIT == buf->vb.state) { + rc = videobuf_iolock(vq, &buf->vb, NULL); + if (rc < 0) + goto fail; + } + + if (!dev->vbi_mode.isoc_ctl.num_bufs) + urb_init = 1; + + if (urb_init) { + rc = cx231xx_init_vbi_isoc(dev, CX231XX_NUM_VBI_PACKETS, + CX231XX_NUM_VBI_BUFS, dev->vbi_mode.alt_max_pkt_size[0], + cx231xx_isoc_vbi_copy); + if (rc < 0) + goto fail; + } + + buf->vb.state = VIDEOBUF_PREPARED; + return 0; + +fail: + free_buffer(vq, buf); + return rc; +} + +static void +vbi_buffer_queue(struct videobuf_queue *vq, struct videobuf_buffer *vb) +{ + struct cx231xx_buffer *buf = container_of(vb, struct cx231xx_buffer, vb); + struct cx231xx_fh *fh = vq->priv_data; + struct cx231xx *dev = fh->dev; + struct cx231xx_dmaqueue *vidq = &dev->vbi_mode.vidq; + + buf->vb.state = VIDEOBUF_QUEUED; + list_add_tail(&buf->vb.queue, &vidq->active); + +} + +static void vbi_buffer_release(struct videobuf_queue *vq, + struct videobuf_buffer *vb) +{ + struct cx231xx_buffer *buf = container_of(vb, struct cx231xx_buffer, vb); + /* + struct cx231xx_fh *fh = vq->priv_data; + struct cx231xx *dev = (struct cx231xx *)fh->dev; + + cx231xx_info(DRIVER_NAME "cx231xx: called vbi_buffer_release\n"); + */ + + free_buffer(vq, buf); +} + + +struct videobuf_queue_ops cx231xx_vbi_qops = { + .buf_setup = vbi_buffer_setup, + .buf_prepare = vbi_buffer_prepare, + .buf_queue = vbi_buffer_queue, + .buf_release = vbi_buffer_release, +}; + + + +/* 
------------------------------------------------------------------ + URB control + ------------------------------------------------------------------*/ + +/* + * IRQ callback, called by URB callback + */ +static void cx231xx_irq_vbi_callback(struct urb *urb) +{ + struct cx231xx_dmaqueue *dma_q = urb->context; + struct cx231xx_video_mode *vmode = container_of(dma_q, struct cx231xx_video_mode, vidq); + struct cx231xx *dev = container_of(vmode, struct cx231xx, vbi_mode); + int rc; + + + switch (urb->status) { + case 0: /* success */ + case -ETIMEDOUT: /* NAK */ + break; + case -ECONNRESET: /* kill */ + case -ENOENT: + case -ESHUTDOWN: + return; + default: /* error */ + cx231xx_err(DRIVER_NAME "urb completition error %d.\n", urb->status); + break; + } + + /* Copy data from URB */ + spin_lock(&dev->vbi_mode.slock); + rc = dev->vbi_mode.isoc_ctl.isoc_copy(dev, urb); + spin_unlock(&dev->vbi_mode.slock); + + /* Reset status */ + urb->status = 0; + + urb->status = usb_submit_urb(urb, GFP_ATOMIC); + if (urb->status) { + cx231xx_err(DRIVER_NAME "urb resubmit failed (error=%i)\n", + urb->status); + } +} + +/* + * Stop and Deallocate URBs + */ +void cx231xx_uninit_vbi_isoc(struct cx231xx *dev) +{ + struct urb *urb; + int i; + + cx231xx_info(DRIVER_NAME "cx231xx: called cx231xx_uninit_vbi_isoc\n"); + + dev->vbi_mode.isoc_ctl.nfields = -1; + for (i = 0; i < dev->vbi_mode.isoc_ctl.num_bufs; i++) { + urb = dev->vbi_mode.isoc_ctl.urb[i]; + if (urb) { + if (!irqs_disabled()) + usb_kill_urb(urb); + else + usb_unlink_urb(urb); + + if (dev->vbi_mode.isoc_ctl.transfer_buffer[i]) { + + kfree(dev->vbi_mode.isoc_ctl.transfer_buffer[i]); + dev->vbi_mode.isoc_ctl.transfer_buffer[i] = NULL; + } + usb_free_urb(urb); + dev->vbi_mode.isoc_ctl.urb[i] = NULL; + } + dev->vbi_mode.isoc_ctl.transfer_buffer[i] = NULL; + } + + kfree(dev->vbi_mode.isoc_ctl.urb); + kfree(dev->vbi_mode.isoc_ctl.transfer_buffer); + + dev->vbi_mode.isoc_ctl.urb = NULL; + dev->vbi_mode.isoc_ctl.transfer_buffer = NULL; + dev->vbi_mode.isoc_ctl.num_bufs = 0; + + cx231xx_capture_start(dev, 0, Vbi); +} +EXPORT_SYMBOL_GPL(cx231xx_uninit_vbi_isoc); + +/* + * Allocate URBs and start IRQ + */ +int cx231xx_init_vbi_isoc(struct cx231xx *dev, int max_packets, + int num_bufs, int max_pkt_size, + int (*isoc_copy) (struct cx231xx *dev, struct urb *urb)) +{ + struct cx231xx_dmaqueue *dma_q = &dev->vbi_mode.vidq; + int i; + int sb_size, pipe; + struct urb *urb; + int rc; + + cx231xx_info(DRIVER_NAME "cx231xx: called cx231xx_prepare_isoc\n"); + + /* De-allocates all pending stuff */ + cx231xx_uninit_vbi_isoc(dev); + + /* clear if any halt */ + usb_clear_halt(dev->udev, usb_rcvbulkpipe(dev->udev, dev->vbi_mode.end_point_addr)); + + + dev->vbi_mode.isoc_ctl.isoc_copy = isoc_copy; + dev->vbi_mode.isoc_ctl.num_bufs = num_bufs; + dma_q->pos = 0; + dma_q->is_partial_line = 0; + dma_q->last_sav = 0; + dma_q->current_field = -1; + dma_q->bytes_left_in_line = dev->width << 1; + dma_q->lines_per_field = ((dev->norm & V4L2_STD_625_50) ? 
+ PAL_VBI_LINES : NTSC_VBI_LINES) ; + dma_q->lines_completed = 0; + for(i = 0; i < 8 ; i++) + dma_q->partial_buf[i] = 0; + + dev->vbi_mode.isoc_ctl.urb = kzalloc(sizeof(void *)*num_bufs, GFP_KERNEL); + if (!dev->vbi_mode.isoc_ctl.urb) { + cx231xx_errdev("cannot alloc memory for usb buffers\n"); + return -ENOMEM; + } + + dev->vbi_mode.isoc_ctl.transfer_buffer = kzalloc(sizeof(void *)*num_bufs, + GFP_KERNEL); + if (!dev->vbi_mode.isoc_ctl.transfer_buffer) { + cx231xx_errdev("cannot allocate memory for usbtransfer\n"); + kfree(dev->vbi_mode.isoc_ctl.urb); + return -ENOMEM; + } + + dev->vbi_mode.isoc_ctl.max_pkt_size = max_pkt_size; + dev->vbi_mode.isoc_ctl.buf = NULL; + + sb_size = max_packets * dev->vbi_mode.isoc_ctl.max_pkt_size; + + /* allocate urbs and transfer buffers */ + for (i = 0; i < dev->vbi_mode.isoc_ctl.num_bufs; i++) { + + urb = usb_alloc_urb(0, GFP_KERNEL); + if (!urb) { + cx231xx_err(DRIVER_NAME ": cannot alloc isoc_ctl.urb %i\n", i); + cx231xx_uninit_vbi_isoc(dev); + return -ENOMEM; + } + dev->vbi_mode.isoc_ctl.urb[i] = urb; + urb->transfer_flags = 0; + + dev->vbi_mode.isoc_ctl.transfer_buffer[i] = kzalloc(sb_size, GFP_KERNEL); + if (!dev->vbi_mode.isoc_ctl.transfer_buffer[i]) { + cx231xx_err(DRIVER_NAME ": unable to allocate %i bytes for transfer" + " buffer %i%s\n", + sb_size, i, + in_interrupt()?" while in int":""); + cx231xx_uninit_vbi_isoc(dev); + return -ENOMEM; + } + + pipe = usb_rcvbulkpipe(dev->udev, dev->vbi_mode.end_point_addr); + usb_fill_bulk_urb(urb, dev->udev, pipe, + dev->vbi_mode.isoc_ctl.transfer_buffer[i], sb_size, + cx231xx_irq_vbi_callback, dma_q); + } + + init_waitqueue_head(&dma_q->wq); + + /* submit urbs and enables IRQ */ + for (i = 0; i < dev->vbi_mode.isoc_ctl.num_bufs; i++) { + rc = usb_submit_urb(dev->vbi_mode.isoc_ctl.urb[i], GFP_ATOMIC); + if (rc) { + cx231xx_err(DRIVER_NAME ": submit of urb %i failed (error=%i)\n", i, + rc); + cx231xx_uninit_vbi_isoc(dev); + return rc; + } + } + + cx231xx_capture_start(dev, 1, Vbi); + + return 0; +} +EXPORT_SYMBOL_GPL(cx231xx_init_vbi_isoc); + + +u32 cx231xx_get_vbi_line(struct cx231xx *dev, struct cx231xx_dmaqueue *dma_q, + u8 sav_eav, u8 *p_buffer, u32 buffer_size) +{ + u32 bytes_copied = 0; + int current_field = -1; + + switch(sav_eav) { + + case SAV_VBI_FIELD1: + current_field = 1; + break; + + case SAV_VBI_FIELD2: + current_field = 2; + break; + default: + break; + } + + if(current_field < 0 ) + return bytes_copied; + + dma_q->last_sav = sav_eav; + + bytes_copied = cx231xx_copy_vbi_line(dev, dma_q, p_buffer, buffer_size, current_field); + + return bytes_copied; +} + +/* + * Announces that a buffer were filled and request the next + */ +static inline void vbi_buffer_filled(struct cx231xx *dev, + struct cx231xx_dmaqueue *dma_q, + struct cx231xx_buffer *buf) +{ + /* Advice that buffer was filled */ + /* cx231xx_info(DRIVER_NAME "[%p/%d] wakeup\n", buf, buf->vb.i); */ + + buf->vb.state = VIDEOBUF_DONE; + buf->vb.field_count++; + do_gettimeofday(&buf->vb.ts); + + dev->vbi_mode.isoc_ctl.buf = NULL; + + list_del(&buf->vb.queue); + wake_up(&buf->vb.done); +} + +u32 cx231xx_copy_vbi_line(struct cx231xx *dev, struct cx231xx_dmaqueue *dma_q, + u8 *p_line, u32 length, int field_number) +{ + u32 bytes_to_copy; + struct cx231xx_buffer *buf; + u32 _line_size = dev->width * 2; + + if( dma_q->current_field != field_number ) { + cx231xx_reset_vbi_buffer(dev, dma_q); + } + + /* get the buffer pointer */ + buf = dev->vbi_mode.isoc_ctl.buf; + + /* Remember the field number for next time */ + dma_q->current_field = 
field_number; + + bytes_to_copy = dma_q->bytes_left_in_line; + if(bytes_to_copy > length) + bytes_to_copy = length; + + if(dma_q->lines_completed >= dma_q->lines_per_field) { + dma_q->bytes_left_in_line -= bytes_to_copy; + dma_q->is_partial_line = (dma_q->bytes_left_in_line == 0) ? 0 : 1; + return 0; + } + + dma_q->is_partial_line = 1; + + /* If we don't have a buffer, just return the number of bytes we would + have copied if we had a buffer. */ + if(!buf) { + dma_q->bytes_left_in_line -= bytes_to_copy; + dma_q->is_partial_line = (dma_q->bytes_left_in_line == 0) ? 0 : 1; + return bytes_to_copy; + } + + /* copy the data to video buffer */ + cx231xx_do_vbi_copy(dev, dma_q, p_line, bytes_to_copy); + + dma_q->pos += bytes_to_copy; + dma_q->bytes_left_in_line -= bytes_to_copy; + + if(dma_q->bytes_left_in_line == 0) { + + dma_q->bytes_left_in_line = _line_size; + dma_q->lines_completed++; + dma_q->is_partial_line = 0; + + if(cx231xx_is_vbi_buffer_done(dev, dma_q) && buf ) { + + vbi_buffer_filled(dev, dma_q, buf); + + dma_q->pos = 0; + buf = NULL; + dma_q->lines_completed = 0; + } + } + + return bytes_to_copy; +} + +/* + * video-buf generic routine to get the next available buffer + */ +static inline void get_next_vbi_buf(struct cx231xx_dmaqueue *dma_q, + struct cx231xx_buffer **buf) +{ + struct cx231xx_video_mode *vmode = container_of(dma_q, struct cx231xx_video_mode, vidq); + struct cx231xx *dev = container_of(vmode, struct cx231xx, vbi_mode); + char *outp; + + if (list_empty(&dma_q->active)) { + cx231xx_err(DRIVER_NAME ": No active queue to serve\n"); + dev->vbi_mode.isoc_ctl.buf = NULL; + *buf = NULL; + return; + } + + /* Get the next buffer */ + *buf = list_entry(dma_q->active.next, struct cx231xx_buffer, vb.queue); + + /* Cleans up buffer - Usefull for testing for frame/URB loss */ + outp = videobuf_to_vmalloc(&(*buf)->vb); + memset(outp, 0, (*buf)->vb.size); + + dev->vbi_mode.isoc_ctl.buf = *buf; + + return; +} + + +void cx231xx_reset_vbi_buffer(struct cx231xx *dev, struct cx231xx_dmaqueue *dma_q) +{ + struct cx231xx_buffer *buf; + + buf = dev->vbi_mode.isoc_ctl.buf; + + if(buf == NULL) { + + /* first try to get the buffer */ + get_next_vbi_buf(dma_q, &buf); + + dma_q->pos = 0; + dma_q->current_field = -1; + } + + dma_q->bytes_left_in_line = dev->width << 1; + dma_q->lines_completed = 0; +} + +int cx231xx_do_vbi_copy(struct cx231xx *dev, struct cx231xx_dmaqueue *dma_q, + u8 *p_buffer, u32 bytes_to_copy) +{ + u8 *p_out_buffer = NULL; + u32 current_line_bytes_copied = 0; + struct cx231xx_buffer *buf; + u32 _line_size = dev->width << 1; + void *startwrite; + int offset, lencopy; + + buf = dev->vbi_mode.isoc_ctl.buf; + + if (buf == NULL) { + return -1; + } + + p_out_buffer = videobuf_to_vmalloc(&buf->vb); + + if(dma_q->bytes_left_in_line != _line_size ) { + current_line_bytes_copied = _line_size - dma_q->bytes_left_in_line; + } + + offset = ( dma_q->lines_completed * _line_size ) + current_line_bytes_copied; + + /* prepare destination address */ + startwrite = p_out_buffer + offset; + + lencopy = dma_q->bytes_left_in_line > bytes_to_copy ? bytes_to_copy : dma_q->bytes_left_in_line; + + memcpy(startwrite, p_buffer, lencopy); + + return 0; +} + + +u8 cx231xx_is_vbi_buffer_done(struct cx231xx *dev,struct cx231xx_dmaqueue *dma_q) +{ + u32 height = 0; + + height = ((dev->norm & V4L2_STD_625_50) ? 
+ PAL_VBI_LINES : NTSC_VBI_LINES) ; + return (dma_q->lines_completed == height)?1:0; +} diff --git a/drivers/media/video/cx231xx/cx231xx-vbi.h b/drivers/media/video/cx231xx/cx231xx-vbi.h new file mode 100644 index 000000000000..2a9e4a1668bf --- /dev/null +++ b/drivers/media/video/cx231xx/cx231xx-vbi.h @@ -0,0 +1,61 @@ +/* + cx231xx_vbi.h - driver for Conexant Cx23100/101/102 USB video capture devices + + Copyright (C) 2008 + Based on cx88 driver + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#ifndef _CX231XX_VBI_H +#define _CX231XX_VBI_H + +extern struct videobuf_queue_ops cx231xx_vbi_qops; + + +#define NTSC_VBI_START_LINE 10 /* line 10 - 21 */ +#define NTSC_VBI_END_LINE 21 +#define NTSC_VBI_LINES (NTSC_VBI_END_LINE - NTSC_VBI_START_LINE + 1) + +#define PAL_VBI_START_LINE 6 +#define PAL_VBI_END_LINE 23 +#define PAL_VBI_LINES (PAL_VBI_END_LINE - PAL_VBI_START_LINE + 1) + +#define VBI_STRIDE 1440 +#define VBI_SAMPLES_PER_LINE 1440 + +#define CX231XX_NUM_VBI_PACKETS 4 +#define CX231XX_NUM_VBI_BUFS 5 + +/* stream functions */ +int cx231xx_init_vbi_isoc(struct cx231xx *dev, int max_packets, + int num_bufs, int max_pkt_size, + int (*isoc_copy) (struct cx231xx *dev, struct urb *urb)); + +void cx231xx_uninit_vbi_isoc(struct cx231xx *dev); + +/* vbi data copy functions */ +u32 cx231xx_get_vbi_line(struct cx231xx *dev, struct cx231xx_dmaqueue *dma_q, + u8 sav_eav, u8 *p_buffer, u32 buffer_size); +u32 cx231xx_copy_vbi_line(struct cx231xx *dev, struct cx231xx_dmaqueue *dma_q, + u8 *p_line, u32 length, int field_number); +void cx231xx_reset_vbi_buffer(struct cx231xx *dev, struct cx231xx_dmaqueue *dma_q); + +int cx231xx_do_vbi_copy(struct cx231xx *dev, struct cx231xx_dmaqueue *dma_q, + u8 *p_buffer, u32 bytes_to_copy); + +u8 cx231xx_is_vbi_buffer_done(struct cx231xx *dev,struct cx231xx_dmaqueue *dma_q); + +#endif diff --git a/drivers/media/video/cx231xx/cx231xx-video.c b/drivers/media/video/cx231xx/cx231xx-video.c new file mode 100644 index 000000000000..3eb5626ead1d --- /dev/null +++ b/drivers/media/video/cx231xx/cx231xx-video.c @@ -0,0 +1,2440 @@ +/* + cx231xx-video.c - driver for Conexant Cx23100/101/102 USB video capture devices + + Copyright (C) 2008 + Based on em28xx driver + Based on cx23885 driver + Based on cx88 driver + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "dvb_frontend.h" + +#include "cx231xx.h" +#include "cx231xx-vbi.h" + + +#define DRIVER_AUTHOR "Srinivasa Deevi " +#define DRIVER_DESC "Conexant cx231xx based USB video device driver" + + +#define cx231xx_videodbg(fmt, arg...) do {\ + if (video_debug) \ + printk(KERN_INFO "%s %s :"fmt, \ + dev->name, __func__ , ##arg); } while (0) + +static unsigned int isoc_debug; +module_param(isoc_debug, int, 0644); +MODULE_PARM_DESC(isoc_debug, "enable debug messages [isoc transfers]"); + +#define cx231xx_isocdbg(fmt, arg...) \ +do {\ + if (isoc_debug) { \ + printk(KERN_INFO "%s %s :"fmt, \ + dev->name, __func__ , ##arg); \ + } \ + } while (0) + +MODULE_AUTHOR(DRIVER_AUTHOR); +MODULE_DESCRIPTION(DRIVER_DESC); +MODULE_LICENSE("GPL"); + + + +static unsigned int card[] = {[0 ... (CX231XX_MAXBOARDS - 1)] = UNSET }; +static unsigned int video_nr[] = {[0 ... (CX231XX_MAXBOARDS - 1)] = UNSET }; +static unsigned int vbi_nr[] = {[0 ... (CX231XX_MAXBOARDS - 1)] = UNSET }; +static unsigned int radio_nr[] = {[0 ... (CX231XX_MAXBOARDS - 1)] = UNSET }; + +module_param_array(card, int, NULL, 0444); +module_param_array(video_nr, int, NULL, 0444); +module_param_array(vbi_nr, int, NULL, 0444); +module_param_array(radio_nr, int, NULL, 0444); + +MODULE_PARM_DESC(card, "card type"); +MODULE_PARM_DESC(video_nr, "video device numbers"); +MODULE_PARM_DESC(vbi_nr, "vbi device numbers"); +MODULE_PARM_DESC(radio_nr, "radio device numbers"); + +static unsigned int video_debug; +module_param(video_debug, int, 0644); +MODULE_PARM_DESC(video_debug, "enable debug messages [video]"); + + + +/* supported video standards */ +static struct cx231xx_fmt format[] = { + { + .name = "16bpp YUY2, 4:2:2, packed", + .fourcc = V4L2_PIX_FMT_YUYV, + .depth = 16, + .reg = 0, + }, +}; + + +/* supported controls */ +/* Common to all boards */ + +/* ------------------------------------------------------------------- */ + +static const struct v4l2_queryctrl no_ctl = { + .name = "42", + .flags = V4L2_CTRL_FLAG_DISABLED, +}; + +static struct cx231xx_ctrl cx231xx_ctls[] = { + /* --- video --- */ + { + .v = { + .id = V4L2_CID_BRIGHTNESS, + .name = "Brightness", + .minimum = 0x00, + .maximum = 0xff, + .step = 1, + .default_value = 0x7f, + .type = V4L2_CTRL_TYPE_INTEGER, + }, + .off = 128, + .reg = LUMA_CTRL, + .mask = 0x00ff, + .shift = 0, + }, { + .v = { + .id = V4L2_CID_CONTRAST, + .name = "Contrast", + .minimum = 0, + .maximum = 0xff, + .step = 1, + .default_value = 0x3f, + .type = V4L2_CTRL_TYPE_INTEGER, + }, + .off = 0, + .reg = LUMA_CTRL, + .mask = 0xff00, + .shift = 8, + }, { + .v = { + .id = V4L2_CID_HUE, + .name = "Hue", + .minimum = 0, + .maximum = 0xff, + .step = 1, + .default_value = 0x7f, + .type = V4L2_CTRL_TYPE_INTEGER, + }, + .off = 128, + .reg = CHROMA_CTRL, + .mask = 0xff0000, + .shift = 16, + }, { + /* strictly, this only describes only U saturation. + * V saturation is handled specially through code. 
+ */ + .v = { + .id = V4L2_CID_SATURATION, + .name = "Saturation", + .minimum = 0, + .maximum = 0xff, + .step = 1, + .default_value = 0x7f, + .type = V4L2_CTRL_TYPE_INTEGER, + }, + .off = 0, + .reg = CHROMA_CTRL, + .mask = 0x00ff, + .shift = 0, + }, { + /* --- audio --- */ + .v = { + .id = V4L2_CID_AUDIO_MUTE, + .name = "Mute", + .minimum = 0, + .maximum = 1, + .default_value = 1, + .type = V4L2_CTRL_TYPE_BOOLEAN, + }, + .reg = PATH1_CTL1, + .mask = (0x1f << 24), + .shift = 24, + }, { + .v = { + .id = V4L2_CID_AUDIO_VOLUME, + .name = "Volume", + .minimum = 0, + .maximum = 0x3f, + .step = 1, + .default_value = 0x3f, + .type = V4L2_CTRL_TYPE_INTEGER, + }, + .reg = PATH1_VOL_CTL, + .mask = 0xff, + .shift = 0, + } +}; +static const int CX231XX_CTLS = ARRAY_SIZE(cx231xx_ctls); + +static const u32 cx231xx_user_ctrls[] = { + V4L2_CID_USER_CLASS, + V4L2_CID_BRIGHTNESS, + V4L2_CID_CONTRAST, + V4L2_CID_SATURATION, + V4L2_CID_HUE, + V4L2_CID_AUDIO_VOLUME, +#if 0 + V4L2_CID_AUDIO_BALANCE, +#endif + V4L2_CID_AUDIO_MUTE, + 0 +}; + +static const u32 *ctrl_classes[] = { + cx231xx_user_ctrls, + NULL +}; + + +/* ------------------------------------------------------------------ + Video buffer and parser functions + ------------------------------------------------------------------*/ + +/* + * Announces that a buffer were filled and request the next + */ +static inline void buffer_filled(struct cx231xx *dev, + struct cx231xx_dmaqueue *dma_q, + struct cx231xx_buffer *buf) +{ + /* Advice that buffer was filled */ + cx231xx_isocdbg("[%p/%d] wakeup\n", buf, buf->vb.i); + buf->vb.state = VIDEOBUF_DONE; + buf->vb.field_count++; + do_gettimeofday(&buf->vb.ts); + + dev->video_mode.isoc_ctl.buf = NULL; + + list_del(&buf->vb.queue); + wake_up(&buf->vb.done); +} + + +static inline void print_err_status(struct cx231xx *dev, + int packet, int status) +{ + char *errmsg = "Unknown"; + + switch (status) { + case -ENOENT: + errmsg = "unlinked synchronuously"; + break; + case -ECONNRESET: + errmsg = "unlinked asynchronuously"; + break; + case -ENOSR: + errmsg = "Buffer error (overrun)"; + break; + case -EPIPE: + errmsg = "Stalled (device not responding)"; + break; + case -EOVERFLOW: + errmsg = "Babble (bad cable?)"; + break; + case -EPROTO: + errmsg = "Bit-stuff error (bad cable?)"; + break; + case -EILSEQ: + errmsg = "CRC/Timeout (could be anything)"; + break; + case -ETIME: + errmsg = "Device does not respond"; + break; + } + if (packet < 0) { + cx231xx_isocdbg("URB status %d [%s].\n", status, errmsg); + } else { + cx231xx_isocdbg("URB packet %d, status %d [%s].\n", + packet, status, errmsg); + } +} + +/* + * video-buf generic routine to get the next available buffer + */ +static inline void get_next_buf(struct cx231xx_dmaqueue *dma_q, + struct cx231xx_buffer **buf) +{ + struct cx231xx_video_mode *vmode = container_of(dma_q, struct cx231xx_video_mode, vidq); + struct cx231xx *dev = container_of(vmode, struct cx231xx, video_mode); + + char *outp; + + + if (list_empty(&dma_q->active)) { + cx231xx_isocdbg("No active queue to serve\n"); + dev->video_mode.isoc_ctl.buf = NULL; + *buf = NULL; + return; + } + + /* Get the next buffer */ + *buf = list_entry(dma_q->active.next, struct cx231xx_buffer, vb.queue); + + /* Cleans up buffer - Usefull for testing for frame/URB loss */ + outp = videobuf_to_vmalloc(&(*buf)->vb); + memset(outp, 0, (*buf)->vb.size); + + dev->video_mode.isoc_ctl.buf = *buf; + + return; +} + +/* + * Controls the isoc copy of each urb packet + */ +static inline int cx231xx_isoc_copy(struct cx231xx *dev, struct urb 
*urb) +{ + struct cx231xx_buffer *buf; + struct cx231xx_dmaqueue *dma_q = urb->context; + unsigned char *outp = NULL; + int i, rc = 1; + unsigned char *p_buffer; + u32 bytes_parsed = 0, buffer_size = 0; + u8 sav_eav = 0; + + if (!dev) + return 0; + + if ((dev->state & DEV_DISCONNECTED) || (dev->state & DEV_MISCONFIGURED)) + return 0; + + if (urb->status < 0) { + print_err_status(dev, -1, urb->status); + if (urb->status == -ENOENT) + return 0; + } + + buf = dev->video_mode.isoc_ctl.buf; + if (buf != NULL) + outp = videobuf_to_vmalloc(&buf->vb); + + for (i = 0; i < urb->number_of_packets; i++) { + int status = urb->iso_frame_desc[i].status; + + if (status < 0) { + print_err_status(dev, i, status); + if (urb->iso_frame_desc[i].status != -EPROTO) + continue; + } + + if (urb->iso_frame_desc[i].actual_length <= 0) { + /* cx231xx_isocdbg("packet %d is empty",i); - spammy */ + continue; + } + if (urb->iso_frame_desc[i].actual_length > + dev->video_mode.max_pkt_size) { + cx231xx_isocdbg("packet bigger than packet size"); + continue; + } + + /* get buffer pointer and length */ + p_buffer = urb->transfer_buffer + urb->iso_frame_desc[i].offset; + buffer_size = urb->iso_frame_desc[i].actual_length; + bytes_parsed = 0; + + if(dma_q->is_partial_line) + { + /* Handle the case where we were working on a partial line */ + sav_eav = dma_q->last_sav; + } else { + /* Check for a SAV/EAV overlapping the buffer boundary */ + sav_eav = cx231xx_find_boundary_SAV_EAV(p_buffer, dma_q->partial_buf, &bytes_parsed); + } + + sav_eav &= 0xF0; + /* Get the first line if we have some portion of an SAV/EAV from the last buffer + or a partial line */ + if(sav_eav) { + bytes_parsed += cx231xx_get_video_line(dev, dma_q, + sav_eav, /* SAV/EAV */ + p_buffer + bytes_parsed, /* p_buffer */ + buffer_size - bytes_parsed); /* buffer size */ + } + + /* Now parse data that is completely in this buffer */ + /* dma_q->is_partial_line = 0; */ + + while(bytes_parsed < buffer_size) + { + u32 bytes_used = 0; + + sav_eav = cx231xx_find_next_SAV_EAV( + p_buffer + bytes_parsed, /* p_buffer */ + buffer_size - bytes_parsed, /* buffer size */ + &bytes_used); /* Receives bytes used to get SAV/EAV */ + + bytes_parsed += bytes_used; + + sav_eav &= 0xF0; + if(sav_eav && (bytes_parsed < buffer_size)) + { + bytes_parsed += cx231xx_get_video_line(dev, dma_q, + sav_eav, /* SAV/EAV */ + p_buffer + bytes_parsed, /* p_buffer */ + buffer_size - bytes_parsed); /* buffer size */ + } + } + + /* Save the last four bytes of the buffer so we can check the buffer boundary + condition next time */ + memcpy(dma_q->partial_buf, p_buffer + buffer_size - 4, 4); + bytes_parsed = 0; + + } + return rc; +} + +u8 cx231xx_find_boundary_SAV_EAV(u8 *p_buffer, u8 *partial_buf, u32 *p_bytes_used) +{ + u32 bytes_used; + u8 boundary_bytes[8]; + u8 sav_eav = 0; + + *p_bytes_used = 0; + + /* Create an array of the last 4 bytes of the last buffer and the first + 4 bytes of the current buffer. */ + + memcpy(boundary_bytes, partial_buf, 4); + memcpy(boundary_bytes + 4, p_buffer, 4); + + /* Check for the SAV/EAV in the boundary buffer */ + sav_eav = cx231xx_find_next_SAV_EAV((u8*)&boundary_bytes, 8, &bytes_used); + + if(sav_eav) { + /* found a boundary SAV/EAV. Updates the bytes used to reflect + only those used in the new buffer */ + *p_bytes_used = bytes_used - 4; + } + + return sav_eav; +} + +u8 cx231xx_find_next_SAV_EAV(u8 *p_buffer, u32 buffer_size, u32 *p_bytes_used) +{ + u32 i; + u8 sav_eav = 0; + + /* Don't search if the buffer size is less than 4. 
It causes a page fault since + buffer_size - 4 evaluates to a large number in that case. */ + if(buffer_size < 4) { + *p_bytes_used = buffer_size; + return 0; + } + + for(i = 0;i < (buffer_size - 3); i++) { + + if((p_buffer[i] == 0xFF) && + (p_buffer[i+1] == 0x00) && + (p_buffer[i+2] == 0x00)) { + + *p_bytes_used = i+4; + sav_eav = p_buffer[i+3]; + return sav_eav; + } + } + + *p_bytes_used = buffer_size; + return 0; +} + + + + +u32 cx231xx_get_video_line(struct cx231xx *dev, struct cx231xx_dmaqueue *dma_q, + u8 sav_eav, u8 *p_buffer, u32 buffer_size) +{ + u32 bytes_copied = 0; + int current_field = -1; + + + switch(sav_eav) { + case SAV_ACTIVE_VIDEO_FIELD1: + /* looking for skipped line which occurred in PAL 720x480 mode. In this case, + there will be no active data contained between the SAV and EAV */ + if ( (buffer_size > 3) && + (p_buffer[0] == 0xFF) && (p_buffer[1] == 0x00) && (p_buffer[2] == 0x00) && + ( (p_buffer[3] == EAV_ACTIVE_VIDEO_FIELD1) || (p_buffer[3] == EAV_ACTIVE_VIDEO_FIELD2) || + (p_buffer[3] == EAV_VBLANK_FIELD1) || (p_buffer[3] == EAV_VBLANK_FIELD2) + ) + ) + { + return bytes_copied; + } + current_field = 1; + break; + + case SAV_ACTIVE_VIDEO_FIELD2: + /* looking for skipped line which occurred in PAL 720x480 mode. In this case, + there will be no active data contained between the SAV and EAV */ + if ( (buffer_size > 3) && + (p_buffer[0] == 0xFF) && (p_buffer[1] == 0x00) && (p_buffer[2] == 0x00) && + ( (p_buffer[3] == EAV_ACTIVE_VIDEO_FIELD1) || (p_buffer[3] == EAV_ACTIVE_VIDEO_FIELD2) || + (p_buffer[3] == EAV_VBLANK_FIELD1) || (p_buffer[3] == EAV_VBLANK_FIELD2) + ) + ) + { + return bytes_copied; + } + current_field = 2; + break; + } + + dma_q->last_sav = sav_eav; + + bytes_copied = cx231xx_copy_video_line(dev, dma_q, p_buffer, buffer_size, current_field); + + return bytes_copied; +} + +u32 cx231xx_copy_video_line(struct cx231xx *dev, struct cx231xx_dmaqueue *dma_q, + u8 *p_line, u32 length, int field_number) +{ + u32 bytes_to_copy; + struct cx231xx_buffer *buf; + u32 _line_size = dev->width * 2; + + if( dma_q->current_field != field_number ) { + cx231xx_reset_video_buffer(dev, dma_q); + } + + /* get the buffer pointer */ + buf = dev->video_mode.isoc_ctl.buf; + + /* Remember the field number for next time */ + dma_q->current_field = field_number; + + bytes_to_copy = dma_q->bytes_left_in_line; + if(bytes_to_copy > length) + bytes_to_copy = length; + + + if(dma_q->lines_completed >= dma_q->lines_per_field) { + dma_q->bytes_left_in_line -= bytes_to_copy; + dma_q->is_partial_line = (dma_q->bytes_left_in_line == 0) ? 0 : 1; + return 0; + } + + dma_q->is_partial_line = 1; + + /* If we don't have a buffer, just return the number of bytes we would + have copied if we had a buffer. */ + if(!buf) + { + dma_q->bytes_left_in_line -= bytes_to_copy; + dma_q->is_partial_line = (dma_q->bytes_left_in_line == 0) ? 
0 : 1; + return bytes_to_copy; + } + + /* copy the data to video buffer */ + cx231xx_do_copy(dev, dma_q, p_line, bytes_to_copy); + + dma_q->pos += bytes_to_copy; + dma_q->bytes_left_in_line -= bytes_to_copy; + + if(dma_q->bytes_left_in_line == 0) { + + dma_q->bytes_left_in_line = _line_size; + dma_q->lines_completed++; + dma_q->is_partial_line = 0; + + if(cx231xx_is_buffer_done(dev, dma_q) && buf) { + + buffer_filled(dev, dma_q, buf); + + dma_q->pos = 0; + buf = NULL; + dma_q->lines_completed = 0; + } + } + + return bytes_to_copy; +} + +void cx231xx_reset_video_buffer(struct cx231xx *dev, struct cx231xx_dmaqueue *dma_q) +{ + struct cx231xx_buffer *buf; + + /* handle the switch from field 1 to field 2 */ + if(dma_q->current_field == 1) { + if(dma_q->lines_completed >= dma_q->lines_per_field ) { + dma_q->field1_done = 1; + } else { + dma_q->field1_done = 0; + } + } + + buf = dev->video_mode.isoc_ctl.buf; + + if(buf == NULL) { + u8* outp = NULL; + /* first try to get the buffer */ + get_next_buf(dma_q, &buf); + + if(buf) + outp = videobuf_to_vmalloc(&buf->vb); + + dma_q->pos = 0; + dma_q->field1_done = 0; + dma_q->current_field = -1; + } + + /* reset the counters */ + dma_q->bytes_left_in_line = dev->width << 1; + dma_q->lines_completed = 0; +} + +int cx231xx_do_copy(struct cx231xx *dev, struct cx231xx_dmaqueue *dma_q, + u8 *p_buffer, u32 bytes_to_copy) +{ + u8 *p_out_buffer = NULL; + u32 current_line_bytes_copied = 0; + struct cx231xx_buffer *buf; + u32 _line_size = dev->width << 1; + void *startwrite; + int offset, lencopy; + + buf = dev->video_mode.isoc_ctl.buf; + + if (buf == NULL) + return -1; + + p_out_buffer = videobuf_to_vmalloc(&buf->vb); + + current_line_bytes_copied = _line_size - dma_q->bytes_left_in_line; + + /* Offset field 2 one line from the top of the buffer */ + offset = (dma_q->current_field == 1)? 0: _line_size; + + /* Offset for field 2 */ + startwrite = p_out_buffer + offset; + + /* lines already completed in the current field */ + startwrite += (dma_q->lines_completed * _line_size * 2); + + /* bytes already completed in the current line */ + startwrite += current_line_bytes_copied; + + lencopy = dma_q->bytes_left_in_line > bytes_to_copy ? 
bytes_to_copy : dma_q->bytes_left_in_line; + + if( (u8*)(startwrite +lencopy) > (u8*)(p_out_buffer+ buf->vb.size) ) { + return 0; + } + + /* The below copies the UYVY data straight into video buffer */ + cx231xx_swab( (u16*)p_buffer, (u16*)startwrite, (u16)lencopy); + + return 0; +} + +void cx231xx_swab(u16 *from, u16 *to, u16 len) +{ + u16 i; + + if( len <= 0) + return; + + for(i = 0; i < len/2; i++) { + to[i] = (from[i] << 8) | (from[i] >> 8); + } +} + +u8 cx231xx_is_buffer_done(struct cx231xx *dev,struct cx231xx_dmaqueue *dma_q) +{ + u8 buffer_complete = 0; + + /* Dual field stream */ + buffer_complete = + ((dma_q->current_field == 2) && + (dma_q->lines_completed >= dma_q->lines_per_field) && + dma_q->field1_done); + + return buffer_complete; +} + + +/* ------------------------------------------------------------------ + Videobuf operations + ------------------------------------------------------------------*/ + +static int +buffer_setup(struct videobuf_queue *vq, unsigned int *count, unsigned int *size) +{ + struct cx231xx_fh *fh = vq->priv_data; + struct cx231xx *dev = fh->dev; + struct v4l2_frequency f; + + *size = ( fh->dev->width * fh->dev->height * dev->format->depth + 7) >> 3; + if (0 == *count) + *count = CX231XX_DEF_BUF; + + if (*count < CX231XX_MIN_BUF) + *count = CX231XX_MIN_BUF; + + /* Ask tuner to go to analog mode */ + memset(&f, 0, sizeof(f)); + f.frequency = dev->ctl_freq; + f.type = fh->radio ? V4L2_TUNER_RADIO : V4L2_TUNER_ANALOG_TV; + + cx231xx_i2c_call_clients(&dev->i2c_bus[1], VIDIOC_S_FREQUENCY, &f); + + return 0; +} + +/* This is called *without* dev->slock held; please keep it that way */ +static void free_buffer(struct videobuf_queue *vq, struct cx231xx_buffer *buf) +{ + struct cx231xx_fh *fh = vq->priv_data; + struct cx231xx *dev = fh->dev; + unsigned long flags = 0; + if (in_interrupt()) + BUG(); + + /* We used to wait for the buffer to finish here, but this didn't work + because, as we were keeping the state as VIDEOBUF_QUEUED, + videobuf_queue_cancel marked it as finished for us. + (Also, it could wedge forever if the hardware was misconfigured.) + + This should be safe; by the time we get here, the buffer isn't + queued anymore. If we ever start marking the buffers as + VIDEOBUF_ACTIVE, it won't be, though. 
+ */ + spin_lock_irqsave(&dev->video_mode.slock, flags); + if (dev->video_mode.isoc_ctl.buf == buf) + dev->video_mode.isoc_ctl.buf = NULL; + spin_unlock_irqrestore(&dev->video_mode.slock, flags); + + videobuf_vmalloc_free(&buf->vb); + buf->vb.state = VIDEOBUF_NEEDS_INIT; +} + +static int +buffer_prepare(struct videobuf_queue *vq, struct videobuf_buffer *vb, + enum v4l2_field field) +{ + struct cx231xx_fh *fh = vq->priv_data; + struct cx231xx_buffer *buf = container_of(vb, struct cx231xx_buffer, vb); + struct cx231xx *dev = fh->dev; + int rc = 0, urb_init = 0; + + /* The only currently supported format is 16 bits/pixel */ + buf->vb.size = (fh->dev->width * fh->dev->height * dev->format->depth + 7) >> 3; + + if (0 != buf->vb.baddr && buf->vb.bsize < buf->vb.size) + return -EINVAL; + + buf->vb.width = dev->width; + buf->vb.height = dev->height; + buf->vb.field = field; + + if (VIDEOBUF_NEEDS_INIT == buf->vb.state) { + rc = videobuf_iolock(vq, &buf->vb, NULL); + if (rc < 0) + goto fail; + } + + if (!dev->video_mode.isoc_ctl.num_bufs) + urb_init = 1; + + if (urb_init) { + rc = cx231xx_init_isoc(dev, CX231XX_NUM_PACKETS, + CX231XX_NUM_BUFS, dev->video_mode.max_pkt_size, + cx231xx_isoc_copy); + if (rc < 0) + goto fail; + } + + buf->vb.state = VIDEOBUF_PREPARED; + return 0; + +fail: + free_buffer(vq, buf); + return rc; +} + +static void +buffer_queue(struct videobuf_queue *vq, struct videobuf_buffer *vb) +{ + struct cx231xx_buffer *buf = container_of(vb, struct cx231xx_buffer, vb); + struct cx231xx_fh *fh = vq->priv_data; + struct cx231xx *dev = fh->dev; + struct cx231xx_dmaqueue *vidq = &dev->video_mode.vidq; + + buf->vb.state = VIDEOBUF_QUEUED; + list_add_tail(&buf->vb.queue, &vidq->active); + +} + +static void buffer_release(struct videobuf_queue *vq, + struct videobuf_buffer *vb) +{ + struct cx231xx_buffer *buf = container_of(vb, struct cx231xx_buffer, vb); + struct cx231xx_fh *fh = vq->priv_data; + struct cx231xx *dev = (struct cx231xx *)fh->dev; + + cx231xx_isocdbg("cx231xx: called buffer_release\n"); + + free_buffer(vq, buf); +} + +static struct videobuf_queue_ops cx231xx_video_qops = { + .buf_setup = buffer_setup, + .buf_prepare = buffer_prepare, + .buf_queue = buffer_queue, + .buf_release = buffer_release, +}; + +/********************* v4l2 interface **************************************/ + + +void video_mux(struct cx231xx *dev, int index) +{ + + struct v4l2_routing route; + + route.input = INPUT(index)->vmux; + route.output = 0; + dev->video_input = index; + dev->ctl_ainput = INPUT(index)->amux; + + cx231xx_set_video_input_mux(dev,index); + + cx231xx_i2c_call_clients(&dev->i2c_bus[0], VIDIOC_INT_S_VIDEO_ROUTING, &route); + + cx231xx_set_audio_input(dev, dev->ctl_ainput ); + + cx231xx_info("video_mux : %d\n", index); + + /* do mode control overrides if required */ + cx231xx_do_mode_ctrl_overrides(dev); +} + +/* Usage lock check functions */ +static int res_get(struct cx231xx_fh *fh) +{ + struct cx231xx *dev = fh->dev; + int rc = 0; + + /* This instance already has stream_on */ + if (fh->stream_on) + return rc; + + if(fh->type == V4L2_BUF_TYPE_VIDEO_CAPTURE) { + if (dev->stream_on) + return -EBUSY; + dev->stream_on = 1; + } else if(fh->type == V4L2_BUF_TYPE_VBI_CAPTURE) { + if (dev->vbi_stream_on) + return -EBUSY; + dev->vbi_stream_on = 1; + } else + return -EINVAL; + + fh->stream_on = 1; + + return rc; +} + +static int res_check(struct cx231xx_fh *fh) +{ + return (fh->stream_on); +} + +static void res_free(struct cx231xx_fh *fh) +{ + struct cx231xx *dev = fh->dev; + + fh->stream_on = 0; 
+ + if(fh->type == V4L2_BUF_TYPE_VIDEO_CAPTURE) + dev->stream_on = 0; + if(fh->type == V4L2_BUF_TYPE_VBI_CAPTURE) + dev->vbi_stream_on = 0; +} + +static int check_dev(struct cx231xx *dev) +{ + if (dev->state & DEV_DISCONNECTED) { + cx231xx_errdev("v4l2 ioctl: device not present\n"); + return -ENODEV; + } + + if (dev->state & DEV_MISCONFIGURED) { + cx231xx_errdev("v4l2 ioctl: device is misconfigured; " + "close and open it again\n"); + return -EIO; + } + return 0; +} + +void get_scale(struct cx231xx *dev, + unsigned int width, unsigned int height, + unsigned int *hscale, unsigned int *vscale) +{ + unsigned int maxw = norm_maxw(dev); + unsigned int maxh = norm_maxh(dev); + + *hscale = (((unsigned long)maxw) << 12) / width - 4096L; + if (*hscale >= 0x4000) + *hscale = 0x3fff; + + *vscale = (((unsigned long)maxh) << 12) / height - 4096L; + if (*vscale >= 0x4000) + *vscale = 0x3fff; + + dev->hscale = *hscale; + dev->vscale = *vscale; + +} + +/* ------------------------------------------------------------------ + IOCTL vidioc handling + ------------------------------------------------------------------*/ + +static int vidioc_g_fmt_vid_cap(struct file *file, void *priv, + struct v4l2_format *f) +{ + struct cx231xx_fh *fh = priv; + struct cx231xx *dev = fh->dev; + + mutex_lock(&dev->lock); + + f->fmt.pix.width = dev->width; + f->fmt.pix.height = dev->height; + f->fmt.pix.pixelformat = dev->format->fourcc;; + f->fmt.pix.bytesperline = (dev->width * dev->format->depth + 7) >> 3;; + f->fmt.pix.sizeimage = f->fmt.pix.bytesperline * dev->height; + f->fmt.pix.colorspace = V4L2_COLORSPACE_SMPTE170M; + + f->fmt.pix.field = V4L2_FIELD_INTERLACED; + + mutex_unlock(&dev->lock); + return 0; +} + +static struct cx231xx_fmt *format_by_fourcc(unsigned int fourcc) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(format); i++) + if (format[i].fourcc == fourcc) + return &format[i]; + + return NULL; +} + +static int vidioc_try_fmt_vid_cap(struct file *file, void *priv, + struct v4l2_format *f) +{ + struct cx231xx_fh *fh = priv; + struct cx231xx *dev = fh->dev; + int width = f->fmt.pix.width; + int height = f->fmt.pix.height; + unsigned int maxw = norm_maxw(dev); + unsigned int maxh = norm_maxh(dev); + unsigned int hscale, vscale; + struct cx231xx_fmt *fmt; + + fmt = format_by_fourcc(f->fmt.pix.pixelformat); + if (!fmt) { + cx231xx_videodbg("Fourcc format (%08x) invalid.\n", + f->fmt.pix.pixelformat); + return -EINVAL; + } + + /* width must even because of the YUYV format + height must be even because of interlacing */ + height &= 0xfffe; + width &= 0xfffe; + + if (unlikely(height < 32)) + height = 32; + if (unlikely(height > maxh)) + height = maxh; + if (unlikely(width < 48)) + width = 48; + if (unlikely(width > maxw)) + width = maxw; + + get_scale(dev, width, height, &hscale, &vscale); + + width = (((unsigned long)maxw) << 12) / (hscale + 4096L); + height = (((unsigned long)maxh) << 12) / (vscale + 4096L); + + f->fmt.pix.width = width; + f->fmt.pix.height = height; + f->fmt.pix.pixelformat = fmt->fourcc; + f->fmt.pix.bytesperline = (dev->width * fmt->depth + 7) >> 3; + f->fmt.pix.sizeimage = f->fmt.pix.bytesperline * height; + f->fmt.pix.colorspace = V4L2_COLORSPACE_SMPTE170M; + f->fmt.pix.field = V4L2_FIELD_INTERLACED; + + return 0; +} + +static int vidioc_s_fmt_vid_cap(struct file *file, void *priv, + struct v4l2_format *f) +{ + struct cx231xx_fh *fh = priv; + struct cx231xx *dev = fh->dev; + int rc; + struct cx231xx_fmt *fmt; + + rc = check_dev(dev); + if (rc < 0) + return rc; + + + mutex_lock(&dev->lock); + 
+ vidioc_try_fmt_vid_cap(file, priv, f); + + fmt = format_by_fourcc(f->fmt.pix.pixelformat); + if (!fmt) { + rc = -EINVAL; + goto out; + } + + if (videobuf_queue_is_busy(&fh->vb_vidq)) { + cx231xx_errdev("%s queue busy\n", __func__); + rc = -EBUSY; + goto out; + } + + if (dev->stream_on && !fh->stream_on) { + cx231xx_errdev("%s device in use by another fh\n", __func__); + rc = -EBUSY; + goto out; + } + + /* set new image size */ + dev->width = f->fmt.pix.width; + dev->height = f->fmt.pix.height; + dev->format = fmt; + get_scale(dev, dev->width, dev->height, &dev->hscale, &dev->vscale); + + cx231xx_i2c_call_clients(&dev->i2c_bus[0], VIDIOC_S_FMT, f); + + /* Set the correct alternate setting for this resolution */ + cx231xx_resolution_set(dev); + +out: + mutex_unlock(&dev->lock); + return rc; +} + +static int vidioc_g_std(struct file *file, void *priv, v4l2_std_id *id) +{ + struct cx231xx_fh *fh = priv; + struct cx231xx *dev = fh->dev; + + *id = dev->norm; + return 0; +} + +static int vidioc_s_std(struct file *file, void *priv, v4l2_std_id * norm) +{ + struct cx231xx_fh *fh = priv; + struct cx231xx *dev = fh->dev; + struct v4l2_format f; + int rc; + + rc = check_dev(dev); + if (rc < 0) + return rc; + + cx231xx_info("vidioc_s_std : 0x%x\n", (unsigned int)*norm); + + mutex_lock(&dev->lock); + dev->norm = *norm; + + + /* Adjusts width/height, if needed */ + f.fmt.pix.width = dev->width; + f.fmt.pix.height = dev->height; + vidioc_try_fmt_vid_cap(file, priv, &f); + + /* set new image size */ + dev->width = f.fmt.pix.width; + dev->height = f.fmt.pix.height; + get_scale(dev, dev->width, dev->height, &dev->hscale, &dev->vscale); + + cx231xx_i2c_call_clients(&dev->i2c_bus[0], VIDIOC_S_STD, &dev->norm); + + mutex_unlock(&dev->lock); + + cx231xx_resolution_set(dev); + + /* do mode control overrides */ + cx231xx_do_mode_ctrl_overrides(dev); + + return 0; +} + +static const char *iname[] = { + [CX231XX_VMUX_COMPOSITE1] = "Composite1", + [CX231XX_VMUX_SVIDEO] = "S-Video", + [CX231XX_VMUX_TELEVISION] = "Television", + [CX231XX_VMUX_CABLE] = "Cable TV", + [CX231XX_VMUX_DVB] = "DVB", + [CX231XX_VMUX_DEBUG] = "for debug only", +}; + +static int vidioc_enum_input(struct file *file, void *priv, + struct v4l2_input *i) +{ + struct cx231xx_fh *fh = priv; + struct cx231xx *dev = fh->dev; + unsigned int n; + + n = i->index; + if (n >= MAX_CX231XX_INPUT) + return -EINVAL; + if (0 == INPUT(n)->type) + return -EINVAL; + + i->index = n; + i->type = V4L2_INPUT_TYPE_CAMERA; + + strcpy(i->name, iname[INPUT(n)->type]); + + if ((CX231XX_VMUX_TELEVISION == INPUT(n)->type) || + (CX231XX_VMUX_CABLE == INPUT(n)->type)) + i->type = V4L2_INPUT_TYPE_TUNER; + + i->std = dev->vdev->tvnorms; + + return 0; +} + +static int vidioc_g_input(struct file *file, void *priv, unsigned int *i) +{ + struct cx231xx_fh *fh = priv; + struct cx231xx *dev = fh->dev; + + *i = dev->video_input; + + return 0; +} + +static int vidioc_s_input(struct file *file, void *priv, unsigned int i) +{ + struct cx231xx_fh *fh = priv; + struct cx231xx *dev = fh->dev; + int rc; + + rc = check_dev(dev); + if (rc < 0) + return rc; + + if (i >= MAX_CX231XX_INPUT) + return -EINVAL; + if (0 == INPUT(i)->type) + return -EINVAL; + + mutex_lock(&dev->lock); + + video_mux(dev, i); + + mutex_unlock(&dev->lock); + return 0; +} + +static int vidioc_g_audio(struct file *file, void *priv, struct v4l2_audio *a) +{ + struct cx231xx_fh *fh = priv; + struct cx231xx *dev = fh->dev; + + switch (a->index) { + case CX231XX_AMUX_VIDEO: + strcpy(a->name, "Television"); + break; + case 
CX231XX_AMUX_LINE_IN: + strcpy(a->name, "Line In"); + break; + default: + return -EINVAL; + } + + a->index = dev->ctl_ainput; + a->capability = V4L2_AUDCAP_STEREO; + + return 0; +} + +static int vidioc_s_audio(struct file *file, void *priv, struct v4l2_audio *a) +{ + struct cx231xx_fh *fh = priv; + struct cx231xx *dev = fh->dev; + int status = 0; + + + /* Doesn't allow manual routing */ + if (a->index != dev->ctl_ainput) + return -EINVAL; + + dev->ctl_ainput = INPUT(a->index)->amux; + status = cx231xx_set_audio_input(dev, dev->ctl_ainput); + + return status; +} + +static int vidioc_queryctrl(struct file *file, void *priv, + struct v4l2_queryctrl *qc) +{ + struct cx231xx_fh *fh = priv; + struct cx231xx *dev = fh->dev; + int id = qc->id; + int i; + int rc; + + rc = check_dev(dev); + if (rc < 0) + return rc; + + qc->id = v4l2_ctrl_next(ctrl_classes, qc->id); + if (unlikely(qc->id == 0)) + return -EINVAL; + + memset(qc, 0, sizeof(*qc)); + + qc->id = id; + + if (qc->id < V4L2_CID_BASE || + qc->id >= V4L2_CID_LASTP1) + return -EINVAL; + + for (i = 0; i < CX231XX_CTLS; i++) + if (cx231xx_ctls[i].v.id == qc->id) + break; + + if (i == CX231XX_CTLS) { + *qc = no_ctl; + return 0; + } + *qc = cx231xx_ctls[i].v; + + mutex_lock(&dev->lock); + cx231xx_i2c_call_clients(&dev->i2c_bus[0], VIDIOC_QUERYCTRL, qc); + mutex_unlock(&dev->lock); + + if (qc->type) + return 0; + else + return -EINVAL; +} + +static int vidioc_g_ctrl(struct file *file, void *priv, + struct v4l2_control *ctrl) +{ + struct cx231xx_fh *fh = priv; + struct cx231xx *dev = fh->dev; + int rc; + + rc = check_dev(dev); + if (rc < 0) + return rc; + + mutex_lock(&dev->lock); + + cx231xx_i2c_call_clients(&dev->i2c_bus[0], VIDIOC_G_CTRL, ctrl); + + mutex_unlock(&dev->lock); + return rc; +} + +static int vidioc_s_ctrl(struct file *file, void *priv, + struct v4l2_control *ctrl) +{ + struct cx231xx_fh *fh = priv; + struct cx231xx *dev = fh->dev; + int rc; + + rc = check_dev(dev); + if (rc < 0) + return rc; + + mutex_lock(&dev->lock); + + cx231xx_i2c_call_clients(&dev->i2c_bus[0], VIDIOC_S_CTRL, ctrl); + + mutex_unlock(&dev->lock); + return rc; +} + +static int vidioc_g_tuner(struct file *file, void *priv, + struct v4l2_tuner *t) +{ + struct cx231xx_fh *fh = priv; + struct cx231xx *dev = fh->dev; + int rc; + + rc = check_dev(dev); + if (rc < 0) + return rc; + + if (0 != t->index) + return -EINVAL; + + strcpy(t->name, "Tuner"); + + t->type = V4L2_TUNER_ANALOG_TV; + t->capability = V4L2_TUNER_CAP_NORM; + t->rangehigh = 0xffffffffUL; + t->signal = 0xffff ; /* LOCKED */ + + return 0; +} + +static int vidioc_s_tuner(struct file *file, void *priv, + struct v4l2_tuner *t) +{ + struct cx231xx_fh *fh = priv; + struct cx231xx *dev = fh->dev; + int rc; + + rc = check_dev(dev); + if (rc < 0) + return rc; + + if (0 != t->index) + return -EINVAL; +#if 0 + mutex_lock(&dev->lock); + + cx231xx_i2c_call_clients(&dev->i2c_bus[1], VIDIOC_S_TUNER, t); + + mutex_unlock(&dev->lock); +#endif + return 0; +} + +static int vidioc_g_frequency(struct file *file, void *priv, + struct v4l2_frequency *f) +{ + struct cx231xx_fh *fh = priv; + struct cx231xx *dev = fh->dev; + + mutex_lock(&dev->lock); + f->type = fh->radio ? 
V4L2_TUNER_RADIO : V4L2_TUNER_ANALOG_TV; + f->frequency = dev->ctl_freq; + + cx231xx_i2c_call_clients(&dev->i2c_bus[1], VIDIOC_G_FREQUENCY, f); + + mutex_unlock(&dev->lock); + + return 0; +} + +static int vidioc_s_frequency(struct file *file, void *priv, + struct v4l2_frequency *f) +{ + struct cx231xx_fh *fh = priv; + struct cx231xx *dev = fh->dev; + int rc; + + rc = check_dev(dev); + if (rc < 0) + return rc; + + if (0 != f->tuner) + return -EINVAL; + + if (unlikely(0 == fh->radio && f->type != V4L2_TUNER_ANALOG_TV)) + return -EINVAL; + if (unlikely(1 == fh->radio && f->type != V4L2_TUNER_RADIO)) + return -EINVAL; + + /* set pre channel change settings in DIF first */ + rc = cx231xx_tuner_pre_channel_change(dev); + + mutex_lock(&dev->lock); + + dev->ctl_freq = f->frequency; + + if(dev->tuner_type == TUNER_XC5000) { + if( dev->cx231xx_set_analog_freq != NULL ) { + dev->cx231xx_set_analog_freq(dev, f->frequency ); + } + } else { + cx231xx_i2c_call_clients(&dev->i2c_bus[1], VIDIOC_S_FREQUENCY, f); + } + + mutex_unlock(&dev->lock); + + /* set post channel change settings in DIF first */ + rc = cx231xx_tuner_post_channel_change(dev); + + cx231xx_info("Set New FREQUENCY to %d\n",f->frequency); + + return rc; +} + +#ifdef CONFIG_VIDEO_ADV_DEBUG + + +/* + -R, --list-registers=type=,chip=[,min=,max=] + dump registers from to [VIDIOC_DBG_G_REGISTER] + -r, --set-register=type=,chip=,reg=,val= + set the register [VIDIOC_DBG_S_REGISTER] + + if type == host, then is the hosts chip ID (default 0) + if type == i2cdrv (default), then is the I2C driver name or ID + if type == i2caddr, then is the 7-bit I2C address +*/ + + +static int vidioc_g_register(struct file *file, void *priv, + struct v4l2_dbg_register *reg) +{ + struct cx231xx_fh *fh = priv; + struct cx231xx *dev = fh->dev; + int ret = 0; + u8 value[4] ={0,0,0,0}; + u32 data = 0; + + switch (reg->match.type) { + case V4L2_CHIP_MATCH_HOST: + switch(reg->match.addr) { + case 0: /* Cx231xx - internal registers */ + ret = cx231xx_read_ctrl_reg(dev,VRT_GET_REGISTER, (u16) reg->reg, value, 4); + reg->val = value[0] | value[1] << 8 | value[2] << 16 | value[3] << 24; + break; + case 1: /* Colibri - read byte */ + ret = cx231xx_read_i2c_data(dev, Colibri_DEVICE_ADDRESS, (u16) reg->reg, 2, &data, 1); + reg->val = le32_to_cpu(data & 0xff); + break; + case 14: /* Colibri - read dword */ + ret = cx231xx_read_i2c_data(dev, Colibri_DEVICE_ADDRESS, (u16) reg->reg, 2, &data, 4); + reg->val = le32_to_cpu(data); + break; + case 2: /* Hammerhead - read byte */ + ret = cx231xx_read_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, (u16) reg->reg, 2, &data, 1); + reg->val = le32_to_cpu(data & 0xff); + break; + case 24: /* Hammerhead - read dword */ + ret = cx231xx_read_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, (u16) reg->reg, 2, &data, 4); + reg->val = le32_to_cpu(data); + break; + case 3: /* flatiron - read byte */ + ret = cx231xx_read_i2c_data(dev, Flatrion_DEVICE_ADDRESS, (u16) reg->reg, 1, &data, 1); + reg->val = le32_to_cpu(data & 0xff); + break; + case 34: /* flatiron - read dword */ + ret = cx231xx_read_i2c_data(dev, Flatrion_DEVICE_ADDRESS, (u16) reg->reg, 1, &data, 4); + reg->val = le32_to_cpu(data); + break; + } + return ret < 0?ret:0; + + case V4L2_CHIP_MATCH_I2C_DRIVER: + cx231xx_i2c_call_clients(&dev->i2c_bus[0], VIDIOC_DBG_G_REGISTER, reg); + return 0; + case V4L2_CHIP_MATCH_I2C_ADDR: + /* Not supported yet */ + return -EINVAL; + default: + if (!v4l2_chip_match_host(®->match)) + return -EINVAL; + } + + + mutex_lock(&dev->lock); + + cx231xx_i2c_call_clients(&dev->i2c_bus[0], 
VIDIOC_DBG_G_REGISTER, reg); + + mutex_unlock(&dev->lock); + + return ret; +} + +static int vidioc_s_register(struct file *file, void *priv, + struct v4l2_dbg_register *reg) +{ + struct cx231xx_fh *fh = priv; + struct cx231xx *dev = fh->dev; + int ret = 0; + __le64 buf; + u32 value; + u8 data[4] ={0,0,0,0}; + + buf = cpu_to_le64(reg->val); + + switch (reg->match.type) { + case V4L2_CHIP_MATCH_HOST: + { + value = (u32) buf & 0xffffffff; + + switch(reg->match.addr) { + case 0: /* cx231xx internal registers */ + data[0]=(u8)value; + data[1]=(u8)(value>>8); + data[2]=(u8)(value>>16); + data[3]=(u8)(value>>24); + ret = cx231xx_write_ctrl_reg(dev,VRT_SET_REGISTER, (u16) reg->reg, data, 4); + break; + case 1: /* Colibri - read byte */ + ret = cx231xx_write_i2c_data(dev, Colibri_DEVICE_ADDRESS, (u16) reg->reg, 2, value, 1); + break; + case 14: /* Colibri - read dword */ + ret = cx231xx_write_i2c_data(dev, Colibri_DEVICE_ADDRESS, (u16) reg->reg, 2, value, 4); + break; + case 2: /* Hammerhead - read byte */ + ret = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, (u16) reg->reg, 2, value, 1); + break; + case 24: /* Hammerhead - read dword */ + ret = cx231xx_write_i2c_data(dev, HAMMERHEAD_I2C_ADDRESS, (u16) reg->reg, 2, value, 4); + break; + case 3: /* flatiron - read byte */ + ret = cx231xx_write_i2c_data(dev, Flatrion_DEVICE_ADDRESS, (u16) reg->reg, 1, value, 1); + break; + case 34: /* flatiron - read dword */ + ret = cx231xx_write_i2c_data(dev, Flatrion_DEVICE_ADDRESS, (u16) reg->reg, 1, value, 4); + break; + } + } + return ret < 0?ret:0; + + default: + break; + } + + mutex_lock(&dev->lock); + + cx231xx_i2c_call_clients(&dev->i2c_bus[0], VIDIOC_DBG_S_REGISTER, reg); + + mutex_unlock(&dev->lock); + + return ret; +} +#endif + + +static int vidioc_cropcap(struct file *file, void *priv, + struct v4l2_cropcap *cc) +{ + struct cx231xx_fh *fh = priv; + struct cx231xx *dev = fh->dev; + + if (cc->type != V4L2_BUF_TYPE_VIDEO_CAPTURE) + return -EINVAL; + + cc->bounds.left = 0; + cc->bounds.top = 0; + cc->bounds.width = dev->width; + cc->bounds.height = dev->height; + cc->defrect = cc->bounds; + cc->pixelaspect.numerator = 54; /* 4:3 FIXME: remove magic numbers */ + cc->pixelaspect.denominator = 59; + + return 0; +} + +static int vidioc_streamon(struct file *file, void *priv, + enum v4l2_buf_type type) +{ + struct cx231xx_fh *fh = priv; + struct cx231xx *dev = fh->dev; + int rc; + + rc = check_dev(dev); + if (rc < 0) + return rc; + + mutex_lock(&dev->lock); + rc = res_get(fh); + + if (likely(rc >= 0)) + rc = videobuf_streamon(&fh->vb_vidq); + + mutex_unlock(&dev->lock); + + return rc; +} + +static int vidioc_streamoff(struct file *file, void *priv, + enum v4l2_buf_type type) +{ + struct cx231xx_fh *fh = priv; + struct cx231xx *dev = fh->dev; + int rc; + + rc = check_dev(dev); + if (rc < 0) + return rc; + + if ( (fh->type != V4L2_BUF_TYPE_VIDEO_CAPTURE) || + (fh->type != V4L2_BUF_TYPE_VBI_CAPTURE) ) + return -EINVAL; + if (type != fh->type) + return -EINVAL; + + mutex_lock(&dev->lock); + + videobuf_streamoff(&fh->vb_vidq); + res_free(fh); + + mutex_unlock(&dev->lock); + + return 0; +} + +static int vidioc_querycap(struct file *file, void *priv, + struct v4l2_capability *cap) +{ + struct cx231xx_fh *fh = priv; + struct cx231xx *dev = fh->dev; + + strlcpy(cap->driver, "cx231xx", sizeof(cap->driver)); + strlcpy(cap->card, cx231xx_boards[dev->model].name, sizeof(cap->card)); + strlcpy(cap->bus_info, dev_name(&dev->udev->dev), sizeof(cap->bus_info)); + + cap->version = CX231XX_VERSION_CODE; + + cap->capabilities 
= + V4L2_CAP_VBI_CAPTURE | +#if 0 + V4L2_CAP_SLICED_VBI_CAPTURE | +#endif + V4L2_CAP_VIDEO_CAPTURE | + V4L2_CAP_AUDIO | + V4L2_CAP_READWRITE | V4L2_CAP_STREAMING; + + if (dev->tuner_type != TUNER_ABSENT) + cap->capabilities |= V4L2_CAP_TUNER; + + return 0; +} + +static int vidioc_enum_fmt_vid_cap(struct file *file, void *priv, + struct v4l2_fmtdesc *f) +{ + if (unlikely(f->index >= ARRAY_SIZE(format))) + return -EINVAL; + + strlcpy(f->description, format[f->index].name, sizeof(f->description)); + f->pixelformat = format[f->index].fourcc; + + return 0; +} + +/* Sliced VBI ioctls */ +static int vidioc_g_fmt_sliced_vbi_cap(struct file *file, void *priv, + struct v4l2_format *f) +{ + struct cx231xx_fh *fh = priv; + struct cx231xx *dev = fh->dev; + int rc; + + rc = check_dev(dev); + if (rc < 0) + return rc; + + mutex_lock(&dev->lock); + + f->fmt.sliced.service_set = 0; + + cx231xx_i2c_call_clients(&dev->i2c_bus[0], VIDIOC_G_FMT, f); + + if (f->fmt.sliced.service_set == 0) + rc = -EINVAL; + + mutex_unlock(&dev->lock); + return rc; +} + +static int vidioc_try_set_sliced_vbi_cap(struct file *file, void *priv, + struct v4l2_format *f) +{ + struct cx231xx_fh *fh = priv; + struct cx231xx *dev = fh->dev; + int rc; + + rc = check_dev(dev); + if (rc < 0) + return rc; + + mutex_lock(&dev->lock); + cx231xx_i2c_call_clients(&dev->i2c_bus[0], VIDIOC_G_FMT, f); + mutex_unlock(&dev->lock); + + if (f->fmt.sliced.service_set == 0) + return -EINVAL; + + return 0; +} + + +/* RAW VBI ioctls */ + +static int vidioc_g_fmt_vbi_cap(struct file *file, void *priv, + struct v4l2_format *f) +{ + struct cx231xx_fh *fh = priv; + struct cx231xx *dev = fh->dev; + + f->fmt.vbi.sampling_rate = (dev->norm & V4L2_STD_625_50) ? + 35468950:28636363; + f->fmt.vbi.samples_per_line = VBI_LINE_LENGTH; + f->fmt.vbi.sample_format = V4L2_PIX_FMT_GREY; + f->fmt.vbi.offset = 64 * 4; + f->fmt.vbi.start[0] = (dev->norm & V4L2_STD_625_50) ? + PAL_VBI_START_LINE : NTSC_VBI_START_LINE; + f->fmt.vbi.count[0] = (dev->norm & V4L2_STD_625_50) ? + PAL_VBI_LINES : NTSC_VBI_LINES; + f->fmt.vbi.start[1] = (dev->norm & V4L2_STD_625_50) ? + PAL_VBI_START_LINE+312 : NTSC_VBI_START_LINE + 263; + f->fmt.vbi.count[1] = f->fmt.vbi.count[0]; + + return 0; + +} + +static int vidioc_try_fmt_vbi_cap(struct file *file, void *priv, + struct v4l2_format *f) +{ + struct cx231xx_fh *fh = priv; + struct cx231xx *dev = fh->dev; + + if (dev->vbi_stream_on && !fh->stream_on) { + cx231xx_errdev("%s device in use by another fh\n", __func__); + return -EBUSY; + } + + f->type = V4L2_BUF_TYPE_VBI_CAPTURE; + f->fmt.vbi.sampling_rate = (dev->norm & V4L2_STD_625_50) ? + 35468950:28636363; + f->fmt.vbi.samples_per_line = VBI_LINE_LENGTH; + f->fmt.vbi.sample_format = V4L2_PIX_FMT_GREY; + f->fmt.vbi.offset = 244; + f->fmt.vbi.flags = 0; + f->fmt.vbi.start[0] = (dev->norm & V4L2_STD_625_50) ? + PAL_VBI_START_LINE : NTSC_VBI_START_LINE; + f->fmt.vbi.count[0] = (dev->norm & V4L2_STD_625_50) ? + PAL_VBI_LINES : NTSC_VBI_LINES; + f->fmt.vbi.start[1] = (dev->norm & V4L2_STD_625_50) ? 
+ PAL_VBI_START_LINE+312 : NTSC_VBI_START_LINE + 263; + f->fmt.vbi.count[1] = f->fmt.vbi.count[0]; + + return 0; + +} + +static int vidioc_reqbufs(struct file *file, void *priv, + struct v4l2_requestbuffers *rb) +{ + struct cx231xx_fh *fh = priv; + struct cx231xx *dev = fh->dev; + int rc; + + rc = check_dev(dev); + if (rc < 0) + return rc; + + return (videobuf_reqbufs(&fh->vb_vidq, rb)); +} + +static int vidioc_querybuf(struct file *file, void *priv, + struct v4l2_buffer *b) +{ + struct cx231xx_fh *fh = priv; + struct cx231xx *dev = fh->dev; + int rc; + + rc = check_dev(dev); + if (rc < 0) + return rc; + + return (videobuf_querybuf(&fh->vb_vidq, b)); +} + +static int vidioc_qbuf(struct file *file, void *priv, struct v4l2_buffer *b) +{ + struct cx231xx_fh *fh = priv; + struct cx231xx *dev = fh->dev; + int rc; + + rc = check_dev(dev); + if (rc < 0) + return rc; + + return (videobuf_qbuf(&fh->vb_vidq, b)); +} + +static int vidioc_dqbuf(struct file *file, void *priv, struct v4l2_buffer *b) +{ + struct cx231xx_fh *fh = priv; + struct cx231xx *dev = fh->dev; + int rc; + + rc = check_dev(dev); + if (rc < 0) + return rc; + + return (videobuf_dqbuf(&fh->vb_vidq, b, + file->f_flags & O_NONBLOCK)); +} + +#ifdef CONFIG_VIDEO_V4L1_COMPAT +static int vidiocgmbuf(struct file *file, void *priv, struct video_mbuf *mbuf) +{ + struct cx231xx_fh *fh = priv; + + return videobuf_cgmbuf(&fh->vb_vidq, mbuf, 8); +} +#endif + + +/* ----------------------------------------------------------- */ +/* RADIO ESPECIFIC IOCTLS */ +/* ----------------------------------------------------------- */ + +static int radio_querycap(struct file *file, void *priv, + struct v4l2_capability *cap) +{ + struct cx231xx *dev = ((struct cx231xx_fh *)priv)->dev; + + strlcpy(cap->driver, "cx231xx", sizeof(cap->driver)); + strlcpy(cap->card, cx231xx_boards[dev->model].name, sizeof(cap->card)); + usb_make_path(dev->udev, cap->bus_info, sizeof(cap->bus_info)); + + cap->version = CX231XX_VERSION_CODE; + cap->capabilities = V4L2_CAP_TUNER; + return 0; +} + +static int radio_g_tuner(struct file *file, void *priv, + struct v4l2_tuner *t) +{ + struct cx231xx *dev = ((struct cx231xx_fh *)priv)->dev; + + if (unlikely(t->index > 0)) + return -EINVAL; + + strcpy(t->name, "Radio"); + t->type = V4L2_TUNER_RADIO; + + mutex_lock(&dev->lock); + cx231xx_i2c_call_clients(&dev->i2c_bus[1], VIDIOC_G_TUNER, t); + mutex_unlock(&dev->lock); + + return 0; +} + +static int radio_enum_input(struct file *file, void *priv, + struct v4l2_input *i) +{ + if (i->index != 0) + return -EINVAL; + strcpy(i->name, "Radio"); + i->type = V4L2_INPUT_TYPE_TUNER; + + return 0; +} + +static int radio_g_audio(struct file *file, void *priv, struct v4l2_audio *a) +{ + if (unlikely(a->index)) + return -EINVAL; + + strcpy(a->name, "Radio"); + return 0; +} + +static int radio_s_tuner(struct file *file, void *priv, + struct v4l2_tuner *t) +{ + struct cx231xx *dev = ((struct cx231xx_fh *)priv)->dev; + + if (0 != t->index) + return -EINVAL; + + mutex_lock(&dev->lock); + cx231xx_i2c_call_clients(&dev->i2c_bus[1], VIDIOC_S_TUNER, t); + mutex_unlock(&dev->lock); + + return 0; +} + +static int radio_s_audio(struct file *file, void *fh, + struct v4l2_audio *a) +{ + return 0; +} + +static int radio_s_input(struct file *file, void *fh, unsigned int i) +{ + return 0; +} + +static int radio_queryctrl(struct file *file, void *priv, + struct v4l2_queryctrl *c) +{ + int i; + + if (c->id < V4L2_CID_BASE || + c->id >= V4L2_CID_LASTP1) + return -EINVAL; + if (c->id == V4L2_CID_AUDIO_MUTE) { + for (i = 0; i 
< CX231XX_CTLS; i++) + if (cx231xx_ctls[i].v.id == c->id) + break; + *c = cx231xx_ctls[i].v; + } else + *c = no_ctl; + return 0; +} + +/* + * cx231xx_v4l2_open() + * inits the device and starts isoc transfer + */ +static int cx231xx_v4l2_open(struct file *filp) +{ + int minor = video_devdata(filp)->minor; + int errCode = 0, radio = 0; + struct cx231xx *dev = NULL; + struct cx231xx_fh *fh; + enum v4l2_buf_type fh_type = 0; + + dev = cx231xx_get_device(minor, &fh_type, &radio); + if (NULL == dev) + return -ENODEV; + + mutex_lock(&dev->lock); + + cx231xx_videodbg("open minor=%d type=%s users=%d\n", + minor, v4l2_type_names[fh_type], dev->users); + +#if 0 + errCode = cx231xx_set_mode(dev, CX231XX_ANALOG_MODE); + if (errCode < 0) { + cx231xx_errdev("Device locked on digital mode. Can't open analog\n"); + mutex_unlock(&dev->lock); + return -EBUSY; + } +#endif + + fh = kzalloc(sizeof(struct cx231xx_fh), GFP_KERNEL); + if (!fh) { + cx231xx_errdev("cx231xx-video.c: Out of memory?!\n"); + mutex_unlock(&dev->lock); + return -ENOMEM; + } + fh->dev = dev; + fh->radio = radio; + fh->type = fh_type; + filp->private_data = fh; + + if (fh->type == V4L2_BUF_TYPE_VIDEO_CAPTURE && dev->users == 0) { + dev->width = norm_maxw(dev); + dev->height = norm_maxh(dev); + dev->hscale = 0; + dev->vscale = 0; + + + /* Power up in Analog TV mode */ + cx231xx_set_power_mode(dev, POLARIS_AVMODE_ANALOGT_TV); + +#if 0 + cx231xx_set_mode(dev, CX231XX_ANALOG_MODE); +#endif + cx231xx_resolution_set(dev); + + /* set video alternate setting */ + cx231xx_set_video_alternate(dev); + + /* Needed, since GPIO might have disabled power of + some i2c device */ + cx231xx_config_i2c(dev); + + /* device needs to be initialized before isoc transfer */ + dev->video_input = dev->video_input > 2 ? 2: dev->video_input; + video_mux(dev, dev->video_input ); + + } + if (fh->radio) { + cx231xx_videodbg("video_open: setting radio device\n"); + + /* cx231xx_start_radio(dev); */ + + cx231xx_i2c_call_clients(&dev->i2c_bus[1], AUDC_SET_RADIO, NULL); + } + + dev->users++; + + if (fh->type == V4L2_BUF_TYPE_VIDEO_CAPTURE) { + videobuf_queue_vmalloc_init(&fh->vb_vidq, &cx231xx_video_qops, + NULL, &dev->video_mode.slock, fh->type, V4L2_FIELD_INTERLACED, /* V4L2_FIELD_SEQ_TB, */ + sizeof(struct cx231xx_buffer), fh); + } + + if (fh->type == V4L2_BUF_TYPE_VBI_CAPTURE) { + + /* Set the required alternate setting VBI interface works in Bulk mode only */ + cx231xx_set_alt_setting(dev, INDEX_VANC, 0); + + videobuf_queue_vmalloc_init(&fh->vb_vidq, &cx231xx_vbi_qops, + NULL, &dev->vbi_mode.slock, fh->type, V4L2_FIELD_SEQ_TB, /* V4L2_FIELD_INTERLACED, */ + sizeof(struct cx231xx_buffer), fh); + } + + mutex_unlock(&dev->lock); + + return errCode; +} + +/* + * cx231xx_realease_resources() + * unregisters the v4l2,i2c and usb devices + * called when the device gets disconected or at module unload +*/ +void cx231xx_release_analog_resources(struct cx231xx *dev) +{ + + /*FIXME: I2C IR should be disconnected */ + + if (dev->radio_dev) { + if (-1 != dev->radio_dev->minor) + video_unregister_device(dev->radio_dev); + else + video_device_release(dev->radio_dev); + dev->radio_dev = NULL; + } + if (dev->vbi_dev) { + cx231xx_info("V4L2 device /dev/vbi%d deregistered\n", + dev->vbi_dev->num); + if (-1 != dev->vbi_dev->minor) + video_unregister_device(dev->vbi_dev); + else + video_device_release(dev->vbi_dev); + dev->vbi_dev = NULL; + } + if (dev->vdev) { + cx231xx_info("V4L2 device /dev/video%d deregistered\n", + dev->vdev->num); + if (-1 != dev->vdev->minor) + 
video_unregister_device(dev->vdev); + else + video_device_release(dev->vdev); + dev->vdev = NULL; + } +} + +/* + * cx231xx_v4l2_close() + * stops streaming and deallocates all resources allocated by the v4l2 + * calls and ioctls + */ +static int cx231xx_v4l2_close(struct file *filp) +{ + struct cx231xx_fh *fh = filp->private_data; + struct cx231xx *dev = fh->dev; + + cx231xx_videodbg("users=%d\n", dev->users); + + mutex_lock(&dev->lock); + + if (res_check(fh)) + res_free(fh); + + if (fh->type == V4L2_BUF_TYPE_VBI_CAPTURE) { + videobuf_stop(&fh->vb_vidq); + videobuf_mmap_free(&fh->vb_vidq); + + /* the device is already disconnect, + free the remaining resources */ + if (dev->state & DEV_DISCONNECTED) { + cx231xx_release_resources(dev); + mutex_unlock(&dev->lock); + kfree(dev); + return 0; + } + + /* do this before setting alternate! */ + cx231xx_uninit_vbi_isoc(dev); + + /* set alternate 0 */ + if( !dev->vbi_or_sliced_cc_mode) { + cx231xx_set_alt_setting(dev, INDEX_VANC, 0); + } else { + cx231xx_set_alt_setting(dev, INDEX_HANC, 0); + } + + kfree(fh); + dev->users--; + wake_up_interruptible_nr(&dev->open, 1); + mutex_unlock(&dev->lock); + return 0; + } + + if (dev->users == 1) { + videobuf_stop(&fh->vb_vidq); + videobuf_mmap_free(&fh->vb_vidq); + + /* the device is already disconnect, + free the remaining resources */ + if (dev->state & DEV_DISCONNECTED) { + cx231xx_release_resources(dev); + mutex_unlock(&dev->lock); + kfree(dev); + return 0; + } + + /* Save some power by putting tuner to sleep */ + cx231xx_i2c_call_clients(&dev->i2c_bus[1], TUNER_SET_STANDBY, NULL); + + /* do this before setting alternate! */ + cx231xx_uninit_isoc(dev); + cx231xx_set_mode(dev, CX231XX_SUSPEND); + + /* set alternate 0 */ + cx231xx_set_alt_setting(dev, INDEX_VIDEO, 0); + } + kfree(fh); + dev->users--; + wake_up_interruptible_nr(&dev->open, 1); + mutex_unlock(&dev->lock); + return 0; +} + +/* + * cx231xx_v4l2_read() + * will allocate buffers when called for the first time + */ +static ssize_t +cx231xx_v4l2_read(struct file *filp, char __user *buf, size_t count, + loff_t *pos) +{ + struct cx231xx_fh *fh = filp->private_data; + struct cx231xx *dev = fh->dev; + int rc; + + rc = check_dev(dev); + if (rc < 0) + return rc; + + if ( (fh->type == V4L2_BUF_TYPE_VIDEO_CAPTURE) || + (fh->type == V4L2_BUF_TYPE_VBI_CAPTURE) ) { + mutex_lock(&dev->lock); + rc = res_get(fh); + mutex_unlock(&dev->lock); + + if (unlikely(rc < 0)) + return rc; + + return videobuf_read_stream(&fh->vb_vidq, buf, count, pos, 0, + filp->f_flags & O_NONBLOCK); + } + return 0; +} + +/* + * cx231xx_v4l2_poll() + * will allocate buffers when called for the first time + */ +static unsigned int cx231xx_v4l2_poll(struct file *filp, poll_table * wait) +{ + struct cx231xx_fh *fh = filp->private_data; + struct cx231xx *dev = fh->dev; + int rc; + + rc = check_dev(dev); + if (rc < 0) + return rc; + + mutex_lock(&dev->lock); + rc = res_get(fh); + mutex_unlock(&dev->lock); + + if (unlikely(rc < 0)) + return POLLERR; + + if ( (V4L2_BUF_TYPE_VIDEO_CAPTURE == fh->type) || + (V4L2_BUF_TYPE_VBI_CAPTURE == fh->type) ) + return videobuf_poll_stream(filp, &fh->vb_vidq, wait); + else + return POLLERR; +} + +/* + * cx231xx_v4l2_mmap() + */ +static int cx231xx_v4l2_mmap(struct file *filp, struct vm_area_struct *vma) +{ + struct cx231xx_fh *fh = filp->private_data; + struct cx231xx *dev = fh->dev; + int rc; + + rc = check_dev(dev); + if (rc < 0) + return rc; + + mutex_lock(&dev->lock); + rc = res_get(fh); + mutex_unlock(&dev->lock); + + if (unlikely(rc < 0)) + return rc; + 
+ rc = videobuf_mmap_mapper(&fh->vb_vidq, vma); + + cx231xx_videodbg("vma start=0x%08lx, size=%ld, ret=%d\n", + (unsigned long)vma->vm_start, + (unsigned long)vma->vm_end-(unsigned long)vma->vm_start, + rc); + + return rc; +} + +static const struct v4l2_file_operations cx231xx_v4l_fops = { + .owner = THIS_MODULE, + .open = cx231xx_v4l2_open, + .release = cx231xx_v4l2_close, + .read = cx231xx_v4l2_read, + .poll = cx231xx_v4l2_poll, + .mmap = cx231xx_v4l2_mmap, + .ioctl = video_ioctl2, +}; + +static const struct v4l2_ioctl_ops video_ioctl_ops = { + .vidioc_querycap = vidioc_querycap, + .vidioc_enum_fmt_vid_cap = vidioc_enum_fmt_vid_cap, + .vidioc_g_fmt_vid_cap = vidioc_g_fmt_vid_cap, + .vidioc_try_fmt_vid_cap = vidioc_try_fmt_vid_cap, + .vidioc_s_fmt_vid_cap = vidioc_s_fmt_vid_cap, + .vidioc_g_fmt_vbi_cap = vidioc_g_fmt_vbi_cap, + .vidioc_try_fmt_vbi_cap = vidioc_try_fmt_vbi_cap, + .vidioc_s_fmt_vbi_cap = vidioc_try_fmt_vbi_cap, + .vidioc_g_audio = vidioc_g_audio, + .vidioc_s_audio = vidioc_s_audio, + .vidioc_cropcap = vidioc_cropcap, + .vidioc_g_fmt_sliced_vbi_cap = vidioc_g_fmt_sliced_vbi_cap, + .vidioc_try_fmt_sliced_vbi_cap = vidioc_try_set_sliced_vbi_cap, + .vidioc_reqbufs = vidioc_reqbufs, + .vidioc_querybuf = vidioc_querybuf, + .vidioc_qbuf = vidioc_qbuf, + .vidioc_dqbuf = vidioc_dqbuf, + .vidioc_s_std = vidioc_s_std, + .vidioc_g_std = vidioc_g_std, + .vidioc_enum_input = vidioc_enum_input, + .vidioc_g_input = vidioc_g_input, + .vidioc_s_input = vidioc_s_input, + .vidioc_queryctrl = vidioc_queryctrl, + .vidioc_g_ctrl = vidioc_g_ctrl, + .vidioc_s_ctrl = vidioc_s_ctrl, + .vidioc_streamon = vidioc_streamon, + .vidioc_streamoff = vidioc_streamoff, + .vidioc_g_tuner = vidioc_g_tuner, + .vidioc_s_tuner = vidioc_s_tuner, + .vidioc_g_frequency = vidioc_g_frequency, + .vidioc_s_frequency = vidioc_s_frequency, +#ifdef CONFIG_VIDEO_ADV_DEBUG + .vidioc_g_register = vidioc_g_register, + .vidioc_s_register = vidioc_s_register, +#endif +#ifdef CONFIG_VIDEO_V4L1_COMPAT + .vidiocgmbuf = vidiocgmbuf, +#endif +}; + +static struct video_device cx231xx_vbi_template; + +static const struct video_device cx231xx_video_template = { + .fops = &cx231xx_v4l_fops, + .release = video_device_release, + .ioctl_ops = &video_ioctl_ops, + .minor = -1, + .tvnorms = V4L2_STD_ALL, + .current_norm = V4L2_STD_PAL, +}; + +static const struct v4l2_file_operations radio_fops = { + .owner = THIS_MODULE, + .open = cx231xx_v4l2_open, + .release = cx231xx_v4l2_close, + .ioctl = video_ioctl2, +}; + +static const struct v4l2_ioctl_ops radio_ioctl_ops = { + .vidioc_querycap = radio_querycap, + .vidioc_g_tuner = radio_g_tuner, + .vidioc_enum_input = radio_enum_input, + .vidioc_g_audio = radio_g_audio, + .vidioc_s_tuner = radio_s_tuner, + .vidioc_s_audio = radio_s_audio, + .vidioc_s_input = radio_s_input, + .vidioc_queryctrl = radio_queryctrl, + .vidioc_g_ctrl = vidioc_g_ctrl, + .vidioc_s_ctrl = vidioc_s_ctrl, + .vidioc_g_frequency = vidioc_g_frequency, + .vidioc_s_frequency = vidioc_s_frequency, +#ifdef CONFIG_VIDEO_ADV_DEBUG + .vidioc_g_register = vidioc_g_register, + .vidioc_s_register = vidioc_s_register, +#endif +}; + +static struct video_device cx231xx_radio_template = { + .name = "cx231xx-radio", + .fops = &radio_fops, + .ioctl_ops = &radio_ioctl_ops, + .minor = -1, +}; + +/******************************** usb interface ******************************/ + + +static struct video_device *cx231xx_vdev_init(struct cx231xx *dev, + const struct video_device *template, + const char *type_name) +{ + struct video_device *vfd; + + vfd = 
video_device_alloc(); + if (NULL == vfd) + return NULL; + *vfd = *template; + vfd->minor = -1; + vfd->parent = &dev->udev->dev; + vfd->release = video_device_release; + vfd->debug = video_debug; + + snprintf(vfd->name, sizeof(vfd->name), "%s %s", + dev->name, type_name); + + return vfd; +} + +int cx231xx_register_analog_devices(struct cx231xx *dev) +{ + int ret; + + cx231xx_info("%s()\n", __func__); + + cx231xx_info("%s: v4l2 driver version %d.%d.%d\n", + dev->name, + (CX231XX_VERSION_CODE >> 16) & 0xff, + (CX231XX_VERSION_CODE >> 8) & 0xff, CX231XX_VERSION_CODE & 0xff); + + /* set default norm */ + /*dev->norm = cx231xx_video_template.current_norm;*/ + dev->width = norm_maxw(dev); + dev->height = norm_maxh(dev); + dev->interlaced = 0; + dev->hscale = 0; + dev->vscale = 0; + + /* Analog specific initialization */ + dev->format = &format[0]; + /* video_mux(dev, dev->video_input); */ + + /* Audio defaults */ + dev->mute = 1; + dev->volume = 0x1f; + + /* enable vbi capturing */ + /* write code here... */ + + /* allocate and fill video video_device struct */ + dev->vdev = cx231xx_vdev_init(dev, &cx231xx_video_template, "video"); + if (!dev->vdev) { + cx231xx_errdev("cannot allocate video_device.\n"); + return -ENODEV; + } + + /* register v4l2 video video_device */ + ret = video_register_device(dev->vdev, VFL_TYPE_GRABBER, + video_nr[dev->devno]); + if (ret) { + cx231xx_errdev("unable to register video device (error=%i).\n", ret); + return ret; + } + + cx231xx_info("%s/0: registered device video%d [v4l2]\n", + dev->name, dev->vdev->num); + + /* Initialize VBI template */ + memcpy( &cx231xx_vbi_template, &cx231xx_video_template, + sizeof(cx231xx_vbi_template) ); + strcpy(cx231xx_vbi_template.name,"cx231xx-vbi"); + + + /* Allocate and fill vbi video_device struct */ + dev->vbi_dev = cx231xx_vdev_init(dev, &cx231xx_vbi_template, "vbi"); + + /* register v4l2 vbi video_device */ + ret = video_register_device(dev->vbi_dev, VFL_TYPE_VBI, + vbi_nr[dev->devno]); + if (ret < 0) { + cx231xx_errdev("unable to register vbi device\n"); + return ret; + } + + cx231xx_info("%s/0: registered device vbi%d\n", + dev->name, dev->vbi_dev->num); + + if (cx231xx_boards[dev->model].radio.type == CX231XX_RADIO) { + dev->radio_dev = cx231xx_vdev_init(dev, &cx231xx_radio_template, "radio"); + if (!dev->radio_dev) { + cx231xx_errdev("cannot allocate video_device.\n"); + return -ENODEV; + } + ret = video_register_device(dev->radio_dev, VFL_TYPE_RADIO, + radio_nr[dev->devno]); + if (ret < 0) { + cx231xx_errdev("can't register radio device\n"); + return ret; + } + cx231xx_info("Registered radio device as /dev/radio%d\n", + dev->radio_dev->num); + } + + cx231xx_info("V4L2 device registered as /dev/video%d and /dev/vbi%d\n", + dev->vdev->num, dev->vbi_dev->num); + + return 0; +} + + diff --git a/drivers/media/video/cx231xx/cx231xx.h b/drivers/media/video/cx231xx/cx231xx.h new file mode 100644 index 000000000000..5fef87fbbcec --- /dev/null +++ b/drivers/media/video/cx231xx/cx231xx.h @@ -0,0 +1,762 @@ +/* + cx231xx.h - driver for Conexant Cx23100/101/102 USB video capture devices + + Copyright (C) 2008 + Based on em28xx driver + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#ifndef _CX231XX_H +#define _CX231XX_H + +#include +#include + +#include +#include +#include +#include +#if defined(CONFIG_VIDEO_CX231XX_DVB) || defined(CONFIG_VIDEO_CX231XX_DVB_MODULE) +#include +#endif + +#include "cx231xx-reg.h" +#include "cx231xx-pcb-config.h" +#include "cx231xx-conf-reg.h" + +#define CX231XX_VERSION_CODE KERNEL_VERSION(0, 1, 0) +#define DRIVER_NAME "cx231xx" +#define PWR_SLEEP_INTERVAL 5 + +/* I2C addresses for control block in Cx231xx */ +#define Colibri_DEVICE_ADDRESS 0x60 +#define Flatrion_DEVICE_ADDRESS 0x98 +#define HAMMERHEAD_I2C_ADDRESS 0x88 +#define DIF_USE_BASEBAND 0xFFFFFFFF + +/* Boards supported by driver */ +#define CX231XX_BOARD_UNKNOWN 0 +#define CX231XX_BOARD_CNXT_RDE_250 1 +#define CX231XX_BOARD_CNXT_RDU_250 2 + +/* Limits minimum and default number of buffers */ +#define CX231XX_MIN_BUF 4 +#define CX231XX_DEF_BUF 12 +#define CX231XX_DEF_VBI_BUF 6 + +#define VBI_LINE_COUNT 17 +#define VBI_LINE_LENGTH 1440 + +/*Limits the max URB message size */ +#define URB_MAX_CTRL_SIZE 80 + +/* Params for validated field */ +#define CX231XX_BOARD_NOT_VALIDATED 1 +#define CX231XX_BOARD_VALIDATED 0 + +/* maximum number of cx231xx boards */ +#define CX231XX_MAXBOARDS 8 + +/* maximum number of frames that can be queued */ +#define CX231XX_NUM_FRAMES 5 + +/* number of buffers for isoc transfers */ +#define CX231XX_NUM_BUFS 8 + +/* number of packets for each buffer + windows requests only 40 packets .. so we better do the same + this is what I found out for all alternate numbers there! + */ +#define CX231XX_NUM_PACKETS 40 + + +/* default alternate; 0 means choose the best */ +#define CX231XX_PINOUT 0 + +#define CX231XX_INTERLACED_DEFAULT 1 + + +/* time to wait when stopping the isoc transfer */ +#define CX231XX_URB_TIMEOUT msecs_to_jiffies(CX231XX_NUM_BUFS * CX231XX_NUM_PACKETS) + + +enum cx231xx_mode { + CX231XX_SUSPEND, + CX231XX_ANALOG_MODE, + CX231XX_DIGITAL_MODE, +}; + +enum cx231xx_std_mode { + CX231XX_TV_AIR = 0, + CX231XX_TV_CABLE +}; + +enum cx231xx_stream_state { + STREAM_OFF, + STREAM_INTERRUPT, + STREAM_ON, +}; + +struct cx231xx; + +struct cx231xx_usb_isoc_ctl { + /* max packet size of isoc transaction */ + int max_pkt_size; + + /* number of allocated urbs */ + int num_bufs; + + /* urb for isoc transfers */ + struct urb **urb; + + /* transfer buffers for isoc transfer */ + char **transfer_buffer; + + /* Last buffer command and region */ + u8 cmd; + int pos, size, pktsize; + + /* Last field: ODD or EVEN? 
*/ + int field; + + /* Stores incomplete commands */ + u32 tmp_buf; + int tmp_buf_len; + + /* Stores already requested buffers */ + struct cx231xx_buffer *buf; + + /* Stores the number of received fields */ + int nfields; + + /* isoc urb callback */ + int (*isoc_copy) (struct cx231xx *dev, struct urb *urb); + +}; + + + +struct cx231xx_fmt { + char *name; + u32 fourcc; /* v4l2 format id */ + int depth; + int reg; +}; + +/* buffer for one video frame */ +struct cx231xx_buffer { + /* common v4l buffer stuff -- must be first */ + struct videobuf_buffer vb; + + struct list_head frame; + int top_field; + int receiving; +}; + +struct cx231xx_dmaqueue { + struct list_head active; + struct list_head queued; + + wait_queue_head_t wq; + + /* Counters to control buffer fill */ + int pos; + u8 is_partial_line; + u8 partial_buf[8]; + u8 last_sav; + int current_field; + u32 bytes_left_in_line; + u32 lines_completed; + u8 field1_done; + u32 lines_per_field; +}; + + +/* inputs */ + +#define MAX_CX231XX_INPUT 4 + +enum cx231xx_itype { + CX231XX_VMUX_COMPOSITE1 = 1, + CX231XX_VMUX_SVIDEO, + CX231XX_VMUX_TELEVISION, + CX231XX_VMUX_CABLE, + CX231XX_RADIO, + CX231XX_VMUX_DVB, + CX231XX_VMUX_DEBUG +}; + +enum cx231xx_v_input { + CX231XX_VIN_1_1 = 0x1, + CX231XX_VIN_2_1, + CX231XX_VIN_3_1, + CX231XX_VIN_4_1, + CX231XX_VIN_1_2 = 0x01, + CX231XX_VIN_2_2, + CX231XX_VIN_3_2, + CX231XX_VIN_1_3 = 0x1, + CX231XX_VIN_2_3, + CX231XX_VIN_3_3, +}; + +/* cx231xx has two audio inputs: tuner and line in */ +enum cx231xx_amux { + /* This is the only entry for cx231xx tuner input */ + CX231XX_AMUX_VIDEO, /* cx231xx tuner*/ + CX231XX_AMUX_LINE_IN, /* Line In */ +}; + +struct cx231xx_reg_seq { + unsigned char bit; + unsigned char val; + int sleep; +}; + +struct cx231xx_input { + enum cx231xx_itype type; + unsigned int vmux; + enum cx231xx_amux amux; + struct cx231xx_reg_seq *gpio; +}; + +#define INPUT(nr) (&cx231xx_boards[dev->model].input[nr]) + +enum cx231xx_decoder { + CX231XX_NODECODER, + CX231XX_AVDECODER +}; + +typedef enum _I2C_MASTER_PORT +{ + I2C_0 =0, + I2C_1 =1, + I2C_2 =2, + I2C_3 =3 +}CX231XX_I2C_MASTER_PORT; + +struct cx231xx_board { + char *name; + int vchannels; + int tuner_type; + int tuner_addr; + v4l2_std_id norm; /* tv norm */ + + /* demod related */ + int demod_addr; + u8 demod_xfer_mode; /* 0 - Serial; 1 - parallel */ + + /* GPIO Pins */ + struct cx231xx_reg_seq *dvb_gpio; + struct cx231xx_reg_seq *suspend_gpio; + struct cx231xx_reg_seq *tuner_gpio; + u8 tuner_sif_gpio; + u8 tuner_scl_gpio; + u8 tuner_sda_gpio; + + /* PIN ctrl */ + u32 ctl_pin_status_mask; + u8 agc_analog_digital_select_gpio; + u32 gpio_pin_status_mask; + + /* i2c masters */ + u8 tuner_i2c_master; + u8 demod_i2c_master; + + unsigned int max_range_640_480:1; + unsigned int has_dvb:1; + unsigned int valid:1; + + unsigned char xclk, i2c_speed; + + enum cx231xx_decoder decoder; + + struct cx231xx_input input[MAX_CX231XX_INPUT]; + struct cx231xx_input radio; + IR_KEYTAB_TYPE *ir_codes; +}; + +/* device states */ +enum cx231xx_dev_state { + DEV_INITIALIZED = 0x01, + DEV_DISCONNECTED = 0x02, + DEV_MISCONFIGURED = 0x04, +}; + +enum AFE_MODE +{ + AFE_MODE_LOW_IF, + AFE_MODE_BASEBAND, + AFE_MODE_EU_HI_IF, + AFE_MODE_US_HI_IF, + AFE_MODE_JAPAN_HI_IF +}; + +enum AUDIO_INPUT +{ + AUDIO_INPUT_MUTE, + AUDIO_INPUT_LINE, + AUDIO_INPUT_TUNER_TV, + AUDIO_INPUT_SPDIF, + AUDIO_INPUT_TUNER_FM +}; + +#define CX231XX_AUDIO_BUFS 5 +#define CX231XX_NUM_AUDIO_PACKETS 64 +#define CX231XX_CAPTURE_STREAM_EN 1 +#define CX231XX_STOP_AUDIO 0 +#define CX231XX_START_AUDIO 1 + 
+ +/* cx231xx extensions */ +#define CX231XX_AUDIO 0x10 +#define CX231XX_DVB 0x20 + +struct cx231xx_audio { + char name[50]; + char *transfer_buffer[CX231XX_AUDIO_BUFS]; + struct urb *urb[CX231XX_AUDIO_BUFS]; + struct usb_device *udev; + unsigned int capture_transfer_done; + struct snd_pcm_substream *capture_pcm_substream; + + unsigned int hwptr_done_capture; + struct snd_card *sndcard; + + int users, shutdown; + enum cx231xx_stream_state capture_stream; + spinlock_t slock; + + int alt; /* alternate */ + int max_pkt_size; /* max packet size of isoc transaction */ + int num_alt; /* Number of alternative settings */ + unsigned int *alt_max_pkt_size; /* array of wMaxPacketSize */ + u16 end_point_addr; +}; + +struct cx231xx; + +struct cx231xx_fh { + struct cx231xx *dev; + unsigned int stream_on:1; /* Locks streams */ + int radio; + + struct videobuf_queue vb_vidq; + + enum v4l2_buf_type type; +}; + +/**********************************************************************************/ +/* set/get i2c */ +#define I2C_SPEED_1M 0x0 /* 00--1Mb/s, 01-400kb/s, 10--100kb/s, 11--5Mb/s */ +#define I2C_SPEED_400K 0x1 /* 00--1Mb/s, 01-400kb/s, 10--100kb/s, 11--5Mb/s */ +#define I2C_SPEED_100K 0x2 /* 00--1Mb/s, 01-400kb/s, 10--100kb/s, 11--5Mb/s */ +#define I2C_SPEED_5M 0x3 /* 00--1Mb/s, 01-400kb/s, 10--100kb/s, 11--5Mb/s */ + +#define I2C_STOP 0x0 /* 0-- STOP transaction */ +#define I2C_NOSTOP 0x1 /* 1-- do not transmit STOP at end of transaction */ +#define I2C_SYNC 0x1 /* 1--alllow slave to insert clock wait states */ + +struct cx231xx_i2c { + struct cx231xx *dev; + + int nr; + + /* i2c i/o */ + struct i2c_adapter i2c_adap; + struct i2c_algo_bit_data i2c_algo; + struct i2c_client i2c_client; + u32 i2c_rc; + + /* different settings for each bus */ + u8 i2c_period; + u8 i2c_nostop; + u8 i2c_reserve; +}; + +struct cx231xx_i2c_xfer_data{ + u8 dev_addr; + u8 direction; /* 1 - IN, 0 - OUT */ + u8 saddr_len; /* sub address len */ + u16 saddr_dat; /* sub addr data */ + u8 buf_size; /* buffer size */ + u8* p_buffer; /* pointer to the buffer */ +}; + +typedef struct _VENDOR_REQUEST_IN +{ + u8 bRequest; + u16 wValue; + u16 wIndex; + u16 wLength; + u8 direction; + u8 bData; + u8 *pBuff; +} VENDOR_REQUEST_IN, *PVENDOR_REQUEST_IN; + +struct cx231xx_ctrl { + struct v4l2_queryctrl v; + u32 off; + u32 reg; + u32 mask; + u32 shift; +}; + +typedef enum{ + Raw_Video = 0, + Audio, + Vbi, /* VANC */ + Sliced_cc, /* HANC */ + TS1_serial_mode, + TS2, + TS1_parallel_mode +}TRANSFER_TYPE; + +struct cx231xx_video_mode { + /* Isoc control struct */ + struct cx231xx_dmaqueue vidq; + struct cx231xx_usb_isoc_ctl isoc_ctl; + spinlock_t slock; + + /* usb transfer */ + int alt; /* alternate */ + int max_pkt_size; /* max packet size of isoc transaction */ + int num_alt; /* Number of alternative settings */ + unsigned int *alt_max_pkt_size; /* array of wMaxPacketSize */ + u16 end_point_addr; +}; + + +/* main device struct */ +struct cx231xx { + /* generic device properties */ + char name[30]; /* name (including minor) of the device */ + int model; /* index in the device_data struct */ + int devno; /* marks the number of this device */ + + struct cx231xx_board board; + + unsigned int stream_on:1; /* Locks streams */ + unsigned int vbi_stream_on:1; /* Locks streams for VBI */ + unsigned int has_audio_class:1; + unsigned int has_alsa_audio:1; + + struct cx231xx_fmt *format; + + struct cx231xx_IR *ir; + + struct list_head devlist; + + int tuner_type; /* type of the tuner */ + int tuner_addr; /* tuner address */ + + /* I2C adapters: Master 1 & 
2 (External) & Master 3 (Internal only) */ + struct cx231xx_i2c i2c_bus[3]; + unsigned int xc_fw_load_done:1; + struct mutex gpio_i2c_lock; + + /* video for linux */ + int users; /* user count for exclusive use */ + struct video_device *vdev; /* video for linux device struct */ + v4l2_std_id norm; /* selected tv norm */ + int ctl_freq; /* selected frequency */ + unsigned int ctl_ainput; /* selected audio input */ + int mute; + int volume; + + /* frame properties */ + int width; /* current frame width */ + int height; /* current frame height */ + unsigned hscale; /* horizontal scale factor (see datasheet) */ + unsigned vscale; /* vertical scale factor (see datasheet) */ + int interlaced; /* 1=interlace fileds, 0=just top fileds */ + + struct cx231xx_audio adev; + + /* states */ + enum cx231xx_dev_state state; + + struct work_struct request_module_wk; + + /* locks */ + struct mutex lock; + struct mutex ctrl_urb_lock; /* protects urb_buf */ + struct list_head inqueue, outqueue; + wait_queue_head_t open, wait_frame, wait_stream; + struct video_device *vbi_dev; + struct video_device *radio_dev; + + unsigned char eedata[256]; + + struct cx231xx_video_mode video_mode; + struct cx231xx_video_mode vbi_mode; + struct cx231xx_video_mode sliced_cc_mode; + struct cx231xx_video_mode ts1_mode; + + struct usb_device *udev; /* the usb device */ + char urb_buf[URB_MAX_CTRL_SIZE];/* urb control msg buffer */ + + + /* helper funcs that call usb_control_msg */ + int (*cx231xx_read_ctrl_reg) (struct cx231xx *dev, u8 req, u16 reg, + char *buf, int len); + int (*cx231xx_write_ctrl_reg)(struct cx231xx *dev, u8 req, u16 reg, + char *buf, int len); + int (*cx231xx_send_usb_command)(struct cx231xx_i2c *i2c_bus, + struct cx231xx_i2c_xfer_data *req_data); + int (*cx231xx_gpio_i2c_read)(struct cx231xx *dev, u8 dev_addr, u8 *buf ,u8 len); + int (*cx231xx_gpio_i2c_write)(struct cx231xx *dev, u8 dev_addr, u8 *buf ,u8 len); + + int (*cx231xx_set_analog_freq)(struct cx231xx *dev, u32 freq ) ; + int (*cx231xx_reset_analog_tuner)(struct cx231xx *dev) ; + + enum cx231xx_mode mode; + + struct cx231xx_dvb *dvb; + + /* Cx231xx supported PCB config's */ + struct pcb_config current_pcb_config; + u8 current_scenario_idx; + u8 interface_count; + u8 max_iad_interface_count; + + /* GPIO related register direction and values */ + u32 gpio_dir; + u32 gpio_val; + + /* Power Modes */ + int power_mode; + + /* colibri parameters */ + enum AFE_MODE colibri_mode; + u32 colibri_ref_count; + + /* video related parameters */ + u32 video_input; + u32 active_mode; + u8 vbi_or_sliced_cc_mode; /* 0 - vbi ; 1 - sliced cc mode */ + enum cx231xx_std_mode std_mode; /* 0 - Air; 1 - cable */ + +}; + +struct cx231xx_ops { + struct list_head next; + char *name; + int id; + int (*init)(struct cx231xx *); + int (*fini)(struct cx231xx *); +}; + +/* call back functions in dvb module */ +int cx231xx_set_analog_freq(struct cx231xx *dev, u32 freq ) ; +int cx231xx_reset_analog_tuner(struct cx231xx *dev) ; + +/* Provided by cx231xx-i2c.c */ +void cx231xx_i2c_call_clients(struct cx231xx_i2c *bus, unsigned int cmd, void *arg); +void cx231xx_do_i2c_scan(struct cx231xx *dev, struct i2c_client *c); +int cx231xx_i2c_register(struct cx231xx_i2c *bus); +int cx231xx_i2c_unregister(struct cx231xx_i2c *bus); + +/* Internal block control functions */ +int cx231xx_read_i2c_data(struct cx231xx *dev, u8 dev_addr, + u16 saddr, u8 saddr_len, u32 *data, u8 data_len); +int cx231xx_write_i2c_data(struct cx231xx *dev, u8 dev_addr, + u16 saddr, u8 saddr_len, u32 data, u8 data_len); +int 
cx231xx_reg_mask_write(struct cx231xx *dev, u8 dev_addr, u8 size, u16 register_address, + u8 bit_start,u8 bit_end, u32 value); +int cx231xx_read_modify_write_i2c_dword(struct cx231xx *dev, u8 dev_addr, + u16 saddr, u32 mask, u32 value); +u32 cx231xx_set_field(u32 field_mask, u32 data); + +/* Colibri related functions */ +int cx231xx_colibri_init_super_block(struct cx231xx *dev, u32 ref_count); +int cx231xx_colibri_init_channels(struct cx231xx *dev); +int cx231xx_colibri_setup_AFE_for_baseband(struct cx231xx *dev); +int cx231xx_colibri_set_input_mux(struct cx231xx *dev, u32 input_mux); +int cx231xx_colibri_set_mode(struct cx231xx *dev, enum AFE_MODE mode); +int cx231xx_colibri_update_power_control(struct cx231xx *dev, AV_MODE avmode); +int cx231xx_colibri_adjust_ref_count(struct cx231xx *dev, u32 video_input); + +/* flatiron related functions */ +int cx231xx_flatiron_initialize(struct cx231xx *dev); +int cx231xx_flatiron_update_power_control(struct cx231xx *dev, AV_MODE avmode); +int cx231xx_flatiron_set_audio_input(struct cx231xx *dev, u8 audio_input); + +/* DIF related functions */ +int cx231xx_dif_configure_C2HH_for_low_IF(struct cx231xx *dev, u32 mode, + u32 function_mode, u32 standard); +int cx231xx_dif_set_standard(struct cx231xx *dev, u32 standard); +int cx231xx_tuner_pre_channel_change(struct cx231xx *dev); +int cx231xx_tuner_post_channel_change(struct cx231xx *dev); + +/* video parser functions */ +u8 cx231xx_find_next_SAV_EAV(u8 *p_buffer, u32 buffer_size, u32 *p_bytes_used); +u8 cx231xx_find_boundary_SAV_EAV(u8 *p_buffer, u8 *partial_buf, u32 *p_bytes_used); +int cx231xx_do_copy(struct cx231xx *dev, struct cx231xx_dmaqueue *dma_q, + u8 *p_buffer, u32 bytes_to_copy); +void cx231xx_reset_video_buffer(struct cx231xx *dev, struct cx231xx_dmaqueue *dma_q); +u8 cx231xx_is_buffer_done(struct cx231xx *dev,struct cx231xx_dmaqueue *dma_q); +u32 cx231xx_copy_video_line(struct cx231xx *dev, struct cx231xx_dmaqueue *dma_q, + u8 *p_line, u32 length, int field_number); +u32 cx231xx_get_video_line(struct cx231xx *dev, struct cx231xx_dmaqueue *dma_q, + u8 sav_eav, u8 *p_buffer, u32 buffer_size); +void cx231xx_swab(u16 *from, u16 *to, u16 len); + +/* Provided by cx231xx-core.c */ + +u32 cx231xx_request_buffers(struct cx231xx *dev, u32 count); +void cx231xx_queue_unusedframes(struct cx231xx *dev); +void cx231xx_release_buffers(struct cx231xx *dev); + +/* read from control pipe */ +int cx231xx_read_ctrl_reg(struct cx231xx *dev, u8 req, u16 reg, + char *buf, int len); + +/* write to control pipe */ +int cx231xx_write_ctrl_reg(struct cx231xx *dev, u8 req, u16 reg, + char *buf, int len); +int cx231xx_mode_register(struct cx231xx *dev, u16 address, u32 mode); + +int cx231xx_send_vendor_cmd(struct cx231xx *dev, VENDOR_REQUEST_IN *ven_req); +int cx231xx_send_usb_command(struct cx231xx_i2c *i2c_bus, + struct cx231xx_i2c_xfer_data *req_data); + +/* Gpio related functions */ +int cx231xx_send_gpio_cmd(struct cx231xx *dev, u32 gpio_bit, u8* gpio_val, + u8 len, u8 request, u8 direction); +int cx231xx_set_gpio_bit(struct cx231xx *dev, u32 gpio_bit, u8* gpio_val); +int cx231xx_get_gpio_bit(struct cx231xx *dev, u32 gpio_bit, u8* gpio_val); +int cx231xx_set_gpio_value(struct cx231xx *dev, int pin_number, int pin_value); +int cx231xx_set_gpio_direction(struct cx231xx *dev, int pin_number, int pin_value); + +int cx231xx_gpio_i2c_start(struct cx231xx *dev); +int cx231xx_gpio_i2c_end(struct cx231xx *dev); +int cx231xx_gpio_i2c_write_byte(struct cx231xx *dev, u8 data); +int cx231xx_gpio_i2c_read_byte(struct cx231xx 
*dev, u8 *buf); +int cx231xx_gpio_i2c_read_ack(struct cx231xx *dev); +int cx231xx_gpio_i2c_write_ack(struct cx231xx *dev); +int cx231xx_gpio_i2c_write_nak(struct cx231xx *dev); + +int cx231xx_gpio_i2c_read(struct cx231xx *dev, u8 dev_addr, u8 *buf ,u8 len); +int cx231xx_gpio_i2c_write(struct cx231xx *dev, u8 dev_addr, u8 *buf ,u8 len); + +/* audio related functions */ +int cx231xx_set_audio_decoder_input(struct cx231xx *dev, enum AUDIO_INPUT audio_input); + +int cx231xx_capture_start(struct cx231xx *dev, int start, u8 media_type); +int cx231xx_resolution_set(struct cx231xx *dev); +int cx231xx_set_video_alternate(struct cx231xx *dev); +int cx231xx_set_alt_setting(struct cx231xx *dev, u8 index, u8 alt); +int cx231xx_init_isoc(struct cx231xx *dev, int max_packets, + int num_bufs, int max_pkt_size, + int (*isoc_copy) (struct cx231xx *dev, struct urb *urb)); +void cx231xx_uninit_isoc(struct cx231xx *dev); +int cx231xx_set_mode(struct cx231xx *dev, enum cx231xx_mode set_mode); +int cx231xx_gpio_set(struct cx231xx *dev, struct cx231xx_reg_seq *gpio); + +/* Device list functions */ +void cx231xx_release_resources(struct cx231xx *dev); +void cx231xx_release_analog_resources(struct cx231xx *dev); +int cx231xx_register_analog_devices(struct cx231xx *dev); +void cx231xx_remove_from_devlist(struct cx231xx *dev); +void cx231xx_add_into_devlist(struct cx231xx *dev); +struct cx231xx *cx231xx_get_device(int minor, + enum v4l2_buf_type *fh_type, int *has_radio); +void cx231xx_init_extension(struct cx231xx *dev); +void cx231xx_close_extension(struct cx231xx *dev); + +/* hardware init functions */ +int cx231xx_dev_init(struct cx231xx *dev); +void cx231xx_dev_uninit(struct cx231xx *dev); +void cx231xx_config_i2c(struct cx231xx *dev); +int cx231xx_config(struct cx231xx *dev); + +/* Stream control functions */ +int cx231xx_start_stream(struct cx231xx *dev, u32 ep_mask); +int cx231xx_stop_stream(struct cx231xx *dev, u32 ep_mask); + +int cx231xx_initialize_stream_xfer(struct cx231xx *dev, u32 media_type); + +/* Power control functions */ +int cx231xx_set_power_mode(struct cx231xx *dev, AV_MODE mode); +int cx231xx_power_suspend(struct cx231xx *dev); + +/* chip specific control functions */ +int cx231xx_init_ctrl_pin_status(struct cx231xx *dev); +int cx231xx_set_agc_analog_digital_mux_select(struct cx231xx *dev, u8 analog_or_digital); +int cx231xx_enable_i2c_for_tuner(struct cx231xx *dev, u8 I2CIndex); + +/* video audio decoder related functions */ +void video_mux(struct cx231xx *dev, int index); +int cx231xx_set_video_input_mux(struct cx231xx *dev, u8 input); +int cx231xx_set_decoder_video_input(struct cx231xx *dev, u8 pin_type, u8 input); +int cx231xx_do_mode_ctrl_overrides(struct cx231xx *dev); +int cx231xx_set_audio_input(struct cx231xx *dev, u8 input); +void get_scale(struct cx231xx *dev, + unsigned int width, unsigned int height, + unsigned int *hscale, unsigned int *vscale); + +/* Provided by cx231xx-video.c */ +int cx231xx_register_extension(struct cx231xx_ops *dev); +void cx231xx_unregister_extension(struct cx231xx_ops *dev); +void cx231xx_init_extension(struct cx231xx *dev); +void cx231xx_close_extension(struct cx231xx *dev); + +/* Provided by cx231xx-cards.c */ +extern void cx231xx_pre_card_setup(struct cx231xx *dev); +extern void cx231xx_card_setup(struct cx231xx *dev); +extern struct cx231xx_board cx231xx_boards[]; +extern struct usb_device_id cx231xx_id_table[]; +extern const unsigned int cx231xx_bcount; +void cx231xx_set_ir(struct cx231xx *dev, struct IR_i2c *ir); +int cx231xx_tuner_callback(void 
*ptr, int component, int command, int arg); + +/* Provided by cx231xx-input.c */ +int cx231xx_ir_init(struct cx231xx *dev); +int cx231xx_ir_fini(struct cx231xx *dev); + +/* printk macros */ + +#define cx231xx_err(fmt, arg...) do {\ + printk(KERN_ERR fmt , ##arg); } while (0) + +#define cx231xx_errdev(fmt, arg...) do {\ + printk(KERN_ERR "%s: "fmt,\ + dev->name , ##arg); } while (0) + +#define cx231xx_info(fmt, arg...) do {\ + printk(KERN_INFO "%s: "fmt,\ + dev->name , ##arg); } while (0) +#define cx231xx_warn(fmt, arg...) do {\ + printk(KERN_WARNING "%s: "fmt,\ + dev->name , ##arg); } while (0) + + +static inline unsigned int norm_maxw(struct cx231xx *dev) +{ + if (dev->board.max_range_640_480) + return 640; + else + return 720; +} + +static inline unsigned int norm_maxh(struct cx231xx *dev) +{ + if (dev->board.max_range_640_480) + return 480; + else + return (dev->norm & V4L2_STD_625_50) ? 576 : 480; +} +#endif diff --git a/include/linux/i2c-id.h b/include/linux/i2c-id.h index f27604af8378..f9d48c9ad5d0 100644 --- a/include/linux/i2c-id.h +++ b/include/linux/i2c-id.h @@ -88,6 +88,7 @@ #define I2C_HW_B_CX2341X 0x010020 /* Conexant CX2341X MPEG encoder cards */ #define I2C_HW_B_CX23885 0x010022 /* conexant 23885 based tv cards (bus1) */ #define I2C_HW_B_AU0828 0x010023 /* auvitek au0828 usb bridge */ +#define I2C_HW_B_CX231XX 0x010024 /* Conexant CX231XX USB based cards */ #define I2C_HW_B_HDPVR 0x010025 /* Hauppauge HD PVR */ /* --- SGI adapters */ -- cgit v1.2.3 From dfa76fa2824967c0ec196fbcba36d3e74b66d3aa Mon Sep 17 00:00:00 2001 From: Adam Baker Date: Sun, 29 Mar 2009 19:17:10 -0300 Subject: V4L/DVB (11387): Sensor orientation reporting Add support to the SQ-905 driver to pass back to user space the sensor orientation information obtained from the camera during init. Modifies gspca and the videodev2.h header to create the necessary API. 
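For reference, a minimal userspace sketch of how an application could consume
the new status flags via VIDIOC_ENUMINPUT; the device node path and the
single-input assumption (gspca exposes only input 0) are illustrative, not
something this patch mandates:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/videodev2.h>

int main(void)
{
	struct v4l2_input input;
	int fd = open("/dev/video0", O_RDWR);	/* assumed device node */

	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(&input, 0, sizeof(input));
	input.index = 0;	/* gspca devices expose a single input */

	if (ioctl(fd, VIDIOC_ENUMINPUT, &input) == 0) {
		if (input.status & V4L2_IN_ST_VFLIP)
			printf("frames are flipped vertically\n");
		if (input.status & V4L2_IN_ST_HFLIP)
			printf("frames are flipped horizontally\n");
	}

	close(fd);
	return 0;
}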
[mchehab@redhat.com: Changed "Output is" to "Frames are" at the comments, as suggested at LMML] Signed-off-by: Adam Baker Signed-off-by: Mauro Carvalho Chehab --- drivers/media/video/gspca/gspca.c | 1 + drivers/media/video/gspca/gspca.h | 1 + drivers/media/video/gspca/sq905.c | 6 ++++++ include/linux/videodev2.h | 5 +++++ 4 files changed, 13 insertions(+) (limited to 'include/linux') diff --git a/drivers/media/video/gspca/gspca.c b/drivers/media/video/gspca/gspca.c index a75c1ca2db41..a2741d7dccfe 100644 --- a/drivers/media/video/gspca/gspca.c +++ b/drivers/media/video/gspca/gspca.c @@ -1132,6 +1132,7 @@ static int vidioc_enum_input(struct file *file, void *priv, if (input->index != 0) return -EINVAL; input->type = V4L2_INPUT_TYPE_CAMERA; + input->status = gspca_dev->cam.input_flags; strncpy(input->name, gspca_dev->sd_desc->name, sizeof input->name); return 0; diff --git a/drivers/media/video/gspca/gspca.h b/drivers/media/video/gspca/gspca.h index e4d4cf6ce05a..58e8ff02136a 100644 --- a/drivers/media/video/gspca/gspca.h +++ b/drivers/media/video/gspca/gspca.h @@ -56,6 +56,7 @@ struct cam { * - cannot be > MAX_NURBS * - when 0 and bulk_size != 0 means * 1 URB and submit done by subdriver */ + u32 input_flags; /* value for ENUM_INPUT status flags */ }; struct gspca_dev; diff --git a/drivers/media/video/gspca/sq905.c b/drivers/media/video/gspca/sq905.c index 04e3ae57a2e3..2e1cdf068fda 100644 --- a/drivers/media/video/gspca/sq905.c +++ b/drivers/media/video/gspca/sq905.c @@ -360,6 +360,12 @@ static int sd_init(struct gspca_dev *gspca_dev) gspca_dev->cam.nmodes = ARRAY_SIZE(sq905_mode); if (!(ident & SQ905_HIRES_MASK)) gspca_dev->cam.nmodes--; + + if (ident & SQ905_ORIENTATION_MASK) + gspca_dev->cam.input_flags = V4L2_IN_ST_VFLIP; + else + gspca_dev->cam.input_flags = V4L2_IN_ST_VFLIP | + V4L2_IN_ST_HFLIP; return 0; } diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h index 139d234923cd..ebb2ea6b4995 100644 --- a/include/linux/videodev2.h +++ b/include/linux/videodev2.h @@ -737,6 +737,11 @@ struct v4l2_input { #define V4L2_IN_ST_NO_SIGNAL 0x00000002 #define V4L2_IN_ST_NO_COLOR 0x00000004 +/* field 'status' - sensor orientation */ +/* If sensor is mounted upside down set both bits */ +#define V4L2_IN_ST_HFLIP 0x00000010 /* Frames are flipped horizontally */ +#define V4L2_IN_ST_VFLIP 0x00000020 /* Frames are flipped vertically */ + /* field 'status' - analog */ #define V4L2_IN_ST_NO_H_LOCK 0x00000100 /* No horizontal sync lock */ #define V4L2_IN_ST_COLOR_KILL 0x00000200 /* Color killer is active */ -- cgit v1.2.3 From 26308eab69aa193f7b3fb50764a64ae14544a39b Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Fri, 27 Mar 2009 10:31:51 +0100 Subject: block: fix inconsistency in I/O stat accounting code This forces in_flight to be zero when turning off or on the I/O stat accounting and stops updating I/O stats in attempt_merge() when accounting is turned off. 
Signed-off-by: Jerome Marchand Signed-off-by: Jens Axboe --- block/blk-core.c | 13 ++++--------- block/blk-merge.c | 29 +++++++++++++++++------------ block/blk-sysfs.c | 4 ++++ block/blk.h | 10 ++++++---- block/elevator.c | 2 +- include/linux/elevator.h | 1 + 6 files changed, 33 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index 25572802dac2..3688abff2430 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -64,12 +64,11 @@ static struct workqueue_struct *kblockd_workqueue; static void drive_stat_acct(struct request *rq, int new_io) { - struct gendisk *disk = rq->rq_disk; struct hd_struct *part; int rw = rq_data_dir(rq); int cpu; - if (!blk_fs_request(rq) || !disk || !blk_do_io_stat(disk->queue)) + if (!blk_fs_request(rq) || !blk_do_io_stat(rq)) return; cpu = part_stat_lock(); @@ -1675,9 +1674,7 @@ EXPORT_SYMBOL(blkdev_dequeue_request); static void blk_account_io_completion(struct request *req, unsigned int bytes) { - struct gendisk *disk = req->rq_disk; - - if (!disk || !blk_do_io_stat(disk->queue)) + if (!blk_do_io_stat(req)) return; if (blk_fs_request(req)) { @@ -1694,9 +1691,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes) static void blk_account_io_done(struct request *req) { - struct gendisk *disk = req->rq_disk; - - if (!disk || !blk_do_io_stat(disk->queue)) + if (!blk_do_io_stat(req)) return; /* @@ -1711,7 +1706,7 @@ static void blk_account_io_done(struct request *req) int cpu; cpu = part_stat_lock(); - part = disk_map_sector_rcu(disk, req->sector); + part = disk_map_sector_rcu(req->rq_disk, req->sector); part_stat_inc(cpu, part, ios[rw]); part_stat_add(cpu, part, ticks[rw], duration); diff --git a/block/blk-merge.c b/block/blk-merge.c index e39cb24b7679..63760ca3da0f 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -338,6 +338,22 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, return 1; } +static void blk_account_io_merge(struct request *req) +{ + if (blk_do_io_stat(req)) { + struct hd_struct *part; + int cpu; + + cpu = part_stat_lock(); + part = disk_map_sector_rcu(req->rq_disk, req->sector); + + part_round_stats(cpu, part); + part_dec_in_flight(part); + + part_stat_unlock(); + } +} + /* * Has to be called with the request spinlock acquired */ @@ -386,18 +402,7 @@ static int attempt_merge(struct request_queue *q, struct request *req, elv_merge_requests(q, req, next); - if (req->rq_disk) { - struct hd_struct *part; - int cpu; - - cpu = part_stat_lock(); - part = disk_map_sector_rcu(req->rq_disk, req->sector); - - part_round_stats(cpu, part); - part_dec_in_flight(part); - - part_stat_unlock(); - } + blk_account_io_merge(req); req->ioprio = ioprio_best(req->ioprio, next->ioprio); if (blk_rq_cpu_valid(next)) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 3ff9bba3379a..73f36beff5cd 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -209,10 +209,14 @@ static ssize_t queue_iostats_store(struct request_queue *q, const char *page, ssize_t ret = queue_var_store(&stats, page, count); spin_lock_irq(q->queue_lock); + elv_quisce_start(q); + if (stats) queue_flag_set(QUEUE_FLAG_IO_STAT, q); else queue_flag_clear(QUEUE_FLAG_IO_STAT, q); + + elv_quisce_end(q); spin_unlock_irq(q->queue_lock); return ret; diff --git a/block/blk.h b/block/blk.h index 22043c2886c7..24fcaeeaf620 100644 --- a/block/blk.h +++ b/block/blk.h @@ -112,12 +112,14 @@ static inline int blk_cpu_to_group(int cpu) #endif } -static inline int blk_do_io_stat(struct 
request_queue *q) +static inline int blk_do_io_stat(struct request *rq) { - if (q) - return blk_queue_io_stat(q); + struct gendisk *disk = rq->rq_disk; - return 0; + if (!disk || !disk->queue) + return 0; + + return blk_queue_io_stat(disk->queue) && (rq->cmd_flags & REQ_ELVPRIV); } #endif diff --git a/block/elevator.c b/block/elevator.c index c6744913ff4a..fb81bcc14a8c 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -573,7 +573,7 @@ void elv_requeue_request(struct request_queue *q, struct request *rq) elv_insert(q, rq, ELEVATOR_INSERT_REQUEUE); } -static void elv_drain_elevator(struct request_queue *q) +void elv_drain_elevator(struct request_queue *q) { static int printed; while (q->elevator->ops->elevator_dispatch_fn(q, 1)) diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 7a204256b155..c59b769f62b0 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -116,6 +116,7 @@ extern void elv_abort_queue(struct request_queue *); extern void elv_completed_request(struct request_queue *, struct request *); extern int elv_set_request(struct request_queue *, struct request *, gfp_t); extern void elv_put_request(struct request_queue *, struct request *); +extern void elv_drain_elevator(struct request_queue *); /* * io scheduler registration -- cgit v1.2.3 From 3fbed4c61abd8458896e38633d10110cb5a589d4 Mon Sep 17 00:00:00 2001 From: unsik Kim Date: Thu, 2 Apr 2009 12:50:58 -0700 Subject: mflash: initial support This driver supports mflash IO mode for linux. Mflash is embedded flash drive and mainly targeted mobile and consumer electronic devices. Internally, mflash has nand flash and other hardware logics and supports 2 different operation (ATA, IO) modes. ATA mode doesn't need any new driver and currently works well under standard IDE subsystem. Actually it's one chip SSD. IO mode is ATA-like custom mode for the host that doesn't have IDE interface. Followings are brief descriptions about IO mode. A. IO mode based on ATA protocol and uses some custom command. (read confirm, write confirm) B. IO mode uses SRAM bus interface. C. IO mode supports 4kB boot area, so host can boot from mflash. This driver is quitely similar to a standard ATA driver, but because of following reasons it is currently seperated with ATA layer. 1. ATA layer deals standard ATA protocol. ATA layer have many low- level device specific interface, but data transfer keeps ATA rule. But, mflash IO mode doesn't. 2. Even though currently not used in mflash driver code, mflash has some custom command and modes. (nand fusing, firmware patch, etc) If this feature supported in linux kernel, ATA layer more altered. 3. Currently PATA platform device driver doesn't support interrupt. (I'm not sure) But, mflash uses interrupt (polling mode is just for debug). 4. mflash is somewhat under-develop product. Even though some company already using mflash their own product, I think more time is needed for standardization of custom command and mode. That time (maybe October) I will talk to with ATA people. If they accept integration, I will integrate. 
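As an illustration of the custom handshake in item A (not part of the patch itself), a single-sector polled read boils down to the sequence below; the helper name is made up, while mg_out(), mg_wait() and the MG_* constants are the ones this patch introduces:

    /* Illustration only: one polled sector read with the custom
     * "read confirm" step appended to an otherwise ATA-like command. */
    static int mg_read_one_sector(struct mg_host *host, u32 sect, u16 *buf)
    {
            u32 i;

            /* program LBA/count and issue the ATA-like READ command */
            if (mg_out(host, sect, 1, MG_CMD_RD, NULL) != MG_ERR_NONE)
                    return -EIO;

            /* poll for DRQ, then drain the sector from the SRAM buffer window */
            if (mg_wait(host, MG_REG_STATUS_BIT_DATA_REQ,
                        MG_TMAX_WAIT_RD_DRQ) != MG_ERR_NONE)
                    return -EIO;
            for (i = 0; i < MG_SECTOR_SIZE >> 1; i++)
                    buf[i] = inw((unsigned long)host->dev_base +
                                 MG_BUFF_OFFSET + (i << 1));

            /* the custom part: confirm the transfer back to the device */
            outb(MG_CMD_RD_CONF,
                 (unsigned long)host->dev_base + MG_REG_COMMAND);
            return 0;
    }

The write path is symmetrical, using MG_CMD_WR/MG_CMD_WR_CONF and outw() into the same buffer window.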
Signed-off-by: unsik Kim Cc: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- Documentation/blockdev/00-INDEX | 2 + Documentation/blockdev/mflash.txt | 84 ++++ drivers/block/Kconfig | 17 + drivers/block/Makefile | 1 + drivers/block/mg_disk.c | 1005 +++++++++++++++++++++++++++++++++++++ include/linux/mg_disk.h | 206 ++++++++ 6 files changed, 1315 insertions(+) create mode 100644 Documentation/blockdev/mflash.txt create mode 100644 drivers/block/mg_disk.c create mode 100644 include/linux/mg_disk.h (limited to 'include/linux') diff --git a/Documentation/blockdev/00-INDEX b/Documentation/blockdev/00-INDEX index 86f054c47013..c08df56dd91b 100644 --- a/Documentation/blockdev/00-INDEX +++ b/Documentation/blockdev/00-INDEX @@ -8,6 +8,8 @@ cpqarray.txt - info on using Compaq's SMART2 Intelligent Disk Array Controllers. floppy.txt - notes and driver options for the floppy disk driver. +mflash.txt + - info on mGine m(g)flash driver for linux. nbd.txt - info on a TCP implementation of a network block device. paride.txt diff --git a/Documentation/blockdev/mflash.txt b/Documentation/blockdev/mflash.txt new file mode 100644 index 000000000000..1f610ecf698a --- /dev/null +++ b/Documentation/blockdev/mflash.txt @@ -0,0 +1,84 @@ +This document describes m[g]flash support in linux. + +Contents + 1. Overview + 2. Reserved area configuration + 3. Example of mflash platform driver registration + +1. Overview + +Mflash and gflash are embedded flash drive. The only difference is mflash is +MCP(Multi Chip Package) device. These two device operate exactly same way. +So the rest mflash repersents mflash and gflash altogether. + +Internally, mflash has nand flash and other hardware logics and supports +2 different operation (ATA, IO) modes. ATA mode doesn't need any new +driver and currently works well under standard IDE subsystem. Actually it's +one chip SSD. IO mode is ATA-like custom mode for the host that doesn't have +IDE interface. + +Followings are brief descriptions about IO mode. +A. IO mode based on ATA protocol and uses some custom command. (read confirm, +write confirm) +B. IO mode uses SRAM bus interface. +C. IO mode supports 4kB boot area, so host can boot from mflash. + +2. Reserved area configuration +If host boot from mflash, usually needs raw area for boot loader image. All of +the mflash's block device operation will be taken this value as start offset. +Note that boot loader's size of reserved area and kernel configuration value +must be same. + +3. Example of mflash platform driver registration +Working mflash is very straight forward. Adding platform device stuff to board +configuration file is all. Here is some pseudo example. + +static struct mg_drv_data mflash_drv_data = { + /* If you want to polling driver set to 1 */ + .use_polling = 0, + /* device attribution */ + .dev_attr = MG_BOOT_DEV +}; + +static struct resource mg_mflash_rsc[] = { + /* Base address of mflash */ + [0] = { + .start = 0x08000000, + .end = 0x08000000 + SZ_64K - 1, + .flags = IORESOURCE_MEM + }, + /* mflash interrupt pin */ + [1] = { + .start = IRQ_GPIO(84), + .end = IRQ_GPIO(84), + .flags = IORESOURCE_IRQ + }, + /* mflash reset pin */ + [2] = { + .start = 43, + .end = 43, + .name = MG_RST_PIN, + .flags = IORESOURCE_IO + }, + /* mflash reset-out pin + * If you use mflash as storage device (i.e. 
other than MG_BOOT_DEV), + * should assign this */ + [3] = { + .start = 51, + .end = 51, + .name = MG_RSTOUT_PIN, + .flags = IORESOURCE_IO + } +}; + +static struct platform_device mflash_dev = { + .name = MG_DEV_NAME, + .id = -1, + .dev = { + .platform_data = &mflash_drv_data, + }, + .num_resources = ARRAY_SIZE(mg_mflash_rsc), + .resource = mg_mflash_rsc +}; + +platform_device_register(&mflash_dev); diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index e7b8aa0cb47c..ddea8e485cc9 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -410,6 +410,23 @@ config ATA_OVER_ETH This driver provides Support for ATA over Ethernet block devices like the Coraid EtherDrive (R) Storage Blade. +config MG_DISK + tristate "mGine mflash, gflash support" + depends on ARM && ATA && GPIOLIB + help + mGine mFlash(gFlash) block device driver + +config MG_DISK_RES + int "Size of reserved area before MBR" + depends on MG_DISK + default 0 + help + Define size of reserved area that usually used for boot. Unit is KB. + All of the block device operation will be taken this value as start + offset + Examples: + 1024 => 1 MB + config SUNVDC tristate "Sun Virtual Disk Client support" depends on SUN_LDOMS diff --git a/drivers/block/Makefile b/drivers/block/Makefile index 3145141cef72..7755a5e2a85e 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -21,6 +21,7 @@ obj-$(CONFIG_BLK_CPQ_CISS_DA) += cciss.o obj-$(CONFIG_BLK_DEV_DAC960) += DAC960.o obj-$(CONFIG_XILINX_SYSACE) += xsysace.o obj-$(CONFIG_CDROM_PKTCDVD) += pktcdvd.o +obj-$(CONFIG_MG_DISK) += mg_disk.o obj-$(CONFIG_SUNVDC) += sunvdc.o obj-$(CONFIG_BLK_DEV_UMEM) += umem.o diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c new file mode 100644 index 000000000000..fb39d9aa3cdc --- /dev/null +++ b/drivers/block/mg_disk.c @@ -0,0 +1,1005 @@ +/* + * drivers/block/mg_disk.c + * + * Support for the mGine m[g]flash IO mode. + * Based on legacy hd.c + * + * (c) 2008 mGine Co.,LTD + * (c) 2008 unsik Kim + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MG_RES_SEC (CONFIG_MG_DISK_RES << 1) + +static void mg_request(struct request_queue *); + +static void mg_dump_status(const char *msg, unsigned int stat, + struct mg_host *host) +{ + char *name = MG_DISK_NAME; + struct request *req; + + if (host->breq) { + req = elv_next_request(host->breq); + if (req) + name = req->rq_disk->disk_name; + } + + printk(KERN_ERR "%s: %s: status=0x%02x { ", name, msg, stat & 0xff); + if (stat & MG_REG_STATUS_BIT_BUSY) + printk("Busy "); + if (stat & MG_REG_STATUS_BIT_READY) + printk("DriveReady "); + if (stat & MG_REG_STATUS_BIT_WRITE_FAULT) + printk("WriteFault "); + if (stat & MG_REG_STATUS_BIT_SEEK_DONE) + printk("SeekComplete "); + if (stat & MG_REG_STATUS_BIT_DATA_REQ) + printk("DataRequest "); + if (stat & MG_REG_STATUS_BIT_CORRECTED_ERROR) + printk("CorrectedError "); + if (stat & MG_REG_STATUS_BIT_ERROR) + printk("Error "); + printk("}\n"); + if ((stat & MG_REG_STATUS_BIT_ERROR) == 0) { + host->error = 0; + } else { + host->error = inb((unsigned long)host->dev_base + MG_REG_ERROR); + printk(KERN_ERR "%s: %s: error=0x%02x { ", name, msg, + host->error & 0xff); + if (host->error & MG_REG_ERR_BBK) + printk("BadSector "); + if (host->error & MG_REG_ERR_UNC) + printk("UncorrectableError "); + if (host->error & MG_REG_ERR_IDNF) + printk("SectorIdNotFound "); + if (host->error & MG_REG_ERR_ABRT) + printk("DriveStatusError "); + if (host->error & MG_REG_ERR_AMNF) + printk("AddrMarkNotFound "); + printk("}"); + if (host->error & + (MG_REG_ERR_BBK | MG_REG_ERR_UNC | + MG_REG_ERR_IDNF | MG_REG_ERR_AMNF)) { + if (host->breq) { + req = elv_next_request(host->breq); + if (req) + printk(", sector=%ld", req->sector); + } + + } + printk("\n"); + } +} + +static unsigned int mg_wait(struct mg_host *host, u32 expect, u32 msec) +{ + u8 status; + unsigned long expire, cur_jiffies; + struct mg_drv_data *prv_data = host->dev->platform_data; + + host->error = MG_ERR_NONE; + expire = jiffies + msecs_to_jiffies(msec); + + status = inb((unsigned long)host->dev_base + MG_REG_STATUS); + + do { + cur_jiffies = jiffies; + if (status & MG_REG_STATUS_BIT_BUSY) { + if (expect == MG_REG_STATUS_BIT_BUSY) + break; + } else { + /* Check the error condition! 
*/ + if (status & MG_REG_STATUS_BIT_ERROR) { + mg_dump_status("mg_wait", status, host); + break; + } + + if (expect == MG_STAT_READY) + if (MG_READY_OK(status)) + break; + + if (expect == MG_REG_STATUS_BIT_DATA_REQ) + if (status & MG_REG_STATUS_BIT_DATA_REQ) + break; + } + if (!msec) { + mg_dump_status("not ready", status, host); + return MG_ERR_INV_STAT; + } + if (prv_data->use_polling) + msleep(1); + + status = inb((unsigned long)host->dev_base + MG_REG_STATUS); + } while (time_before(cur_jiffies, expire)); + + if (time_after_eq(cur_jiffies, expire) && msec) + host->error = MG_ERR_TIMEOUT; + + return host->error; +} + +static unsigned int mg_wait_rstout(u32 rstout, u32 msec) +{ + unsigned long expire; + + expire = jiffies + msecs_to_jiffies(msec); + while (time_before(jiffies, expire)) { + if (gpio_get_value(rstout) == 1) + return MG_ERR_NONE; + msleep(10); + } + + return MG_ERR_RSTOUT; +} + +static void mg_unexpected_intr(struct mg_host *host) +{ + u32 status = inb((unsigned long)host->dev_base + MG_REG_STATUS); + + mg_dump_status("mg_unexpected_intr", status, host); +} + +static irqreturn_t mg_irq(int irq, void *dev_id) +{ + struct mg_host *host = dev_id; + void (*handler)(struct mg_host *) = host->mg_do_intr; + + host->mg_do_intr = 0; + del_timer(&host->timer); + if (!handler) + handler = mg_unexpected_intr; + handler(host); + return IRQ_HANDLED; +} + +static int mg_get_disk_id(struct mg_host *host) +{ + u32 i; + s32 err; + const u16 *id = host->id; + struct mg_drv_data *prv_data = host->dev->platform_data; + char fwrev[ATA_ID_FW_REV_LEN + 1]; + char model[ATA_ID_PROD_LEN + 1]; + char serial[ATA_ID_SERNO_LEN + 1]; + + if (!prv_data->use_polling) + outb(MG_REG_CTRL_INTR_DISABLE, + (unsigned long)host->dev_base + + MG_REG_DRV_CTRL); + + outb(MG_CMD_ID, (unsigned long)host->dev_base + MG_REG_COMMAND); + err = mg_wait(host, MG_REG_STATUS_BIT_DATA_REQ, MG_TMAX_WAIT_RD_DRQ); + if (err) + return err; + + for (i = 0; i < (MG_SECTOR_SIZE >> 1); i++) + host->id[i] = le16_to_cpu(inw((unsigned long)host->dev_base + + MG_BUFF_OFFSET + i * 2)); + + outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND); + err = mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD); + if (err) + return err; + + if ((id[ATA_ID_FIELD_VALID] & 1) == 0) + return MG_ERR_TRANSLATION; + + host->n_sectors = ata_id_u32(id, ATA_ID_LBA_CAPACITY); + host->cyls = id[ATA_ID_CYLS]; + host->heads = id[ATA_ID_HEADS]; + host->sectors = id[ATA_ID_SECTORS]; + + if (MG_RES_SEC && host->heads && host->sectors) { + /* modify cyls, n_sectors */ + host->cyls = (host->n_sectors - MG_RES_SEC) / + host->heads / host->sectors; + host->nres_sectors = host->n_sectors - host->cyls * + host->heads * host->sectors; + host->n_sectors -= host->nres_sectors; + } + + ata_id_c_string(id, fwrev, ATA_ID_FW_REV, sizeof(fwrev)); + ata_id_c_string(id, model, ATA_ID_PROD, sizeof(model)); + ata_id_c_string(id, serial, ATA_ID_SERNO, sizeof(serial)); + printk(KERN_INFO "mg_disk: model: %s\n", model); + printk(KERN_INFO "mg_disk: firm: %.8s\n", fwrev); + printk(KERN_INFO "mg_disk: serial: %s\n", serial); + printk(KERN_INFO "mg_disk: %d + reserved %d sectors\n", + host->n_sectors, host->nres_sectors); + + if (!prv_data->use_polling) + outb(MG_REG_CTRL_INTR_ENABLE, (unsigned long)host->dev_base + + MG_REG_DRV_CTRL); + + return err; +} + + +static int mg_disk_init(struct mg_host *host) +{ + struct mg_drv_data *prv_data = host->dev->platform_data; + s32 err; + u8 init_status; + + /* hdd rst low */ + gpio_set_value(host->rst, 0); + err = mg_wait(host, 
MG_REG_STATUS_BIT_BUSY, MG_TMAX_RST_TO_BUSY); + if (err) + return err; + + /* hdd rst high */ + gpio_set_value(host->rst, 1); + err = mg_wait(host, MG_STAT_READY, MG_TMAX_HDRST_TO_RDY); + if (err) + return err; + + /* soft reset on */ + outb(MG_REG_CTRL_RESET | + (prv_data->use_polling ? MG_REG_CTRL_INTR_DISABLE : + MG_REG_CTRL_INTR_ENABLE), + (unsigned long)host->dev_base + MG_REG_DRV_CTRL); + err = mg_wait(host, MG_REG_STATUS_BIT_BUSY, MG_TMAX_RST_TO_BUSY); + if (err) + return err; + + /* soft reset off */ + outb(prv_data->use_polling ? MG_REG_CTRL_INTR_DISABLE : + MG_REG_CTRL_INTR_ENABLE, + (unsigned long)host->dev_base + MG_REG_DRV_CTRL); + err = mg_wait(host, MG_STAT_READY, MG_TMAX_SWRST_TO_RDY); + if (err) + return err; + + init_status = inb((unsigned long)host->dev_base + MG_REG_STATUS) & 0xf; + + if (init_status == 0xf) + return MG_ERR_INIT_STAT; + + return err; +} + +static void mg_bad_rw_intr(struct mg_host *host) +{ + struct request *req = elv_next_request(host->breq); + if (req != NULL) + if (++req->errors >= MG_MAX_ERRORS || + host->error == MG_ERR_TIMEOUT) + end_request(req, 0); +} + +static unsigned int mg_out(struct mg_host *host, + unsigned int sect_num, + unsigned int sect_cnt, + unsigned int cmd, + void (*intr_addr)(struct mg_host *)) +{ + struct mg_drv_data *prv_data = host->dev->platform_data; + + if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD)) + return host->error; + + if (!prv_data->use_polling) { + host->mg_do_intr = intr_addr; + mod_timer(&host->timer, jiffies + 3 * HZ); + } + if (MG_RES_SEC) + sect_num += MG_RES_SEC; + outb((u8)sect_cnt, (unsigned long)host->dev_base + MG_REG_SECT_CNT); + outb((u8)sect_num, (unsigned long)host->dev_base + MG_REG_SECT_NUM); + outb((u8)(sect_num >> 8), (unsigned long)host->dev_base + + MG_REG_CYL_LOW); + outb((u8)(sect_num >> 16), (unsigned long)host->dev_base + + MG_REG_CYL_HIGH); + outb((u8)((sect_num >> 24) | MG_REG_HEAD_LBA_MODE), + (unsigned long)host->dev_base + MG_REG_DRV_HEAD); + outb(cmd, (unsigned long)host->dev_base + MG_REG_COMMAND); + return MG_ERR_NONE; +} + +static void mg_read(struct request *req) +{ + u32 remains, j; + struct mg_host *host = req->rq_disk->private_data; + + remains = req->nr_sectors; + + if (mg_out(host, req->sector, req->nr_sectors, MG_CMD_RD, 0) != + MG_ERR_NONE) + mg_bad_rw_intr(host); + + MG_DBG("requested %d sects (from %ld), buffer=0x%p\n", + remains, req->sector, req->buffer); + + while (remains) { + if (mg_wait(host, MG_REG_STATUS_BIT_DATA_REQ, + MG_TMAX_WAIT_RD_DRQ) != MG_ERR_NONE) { + mg_bad_rw_intr(host); + return; + } + for (j = 0; j < MG_SECTOR_SIZE >> 1; j++) { + *(u16 *)req->buffer = + inw((unsigned long)host->dev_base + + MG_BUFF_OFFSET + (j << 1)); + req->buffer += 2; + } + + req->sector++; + req->errors = 0; + remains = --req->nr_sectors; + --req->current_nr_sectors; + + if (req->current_nr_sectors <= 0) { + MG_DBG("remain : %d sects\n", remains); + end_request(req, 1); + if (remains > 0) + req = elv_next_request(host->breq); + } + + outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base + + MG_REG_COMMAND); + } +} + +static void mg_write(struct request *req) +{ + u32 remains, j; + struct mg_host *host = req->rq_disk->private_data; + + remains = req->nr_sectors; + + if (mg_out(host, req->sector, req->nr_sectors, MG_CMD_WR, 0) != + MG_ERR_NONE) { + mg_bad_rw_intr(host); + return; + } + + + MG_DBG("requested %d sects (from %ld), buffer=0x%p\n", + remains, req->sector, req->buffer); + while (remains) { + if (mg_wait(host, MG_REG_STATUS_BIT_DATA_REQ, + MG_TMAX_WAIT_WR_DRQ) != 
MG_ERR_NONE) { + mg_bad_rw_intr(host); + return; + } + for (j = 0; j < MG_SECTOR_SIZE >> 1; j++) { + outw(*(u16 *)req->buffer, + (unsigned long)host->dev_base + + MG_BUFF_OFFSET + (j << 1)); + req->buffer += 2; + } + req->sector++; + remains = --req->nr_sectors; + --req->current_nr_sectors; + + if (req->current_nr_sectors <= 0) { + MG_DBG("remain : %d sects\n", remains); + end_request(req, 1); + if (remains > 0) + req = elv_next_request(host->breq); + } + + outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base + + MG_REG_COMMAND); + } +} + +static void mg_read_intr(struct mg_host *host) +{ + u32 i; + struct request *req; + + /* check status */ + do { + i = inb((unsigned long)host->dev_base + MG_REG_STATUS); + if (i & MG_REG_STATUS_BIT_BUSY) + break; + if (!MG_READY_OK(i)) + break; + if (i & MG_REG_STATUS_BIT_DATA_REQ) + goto ok_to_read; + } while (0); + mg_dump_status("mg_read_intr", i, host); + mg_bad_rw_intr(host); + mg_request(host->breq); + return; + +ok_to_read: + /* get current segment of request */ + req = elv_next_request(host->breq); + + /* read 1 sector */ + for (i = 0; i < MG_SECTOR_SIZE >> 1; i++) { + *(u16 *)req->buffer = + inw((unsigned long)host->dev_base + MG_BUFF_OFFSET + + (i << 1)); + req->buffer += 2; + } + + /* manipulate request */ + MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n", + req->sector, req->nr_sectors - 1, req->buffer); + + req->sector++; + req->errors = 0; + i = --req->nr_sectors; + --req->current_nr_sectors; + + /* let know if current segment done */ + if (req->current_nr_sectors <= 0) + end_request(req, 1); + + /* set handler if read remains */ + if (i > 0) { + host->mg_do_intr = mg_read_intr; + mod_timer(&host->timer, jiffies + 3 * HZ); + } + + /* send read confirm */ + outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND); + + /* goto next request */ + if (!i) + mg_request(host->breq); +} + +static void mg_write_intr(struct mg_host *host) +{ + u32 i, j; + u16 *buff; + struct request *req; + + /* get current segment of request */ + req = elv_next_request(host->breq); + + /* check status */ + do { + i = inb((unsigned long)host->dev_base + MG_REG_STATUS); + if (i & MG_REG_STATUS_BIT_BUSY) + break; + if (!MG_READY_OK(i)) + break; + if ((req->nr_sectors <= 1) || (i & MG_REG_STATUS_BIT_DATA_REQ)) + goto ok_to_write; + } while (0); + mg_dump_status("mg_write_intr", i, host); + mg_bad_rw_intr(host); + mg_request(host->breq); + return; + +ok_to_write: + /* manipulate request */ + req->sector++; + i = --req->nr_sectors; + --req->current_nr_sectors; + req->buffer += MG_SECTOR_SIZE; + + /* let know if current segment or all done */ + if (!i || (req->bio && req->current_nr_sectors <= 0)) + end_request(req, 1); + + /* write 1 sector and set handler if remains */ + if (i > 0) { + buff = (u16 *)req->buffer; + for (j = 0; j < MG_STORAGE_BUFFER_SIZE >> 1; j++) { + outw(*buff, (unsigned long)host->dev_base + + MG_BUFF_OFFSET + (j << 1)); + buff++; + } + MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n", + req->sector, req->nr_sectors, req->buffer); + host->mg_do_intr = mg_write_intr; + mod_timer(&host->timer, jiffies + 3 * HZ); + } + + /* send write confirm */ + outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND); + + if (!i) + mg_request(host->breq); +} + +void mg_times_out(unsigned long data) +{ + struct mg_host *host = (struct mg_host *)data; + char *name; + struct request *req; + + req = elv_next_request(host->breq); + if (!req) + return; + + host->mg_do_intr = NULL; + + name = req->rq_disk->disk_name; + printk(KERN_DEBUG "%s: 
timeout\n", name); + + host->error = MG_ERR_TIMEOUT; + mg_bad_rw_intr(host); + + mg_request(host->breq); +} + +static void mg_request_poll(struct request_queue *q) +{ + struct request *req; + struct mg_host *host; + + while ((req = elv_next_request(q)) != NULL) { + host = req->rq_disk->private_data; + if (blk_fs_request(req)) { + switch (rq_data_dir(req)) { + case READ: + mg_read(req); + break; + case WRITE: + mg_write(req); + break; + default: + printk(KERN_WARNING "%s:%d unknown command\n", + __func__, __LINE__); + end_request(req, 0); + break; + } + } + } +} + +static unsigned int mg_issue_req(struct request *req, + struct mg_host *host, + unsigned int sect_num, + unsigned int sect_cnt) +{ + u16 *buff; + u32 i; + + switch (rq_data_dir(req)) { + case READ: + if (mg_out(host, sect_num, sect_cnt, MG_CMD_RD, &mg_read_intr) + != MG_ERR_NONE) { + mg_bad_rw_intr(host); + return host->error; + } + break; + case WRITE: + /* TODO : handler */ + outb(MG_REG_CTRL_INTR_DISABLE, + (unsigned long)host->dev_base + + MG_REG_DRV_CTRL); + if (mg_out(host, sect_num, sect_cnt, MG_CMD_WR, &mg_write_intr) + != MG_ERR_NONE) { + mg_bad_rw_intr(host); + return host->error; + } + del_timer(&host->timer); + mg_wait(host, MG_REG_STATUS_BIT_DATA_REQ, MG_TMAX_WAIT_WR_DRQ); + outb(MG_REG_CTRL_INTR_ENABLE, (unsigned long)host->dev_base + + MG_REG_DRV_CTRL); + if (host->error) { + mg_bad_rw_intr(host); + return host->error; + } + buff = (u16 *)req->buffer; + for (i = 0; i < MG_SECTOR_SIZE >> 1; i++) { + outw(*buff, (unsigned long)host->dev_base + + MG_BUFF_OFFSET + (i << 1)); + buff++; + } + mod_timer(&host->timer, jiffies + 3 * HZ); + outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base + + MG_REG_COMMAND); + break; + default: + printk(KERN_WARNING "%s:%d unknown command\n", + __func__, __LINE__); + end_request(req, 0); + break; + } + return MG_ERR_NONE; +} + +/* This function also called from IRQ context */ +static void mg_request(struct request_queue *q) +{ + struct request *req; + struct mg_host *host; + u32 sect_num, sect_cnt; + + while (1) { + req = elv_next_request(q); + if (!req) + return; + + host = req->rq_disk->private_data; + + /* check unwanted request call */ + if (host->mg_do_intr) + return; + + del_timer(&host->timer); + + sect_num = req->sector; + /* deal whole segments */ + sect_cnt = req->nr_sectors; + + /* sanity check */ + if (sect_num >= get_capacity(req->rq_disk) || + ((sect_num + sect_cnt) > + get_capacity(req->rq_disk))) { + printk(KERN_WARNING + "%s: bad access: sector=%d, count=%d\n", + req->rq_disk->disk_name, + sect_num, sect_cnt); + end_request(req, 0); + continue; + } + + if (!blk_fs_request(req)) + return; + + if (!mg_issue_req(req, host, sect_num, sect_cnt)) + return; + } +} + +static int mg_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + struct mg_host *host = bdev->bd_disk->private_data; + + geo->cylinders = (unsigned short)host->cyls; + geo->heads = (unsigned char)host->heads; + geo->sectors = (unsigned char)host->sectors; + return 0; +} + +static struct block_device_operations mg_disk_ops = { + .getgeo = mg_getgeo +}; + +static int mg_suspend(struct platform_device *plat_dev, pm_message_t state) +{ + struct mg_drv_data *prv_data = plat_dev->dev.platform_data; + struct mg_host *host = prv_data->host; + + if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD)) + return -EIO; + + if (!prv_data->use_polling) + outb(MG_REG_CTRL_INTR_DISABLE, + (unsigned long)host->dev_base + + MG_REG_DRV_CTRL); + + outb(MG_CMD_SLEEP, (unsigned long)host->dev_base + MG_REG_COMMAND); + /* wait 
until mflash deep sleep */ + msleep(1); + + if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD)) { + if (!prv_data->use_polling) + outb(MG_REG_CTRL_INTR_ENABLE, + (unsigned long)host->dev_base + + MG_REG_DRV_CTRL); + return -EIO; + } + + return 0; +} + +static int mg_resume(struct platform_device *plat_dev) +{ + struct mg_drv_data *prv_data = plat_dev->dev.platform_data; + struct mg_host *host = prv_data->host; + + if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD)) + return -EIO; + + outb(MG_CMD_WAKEUP, (unsigned long)host->dev_base + MG_REG_COMMAND); + /* wait until mflash wakeup */ + msleep(1); + + if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD)) + return -EIO; + + if (!prv_data->use_polling) + outb(MG_REG_CTRL_INTR_ENABLE, (unsigned long)host->dev_base + + MG_REG_DRV_CTRL); + + return 0; +} + +static int mg_probe(struct platform_device *plat_dev) +{ + struct mg_host *host; + struct resource *rsc; + struct mg_drv_data *prv_data = plat_dev->dev.platform_data; + int err = 0; + + if (!prv_data) { + printk(KERN_ERR "%s:%d fail (no driver_data)\n", + __func__, __LINE__); + err = -EINVAL; + goto probe_err; + } + + /* alloc mg_host */ + host = kzalloc(sizeof(struct mg_host), GFP_KERNEL); + if (!host) { + printk(KERN_ERR "%s:%d fail (no memory for mg_host)\n", + __func__, __LINE__); + err = -ENOMEM; + goto probe_err; + } + host->major = MG_DISK_MAJ; + + /* link each other */ + prv_data->host = host; + host->dev = &plat_dev->dev; + + /* io remap */ + rsc = platform_get_resource(plat_dev, IORESOURCE_MEM, 0); + if (!rsc) { + printk(KERN_ERR "%s:%d platform_get_resource fail\n", + __func__, __LINE__); + err = -EINVAL; + goto probe_err_2; + } + host->dev_base = ioremap(rsc->start , rsc->end + 1); + if (!host->dev_base) { + printk(KERN_ERR "%s:%d ioremap fail\n", + __func__, __LINE__); + err = -EIO; + goto probe_err_2; + } + MG_DBG("dev_base = 0x%x\n", (u32)host->dev_base); + + /* get reset pin */ + rsc = platform_get_resource_byname(plat_dev, IORESOURCE_IO, + MG_RST_PIN); + if (!rsc) { + printk(KERN_ERR "%s:%d get reset pin fail\n", + __func__, __LINE__); + err = -EIO; + goto probe_err_3; + } + host->rst = rsc->start; + + /* init rst pin */ + err = gpio_request(host->rst, MG_RST_PIN); + if (err) + goto probe_err_3; + gpio_direction_output(host->rst, 1); + + /* reset out pin */ + if (!(prv_data->dev_attr & MG_DEV_MASK)) + goto probe_err_3a; + + if (prv_data->dev_attr != MG_BOOT_DEV) { + rsc = platform_get_resource_byname(plat_dev, IORESOURCE_IO, + MG_RSTOUT_PIN); + if (!rsc) { + printk(KERN_ERR "%s:%d get reset-out pin fail\n", + __func__, __LINE__); + err = -EIO; + goto probe_err_3a; + } + host->rstout = rsc->start; + err = gpio_request(host->rstout, MG_RSTOUT_PIN); + if (err) + goto probe_err_3a; + gpio_direction_input(host->rstout); + } + + /* disk reset */ + if (prv_data->dev_attr == MG_STORAGE_DEV) { + /* If POR seq. 
not yet finised, wait */ + err = mg_wait_rstout(host->rstout, MG_TMAX_RSTOUT); + if (err) + goto probe_err_3b; + err = mg_disk_init(host); + if (err) { + printk(KERN_ERR "%s:%d fail (err code : %d)\n", + __func__, __LINE__, err); + err = -EIO; + goto probe_err_3b; + } + } + + /* get irq resource */ + if (!prv_data->use_polling) { + host->irq = platform_get_irq(plat_dev, 0); + if (host->irq == -ENXIO) { + err = host->irq; + goto probe_err_3b; + } + err = request_irq(host->irq, mg_irq, + IRQF_DISABLED | IRQF_TRIGGER_RISING, + MG_DEV_NAME, host); + if (err) { + printk(KERN_ERR "%s:%d fail (request_irq err=%d)\n", + __func__, __LINE__, err); + goto probe_err_3b; + } + + } + + /* get disk id */ + err = mg_get_disk_id(host); + if (err) { + printk(KERN_ERR "%s:%d fail (err code : %d)\n", + __func__, __LINE__, err); + err = -EIO; + goto probe_err_4; + } + + err = register_blkdev(host->major, MG_DISK_NAME); + if (err < 0) { + printk(KERN_ERR "%s:%d register_blkdev fail (err code : %d)\n", + __func__, __LINE__, err); + goto probe_err_4; + } + if (!host->major) + host->major = err; + + spin_lock_init(&host->lock); + + if (prv_data->use_polling) + host->breq = blk_init_queue(mg_request_poll, &host->lock); + else + host->breq = blk_init_queue(mg_request, &host->lock); + + if (!host->breq) { + err = -ENOMEM; + printk(KERN_ERR "%s:%d (blk_init_queue) fail\n", + __func__, __LINE__); + goto probe_err_5; + } + + /* mflash is random device, thanx for the noop */ + elevator_exit(host->breq->elevator); + err = elevator_init(host->breq, "noop"); + if (err) { + printk(KERN_ERR "%s:%d (elevator_init) fail\n", + __func__, __LINE__); + goto probe_err_6; + } + blk_queue_max_sectors(host->breq, MG_MAX_SECTS); + blk_queue_hardsect_size(host->breq, MG_SECTOR_SIZE); + + init_timer(&host->timer); + host->timer.function = mg_times_out; + host->timer.data = (unsigned long)host; + + host->gd = alloc_disk(MG_DISK_MAX_PART); + if (!host->gd) { + printk(KERN_ERR "%s:%d (alloc_disk) fail\n", + __func__, __LINE__); + err = -ENOMEM; + goto probe_err_7; + } + host->gd->major = host->major; + host->gd->first_minor = 0; + host->gd->fops = &mg_disk_ops; + host->gd->queue = host->breq; + host->gd->private_data = host; + sprintf(host->gd->disk_name, MG_DISK_NAME"a"); + + set_capacity(host->gd, host->n_sectors); + + add_disk(host->gd); + + return err; + +probe_err_7: + del_timer_sync(&host->timer); +probe_err_6: + blk_cleanup_queue(host->breq); +probe_err_5: + unregister_blkdev(MG_DISK_MAJ, MG_DISK_NAME); +probe_err_4: + if (!prv_data->use_polling) + free_irq(host->irq, host); +probe_err_3b: + gpio_free(host->rstout); +probe_err_3a: + gpio_free(host->rst); +probe_err_3: + iounmap(host->dev_base); +probe_err_2: + kfree(host); +probe_err: + return err; +} + +static int mg_remove(struct platform_device *plat_dev) +{ + struct mg_drv_data *prv_data = plat_dev->dev.platform_data; + struct mg_host *host = prv_data->host; + int err = 0; + + /* delete timer */ + del_timer_sync(&host->timer); + + /* remove disk */ + if (host->gd) { + del_gendisk(host->gd); + put_disk(host->gd); + } + /* remove queue */ + if (host->breq) + blk_cleanup_queue(host->breq); + + /* unregister blk device */ + unregister_blkdev(host->major, MG_DISK_NAME); + + /* free irq */ + if (!prv_data->use_polling) + free_irq(host->irq, host); + + /* free reset-out pin */ + if (prv_data->dev_attr != MG_BOOT_DEV) + gpio_free(host->rstout); + + /* free rst pin */ + if (host->rst) + gpio_free(host->rst); + + /* unmap io */ + if (host->dev_base) + iounmap(host->dev_base); + + /* free 
mg_host */ + kfree(host); + + return err; +} + +static struct platform_driver mg_disk_driver = { + .probe = mg_probe, + .remove = mg_remove, + .suspend = mg_suspend, + .resume = mg_resume, + .driver = { + .name = MG_DEV_NAME, + .owner = THIS_MODULE, + } +}; + +/**************************************************************************** + * + * Module stuff + * + ****************************************************************************/ + +static int __init mg_init(void) +{ + printk(KERN_INFO "mGine mflash driver, (c) 2008 mGine Co.\n"); + return platform_driver_register(&mg_disk_driver); +} + +static void __exit mg_exit(void) +{ + printk(KERN_INFO "mflash driver : bye bye\n"); + platform_driver_unregister(&mg_disk_driver); +} + +module_init(mg_init); +module_exit(mg_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("unsik Kim "); +MODULE_DESCRIPTION("mGine m[g]flash device driver"); diff --git a/include/linux/mg_disk.h b/include/linux/mg_disk.h new file mode 100644 index 000000000000..1f76b1ebf627 --- /dev/null +++ b/include/linux/mg_disk.h @@ -0,0 +1,206 @@ +/* + * include/linux/mg_disk.c + * + * Support for the mGine m[g]flash IO mode. + * Based on legacy hd.c + * + * (c) 2008 mGine Co.,LTD + * (c) 2008 unsik Kim + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __MG_DISK_H__ +#define __MG_DISK_H__ + +#include +#include + +/* name for block device */ +#define MG_DISK_NAME "mgd" +/* name for platform device */ +#define MG_DEV_NAME "mg_disk" + +#define MG_DISK_MAJ 0 +#define MG_DISK_MAX_PART 16 +#define MG_SECTOR_SIZE 512 +#define MG_MAX_SECTS 256 + +/* Register offsets */ +#define MG_BUFF_OFFSET 0x8000 +#define MG_STORAGE_BUFFER_SIZE 0x200 +#define MG_REG_OFFSET 0xC000 +#define MG_REG_FEATURE (MG_REG_OFFSET + 2) /* write case */ +#define MG_REG_ERROR (MG_REG_OFFSET + 2) /* read case */ +#define MG_REG_SECT_CNT (MG_REG_OFFSET + 4) +#define MG_REG_SECT_NUM (MG_REG_OFFSET + 6) +#define MG_REG_CYL_LOW (MG_REG_OFFSET + 8) +#define MG_REG_CYL_HIGH (MG_REG_OFFSET + 0xA) +#define MG_REG_DRV_HEAD (MG_REG_OFFSET + 0xC) +#define MG_REG_COMMAND (MG_REG_OFFSET + 0xE) /* write case */ +#define MG_REG_STATUS (MG_REG_OFFSET + 0xE) /* read case */ +#define MG_REG_DRV_CTRL (MG_REG_OFFSET + 0x10) +#define MG_REG_BURST_CTRL (MG_REG_OFFSET + 0x12) + +/* "Drive Select/Head Register" bit values */ +#define MG_REG_HEAD_MUST_BE_ON 0xA0 /* These 2 bits are always on */ +#define MG_REG_HEAD_DRIVE_MASTER (0x00 | MG_REG_HEAD_MUST_BE_ON) +#define MG_REG_HEAD_DRIVE_SLAVE (0x10 | MG_REG_HEAD_MUST_BE_ON) +#define MG_REG_HEAD_LBA_MODE (0x40 | MG_REG_HEAD_MUST_BE_ON) + + +/* "Device Control Register" bit values */ +#define MG_REG_CTRL_INTR_ENABLE 0x0 +#define MG_REG_CTRL_INTR_DISABLE (0x1<<1) +#define MG_REG_CTRL_RESET (0x1<<2) +#define MG_REG_CTRL_INTR_POLA_ACTIVE_HIGH 0x0 +#define MG_REG_CTRL_INTR_POLA_ACTIVE_LOW (0x1<<4) +#define MG_REG_CTRL_DPD_POLA_ACTIVE_LOW 0x0 +#define MG_REG_CTRL_DPD_POLA_ACTIVE_HIGH (0x1<<5) +#define MG_REG_CTRL_DPD_DISABLE 0x0 +#define MG_REG_CTRL_DPD_ENABLE (0x1<<6) + +/* Status register bit */ +/* error bit in status register */ +#define MG_REG_STATUS_BIT_ERROR 0x01 +/* corrected error in status register */ +#define MG_REG_STATUS_BIT_CORRECTED_ERROR 0x04 +/* data request bit in status register */ +#define MG_REG_STATUS_BIT_DATA_REQ 0x08 +/* DSC - Drive Seek Complete */ +#define MG_REG_STATUS_BIT_SEEK_DONE 0x10 +/* DWF - Drive Write 
Fault */ +#define MG_REG_STATUS_BIT_WRITE_FAULT 0x20 +#define MG_REG_STATUS_BIT_READY 0x40 +#define MG_REG_STATUS_BIT_BUSY 0x80 + +/* handy status */ +#define MG_STAT_READY (MG_REG_STATUS_BIT_READY | MG_REG_STATUS_BIT_SEEK_DONE) +#define MG_READY_OK(s) (((s) & (MG_STAT_READY | \ + (MG_REG_STATUS_BIT_BUSY | \ + MG_REG_STATUS_BIT_WRITE_FAULT | \ + MG_REG_STATUS_BIT_ERROR))) == MG_STAT_READY) + +/* Error register */ +#define MG_REG_ERR_AMNF 0x01 +#define MG_REG_ERR_ABRT 0x04 +#define MG_REG_ERR_IDNF 0x10 +#define MG_REG_ERR_UNC 0x40 +#define MG_REG_ERR_BBK 0x80 + +/* error code for others */ +#define MG_ERR_NONE 0 +#define MG_ERR_TIMEOUT 0x100 +#define MG_ERR_INIT_STAT 0x101 +#define MG_ERR_TRANSLATION 0x102 +#define MG_ERR_CTRL_RST 0x103 +#define MG_ERR_INV_STAT 0x104 +#define MG_ERR_RSTOUT 0x105 + +#define MG_MAX_ERRORS 6 /* Max read/write errors */ + +/* command */ +#define MG_CMD_RD 0x20 +#define MG_CMD_WR 0x30 +#define MG_CMD_SLEEP 0x99 +#define MG_CMD_WAKEUP 0xC3 +#define MG_CMD_ID 0xEC +#define MG_CMD_WR_CONF 0x3C +#define MG_CMD_RD_CONF 0x40 + +/* operation mode */ +#define MG_OP_CASCADE (1 << 0) +#define MG_OP_CASCADE_SYNC_RD (1 << 1) +#define MG_OP_CASCADE_SYNC_WR (1 << 2) +#define MG_OP_INTERLEAVE (1 << 3) + +/* synchronous */ +#define MG_BURST_LAT_4 (3 << 4) +#define MG_BURST_LAT_5 (4 << 4) +#define MG_BURST_LAT_6 (5 << 4) +#define MG_BURST_LAT_7 (6 << 4) +#define MG_BURST_LAT_8 (7 << 4) +#define MG_BURST_LEN_4 (1 << 1) +#define MG_BURST_LEN_8 (2 << 1) +#define MG_BURST_LEN_16 (3 << 1) +#define MG_BURST_LEN_32 (4 << 1) +#define MG_BURST_LEN_CONT (0 << 1) + +/* timeout value (unit: ms) */ +#define MG_TMAX_CONF_TO_CMD 1 +#define MG_TMAX_WAIT_RD_DRQ 10 +#define MG_TMAX_WAIT_WR_DRQ 500 +#define MG_TMAX_RST_TO_BUSY 10 +#define MG_TMAX_HDRST_TO_RDY 500 +#define MG_TMAX_SWRST_TO_RDY 500 +#define MG_TMAX_RSTOUT 3000 + +/* device attribution */ +/* use mflash as boot device */ +#define MG_BOOT_DEV (1 << 0) +/* use mflash as storage device */ +#define MG_STORAGE_DEV (1 << 1) +/* same as MG_STORAGE_DEV, but bootloader already done reset sequence */ +#define MG_STORAGE_DEV_SKIP_RST (1 << 2) + +#define MG_DEV_MASK (MG_BOOT_DEV | MG_STORAGE_DEV | MG_STORAGE_DEV_SKIP_RST) + +/* names of GPIO resource */ +#define MG_RST_PIN "mg_rst" +/* except MG_BOOT_DEV, reset-out pin should be assigned */ +#define MG_RSTOUT_PIN "mg_rstout" + +/* private driver data */ +struct mg_drv_data { + /* disk resource */ + u32 use_polling; + + /* device attribution */ + u32 dev_attr; + + /* internally used */ + struct mg_host *host; +}; + +/* main structure for mflash driver */ +struct mg_host { + struct device *dev; + + struct request_queue *breq; + spinlock_t lock; + struct gendisk *gd; + + struct timer_list timer; + void (*mg_do_intr) (struct mg_host *); + + u16 id[ATA_ID_WORDS]; + + u16 cyls; + u16 heads; + u16 sectors; + u32 n_sectors; + u32 nres_sectors; + + void __iomem *dev_base; + unsigned int irq; + unsigned int rst; + unsigned int rstout; + + u32 major; + u32 error; +}; + +/* + * Debugging macro and defines + */ +#undef DO_MG_DEBUG +#ifdef DO_MG_DEBUG +# define MG_DBG(fmt, args...) \ + printk(KERN_DEBUG "%s:%d "fmt, __func__, __LINE__, ##args) +#else /* CONFIG_MG_DEBUG */ +# define MG_DBG(fmt, args...) 
do { } while (0) +#endif /* CONFIG_MG_DEBUG */ + +#endif -- cgit v1.2.3 From 2385327725419a76cfbca7258abd95908b8ba9eb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 7 Apr 2009 08:59:11 +0200 Subject: block: remove unused REQ_UNPLUG The request inherits the unplug flag from the bio, but it isn't actually used. The bio flag stops at __make_request(), which tells it to unplug after submission. Passing it on to the request doesn't make any sense. Signed-off-by: Jens Axboe --- block/blk-core.c | 2 -- include/linux/blkdev.h | 2 -- 2 files changed, 4 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index 3688abff2430..43fdedc524ee 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1123,8 +1123,6 @@ void init_request_from_bio(struct request *req, struct bio *bio) if (bio_sync(bio)) req->cmd_flags |= REQ_RW_SYNC; - if (bio_unplug(bio)) - req->cmd_flags |= REQ_UNPLUG; if (bio_rw_meta(bio)) req->cmd_flags |= REQ_RW_META; if (bio_noidle(bio)) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e03660964e02..ba54c834a590 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -117,7 +117,6 @@ enum rq_flag_bits { __REQ_RW_META, /* metadata io request */ __REQ_COPY_USER, /* contains copies of user pages */ __REQ_INTEGRITY, /* integrity metadata has been remapped */ - __REQ_UNPLUG, /* unplug queue on submission */ __REQ_NOIDLE, /* Don't anticipate more IO after this one */ __REQ_NR_BITS, /* stops here */ }; @@ -145,7 +144,6 @@ enum rq_flag_bits { #define REQ_RW_META (1 << __REQ_RW_META) #define REQ_COPY_USER (1 << __REQ_COPY_USER) #define REQ_INTEGRITY (1 << __REQ_INTEGRITY) -#define REQ_UNPLUG (1 << __REQ_UNPLUG) #define REQ_NOIDLE (1 << __REQ_NOIDLE) #define BLK_MAX_CDB 16 -- cgit v1.2.3 From b486ddbc0fb8127ccf2c820cfbf0b98e6f8a4e97 Mon Sep 17 00:00:00 2001 From: Oskar Schirmer Date: Thu, 2 Apr 2009 13:19:07 +0200 Subject: i2c: xtensa s6000 i2c driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Support for the s6000 on-chip i2c controller. Signed-off-by: Oskar Schirmer Signed-off-by: Daniel Glöckner Signed-off-by: Ben Dooks --- drivers/i2c/busses/Kconfig | 10 + drivers/i2c/busses/Makefile | 1 + drivers/i2c/busses/i2c-s6000.c | 407 +++++++++++++++++++++++++++++++++++++++++ drivers/i2c/busses/i2c-s6000.h | 79 ++++++++ include/linux/i2c/s6000.h | 10 + 5 files changed, 507 insertions(+) create mode 100644 drivers/i2c/busses/i2c-s6000.c create mode 100644 drivers/i2c/busses/i2c-s6000.h create mode 100644 include/linux/i2c/s6000.h (limited to 'include/linux') diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig index 5d4aa5e873ca..94eae5c3cbc7 100644 --- a/drivers/i2c/busses/Kconfig +++ b/drivers/i2c/busses/Kconfig @@ -472,6 +472,16 @@ config I2C_S3C2410 Say Y here to include support for I2C controller in the Samsung S3C2410 based System-on-Chip devices. +config I2C_S6000 + tristate "S6000 I2C support" + depends on XTENSA_VARIANT_S6000 + help + This driver supports the on chip I2C device on the + S6000 xtensa processor family. + + To compile this driver as a module, choose M here. The module + will be called i2c-s6000. 
+ config I2C_SH7760 tristate "Renesas SH7760 I2C Controller" depends on CPU_SUBTYPE_SH7760 diff --git a/drivers/i2c/busses/Makefile b/drivers/i2c/busses/Makefile index 657ab2428011..776acb6403a7 100644 --- a/drivers/i2c/busses/Makefile +++ b/drivers/i2c/busses/Makefile @@ -44,6 +44,7 @@ obj-$(CONFIG_I2C_PASEMI) += i2c-pasemi.o obj-$(CONFIG_I2C_PNX) += i2c-pnx.o obj-$(CONFIG_I2C_PXA) += i2c-pxa.o obj-$(CONFIG_I2C_S3C2410) += i2c-s3c2410.o +obj-$(CONFIG_I2C_S6000) += i2c-s6000.o obj-$(CONFIG_I2C_SH7760) += i2c-sh7760.o obj-$(CONFIG_I2C_SH_MOBILE) += i2c-sh_mobile.o obj-$(CONFIG_I2C_SIMTEC) += i2c-simtec.o diff --git a/drivers/i2c/busses/i2c-s6000.c b/drivers/i2c/busses/i2c-s6000.c new file mode 100644 index 000000000000..c91359f4965c --- /dev/null +++ b/drivers/i2c/busses/i2c-s6000.c @@ -0,0 +1,407 @@ +/* + * drivers/i2c/busses/i2c-s6000.c + * + * Description: Driver for S6000 Family I2C Interface + * Copyright (c) 2008 emlix GmbH + * Author: Oskar Schirmer + * + * Partially based on i2c-bfin-twi.c driver by + * Copyright (c) 2005-2007 Analog Devices, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "i2c-s6000.h" + +#define DRV_NAME "i2c-s6000" + +#define POLL_TIMEOUT (2 * HZ) + +struct s6i2c_if { + u8 __iomem *reg; /* memory mapped registers */ + int irq; + spinlock_t lock; + struct i2c_msg *msgs; /* messages currently handled */ + int msgs_num; /* nb of msgs to do */ + int msgs_push; /* nb of msgs read/written */ + int msgs_done; /* nb of msgs finally handled */ + unsigned push; /* nb of bytes read/written in msg */ + unsigned done; /* nb of bytes finally handled */ + int timeout_count; /* timeout retries left */ + struct timer_list timeout_timer; + struct i2c_adapter adap; + struct completion complete; + struct clk *clk; + struct resource *res; +}; + +static inline u16 i2c_rd16(struct s6i2c_if *iface, unsigned n) +{ + return readw(iface->reg + (n)); +} + +static inline void i2c_wr16(struct s6i2c_if *iface, unsigned n, u16 v) +{ + writew(v, iface->reg + (n)); +} + +static inline u32 i2c_rd32(struct s6i2c_if *iface, unsigned n) +{ + return readl(iface->reg + (n)); +} + +static inline void i2c_wr32(struct s6i2c_if *iface, unsigned n, u32 v) +{ + writel(v, iface->reg + (n)); +} + +static struct s6i2c_if s6i2c_if; + +static void s6i2c_handle_interrupt(struct s6i2c_if *iface) +{ + if (i2c_rd16(iface, S6_I2C_INTRSTAT) & (1 << S6_I2C_INTR_TXABRT)) { + i2c_rd16(iface, S6_I2C_CLRTXABRT); + i2c_wr16(iface, S6_I2C_INTRMASK, 0); + complete(&iface->complete); + return; + } + if (iface->msgs_done >= iface->msgs_num) { + dev_err(&iface->adap.dev, "s6i2c: spurious I2C irq: %04x\n", + i2c_rd16(iface, S6_I2C_INTRSTAT)); + i2c_wr16(iface, S6_I2C_INTRMASK, 0); + return; + } + while 
((iface->msgs_push < iface->msgs_num) + && (i2c_rd16(iface, S6_I2C_STATUS) & (1 << S6_I2C_STATUS_TFNF))) { + struct i2c_msg *m = &iface->msgs[iface->msgs_push]; + if (!(m->flags & I2C_M_RD)) + i2c_wr16(iface, S6_I2C_DATACMD, m->buf[iface->push]); + else + i2c_wr16(iface, S6_I2C_DATACMD, + 1 << S6_I2C_DATACMD_READ); + if (++iface->push >= m->len) { + iface->push = 0; + iface->msgs_push += 1; + } + } + do { + struct i2c_msg *m = &iface->msgs[iface->msgs_done]; + if (!(m->flags & I2C_M_RD)) { + if (iface->msgs_done < iface->msgs_push) + iface->msgs_done += 1; + else + break; + } else if (i2c_rd16(iface, S6_I2C_STATUS) + & (1 << S6_I2C_STATUS_RFNE)) { + m->buf[iface->done] = i2c_rd16(iface, S6_I2C_DATACMD); + if (++iface->done >= m->len) { + iface->done = 0; + iface->msgs_done += 1; + } + } else{ + break; + } + } while (iface->msgs_done < iface->msgs_num); + if (iface->msgs_done >= iface->msgs_num) { + i2c_wr16(iface, S6_I2C_INTRMASK, 1 << S6_I2C_INTR_TXABRT); + complete(&iface->complete); + } else if (iface->msgs_push >= iface->msgs_num) { + i2c_wr16(iface, S6_I2C_INTRMASK, (1 << S6_I2C_INTR_TXABRT) | + (1 << S6_I2C_INTR_RXFULL)); + } else { + i2c_wr16(iface, S6_I2C_INTRMASK, (1 << S6_I2C_INTR_TXABRT) | + (1 << S6_I2C_INTR_TXEMPTY) | + (1 << S6_I2C_INTR_RXFULL)); + } +} + +static irqreturn_t s6i2c_interrupt_entry(int irq, void *dev_id) +{ + struct s6i2c_if *iface = dev_id; + if (!(i2c_rd16(iface, S6_I2C_STATUS) & ((1 << S6_I2C_INTR_RXUNDER) + | (1 << S6_I2C_INTR_RXOVER) + | (1 << S6_I2C_INTR_RXFULL) + | (1 << S6_I2C_INTR_TXOVER) + | (1 << S6_I2C_INTR_TXEMPTY) + | (1 << S6_I2C_INTR_RDREQ) + | (1 << S6_I2C_INTR_TXABRT) + | (1 << S6_I2C_INTR_RXDONE) + | (1 << S6_I2C_INTR_ACTIVITY) + | (1 << S6_I2C_INTR_STOPDET) + | (1 << S6_I2C_INTR_STARTDET) + | (1 << S6_I2C_INTR_GENCALL)))) + return IRQ_NONE; + + spin_lock(&iface->lock); + del_timer(&iface->timeout_timer); + s6i2c_handle_interrupt(iface); + spin_unlock(&iface->lock); + return IRQ_HANDLED; +} + +static void s6i2c_timeout(unsigned long data) +{ + struct s6i2c_if *iface = (struct s6i2c_if *)data; + unsigned long flags; + + spin_lock_irqsave(&iface->lock, flags); + s6i2c_handle_interrupt(iface); + if (--iface->timeout_count > 0) { + iface->timeout_timer.expires = jiffies + POLL_TIMEOUT; + add_timer(&iface->timeout_timer); + } else { + complete(&iface->complete); + i2c_wr16(iface, S6_I2C_INTRMASK, 0); + } + spin_unlock_irqrestore(&iface->lock, flags); +} + +static int s6i2c_master_xfer(struct i2c_adapter *adap, + struct i2c_msg *msgs, int num) +{ + struct s6i2c_if *iface = adap->algo_data; + int i; + if (num == 0) + return 0; + if (i2c_rd16(iface, S6_I2C_STATUS) & (1 << S6_I2C_STATUS_ACTIVITY)) + yield(); + i2c_wr16(iface, S6_I2C_INTRMASK, 0); + i2c_rd16(iface, S6_I2C_CLRINTR); + for (i = 0; i < num; i++) { + if (msgs[i].flags & I2C_M_TEN) { + dev_err(&adap->dev, + "s6i2c: 10 bits addr not supported\n"); + return -EINVAL; + } + if (msgs[i].len == 0) { + dev_err(&adap->dev, + "s6i2c: zero length message not supported\n"); + return -EINVAL; + } + if (msgs[i].addr != msgs[0].addr) { + dev_err(&adap->dev, + "s6i2c: multiple xfer cannot change target\n"); + return -EINVAL; + } + } + + iface->msgs = msgs; + iface->msgs_num = num; + iface->msgs_push = 0; + iface->msgs_done = 0; + iface->push = 0; + iface->done = 0; + iface->timeout_count = 10; + i2c_wr16(iface, S6_I2C_TAR, msgs[0].addr); + i2c_wr16(iface, S6_I2C_ENABLE, 1); + i2c_wr16(iface, S6_I2C_INTRMASK, (1 << S6_I2C_INTR_TXEMPTY) | + (1 << S6_I2C_INTR_TXABRT)); + + iface->timeout_timer.expires = 
jiffies + POLL_TIMEOUT; + add_timer(&iface->timeout_timer); + wait_for_completion(&iface->complete); + del_timer_sync(&iface->timeout_timer); + while (i2c_rd32(iface, S6_I2C_TXFLR) > 0) + schedule(); + while (i2c_rd16(iface, S6_I2C_STATUS) & (1 << S6_I2C_STATUS_ACTIVITY)) + schedule(); + + i2c_wr16(iface, S6_I2C_INTRMASK, 0); + i2c_wr16(iface, S6_I2C_ENABLE, 0); + return iface->msgs_done; +} + +static u32 s6i2c_functionality(struct i2c_adapter *adap) +{ + return I2C_FUNC_I2C | I2C_FUNC_SMBUS_EMUL; +} + +static struct i2c_algorithm s6i2c_algorithm = { + .master_xfer = s6i2c_master_xfer, + .functionality = s6i2c_functionality, +}; + +static u16 __devinit nanoseconds_on_clk(struct s6i2c_if *iface, u32 ns) +{ + u32 dividend = ((clk_get_rate(iface->clk) / 1000) * ns) / 1000000; + if (dividend > 0xffff) + return 0xffff; + return dividend; +} + +static int __devinit s6i2c_probe(struct platform_device *dev) +{ + struct s6i2c_if *iface = &s6i2c_if; + struct i2c_adapter *p_adap; + const char *clock; + int bus_num, rc; + spin_lock_init(&iface->lock); + init_completion(&iface->complete); + iface->irq = platform_get_irq(dev, 0); + if (iface->irq < 0) { + rc = iface->irq; + goto err_out; + } + iface->res = platform_get_resource(dev, IORESOURCE_MEM, 0); + if (!iface->res) { + rc = -ENXIO; + goto err_out; + } + iface->res = request_mem_region(iface->res->start, + resource_size(iface->res), + dev->dev.bus_id); + if (!iface->res) { + rc = -EBUSY; + goto err_out; + } + iface->reg = ioremap_nocache(iface->res->start, + resource_size(iface->res)); + if (!iface->reg) { + rc = -ENOMEM; + goto err_reg; + } + + clock = 0; + bus_num = -1; + if (dev->dev.platform_data) { + struct s6_i2c_platform_data *pdata = dev->dev.platform_data; + bus_num = pdata->bus_num; + clock = pdata->clock; + } + iface->clk = clk_get(&dev->dev, clock); + if (IS_ERR(iface->clk)) { + rc = PTR_ERR(iface->clk); + goto err_map; + } + rc = clk_enable(iface->clk); + if (rc < 0) + goto err_clk_put; + init_timer(&iface->timeout_timer); + iface->timeout_timer.function = s6i2c_timeout; + iface->timeout_timer.data = (unsigned long)iface; + + p_adap = &iface->adap; + strlcpy(p_adap->name, dev->name, sizeof(p_adap->name)); + p_adap->algo = &s6i2c_algorithm; + p_adap->algo_data = iface; + p_adap->nr = bus_num; + p_adap->class = 0; + p_adap->dev.parent = &dev->dev; + i2c_wr16(iface, S6_I2C_INTRMASK, 0); + rc = request_irq(iface->irq, s6i2c_interrupt_entry, + IRQF_SHARED, dev->name, iface); + if (rc) { + dev_err(&p_adap->dev, "s6i2c: cant get IRQ %d\n", iface->irq); + goto err_clk_dis; + } + + i2c_wr16(iface, S6_I2C_ENABLE, 0); + udelay(1); + i2c_wr32(iface, S6_I2C_SRESET, 1 << S6_I2C_SRESET_IC_SRST); + i2c_wr16(iface, S6_I2C_CLRTXABRT, 1); + i2c_wr16(iface, S6_I2C_CON, + (1 << S6_I2C_CON_MASTER) | + (S6_I2C_CON_SPEED_NORMAL << S6_I2C_CON_SPEED) | + (0 << S6_I2C_CON_10BITSLAVE) | + (0 << S6_I2C_CON_10BITMASTER) | + (1 << S6_I2C_CON_RESTARTENA) | + (1 << S6_I2C_CON_SLAVEDISABLE)); + i2c_wr16(iface, S6_I2C_SSHCNT, nanoseconds_on_clk(iface, 4000)); + i2c_wr16(iface, S6_I2C_SSLCNT, nanoseconds_on_clk(iface, 4700)); + i2c_wr16(iface, S6_I2C_FSHCNT, nanoseconds_on_clk(iface, 600)); + i2c_wr16(iface, S6_I2C_FSLCNT, nanoseconds_on_clk(iface, 1300)); + i2c_wr16(iface, S6_I2C_RXTL, 0); + i2c_wr16(iface, S6_I2C_TXTL, 0); + + platform_set_drvdata(dev, iface); + if (bus_num < 0) + rc = i2c_add_adapter(p_adap); + else + rc = i2c_add_numbered_adapter(p_adap); + if (rc) + goto err_irq_free; + return 0; + +err_irq_free: + free_irq(iface->irq, iface); +err_clk_dis: + 
clk_disable(iface->clk); +err_clk_put: + clk_put(iface->clk); +err_map: + iounmap(iface->reg); +err_reg: + release_mem_region(iface->res->start, + resource_size(iface->res)); +err_out: + return rc; +} + +static int __devexit s6i2c_remove(struct platform_device *pdev) +{ + struct s6i2c_if *iface = platform_get_drvdata(pdev); + i2c_wr16(iface, S6_I2C_ENABLE, 0); + platform_set_drvdata(pdev, NULL); + i2c_del_adapter(&iface->adap); + free_irq(iface->irq, iface); + clk_disable(iface->clk); + clk_put(iface->clk); + iounmap(iface->reg); + release_mem_region(iface->res->start, + resource_size(iface->res)); + return 0; +} + +static struct platform_driver s6i2c_driver = { + .probe = s6i2c_probe, + .remove = __devexit_p(s6i2c_remove), + .driver = { + .name = DRV_NAME, + .owner = THIS_MODULE, + }, +}; + +static int __init s6i2c_init(void) +{ + pr_info("I2C: S6000 I2C driver\n"); + return platform_driver_register(&s6i2c_driver); +} + +static void __exit s6i2c_exit(void) +{ + platform_driver_unregister(&s6i2c_driver); +} + +MODULE_DESCRIPTION("I2C-Bus adapter routines for S6000 I2C"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:" DRV_NAME); + +subsys_initcall(s6i2c_init); +module_exit(s6i2c_exit); diff --git a/drivers/i2c/busses/i2c-s6000.h b/drivers/i2c/busses/i2c-s6000.h new file mode 100644 index 000000000000..ff23b81ded44 --- /dev/null +++ b/drivers/i2c/busses/i2c-s6000.h @@ -0,0 +1,79 @@ +/* + * drivers/i2c/busses/i2c-s6000.h + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (C) 2008 Emlix GmbH + * Author: Oskar Schirmer + */ + +#ifndef __DRIVERS_I2C_BUSSES_I2C_S6000_H +#define __DRIVERS_I2C_BUSSES_I2C_S6000_H + +#define S6_I2C_CON 0x000 +#define S6_I2C_CON_MASTER 0 +#define S6_I2C_CON_SPEED 1 +#define S6_I2C_CON_SPEED_NORMAL 1 +#define S6_I2C_CON_SPEED_FAST 2 +#define S6_I2C_CON_SPEED_MASK 3 +#define S6_I2C_CON_10BITSLAVE 3 +#define S6_I2C_CON_10BITMASTER 4 +#define S6_I2C_CON_RESTARTENA 5 +#define S6_I2C_CON_SLAVEDISABLE 6 +#define S6_I2C_TAR 0x004 +#define S6_I2C_TAR_GCORSTART 10 +#define S6_I2C_TAR_SPECIAL 11 +#define S6_I2C_SAR 0x008 +#define S6_I2C_HSMADDR 0x00C +#define S6_I2C_DATACMD 0x010 +#define S6_I2C_DATACMD_READ 8 +#define S6_I2C_SSHCNT 0x014 +#define S6_I2C_SSLCNT 0x018 +#define S6_I2C_FSHCNT 0x01C +#define S6_I2C_FSLCNT 0x020 +#define S6_I2C_INTRSTAT 0x02C +#define S6_I2C_INTRMASK 0x030 +#define S6_I2C_RAWINTR 0x034 +#define S6_I2C_INTR_RXUNDER 0 +#define S6_I2C_INTR_RXOVER 1 +#define S6_I2C_INTR_RXFULL 2 +#define S6_I2C_INTR_TXOVER 3 +#define S6_I2C_INTR_TXEMPTY 4 +#define S6_I2C_INTR_RDREQ 5 +#define S6_I2C_INTR_TXABRT 6 +#define S6_I2C_INTR_RXDONE 7 +#define S6_I2C_INTR_ACTIVITY 8 +#define S6_I2C_INTR_STOPDET 9 +#define S6_I2C_INTR_STARTDET 10 +#define S6_I2C_INTR_GENCALL 11 +#define S6_I2C_RXTL 0x038 +#define S6_I2C_TXTL 0x03C +#define S6_I2C_CLRINTR 0x040 +#define S6_I2C_CLRRXUNDER 0x044 +#define S6_I2C_CLRRXOVER 0x048 +#define S6_I2C_CLRTXOVER 0x04C +#define S6_I2C_CLRRDREQ 0x050 +#define S6_I2C_CLRTXABRT 0x054 +#define S6_I2C_CLRRXDONE 0x058 +#define S6_I2C_CLRACTIVITY 0x05C +#define S6_I2C_CLRSTOPDET 0x060 +#define S6_I2C_CLRSTARTDET 0x064 +#define S6_I2C_CLRGENCALL 0x068 +#define S6_I2C_ENABLE 0x06C +#define S6_I2C_STATUS 0x070 +#define S6_I2C_STATUS_ACTIVITY 0 +#define S6_I2C_STATUS_TFNF 1 +#define S6_I2C_STATUS_TFE 2 +#define S6_I2C_STATUS_RFNE 3 +#define S6_I2C_STATUS_RFF 4 +#define S6_I2C_TXFLR 0x074 +#define S6_I2C_RXFLR 
0x078 +#define S6_I2C_SRESET 0x07C +#define S6_I2C_SRESET_IC_SRST 0 +#define S6_I2C_SRESET_IC_MASTER_SRST 1 +#define S6_I2C_SRESET_IC_SLAVE_SRST 2 +#define S6_I2C_TXABRTSOURCE 0x080 + +#endif diff --git a/include/linux/i2c/s6000.h b/include/linux/i2c/s6000.h new file mode 100644 index 000000000000..d9b34bfdae76 --- /dev/null +++ b/include/linux/i2c/s6000.h @@ -0,0 +1,10 @@ +#ifndef __LINUX_I2C_S6000_H +#define __LINUX_I2C_S6000_H + +struct s6_i2c_platform_data { + const char *clock; /* the clock to use */ + int bus_num; /* the bus number to register */ +}; + +#endif + -- cgit v1.2.3 From 5ac9f62267dc92c7735c642a5942d9e6c1190308 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 25 Mar 2009 20:55:00 -0400 Subject: function-graph: add proper initialization for init task Impact: fix to crash going to kexec The init task did not properly initialize the function graph pointers. Altough these pointers are NULL, they can not be assumed to be NULL for the init task, and must still be properly initialize. This usually is not an issue since a problem only arises when a task exits, and the init tasks do not usually exit. But when doing tests with kexec, the init tasks do exit, and the bug appears. This patch properly initializes the init tasks function graph data structures. Reported-and-Tested-by: Yinghai Lu LKML-Reference: Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 8 ++++++-- include/linux/init_task.h | 2 ++ 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 015a3d22cf74..da5405dce347 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -356,6 +356,9 @@ struct ftrace_graph_ret { #ifdef CONFIG_FUNCTION_GRAPH_TRACER +/* for init task */ +#define INIT_FTRACE_GRAPH .ret_stack = NULL + /* * Stack of return addresses for functions * of a thread. @@ -430,10 +433,11 @@ static inline void unpause_graph_tracing(void) { atomic_dec(¤t->tracing_graph_pause); } -#else +#else /* !CONFIG_FUNCTION_GRAPH_TRACER */ #define __notrace_funcgraph #define __irq_entry +#define INIT_FTRACE_GRAPH static inline void ftrace_graph_init_task(struct task_struct *t) { } static inline void ftrace_graph_exit_task(struct task_struct *t) { } @@ -445,7 +449,7 @@ static inline int task_curr_ret_stack(struct task_struct *tsk) static inline void pause_graph_tracing(void) { } static inline void unpause_graph_tracing(void) { } -#endif +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ #ifdef CONFIG_TRACING #include diff --git a/include/linux/init_task.h b/include/linux/init_task.h index e752d973fa21..cada05447bc8 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -184,6 +185,7 @@ extern struct cred init_cred; INIT_IDS \ INIT_TRACE_IRQFLAGS \ INIT_LOCKDEP \ + INIT_FTRACE_GRAPH \ } -- cgit v1.2.3 From d9ad8bc0ca823705413f75b50c442a88cc518b35 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Sun, 5 Apr 2009 16:20:02 +0200 Subject: branch tracer: Fix for enabling branch profiling makes sparse unusable One of the changes between kernels 2.6.28 and 2.6.29 is that a branch profiler has been added for if() statements. Unfortunately this patch makes the sparse output unusable with CONFIG_TRACE_BRANCH_PROFILING=y: when branch profiling is enabled, sparse prints so much false positives that the real issues are no longer visible. 
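The flood comes from the branch-profiling wrappers themselves: with profiling enabled, likely()/unlikely() and the profiled if() each expand to a statement expression that declares its own ______r temporary plus a static ftrace_branch_data record, so nested annotated conditionals shadow one another and sparse reports every instance. A simplified sketch of the wrapper shape (not the verbatim macros from include/linux/compiler.h):

    /* Simplified sketch; the real macros also tag ______f with
     * func/file/line and place it in a dedicated section. */
    #define __branch_check__(x, expect) ({                                  \
                    int ______r;                                            \
                    static struct ftrace_branch_data ______f;               \
                    ______r = __builtin_expect(!!(x), expect);              \
                    ftrace_likely_update(&______f, ______r, expect);        \
                    ______r;                                                \
            })
    #define likely(x)   (__builtin_constant_p(x) ? !!(x) : __branch_check__(x, 1))
    #define unlikely(x) (__builtin_constant_p(x) ? !!(x) : __branch_check__(x, 0))

Excluding these definitions when __CHECKER__ is set (sparse defines it) lets sparse see the plain __builtin_expect() forms again, which is what the fix below does.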
This behavior can be reproduced as follows: * enable CONFIG_TRACE_BRANCH_PROFILING, e.g. by running make allyesconfig or make allmodconfig. * run make C=2 Result: a huge number of the following sparse warnings. ... include/linux/cpumask.h:547:2: warning: symbol '______r' shadows an earlier one include/linux/cpumask.h:547:2: originally declared here ... The patch below fixes this by disabling branch profiling while analyzing the kernel code with sparse. See also: * http://lkml.org/lkml/2008/11/21/18 * http://bugzilla.kernel.org/show_bug.cgi?id=12925 Signed-off-by: Bart Van Assche Cc: Andrew Morton Cc: Steven Rostedt LKML-Reference: <200904051620.02311.bart.vanassche@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/compiler.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 6faa7e549de4..8872ad6dd89b 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -76,7 +76,8 @@ struct ftrace_branch_data { * Note: DISABLE_BRANCH_PROFILING can be used by special lowlevel code * to disable branch tracing on a per file basis. */ -#if defined(CONFIG_TRACE_BRANCH_PROFILING) && !defined(DISABLE_BRANCH_PROFILING) +#if defined(CONFIG_TRACE_BRANCH_PROFILING) \ + && !defined(DISABLE_BRANCH_PROFILING) && !defined(__CHECKER__) void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect); #define likely_notrace(x) __builtin_expect(!!(x), 1) -- cgit v1.2.3 From ab3c9c686e22ab264269337ce7b75d9760211198 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 7 Apr 2009 07:59:41 -0700 Subject: branch tracer, intel-iommu: fix build with CONFIG_BRANCH_TRACER=y Fix the branch tracer barfing on comma statements within if () statements. Signed-off-by: Ingo Molnar --- include/linux/compiler.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 8872ad6dd89b..37bcb50a4d7c 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -115,7 +115,9 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect); * "Define 'is'", Bill Clinton * "Define 'if'", Steven Rostedt */ -#define if(cond) if (__builtin_constant_p((cond)) ? !!(cond) : \ +#define if(cond, ...) __trace_if( (cond , ## __VA_ARGS__) ) +#define __trace_if(cond) \ + if (__builtin_constant_p((cond)) ? !!(cond) : \ ({ \ int ______r; \ static struct ftrace_branch_data \ -- cgit v1.2.3 From fafd688e4c0c34da0f3de909881117d374e4c7af Mon Sep 17 00:00:00 2001 From: Peter W Morreale Date: Mon, 6 Apr 2009 19:00:29 -0700 Subject: mm: add /proc controls for pdflush threads Add /proc entries to give the admin the ability to control the minimum and maximum number of pdflush threads. This allows finer control of pdflush on both large and small machines. The rationale is simply one size does not fit all. Admins on large and/or small systems may want to tune the min/max pdflush thread count to best suit their needs. Right now the min/max is hardcoded to 2/8. While probably a fair estimate for smaller machines, large machines with large numbers of CPUs and large numbers of filesystems/block devices may benefit from larger numbers of threads working on different block devices. Even if the background flushing algorithm is radically changed, it is still likely that multiple threads will be involved and admins would still desire finer control on the min/max other than to have to recompile the kernel. 
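As a concrete illustration of the tuning this enables, a minimal userspace sketch follows (the two proc paths are the ones this patch adds, described below; the values 4 and 32 are arbitrary examples, not recommendations):

#include <stdio.h>

static int write_sysctl(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fprintf(f, "%s\n", val);
	return fclose(f);
}

int main(void)
{
	/* keep at least 4 pdflush threads around, allow up to 32 */
	write_sysctl("/proc/sys/vm/nr_pdflush_threads_min", "4");
	write_sysctl("/proc/sys/vm/nr_pdflush_threads_max", "32");
	return 0;
}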
The patch adds '/proc/sys/vm/nr_pdflush_threads_min' and '/proc/sys/vm/nr_pdflush_threads_max' with r/w permissions. The minimum value for nr_pdflush_threads_min is 1 and the maximum value is the current value of nr_pdflush_threads_max. This minimum is required since additional thread creation is performed in a pdflush thread itself. The minimum value for nr_pdflush_threads_max is the current value of nr_pdflush_threads_min and the maximum value can be 1000. Documentation/sysctl/vm.txt is also updated. [akpm@linux-foundation.org: fix comment, fix whitespace, use __read_mostly] Signed-off-by: Peter W Morreale Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 28 ++++++++++++++++++++++++++++ include/linux/writeback.h | 2 ++ kernel/sysctl.c | 23 +++++++++++++++++++++++ mm/pdflush.c | 31 +++++++++++++++++++------------ 4 files changed, 72 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 3197fc83bc51..97c4b3284329 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -39,6 +39,8 @@ Currently, these files are in /proc/sys/vm: - nr_hugepages - nr_overcommit_hugepages - nr_pdflush_threads +- nr_pdflush_threads_min +- nr_pdflush_threads_max - nr_trim_pages (only if CONFIG_MMU=n) - numa_zonelist_order - oom_dump_tasks @@ -463,6 +465,32 @@ The default value is 0. ============================================================== +nr_pdflush_threads_min + +This value controls the minimum number of pdflush threads. + +At boot time, the kernel will create and maintain 'nr_pdflush_threads_min' +threads for the kernel's lifetime. + +The default value is 2. The minimum value you can specify is 1, and +the maximum value is the current setting of 'nr_pdflush_threads_max'. + +See 'nr_pdflush_threads_max' below for more information. + +============================================================== + +nr_pdflush_threads_max + +This value controls the maximum number of pdflush threads that can be +created. The pdflush algorithm will create a new pdflush thread (up to +this maximum) if no pdflush threads have been available for >= 1 second. + +The default value is 8. The minimum value you can specify is the +current value of 'nr_pdflush_threads_min' and the +maximum is 1000. + +============================================================== + overcommit_memory: This value contains a flag that enables memory overcommitment. diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 93445477f86a..9c1ed1fb6ddb 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -168,6 +168,8 @@ void writeback_set_ratelimit(void); /* pdflush.c */ extern int nr_pdflush_threads; /* Global so it can be exported to sysctl read-only. 
*/ +extern int nr_pdflush_threads_max; /* Global so it can be exported to sysctl */ +extern int nr_pdflush_threads_min; /* Global so it can be exported to sysctl */ #endif /* WRITEBACK_H */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b125e3387568..72eb1a41dcab 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -101,6 +101,7 @@ static int __maybe_unused one = 1; static int __maybe_unused two = 2; static unsigned long one_ul = 1; static int one_hundred = 100; +static int one_thousand = 1000; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; @@ -1026,6 +1027,28 @@ static struct ctl_table vm_table[] = { .mode = 0444 /* read-only*/, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "nr_pdflush_threads_min", + .data = &nr_pdflush_threads_min, + .maxlen = sizeof nr_pdflush_threads_min, + .mode = 0644 /* read-write */, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &one, + .extra2 = &nr_pdflush_threads_max, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "nr_pdflush_threads_max", + .data = &nr_pdflush_threads_max, + .maxlen = sizeof nr_pdflush_threads_max, + .mode = 0644 /* read-write */, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &nr_pdflush_threads_min, + .extra2 = &one_thousand, + }, { .ctl_name = VM_SWAPPINESS, .procname = "swappiness", diff --git a/mm/pdflush.c b/mm/pdflush.c index 235ac440c44e..f2caf96993f8 100644 --- a/mm/pdflush.c +++ b/mm/pdflush.c @@ -57,6 +57,14 @@ static DEFINE_SPINLOCK(pdflush_lock); */ int nr_pdflush_threads = 0; +/* + * The max/min number of pdflush threads. R/W by sysctl at + * /proc/sys/vm/nr_pdflush_threads_max/min + */ +int nr_pdflush_threads_max __read_mostly = MAX_PDFLUSH_THREADS; +int nr_pdflush_threads_min __read_mostly = MIN_PDFLUSH_THREADS; + + /* * The time at which the pdflush thread pool last went empty */ @@ -68,7 +76,7 @@ static unsigned long last_empty_jifs; * Thread pool management algorithm: * * - The minimum and maximum number of pdflush instances are bound - * by MIN_PDFLUSH_THREADS and MAX_PDFLUSH_THREADS. + * by nr_pdflush_threads_min and nr_pdflush_threads_max. * * - If there have been no idle pdflush instances for 1 second, create * a new one. @@ -134,14 +142,13 @@ static int __pdflush(struct pdflush_work *my_work) * To throttle creation, we reset last_empty_jifs. */ if (time_after(jiffies, last_empty_jifs + 1 * HZ)) { - if (list_empty(&pdflush_list)) { - if (nr_pdflush_threads < MAX_PDFLUSH_THREADS) { - last_empty_jifs = jiffies; - nr_pdflush_threads++; - spin_unlock_irq(&pdflush_lock); - start_one_pdflush_thread(); - spin_lock_irq(&pdflush_lock); - } + if (list_empty(&pdflush_list) && + nr_pdflush_threads < nr_pdflush_threads_max) { + last_empty_jifs = jiffies; + nr_pdflush_threads++; + spin_unlock_irq(&pdflush_lock); + start_one_pdflush_thread(); + spin_lock_irq(&pdflush_lock); } } @@ -153,7 +160,7 @@ static int __pdflush(struct pdflush_work *my_work) */ if (list_empty(&pdflush_list)) continue; - if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS) + if (nr_pdflush_threads <= nr_pdflush_threads_min) continue; pdf = list_entry(pdflush_list.prev, struct pdflush_work, list); if (time_after(jiffies, pdf->when_i_went_to_sleep + 1 * HZ)) { @@ -259,9 +266,9 @@ static int __init pdflush_init(void) * Pre-set nr_pdflush_threads... If we fail to create, * the count will be decremented. 
*/ - nr_pdflush_threads = MIN_PDFLUSH_THREADS; + nr_pdflush_threads = nr_pdflush_threads_min; - for (i = 0; i < MIN_PDFLUSH_THREADS; i++) + for (i = 0; i < nr_pdflush_threads_min; i++) start_one_pdflush_thread(); return 0; } -- cgit v1.2.3 From fd5e191e7610eb7ecb5e35b2045ceb6554bea15a Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 6 Apr 2009 19:00:56 -0700 Subject: SPI: add dma_alignment field to spi_master Some SPI controllers have restrictions on DMAable buffers alignemt. Currently if the buffer supplied by protocol driver is not properly aligned, the controller silently performs transfer in PIO mode. Addition of dma_alignment field to spi_master allows protocol drivers to perform proper alignment. Signed-off-by: Mike Rapoport Cc: Bryan Wu Cc: David Brownell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/spi/spi.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 68bb1c501d0d..2cc43fa380cb 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -204,6 +204,7 @@ static inline void spi_unregister_driver(struct spi_driver *sdrv) * SPI slaves, and are numbered from zero to num_chipselects. * each slave has a chipselect signal, but it's common that not * every chipselect is connected to a slave. + * @dma_alignment: SPI controller constraint on DMA buffers alignment. * @setup: updates the device mode and clocking records used by a * device's SPI controller; protocol code may call this. This * must fail if an unrecognized or unsupported mode is requested. @@ -239,6 +240,11 @@ struct spi_master { */ u16 num_chipselect; + /* some SPI controllers pose alignment requirements on DMAable + * buffers; let protocol drivers know about these requirements. + */ + u16 dma_alignment; + /* setup mode and clock, etc (spi driver may call many times) */ int (*setup)(struct spi_device *spi); -- cgit v1.2.3 From cc00e9cfe0e5c4c31057c722e49fdf2c76dd5953 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 6 Apr 2009 19:01:00 -0700 Subject: kprobes: cleanup comment style in kprobes.h Fix comment style in kprobes.h. Signed-off-by: Masami Hiramatsu Acked-by: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: David S. Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kprobes.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 2ec6cc14a114..39826a678364 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -94,12 +94,16 @@ struct kprobe { /* Called after addr is executed, unless... */ kprobe_post_handler_t post_handler; - /* ... called if executing addr causes a fault (eg. page fault). - * Return 1 if it handled fault, otherwise kernel will see it. */ + /* + * ... called if executing addr causes a fault (eg. page fault). + * Return 1 if it handled fault, otherwise kernel will see it. + */ kprobe_fault_handler_t fault_handler; - /* ... called if breakpoint trap occurs in probe handler. - * Return 1 if it handled break, otherwise kernel will see it. */ + /* + * ... called if breakpoint trap occurs in probe handler. + * Return 1 if it handled break, otherwise kernel will see it. 
+ */ kprobe_break_handler_t break_handler; /* Saved opcode (which has been replaced with breakpoint) */ -- cgit v1.2.3 From de5bd88d5a5cce3cacea904d3503e5ebdb3852a2 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 6 Apr 2009 19:01:02 -0700 Subject: kprobes: support per-kprobe disabling Add disable_kprobe() and enable_kprobe() to disable/enable kprobes temporarily. disable_kprobe() asynchronously disables probe handlers of specified kprobe. So, after calling it, some handlers can be called at a while. enable_kprobe() enables specified kprobe. aggr_pre_handler and aggr_post_handler check disabled probes. On the other hand aggr_break_handler and aggr_fault_handler don't check it because these handlers will be called while executing pre or post handlers and usually those help error handling. Signed-off-by: Masami Hiramatsu Acked-by: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: David S. Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kprobes.txt | 34 ++++++++-- include/linux/kprobes.h | 23 ++++++- kernel/kprobes.c | 167 ++++++++++++++++++++++++++++++++++++++-------- 3 files changed, 191 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/Documentation/kprobes.txt b/Documentation/kprobes.txt index 48b3de90eb1e..f609af242d6c 100644 --- a/Documentation/kprobes.txt +++ b/Documentation/kprobes.txt @@ -212,7 +212,9 @@ hit, Kprobes calls kp->pre_handler. After the probed instruction is single-stepped, Kprobe calls kp->post_handler. If a fault occurs during execution of kp->pre_handler or kp->post_handler, or during single-stepping of the probed instruction, Kprobes calls -kp->fault_handler. Any or all handlers can be NULL. +kp->fault_handler. Any or all handlers can be NULL. If kp->flags +is set KPROBE_FLAG_DISABLED, that kp will be registered but disabled, +so, it's handlers aren't hit until calling enable_kprobe(kp). NOTE: 1. With the introduction of the "symbol_name" field to struct kprobe, @@ -363,6 +365,22 @@ probes) in the specified array, they clear the addr field of those incorrect probes. However, other probes in the array are unregistered correctly. +4.7 disable_kprobe + +#include +int disable_kprobe(struct kprobe *kp); + +Temporarily disables the specified kprobe. You can enable it again by using +enable_kprobe(). You must specify the kprobe which has been registered. + +4.8 enable_kprobe + +#include +int enable_kprobe(struct kprobe *kp); + +Enables kprobe which has been disabled by disable_kprobe(). You must specify +the kprobe which has been registered. + 5. Kprobes Features and Limitations Kprobes allows multiple probes at the same address. Currently, @@ -500,10 +518,14 @@ the probe. If the probed function belongs to a module, the module name is also specified. Following columns show probe status. If the probe is on a virtual address that is no longer valid (module init sections, module virtual addresses that correspond to modules that've been unloaded), -such probes are marked with [GONE]. +such probes are marked with [GONE]. If the probe is temporarily disabled, +such probes are marked with [DISABLED]. -/debug/kprobes/enabled: Turn kprobes ON/OFF +/debug/kprobes/enabled: Turn kprobes ON/OFF forcibly. -Provides a knob to globally turn registered kprobes ON or OFF. By default, -all kprobes are enabled. By echoing "0" to this file, all registered probes -will be disarmed, till such time a "1" is echoed to this file. +Provides a knob to globally and forcibly turn registered kprobes ON or OFF. 
+By default, all kprobes are enabled. By echoing "0" to this file, all +registered probes will be disarmed, till such time a "1" is echoed to this +file. Note that this knob just disarms and arms all kprobes and doesn't +change each probe's disabling state. This means that disabled kprobes (marked +[DISABLED]) will be not enabled if you turn ON all kprobes by this knob. diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 39826a678364..1071cfddddc9 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -112,18 +112,28 @@ struct kprobe { /* copy of the original instruction */ struct arch_specific_insn ainsn; - /* Indicates various status flags. Protected by kprobe_mutex. */ + /* + * Indicates various status flags. + * Protected by kprobe_mutex after this kprobe is registered. + */ u32 flags; }; /* Kprobe status flags */ #define KPROBE_FLAG_GONE 1 /* breakpoint has already gone */ +#define KPROBE_FLAG_DISABLED 2 /* probe is temporarily disabled */ +/* Has this kprobe gone ? */ static inline int kprobe_gone(struct kprobe *p) { return p->flags & KPROBE_FLAG_GONE; } +/* Is this kprobe disabled ? */ +static inline int kprobe_disabled(struct kprobe *p) +{ + return p->flags & (KPROBE_FLAG_DISABLED | KPROBE_FLAG_GONE); +} /* * Special probe type that uses setjmp-longjmp type tricks to resume * execution at a specified entry with a matching prototype corresponding @@ -283,6 +293,9 @@ void unregister_kretprobes(struct kretprobe **rps, int num); void kprobe_flush_task(struct task_struct *tk); void recycle_rp_inst(struct kretprobe_instance *ri, struct hlist_head *head); +int disable_kprobe(struct kprobe *kp); +int enable_kprobe(struct kprobe *kp); + #else /* !CONFIG_KPROBES: */ static inline int kprobes_built_in(void) @@ -349,5 +362,13 @@ static inline void unregister_kretprobes(struct kretprobe **rps, int num) static inline void kprobe_flush_task(struct task_struct *tk) { } +static inline int disable_kprobe(struct kprobe *kp) +{ + return -ENOSYS; +} +static inline int enable_kprobe(struct kprobe *kp) +{ + return -ENOSYS; +} #endif /* CONFIG_KPROBES */ #endif /* _LINUX_KPROBES_H */ diff --git a/kernel/kprobes.c b/kernel/kprobes.c index dae198b68e97..a5e74ddee0e2 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -328,7 +328,7 @@ static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) struct kprobe *kp; list_for_each_entry_rcu(kp, &p->list, list) { - if (kp->pre_handler && !kprobe_gone(kp)) { + if (kp->pre_handler && likely(!kprobe_disabled(kp))) { set_kprobe_instance(kp); if (kp->pre_handler(kp, regs)) return 1; @@ -344,7 +344,7 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs, struct kprobe *kp; list_for_each_entry_rcu(kp, &p->list, list) { - if (kp->post_handler && !kprobe_gone(kp)) { + if (kp->post_handler && likely(!kprobe_disabled(kp))) { set_kprobe_instance(kp); kp->post_handler(kp, regs, flags); reset_kprobe_instance(); @@ -523,6 +523,7 @@ static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) */ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) { + BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); if (p->break_handler) { if (ap->break_handler) return -EEXIST; @@ -532,6 +533,13 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) list_add_rcu(&p->list, &ap->list); if (p->post_handler && !ap->post_handler) ap->post_handler = aggr_post_handler; + + if (kprobe_disabled(ap) && !kprobe_disabled(p)) { + ap->flags &= ~KPROBE_FLAG_DISABLED; + if 
(!kprobes_all_disarmed) + /* Arm the breakpoint again. */ + arch_arm_kprobe(ap); + } return 0; } @@ -592,20 +600,36 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, * freed by unregister_kprobe. */ return ret; - /* Clear gone flag to prevent allocating new slot again. */ - ap->flags &= ~KPROBE_FLAG_GONE; + /* - * If the old_p has gone, its breakpoint has been disarmed. - * We have to arm it again after preparing real kprobes. + * Clear gone flag to prevent allocating new slot again, and + * set disabled flag because it is not armed yet. */ - if (!kprobes_all_disarmed) - arch_arm_kprobe(ap); + ap->flags = (ap->flags & ~KPROBE_FLAG_GONE) + | KPROBE_FLAG_DISABLED; } copy_kprobe(ap, p); return add_new_kprobe(ap, p); } +/* Try to disable aggr_kprobe, and return 1 if succeeded.*/ +static int __kprobes try_to_disable_aggr_kprobe(struct kprobe *p) +{ + struct kprobe *kp; + + list_for_each_entry_rcu(kp, &p->list, list) { + if (!kprobe_disabled(kp)) + /* + * There is an active probe on the list. + * We can't disable aggr_kprobe. + */ + return 0; + } + p->flags |= KPROBE_FLAG_DISABLED; + return 1; +} + static int __kprobes in_kprobes_functions(unsigned long addr) { struct kprobe_blackpoint *kb; @@ -664,7 +688,9 @@ int __kprobes register_kprobe(struct kprobe *p) return -EINVAL; } - p->flags = 0; + /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */ + p->flags &= KPROBE_FLAG_DISABLED; + /* * Check if are we probing a module. */ @@ -709,7 +735,7 @@ int __kprobes register_kprobe(struct kprobe *p) hlist_add_head_rcu(&p->hlist, &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); - if (!kprobes_all_disarmed) + if (!kprobes_all_disarmed && !kprobe_disabled(p)) arch_arm_kprobe(p); out_unlock_text: @@ -724,25 +750,37 @@ out: } EXPORT_SYMBOL_GPL(register_kprobe); -/* - * Unregister a kprobe without a scheduler synchronization. - */ -static int __kprobes __unregister_kprobe_top(struct kprobe *p) +/* Check passed kprobe is valid and return kprobe in kprobe_table. */ +static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p) { struct kprobe *old_p, *list_p; old_p = get_kprobe(p->addr); if (unlikely(!old_p)) - return -EINVAL; + return NULL; if (p != old_p) { list_for_each_entry_rcu(list_p, &old_p->list, list) if (list_p == p) /* kprobe p is a valid probe */ - goto valid_p; - return -EINVAL; + goto valid; + return NULL; } -valid_p: +valid: + return old_p; +} + +/* + * Unregister a kprobe without a scheduler synchronization. + */ +static int __kprobes __unregister_kprobe_top(struct kprobe *p) +{ + struct kprobe *old_p, *list_p; + + old_p = __get_valid_kprobe(p); + if (old_p == NULL) + return -EINVAL; + if (old_p == p || (old_p->pre_handler == aggr_pre_handler && list_is_singular(&old_p->list))) { @@ -751,7 +789,7 @@ valid_p: * enabled and not gone - otherwise, the breakpoint would * already have been removed. We save on flushing icache. 
*/ - if (!kprobes_all_disarmed && !kprobe_gone(old_p)) { + if (!kprobes_all_disarmed && !kprobe_disabled(old_p)) { mutex_lock(&text_mutex); arch_disarm_kprobe(p); mutex_unlock(&text_mutex); @@ -769,6 +807,11 @@ valid_p: } noclean: list_del_rcu(&p->list); + if (!kprobe_disabled(old_p)) { + try_to_disable_aggr_kprobe(old_p); + if (!kprobes_all_disarmed && kprobe_disabled(old_p)) + arch_disarm_kprobe(old_p); + } } return 0; } @@ -1078,6 +1121,7 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, static void __kprobes kill_kprobe(struct kprobe *p) { struct kprobe *kp; + p->flags |= KPROBE_FLAG_GONE; if (p->pre_handler == aggr_pre_handler) { /* @@ -1219,12 +1263,18 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, else kprobe_type = "k"; if (sym) - seq_printf(pi, "%p %s %s+0x%x %s %s\n", p->addr, kprobe_type, - sym, offset, (modname ? modname : " "), - (kprobe_gone(p) ? "[GONE]" : "")); + seq_printf(pi, "%p %s %s+0x%x %s %s%s\n", + p->addr, kprobe_type, sym, offset, + (modname ? modname : " "), + (kprobe_gone(p) ? "[GONE]" : ""), + ((kprobe_disabled(p) && !kprobe_gone(p)) ? + "[DISABLED]" : "")); else - seq_printf(pi, "%p %s %p %s\n", p->addr, kprobe_type, p->addr, - (kprobe_gone(p) ? "[GONE]" : "")); + seq_printf(pi, "%p %s %p %s%s\n", + p->addr, kprobe_type, p->addr, + (kprobe_gone(p) ? "[GONE]" : ""), + ((kprobe_disabled(p) && !kprobe_gone(p)) ? + "[DISABLED]" : "")); } static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) @@ -1289,6 +1339,71 @@ static struct file_operations debugfs_kprobes_operations = { .release = seq_release, }; +/* Disable one kprobe */ +int __kprobes disable_kprobe(struct kprobe *kp) +{ + int ret = 0; + struct kprobe *p; + + mutex_lock(&kprobe_mutex); + + /* Check whether specified probe is valid. */ + p = __get_valid_kprobe(kp); + if (unlikely(p == NULL)) { + ret = -EINVAL; + goto out; + } + + /* If the probe is already disabled (or gone), just return */ + if (kprobe_disabled(kp)) + goto out; + + kp->flags |= KPROBE_FLAG_DISABLED; + if (p != kp) + /* When kp != p, p is always enabled. */ + try_to_disable_aggr_kprobe(p); + + if (!kprobes_all_disarmed && kprobe_disabled(p)) + arch_disarm_kprobe(p); +out: + mutex_unlock(&kprobe_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(disable_kprobe); + +/* Enable one kprobe */ +int __kprobes enable_kprobe(struct kprobe *kp) +{ + int ret = 0; + struct kprobe *p; + + mutex_lock(&kprobe_mutex); + + /* Check whether specified probe is valid. */ + p = __get_valid_kprobe(kp); + if (unlikely(p == NULL)) { + ret = -EINVAL; + goto out; + } + + if (kprobe_gone(kp)) { + /* This kprobe has gone, we couldn't enable it. 
*/ + ret = -EINVAL; + goto out; + } + + if (!kprobes_all_disarmed && kprobe_disabled(p)) + arch_arm_kprobe(p); + + p->flags &= ~KPROBE_FLAG_DISABLED; + if (p != kp) + kp->flags &= ~KPROBE_FLAG_DISABLED; +out: + mutex_unlock(&kprobe_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(enable_kprobe); + static void __kprobes arm_all_kprobes(void) { struct hlist_head *head; @@ -1306,7 +1421,7 @@ static void __kprobes arm_all_kprobes(void) for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; hlist_for_each_entry_rcu(p, node, head, hlist) - if (!kprobe_gone(p)) + if (!kprobe_disabled(p)) arch_arm_kprobe(p); } mutex_unlock(&text_mutex); @@ -1338,7 +1453,7 @@ static void __kprobes disarm_all_kprobes(void) for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; hlist_for_each_entry_rcu(p, node, head, hlist) { - if (!arch_trampoline_kprobe(p) && !kprobe_gone(p)) + if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) arch_disarm_kprobe(p); } } -- cgit v1.2.3 From 8f9b15286a8ea49e997e845d02d357ed33ebd090 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 6 Apr 2009 19:01:02 -0700 Subject: kprobes: support kretprobe and jprobe per-probe disabling Add disable/enable_kretprobe() and disable/enable_jprobe(). Signed-off-by: Masami Hiramatsu Acked-by: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: David S. Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kprobes.txt | 16 ++++++++++------ include/linux/kprobes.h | 17 +++++++++++++++++ 2 files changed, 27 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/Documentation/kprobes.txt b/Documentation/kprobes.txt index f609af242d6c..1e7a769a10f9 100644 --- a/Documentation/kprobes.txt +++ b/Documentation/kprobes.txt @@ -365,21 +365,25 @@ probes) in the specified array, they clear the addr field of those incorrect probes. However, other probes in the array are unregistered correctly. -4.7 disable_kprobe +4.7 disable_*probe #include int disable_kprobe(struct kprobe *kp); +int disable_kretprobe(struct kretprobe *rp); +int disable_jprobe(struct jprobe *jp); -Temporarily disables the specified kprobe. You can enable it again by using -enable_kprobe(). You must specify the kprobe which has been registered. +Temporarily disables the specified *probe. You can enable it again by using +enable_*probe(). You must specify the probe which has been registered. -4.8 enable_kprobe +4.8 enable_*probe #include int enable_kprobe(struct kprobe *kp); +int enable_kretprobe(struct kretprobe *rp); +int enable_jprobe(struct jprobe *jp); -Enables kprobe which has been disabled by disable_kprobe(). You must specify -the kprobe which has been registered. +Enables *probe which has been disabled by disable_*probe(). You must specify +the probe which has been registered. 5. 
Kprobes Features and Limitations diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 1071cfddddc9..bcd9c07848be 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -371,4 +371,21 @@ static inline int enable_kprobe(struct kprobe *kp) return -ENOSYS; } #endif /* CONFIG_KPROBES */ +static inline int disable_kretprobe(struct kretprobe *rp) +{ + return disable_kprobe(&rp->kp); +} +static inline int enable_kretprobe(struct kretprobe *rp) +{ + return enable_kprobe(&rp->kp); +} +static inline int disable_jprobe(struct jprobe *jp) +{ + return disable_kprobe(&jp->kp); +} +static inline int enable_jprobe(struct jprobe *jp) +{ + return enable_kprobe(&jp->kp); +} + #endif /* _LINUX_KPROBES_H */ -- cgit v1.2.3 From 909e6d94795654040ed416ac69858d5d2ce66dd3 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Mon, 6 Apr 2009 19:01:07 -0700 Subject: namespaces: move proc_net_get_sb to a generic fs/super.c helper The mqueuefs filesystem will use this helper as well. Proc's main get_sb could also be made to use it, but that will require a bit more rework. Signed-off-by: Serge E. Hallyn Cc: Cedric Le Goater Cc: Alexey Dobriyan Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/super.c | 40 ++++++++++++++++++++++++++++++++++++++++ include/linux/fs.h | 3 +++ 2 files changed, 43 insertions(+) (limited to 'include/linux') diff --git a/fs/super.c b/fs/super.c index 77cb4ec919b9..786fe7d72790 100644 --- a/fs/super.c +++ b/fs/super.c @@ -771,6 +771,46 @@ void kill_litter_super(struct super_block *sb) EXPORT_SYMBOL(kill_litter_super); +static int ns_test_super(struct super_block *sb, void *data) +{ + return sb->s_fs_info == data; +} + +static int ns_set_super(struct super_block *sb, void *data) +{ + sb->s_fs_info = data; + return set_anon_super(sb, NULL); +} + +int get_sb_ns(struct file_system_type *fs_type, int flags, void *data, + int (*fill_super)(struct super_block *, void *, int), + struct vfsmount *mnt) +{ + struct super_block *sb; + + sb = sget(fs_type, ns_test_super, ns_set_super, data); + if (IS_ERR(sb)) + return PTR_ERR(sb); + + if (!sb->s_root) { + int err; + sb->s_flags = flags; + err = fill_super(sb, data, flags & MS_SILENT ? 1 : 0); + if (err) { + up_write(&sb->s_umount); + deactivate_super(sb); + return err; + } + + sb->s_flags |= MS_ACTIVE; + } + + simple_set_mnt(mnt, sb); + return 0; +} + +EXPORT_SYMBOL(get_sb_ns); + #ifdef CONFIG_BLOCK static int set_bdev_super(struct super_block *s, void *data) { diff --git a/include/linux/fs.h b/include/linux/fs.h index bce40a2207ee..562d2855cf30 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1699,6 +1699,9 @@ struct file_system_type { struct lock_class_key i_alloc_sem_key; }; +extern int get_sb_ns(struct file_system_type *fs_type, int flags, void *data, + int (*fill_super)(struct super_block *, void *, int), + struct vfsmount *mnt); extern int get_sb_bdev(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, int (*fill_super)(struct super_block *, void *, int), -- cgit v1.2.3 From 614b84cf4e4a920d2af32b8f147ea1e3b8c27ea6 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Mon, 6 Apr 2009 19:01:08 -0700 Subject: namespaces: mqueue ns: move mqueue_mnt into struct ipc_namespace Move mqueue vfsmount plus a few tunables into the ipc_namespace struct. The CONFIG_IPC_NS boolean and the ipc_namespace struct will serve both the posix message queue namespaces and the SYSV ipc namespaces. The sysctl code will be fixed separately in patch 3. 
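For reference, the tunables in question are the fs.mqueue sysctls whose table is re-pointed at init_ipc_ns in the ipc/mqueue.c hunk below; a minimal userspace sketch of changing one of them (the value 64 is an arbitrary example):

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/fs/mqueue/msg_max", "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	fprintf(f, "%d\n", 64);		/* example value only */
	return fclose(f) ? 1 : 0;
}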
After just this patch, making a change to posix mqueue tunables always changes the values in the initial ipc namespace. Signed-off-by: Cedric Le Goater Signed-off-by: Serge E. Hallyn Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ipc_namespace.h | 39 +++++++++++-- init/Kconfig | 4 +- ipc/mqueue.c | 124 +++++++++++++++++++++++------------------- ipc/msgutil.c | 22 ++++++++ ipc/namespace.c | 2 + ipc/util.c | 9 --- ipc/util.h | 16 ++++++ 7 files changed, 145 insertions(+), 71 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index ea330f9e7100..3e6fcacebe8a 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -44,24 +44,55 @@ struct ipc_namespace { int shm_tot; struct notifier_block ipcns_nb; + + /* The kern_mount of the mqueuefs sb. We take a ref on it */ + struct vfsmount *mq_mnt; + + /* # queues in this ns, protected by mq_lock */ + unsigned int mq_queues_count; + + /* next fields are set through sysctl */ + unsigned int mq_queues_max; /* initialized to DFLT_QUEUESMAX */ + unsigned int mq_msg_max; /* initialized to DFLT_MSGMAX */ + unsigned int mq_msgsize_max; /* initialized to DFLT_MSGSIZEMAX */ + }; extern struct ipc_namespace init_ipc_ns; extern atomic_t nr_ipc_ns; -#ifdef CONFIG_SYSVIPC +#if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC) #define INIT_IPC_NS(ns) .ns = &init_ipc_ns, +#else +#define INIT_IPC_NS(ns) +#endif +#ifdef CONFIG_SYSVIPC extern int register_ipcns_notifier(struct ipc_namespace *); extern int cond_register_ipcns_notifier(struct ipc_namespace *); extern void unregister_ipcns_notifier(struct ipc_namespace *); extern int ipcns_notify(unsigned long); - #else /* CONFIG_SYSVIPC */ -#define INIT_IPC_NS(ns) +static inline int register_ipcns_notifier(struct ipc_namespace *ns) +{ return 0; } +static inline int cond_register_ipcns_notifier(struct ipc_namespace *ns) +{ return 0; } +static inline void unregister_ipcns_notifier(struct ipc_namespace *ns) { } +static inline int ipcns_notify(unsigned long l) { return 0; } #endif /* CONFIG_SYSVIPC */ -#if defined(CONFIG_SYSVIPC) && defined(CONFIG_IPC_NS) +#ifdef CONFIG_POSIX_MQUEUE +extern void mq_init_ns(struct ipc_namespace *ns); +/* default values */ +#define DFLT_QUEUESMAX 256 /* max number of message queues */ +#define DFLT_MSGMAX 10 /* max number of messages in each queue */ +#define HARD_MSGMAX (131072/sizeof(void *)) +#define DFLT_MSGSIZEMAX 8192 /* max message size */ +#else +#define mq_init_ns(ns) ((void) 0) +#endif + +#if defined(CONFIG_IPC_NS) extern void free_ipc_ns(struct kref *kref); extern struct ipc_namespace *copy_ipcs(unsigned long flags, struct ipc_namespace *ns); diff --git a/init/Kconfig b/init/Kconfig index c52d1d48272a..a0807ba91644 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -670,10 +670,10 @@ config UTS_NS config IPC_NS bool "IPC namespace" - depends on NAMESPACES && SYSVIPC + depends on NAMESPACES && (SYSVIPC || POSIX_MQUEUE) help In this namespace tasks work with IPC ids which correspond to - different IPC objects in different namespaces + different IPC objects in different namespaces. 
config USER_NS bool "User namespace (EXPERIMENTAL)" diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 916785363f0f..a3673a09069a 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include "util.h" @@ -46,12 +47,6 @@ #define STATE_PENDING 1 #define STATE_READY 2 -/* default values */ -#define DFLT_QUEUESMAX 256 /* max number of message queues */ -#define DFLT_MSGMAX 10 /* max number of messages in each queue */ -#define HARD_MSGMAX (131072/sizeof(void*)) -#define DFLT_MSGSIZEMAX 8192 /* max message size */ - /* * Define the ranges various user-specified maximum values can * be set to. @@ -95,12 +90,6 @@ static void remove_notification(struct mqueue_inode_info *info); static spinlock_t mq_lock; static struct kmem_cache *mqueue_inode_cachep; -static struct vfsmount *mqueue_mnt; - -static unsigned int queues_count; -static unsigned int queues_max = DFLT_QUEUESMAX; -static unsigned int msg_max = DFLT_MSGMAX; -static unsigned int msgsize_max = DFLT_MSGSIZEMAX; static struct ctl_table_header * mq_sysctl_table; @@ -109,11 +98,27 @@ static inline struct mqueue_inode_info *MQUEUE_I(struct inode *inode) return container_of(inode, struct mqueue_inode_info, vfs_inode); } +void mq_init_ns(struct ipc_namespace *ns) +{ + ns->mq_queues_count = 0; + ns->mq_queues_max = DFLT_QUEUESMAX; + ns->mq_msg_max = DFLT_MSGMAX; + ns->mq_msgsize_max = DFLT_MSGSIZEMAX; + ns->mq_mnt = mntget(init_ipc_ns.mq_mnt); +} + +void mq_exit_ns(struct ipc_namespace *ns) +{ + /* will need to clear out ns->mq_mnt->mnt_sb->s_fs_info here */ + mntput(ns->mq_mnt); +} + static struct inode *mqueue_get_inode(struct super_block *sb, int mode, struct mq_attr *attr) { struct user_struct *u = current_user(); struct inode *inode; + struct ipc_namespace *ipc_ns = &init_ipc_ns; inode = new_inode(sb); if (inode) { @@ -141,8 +146,8 @@ static struct inode *mqueue_get_inode(struct super_block *sb, int mode, info->qsize = 0; info->user = NULL; /* set when all is ok */ memset(&info->attr, 0, sizeof(info->attr)); - info->attr.mq_maxmsg = msg_max; - info->attr.mq_msgsize = msgsize_max; + info->attr.mq_maxmsg = ipc_ns->mq_msg_max; + info->attr.mq_msgsize = ipc_ns->mq_msgsize_max; if (attr) { info->attr.mq_maxmsg = attr->mq_maxmsg; info->attr.mq_msgsize = attr->mq_msgsize; @@ -242,6 +247,7 @@ static void mqueue_delete_inode(struct inode *inode) struct user_struct *user; unsigned long mq_bytes; int i; + struct ipc_namespace *ipc_ns = &init_ipc_ns; if (S_ISDIR(inode->i_mode)) { clear_inode(inode); @@ -262,7 +268,7 @@ static void mqueue_delete_inode(struct inode *inode) if (user) { spin_lock(&mq_lock); user->mq_bytes -= mq_bytes; - queues_count--; + ipc_ns->mq_queues_count--; spin_unlock(&mq_lock); free_uid(user); } @@ -274,21 +280,23 @@ static int mqueue_create(struct inode *dir, struct dentry *dentry, struct inode *inode; struct mq_attr *attr = dentry->d_fsdata; int error; + struct ipc_namespace *ipc_ns = &init_ipc_ns; spin_lock(&mq_lock); - if (queues_count >= queues_max && !capable(CAP_SYS_RESOURCE)) { + if (ipc_ns->mq_queues_count >= ipc_ns->mq_queues_max && + !capable(CAP_SYS_RESOURCE)) { error = -ENOSPC; - goto out_lock; + goto out_unlock; } - queues_count++; + ipc_ns->mq_queues_count++; spin_unlock(&mq_lock); inode = mqueue_get_inode(dir->i_sb, mode, attr); if (!inode) { error = -ENOMEM; spin_lock(&mq_lock); - queues_count--; - goto out_lock; + ipc_ns->mq_queues_count--; + goto out_unlock; } dir->i_size += DIRENT_SIZE; @@ -297,7 +305,7 @@ static int mqueue_create(struct inode *dir, struct dentry 
*dentry, d_instantiate(dentry, inode); dget(dentry); return 0; -out_lock: +out_unlock: spin_unlock(&mq_lock); return error; } @@ -562,7 +570,7 @@ static void remove_notification(struct mqueue_inode_info *info) info->notify_owner = NULL; } -static int mq_attr_ok(struct mq_attr *attr) +static int mq_attr_ok(struct ipc_namespace *ipc_ns, struct mq_attr *attr) { if (attr->mq_maxmsg <= 0 || attr->mq_msgsize <= 0) return 0; @@ -570,8 +578,8 @@ static int mq_attr_ok(struct mq_attr *attr) if (attr->mq_maxmsg > HARD_MSGMAX) return 0; } else { - if (attr->mq_maxmsg > msg_max || - attr->mq_msgsize > msgsize_max) + if (attr->mq_maxmsg > ipc_ns->mq_msg_max || + attr->mq_msgsize > ipc_ns->mq_msgsize_max) return 0; } /* check for overflow */ @@ -587,8 +595,9 @@ static int mq_attr_ok(struct mq_attr *attr) /* * Invoked when creating a new queue via sys_mq_open */ -static struct file *do_create(struct dentry *dir, struct dentry *dentry, - int oflag, mode_t mode, struct mq_attr *attr) +static struct file *do_create(struct ipc_namespace *ipc_ns, struct dentry *dir, + struct dentry *dentry, int oflag, mode_t mode, + struct mq_attr *attr) { const struct cred *cred = current_cred(); struct file *result; @@ -596,14 +605,14 @@ static struct file *do_create(struct dentry *dir, struct dentry *dentry, if (attr) { ret = -EINVAL; - if (!mq_attr_ok(attr)) + if (!mq_attr_ok(ipc_ns, attr)) goto out; /* store for use during create */ dentry->d_fsdata = attr; } mode &= ~current_umask(); - ret = mnt_want_write(mqueue_mnt); + ret = mnt_want_write(ipc_ns->mq_mnt); if (ret) goto out; ret = vfs_create(dir->d_inode, dentry, mode, NULL); @@ -611,24 +620,25 @@ static struct file *do_create(struct dentry *dir, struct dentry *dentry, if (ret) goto out_drop_write; - result = dentry_open(dentry, mqueue_mnt, oflag, cred); + result = dentry_open(dentry, ipc_ns->mq_mnt, oflag, cred); /* * dentry_open() took a persistent mnt_want_write(), * so we can now drop this one. 
*/ - mnt_drop_write(mqueue_mnt); + mnt_drop_write(ipc_ns->mq_mnt); return result; out_drop_write: - mnt_drop_write(mqueue_mnt); + mnt_drop_write(ipc_ns->mq_mnt); out: dput(dentry); - mntput(mqueue_mnt); + mntput(ipc_ns->mq_mnt); return ERR_PTR(ret); } /* Opens existing queue */ -static struct file *do_open(struct dentry *dentry, int oflag) +static struct file *do_open(struct ipc_namespace *ipc_ns, + struct dentry *dentry, int oflag) { const struct cred *cred = current_cred(); @@ -637,17 +647,17 @@ static struct file *do_open(struct dentry *dentry, int oflag) if ((oflag & O_ACCMODE) == (O_RDWR | O_WRONLY)) { dput(dentry); - mntput(mqueue_mnt); + mntput(ipc_ns->mq_mnt); return ERR_PTR(-EINVAL); } if (inode_permission(dentry->d_inode, oflag2acc[oflag & O_ACCMODE])) { dput(dentry); - mntput(mqueue_mnt); + mntput(ipc_ns->mq_mnt); return ERR_PTR(-EACCES); } - return dentry_open(dentry, mqueue_mnt, oflag, cred); + return dentry_open(dentry, ipc_ns->mq_mnt, oflag, cred); } SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, mode_t, mode, @@ -658,6 +668,7 @@ SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, mode_t, mode, char *name; struct mq_attr attr; int fd, error; + struct ipc_namespace *ipc_ns = &init_ipc_ns; if (u_attr && copy_from_user(&attr, u_attr, sizeof(struct mq_attr))) return -EFAULT; @@ -671,13 +682,13 @@ SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, mode_t, mode, if (fd < 0) goto out_putname; - mutex_lock(&mqueue_mnt->mnt_root->d_inode->i_mutex); - dentry = lookup_one_len(name, mqueue_mnt->mnt_root, strlen(name)); + mutex_lock(&ipc_ns->mq_mnt->mnt_root->d_inode->i_mutex); + dentry = lookup_one_len(name, ipc_ns->mq_mnt->mnt_root, strlen(name)); if (IS_ERR(dentry)) { error = PTR_ERR(dentry); goto out_err; } - mntget(mqueue_mnt); + mntget(ipc_ns->mq_mnt); if (oflag & O_CREAT) { if (dentry->d_inode) { /* entry already exists */ @@ -685,10 +696,10 @@ SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, mode_t, mode, error = -EEXIST; if (oflag & O_EXCL) goto out; - filp = do_open(dentry, oflag); + filp = do_open(ipc_ns, dentry, oflag); } else { - filp = do_create(mqueue_mnt->mnt_root, dentry, - oflag, mode, + filp = do_create(ipc_ns, ipc_ns->mq_mnt->mnt_root, + dentry, oflag, mode, u_attr ? 
&attr : NULL); } } else { @@ -696,7 +707,7 @@ SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, mode_t, mode, if (!dentry->d_inode) goto out; audit_inode(name, dentry); - filp = do_open(dentry, oflag); + filp = do_open(ipc_ns, dentry, oflag); } if (IS_ERR(filp)) { @@ -709,13 +720,13 @@ SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, mode_t, mode, out: dput(dentry); - mntput(mqueue_mnt); + mntput(ipc_ns->mq_mnt); out_putfd: put_unused_fd(fd); out_err: fd = error; out_upsem: - mutex_unlock(&mqueue_mnt->mnt_root->d_inode->i_mutex); + mutex_unlock(&ipc_ns->mq_mnt->mnt_root->d_inode->i_mutex); out_putname: putname(name); return fd; @@ -727,14 +738,15 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name) char *name; struct dentry *dentry; struct inode *inode = NULL; + struct ipc_namespace *ipc_ns = &init_ipc_ns; name = getname(u_name); if (IS_ERR(name)) return PTR_ERR(name); - mutex_lock_nested(&mqueue_mnt->mnt_root->d_inode->i_mutex, + mutex_lock_nested(&ipc_ns->mq_mnt->mnt_root->d_inode->i_mutex, I_MUTEX_PARENT); - dentry = lookup_one_len(name, mqueue_mnt->mnt_root, strlen(name)); + dentry = lookup_one_len(name, ipc_ns->mq_mnt->mnt_root, strlen(name)); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); goto out_unlock; @@ -748,16 +760,16 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name) inode = dentry->d_inode; if (inode) atomic_inc(&inode->i_count); - err = mnt_want_write(mqueue_mnt); + err = mnt_want_write(ipc_ns->mq_mnt); if (err) goto out_err; err = vfs_unlink(dentry->d_parent->d_inode, dentry); - mnt_drop_write(mqueue_mnt); + mnt_drop_write(ipc_ns->mq_mnt); out_err: dput(dentry); out_unlock: - mutex_unlock(&mqueue_mnt->mnt_root->d_inode->i_mutex); + mutex_unlock(&ipc_ns->mq_mnt->mnt_root->d_inode->i_mutex); putname(name); if (inode) iput(inode); @@ -1214,14 +1226,14 @@ static int msg_maxsize_limit_max = MAX_MSGSIZEMAX; static ctl_table mq_sysctls[] = { { .procname = "queues_max", - .data = &queues_max, + .data = &init_ipc_ns.mq_queues_max, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, }, { .procname = "msg_max", - .data = &msg_max, + .data = &init_ipc_ns.mq_msg_max, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec_minmax, @@ -1230,7 +1242,7 @@ static ctl_table mq_sysctls[] = { }, { .procname = "msgsize_max", - .data = &msgsize_max, + .data = &init_ipc_ns.mq_msgsize_max, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec_minmax, @@ -1276,13 +1288,13 @@ static int __init init_mqueue_fs(void) if (error) goto out_sysctl; - if (IS_ERR(mqueue_mnt = kern_mount(&mqueue_fs_type))) { - error = PTR_ERR(mqueue_mnt); + init_ipc_ns.mq_mnt = kern_mount(&mqueue_fs_type); + if (IS_ERR(init_ipc_ns.mq_mnt)) { + error = PTR_ERR(init_ipc_ns.mq_mnt); goto out_filesystem; } /* internal initialization - not common for vfs */ - queues_count = 0; spin_lock_init(&mq_lock); return 0; diff --git a/ipc/msgutil.c b/ipc/msgutil.c index c82c215693d7..73c316cb8613 100644 --- a/ipc/msgutil.c +++ b/ipc/msgutil.c @@ -13,10 +13,32 @@ #include #include #include +#include #include #include "util.h" +/* + * The next 2 defines are here bc this is the only file + * compiled when either CONFIG_SYSVIPC and CONFIG_POSIX_MQUEUE + * and not CONFIG_IPC_NS. + */ +struct ipc_namespace init_ipc_ns = { + .kref = { + /* It's not for this patch to change, but should this be 1? 
*/ + .refcount = ATOMIC_INIT(2), + }, +#ifdef CONFIG_POSIX_MQUEUE + .mq_mnt = NULL, + .mq_queues_count = 0, + .mq_queues_max = DFLT_QUEUESMAX, + .mq_msg_max = DFLT_MSGMAX, + .mq_msgsize_max = DFLT_MSGSIZEMAX, +#endif +}; + +atomic_t nr_ipc_ns = ATOMIC_INIT(1); + struct msg_msgseg { struct msg_msgseg* next; /* the next part of the message follows immediately */ diff --git a/ipc/namespace.c b/ipc/namespace.c index 9171d948751e..4b4dc6d847f1 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -25,6 +25,7 @@ static struct ipc_namespace *clone_ipc_ns(struct ipc_namespace *old_ns) sem_init_ns(ns); msg_init_ns(ns); shm_init_ns(ns); + mq_init_ns(ns); /* * msgmni has already been computed for the new ipc ns. @@ -101,6 +102,7 @@ void free_ipc_ns(struct kref *kref) sem_exit_ns(ns); msg_exit_ns(ns); shm_exit_ns(ns); + mq_exit_ns(ns); kfree(ns); atomic_dec(&nr_ipc_ns); diff --git a/ipc/util.c b/ipc/util.c index 7585a72e259b..b8e4ba92f6d1 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -47,15 +47,6 @@ struct ipc_proc_iface { int (*show)(struct seq_file *, void *); }; -struct ipc_namespace init_ipc_ns = { - .kref = { - .refcount = ATOMIC_INIT(2), - }, -}; - -atomic_t nr_ipc_ns = ATOMIC_INIT(1); - - #ifdef CONFIG_MEMORY_HOTPLUG static void ipc_memory_notifier(struct work_struct *work) diff --git a/ipc/util.h b/ipc/util.h index 3646b45a03c9..0e7d9223acc1 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -20,6 +20,13 @@ void shm_init (void); struct ipc_namespace; +#ifdef CONFIG_POSIX_MQUEUE +void mq_exit_ns(struct ipc_namespace *ns); +#else +static inline void mq_exit_ns(struct ipc_namespace *ns) { } +#endif + +#ifdef CONFIG_SYSVIPC void sem_init_ns(struct ipc_namespace *ns); void msg_init_ns(struct ipc_namespace *ns); void shm_init_ns(struct ipc_namespace *ns); @@ -27,6 +34,15 @@ void shm_init_ns(struct ipc_namespace *ns); void sem_exit_ns(struct ipc_namespace *ns); void msg_exit_ns(struct ipc_namespace *ns); void shm_exit_ns(struct ipc_namespace *ns); +#else +static inline void sem_init_ns(struct ipc_namespace *ns) { } +static inline void msg_init_ns(struct ipc_namespace *ns) { } +static inline void shm_init_ns(struct ipc_namespace *ns) { } + +static inline void sem_exit_ns(struct ipc_namespace *ns) { } +static inline void msg_exit_ns(struct ipc_namespace *ns) { } +static inline void shm_exit_ns(struct ipc_namespace *ns) { } +#endif /* * Structure that holds the parameters needed by the ipc operations -- cgit v1.2.3 From 7eafd7c74c3f2e67c27621b987b28397110d643f Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Mon, 6 Apr 2009 19:01:10 -0700 Subject: namespaces: ipc namespaces: implement support for posix msqueues Implement multiple mounts of the mqueue file system, and link it to usage of CLONE_NEWIPC. Each ipc ns has a corresponding mqueuefs superblock. When a user does clone(CLONE_NEWIPC) or unshare(CLONE_NEWIPC), the unshare will cause an internal mount of a new mqueuefs sb linked to the new ipc ns. When a user does 'mount -t mqueue mqueue /dev/mqueue', he mounts the mqueuefs superblock. Posix message queues can be worked with both through the mq_* system calls (see mq_overview(7)), and through the VFS through the mqueue mount. Any usage of mq_open() and friends will work with the acting task's ipc namespace. Any actions through the VFS will work with the mqueuefs in which the file was created. So if a user doesn't remount mqueuefs after unshare(CLONE_NEWIPC), mq_open("/ab") will not be reflected in "ls /dev/mqueue". 
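To make that concrete, a minimal userspace illustration of the behaviour just described (a sketch, not part of the patch; it needs privilege to unshare the ipc namespace, and the queue name "/ab" follows the example above):

#define _GNU_SOURCE
#include <fcntl.h>
#include <mqueue.h>
#include <sched.h>
#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
	/* move this task into a fresh ipc namespace */
	if (unshare(CLONE_NEWIPC) < 0) {
		perror("unshare");
		return 1;
	}
	/* the queue is created in the new namespace's mqueuefs... */
	if (mq_open("/ab", O_CREAT | O_RDWR, 0600, NULL) == (mqd_t)-1) {
		perror("mq_open");
		return 1;
	}
	/* ...so it does not appear in a /dev/mqueue mount that still
	 * belongs to the original namespace until mqueue is remounted
	 * from inside this namespace. */
	return 0;
}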
If task a mounts mqueue for ipc_ns:1, then clones task b with a new ipcns, ipcns:2, and then task a is the last task in ipc_ns:1 to exit, then (1) ipc_ns:1 will be freed, (2) it's superblock will live on until task b umounts the corresponding mqueuefs, and vfs actions will continue to succeed, but (3) sb->s_fs_info will be NULL for the sb corresponding to the deceased ipc_ns:1. To make this happen, we must protect the ipc reference count when a) a task exits and drops its ipcns->count, since it might be dropping it to 0 and freeing the ipcns b) a task accesses the ipcns through its mqueuefs interface, since it bumps the ipcns refcount and might race with the last task in the ipcns exiting. So the kref is changed to an atomic_t so we can use atomic_dec_and_lock(&ns->count,mq_lock), and every access to the ipcns through ns = mqueuefs_sb->s_fs_info is protected by the same lock. Signed-off-by: Cedric Le Goater Signed-off-by: Serge E. Hallyn Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ipc_namespace.h | 16 +++--- ipc/mqueue.c | 111 +++++++++++++++++++++++++++++++----------- ipc/msgutil.c | 9 ++-- ipc/namespace.c | 41 +++++++++++++--- ipc/util.h | 6 ++- 5 files changed, 131 insertions(+), 52 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index 3e6fcacebe8a..3392d50de351 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -25,7 +25,7 @@ struct ipc_ids { }; struct ipc_namespace { - struct kref kref; + atomic_t count; struct ipc_ids ids[3]; int sem_ctls[4]; @@ -61,6 +61,7 @@ struct ipc_namespace { extern struct ipc_namespace init_ipc_ns; extern atomic_t nr_ipc_ns; +extern spinlock_t mq_lock; #if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC) #define INIT_IPC_NS(ns) .ns = &init_ipc_ns, #else @@ -82,18 +83,18 @@ static inline int ipcns_notify(unsigned long l) { return 0; } #endif /* CONFIG_SYSVIPC */ #ifdef CONFIG_POSIX_MQUEUE -extern void mq_init_ns(struct ipc_namespace *ns); +extern int mq_init_ns(struct ipc_namespace *ns); /* default values */ #define DFLT_QUEUESMAX 256 /* max number of message queues */ #define DFLT_MSGMAX 10 /* max number of messages in each queue */ #define HARD_MSGMAX (131072/sizeof(void *)) #define DFLT_MSGSIZEMAX 8192 /* max message size */ #else -#define mq_init_ns(ns) ((void) 0) +static inline int mq_init_ns(struct ipc_namespace *ns) { return 0; } #endif #if defined(CONFIG_IPC_NS) -extern void free_ipc_ns(struct kref *kref); +extern void free_ipc_ns(struct ipc_namespace *ns); extern struct ipc_namespace *copy_ipcs(unsigned long flags, struct ipc_namespace *ns); extern void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids, @@ -103,14 +104,11 @@ extern void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids, static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) { if (ns) - kref_get(&ns->kref); + atomic_inc(&ns->count); return ns; } -static inline void put_ipc_ns(struct ipc_namespace *ns) -{ - kref_put(&ns->kref, free_ipc_ns); -} +extern void put_ipc_ns(struct ipc_namespace *ns); #else static inline struct ipc_namespace *copy_ipcs(unsigned long flags, struct ipc_namespace *ns) diff --git a/ipc/mqueue.c b/ipc/mqueue.c index a3673a09069a..c82d7b51ef68 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -88,7 +88,6 @@ static const struct file_operations mqueue_file_operations; static struct super_operations mqueue_super_ops; static void remove_notification(struct mqueue_inode_info *info); 
-static spinlock_t mq_lock; static struct kmem_cache *mqueue_inode_cachep; static struct ctl_table_header * mq_sysctl_table; @@ -98,27 +97,30 @@ static inline struct mqueue_inode_info *MQUEUE_I(struct inode *inode) return container_of(inode, struct mqueue_inode_info, vfs_inode); } -void mq_init_ns(struct ipc_namespace *ns) +/* + * This routine should be called with the mq_lock held. + */ +static inline struct ipc_namespace *__get_ns_from_inode(struct inode *inode) { - ns->mq_queues_count = 0; - ns->mq_queues_max = DFLT_QUEUESMAX; - ns->mq_msg_max = DFLT_MSGMAX; - ns->mq_msgsize_max = DFLT_MSGSIZEMAX; - ns->mq_mnt = mntget(init_ipc_ns.mq_mnt); + return get_ipc_ns(inode->i_sb->s_fs_info); } -void mq_exit_ns(struct ipc_namespace *ns) +static struct ipc_namespace *get_ns_from_inode(struct inode *inode) { - /* will need to clear out ns->mq_mnt->mnt_sb->s_fs_info here */ - mntput(ns->mq_mnt); + struct ipc_namespace *ns; + + spin_lock(&mq_lock); + ns = __get_ns_from_inode(inode); + spin_unlock(&mq_lock); + return ns; } -static struct inode *mqueue_get_inode(struct super_block *sb, int mode, - struct mq_attr *attr) +static struct inode *mqueue_get_inode(struct super_block *sb, + struct ipc_namespace *ipc_ns, int mode, + struct mq_attr *attr) { struct user_struct *u = current_user(); struct inode *inode; - struct ipc_namespace *ipc_ns = &init_ipc_ns; inode = new_inode(sb); if (inode) { @@ -193,30 +195,38 @@ out_inode: static int mqueue_fill_super(struct super_block *sb, void *data, int silent) { struct inode *inode; + struct ipc_namespace *ns = data; + int error = 0; sb->s_blocksize = PAGE_CACHE_SIZE; sb->s_blocksize_bits = PAGE_CACHE_SHIFT; sb->s_magic = MQUEUE_MAGIC; sb->s_op = &mqueue_super_ops; - inode = mqueue_get_inode(sb, S_IFDIR | S_ISVTX | S_IRWXUGO, NULL); - if (!inode) - return -ENOMEM; + inode = mqueue_get_inode(sb, ns, S_IFDIR | S_ISVTX | S_IRWXUGO, + NULL); + if (!inode) { + error = -ENOMEM; + goto out; + } sb->s_root = d_alloc_root(inode); if (!sb->s_root) { iput(inode); - return -ENOMEM; + error = -ENOMEM; } - return 0; +out: + return error; } static int mqueue_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_single(fs_type, flags, data, mqueue_fill_super, mnt); + if (!(flags & MS_KERNMOUNT)) + data = current->nsproxy->ipc_ns; + return get_sb_ns(fs_type, flags, data, mqueue_fill_super, mnt); } static void init_once(void *foo) @@ -247,12 +257,13 @@ static void mqueue_delete_inode(struct inode *inode) struct user_struct *user; unsigned long mq_bytes; int i; - struct ipc_namespace *ipc_ns = &init_ipc_ns; + struct ipc_namespace *ipc_ns; if (S_ISDIR(inode->i_mode)) { clear_inode(inode); return; } + ipc_ns = get_ns_from_inode(inode); info = MQUEUE_I(inode); spin_lock(&info->lock); for (i = 0; i < info->attr.mq_curmsgs; i++) @@ -268,10 +279,19 @@ static void mqueue_delete_inode(struct inode *inode) if (user) { spin_lock(&mq_lock); user->mq_bytes -= mq_bytes; - ipc_ns->mq_queues_count--; + /* + * get_ns_from_inode() ensures that the + * (ipc_ns = sb->s_fs_info) is either a valid ipc_ns + * to which we now hold a reference, or it is NULL. + * We can't put it here under mq_lock, though. 
+ */ + if (ipc_ns) + ipc_ns->mq_queues_count--; spin_unlock(&mq_lock); free_uid(user); } + if (ipc_ns) + put_ipc_ns(ipc_ns); } static int mqueue_create(struct inode *dir, struct dentry *dentry, @@ -280,9 +300,14 @@ static int mqueue_create(struct inode *dir, struct dentry *dentry, struct inode *inode; struct mq_attr *attr = dentry->d_fsdata; int error; - struct ipc_namespace *ipc_ns = &init_ipc_ns; + struct ipc_namespace *ipc_ns; spin_lock(&mq_lock); + ipc_ns = __get_ns_from_inode(dir); + if (!ipc_ns) { + error = -EACCES; + goto out_unlock; + } if (ipc_ns->mq_queues_count >= ipc_ns->mq_queues_max && !capable(CAP_SYS_RESOURCE)) { error = -ENOSPC; @@ -291,7 +316,7 @@ static int mqueue_create(struct inode *dir, struct dentry *dentry, ipc_ns->mq_queues_count++; spin_unlock(&mq_lock); - inode = mqueue_get_inode(dir->i_sb, mode, attr); + inode = mqueue_get_inode(dir->i_sb, ipc_ns, mode, attr); if (!inode) { error = -ENOMEM; spin_lock(&mq_lock); @@ -299,6 +324,7 @@ static int mqueue_create(struct inode *dir, struct dentry *dentry, goto out_unlock; } + put_ipc_ns(ipc_ns); dir->i_size += DIRENT_SIZE; dir->i_ctime = dir->i_mtime = dir->i_atime = CURRENT_TIME; @@ -307,6 +333,8 @@ static int mqueue_create(struct inode *dir, struct dentry *dentry, return 0; out_unlock: spin_unlock(&mq_lock); + if (ipc_ns) + put_ipc_ns(ipc_ns); return error; } @@ -668,7 +696,7 @@ SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, mode_t, mode, char *name; struct mq_attr attr; int fd, error; - struct ipc_namespace *ipc_ns = &init_ipc_ns; + struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns; if (u_attr && copy_from_user(&attr, u_attr, sizeof(struct mq_attr))) return -EFAULT; @@ -738,7 +766,7 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name) char *name; struct dentry *dentry; struct inode *inode = NULL; - struct ipc_namespace *ipc_ns = &init_ipc_ns; + struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns; name = getname(u_name); if (IS_ERR(name)) @@ -1217,6 +1245,32 @@ static struct file_system_type mqueue_fs_type = { .kill_sb = kill_litter_super, }; +int mq_init_ns(struct ipc_namespace *ns) +{ + ns->mq_queues_count = 0; + ns->mq_queues_max = DFLT_QUEUESMAX; + ns->mq_msg_max = DFLT_MSGMAX; + ns->mq_msgsize_max = DFLT_MSGSIZEMAX; + + ns->mq_mnt = kern_mount_data(&mqueue_fs_type, ns); + if (IS_ERR(ns->mq_mnt)) { + int err = PTR_ERR(ns->mq_mnt); + ns->mq_mnt = NULL; + return err; + } + return 0; +} + +void mq_clear_sbinfo(struct ipc_namespace *ns) +{ + ns->mq_mnt->mnt_sb->s_fs_info = NULL; +} + +void mq_put_mnt(struct ipc_namespace *ns) +{ + mntput(ns->mq_mnt); +} + static int msg_max_limit_min = MIN_MSGMAX; static int msg_max_limit_max = MAX_MSGMAX; @@ -1288,15 +1342,14 @@ static int __init init_mqueue_fs(void) if (error) goto out_sysctl; - init_ipc_ns.mq_mnt = kern_mount(&mqueue_fs_type); + spin_lock_init(&mq_lock); + + init_ipc_ns.mq_mnt = kern_mount_data(&mqueue_fs_type, &init_ipc_ns); if (IS_ERR(init_ipc_ns.mq_mnt)) { error = PTR_ERR(init_ipc_ns.mq_mnt); goto out_filesystem; } - /* internal initialization - not common for vfs */ - spin_lock_init(&mq_lock); - return 0; out_filesystem: diff --git a/ipc/msgutil.c b/ipc/msgutil.c index 73c316cb8613..f095ee268833 100644 --- a/ipc/msgutil.c +++ b/ipc/msgutil.c @@ -18,19 +18,16 @@ #include "util.h" +DEFINE_SPINLOCK(mq_lock); + /* * The next 2 defines are here bc this is the only file * compiled when either CONFIG_SYSVIPC and CONFIG_POSIX_MQUEUE * and not CONFIG_IPC_NS. 
*/ struct ipc_namespace init_ipc_ns = { - .kref = { - /* It's not for this patch to change, but should this be 1? */ - .refcount = ATOMIC_INIT(2), - }, + .count = ATOMIC_INIT(1), #ifdef CONFIG_POSIX_MQUEUE - .mq_mnt = NULL, - .mq_queues_count = 0, .mq_queues_max = DFLT_QUEUESMAX, .mq_msg_max = DFLT_MSGMAX, .mq_msgsize_max = DFLT_MSGSIZEMAX, diff --git a/ipc/namespace.c b/ipc/namespace.c index 4b4dc6d847f1..4a5e752a9276 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -9,23 +9,31 @@ #include #include #include +#include +#include #include "util.h" static struct ipc_namespace *clone_ipc_ns(struct ipc_namespace *old_ns) { struct ipc_namespace *ns; + int err; ns = kmalloc(sizeof(struct ipc_namespace), GFP_KERNEL); if (ns == NULL) return ERR_PTR(-ENOMEM); + atomic_set(&ns->count, 1); + err = mq_init_ns(ns); + if (err) { + kfree(ns); + return ERR_PTR(err); + } atomic_inc(&nr_ipc_ns); sem_init_ns(ns); msg_init_ns(ns); shm_init_ns(ns); - mq_init_ns(ns); /* * msgmni has already been computed for the new ipc ns. @@ -35,7 +43,6 @@ static struct ipc_namespace *clone_ipc_ns(struct ipc_namespace *old_ns) ipcns_notify(IPCNS_CREATED); register_ipcns_notifier(ns); - kref_init(&ns->kref); return ns; } @@ -85,11 +92,34 @@ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids, up_write(&ids->rw_mutex); } -void free_ipc_ns(struct kref *kref) +/* + * put_ipc_ns - drop a reference to an ipc namespace. + * @ns: the namespace to put + * + * If this is the last task in the namespace exiting, and + * it is dropping the refcount to 0, then it can race with + * a task in another ipc namespace but in a mounts namespace + * which has this ipcns's mqueuefs mounted, doing some action + * with one of the mqueuefs files. That can raise the refcount. + * So dropping the refcount, and raising the refcount when + * accessing it through the VFS, are protected with mq_lock. + * + * (Clearly, a task raising the refcount on its own ipc_ns + * needn't take mq_lock since it can't race with the last task + * in the ipcns exiting). + */ +void put_ipc_ns(struct ipc_namespace *ns) { - struct ipc_namespace *ns; + if (atomic_dec_and_lock(&ns->count, &mq_lock)) { + mq_clear_sbinfo(ns); + spin_unlock(&mq_lock); + mq_put_mnt(ns); + free_ipc_ns(ns); + } +} - ns = container_of(kref, struct ipc_namespace, kref); +void free_ipc_ns(struct ipc_namespace *ns) +{ /* * Unregistering the hotplug notifier at the beginning guarantees * that the ipc namespace won't be freed while we are inside the @@ -102,7 +132,6 @@ void free_ipc_ns(struct kref *kref) sem_exit_ns(ns); msg_exit_ns(ns); shm_exit_ns(ns); - mq_exit_ns(ns); kfree(ns); atomic_dec(&nr_ipc_ns); diff --git a/ipc/util.h b/ipc/util.h index 0e7d9223acc1..1187332a89d2 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -21,9 +21,11 @@ void shm_init (void); struct ipc_namespace; #ifdef CONFIG_POSIX_MQUEUE -void mq_exit_ns(struct ipc_namespace *ns); +extern void mq_clear_sbinfo(struct ipc_namespace *ns); +extern void mq_put_mnt(struct ipc_namespace *ns); #else -static inline void mq_exit_ns(struct ipc_namespace *ns) { } +static inline void mq_clear_sbinfo(struct ipc_namespace *ns) { } +static inline void mq_put_mnt(struct ipc_namespace *ns) { } #endif #ifdef CONFIG_SYSVIPC -- cgit v1.2.3 From bdc8e5f85f9abe2e7c78dcf39d81f9a97178788b Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Mon, 6 Apr 2009 19:01:11 -0700 Subject: namespaces: mqueue namespace: adapt sysctl Largely inspired from ipc/ipc_sysctl.c. This patch isolates the mqueue sysctl stuff in its own file. 
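The per-namespace behaviour rests on a small pointer-rebasing trick in the new ipc/mq_sysctl.c (see get_mq() in the diff below): the ctl_table .data fields keep pointing at fields of init_ipc_ns, and the proc handlers rebase that address onto current->nsproxy->ipc_ns before reading or writing. A minimal user-space sketch of the same rebasing, with an illustrative struct and a hypothetical rebase_onto_ns() helper (not kernel code):

#include <stdio.h>
#include <stddef.h>

struct ipc_namespace {
	int mq_queues_max;
	int mq_msg_max;
	int mq_msgsize_max;
};

static struct ipc_namespace init_ipc_ns = { 256, 10, 8192 };

/*
 * Rebase a pointer at a field of init_ipc_ns so it points at the
 * same field of 'ns' -- the same arithmetic get_mq() applies to
 * table->data.
 */
static void *rebase_onto_ns(void *init_field, struct ipc_namespace *ns)
{
	ptrdiff_t off = (char *)init_field - (char *)&init_ipc_ns;

	return (char *)ns + off;
}

int main(void)
{
	struct ipc_namespace child = { 64, 10, 8192 };
	int *p = rebase_onto_ns(&init_ipc_ns.mq_queues_max, &child);

	printf("%d\n", *p);	/* prints 64, the child namespace's limit */
	return 0;
}
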
[akpm@linux-foundation.org: build fix] Signed-off-by: Cedric Le Goater Signed-off-by: Nadia Derbey Signed-off-by: Serge E. Hallyn Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ipc_namespace.h | 14 +++++ init/Kconfig | 6 +++ ipc/Makefile | 1 + ipc/mq_sysctl.c | 116 ++++++++++++++++++++++++++++++++++++++++++ ipc/mqueue.c | 65 +---------------------- 5 files changed, 138 insertions(+), 64 deletions(-) create mode 100644 ipc/mq_sysctl.c (limited to 'include/linux') diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index 3392d50de351..3bf40e246a80 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -128,4 +128,18 @@ static inline void put_ipc_ns(struct ipc_namespace *ns) { } #endif + +#ifdef CONFIG_POSIX_MQUEUE_SYSCTL + +struct ctl_table_header; +extern struct ctl_table_header *mq_register_sysctl_table(void); + +#else /* CONFIG_POSIX_MQUEUE_SYSCTL */ + +static inline struct ctl_table_header *mq_register_sysctl_table(void) +{ + return NULL; +} + +#endif /* CONFIG_POSIX_MQUEUE_SYSCTL */ #endif diff --git a/init/Kconfig b/init/Kconfig index a0807ba91644..f2f9b5362b48 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -208,6 +208,12 @@ config POSIX_MQUEUE If unsure, say Y. +config POSIX_MQUEUE_SYSCTL + bool + depends on POSIX_MQUEUE + depends on SYSCTL + default y + config BSD_PROCESS_ACCT bool "BSD Process Accounting" help diff --git a/ipc/Makefile b/ipc/Makefile index 65c384395801..4e1955ea815d 100644 --- a/ipc/Makefile +++ b/ipc/Makefile @@ -8,4 +8,5 @@ obj-$(CONFIG_SYSVIPC_SYSCTL) += ipc_sysctl.o obj_mq-$(CONFIG_COMPAT) += compat_mq.o obj-$(CONFIG_POSIX_MQUEUE) += mqueue.o msgutil.o $(obj_mq-y) obj-$(CONFIG_IPC_NS) += namespace.o +obj-$(CONFIG_POSIX_MQUEUE_SYSCTL) += mq_sysctl.o diff --git a/ipc/mq_sysctl.c b/ipc/mq_sysctl.c new file mode 100644 index 000000000000..89f60ec8ee54 --- /dev/null +++ b/ipc/mq_sysctl.c @@ -0,0 +1,116 @@ +/* + * Copyright (C) 2007 IBM Corporation + * + * Author: Cedric Le Goater + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. + */ + +#include +#include +#include + +/* + * Define the ranges various user-specified maximum values can + * be set to. 
+ */ +#define MIN_MSGMAX 1 /* min value for msg_max */ +#define MAX_MSGMAX HARD_MSGMAX /* max value for msg_max */ +#define MIN_MSGSIZEMAX 128 /* min value for msgsize_max */ +#define MAX_MSGSIZEMAX (8192*128) /* max value for msgsize_max */ + +static void *get_mq(ctl_table *table) +{ + char *which = table->data; + struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns; + which = (which - (char *)&init_ipc_ns) + (char *)ipc_ns; + return which; +} + +#ifdef CONFIG_PROC_SYSCTL +static int proc_mq_dointvec(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table mq_table; + memcpy(&mq_table, table, sizeof(mq_table)); + mq_table.data = get_mq(table); + + return proc_dointvec(&mq_table, write, filp, buffer, lenp, ppos); +} + +static int proc_mq_dointvec_minmax(ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table mq_table; + memcpy(&mq_table, table, sizeof(mq_table)); + mq_table.data = get_mq(table); + + return proc_dointvec_minmax(&mq_table, write, filp, buffer, + lenp, ppos); +} +#else +#define proc_mq_dointvec NULL +#define proc_mq_dointvec_minmax NULL +#endif + +static int msg_max_limit_min = MIN_MSGMAX; +static int msg_max_limit_max = MAX_MSGMAX; + +static int msg_maxsize_limit_min = MIN_MSGSIZEMAX; +static int msg_maxsize_limit_max = MAX_MSGSIZEMAX; + +static ctl_table mq_sysctls[] = { + { + .procname = "queues_max", + .data = &init_ipc_ns.mq_queues_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_mq_dointvec, + }, + { + .procname = "msg_max", + .data = &init_ipc_ns.mq_msg_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_mq_dointvec_minmax, + .extra1 = &msg_max_limit_min, + .extra2 = &msg_max_limit_max, + }, + { + .procname = "msgsize_max", + .data = &init_ipc_ns.mq_msgsize_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_mq_dointvec_minmax, + .extra1 = &msg_maxsize_limit_min, + .extra2 = &msg_maxsize_limit_max, + }, + { .ctl_name = 0 } +}; + +static ctl_table mq_sysctl_dir[] = { + { + .procname = "mqueue", + .mode = 0555, + .child = mq_sysctls, + }, + { .ctl_name = 0 } +}; + +static ctl_table mq_sysctl_root[] = { + { + .ctl_name = CTL_FS, + .procname = "fs", + .mode = 0555, + .child = mq_sysctl_dir, + }, + { .ctl_name = 0 } +}; + +struct ctl_table_header *mq_register_sysctl_table(void) +{ + return register_sysctl_table(mq_sysctl_root); +} diff --git a/ipc/mqueue.c b/ipc/mqueue.c index c82d7b51ef68..e35ba2c3a8d7 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -47,15 +47,6 @@ #define STATE_PENDING 1 #define STATE_READY 2 -/* - * Define the ranges various user-specified maximum values can - * be set to. 
- */ -#define MIN_MSGMAX 1 /* min value for msg_max */ -#define MAX_MSGMAX HARD_MSGMAX /* max value for msg_max */ -#define MIN_MSGSIZEMAX 128 /* min value for msgsize_max */ -#define MAX_MSGSIZEMAX (8192*128) /* max value for msgsize_max */ - struct ext_wait_queue { /* queue of sleeping tasks */ struct task_struct *task; struct list_head list; @@ -1271,60 +1262,6 @@ void mq_put_mnt(struct ipc_namespace *ns) mntput(ns->mq_mnt); } -static int msg_max_limit_min = MIN_MSGMAX; -static int msg_max_limit_max = MAX_MSGMAX; - -static int msg_maxsize_limit_min = MIN_MSGSIZEMAX; -static int msg_maxsize_limit_max = MAX_MSGSIZEMAX; - -static ctl_table mq_sysctls[] = { - { - .procname = "queues_max", - .data = &init_ipc_ns.mq_queues_max, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .procname = "msg_max", - .data = &init_ipc_ns.mq_msg_max, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .extra1 = &msg_max_limit_min, - .extra2 = &msg_max_limit_max, - }, - { - .procname = "msgsize_max", - .data = &init_ipc_ns.mq_msgsize_max, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .extra1 = &msg_maxsize_limit_min, - .extra2 = &msg_maxsize_limit_max, - }, - { .ctl_name = 0 } -}; - -static ctl_table mq_sysctl_dir[] = { - { - .procname = "mqueue", - .mode = 0555, - .child = mq_sysctls, - }, - { .ctl_name = 0 } -}; - -static ctl_table mq_sysctl_root[] = { - { - .ctl_name = CTL_FS, - .procname = "fs", - .mode = 0555, - .child = mq_sysctl_dir, - }, - { .ctl_name = 0 } -}; - static int __init init_mqueue_fs(void) { int error; @@ -1336,7 +1273,7 @@ static int __init init_mqueue_fs(void) return -ENOMEM; /* ignore failues - they are not fatal */ - mq_sysctl_table = register_sysctl_table(mq_sysctl_root); + mq_sysctl_table = mq_register_sysctl_table(); error = register_filesystem(&mqueue_fs_type); if (error) -- cgit v1.2.3 From 284901a90a9e0b812ca3f5f852cbbfb60d10249d Mon Sep 17 00:00:00 2001 From: Yang Hongyang Date: Mon, 6 Apr 2009 19:01:15 -0700 Subject: dma-mapping: replace all DMA_32BIT_MASK macro with DMA_BIT_MASK(32) Replace all DMA_32BIT_MASK macro with DMA_BIT_MASK(32) Signed-off-by: Yang Hongyang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mach-davinci/board-evm.c | 4 ++-- arch/arm/mach-davinci/usb.c | 4 ++-- arch/arm/mach-kirkwood/common.c | 2 +- arch/arm/mach-orion5x/common.c | 2 +- arch/arm/plat-iop/adma.c | 2 +- arch/avr32/boards/hammerhead/flash.c | 4 ++-- arch/avr32/mach-at32ap/at32ap700x.c | 8 +++---- arch/ia64/kernel/pci-dma.c | 2 +- arch/mips/alchemy/common/platform.c | 30 +++++++++++++-------------- arch/mips/alchemy/devboards/pb1200/platform.c | 4 ++-- arch/mips/nxp/pnx833x/common/platform.c | 12 +++++------ arch/mips/nxp/pnx8550/common/platform.c | 8 +++---- arch/mips/pmc-sierra/msp71xx/msp_usb.c | 8 +++---- arch/powerpc/kernel/dma.c | 2 +- arch/powerpc/kernel/of_platform.c | 2 +- arch/powerpc/platforms/iseries/iommu.c | 4 ++-- arch/powerpc/platforms/ps3/system-bus.c | 2 +- arch/x86/include/asm/dma-mapping.h | 4 ++-- arch/x86/kernel/pci-dma.c | 4 ++-- arch/x86/kernel/pci-nommu.c | 2 +- drivers/ata/ahci.c | 6 +++--- drivers/ata/pata_cs5520.c | 4 ++-- drivers/ata/pata_ixp4xx_cf.c | 2 +- drivers/ata/pdc_adma.c | 4 ++-- drivers/ata/sata_inic162x.c | 4 ++-- drivers/ata/sata_mv.c | 6 +++--- drivers/ata/sata_qstor.c | 6 +++--- drivers/ata/sata_sil24.c | 6 +++--- drivers/ata/sata_vsc.c | 4 ++-- drivers/atm/he.c | 2 +- drivers/atm/lanai.c | 4 ++-- drivers/block/DAC960.c | 8 
+++---- drivers/block/cciss.c | 2 +- drivers/block/sx8.c | 2 +- drivers/block/umem.c | 2 +- drivers/crypto/hifn_795x.c | 2 +- drivers/crypto/ixp4xx_crypto.c | 2 +- drivers/dma/ioat.c | 4 ++-- drivers/firmware/dcdbas.c | 2 +- drivers/ide/cs5520.c | 2 +- drivers/ide/setup-pci.c | 2 +- drivers/ieee1394/pcilynx.c | 2 +- drivers/infiniband/hw/amso1100/c2.c | 2 +- drivers/infiniband/hw/ipath/ipath_driver.c | 4 ++-- drivers/infiniband/hw/mthca/mthca_main.c | 4 ++-- drivers/infiniband/hw/nes/nes.c | 4 ++-- drivers/media/dvb/dm1105/dm1105.c | 2 +- drivers/media/dvb/pluto2/pluto2.c | 2 +- drivers/media/video/bt8xx/bttv-driver.c | 2 +- drivers/media/video/cx88/cx88-alsa.c | 2 +- drivers/media/video/cx88/cx88-mpeg.c | 2 +- drivers/media/video/cx88/cx88-video.c | 2 +- drivers/media/video/meye.c | 2 +- drivers/media/video/saa7134/saa7134-core.c | 2 +- drivers/memstick/host/jmb38x_ms.c | 2 +- drivers/message/fusion/mptbase.c | 4 ++-- drivers/message/i2o/memory.c | 2 +- drivers/message/i2o/pci.c | 2 +- drivers/misc/tifm_7xx1.c | 2 +- drivers/mmc/host/sdhci-pci.c | 2 +- drivers/net/8139cp.c | 6 +++--- drivers/net/acenic.c | 2 +- drivers/net/amd8111e.c | 2 +- drivers/net/atl1e/atl1e_main.c | 4 ++-- drivers/net/atlx/atl1.c | 2 +- drivers/net/atlx/atl2.c | 4 ++-- drivers/net/bnx2.c | 2 +- drivers/net/bnx2x_main.c | 2 +- drivers/net/cassini.c | 2 +- drivers/net/chelsio/cxgb2.c | 2 +- drivers/net/cxgb3/cxgb3_main.c | 2 +- drivers/net/e100.c | 2 +- drivers/net/e1000/e1000_main.c | 4 ++-- drivers/net/e1000e/netdev.c | 8 +++---- drivers/net/enic/enic_main.c | 4 ++-- drivers/net/hp100.c | 2 +- drivers/net/igb/igb_main.c | 4 ++-- drivers/net/ioc3-eth.c | 2 +- drivers/net/ipg.c | 2 +- drivers/net/ixgb/ixgb_main.c | 4 ++-- drivers/net/ixgbe/ixgbe_main.c | 8 +++---- drivers/net/jme.c | 4 ++-- drivers/net/mlx4/main.c | 4 ++-- drivers/net/myri10ge/myri10ge.c | 4 ++-- drivers/net/niu.c | 4 ++-- drivers/net/ns83820.c | 2 +- drivers/net/qla3xxx.c | 4 ++-- drivers/net/qlge/qlge_main.c | 4 ++-- drivers/net/r6040.c | 4 ++-- drivers/net/r8169.c | 8 +++---- drivers/net/s2io.c | 2 +- drivers/net/sc92031.c | 4 ++-- drivers/net/sis190.c | 2 +- drivers/net/sis900.c | 2 +- drivers/net/skge.c | 4 ++-- drivers/net/sky2.c | 2 +- drivers/net/smsc9420.c | 2 +- drivers/net/sungem.c | 2 +- drivers/net/tehuti.c | 4 ++-- drivers/net/tehuti.h | 4 ---- drivers/net/tg3.c | 10 ++++----- drivers/net/tlan.c | 2 +- drivers/net/tokenring/lanstreamer.c | 2 +- drivers/net/tulip/dmfe.c | 2 +- drivers/net/tulip/uli526x.c | 2 +- drivers/net/tulip/winbond-840.c | 2 +- drivers/net/typhoon.c | 2 +- drivers/net/via-rhine.c | 2 +- drivers/net/wan/wanxl.c | 4 ++-- drivers/net/wireless/adm8211.c | 4 ++-- drivers/net/wireless/ath5k/base.c | 2 +- drivers/net/wireless/b43/dma.c | 8 +++---- drivers/net/wireless/b43legacy/dma.c | 8 +++---- drivers/net/wireless/ipw2x00/ipw2100.c | 2 +- drivers/net/wireless/ipw2x00/ipw2200.c | 4 ++-- drivers/net/wireless/iwlwifi/iwl3945-base.c | 4 ++-- drivers/net/wireless/prism54/islpci_hotplug.c | 2 +- drivers/net/wireless/rt2x00/rt2x00pci.c | 2 +- drivers/pci/intel-iommu.c | 8 +++---- drivers/rapidio/rio-scan.c | 4 ++-- drivers/scsi/3w-9xxx.c | 4 ++-- drivers/scsi/3w-xxxx.h | 2 +- drivers/scsi/BusLogic.c | 6 +++--- drivers/scsi/a100u2w.c | 2 +- drivers/scsi/aacraid/aachba.c | 4 ++-- drivers/scsi/aacraid/commsup.c | 6 +++--- drivers/scsi/aacraid/linit.c | 6 +++--- drivers/scsi/aic7xxx/aic79xx_osm_pci.c | 6 +++--- drivers/scsi/aic7xxx/aic7xxx_osm_pci.c | 4 ++-- drivers/scsi/aic94xx/aic94xx_init.c | 4 ++-- 
drivers/scsi/arcmsr/arcmsr_hba.c | 2 +- drivers/scsi/atp870u.c | 2 +- drivers/scsi/dpt_i2o.c | 6 +++--- drivers/scsi/eata.c | 2 +- drivers/scsi/gdth.c | 4 ++-- drivers/scsi/hptiop.c | 2 +- drivers/scsi/initio.c | 2 +- drivers/scsi/ipr.c | 2 +- drivers/scsi/ips.c | 2 +- drivers/scsi/lasi700.c | 2 +- drivers/scsi/lpfc/lpfc_init.c | 2 +- drivers/scsi/megaraid.c | 4 ++-- drivers/scsi/megaraid/megaraid_mbox.c | 4 ++-- drivers/scsi/megaraid/megaraid_sas.c | 4 ++-- drivers/scsi/mvsas.c | 6 +++--- drivers/scsi/nsp32.c | 2 +- drivers/scsi/qla1280.c | 4 ++-- drivers/scsi/qla2xxx/qla_os.c | 4 ++-- drivers/scsi/qla4xxx/ql4_os.c | 4 ++-- drivers/scsi/sni_53c710.c | 2 +- drivers/scsi/stex.c | 4 ++-- drivers/scsi/sym53c8xx_2/sym_glue.c | 2 +- drivers/scsi/sym53c8xx_2/sym_hipd.h | 2 +- drivers/staging/agnx/pci.c | 4 ++-- drivers/staging/altpciechdma/altpciechdma.c | 4 ++-- drivers/staging/sxg/sxg.c | 4 ++-- drivers/usb/host/ehci-ps3.c | 2 +- drivers/usb/host/ohci-ps3.c | 2 +- drivers/uwb/whci.c | 4 ++-- include/linux/dma-mapping.h | 2 +- lib/swiotlb.c | 2 +- sound/pci/ad1889.c | 4 ++-- sound/pci/au88x0/au88x0.c | 4 ++-- sound/pci/aw2/aw2-alsa.c | 4 ++-- sound/pci/ca0106/ca0106_main.c | 4 ++-- sound/pci/cs5535audio/cs5535audio.c | 4 ++-- sound/pci/mixart/mixart.c | 2 +- sound/pci/pcxhr/pcxhr.c | 2 +- sound/soc/blackfin/bf5xx-ac97-pcm.c | 4 ++-- sound/soc/blackfin/bf5xx-i2s-pcm.c | 4 ++-- sound/soc/omap/omap-pcm.c | 2 +- sound/soc/pxa/pxa2xx-pcm.c | 4 ++-- sound/soc/s3c24xx/s3c24xx-pcm.c | 2 +- 173 files changed, 309 insertions(+), 313 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/mach-davinci/board-evm.c b/arch/arm/mach-davinci/board-evm.c index 38b6a9ce2a93..0b97a528902b 100644 --- a/arch/arm/mach-davinci/board-evm.c +++ b/arch/arm/mach-davinci/board-evm.c @@ -118,7 +118,7 @@ static struct resource ide_resources[] = { }, }; -static u64 ide_dma_mask = DMA_32BIT_MASK; +static u64 ide_dma_mask = DMA_BIT_MASK(32); static struct platform_device ide_dev = { .name = "palm_bk3710", @@ -127,7 +127,7 @@ static struct platform_device ide_dev = { .num_resources = ARRAY_SIZE(ide_resources), .dev = { .dma_mask = &ide_dma_mask, - .coherent_dma_mask = DMA_32BIT_MASK, + .coherent_dma_mask = DMA_BIT_MASK(32), }, }; diff --git a/arch/arm/mach-davinci/usb.c b/arch/arm/mach-davinci/usb.c index 69680784448a..2429b79f6da2 100644 --- a/arch/arm/mach-davinci/usb.c +++ b/arch/arm/mach-davinci/usb.c @@ -64,7 +64,7 @@ static struct resource usb_resources[] = { }, }; -static u64 usb_dmamask = DMA_32BIT_MASK; +static u64 usb_dmamask = DMA_BIT_MASK(32); static struct platform_device usb_dev = { .name = "musb_hdrc", @@ -72,7 +72,7 @@ static struct platform_device usb_dev = { .dev = { .platform_data = &usb_data, .dma_mask = &usb_dmamask, - .coherent_dma_mask = DMA_32BIT_MASK, + .coherent_dma_mask = DMA_BIT_MASK(32), }, .resource = usb_resources, .num_resources = ARRAY_SIZE(usb_resources), diff --git a/arch/arm/mach-kirkwood/common.c b/arch/arm/mach-kirkwood/common.c index c691848714d9..16dc9ea08393 100644 --- a/arch/arm/mach-kirkwood/common.c +++ b/arch/arm/mach-kirkwood/common.c @@ -508,7 +508,7 @@ static struct mv_xor_platform_shared_data kirkwood_xor_shared_data = { .dram = &kirkwood_mbus_dram_info, }; -static u64 kirkwood_xor_dmamask = DMA_32BIT_MASK; +static u64 kirkwood_xor_dmamask = DMA_BIT_MASK(32); /***************************************************************************** diff --git a/arch/arm/mach-orion5x/common.c b/arch/arm/mach-orion5x/common.c index e8d42e8cb465..6af99ddabdfb 100644 --- 
a/arch/arm/mach-orion5x/common.c +++ b/arch/arm/mach-orion5x/common.c @@ -463,7 +463,7 @@ static struct platform_device orion5x_xor_shared = { .resource = orion5x_xor_shared_resources, }; -static u64 orion5x_xor_dmamask = DMA_32BIT_MASK; +static u64 orion5x_xor_dmamask = DMA_BIT_MASK(32); static struct resource orion5x_xor0_resources[] = { [0] = { diff --git a/arch/arm/plat-iop/adma.c b/arch/arm/plat-iop/adma.c index a2a94f6d2e7c..3c127aabe214 100644 --- a/arch/arm/plat-iop/adma.c +++ b/arch/arm/plat-iop/adma.c @@ -119,7 +119,7 @@ static struct resource iop3xx_aau_resources[] = { } }; -static u64 iop3xx_adma_dmamask = DMA_32BIT_MASK; +static u64 iop3xx_adma_dmamask = DMA_BIT_MASK(32); static struct iop_adma_platform_data iop3xx_dma_0_data = { .hw_id = DMA0_ID, diff --git a/arch/avr32/boards/hammerhead/flash.c b/arch/avr32/boards/hammerhead/flash.c index 559bbcb03f9b..776c3cb9b6e4 100644 --- a/arch/avr32/boards/hammerhead/flash.c +++ b/arch/avr32/boards/hammerhead/flash.c @@ -280,13 +280,13 @@ static struct resource hh_fpga0_resource[] = { }, }; -static u64 hh_fpga0_dma_mask = DMA_32BIT_MASK; +static u64 hh_fpga0_dma_mask = DMA_BIT_MASK(32); static struct platform_device hh_fpga0_device = { .name = "hh_fpga", .id = 0, .dev = { .dma_mask = &hh_fpga0_dma_mask, - .coherent_dma_mask = DMA_32BIT_MASK, + .coherent_dma_mask = DMA_BIT_MASK(32), }, .resource = hh_fpga0_resource, .num_resources = ARRAY_SIZE(hh_fpga0_resource), diff --git a/arch/avr32/mach-at32ap/at32ap700x.c b/arch/avr32/mach-at32ap/at32ap700x.c index 7cc653798327..eb9d4dc2e86d 100644 --- a/arch/avr32/mach-at32ap/at32ap700x.c +++ b/arch/avr32/mach-at32ap/at32ap700x.c @@ -60,26 +60,26 @@ * don't ... tc, smc, pio, rtc, watchdog, pwm, ps2, and more. */ #define DEFINE_DEV(_name, _id) \ -static u64 _name##_id##_dma_mask = DMA_32BIT_MASK; \ +static u64 _name##_id##_dma_mask = DMA_BIT_MASK(32); \ static struct platform_device _name##_id##_device = { \ .name = #_name, \ .id = _id, \ .dev = { \ .dma_mask = &_name##_id##_dma_mask, \ - .coherent_dma_mask = DMA_32BIT_MASK, \ + .coherent_dma_mask = DMA_BIT_MASK(32), \ }, \ .resource = _name##_id##_resource, \ .num_resources = ARRAY_SIZE(_name##_id##_resource), \ } #define DEFINE_DEV_DATA(_name, _id) \ -static u64 _name##_id##_dma_mask = DMA_32BIT_MASK; \ +static u64 _name##_id##_dma_mask = DMA_BIT_MASK(32); \ static struct platform_device _name##_id##_device = { \ .name = #_name, \ .id = _id, \ .dev = { \ .dma_mask = &_name##_id##_dma_mask, \ .platform_data = &_name##_id##_data, \ - .coherent_dma_mask = DMA_32BIT_MASK, \ + .coherent_dma_mask = DMA_BIT_MASK(32), \ }, \ .resource = _name##_id##_resource, \ .num_resources = ARRAY_SIZE(_name##_id##_resource), \ diff --git a/arch/ia64/kernel/pci-dma.c b/arch/ia64/kernel/pci-dma.c index 8f34f3ddb83b..f82b0ee6bb1f 100644 --- a/arch/ia64/kernel/pci-dma.c +++ b/arch/ia64/kernel/pci-dma.c @@ -37,7 +37,7 @@ int force_iommu __read_mostly; to i386. 
*/ struct device fallback_dev = { .init_name = "fallback device", - .coherent_dma_mask = DMA_32BIT_MASK, + .coherent_dma_mask = DMA_BIT_MASK(32), .dma_mask = &fallback_dev.coherent_dma_mask, }; diff --git a/arch/mips/alchemy/common/platform.c b/arch/mips/alchemy/common/platform.c index 5c76c6448e04..117f99f70649 100644 --- a/arch/mips/alchemy/common/platform.c +++ b/arch/mips/alchemy/common/platform.c @@ -80,14 +80,14 @@ static struct resource au1xxx_usb_ohci_resources[] = { }; /* The dmamask must be set for OHCI to work */ -static u64 ohci_dmamask = DMA_32BIT_MASK; +static u64 ohci_dmamask = DMA_BIT_MASK(32); static struct platform_device au1xxx_usb_ohci_device = { .name = "au1xxx-ohci", .id = 0, .dev = { .dma_mask = &ohci_dmamask, - .coherent_dma_mask = DMA_32BIT_MASK, + .coherent_dma_mask = DMA_BIT_MASK(32), }, .num_resources = ARRAY_SIZE(au1xxx_usb_ohci_resources), .resource = au1xxx_usb_ohci_resources, @@ -109,14 +109,14 @@ static struct resource au1100_lcd_resources[] = { } }; -static u64 au1100_lcd_dmamask = DMA_32BIT_MASK; +static u64 au1100_lcd_dmamask = DMA_BIT_MASK(32); static struct platform_device au1100_lcd_device = { .name = "au1100-lcd", .id = 0, .dev = { .dma_mask = &au1100_lcd_dmamask, - .coherent_dma_mask = DMA_32BIT_MASK, + .coherent_dma_mask = DMA_BIT_MASK(32), }, .num_resources = ARRAY_SIZE(au1100_lcd_resources), .resource = au1100_lcd_resources, @@ -138,14 +138,14 @@ static struct resource au1xxx_usb_ehci_resources[] = { }, }; -static u64 ehci_dmamask = DMA_32BIT_MASK; +static u64 ehci_dmamask = DMA_BIT_MASK(32); static struct platform_device au1xxx_usb_ehci_device = { .name = "au1xxx-ehci", .id = 0, .dev = { .dma_mask = &ehci_dmamask, - .coherent_dma_mask = DMA_32BIT_MASK, + .coherent_dma_mask = DMA_BIT_MASK(32), }, .num_resources = ARRAY_SIZE(au1xxx_usb_ehci_resources), .resource = au1xxx_usb_ehci_resources, @@ -165,14 +165,14 @@ static struct resource au1xxx_usb_gdt_resources[] = { }, }; -static u64 udc_dmamask = DMA_32BIT_MASK; +static u64 udc_dmamask = DMA_BIT_MASK(32); static struct platform_device au1xxx_usb_gdt_device = { .name = "au1xxx-udc", .id = 0, .dev = { .dma_mask = &udc_dmamask, - .coherent_dma_mask = DMA_32BIT_MASK, + .coherent_dma_mask = DMA_BIT_MASK(32), }, .num_resources = ARRAY_SIZE(au1xxx_usb_gdt_resources), .resource = au1xxx_usb_gdt_resources, @@ -192,14 +192,14 @@ static struct resource au1xxx_usb_otg_resources[] = { }, }; -static u64 uoc_dmamask = DMA_32BIT_MASK; +static u64 uoc_dmamask = DMA_BIT_MASK(32); static struct platform_device au1xxx_usb_otg_device = { .name = "au1xxx-uoc", .id = 0, .dev = { .dma_mask = &uoc_dmamask, - .coherent_dma_mask = DMA_32BIT_MASK, + .coherent_dma_mask = DMA_BIT_MASK(32), }, .num_resources = ARRAY_SIZE(au1xxx_usb_otg_resources), .resource = au1xxx_usb_otg_resources, @@ -218,20 +218,20 @@ static struct resource au1200_lcd_resources[] = { } }; -static u64 au1200_lcd_dmamask = DMA_32BIT_MASK; +static u64 au1200_lcd_dmamask = DMA_BIT_MASK(32); static struct platform_device au1200_lcd_device = { .name = "au1200-lcd", .id = 0, .dev = { .dma_mask = &au1200_lcd_dmamask, - .coherent_dma_mask = DMA_32BIT_MASK, + .coherent_dma_mask = DMA_BIT_MASK(32), }, .num_resources = ARRAY_SIZE(au1200_lcd_resources), .resource = au1200_lcd_resources, }; -static u64 au1xxx_mmc_dmamask = DMA_32BIT_MASK; +static u64 au1xxx_mmc_dmamask = DMA_BIT_MASK(32); extern struct au1xmmc_platform_data au1xmmc_platdata[2]; @@ -263,7 +263,7 @@ static struct platform_device au1200_mmc0_device = { .id = 0, .dev = { .dma_mask = &au1xxx_mmc_dmamask, - 
.coherent_dma_mask = DMA_32BIT_MASK, + .coherent_dma_mask = DMA_BIT_MASK(32), .platform_data = &au1xmmc_platdata[0], }, .num_resources = ARRAY_SIZE(au1200_mmc0_resources), @@ -299,7 +299,7 @@ static struct platform_device au1200_mmc1_device = { .id = 1, .dev = { .dma_mask = &au1xxx_mmc_dmamask, - .coherent_dma_mask = DMA_32BIT_MASK, + .coherent_dma_mask = DMA_BIT_MASK(32), .platform_data = &au1xmmc_platdata[1], }, .num_resources = ARRAY_SIZE(au1200_mmc1_resources), diff --git a/arch/mips/alchemy/devboards/pb1200/platform.c b/arch/mips/alchemy/devboards/pb1200/platform.c index 0d68e1985ffd..b93dff4a6789 100644 --- a/arch/mips/alchemy/devboards/pb1200/platform.c +++ b/arch/mips/alchemy/devboards/pb1200/platform.c @@ -119,14 +119,14 @@ static struct resource ide_resources[] = { } }; -static u64 ide_dmamask = DMA_32BIT_MASK; +static u64 ide_dmamask = DMA_BIT_MASK(32); static struct platform_device ide_device = { .name = "au1200-ide", .id = 0, .dev = { .dma_mask = &ide_dmamask, - .coherent_dma_mask = DMA_32BIT_MASK, + .coherent_dma_mask = DMA_BIT_MASK(32), }, .num_resources = ARRAY_SIZE(ide_resources), .resource = ide_resources diff --git a/arch/mips/nxp/pnx833x/common/platform.c b/arch/mips/nxp/pnx833x/common/platform.c index b1ccbcc18f78..01f8345a2069 100644 --- a/arch/mips/nxp/pnx833x/common/platform.c +++ b/arch/mips/nxp/pnx833x/common/platform.c @@ -42,7 +42,7 @@ #include #include -static u64 uart_dmamask = DMA_32BIT_MASK; +static u64 uart_dmamask = DMA_BIT_MASK(32); static struct resource pnx833x_uart_resources[] = { [0] = { @@ -101,14 +101,14 @@ static struct platform_device pnx833x_uart_device = { .id = -1, .dev = { .dma_mask = &uart_dmamask, - .coherent_dma_mask = DMA_32BIT_MASK, + .coherent_dma_mask = DMA_BIT_MASK(32), .platform_data = pnx8xxx_ports, }, .num_resources = ARRAY_SIZE(pnx833x_uart_resources), .resource = pnx833x_uart_resources, }; -static u64 ehci_dmamask = DMA_32BIT_MASK; +static u64 ehci_dmamask = DMA_BIT_MASK(32); static struct resource pnx833x_usb_ehci_resources[] = { [0] = { @@ -128,7 +128,7 @@ static struct platform_device pnx833x_usb_ehci_device = { .id = -1, .dev = { .dma_mask = &ehci_dmamask, - .coherent_dma_mask = DMA_32BIT_MASK, + .coherent_dma_mask = DMA_BIT_MASK(32), }, .num_resources = ARRAY_SIZE(pnx833x_usb_ehci_resources), .resource = pnx833x_usb_ehci_resources, @@ -198,7 +198,7 @@ static struct platform_device pnx833x_i2c1_device = { }; #endif -static u64 ethernet_dmamask = DMA_32BIT_MASK; +static u64 ethernet_dmamask = DMA_BIT_MASK(32); static struct resource pnx833x_ethernet_resources[] = { [0] = { @@ -218,7 +218,7 @@ static struct platform_device pnx833x_ethernet_device = { .id = -1, .dev = { .dma_mask = ðernet_dmamask, - .coherent_dma_mask = DMA_32BIT_MASK, + .coherent_dma_mask = DMA_BIT_MASK(32), }, .num_resources = ARRAY_SIZE(pnx833x_ethernet_resources), .resource = pnx833x_ethernet_resources, diff --git a/arch/mips/nxp/pnx8550/common/platform.c b/arch/mips/nxp/pnx8550/common/platform.c index 21d2955359b3..5264cc09a27b 100644 --- a/arch/mips/nxp/pnx8550/common/platform.c +++ b/arch/mips/nxp/pnx8550/common/platform.c @@ -92,16 +92,16 @@ struct pnx8xxx_port pnx8xxx_ports[] = { }; /* The dmamask must be set for OHCI to work */ -static u64 ohci_dmamask = DMA_32BIT_MASK; +static u64 ohci_dmamask = DMA_BIT_MASK(32); -static u64 uart_dmamask = DMA_32BIT_MASK; +static u64 uart_dmamask = DMA_BIT_MASK(32); static struct platform_device pnx8550_usb_ohci_device = { .name = "pnx8550-ohci", .id = -1, .dev = { .dma_mask = &ohci_dmamask, - .coherent_dma_mask = 
DMA_32BIT_MASK, + .coherent_dma_mask = DMA_BIT_MASK(32), }, .num_resources = ARRAY_SIZE(pnx8550_usb_ohci_resources), .resource = pnx8550_usb_ohci_resources, @@ -112,7 +112,7 @@ static struct platform_device pnx8550_uart_device = { .id = -1, .dev = { .dma_mask = &uart_dmamask, - .coherent_dma_mask = DMA_32BIT_MASK, + .coherent_dma_mask = DMA_BIT_MASK(32), .platform_data = pnx8xxx_ports, }, .num_resources = ARRAY_SIZE(pnx8550_uart_resources), diff --git a/arch/mips/pmc-sierra/msp71xx/msp_usb.c b/arch/mips/pmc-sierra/msp71xx/msp_usb.c index f7ca4f582331..0ee01e359dd8 100644 --- a/arch/mips/pmc-sierra/msp71xx/msp_usb.c +++ b/arch/mips/pmc-sierra/msp71xx/msp_usb.c @@ -49,14 +49,14 @@ static struct resource msp_usbhost_resources [] = { }, }; -static u64 msp_usbhost_dma_mask = DMA_32BIT_MASK; +static u64 msp_usbhost_dma_mask = DMA_BIT_MASK(32); static struct platform_device msp_usbhost_device = { .name = "pmcmsp-ehci", .id = 0, .dev = { .dma_mask = &msp_usbhost_dma_mask, - .coherent_dma_mask = DMA_32BIT_MASK, + .coherent_dma_mask = DMA_BIT_MASK(32), }, .num_resources = ARRAY_SIZE(msp_usbhost_resources), .resource = msp_usbhost_resources, @@ -77,14 +77,14 @@ static struct resource msp_usbdev_resources [] = { }, }; -static u64 msp_usbdev_dma_mask = DMA_32BIT_MASK; +static u64 msp_usbdev_dma_mask = DMA_BIT_MASK(32); static struct platform_device msp_usbdev_device = { .name = "msp71xx_udc", .id = 0, .dev = { .dma_mask = &msp_usbdev_dma_mask, - .coherent_dma_mask = DMA_32BIT_MASK, + .coherent_dma_mask = DMA_BIT_MASK(32), }, .num_resources = ARRAY_SIZE(msp_usbdev_resources), .resource = msp_usbdev_resources, diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c index 1c5c8a6fc129..53c7788cba78 100644 --- a/arch/powerpc/kernel/dma.c +++ b/arch/powerpc/kernel/dma.c @@ -94,7 +94,7 @@ static int dma_direct_dma_supported(struct device *dev, u64 mask) * done via some global so platforms can set the limit in case * they have limited DMA windows */ - return mask >= DMA_32BIT_MASK; + return mask >= DMA_BIT_MASK(32); #else return 1; #endif diff --git a/arch/powerpc/kernel/of_platform.c b/arch/powerpc/kernel/of_platform.c index 3f37a6e62771..87df428e3588 100644 --- a/arch/powerpc/kernel/of_platform.c +++ b/arch/powerpc/kernel/of_platform.c @@ -76,7 +76,7 @@ struct of_device* of_platform_device_create(struct device_node *np, return NULL; dev->dma_mask = 0xffffffffUL; - dev->dev.coherent_dma_mask = DMA_32BIT_MASK; + dev->dev.coherent_dma_mask = DMA_BIT_MASK(32); dev->dev.bus = &of_platform_bus_type; diff --git a/arch/powerpc/platforms/iseries/iommu.c b/arch/powerpc/platforms/iseries/iommu.c index 6ed75bffc8ab..ff43f1fd8343 100644 --- a/arch/powerpc/platforms/iseries/iommu.c +++ b/arch/powerpc/platforms/iseries/iommu.c @@ -202,7 +202,7 @@ static struct iommu_table vio_iommu_table; void *iseries_hv_alloc(size_t size, dma_addr_t *dma_handle, gfp_t flag) { return iommu_alloc_coherent(NULL, &vio_iommu_table, size, dma_handle, - DMA_32BIT_MASK, flag, -1); + DMA_BIT_MASK(32), flag, -1); } EXPORT_SYMBOL_GPL(iseries_hv_alloc); @@ -217,7 +217,7 @@ dma_addr_t iseries_hv_map(void *vaddr, size_t size, { return iommu_map_page(NULL, &vio_iommu_table, virt_to_page(vaddr), (unsigned long)vaddr % PAGE_SIZE, size, - DMA_32BIT_MASK, direction, NULL); + DMA_BIT_MASK(32), direction, NULL); } void iseries_hv_unmap(dma_addr_t dma_handle, size_t size, diff --git a/arch/powerpc/platforms/ps3/system-bus.c b/arch/powerpc/platforms/ps3/system-bus.c index a705fffbb498..9a73d0238639 100644 --- 
a/arch/powerpc/platforms/ps3/system-bus.c +++ b/arch/powerpc/platforms/ps3/system-bus.c @@ -689,7 +689,7 @@ static void ps3_ioc0_unmap_sg(struct device *_dev, struct scatterlist *sg, static int ps3_dma_supported(struct device *_dev, u64 mask) { - return mask >= DMA_32BIT_MASK; + return mask >= DMA_BIT_MASK(32); } static struct dma_mapping_ops ps3_sb_dma_ops = { diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index cea7b74963e9..10a6be94eb77 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -238,7 +238,7 @@ static inline unsigned long dma_alloc_coherent_mask(struct device *dev, dma_mask = dev->coherent_dma_mask; if (!dma_mask) - dma_mask = (gfp & GFP_DMA) ? DMA_24BIT_MASK : DMA_32BIT_MASK; + dma_mask = (gfp & GFP_DMA) ? DMA_24BIT_MASK : DMA_BIT_MASK(32); return dma_mask; } @@ -250,7 +250,7 @@ static inline gfp_t dma_alloc_coherent_gfp_flags(struct device *dev, gfp_t gfp) if (dma_mask <= DMA_24BIT_MASK) gfp |= GFP_DMA; #ifdef CONFIG_X86_64 - if (dma_mask <= DMA_32BIT_MASK && !(gfp & GFP_DMA)) + if (dma_mask <= DMA_BIT_MASK(32) && !(gfp & GFP_DMA)) gfp |= GFP_DMA32; #endif return gfp; diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 0cf2d900422b..136a01d52db0 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -40,7 +40,7 @@ EXPORT_SYMBOL(bad_dma_address); to older i386. */ struct device x86_dma_fallback_dev = { .init_name = "fallback device", - .coherent_dma_mask = DMA_32BIT_MASK, + .coherent_dma_mask = DMA_BIT_MASK(32), .dma_mask = &x86_dma_fallback_dev.coherent_dma_mask, }; EXPORT_SYMBOL(x86_dma_fallback_dev); @@ -148,7 +148,7 @@ again: if (!is_buffer_dma_capable(dma_mask, addr, size)) { __free_pages(page, get_order(size)); - if (dma_mask < DMA_32BIT_MASK && !(flag & GFP_DMA)) { + if (dma_mask < DMA_BIT_MASK(32) && !(flag & GFP_DMA)) { flag = (flag & ~GFP_DMA32) | GFP_DMA; goto again; } diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index c6d703b39326..71d412a09f30 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -15,7 +15,7 @@ static int check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size) { if (hwdev && !is_buffer_dma_capable(*hwdev->dma_mask, bus, size)) { - if (*hwdev->dma_mask >= DMA_32BIT_MASK) + if (*hwdev->dma_mask >= DMA_BIT_MASK(32)) printk(KERN_ERR "nommu_%s: overflow %Lx+%zu of device mask %Lx\n", name, (long long)bus, size, diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c index 207d775c3c27..f75dac57dc2b 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -2408,7 +2408,7 @@ static int ahci_configure_dma_masks(struct pci_dev *pdev, int using_dac) !pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) { rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); if (rc) { - rc = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK); + rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); if (rc) { dev_printk(KERN_ERR, &pdev->dev, "64-bit DMA enable failed\n"); @@ -2416,13 +2416,13 @@ static int ahci_configure_dma_masks(struct pci_dev *pdev, int using_dac) } } } else { - rc = pci_set_dma_mask(pdev, DMA_32BIT_MASK); + rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (rc) { dev_printk(KERN_ERR, &pdev->dev, "32-bit DMA enable failed\n"); return rc; } - rc = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK); + rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); if (rc) { dev_printk(KERN_ERR, &pdev->dev, "32-bit consistent DMA enable failed\n"); diff --git a/drivers/ata/pata_cs5520.c 
b/drivers/ata/pata_cs5520.c index db6a96984f3f..0df83cf74233 100644 --- a/drivers/ata/pata_cs5520.c +++ b/drivers/ata/pata_cs5520.c @@ -203,11 +203,11 @@ static int __devinit cs5520_init_one(struct pci_dev *pdev, const struct pci_devi return -ENODEV; } - if (pci_set_dma_mask(pdev, DMA_32BIT_MASK)) { + if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) { printk(KERN_ERR DRV_NAME ": unable to configure DMA mask.\n"); return -ENODEV; } - if (pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK)) { + if (pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32))) { printk(KERN_ERR DRV_NAME ": unable to configure consistent DMA mask.\n"); return -ENODEV; } diff --git a/drivers/ata/pata_ixp4xx_cf.c b/drivers/ata/pata_ixp4xx_cf.c index 19fdecf319a6..ba54b089f98c 100644 --- a/drivers/ata/pata_ixp4xx_cf.c +++ b/drivers/ata/pata_ixp4xx_cf.c @@ -157,7 +157,7 @@ static __devinit int ixp4xx_pata_probe(struct platform_device *pdev) return -ENOMEM; /* acquire resources and fill host */ - pdev->dev.coherent_dma_mask = DMA_32BIT_MASK; + pdev->dev.coherent_dma_mask = DMA_BIT_MASK(32); data->cs0 = devm_ioremap(&pdev->dev, cs0->start, 0x1000); data->cs1 = devm_ioremap(&pdev->dev, cs1->start, 0x1000); diff --git a/drivers/ata/pdc_adma.c b/drivers/ata/pdc_adma.c index 39588178d028..6c65b0776a2c 100644 --- a/drivers/ata/pdc_adma.c +++ b/drivers/ata/pdc_adma.c @@ -607,13 +607,13 @@ static int adma_set_dma_masks(struct pci_dev *pdev, void __iomem *mmio_base) { int rc; - rc = pci_set_dma_mask(pdev, DMA_32BIT_MASK); + rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (rc) { dev_printk(KERN_ERR, &pdev->dev, "32-bit DMA enable failed\n"); return rc; } - rc = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK); + rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); if (rc) { dev_printk(KERN_ERR, &pdev->dev, "32-bit consistent DMA enable failed\n"); diff --git a/drivers/ata/sata_inic162x.c b/drivers/ata/sata_inic162x.c index 305a4f825f53..8d890cc5a7ee 100644 --- a/drivers/ata/sata_inic162x.c +++ b/drivers/ata/sata_inic162x.c @@ -861,14 +861,14 @@ static int inic_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) } /* Set dma_mask. This devices doesn't support 64bit addressing. 
*/ - rc = pci_set_dma_mask(pdev, DMA_32BIT_MASK); + rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (rc) { dev_printk(KERN_ERR, &pdev->dev, "32-bit DMA enable failed\n"); return rc; } - rc = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK); + rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); if (rc) { dev_printk(KERN_ERR, &pdev->dev, "32-bit consistent DMA enable failed\n"); diff --git a/drivers/ata/sata_mv.c b/drivers/ata/sata_mv.c index 45e0fe191afc..5af3ea19d3c5 100644 --- a/drivers/ata/sata_mv.c +++ b/drivers/ata/sata_mv.c @@ -3916,7 +3916,7 @@ static int pci_go_64(struct pci_dev *pdev) if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) { rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); if (rc) { - rc = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK); + rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); if (rc) { dev_printk(KERN_ERR, &pdev->dev, "64-bit DMA enable failed\n"); @@ -3924,13 +3924,13 @@ static int pci_go_64(struct pci_dev *pdev) } } } else { - rc = pci_set_dma_mask(pdev, DMA_32BIT_MASK); + rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (rc) { dev_printk(KERN_ERR, &pdev->dev, "32-bit DMA enable failed\n"); return rc; } - rc = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK); + rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); if (rc) { dev_printk(KERN_ERR, &pdev->dev, "32-bit consistent DMA enable failed\n"); diff --git a/drivers/ata/sata_qstor.c b/drivers/ata/sata_qstor.c index 7b37c27d7972..326c0cfc29b3 100644 --- a/drivers/ata/sata_qstor.c +++ b/drivers/ata/sata_qstor.c @@ -587,7 +587,7 @@ static int qs_set_dma_masks(struct pci_dev *pdev, void __iomem *mmio_base) !pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) { rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); if (rc) { - rc = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK); + rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); if (rc) { dev_printk(KERN_ERR, &pdev->dev, "64-bit DMA enable failed\n"); @@ -595,13 +595,13 @@ static int qs_set_dma_masks(struct pci_dev *pdev, void __iomem *mmio_base) } } } else { - rc = pci_set_dma_mask(pdev, DMA_32BIT_MASK); + rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (rc) { dev_printk(KERN_ERR, &pdev->dev, "32-bit DMA enable failed\n"); return rc; } - rc = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK); + rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); if (rc) { dev_printk(KERN_ERR, &pdev->dev, "32-bit consistent DMA enable failed\n"); diff --git a/drivers/ata/sata_sil24.c b/drivers/ata/sata_sil24.c index 37730bc2f09f..77aa8d7ecec4 100644 --- a/drivers/ata/sata_sil24.c +++ b/drivers/ata/sata_sil24.c @@ -1300,7 +1300,7 @@ static int sil24_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) { rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); if (rc) { - rc = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK); + rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); if (rc) { dev_printk(KERN_ERR, &pdev->dev, "64-bit DMA enable failed\n"); @@ -1308,13 +1308,13 @@ static int sil24_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) } } } else { - rc = pci_set_dma_mask(pdev, DMA_32BIT_MASK); + rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (rc) { dev_printk(KERN_ERR, &pdev->dev, "32-bit DMA enable failed\n"); return rc; } - rc = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK); + rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); if (rc) { dev_printk(KERN_ERR, &pdev->dev, "32-bit consistent DMA enable failed\n"); diff --git 
a/drivers/ata/sata_vsc.c b/drivers/ata/sata_vsc.c index ed70bd28fa2c..8b2a278b2547 100644 --- a/drivers/ata/sata_vsc.c +++ b/drivers/ata/sata_vsc.c @@ -399,10 +399,10 @@ static int __devinit vsc_sata_init_one(struct pci_dev *pdev, /* * Use 32 bit DMA mask, because 64 bit address support is poor. */ - rc = pci_set_dma_mask(pdev, DMA_32BIT_MASK); + rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (rc) return rc; - rc = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK); + rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); if (rc) return rc; diff --git a/drivers/atm/he.c b/drivers/atm/he.c index bdbad7edf682..2de64065aa1b 100644 --- a/drivers/atm/he.c +++ b/drivers/atm/he.c @@ -358,7 +358,7 @@ he_init_one(struct pci_dev *pci_dev, const struct pci_device_id *pci_ent) if (pci_enable_device(pci_dev)) return -EIO; - if (pci_set_dma_mask(pci_dev, DMA_32BIT_MASK) != 0) { + if (pci_set_dma_mask(pci_dev, DMA_BIT_MASK(32)) != 0) { printk(KERN_WARNING "he: no suitable dma available\n"); err = -EIO; goto init_one_failure; diff --git a/drivers/atm/lanai.c b/drivers/atm/lanai.c index 8733a2ea04c2..cf97c34cbaf1 100644 --- a/drivers/atm/lanai.c +++ b/drivers/atm/lanai.c @@ -1957,12 +1957,12 @@ static int __devinit lanai_pci_start(struct lanai_dev *lanai) return -ENXIO; } pci_set_master(pci); - if (pci_set_dma_mask(pci, DMA_32BIT_MASK) != 0) { + if (pci_set_dma_mask(pci, DMA_BIT_MASK(32)) != 0) { printk(KERN_WARNING DEV_LABEL "(itf %d): No suitable DMA available.\n", lanai->number); return -EBUSY; } - if (pci_set_consistent_dma_mask(pci, DMA_32BIT_MASK) != 0) { + if (pci_set_consistent_dma_mask(pci, DMA_BIT_MASK(32)) != 0) { printk(KERN_WARNING DEV_LABEL "(itf %d): No suitable DMA available.\n", lanai->number); return -EBUSY; diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c index 5496865b297e..f22ed6cc69f2 100644 --- a/drivers/block/DAC960.c +++ b/drivers/block/DAC960.c @@ -1169,9 +1169,9 @@ static bool DAC960_V1_EnableMemoryMailboxInterface(DAC960_Controller_T int i; - if (pci_set_dma_mask(Controller->PCIDevice, DMA_32BIT_MASK)) + if (pci_set_dma_mask(Controller->PCIDevice, DMA_BIT_MASK(32))) return DAC960_Failure(Controller, "DMA mask out of range"); - Controller->BounceBufferLimit = DMA_32BIT_MASK; + Controller->BounceBufferLimit = DMA_BIT_MASK(32); if ((hw_type == DAC960_PD_Controller) || (hw_type == DAC960_P_Controller)) { CommandMailboxesSize = 0; @@ -1374,8 +1374,8 @@ static bool DAC960_V2_EnableMemoryMailboxInterface(DAC960_Controller_T if (!pci_set_dma_mask(Controller->PCIDevice, DMA_BIT_MASK(64))) Controller->BounceBufferLimit = DMA_BIT_MASK(64); - else if (!pci_set_dma_mask(Controller->PCIDevice, DMA_32BIT_MASK)) - Controller->BounceBufferLimit = DMA_32BIT_MASK; + else if (!pci_set_dma_mask(Controller->PCIDevice, DMA_BIT_MASK(32))) + Controller->BounceBufferLimit = DMA_BIT_MASK(32); else return DAC960_Failure(Controller, "DMA mask out of range"); diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 149a611e8fe5..a6c55432819b 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -3639,7 +3639,7 @@ static int __devinit cciss_init_one(struct pci_dev *pdev, /* configure PCI DMA stuff */ if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) dac = 1; - else if (!pci_set_dma_mask(pdev, DMA_32BIT_MASK)) + else if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) dac = 0; else { printk(KERN_ERR "cciss: no suitable DMA available\n"); diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c index db5783ace1e4..ff0448e4bf03 100644 --- a/drivers/block/sx8.c +++ b/drivers/block/sx8.c 
@@ -1597,7 +1597,7 @@ static int carm_init_one (struct pci_dev *pdev, const struct pci_device_id *ent) pci_dac = 1; } else { #endif - rc = pci_set_dma_mask(pdev, DMA_32BIT_MASK); + rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (rc) { printk(KERN_ERR DRV_NAME "(%s): DMA mask failure\n", pci_name(pdev)); diff --git a/drivers/block/umem.c b/drivers/block/umem.c index e93e99c9103c..9744d59a69f2 100644 --- a/drivers/block/umem.c +++ b/drivers/block/umem.c @@ -830,7 +830,7 @@ static int __devinit mm_pci_probe(struct pci_dev *dev, "Micro Memory(tm) controller found (PCI Mem Module (Battery Backup))\n"); if (pci_set_dma_mask(dev, DMA_BIT_MASK(64)) && - pci_set_dma_mask(dev, DMA_32BIT_MASK)) { + pci_set_dma_mask(dev, DMA_BIT_MASK(32))) { dev_printk(KERN_WARNING, &dev->dev, "NO suitable DMA found\n"); return -ENOMEM; } diff --git a/drivers/crypto/hifn_795x.c b/drivers/crypto/hifn_795x.c index 4d85402a9e4a..2bef086fb342 100644 --- a/drivers/crypto/hifn_795x.c +++ b/drivers/crypto/hifn_795x.c @@ -2575,7 +2575,7 @@ static int hifn_probe(struct pci_dev *pdev, const struct pci_device_id *id) return err; pci_set_master(pdev); - err = pci_set_dma_mask(pdev, DMA_32BIT_MASK); + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (err) goto err_out_disable_pci_device; diff --git a/drivers/crypto/ixp4xx_crypto.c b/drivers/crypto/ixp4xx_crypto.c index af9761ccf9f1..f9f05d7a707d 100644 --- a/drivers/crypto/ixp4xx_crypto.c +++ b/drivers/crypto/ixp4xx_crypto.c @@ -209,7 +209,7 @@ static struct platform_device pseudo_dev = { .id = 0, .num_resources = 0, .dev = { - .coherent_dma_mask = DMA_32BIT_MASK, + .coherent_dma_mask = DMA_BIT_MASK(32), .release = dev_release, } }; diff --git a/drivers/dma/ioat.c b/drivers/dma/ioat.c index 22e9f1911111..2225bb6ba3d1 100644 --- a/drivers/dma/ioat.c +++ b/drivers/dma/ioat.c @@ -100,13 +100,13 @@ static int __devinit ioat_probe(struct pci_dev *pdev, err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); if (err) - err = pci_set_dma_mask(pdev, DMA_32BIT_MASK); + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (err) goto err_set_dma_mask; err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); if (err) - err = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK); + err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); if (err) goto err_set_dma_mask; diff --git a/drivers/firmware/dcdbas.c b/drivers/firmware/dcdbas.c index 3009e0171e54..18d65fb42ee7 100644 --- a/drivers/firmware/dcdbas.c +++ b/drivers/firmware/dcdbas.c @@ -545,7 +545,7 @@ static int __devinit dcdbas_probe(struct platform_device *dev) * BIOS SMI calls require buffer addresses be in 32-bit address space. * This is done by setting the DMA mask below. 
*/ - dcdbas_pdev->dev.coherent_dma_mask = DMA_32BIT_MASK; + dcdbas_pdev->dev.coherent_dma_mask = DMA_BIT_MASK(32); dcdbas_pdev->dev.dma_mask = &dcdbas_pdev->dev.coherent_dma_mask; error = sysfs_create_group(&dev->dev.kobj, &dcdbas_attr_group); diff --git a/drivers/ide/cs5520.c b/drivers/ide/cs5520.c index 58fb90e5b763..87987a7d36c9 100644 --- a/drivers/ide/cs5520.c +++ b/drivers/ide/cs5520.c @@ -122,7 +122,7 @@ static int __devinit cs5520_init_one(struct pci_dev *dev, const struct pci_devic return -ENODEV; } pci_set_master(dev); - if (pci_set_dma_mask(dev, DMA_32BIT_MASK)) { + if (pci_set_dma_mask(dev, DMA_BIT_MASK(32))) { printk(KERN_WARNING "%s: No suitable DMA available.\n", d->name); return -ENODEV; diff --git a/drivers/ide/setup-pci.c b/drivers/ide/setup-pci.c index a19dbccd7617..7a3a12d6e638 100644 --- a/drivers/ide/setup-pci.c +++ b/drivers/ide/setup-pci.c @@ -208,7 +208,7 @@ static int ide_pci_enable(struct pci_dev *dev, const struct ide_port_info *d) * a DMA mask field to the struct ide_port_info if we need it * (or let lower level driver set the DMA mask) */ - ret = pci_set_dma_mask(dev, DMA_32BIT_MASK); + ret = pci_set_dma_mask(dev, DMA_BIT_MASK(32)); if (ret < 0) { printk(KERN_ERR "%s %s: can't set DMA mask\n", d->name, pci_name(dev)); diff --git a/drivers/ieee1394/pcilynx.c b/drivers/ieee1394/pcilynx.c index 38f712036201..9555fd253865 100644 --- a/drivers/ieee1394/pcilynx.c +++ b/drivers/ieee1394/pcilynx.c @@ -1171,7 +1171,7 @@ static int __devinit add_card(struct pci_dev *dev, error = -ENXIO; - if (pci_set_dma_mask(dev, DMA_32BIT_MASK)) + if (pci_set_dma_mask(dev, DMA_BIT_MASK(32))) FAIL("DMA address limits not supported for PCILynx hardware"); if (pci_enable_device(dev)) FAIL("failed to enable PCILynx hardware"); diff --git a/drivers/infiniband/hw/amso1100/c2.c b/drivers/infiniband/hw/amso1100/c2.c index 54d2e0760abf..0cfbb6d2f762 100644 --- a/drivers/infiniband/hw/amso1100/c2.c +++ b/drivers/infiniband/hw/amso1100/c2.c @@ -995,7 +995,7 @@ static int __devinit c2_probe(struct pci_dev *pcidev, goto bail2; } } else { - ret = pci_set_dma_mask(pcidev, DMA_32BIT_MASK); + ret = pci_set_dma_mask(pcidev, DMA_BIT_MASK(32)); if (ret < 0) { printk(KERN_ERR PFX "32b DMA configuration failed\n"); goto bail2; diff --git a/drivers/infiniband/hw/ipath/ipath_driver.c b/drivers/infiniband/hw/ipath/ipath_driver.c index 77b2fb5b7c35..04e88b600558 100644 --- a/drivers/infiniband/hw/ipath/ipath_driver.c +++ b/drivers/infiniband/hw/ipath/ipath_driver.c @@ -477,7 +477,7 @@ static int __devinit ipath_init_one(struct pci_dev *pdev, * do not setup 64 bit maps on systems with 2GB or less * memory installed. 
*/ - ret = pci_set_dma_mask(pdev, DMA_32BIT_MASK); + ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (ret) { dev_info(&pdev->dev, "Unable to set DMA mask for unit %u: %d\n", @@ -486,7 +486,7 @@ static int __devinit ipath_init_one(struct pci_dev *pdev, } else { ipath_dbg("No 64bit DMA mask, used 32 bit mask\n"); - ret = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK); + ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); if (ret) dev_info(&pdev->dev, "Unable to set DMA consistent mask " diff --git a/drivers/infiniband/hw/mthca/mthca_main.c b/drivers/infiniband/hw/mthca/mthca_main.c index 5d234204f7b7..1d83cf7caf38 100644 --- a/drivers/infiniband/hw/mthca/mthca_main.c +++ b/drivers/infiniband/hw/mthca/mthca_main.c @@ -1019,7 +1019,7 @@ static int __mthca_init_one(struct pci_dev *pdev, int hca_type) err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); if (err) { dev_warn(&pdev->dev, "Warning: couldn't set 64-bit PCI DMA mask.\n"); - err = pci_set_dma_mask(pdev, DMA_32BIT_MASK); + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (err) { dev_err(&pdev->dev, "Can't set PCI DMA mask, aborting.\n"); goto err_free_res; @@ -1029,7 +1029,7 @@ static int __mthca_init_one(struct pci_dev *pdev, int hca_type) if (err) { dev_warn(&pdev->dev, "Warning: couldn't set 64-bit " "consistent PCI DMA mask.\n"); - err = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK); + err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); if (err) { dev_err(&pdev->dev, "Can't set consistent PCI DMA mask, " "aborting.\n"); diff --git a/drivers/infiniband/hw/nes/nes.c b/drivers/infiniband/hw/nes/nes.c index 7446810446e1..cbde0cfe27e0 100644 --- a/drivers/infiniband/hw/nes/nes.c +++ b/drivers/infiniband/hw/nes/nes.c @@ -489,12 +489,12 @@ static int __devinit nes_probe(struct pci_dev *pcidev, const struct pci_device_i goto bail2; } } else { - ret = pci_set_dma_mask(pcidev, DMA_32BIT_MASK); + ret = pci_set_dma_mask(pcidev, DMA_BIT_MASK(32)); if (ret < 0) { printk(KERN_ERR PFX "32b DMA mask configuration failed\n"); goto bail2; } - ret = pci_set_consistent_dma_mask(pcidev, DMA_32BIT_MASK); + ret = pci_set_consistent_dma_mask(pcidev, DMA_BIT_MASK(32)); if (ret) { printk(KERN_ERR PFX "32b DMA consistent mask configuration failed\n"); goto bail2; diff --git a/drivers/media/dvb/dm1105/dm1105.c b/drivers/media/dvb/dm1105/dm1105.c index 5b20cf5a29f0..971a8b18f6dd 100644 --- a/drivers/media/dvb/dm1105/dm1105.c +++ b/drivers/media/dvb/dm1105/dm1105.c @@ -662,7 +662,7 @@ static int __devinit dm1105_probe(struct pci_dev *pdev, if (ret < 0) goto err_kfree; - ret = pci_set_dma_mask(pdev, DMA_32BIT_MASK); + ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (ret < 0) goto err_pci_disable_device; diff --git a/drivers/media/dvb/pluto2/pluto2.c b/drivers/media/dvb/pluto2/pluto2.c index ee89623be85a..598eaf8acc6e 100644 --- a/drivers/media/dvb/pluto2/pluto2.c +++ b/drivers/media/dvb/pluto2/pluto2.c @@ -616,7 +616,7 @@ static int __devinit pluto2_probe(struct pci_dev *pdev, /* enable interrupts */ pci_write_config_dword(pdev, 0x6c, 0x8000); - ret = pci_set_dma_mask(pdev, DMA_32BIT_MASK); + ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (ret < 0) goto err_pci_disable_device; diff --git a/drivers/media/video/bt8xx/bttv-driver.c b/drivers/media/video/bt8xx/bttv-driver.c index 74f619d6cc93..23b7499b3185 100644 --- a/drivers/media/video/bt8xx/bttv-driver.c +++ b/drivers/media/video/bt8xx/bttv-driver.c @@ -4317,7 +4317,7 @@ static int __devinit bttv_probe(struct pci_dev *dev, btv->c.nr); return -EIO; } - if (pci_set_dma_mask(dev, 
DMA_32BIT_MASK)) { + if (pci_set_dma_mask(dev, DMA_BIT_MASK(32))) { printk(KERN_WARNING "bttv%d: No suitable DMA available.\n", btv->c.nr); return -EIO; diff --git a/drivers/media/video/cx88/cx88-alsa.c b/drivers/media/video/cx88/cx88-alsa.c index ce98d955231a..0ccdf36626e3 100644 --- a/drivers/media/video/cx88/cx88-alsa.c +++ b/drivers/media/video/cx88/cx88-alsa.c @@ -745,7 +745,7 @@ static int __devinit snd_cx88_create(struct snd_card *card, return err; } - if (!pci_dma_supported(pci,DMA_32BIT_MASK)) { + if (!pci_dma_supported(pci,DMA_BIT_MASK(32))) { dprintk(0, "%s/1: Oops: no 32bit PCI DMA ???\n",core->name); err = -EIO; cx88_core_put(core,pci); diff --git a/drivers/media/video/cx88/cx88-mpeg.c b/drivers/media/video/cx88/cx88-mpeg.c index b295b76737e3..da4e3912cd37 100644 --- a/drivers/media/video/cx88/cx88-mpeg.c +++ b/drivers/media/video/cx88/cx88-mpeg.c @@ -455,7 +455,7 @@ static int cx8802_init_common(struct cx8802_dev *dev) if (pci_enable_device(dev->pci)) return -EIO; pci_set_master(dev->pci); - if (!pci_dma_supported(dev->pci,DMA_32BIT_MASK)) { + if (!pci_dma_supported(dev->pci,DMA_BIT_MASK(32))) { printk("%s/2: Oops: no 32bit PCI DMA ???\n",dev->core->name); return -EIO; } diff --git a/drivers/media/video/cx88/cx88-video.c b/drivers/media/video/cx88/cx88-video.c index ec0425d9043a..b993d42fe73c 100644 --- a/drivers/media/video/cx88/cx88-video.c +++ b/drivers/media/video/cx88/cx88-video.c @@ -1832,7 +1832,7 @@ static int __devinit cx8800_initdev(struct pci_dev *pci_dev, dev->pci_lat,(unsigned long long)pci_resource_start(pci_dev,0)); pci_set_master(pci_dev); - if (!pci_dma_supported(pci_dev,DMA_32BIT_MASK)) { + if (!pci_dma_supported(pci_dev,DMA_BIT_MASK(32))) { printk("%s/0: Oops: no 32bit PCI DMA ???\n",core->name); err = -EIO; goto fail_core; diff --git a/drivers/media/video/meye.c b/drivers/media/video/meye.c index 2ad11f0999c6..1d66855a379a 100644 --- a/drivers/media/video/meye.c +++ b/drivers/media/video/meye.c @@ -117,7 +117,7 @@ static int ptable_alloc(void) memset(meye.mchip_ptable, 0, sizeof(meye.mchip_ptable)); /* give only 32 bit DMA addresses */ - if (dma_set_mask(&meye.mchip_dev->dev, DMA_32BIT_MASK)) + if (dma_set_mask(&meye.mchip_dev->dev, DMA_BIT_MASK(32))) return -1; meye.mchip_ptable_toc = dma_alloc_coherent(&meye.mchip_dev->dev, diff --git a/drivers/media/video/saa7134/saa7134-core.c b/drivers/media/video/saa7134/saa7134-core.c index 0bb09f1723d1..2def6fec814b 100644 --- a/drivers/media/video/saa7134/saa7134-core.c +++ b/drivers/media/video/saa7134/saa7134-core.c @@ -911,7 +911,7 @@ static int __devinit saa7134_initdev(struct pci_dev *pci_dev, pci_name(pci_dev), dev->pci_rev, pci_dev->irq, dev->pci_lat,(unsigned long long)pci_resource_start(pci_dev,0)); pci_set_master(pci_dev); - if (!pci_dma_supported(pci_dev, DMA_32BIT_MASK)) { + if (!pci_dma_supported(pci_dev, DMA_BIT_MASK(32))) { printk("%s: Oops: no 32bit PCI DMA ???\n",dev->name); err = -EIO; goto fail1; diff --git a/drivers/memstick/host/jmb38x_ms.c b/drivers/memstick/host/jmb38x_ms.c index 2fb95a5b72eb..f4a162a4bece 100644 --- a/drivers/memstick/host/jmb38x_ms.c +++ b/drivers/memstick/host/jmb38x_ms.c @@ -877,7 +877,7 @@ static int jmb38x_ms_probe(struct pci_dev *pdev, int pci_dev_busy = 0; int rc, cnt; - rc = pci_set_dma_mask(pdev, DMA_32BIT_MASK); + rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (rc) return rc; diff --git a/drivers/message/fusion/mptbase.c b/drivers/message/fusion/mptbase.c index 98026016a935..d0d126c69354 100644 --- a/drivers/message/fusion/mptbase.c +++ 
b/drivers/message/fusion/mptbase.c @@ -1539,8 +1539,8 @@ mpt_mapresources(MPT_ADAPTER *ioc) dinitprintk(ioc, printk(MYIOC_s_INFO_FMT ": 64 BIT PCI BUS DMA ADDRESSING SUPPORTED\n", ioc->name)); - } else if (!pci_set_dma_mask(pdev, DMA_32BIT_MASK) - && !pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK)) { + } else if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(32)) + && !pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32))) { dinitprintk(ioc, printk(MYIOC_s_INFO_FMT ": 32 BIT PCI BUS DMA ADDRESSING SUPPORTED\n", ioc->name)); diff --git a/drivers/message/i2o/memory.c b/drivers/message/i2o/memory.c index 9a08d8e45516..292b41e49fbd 100644 --- a/drivers/message/i2o/memory.c +++ b/drivers/message/i2o/memory.c @@ -187,7 +187,7 @@ int i2o_dma_alloc(struct device *dev, struct i2o_dma *addr, size_t len) mutex_lock(&mem_lock); if ((sizeof(dma_addr_t) > 4) && (pdev->dma_mask == DMA_BIT_MASK(64))) { dma_64 = 1; - if (pci_set_dma_mask(pdev, DMA_32BIT_MASK)) { + if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) { mutex_unlock(&mem_lock); return -ENOMEM; } diff --git a/drivers/message/i2o/pci.c b/drivers/message/i2o/pci.c index ed17ac5af3e0..35ba2ae38b42 100644 --- a/drivers/message/i2o/pci.c +++ b/drivers/message/i2o/pci.c @@ -334,7 +334,7 @@ static int __devinit i2o_pci_probe(struct pci_dev *pdev, return rc; } - if (pci_set_dma_mask(pdev, DMA_32BIT_MASK)) { + if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) { printk(KERN_WARNING "i2o: no suitable DMA found for %s\n", pci_name(pdev)); rc = -ENODEV; diff --git a/drivers/misc/tifm_7xx1.c b/drivers/misc/tifm_7xx1.c index be5672a98702..a6ef18259da0 100644 --- a/drivers/misc/tifm_7xx1.c +++ b/drivers/misc/tifm_7xx1.c @@ -324,7 +324,7 @@ static int tifm_7xx1_probe(struct pci_dev *dev, int pci_dev_busy = 0; int rc; - rc = pci_set_dma_mask(dev, DMA_32BIT_MASK); + rc = pci_set_dma_mask(dev, DMA_BIT_MASK(32)); if (rc) return rc; diff --git a/drivers/mmc/host/sdhci-pci.c b/drivers/mmc/host/sdhci-pci.c index 406da9a8d453..c5b316e22371 100644 --- a/drivers/mmc/host/sdhci-pci.c +++ b/drivers/mmc/host/sdhci-pci.c @@ -380,7 +380,7 @@ static int sdhci_pci_enable_dma(struct sdhci_host *host) "doesn't fully claim to support it.\n"); } - ret = pci_set_dma_mask(pdev, DMA_32BIT_MASK); + ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (ret) return ret; diff --git a/drivers/net/8139cp.c b/drivers/net/8139cp.c index d08475a7f676..02330f3d5a55 100644 --- a/drivers/net/8139cp.c +++ b/drivers/net/8139cp.c @@ -1530,7 +1530,7 @@ static void cp_get_ethtool_stats (struct net_device *dev, /* begin NIC statistics dump */ cpw32(StatsAddr + 4, (u64)dma >> 32); - cpw32(StatsAddr, ((u64)dma & DMA_32BIT_MASK) | DumpStats); + cpw32(StatsAddr, ((u64)dma & DMA_BIT_MASK(32)) | DumpStats); cpr32(StatsAddr); for (i = 0; i < 1000; i++) { @@ -1935,13 +1935,13 @@ static int cp_init_one (struct pci_dev *pdev, const struct pci_device_id *ent) } else { pci_using_dac = 0; - rc = pci_set_dma_mask(pdev, DMA_32BIT_MASK); + rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (rc) { dev_err(&pdev->dev, "No usable DMA configuration, aborting.\n"); goto err_out_res; } - rc = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK); + rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); if (rc) { dev_err(&pdev->dev, "No usable consistent DMA configuration, " diff --git a/drivers/net/acenic.c b/drivers/net/acenic.c index 9509c17b3667..57bc71527850 100644 --- a/drivers/net/acenic.c +++ b/drivers/net/acenic.c @@ -1163,7 +1163,7 @@ static int __devinit ace_init(struct net_device *dev) */ if (!pci_set_dma_mask(pdev, 
DMA_BIT_MASK(64))) { ap->pci_using_dac = 1; - } else if (!pci_set_dma_mask(pdev, DMA_32BIT_MASK)) { + } else if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) { ap->pci_using_dac = 0; } else { ecode = -ENODEV; diff --git a/drivers/net/amd8111e.c b/drivers/net/amd8111e.c index cb9c95d3ed0a..19831bd64016 100644 --- a/drivers/net/amd8111e.c +++ b/drivers/net/amd8111e.c @@ -1871,7 +1871,7 @@ static int __devinit amd8111e_probe_one(struct pci_dev *pdev, } /* Initialize DMA */ - if (pci_set_dma_mask(pdev, DMA_32BIT_MASK) < 0) { + if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32)) < 0) { printk(KERN_ERR "amd8111e: DMA not supported," "exiting.\n"); goto err_free_reg; diff --git a/drivers/net/atl1e/atl1e_main.c b/drivers/net/atl1e/atl1e_main.c index c758884728a5..fb57b750866b 100644 --- a/drivers/net/atl1e/atl1e_main.c +++ b/drivers/net/atl1e/atl1e_main.c @@ -2326,8 +2326,8 @@ static int __devinit atl1e_probe(struct pci_dev *pdev, * various kernel subsystems to support the mechanics required by a * fixed-high-32-bit system. */ - if ((pci_set_dma_mask(pdev, DMA_32BIT_MASK) != 0) || - (pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK) != 0)) { + if ((pci_set_dma_mask(pdev, DMA_BIT_MASK(32)) != 0) || + (pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)) != 0)) { dev_err(&pdev->dev, "No usable DMA configuration,aborting\n"); goto err_dma; } diff --git a/drivers/net/atlx/atl1.c b/drivers/net/atlx/atl1.c index 43fc1b2ca3cd..0ab22540bf59 100644 --- a/drivers/net/atlx/atl1.c +++ b/drivers/net/atlx/atl1.c @@ -2929,7 +2929,7 @@ static int __devinit atl1_probe(struct pci_dev *pdev, * various kernel subsystems to support the mechanics required by a * fixed-high-32-bit system. */ - err = pci_set_dma_mask(pdev, DMA_32BIT_MASK); + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (err) { dev_err(&pdev->dev, "no usable DMA configuration\n"); goto err_dma; diff --git a/drivers/net/atlx/atl2.c b/drivers/net/atlx/atl2.c index 9fe06c3f4097..c734b1983ec1 100644 --- a/drivers/net/atlx/atl2.c +++ b/drivers/net/atlx/atl2.c @@ -1358,8 +1358,8 @@ static int __devinit atl2_probe(struct pci_dev *pdev, * until the kernel has the proper infrastructure to support 64-bit DMA * on these devices. 
*/ - if (pci_set_dma_mask(pdev, DMA_32BIT_MASK) && - pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK)) { + if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32)) && + pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32))) { printk(KERN_ERR "atl2: No usable DMA configuration, aborting\n"); goto err_dma; } diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c index 5fd57703c65a..9d268be0b670 100644 --- a/drivers/net/bnx2.c +++ b/drivers/net/bnx2.c @@ -7538,7 +7538,7 @@ bnx2_init_board(struct pci_dev *pdev, struct net_device *dev) "pci_set_consistent_dma_mask failed, aborting.\n"); goto err_out_unmap; } - } else if ((rc = pci_set_dma_mask(pdev, DMA_32BIT_MASK)) != 0) { + } else if ((rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) != 0) { dev_err(&pdev->dev, "System does not support DMA, aborting.\n"); goto err_out_unmap; } diff --git a/drivers/net/bnx2x_main.c b/drivers/net/bnx2x_main.c index 03e01243b45e..ad5ef25add3e 100644 --- a/drivers/net/bnx2x_main.c +++ b/drivers/net/bnx2x_main.c @@ -10988,7 +10988,7 @@ static int __devinit bnx2x_init_dev(struct pci_dev *pdev, goto err_out_release; } - } else if (pci_set_dma_mask(pdev, DMA_32BIT_MASK) != 0) { + } else if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32)) != 0) { printk(KERN_ERR PFX "System does not support DMA," " aborting\n"); rc = -EIO; diff --git a/drivers/net/cassini.c b/drivers/net/cassini.c index c2895240e467..f5222764061c 100644 --- a/drivers/net/cassini.c +++ b/drivers/net/cassini.c @@ -5085,7 +5085,7 @@ static int __devinit cas_init_one(struct pci_dev *pdev, } } else { - err = pci_set_dma_mask(pdev, DMA_32BIT_MASK); + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (err) { dev_err(&pdev->dev, "No usable DMA configuration, " "aborting.\n"); diff --git a/drivers/net/chelsio/cxgb2.c b/drivers/net/chelsio/cxgb2.c index 57cfbc369f51..fa06994f9737 100644 --- a/drivers/net/chelsio/cxgb2.c +++ b/drivers/net/chelsio/cxgb2.c @@ -1066,7 +1066,7 @@ static int __devinit init_one(struct pci_dev *pdev, goto out_disable_pdev; } - } else if ((err = pci_set_dma_mask(pdev, DMA_32BIT_MASK)) != 0) { + } else if ((err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) != 0) { CH_ERR("%s: no usable DMA configuration\n", pci_name(pdev)); goto out_disable_pdev; } diff --git a/drivers/net/cxgb3/cxgb3_main.c b/drivers/net/cxgb3/cxgb3_main.c index ec35d3b82409..ab0e5febef83 100644 --- a/drivers/net/cxgb3/cxgb3_main.c +++ b/drivers/net/cxgb3/cxgb3_main.c @@ -3046,7 +3046,7 @@ static int __devinit init_one(struct pci_dev *pdev, "coherent allocations\n"); goto out_disable_device; } - } else if ((err = pci_set_dma_mask(pdev, DMA_32BIT_MASK)) != 0) { + } else if ((err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) != 0) { dev_err(&pdev->dev, "no usable DMA configuration\n"); goto out_disable_device; } diff --git a/drivers/net/e100.c b/drivers/net/e100.c index 0504db9ad643..5c0b457c7868 100644 --- a/drivers/net/e100.c +++ b/drivers/net/e100.c @@ -2604,7 +2604,7 @@ static int __devinit e100_probe(struct pci_dev *pdev, goto err_out_disable_pdev; } - if ((err = pci_set_dma_mask(pdev, DMA_32BIT_MASK))) { + if ((err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)))) { DPRINTK(PROBE, ERR, "No usable DMA configuration, aborting.\n"); goto err_out_free_res; } diff --git a/drivers/net/e1000/e1000_main.c b/drivers/net/e1000/e1000_main.c index caa71dffb3d0..ddc5c533e89c 100644 --- a/drivers/net/e1000/e1000_main.c +++ b/drivers/net/e1000/e1000_main.c @@ -966,9 +966,9 @@ static int __devinit e1000_probe(struct pci_dev *pdev, !pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64))) { pci_using_dac = 1; } else { 
- err = pci_set_dma_mask(pdev, DMA_32BIT_MASK); + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (err) { - err = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK); + err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); if (err) { E1000_ERR("No usable DMA configuration, " "aborting\n"); diff --git a/drivers/net/e1000e/netdev.c b/drivers/net/e1000e/netdev.c index 4a61160052a3..409b58cad0e5 100644 --- a/drivers/net/e1000e/netdev.c +++ b/drivers/net/e1000e/netdev.c @@ -2203,7 +2203,7 @@ static void e1000_configure_tx(struct e1000_adapter *adapter) /* Setup the HW Tx Head and Tail descriptor pointers */ tdba = tx_ring->dma; tdlen = tx_ring->count * sizeof(struct e1000_tx_desc); - ew32(TDBAL, (tdba & DMA_32BIT_MASK)); + ew32(TDBAL, (tdba & DMA_BIT_MASK(32))); ew32(TDBAH, (tdba >> 32)); ew32(TDLEN, tdlen); ew32(TDH, 0); @@ -2459,7 +2459,7 @@ static void e1000_configure_rx(struct e1000_adapter *adapter) * the Base and Length of the Rx Descriptor Ring */ rdba = rx_ring->dma; - ew32(RDBAL, (rdba & DMA_32BIT_MASK)); + ew32(RDBAL, (rdba & DMA_BIT_MASK(32))); ew32(RDBAH, (rdba >> 32)); ew32(RDLEN, rdlen); ew32(RDH, 0); @@ -4769,10 +4769,10 @@ static int __devinit e1000_probe(struct pci_dev *pdev, if (!err) pci_using_dac = 1; } else { - err = pci_set_dma_mask(pdev, DMA_32BIT_MASK); + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (err) { err = pci_set_consistent_dma_mask(pdev, - DMA_32BIT_MASK); + DMA_BIT_MASK(32)); if (err) { dev_err(&pdev->dev, "No usable DMA " "configuration, aborting\n"); diff --git a/drivers/net/enic/enic_main.c b/drivers/net/enic/enic_main.c index 5b87105ac9e0..9080f07da8fe 100644 --- a/drivers/net/enic/enic_main.c +++ b/drivers/net/enic/enic_main.c @@ -1687,13 +1687,13 @@ static int __devinit enic_probe(struct pci_dev *pdev, err = pci_set_dma_mask(pdev, DMA_BIT_MASK(40)); if (err) { - err = pci_set_dma_mask(pdev, DMA_32BIT_MASK); + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (err) { printk(KERN_ERR PFX "No usable DMA configuration, aborting.\n"); goto err_out_release_regions; } - err = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK); + err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); if (err) { printk(KERN_ERR PFX "Unable to obtain 32-bit DMA " diff --git a/drivers/net/hp100.c b/drivers/net/hp100.c index ad8be7e78290..de3f49f991a3 100644 --- a/drivers/net/hp100.c +++ b/drivers/net/hp100.c @@ -580,7 +580,7 @@ static int __devinit hp100_probe1(struct net_device *dev, int ioaddr, * Also, we can have EISA Busmaster cards (not tested), * so beware !!! 
- Jean II */ if((bus == HP100_BUS_PCI) && - (pci_set_dma_mask(pci_dev, DMA_32BIT_MASK))) { + (pci_set_dma_mask(pci_dev, DMA_BIT_MASK(32)))) { /* Gracefully fallback to shared memory */ goto busmasterfail; } diff --git a/drivers/net/igb/igb_main.c b/drivers/net/igb/igb_main.c index 82278beaac83..6b0697c565b9 100644 --- a/drivers/net/igb/igb_main.c +++ b/drivers/net/igb/igb_main.c @@ -1160,9 +1160,9 @@ static int __devinit igb_probe(struct pci_dev *pdev, if (!err) pci_using_dac = 1; } else { - err = pci_set_dma_mask(pdev, DMA_32BIT_MASK); + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (err) { - err = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK); + err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); if (err) { dev_err(&pdev->dev, "No usable DMA " "configuration, aborting\n"); diff --git a/drivers/net/ioc3-eth.c b/drivers/net/ioc3-eth.c index 43be0b01f12c..cbc63ff13add 100644 --- a/drivers/net/ioc3-eth.c +++ b/drivers/net/ioc3-eth.c @@ -1236,7 +1236,7 @@ static int __devinit ioc3_probe(struct pci_dev *pdev, goto out; } } else { - err = pci_set_dma_mask(pdev, DMA_32BIT_MASK); + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (err) { printk(KERN_ERR "%s: No usable DMA configuration, " "aborting.\n", pci_name(pdev)); diff --git a/drivers/net/ipg.c b/drivers/net/ipg.c index ed9ded939d96..43019461b776 100644 --- a/drivers/net/ipg.c +++ b/drivers/net/ipg.c @@ -2242,7 +2242,7 @@ static int __devinit ipg_probe(struct pci_dev *pdev, rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(40)); if (rc < 0) { - rc = pci_set_dma_mask(pdev, DMA_32BIT_MASK); + rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (rc < 0) { printk(KERN_ERR "%s: DMA config failed.\n", pci_name(pdev)); diff --git a/drivers/net/ixgb/ixgb_main.c b/drivers/net/ixgb/ixgb_main.c index 0ac51758bc6c..4a0826b8f6f2 100644 --- a/drivers/net/ixgb/ixgb_main.c +++ b/drivers/net/ixgb/ixgb_main.c @@ -369,8 +369,8 @@ ixgb_probe(struct pci_dev *pdev, const struct pci_device_id *ent) !(err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)))) { pci_using_dac = 1; } else { - if ((err = pci_set_dma_mask(pdev, DMA_32BIT_MASK)) || - (err = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK))) { + if ((err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) || + (err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)))) { printk(KERN_ERR "ixgb: No usable DMA configuration, aborting\n"); goto err_dma_mask; diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c index 126735ca6d7f..9ef128ae6458 100644 --- a/drivers/net/ixgbe/ixgbe_main.c +++ b/drivers/net/ixgbe/ixgbe_main.c @@ -1643,7 +1643,7 @@ static void ixgbe_configure_tx(struct ixgbe_adapter *adapter) tdba = ring->dma; tdlen = ring->count * sizeof(union ixgbe_adv_tx_desc); IXGBE_WRITE_REG(hw, IXGBE_TDBAL(j), - (tdba & DMA_32BIT_MASK)); + (tdba & DMA_BIT_MASK(32))); IXGBE_WRITE_REG(hw, IXGBE_TDBAH(j), (tdba >> 32)); IXGBE_WRITE_REG(hw, IXGBE_TDLEN(j), tdlen); IXGBE_WRITE_REG(hw, IXGBE_TDH(j), 0); @@ -1782,7 +1782,7 @@ static void ixgbe_configure_rx(struct ixgbe_adapter *adapter) for (i = 0; i < adapter->num_rx_queues; i++) { rdba = adapter->rx_ring[i].dma; j = adapter->rx_ring[i].reg_idx; - IXGBE_WRITE_REG(hw, IXGBE_RDBAL(j), (rdba & DMA_32BIT_MASK)); + IXGBE_WRITE_REG(hw, IXGBE_RDBAL(j), (rdba & DMA_BIT_MASK(32))); IXGBE_WRITE_REG(hw, IXGBE_RDBAH(j), (rdba >> 32)); IXGBE_WRITE_REG(hw, IXGBE_RDLEN(j), rdlen); IXGBE_WRITE_REG(hw, IXGBE_RDH(j), 0); @@ -4513,9 +4513,9 @@ static int __devinit ixgbe_probe(struct pci_dev *pdev, !pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64))) { 
pci_using_dac = 1; } else { - err = pci_set_dma_mask(pdev, DMA_32BIT_MASK); + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (err) { - err = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK); + err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); if (err) { dev_err(&pdev->dev, "No usable DMA " "configuration, aborting\n"); diff --git a/drivers/net/jme.c b/drivers/net/jme.c index 860dcd98a07c..ece35040288c 100644 --- a/drivers/net/jme.c +++ b/drivers/net/jme.c @@ -2600,8 +2600,8 @@ jme_pci_dma64(struct pci_dev *pdev) if (!pci_set_consistent_dma_mask(pdev, DMA_40BIT_MASK)) return 1; - if (!pci_set_dma_mask(pdev, DMA_32BIT_MASK)) - if (!pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK)) + if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) + if (!pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32))) return 0; return -1; diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c index fed53fbaf545..102bac90a302 100644 --- a/drivers/net/mlx4/main.c +++ b/drivers/net/mlx4/main.c @@ -1079,7 +1079,7 @@ static int __mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); if (err) { dev_warn(&pdev->dev, "Warning: couldn't set 64-bit PCI DMA mask.\n"); - err = pci_set_dma_mask(pdev, DMA_32BIT_MASK); + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (err) { dev_err(&pdev->dev, "Can't set PCI DMA mask, aborting.\n"); goto err_release_bar2; @@ -1089,7 +1089,7 @@ static int __mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) if (err) { dev_warn(&pdev->dev, "Warning: couldn't set 64-bit " "consistent PCI DMA mask.\n"); - err = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK); + err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); if (err) { dev_err(&pdev->dev, "Can't set consistent PCI DMA mask, " "aborting.\n"); diff --git a/drivers/net/myri10ge/myri10ge.c b/drivers/net/myri10ge/myri10ge.c index 27655466dbeb..9eed126a82f0 100644 --- a/drivers/net/myri10ge/myri10ge.c +++ b/drivers/net/myri10ge/myri10ge.c @@ -1130,7 +1130,7 @@ myri10ge_submit_8rx(struct mcp_kreq_ether_recv __iomem * dst, __be32 low; low = src->addr_low; - src->addr_low = htonl(DMA_32BIT_MASK); + src->addr_low = htonl(DMA_BIT_MASK(32)); myri10ge_pio_copy(dst, src, 4 * sizeof(*src)); mb(); myri10ge_pio_copy(dst + 4, src + 4, 4 * sizeof(*src)); @@ -3798,7 +3798,7 @@ static int myri10ge_probe(struct pci_dev *pdev, const struct pci_device_id *ent) dev_err(&pdev->dev, "64-bit pci address mask was refused, " "trying 32-bit\n"); - status = pci_set_dma_mask(pdev, DMA_32BIT_MASK); + status = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); } if (status != 0) { dev_err(&pdev->dev, "Error %d setting DMA mask\n", status); diff --git a/drivers/net/niu.c b/drivers/net/niu.c index 02c37e2f08a9..73cac6c78cb6 100644 --- a/drivers/net/niu.c +++ b/drivers/net/niu.c @@ -9889,8 +9889,8 @@ static int __devinit niu_pci_init_one(struct pci_dev *pdev, goto err_out_release_parent; } } - if (err || dma_mask == DMA_32BIT_MASK) { - err = pci_set_dma_mask(pdev, DMA_32BIT_MASK); + if (err || dma_mask == DMA_BIT_MASK(32)) { + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (err) { dev_err(&pdev->dev, PFX "No usable DMA configuration, " "aborting.\n"); diff --git a/drivers/net/ns83820.c b/drivers/net/ns83820.c index e30ab06e7103..d531614a90b5 100644 --- a/drivers/net/ns83820.c +++ b/drivers/net/ns83820.c @@ -1975,7 +1975,7 @@ static int __devinit ns83820_init_one(struct pci_dev *pci_dev, if (sizeof(dma_addr_t) == 8 && !pci_set_dma_mask(pci_dev, DMA_BIT_MASK(64))) { using_dac = 1; - 
} else if (!pci_set_dma_mask(pci_dev, DMA_32BIT_MASK)) { + } else if (!pci_set_dma_mask(pci_dev, DMA_BIT_MASK(32))) { using_dac = 0; } else { dev_warn(&pci_dev->dev, "pci_set_dma_mask failed!\n"); diff --git a/drivers/net/qla3xxx.c b/drivers/net/qla3xxx.c index aef047e4515a..cadc32c94c1e 100644 --- a/drivers/net/qla3xxx.c +++ b/drivers/net/qla3xxx.c @@ -3937,9 +3937,9 @@ static int __devinit ql3xxx_probe(struct pci_dev *pdev, if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) { pci_using_dac = 1; err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); - } else if (!(err = pci_set_dma_mask(pdev, DMA_32BIT_MASK))) { + } else if (!(err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)))) { pci_using_dac = 0; - err = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK); + err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); } if (err) { diff --git a/drivers/net/qlge/qlge_main.c b/drivers/net/qlge/qlge_main.c index 0add30d38d62..c92ced247947 100644 --- a/drivers/net/qlge/qlge_main.c +++ b/drivers/net/qlge/qlge_main.c @@ -3730,9 +3730,9 @@ static int __devinit ql_init_device(struct pci_dev *pdev, set_bit(QL_DMA64, &qdev->flags); err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); } else { - err = pci_set_dma_mask(pdev, DMA_32BIT_MASK); + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (!err) - err = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK); + err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); } if (err) { diff --git a/drivers/net/r6040.c b/drivers/net/r6040.c index 0a37f9902a07..5e8540b6ffa1 100644 --- a/drivers/net/r6040.c +++ b/drivers/net/r6040.c @@ -1085,13 +1085,13 @@ static int __devinit r6040_init_one(struct pci_dev *pdev, goto err_out; /* this should always be supported */ - err = pci_set_dma_mask(pdev, DMA_32BIT_MASK); + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (err) { printk(KERN_ERR DRV_NAME ": 32-bit PCI DMA addresses" "not supported by the card\n"); goto err_out; } - err = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK); + err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); if (err) { printk(KERN_ERR DRV_NAME ": 32-bit PCI DMA addresses" "not supported by the card\n"); diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c index fe676d38d342..0b6e8c896835 100644 --- a/drivers/net/r8169.c +++ b/drivers/net/r8169.c @@ -1148,7 +1148,7 @@ static void rtl8169_update_counters(struct net_device *dev) return; RTL_W32(CounterAddrHigh, (u64)paddr >> 32); - cmd = (u64)paddr & DMA_32BIT_MASK; + cmd = (u64)paddr & DMA_BIT_MASK(32); RTL_W32(CounterAddrLow, cmd); RTL_W32(CounterAddrLow, cmd | CounterDump); @@ -2050,7 +2050,7 @@ rtl8169_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) tp->cp_cmd |= PCIDAC; dev->features |= NETIF_F_HIGHDMA; } else { - rc = pci_set_dma_mask(pdev, DMA_32BIT_MASK); + rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (rc < 0) { if (netif_msg_probe(tp)) { dev_err(&pdev->dev, @@ -2343,9 +2343,9 @@ static void rtl_set_rx_tx_desc_registers(struct rtl8169_private *tp, * Switching from MMIO to I/O access fixes the issue as well. 
*/ RTL_W32(TxDescStartAddrHigh, ((u64) tp->TxPhyAddr) >> 32); - RTL_W32(TxDescStartAddrLow, ((u64) tp->TxPhyAddr) & DMA_32BIT_MASK); + RTL_W32(TxDescStartAddrLow, ((u64) tp->TxPhyAddr) & DMA_BIT_MASK(32)); RTL_W32(RxDescAddrHigh, ((u64) tp->RxPhyAddr) >> 32); - RTL_W32(RxDescAddrLow, ((u64) tp->RxPhyAddr) & DMA_32BIT_MASK); + RTL_W32(RxDescAddrLow, ((u64) tp->RxPhyAddr) & DMA_BIT_MASK(32)); } static u16 rtl_rw_cpluscmd(void __iomem *ioaddr) diff --git a/drivers/net/s2io.c b/drivers/net/s2io.c index 7c8d5613a1da..1a4979f27fb5 100644 --- a/drivers/net/s2io.c +++ b/drivers/net/s2io.c @@ -7786,7 +7786,7 @@ s2io_init_nic(struct pci_dev *pdev, const struct pci_device_id *pre) pci_disable_device(pdev); return -ENOMEM; } - } else if (!pci_set_dma_mask(pdev, DMA_32BIT_MASK)) { + } else if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) { DBG_PRINT(INIT_DBG, "s2io_init_nic: Using 32bit DMA\n"); } else { pci_disable_device(pdev); diff --git a/drivers/net/sc92031.c b/drivers/net/sc92031.c index c13cbf099b88..18821f217e19 100644 --- a/drivers/net/sc92031.c +++ b/drivers/net/sc92031.c @@ -1427,11 +1427,11 @@ static int __devinit sc92031_probe(struct pci_dev *pdev, pci_set_master(pdev); - err = pci_set_dma_mask(pdev, DMA_32BIT_MASK); + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (unlikely(err < 0)) goto out_set_dma_mask; - err = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK); + err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); if (unlikely(err < 0)) goto out_set_dma_mask; diff --git a/drivers/net/sis190.c b/drivers/net/sis190.c index a9732686134b..55ccd51d247e 100644 --- a/drivers/net/sis190.c +++ b/drivers/net/sis190.c @@ -1467,7 +1467,7 @@ static struct net_device * __devinit sis190_init_board(struct pci_dev *pdev) goto err_pci_disable_2; } - rc = pci_set_dma_mask(pdev, DMA_32BIT_MASK); + rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (rc < 0) { net_probe(tp, KERN_ERR "%s: DMA configuration failed.\n", pci_name(pdev)); diff --git a/drivers/net/sis900.c b/drivers/net/sis900.c index 8a70de72ea2c..2d4617b3e208 100644 --- a/drivers/net/sis900.c +++ b/drivers/net/sis900.c @@ -432,7 +432,7 @@ static int __devinit sis900_probe(struct pci_dev *pci_dev, ret = pci_enable_device(pci_dev); if(ret) return ret; - i = pci_set_dma_mask(pci_dev, DMA_32BIT_MASK); + i = pci_set_dma_mask(pci_dev, DMA_BIT_MASK(32)); if(i){ printk(KERN_ERR "sis900.c: architecture does not support " "32bit PCI busmaster DMA\n"); diff --git a/drivers/net/skge.c b/drivers/net/skge.c index 18a7b662aff1..b8978d4af1b7 100644 --- a/drivers/net/skge.c +++ b/drivers/net/skge.c @@ -3915,9 +3915,9 @@ static int __devinit skge_probe(struct pci_dev *pdev, if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) { using_dac = 1; err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); - } else if (!(err = pci_set_dma_mask(pdev, DMA_32BIT_MASK))) { + } else if (!(err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)))) { using_dac = 0; - err = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK); + err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); } if (err) { diff --git a/drivers/net/sky2.c b/drivers/net/sky2.c index da570f44b12b..a2ff9cb1e7ac 100644 --- a/drivers/net/sky2.c +++ b/drivers/net/sky2.c @@ -4383,7 +4383,7 @@ static int __devinit sky2_probe(struct pci_dev *pdev, goto err_out_free_regions; } } else { - err = pci_set_dma_mask(pdev, DMA_32BIT_MASK); + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (err) { dev_err(&pdev->dev, "no usable DMA configuration\n"); goto err_out_free_regions; diff --git a/drivers/net/smsc9420.c 
b/drivers/net/smsc9420.c index 5959ae86e57d..60abdb1081ad 100644 --- a/drivers/net/smsc9420.c +++ b/drivers/net/smsc9420.c @@ -1598,7 +1598,7 @@ smsc9420_probe(struct pci_dev *pdev, const struct pci_device_id *id) goto out_free_netdev_2; } - if (pci_set_dma_mask(pdev, DMA_32BIT_MASK)) { + if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) { printk(KERN_ERR "No usable DMA configuration, aborting.\n"); goto out_free_regions_3; } diff --git a/drivers/net/sungem.c b/drivers/net/sungem.c index 2312d10f3b54..d2dfe0ab5106 100644 --- a/drivers/net/sungem.c +++ b/drivers/net/sungem.c @@ -3045,7 +3045,7 @@ static int __devinit gem_init_one(struct pci_dev *pdev, !pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) { pci_using_dac = 1; } else { - err = pci_set_dma_mask(pdev, DMA_32BIT_MASK); + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (err) { printk(KERN_ERR PFX "No usable DMA configuration, " "aborting.\n"); diff --git a/drivers/net/tehuti.c b/drivers/net/tehuti.c index d77a0918a325..7f4a9683ba1e 100644 --- a/drivers/net/tehuti.c +++ b/drivers/net/tehuti.c @@ -1945,8 +1945,8 @@ bdx_probe(struct pci_dev *pdev, const struct pci_device_id *ent) !(err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)))) { pci_using_dac = 1; } else { - if ((err = pci_set_dma_mask(pdev, DMA_32BIT_MASK)) || - (err = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK))) { + if ((err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) || + (err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)))) { printk(KERN_ERR "tehuti: No usable DMA configuration" ", aborting\n"); goto err_dma; diff --git a/drivers/net/tehuti.h b/drivers/net/tehuti.h index 121dda917fdc..4fc875e5dcdd 100644 --- a/drivers/net/tehuti.h +++ b/drivers/net/tehuti.h @@ -99,10 +99,6 @@ #define READ_REG(pp, reg) readl(pp->pBdxRegs + reg) #define WRITE_REG(pp, reg, val) writel(val, pp->pBdxRegs + reg) -#ifndef DMA_32BIT_MASK -# define DMA_32BIT_MASK 0x00000000ffffffffULL -#endif - #ifndef NET_IP_ALIGN # define NET_IP_ALIGN 2 #endif diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c index 5ba91d7f7587..6a736dda3ee2 100644 --- a/drivers/net/tg3.c +++ b/drivers/net/tg3.c @@ -13228,7 +13228,7 @@ static int __devinit tg3_init_one(struct pci_dev *pdev, * do DMA address check in tg3_start_xmit(). */ if (tp->tg3_flags2 & TG3_FLG2_IS_5788) - persist_dma_mask = dma_mask = DMA_32BIT_MASK; + persist_dma_mask = dma_mask = DMA_BIT_MASK(32); else if (tp->tg3_flags & TG3_FLAG_40BIT_DMA_BUG) { persist_dma_mask = dma_mask = DMA_BIT_MASK(40); #ifdef CONFIG_HIGHMEM @@ -13238,7 +13238,7 @@ static int __devinit tg3_init_one(struct pci_dev *pdev, persist_dma_mask = dma_mask = DMA_BIT_MASK(64); /* Configure DMA attributes. */ - if (dma_mask > DMA_32BIT_MASK) { + if (dma_mask > DMA_BIT_MASK(32)) { err = pci_set_dma_mask(pdev, dma_mask); if (!err) { dev->features |= NETIF_F_HIGHDMA; @@ -13251,8 +13251,8 @@ static int __devinit tg3_init_one(struct pci_dev *pdev, } } } - if (err || dma_mask == DMA_32BIT_MASK) { - err = pci_set_dma_mask(pdev, DMA_32BIT_MASK); + if (err || dma_mask == DMA_BIT_MASK(32)) { + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (err) { printk(KERN_ERR PFX "No usable DMA configuration, " "aborting.\n"); @@ -13393,7 +13393,7 @@ static int __devinit tg3_init_one(struct pci_dev *pdev, (tp->tg3_flags2 & TG3_FLG2_TSO_CAPABLE) != 0); printk(KERN_INFO "%s: dma_rwctrl[%08x] dma_mask[%d-bit]\n", dev->name, tp->dma_rwctrl, - (pdev->dma_mask == DMA_32BIT_MASK) ? 32 : + (pdev->dma_mask == DMA_BIT_MASK(32)) ? 32 : (((u64) pdev->dma_mask == DMA_BIT_MASK(40)) ? 
40 : 64)); return 0; diff --git a/drivers/net/tlan.c b/drivers/net/tlan.c index 68b967b585aa..aa6964922d5e 100644 --- a/drivers/net/tlan.c +++ b/drivers/net/tlan.c @@ -570,7 +570,7 @@ static int __devinit TLan_probe1(struct pci_dev *pdev, priv->adapter = &board_info[ent->driver_data]; - rc = pci_set_dma_mask(pdev, DMA_32BIT_MASK); + rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (rc) { printk(KERN_ERR "TLAN: No suitable PCI mapping available.\n"); goto err_out_free_dev; diff --git a/drivers/net/tokenring/lanstreamer.c b/drivers/net/tokenring/lanstreamer.c index f309b8f703bd..2e70ee8f1459 100644 --- a/drivers/net/tokenring/lanstreamer.c +++ b/drivers/net/tokenring/lanstreamer.c @@ -267,7 +267,7 @@ static int __devinit streamer_init_one(struct pci_dev *pdev, #endif #endif - rc = pci_set_dma_mask(pdev, DMA_32BIT_MASK); + rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (rc) { printk(KERN_ERR "%s: No suitable PCI mapping available.\n", dev->name); diff --git a/drivers/net/tulip/dmfe.c b/drivers/net/tulip/dmfe.c index e2c9d0f5a755..f2e669974c78 100644 --- a/drivers/net/tulip/dmfe.c +++ b/drivers/net/tulip/dmfe.c @@ -383,7 +383,7 @@ static int __devinit dmfe_init_one (struct pci_dev *pdev, return -ENOMEM; SET_NETDEV_DEV(dev, &pdev->dev); - if (pci_set_dma_mask(pdev, DMA_32BIT_MASK)) { + if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) { printk(KERN_WARNING DRV_NAME ": 32-bit PCI DMA not available.\n"); err = -ENODEV; diff --git a/drivers/net/tulip/uli526x.c b/drivers/net/tulip/uli526x.c index c227db079621..8761a5a5bd79 100644 --- a/drivers/net/tulip/uli526x.c +++ b/drivers/net/tulip/uli526x.c @@ -282,7 +282,7 @@ static int __devinit uli526x_init_one (struct pci_dev *pdev, return -ENOMEM; SET_NETDEV_DEV(dev, &pdev->dev); - if (pci_set_dma_mask(pdev, DMA_32BIT_MASK)) { + if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) { printk(KERN_WARNING DRV_NAME ": 32-bit PCI DMA not available.\n"); err = -ENODEV; goto err_out_free; diff --git a/drivers/net/tulip/winbond-840.c b/drivers/net/tulip/winbond-840.c index c61a01b029af..264e61404f34 100644 --- a/drivers/net/tulip/winbond-840.c +++ b/drivers/net/tulip/winbond-840.c @@ -375,7 +375,7 @@ static int __devinit w840_probe1 (struct pci_dev *pdev, irq = pdev->irq; - if (pci_set_dma_mask(pdev, DMA_32BIT_MASK)) { + if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) { printk(KERN_WARNING "Winbond-840: Device %s disabled due to DMA limitations.\n", pci_name(pdev)); return -EIO; diff --git a/drivers/net/typhoon.c b/drivers/net/typhoon.c index 9dd4f76a2ff5..cf25eb41b1ce 100644 --- a/drivers/net/typhoon.c +++ b/drivers/net/typhoon.c @@ -2406,7 +2406,7 @@ typhoon_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) goto error_out_disable; } - err = pci_set_dma_mask(pdev, DMA_32BIT_MASK); + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if(err < 0) { printk(ERR_PFX "%s: No usable DMA configuration\n", pci_name(pdev)); diff --git a/drivers/net/via-rhine.c b/drivers/net/via-rhine.c index 880eaf07413b..45daba726b66 100644 --- a/drivers/net/via-rhine.c +++ b/drivers/net/via-rhine.c @@ -686,7 +686,7 @@ static int __devinit rhine_init_one(struct pci_dev *pdev, goto err_out; /* this should always be supported */ - rc = pci_set_dma_mask(pdev, DMA_32BIT_MASK); + rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (rc) { printk(KERN_ERR "32-bit PCI DMA addresses not supported by " "the card!?\n"); diff --git a/drivers/net/wan/wanxl.c b/drivers/net/wan/wanxl.c index 887acb0dc807..850d70d7b05d 100644 --- a/drivers/net/wan/wanxl.c +++ b/drivers/net/wan/wanxl.c @@ -633,8 +633,8 
@@ static int __devinit wanxl_pci_init_one(struct pci_dev *pdev, /* FIXME when PCI/DMA subsystems are fixed. We set both dma_mask and consistent_dma_mask back to 32 bits to indicate the card can do 32-bit DMA addressing */ - if (pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK) || - pci_set_dma_mask(pdev, DMA_32BIT_MASK)) { + if (pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)) || + pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) { printk(KERN_ERR "wanXL: No usable DMA configuration\n"); wanxl_pci_remove_one(pdev); return -EIO; diff --git a/drivers/net/wireless/adm8211.c b/drivers/net/wireless/adm8211.c index fc0897fb2239..f71821795018 100644 --- a/drivers/net/wireless/adm8211.c +++ b/drivers/net/wireless/adm8211.c @@ -1804,8 +1804,8 @@ static int __devinit adm8211_probe(struct pci_dev *pdev, return err; /* someone else grabbed it? don't disable it */ } - if (pci_set_dma_mask(pdev, DMA_32BIT_MASK) || - pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK)) { + if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32)) || + pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32))) { printk(KERN_ERR "%s (adm8211): No suitable DMA available\n", pci_name(pdev)); goto err_free_reg; diff --git a/drivers/net/wireless/ath5k/base.c b/drivers/net/wireless/ath5k/base.c index 5d57d774e466..a08bc8a4fb69 100644 --- a/drivers/net/wireless/ath5k/base.c +++ b/drivers/net/wireless/ath5k/base.c @@ -445,7 +445,7 @@ ath5k_pci_probe(struct pci_dev *pdev, } /* XXX 32-bit addressing only */ - ret = pci_set_dma_mask(pdev, DMA_32BIT_MASK); + ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (ret) { dev_err(&pdev->dev, "32-bit DMA not available\n"); goto err_dis; diff --git a/drivers/net/wireless/b43/dma.c b/drivers/net/wireless/b43/dma.c index 461d680d2cec..de1964f7d9e0 100644 --- a/drivers/net/wireless/b43/dma.c +++ b/drivers/net/wireless/b43/dma.c @@ -777,7 +777,7 @@ static u64 supported_dma_mask(struct b43_wldev *dev) b43_write32(dev, mmio_base + B43_DMA32_TXCTL, B43_DMA32_TXADDREXT_MASK); tmp = b43_read32(dev, mmio_base + B43_DMA32_TXCTL); if (tmp & B43_DMA32_TXADDREXT_MASK) - return DMA_32BIT_MASK; + return DMA_BIT_MASK(32); return DMA_30BIT_MASK; } @@ -786,7 +786,7 @@ static enum b43_dmatype dma_mask_to_engine_type(u64 dmamask) { if (dmamask == DMA_30BIT_MASK) return B43_DMA_30BIT; - if (dmamask == DMA_32BIT_MASK) + if (dmamask == DMA_BIT_MASK(32)) return B43_DMA_32BIT; if (dmamask == DMA_BIT_MASK(64)) return B43_DMA_64BIT; @@ -1000,11 +1000,11 @@ static int b43_dma_set_mask(struct b43_wldev *dev, u64 mask) if (!err) break; if (mask == DMA_BIT_MASK(64)) { - mask = DMA_32BIT_MASK; + mask = DMA_BIT_MASK(32); fallback = 1; continue; } - if (mask == DMA_32BIT_MASK) { + if (mask == DMA_BIT_MASK(32)) { mask = DMA_30BIT_MASK; fallback = 1; continue; diff --git a/drivers/net/wireless/b43legacy/dma.c b/drivers/net/wireless/b43legacy/dma.c index 61bb91266aa8..1d3e0d239314 100644 --- a/drivers/net/wireless/b43legacy/dma.c +++ b/drivers/net/wireless/b43legacy/dma.c @@ -854,7 +854,7 @@ static u64 supported_dma_mask(struct b43legacy_wldev *dev) tmp = b43legacy_read32(dev, mmio_base + B43legacy_DMA32_TXCTL); if (tmp & B43legacy_DMA32_TXADDREXT_MASK) - return DMA_32BIT_MASK; + return DMA_BIT_MASK(32); return DMA_30BIT_MASK; } @@ -863,7 +863,7 @@ static enum b43legacy_dmatype dma_mask_to_engine_type(u64 dmamask) { if (dmamask == DMA_30BIT_MASK) return B43legacy_DMA_30BIT; - if (dmamask == DMA_32BIT_MASK) + if (dmamask == DMA_BIT_MASK(32)) return B43legacy_DMA_32BIT; if (dmamask == DMA_BIT_MASK(64)) return B43legacy_DMA_64BIT; @@ -1043,11 +1043,11 @@ 
static int b43legacy_dma_set_mask(struct b43legacy_wldev *dev, u64 mask) if (!err) break; if (mask == DMA_BIT_MASK(64)) { - mask = DMA_32BIT_MASK; + mask = DMA_BIT_MASK(32); fallback = 1; continue; } - if (mask == DMA_32BIT_MASK) { + if (mask == DMA_BIT_MASK(32)) { mask = DMA_30BIT_MASK; fallback = 1; continue; diff --git a/drivers/net/wireless/ipw2x00/ipw2100.c b/drivers/net/wireless/ipw2x00/ipw2100.c index f4e963ba768b..97e5647ff050 100644 --- a/drivers/net/wireless/ipw2x00/ipw2100.c +++ b/drivers/net/wireless/ipw2x00/ipw2100.c @@ -6207,7 +6207,7 @@ static int ipw2100_pci_init_one(struct pci_dev *pci_dev, pci_set_master(pci_dev); pci_set_drvdata(pci_dev, priv); - err = pci_set_dma_mask(pci_dev, DMA_32BIT_MASK); + err = pci_set_dma_mask(pci_dev, DMA_BIT_MASK(32)); if (err) { printk(KERN_WARNING DRV_NAME "Error calling pci_set_dma_mask.\n"); diff --git a/drivers/net/wireless/ipw2x00/ipw2200.c b/drivers/net/wireless/ipw2x00/ipw2200.c index e17a4593e1f5..bd4dbcfe1bbe 100644 --- a/drivers/net/wireless/ipw2x00/ipw2200.c +++ b/drivers/net/wireless/ipw2x00/ipw2200.c @@ -11631,9 +11631,9 @@ static int __devinit ipw_pci_probe(struct pci_dev *pdev, pci_set_master(pdev); - err = pci_set_dma_mask(pdev, DMA_32BIT_MASK); + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (!err) - err = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK); + err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); if (err) { printk(KERN_WARNING DRV_NAME ": No suitable DMA available.\n"); goto out_pci_disable_device; diff --git a/drivers/net/wireless/iwlwifi/iwl3945-base.c b/drivers/net/wireless/iwlwifi/iwl3945-base.c index 9d5f97dd7c73..ce729281ff62 100644 --- a/drivers/net/wireless/iwlwifi/iwl3945-base.c +++ b/drivers/net/wireless/iwlwifi/iwl3945-base.c @@ -4998,9 +4998,9 @@ static int iwl3945_pci_probe(struct pci_dev *pdev, const struct pci_device_id *e pci_set_master(pdev); - err = pci_set_dma_mask(pdev, DMA_32BIT_MASK); + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (!err) - err = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK); + err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); if (err) { IWL_WARN(priv, "No suitable DMA available.\n"); goto out_pci_disable_device; diff --git a/drivers/net/wireless/prism54/islpci_hotplug.c b/drivers/net/wireless/prism54/islpci_hotplug.c index 9a72b1e3e163..30876728d7e6 100644 --- a/drivers/net/wireless/prism54/islpci_hotplug.c +++ b/drivers/net/wireless/prism54/islpci_hotplug.c @@ -120,7 +120,7 @@ prism54_probe(struct pci_dev *pdev, const struct pci_device_id *id) } /* enable PCI DMA */ - if (pci_set_dma_mask(pdev, DMA_32BIT_MASK)) { + if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) { printk(KERN_ERR "%s: 32-bit PCI DMA not supported", DRV_NAME); goto do_pci_disable_device; } diff --git a/drivers/net/wireless/rt2x00/rt2x00pci.c b/drivers/net/wireless/rt2x00/rt2x00pci.c index e616c20d4a78..43fa0f849003 100644 --- a/drivers/net/wireless/rt2x00/rt2x00pci.c +++ b/drivers/net/wireless/rt2x00/rt2x00pci.c @@ -288,7 +288,7 @@ int rt2x00pci_probe(struct pci_dev *pci_dev, const struct pci_device_id *id) if (pci_set_mwi(pci_dev)) ERROR_PROBE("MWI not available.\n"); - if (dma_set_mask(&pci_dev->dev, DMA_32BIT_MASK)) { + if (dma_set_mask(&pci_dev->dev, DMA_BIT_MASK(32))) { ERROR_PROBE("PCI DMA not supported.\n"); retval = -EIO; goto exit_disable_device; diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 09dc98b84553..fb3a3f3fca7a 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -56,7 +56,7 @@ #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) 
<< gaw) - 1) #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT) -#define DMA_32BIT_PFN IOVA_PFN(DMA_32BIT_MASK) +#define DMA_32BIT_PFN IOVA_PFN(DMA_BIT_MASK(32)) #define DMA_64BIT_PFN IOVA_PFN(DMA_BIT_MASK(64)) /* global iommu list, set NULL for ignored DMAR units */ @@ -2080,15 +2080,15 @@ __intel_alloc_iova(struct device *dev, struct dmar_domain *domain, struct pci_dev *pdev = to_pci_dev(dev); struct iova *iova = NULL; - if (dma_mask <= DMA_32BIT_MASK || dmar_forcedac) + if (dma_mask <= DMA_BIT_MASK(32) || dmar_forcedac) iova = iommu_alloc_iova(domain, size, dma_mask); else { /* * First try to allocate an io virtual address in - * DMA_32BIT_MASK and if that fails then try allocating + * DMA_BIT_MASK(32) and if that fails then try allocating * from higher range */ - iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK); + iova = iommu_alloc_iova(domain, size, DMA_BIT_MASK(32)); if (!iova) iova = iommu_alloc_iova(domain, size, dma_mask); } diff --git a/drivers/rapidio/rio-scan.c b/drivers/rapidio/rio-scan.c index 5c13f61bfb1b..74d0bfa3f310 100644 --- a/drivers/rapidio/rio-scan.c +++ b/drivers/rapidio/rio-scan.c @@ -381,9 +381,9 @@ static struct rio_dev *rio_setup_device(struct rio_net *net, rdev->dev.release = rio_release_dev; rio_dev_get(rdev); - rdev->dma_mask = DMA_32BIT_MASK; + rdev->dma_mask = DMA_BIT_MASK(32); rdev->dev.dma_mask = &rdev->dma_mask; - rdev->dev.coherent_dma_mask = DMA_32BIT_MASK; + rdev->dev.coherent_dma_mask = DMA_BIT_MASK(32); if ((rdev->pef & RIO_PEF_INB_DOORBELL) && (rdev->dst_ops & RIO_DST_OPS_DOORBELL)) diff --git a/drivers/scsi/3w-9xxx.c b/drivers/scsi/3w-9xxx.c index 6697652740b8..fdb14ec4fd47 100644 --- a/drivers/scsi/3w-9xxx.c +++ b/drivers/scsi/3w-9xxx.c @@ -2018,8 +2018,8 @@ static int __devinit twa_probe(struct pci_dev *pdev, const struct pci_device_id if (pci_set_dma_mask(pdev, DMA_BIT_MASK(64)) || pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64))) - if (pci_set_dma_mask(pdev, DMA_32BIT_MASK) - || pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK)) { + if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32)) + || pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32))) { TW_PRINTK(host, TW_DRIVER, 0x23, "Failed to set dma mask"); retval = -ENODEV; goto out_disable_device; diff --git a/drivers/scsi/3w-xxxx.h b/drivers/scsi/3w-xxxx.h index 0742e6846656..8e71e5e122b3 100644 --- a/drivers/scsi/3w-xxxx.h +++ b/drivers/scsi/3w-xxxx.h @@ -234,7 +234,7 @@ static unsigned char tw_sense_table[][4] = #define TW_IOCTL_TIMEOUT 25 /* 25 seconds */ #define TW_IOCTL_CHRDEV_TIMEOUT 60 /* 60 seconds */ #define TW_IOCTL_CHRDEV_FREE -1 -#define TW_DMA_MASK DMA_32BIT_MASK +#define TW_DMA_MASK DMA_BIT_MASK(32) #define TW_MAX_CDB_LEN 16 /* Bitmask macros to eliminate bitfields */ diff --git a/drivers/scsi/BusLogic.c b/drivers/scsi/BusLogic.c index 2d689af24664..1ddcf4031d4c 100644 --- a/drivers/scsi/BusLogic.c +++ b/drivers/scsi/BusLogic.c @@ -667,7 +667,7 @@ static int __init BusLogic_InitializeMultiMasterProbeInfo(struct BusLogic_HostAd if (pci_enable_device(PCI_Device)) continue; - if (pci_set_dma_mask(PCI_Device, DMA_32BIT_MASK )) + if (pci_set_dma_mask(PCI_Device, DMA_BIT_MASK(32) )) continue; Bus = PCI_Device->bus->number; @@ -834,7 +834,7 @@ static int __init BusLogic_InitializeMultiMasterProbeInfo(struct BusLogic_HostAd if (pci_enable_device(PCI_Device)) continue; - if (pci_set_dma_mask(PCI_Device, DMA_32BIT_MASK)) + if (pci_set_dma_mask(PCI_Device, DMA_BIT_MASK(32))) continue; Bus = PCI_Device->bus->number; @@ -888,7 +888,7 @@ static int __init 
BusLogic_InitializeFlashPointProbeInfo(struct BusLogic_HostAda if (pci_enable_device(PCI_Device)) continue; - if (pci_set_dma_mask(PCI_Device, DMA_32BIT_MASK)) + if (pci_set_dma_mask(PCI_Device, DMA_BIT_MASK(32))) continue; Bus = PCI_Device->bus->number; diff --git a/drivers/scsi/a100u2w.c b/drivers/scsi/a100u2w.c index 964769f66eac..208d6df9ed59 100644 --- a/drivers/scsi/a100u2w.c +++ b/drivers/scsi/a100u2w.c @@ -1094,7 +1094,7 @@ static int __devinit inia100_probe_one(struct pci_dev *pdev, if (pci_enable_device(pdev)) goto out; - if (pci_set_dma_mask(pdev, DMA_32BIT_MASK)) { + if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) { printk(KERN_WARNING "Unable to set 32bit DMA " "on inia100 adapter, ignoring.\n"); goto out_disable_device; diff --git a/drivers/scsi/aacraid/aachba.c b/drivers/scsi/aacraid/aachba.c index e83ef8aaa98c..280261c451d6 100644 --- a/drivers/scsi/aacraid/aachba.c +++ b/drivers/scsi/aacraid/aachba.c @@ -1407,8 +1407,8 @@ int aac_get_adapter_info(struct aac_dev* dev) if (!dev->in_reset) printk(KERN_INFO"%s%d: 64 Bit DAC enabled\n", dev->name, dev->id); - } else if (!pci_set_dma_mask(dev->pdev, DMA_32BIT_MASK) && - !pci_set_consistent_dma_mask(dev->pdev, DMA_32BIT_MASK)) { + } else if (!pci_set_dma_mask(dev->pdev, DMA_BIT_MASK(32)) && + !pci_set_consistent_dma_mask(dev->pdev, DMA_BIT_MASK(32))) { printk(KERN_INFO"%s%d: DMA mask set failed, 64 Bit DAC disabled\n", dev->name, dev->id); dev->dac_support = 0; diff --git a/drivers/scsi/aacraid/commsup.c b/drivers/scsi/aacraid/commsup.c index d24c2670040b..3b69c2d98dd6 100644 --- a/drivers/scsi/aacraid/commsup.c +++ b/drivers/scsi/aacraid/commsup.c @@ -1210,14 +1210,14 @@ static int _aac_reset_adapter(struct aac_dev *aac, int forced) ((retval = pci_set_consistent_dma_mask(aac->pdev, DMA_31BIT_MASK)))) goto out; } else { - if (((retval = pci_set_dma_mask(aac->pdev, DMA_32BIT_MASK))) || - ((retval = pci_set_consistent_dma_mask(aac->pdev, DMA_32BIT_MASK)))) + if (((retval = pci_set_dma_mask(aac->pdev, DMA_BIT_MASK(32)))) || + ((retval = pci_set_consistent_dma_mask(aac->pdev, DMA_BIT_MASK(32))))) goto out; } if ((retval = (*(aac_get_driver_ident(index)->init))(aac))) goto out; if (quirks & AAC_QUIRK_31BIT) - if ((retval = pci_set_dma_mask(aac->pdev, DMA_32BIT_MASK))) + if ((retval = pci_set_dma_mask(aac->pdev, DMA_BIT_MASK(32)))) goto out; if (jafo) { aac->thread = kthread_run(aac_command_thread, aac, aac->name); diff --git a/drivers/scsi/aacraid/linit.c b/drivers/scsi/aacraid/linit.c index c507719c0d44..3c3ed4b3311a 100644 --- a/drivers/scsi/aacraid/linit.c +++ b/drivers/scsi/aacraid/linit.c @@ -1095,8 +1095,8 @@ static int __devinit aac_probe_one(struct pci_dev *pdev, goto out; error = -ENODEV; - if (pci_set_dma_mask(pdev, DMA_32BIT_MASK) || - pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK)) + if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32)) || + pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32))) goto out_disable_pdev; /* * If the quirk31 bit is set, the adapter needs adapter @@ -1154,7 +1154,7 @@ static int __devinit aac_probe_one(struct pci_dev *pdev, * address space. 
*/ if (aac_drivers[index].quirks & AAC_QUIRK_31BIT) - if (pci_set_dma_mask(pdev, DMA_32BIT_MASK)) + if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) goto out_deinit; aac->maximum_num_channels = aac_drivers[index].channels; diff --git a/drivers/scsi/aic7xxx/aic79xx_osm_pci.c b/drivers/scsi/aic7xxx/aic79xx_osm_pci.c index c583d89f569a..8f686122d54e 100644 --- a/drivers/scsi/aic7xxx/aic79xx_osm_pci.c +++ b/drivers/scsi/aic7xxx/aic79xx_osm_pci.c @@ -197,13 +197,13 @@ ahd_linux_pci_dev_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (required_mask > DMA_BIT_MASK(39) && dma_set_mask(dev, DMA_BIT_MASK(64)) == 0) ahd->flags |= AHD_64BIT_ADDRESSING; - else if (required_mask > DMA_32BIT_MASK && + else if (required_mask > DMA_BIT_MASK(32) && dma_set_mask(dev, DMA_BIT_MASK(39)) == 0) ahd->flags |= AHD_39BIT_ADDRESSING; else - dma_set_mask(dev, DMA_32BIT_MASK); + dma_set_mask(dev, DMA_BIT_MASK(32)); } else { - dma_set_mask(dev, DMA_32BIT_MASK); + dma_set_mask(dev, DMA_BIT_MASK(32)); } ahd->dev_softc = pci; error = ahd_pci_config(ahd, entry); diff --git a/drivers/scsi/aic7xxx/aic7xxx_osm_pci.c b/drivers/scsi/aic7xxx/aic7xxx_osm_pci.c index 00f5b9868574..78fc70c24e07 100644 --- a/drivers/scsi/aic7xxx/aic7xxx_osm_pci.c +++ b/drivers/scsi/aic7xxx/aic7xxx_osm_pci.c @@ -241,10 +241,10 @@ ahc_linux_pci_dev_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (sizeof(dma_addr_t) > 4 && ahc->features & AHC_LARGE_SCBS && dma_set_mask(dev, mask_39bit) == 0 - && dma_get_required_mask(dev) > DMA_32BIT_MASK) { + && dma_get_required_mask(dev) > DMA_BIT_MASK(32)) { ahc->flags |= AHC_39BIT_ADDRESSING; } else { - if (dma_set_mask(dev, DMA_32BIT_MASK)) { + if (dma_set_mask(dev, DMA_BIT_MASK(32))) { ahc_free(ahc); printk(KERN_WARNING "aic7xxx: No suitable DMA available.\n"); return (-ENODEV); diff --git a/drivers/scsi/aic94xx/aic94xx_init.c b/drivers/scsi/aic94xx/aic94xx_init.c index 7edbe4309c61..996f7224f90e 100644 --- a/drivers/scsi/aic94xx/aic94xx_init.c +++ b/drivers/scsi/aic94xx/aic94xx_init.c @@ -793,8 +793,8 @@ static int __devinit asd_pci_probe(struct pci_dev *dev, if (!pci_set_dma_mask(dev, DMA_BIT_MASK(64)) && !pci_set_consistent_dma_mask(dev, DMA_BIT_MASK(64))) ; - else if (!pci_set_dma_mask(dev, DMA_32BIT_MASK) - && !pci_set_consistent_dma_mask(dev, DMA_32BIT_MASK)) + else if (!pci_set_dma_mask(dev, DMA_BIT_MASK(32)) + && !pci_set_consistent_dma_mask(dev, DMA_BIT_MASK(32))) ; else { asd_printk("no suitable DMA mask for %s\n", pci_name(dev)); diff --git a/drivers/scsi/arcmsr/arcmsr_hba.c b/drivers/scsi/arcmsr/arcmsr_hba.c index e95b72dd34b1..80aac01b5a6f 100644 --- a/drivers/scsi/arcmsr/arcmsr_hba.c +++ b/drivers/scsi/arcmsr/arcmsr_hba.c @@ -395,7 +395,7 @@ static int arcmsr_probe(struct pci_dev *pdev, error = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); if (error) { - error = pci_set_dma_mask(pdev, DMA_32BIT_MASK); + error = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (error) { printk(KERN_WARNING "scsi%d: No suitable DMA mask available\n", diff --git a/drivers/scsi/atp870u.c b/drivers/scsi/atp870u.c index 20ca0a6374b5..b137e561f5bc 100644 --- a/drivers/scsi/atp870u.c +++ b/drivers/scsi/atp870u.c @@ -2568,7 +2568,7 @@ static int atp870u_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (pci_enable_device(pdev)) goto err_eio; - if (!pci_set_dma_mask(pdev, DMA_32BIT_MASK)) { + if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) { printk(KERN_INFO "atp870u: use 32bit DMA mask.\n"); } else { printk(KERN_ERR "atp870u: DMA mask required but not available.\n"); diff --git 
a/drivers/scsi/dpt_i2o.c b/drivers/scsi/dpt_i2o.c index 75a94e438fa5..b6af63ca980b 100644 --- a/drivers/scsi/dpt_i2o.c +++ b/drivers/scsi/dpt_i2o.c @@ -1015,14 +1015,14 @@ static int adpt_install_hba(struct scsi_host_template* sht, struct pci_dev* pDev */ if (sizeof(dma_addr_t) > 4 && pci_set_dma_mask(pDev, DMA_BIT_MASK(64)) == 0) { - if (dma_get_required_mask(&pDev->dev) > DMA_32BIT_MASK) + if (dma_get_required_mask(&pDev->dev) > DMA_BIT_MASK(32)) dma64 = 1; } - if (!dma64 && pci_set_dma_mask(pDev, DMA_32BIT_MASK) != 0) + if (!dma64 && pci_set_dma_mask(pDev, DMA_BIT_MASK(32)) != 0) return -EINVAL; /* adapter only supports message blocks below 4GB */ - pci_set_consistent_dma_mask(pDev, DMA_32BIT_MASK); + pci_set_consistent_dma_mask(pDev, DMA_BIT_MASK(32)); base_addr0_phys = pci_resource_start(pDev,0); hba_map0_area_size = pci_resource_len(pDev,0); diff --git a/drivers/scsi/eata.c b/drivers/scsi/eata.c index 976cdd5c94ef..be5099dd94b5 100644 --- a/drivers/scsi/eata.c +++ b/drivers/scsi/eata.c @@ -1426,7 +1426,7 @@ static int port_detect(unsigned long port_base, unsigned int j, if (ha->pdev) { pci_set_master(ha->pdev); - if (pci_set_dma_mask(ha->pdev, DMA_32BIT_MASK)) + if (pci_set_dma_mask(ha->pdev, DMA_BIT_MASK(32))) printk("%s: warning, pci_set_dma_mask failed.\n", ha->board_name); } diff --git a/drivers/scsi/gdth.c b/drivers/scsi/gdth.c index 4982344f1f09..185e6bc4dd40 100644 --- a/drivers/scsi/gdth.c +++ b/drivers/scsi/gdth.c @@ -5023,7 +5023,7 @@ static int __devinit gdth_pci_probe_one(gdth_pci_str *pcistr, /* 64-bit DMA only supported from FW >= x.43 */ if (!(ha->cache_feat & ha->raw_feat & ha->screen_feat & GDT_64BIT) || !ha->dma64_support) { - if (pci_set_dma_mask(pdev, DMA_32BIT_MASK)) { + if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) { printk(KERN_WARNING "GDT-PCI %d: " "Unable to set 32-bit DMA\n", ha->hanum); goto out_free_coal_stat; @@ -5032,7 +5032,7 @@ static int __devinit gdth_pci_probe_one(gdth_pci_str *pcistr, shp->max_cmd_len = 16; if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) { printk("GDT-PCI %d: 64-bit DMA enabled\n", ha->hanum); - } else if (pci_set_dma_mask(pdev, DMA_32BIT_MASK)) { + } else if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) { printk(KERN_WARNING "GDT-PCI %d: " "Unable to set 64/32-bit DMA\n", ha->hanum); goto out_free_coal_stat; diff --git a/drivers/scsi/hptiop.c b/drivers/scsi/hptiop.c index a13d78150cb5..c596ab5f05c3 100644 --- a/drivers/scsi/hptiop.c +++ b/drivers/scsi/hptiop.c @@ -959,7 +959,7 @@ static int __devinit hptiop_probe(struct pci_dev *pcidev, /* Enable 64bit DMA if possible */ if (pci_set_dma_mask(pcidev, DMA_BIT_MASK(64))) { - if (pci_set_dma_mask(pcidev, DMA_32BIT_MASK)) { + if (pci_set_dma_mask(pcidev, DMA_BIT_MASK(32))) { printk(KERN_ERR "hptiop: fail to set dma_mask\n"); goto disable_pci_device; } diff --git a/drivers/scsi/initio.c b/drivers/scsi/initio.c index 5529518ff2fa..89a59484be02 100644 --- a/drivers/scsi/initio.c +++ b/drivers/scsi/initio.c @@ -2856,7 +2856,7 @@ static int initio_probe_one(struct pci_dev *pdev, reg = 0; bios_seg = (bios_seg << 8) + ((u16) ((reg & 0xFF00) >> 8)); - if (pci_set_dma_mask(pdev, DMA_32BIT_MASK)) { + if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) { printk(KERN_WARNING "i91u: Could not set 32 bit DMA mask\n"); error = -ENODEV; goto out_disable_device; diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c index def473f0a98f..c09d77591f92 100644 --- a/drivers/scsi/ipr.c +++ b/drivers/scsi/ipr.c @@ -7498,7 +7498,7 @@ static int __devinit ipr_probe_ioa(struct pci_dev *pdev, pci_set_master(pdev); - rc = 
pci_set_dma_mask(pdev, DMA_32BIT_MASK); + rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (rc < 0) { dev_err(&pdev->dev, "Failed to set PCI DMA mask\n"); goto cleanup_nomem; diff --git a/drivers/scsi/ips.c b/drivers/scsi/ips.c index 45296a975f8b..f83a116955f2 100644 --- a/drivers/scsi/ips.c +++ b/drivers/scsi/ips.c @@ -7051,7 +7051,7 @@ ips_init_phase1(struct pci_dev *pci_dev, int *indexPtr) !pci_set_dma_mask(ha->pcidev, DMA_BIT_MASK(64))) { (ha)->flags |= IPS_HA_ENH_SG; } else { - if (pci_set_dma_mask(ha->pcidev, DMA_32BIT_MASK) != 0) { + if (pci_set_dma_mask(ha->pcidev, DMA_BIT_MASK(32)) != 0) { printk(KERN_WARNING "Unable to set DMA Mask\n"); return ips_abort_init(ha, index); } diff --git a/drivers/scsi/lasi700.c b/drivers/scsi/lasi700.c index f23c4ca9a2ee..b3d31315ac32 100644 --- a/drivers/scsi/lasi700.c +++ b/drivers/scsi/lasi700.c @@ -108,7 +108,7 @@ lasi700_probe(struct parisc_device *dev) } hostdata->dev = &dev->dev; - dma_set_mask(&dev->dev, DMA_32BIT_MASK); + dma_set_mask(&dev->dev, DMA_BIT_MASK(32)); hostdata->base = ioremap_nocache(base, 0x100); hostdata->differential = 0; diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c index c255b4d94b1b..06874e6009ea 100644 --- a/drivers/scsi/lpfc/lpfc_init.c +++ b/drivers/scsi/lpfc/lpfc_init.c @@ -2661,7 +2661,7 @@ lpfc_pci_probe_one(struct pci_dev *pdev, const struct pci_device_id *pid) pci_try_set_mwi(pdev); if (pci_set_dma_mask(phba->pcidev, DMA_BIT_MASK(64)) != 0) - if (pci_set_dma_mask(phba->pcidev, DMA_32BIT_MASK) != 0) + if (pci_set_dma_mask(phba->pcidev, DMA_BIT_MASK(32)) != 0) goto out_idr_remove; /* diff --git a/drivers/scsi/megaraid.c b/drivers/scsi/megaraid.c index 112991d46d95..49eb0612d5af 100644 --- a/drivers/scsi/megaraid.c +++ b/drivers/scsi/megaraid.c @@ -2021,7 +2021,7 @@ make_local_pdev(adapter_t *adapter, struct pci_dev **pdev) memcpy(*pdev, adapter->dev, sizeof(struct pci_dev)); - if( pci_set_dma_mask(*pdev, DMA_32BIT_MASK) != 0 ) { + if( pci_set_dma_mask(*pdev, DMA_BIT_MASK(32)) != 0 ) { kfree(*pdev); return -1; } @@ -4796,7 +4796,7 @@ megaraid_probe_one(struct pci_dev *pdev, const struct pci_device_id *id) pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); adapter->has_64bit_addr = 1; } else { - pci_set_dma_mask(pdev, DMA_32BIT_MASK); + pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); adapter->has_64bit_addr = 0; } diff --git a/drivers/scsi/megaraid/megaraid_mbox.c b/drivers/scsi/megaraid/megaraid_mbox.c index b2c19449119d..234f0b7eb21c 100644 --- a/drivers/scsi/megaraid/megaraid_mbox.c +++ b/drivers/scsi/megaraid/megaraid_mbox.c @@ -473,7 +473,7 @@ megaraid_probe_one(struct pci_dev *pdev, const struct pci_device_id *id) // Setup the default DMA mask. 
This would be changed later on // depending on hardware capabilities - if (pci_set_dma_mask(adapter->pdev, DMA_32BIT_MASK) != 0) { + if (pci_set_dma_mask(adapter->pdev, DMA_BIT_MASK(32)) != 0) { con_log(CL_ANN, (KERN_WARNING "megaraid: pci_set_dma_mask failed:%d\n", __LINE__)); @@ -904,7 +904,7 @@ megaraid_init_mbox(adapter_t *adapter) con_log(CL_ANN, (KERN_WARNING "megaraid: DMA mask for 64-bit failed\n")); - if (pci_set_dma_mask (adapter->pdev, DMA_32BIT_MASK)) { + if (pci_set_dma_mask (adapter->pdev, DMA_BIT_MASK(32))) { con_log(CL_ANN, (KERN_WARNING "megaraid: 32-bit DMA mask failed\n")); goto out_free_sysfs_res; diff --git a/drivers/scsi/megaraid/megaraid_sas.c b/drivers/scsi/megaraid/megaraid_sas.c index 713de85771f3..7dc3d1894b1a 100644 --- a/drivers/scsi/megaraid/megaraid_sas.c +++ b/drivers/scsi/megaraid/megaraid_sas.c @@ -2499,11 +2499,11 @@ megasas_set_dma_mask(struct pci_dev *pdev) if (IS_DMA64) { if (pci_set_dma_mask(pdev, DMA_BIT_MASK(64)) != 0) { - if (pci_set_dma_mask(pdev, DMA_32BIT_MASK) != 0) + if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32)) != 0) goto fail_set_dma_mask; } } else { - if (pci_set_dma_mask(pdev, DMA_32BIT_MASK) != 0) + if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32)) != 0) goto fail_set_dma_mask; } return 0; diff --git a/drivers/scsi/mvsas.c b/drivers/scsi/mvsas.c index d8093a288438..e4acebd10d1b 100644 --- a/drivers/scsi/mvsas.c +++ b/drivers/scsi/mvsas.c @@ -878,7 +878,7 @@ static int pci_go_64(struct pci_dev *pdev) if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) { rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); if (rc) { - rc = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK); + rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); if (rc) { dev_printk(KERN_ERR, &pdev->dev, "64-bit DMA enable failed\n"); @@ -886,13 +886,13 @@ static int pci_go_64(struct pci_dev *pdev) } } } else { - rc = pci_set_dma_mask(pdev, DMA_32BIT_MASK); + rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (rc) { dev_printk(KERN_ERR, &pdev->dev, "32-bit DMA enable failed\n"); return rc; } - rc = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK); + rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); if (rc) { dev_printk(KERN_ERR, &pdev->dev, "32-bit consistent DMA enable failed\n"); diff --git a/drivers/scsi/nsp32.c b/drivers/scsi/nsp32.c index d06ec5aa6924..2be7d5b018d2 100644 --- a/drivers/scsi/nsp32.c +++ b/drivers/scsi/nsp32.c @@ -2672,7 +2672,7 @@ static int nsp32_detect(struct pci_dev *pdev) /* * setup DMA */ - if (pci_set_dma_mask(pdev, DMA_32BIT_MASK) != 0) { + if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32)) != 0) { nsp32_msg (KERN_ERR, "failed to set PCI DMA mask"); goto scsi_unregister; } diff --git a/drivers/scsi/qla1280.c b/drivers/scsi/qla1280.c index 18f7f98ba571..687dcf2d0154 100644 --- a/drivers/scsi/qla1280.c +++ b/drivers/scsi/qla1280.c @@ -4276,7 +4276,7 @@ qla1280_probe_one(struct pci_dev *pdev, const struct pci_device_id *id) #ifdef QLA_64BIT_PTR if (pci_set_dma_mask(ha->pdev, DMA_BIT_MASK(64))) { - if (pci_set_dma_mask(ha->pdev, DMA_32BIT_MASK)) { + if (pci_set_dma_mask(ha->pdev, DMA_BIT_MASK(32))) { printk(KERN_WARNING "scsi(%li): Unable to set a " "suitable DMA mask - aborting\n", ha->host_no); error = -ENODEV; @@ -4286,7 +4286,7 @@ qla1280_probe_one(struct pci_dev *pdev, const struct pci_device_id *id) dprintk(2, "scsi(%li): 64 Bit PCI Addressing Enabled\n", ha->host_no); #else - if (pci_set_dma_mask(ha->pdev, DMA_32BIT_MASK)) { + if (pci_set_dma_mask(ha->pdev, DMA_BIT_MASK(32))) { printk(KERN_WARNING "scsi(%li): Unable to set a " "suitable DMA 
mask - aborting\n", ha->host_no); error = -ENODEV; diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index f60e136b5e1b..e4fdcdad80d0 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -1188,8 +1188,8 @@ qla2x00_config_dma_addressing(struct qla_hw_data *ha) } } - dma_set_mask(&ha->pdev->dev, DMA_32BIT_MASK); - pci_set_consistent_dma_mask(ha->pdev, DMA_32BIT_MASK); + dma_set_mask(&ha->pdev->dev, DMA_BIT_MASK(32)); + pci_set_consistent_dma_mask(ha->pdev, DMA_BIT_MASK(32)); } static void diff --git a/drivers/scsi/qla4xxx/ql4_os.c b/drivers/scsi/qla4xxx/ql4_os.c index d427fab7a183..ec9da6ce8489 100644 --- a/drivers/scsi/qla4xxx/ql4_os.c +++ b/drivers/scsi/qla4xxx/ql4_os.c @@ -1375,10 +1375,10 @@ static void qla4xxx_config_dma_addressing(struct scsi_qla_host *ha) "Failed to set 64 bit PCI consistent mask; " "using 32 bit.\n"); retval = pci_set_consistent_dma_mask(ha->pdev, - DMA_32BIT_MASK); + DMA_BIT_MASK(32)); } } else - retval = pci_set_dma_mask(ha->pdev, DMA_32BIT_MASK); + retval = pci_set_dma_mask(ha->pdev, DMA_BIT_MASK(32)); } static int qla4xxx_slave_alloc(struct scsi_device *sdev) diff --git a/drivers/scsi/sni_53c710.c b/drivers/scsi/sni_53c710.c index 77f0b2cdaa94..37b3359e863e 100644 --- a/drivers/scsi/sni_53c710.c +++ b/drivers/scsi/sni_53c710.c @@ -83,7 +83,7 @@ static int __init snirm710_probe(struct platform_device *dev) } hostdata->dev = &dev->dev; - dma_set_mask(&dev->dev, DMA_32BIT_MASK); + dma_set_mask(&dev->dev, DMA_BIT_MASK(32)); hostdata->base = ioremap_nocache(base, 0x100); hostdata->differential = 0; diff --git a/drivers/scsi/stex.c b/drivers/scsi/stex.c index 7cb8063c4355..8d2a95c4e5b5 100644 --- a/drivers/scsi/stex.c +++ b/drivers/scsi/stex.c @@ -1398,9 +1398,9 @@ static int stex_set_dma_mask(struct pci_dev * pdev) if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(64)) && !pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64))) return 0; - ret = pci_set_dma_mask(pdev, DMA_32BIT_MASK); + ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (!ret) - ret = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK); + ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); return ret; } diff --git a/drivers/scsi/sym53c8xx_2/sym_glue.c b/drivers/scsi/sym53c8xx_2/sym_glue.c index 23e782015880..583966ec8266 100644 --- a/drivers/scsi/sym53c8xx_2/sym_glue.c +++ b/drivers/scsi/sym53c8xx_2/sym_glue.c @@ -1349,7 +1349,7 @@ static struct Scsi_Host * __devinit sym_attach(struct scsi_host_template *tpnt, if ((SYM_CONF_DMA_ADDRESSING_MODE > 0) && (np->features & FE_DAC) && !pci_set_dma_mask(pdev, DMA_DAC_MASK)) { set_dac(np); - } else if (pci_set_dma_mask(pdev, DMA_32BIT_MASK)) { + } else if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) { printf_warning("%s: No suitable DMA available\n", sym_name(np)); goto attach_failed; } diff --git a/drivers/scsi/sym53c8xx_2/sym_hipd.h b/drivers/scsi/sym53c8xx_2/sym_hipd.h index b707abe90821..9ebc8706b6bf 100644 --- a/drivers/scsi/sym53c8xx_2/sym_hipd.h +++ b/drivers/scsi/sym53c8xx_2/sym_hipd.h @@ -1080,7 +1080,7 @@ int sym_hcb_attach(struct Scsi_Host *shost, struct sym_fw *fw, struct sym_nvram */ #if SYM_CONF_DMA_ADDRESSING_MODE == 0 -#define DMA_DAC_MASK DMA_32BIT_MASK +#define DMA_DAC_MASK DMA_BIT_MASK(32) #define sym_build_sge(np, data, badd, len) \ do { \ (data)->addr = cpu_to_scr(badd); \ diff --git a/drivers/staging/agnx/pci.c b/drivers/staging/agnx/pci.c index 4ff4c1601423..25c0ffd2faa0 100644 --- a/drivers/staging/agnx/pci.c +++ b/drivers/staging/agnx/pci.c @@ -477,8 +477,8 @@ static int __devinit 
agnx_pci_probe(struct pci_dev *pdev, return err; } - if (pci_set_dma_mask(pdev, DMA_32BIT_MASK) || - pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK)) { + if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32)) || + pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32))) { printk(KERN_ERR PFX "No suitable DMA available\n"); goto err_free_reg; } diff --git a/drivers/staging/altpciechdma/altpciechdma.c b/drivers/staging/altpciechdma/altpciechdma.c index 6a4d0b8e0960..5869e1484a95 100644 --- a/drivers/staging/altpciechdma/altpciechdma.c +++ b/drivers/staging/altpciechdma/altpciechdma.c @@ -855,9 +855,9 @@ static int __devinit probe(struct pci_dev *dev, const struct pci_device_id *id) printk(KERN_DEBUG "Using a 64-bit DMA mask.\n"); } else #endif - if (!pci_set_dma_mask(dev, DMA_32BIT_MASK)) { + if (!pci_set_dma_mask(dev, DMA_BIT_MASK(32))) { printk(KERN_DEBUG "Could not set 64-bit DMA mask.\n"); - pci_set_consistent_dma_mask(dev, DMA_32BIT_MASK); + pci_set_consistent_dma_mask(dev, DMA_BIT_MASK(32)); /* use 32-bit DMA */ printk(KERN_DEBUG "Using a 32-bit DMA mask.\n"); } else { diff --git a/drivers/staging/sxg/sxg.c b/drivers/staging/sxg/sxg.c index a77e1eee5693..891f6e334672 100644 --- a/drivers/staging/sxg/sxg.c +++ b/drivers/staging/sxg/sxg.c @@ -937,13 +937,13 @@ static int sxg_entry_probe(struct pci_dev *pcidev, if (!(err = pci_set_dma_mask(pcidev, DMA_BIT_MASK(64)))) { DBG_ERROR("pci_set_dma_mask(DMA_BIT_MASK(64)) successful\n"); } else { - if ((err = pci_set_dma_mask(pcidev, DMA_32BIT_MASK))) { + if ((err = pci_set_dma_mask(pcidev, DMA_BIT_MASK(32)))) { DBG_ERROR ("No usable DMA configuration, aborting err[%x]\n", err); return err; } - DBG_ERROR("pci_set_dma_mask(DMA_32BIT_MASK) successful\n"); + DBG_ERROR("pci_set_dma_mask(DMA_BIT_MASK(32)) successful\n"); } DBG_ERROR("Call pci_request_regions\n"); diff --git a/drivers/usb/host/ehci-ps3.c b/drivers/usb/host/ehci-ps3.c index 9c9da35abc6c..1ba9f9a8c308 100644 --- a/drivers/usb/host/ehci-ps3.c +++ b/drivers/usb/host/ehci-ps3.c @@ -81,7 +81,7 @@ static int ps3_ehci_probe(struct ps3_system_bus_device *dev) int result; struct usb_hcd *hcd; unsigned int virq; - static u64 dummy_mask = DMA_32BIT_MASK; + static u64 dummy_mask = DMA_BIT_MASK(32); if (usb_disabled()) { result = -ENODEV; diff --git a/drivers/usb/host/ohci-ps3.c b/drivers/usb/host/ohci-ps3.c index 3c1a3b5f89f1..3d1910317328 100644 --- a/drivers/usb/host/ohci-ps3.c +++ b/drivers/usb/host/ohci-ps3.c @@ -80,7 +80,7 @@ static int ps3_ohci_probe(struct ps3_system_bus_device *dev) int result; struct usb_hcd *hcd; unsigned int virq; - static u64 dummy_mask = DMA_32BIT_MASK; + static u64 dummy_mask = DMA_BIT_MASK(32); if (usb_disabled()) { result = -ENODEV; diff --git a/drivers/uwb/whci.c b/drivers/uwb/whci.c index 79bb06c7a76b..2e2784627ad6 100644 --- a/drivers/uwb/whci.c +++ b/drivers/uwb/whci.c @@ -162,8 +162,8 @@ static int whci_probe(struct pci_dev *pci, const struct pci_device_id *id) err = -ENXIO; if (!pci_set_dma_mask(pci, DMA_BIT_MASK(64))) pci_set_consistent_dma_mask(pci, DMA_BIT_MASK(64)); - else if (!pci_set_dma_mask(pci, DMA_32BIT_MASK)) - pci_set_consistent_dma_mask(pci, DMA_32BIT_MASK); + else if (!pci_set_dma_mask(pci, DMA_BIT_MASK(32))) + pci_set_consistent_dma_mask(pci, DMA_BIT_MASK(32)); else goto error_dma; diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index d7d090d21031..8083b6a36a38 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -115,7 +115,7 @@ static inline u64 dma_get_mask(struct device *dev) { if (dev && dev->dma_mask && 
*dev->dma_mask) return *dev->dma_mask; - return DMA_32BIT_MASK; + return DMA_BIT_MASK(32); } extern u64 dma_get_required_mask(struct device *dev); diff --git a/lib/swiotlb.c b/lib/swiotlb.c index 32e2bd3b1142..2b0b5a7d2ced 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -549,7 +549,7 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size, dma_addr_t dev_addr; void *ret; int order = get_order(size); - u64 dma_mask = DMA_32BIT_MASK; + u64 dma_mask = DMA_BIT_MASK(32); if (hwdev && hwdev->coherent_dma_mask) dma_mask = hwdev->coherent_dma_mask; diff --git a/sound/pci/ad1889.c b/sound/pci/ad1889.c index d1f242bd0ac5..8f5098f92c37 100644 --- a/sound/pci/ad1889.c +++ b/sound/pci/ad1889.c @@ -909,8 +909,8 @@ snd_ad1889_create(struct snd_card *card, return err; /* check PCI availability (32bit DMA) */ - if (pci_set_dma_mask(pci, DMA_32BIT_MASK) < 0 || - pci_set_consistent_dma_mask(pci, DMA_32BIT_MASK) < 0) { + if (pci_set_dma_mask(pci, DMA_BIT_MASK(32)) < 0 || + pci_set_consistent_dma_mask(pci, DMA_BIT_MASK(32)) < 0) { printk(KERN_ERR PFX "error setting 32-bit DMA mask.\n"); pci_disable_device(pci); return -ENXIO; diff --git a/sound/pci/au88x0/au88x0.c b/sound/pci/au88x0/au88x0.c index 9ec122383eef..7b72c88e449d 100644 --- a/sound/pci/au88x0/au88x0.c +++ b/sound/pci/au88x0/au88x0.c @@ -151,8 +151,8 @@ snd_vortex_create(struct snd_card *card, struct pci_dev *pci, vortex_t ** rchip) // check PCI availability (DMA). if ((err = pci_enable_device(pci)) < 0) return err; - if (pci_set_dma_mask(pci, DMA_32BIT_MASK) < 0 || - pci_set_consistent_dma_mask(pci, DMA_32BIT_MASK) < 0) { + if (pci_set_dma_mask(pci, DMA_BIT_MASK(32)) < 0 || + pci_set_consistent_dma_mask(pci, DMA_BIT_MASK(32)) < 0) { printk(KERN_ERR "error to set DMA mask\n"); pci_disable_device(pci); return -ENXIO; diff --git a/sound/pci/aw2/aw2-alsa.c b/sound/pci/aw2/aw2-alsa.c index 8eea29fc42fe..4d34bb0d99d3 100644 --- a/sound/pci/aw2/aw2-alsa.c +++ b/sound/pci/aw2/aw2-alsa.c @@ -279,8 +279,8 @@ static int __devinit snd_aw2_create(struct snd_card *card, pci_set_master(pci); /* check PCI availability (32bit DMA) */ - if ((pci_set_dma_mask(pci, DMA_32BIT_MASK) < 0) || - (pci_set_consistent_dma_mask(pci, DMA_32BIT_MASK) < 0)) { + if ((pci_set_dma_mask(pci, DMA_BIT_MASK(32)) < 0) || + (pci_set_consistent_dma_mask(pci, DMA_BIT_MASK(32)) < 0)) { printk(KERN_ERR "aw2: Impossible to set 32bit mask DMA\n"); pci_disable_device(pci); return -ENXIO; diff --git a/sound/pci/ca0106/ca0106_main.c b/sound/pci/ca0106/ca0106_main.c index df757575798a..bfac30f7929f 100644 --- a/sound/pci/ca0106/ca0106_main.c +++ b/sound/pci/ca0106/ca0106_main.c @@ -1589,8 +1589,8 @@ static int __devinit snd_ca0106_create(int dev, struct snd_card *card, err = pci_enable_device(pci); if (err < 0) return err; - if (pci_set_dma_mask(pci, DMA_32BIT_MASK) < 0 || - pci_set_consistent_dma_mask(pci, DMA_32BIT_MASK) < 0) { + if (pci_set_dma_mask(pci, DMA_BIT_MASK(32)) < 0 || + pci_set_consistent_dma_mask(pci, DMA_BIT_MASK(32)) < 0) { printk(KERN_ERR "error to set 32bit mask DMA\n"); pci_disable_device(pci); return -ENXIO; diff --git a/sound/pci/cs5535audio/cs5535audio.c b/sound/pci/cs5535audio/cs5535audio.c index c89ed1f5bc2b..05f56e04849b 100644 --- a/sound/pci/cs5535audio/cs5535audio.c +++ b/sound/pci/cs5535audio/cs5535audio.c @@ -285,8 +285,8 @@ static int __devinit snd_cs5535audio_create(struct snd_card *card, if ((err = pci_enable_device(pci)) < 0) return err; - if (pci_set_dma_mask(pci, DMA_32BIT_MASK) < 0 || - pci_set_consistent_dma_mask(pci, DMA_32BIT_MASK) < 0) { + if 
(pci_set_dma_mask(pci, DMA_BIT_MASK(32)) < 0 || + pci_set_consistent_dma_mask(pci, DMA_BIT_MASK(32)) < 0) { printk(KERN_WARNING "unable to get 32bit dma\n"); err = -ENXIO; goto pcifail; diff --git a/sound/pci/mixart/mixart.c b/sound/pci/mixart/mixart.c index c1eb84a14c42..82bc5b9e7629 100644 --- a/sound/pci/mixart/mixart.c +++ b/sound/pci/mixart/mixart.c @@ -1291,7 +1291,7 @@ static int __devinit snd_mixart_probe(struct pci_dev *pci, pci_set_master(pci); /* check if we can restrict PCI DMA transfers to 32 bits */ - if (pci_set_dma_mask(pci, DMA_32BIT_MASK) < 0) { + if (pci_set_dma_mask(pci, DMA_BIT_MASK(32)) < 0) { snd_printk(KERN_ERR "architecture does not support 32bit PCI busmaster DMA\n"); pci_disable_device(pci); return -ENXIO; diff --git a/sound/pci/pcxhr/pcxhr.c b/sound/pci/pcxhr/pcxhr.c index 80e064a3efff..833e9c7b27c7 100644 --- a/sound/pci/pcxhr/pcxhr.c +++ b/sound/pci/pcxhr/pcxhr.c @@ -1449,7 +1449,7 @@ static int __devinit pcxhr_probe(struct pci_dev *pci, pci_set_master(pci); /* check if we can restrict PCI DMA transfers to 32 bits */ - if (pci_set_dma_mask(pci, DMA_32BIT_MASK) < 0) { + if (pci_set_dma_mask(pci, DMA_BIT_MASK(32)) < 0) { snd_printk(KERN_ERR "architecture does not support " "32bit PCI busmaster DMA\n"); pci_disable_device(pci); diff --git a/sound/soc/blackfin/bf5xx-ac97-pcm.c b/sound/soc/blackfin/bf5xx-ac97-pcm.c index 8cfed1a5dcbe..cf0dfb7ca221 100644 --- a/sound/soc/blackfin/bf5xx-ac97-pcm.c +++ b/sound/soc/blackfin/bf5xx-ac97-pcm.c @@ -413,7 +413,7 @@ static void bf5xx_pcm_free_dma_buffers(struct snd_pcm *pcm) sport_done(sport_handle); } -static u64 bf5xx_pcm_dmamask = DMA_32BIT_MASK; +static u64 bf5xx_pcm_dmamask = DMA_BIT_MASK(32); int bf5xx_pcm_ac97_new(struct snd_card *card, struct snd_soc_dai *dai, struct snd_pcm *pcm) @@ -424,7 +424,7 @@ int bf5xx_pcm_ac97_new(struct snd_card *card, struct snd_soc_dai *dai, if (!card->dev->dma_mask) card->dev->dma_mask = &bf5xx_pcm_dmamask; if (!card->dev->coherent_dma_mask) - card->dev->coherent_dma_mask = DMA_32BIT_MASK; + card->dev->coherent_dma_mask = DMA_BIT_MASK(32); if (dai->playback.channels_min) { ret = bf5xx_pcm_preallocate_dma_buffer(pcm, diff --git a/sound/soc/blackfin/bf5xx-i2s-pcm.c b/sound/soc/blackfin/bf5xx-i2s-pcm.c index 1318c4f627b7..62fbb8459569 100644 --- a/sound/soc/blackfin/bf5xx-i2s-pcm.c +++ b/sound/soc/blackfin/bf5xx-i2s-pcm.c @@ -245,7 +245,7 @@ static void bf5xx_pcm_free_dma_buffers(struct snd_pcm *pcm) sport_done(sport_handle); } -static u64 bf5xx_pcm_dmamask = DMA_32BIT_MASK; +static u64 bf5xx_pcm_dmamask = DMA_BIT_MASK(32); int bf5xx_pcm_i2s_new(struct snd_card *card, struct snd_soc_dai *dai, struct snd_pcm *pcm) @@ -256,7 +256,7 @@ int bf5xx_pcm_i2s_new(struct snd_card *card, struct snd_soc_dai *dai, if (!card->dev->dma_mask) card->dev->dma_mask = &bf5xx_pcm_dmamask; if (!card->dev->coherent_dma_mask) - card->dev->coherent_dma_mask = DMA_32BIT_MASK; + card->dev->coherent_dma_mask = DMA_BIT_MASK(32); if (dai->playback.channels_min) { ret = bf5xx_pcm_preallocate_dma_buffer(pcm, diff --git a/sound/soc/omap/omap-pcm.c b/sound/soc/omap/omap-pcm.c index 8e1431cb46bb..1bdbb0427183 100644 --- a/sound/soc/omap/omap-pcm.c +++ b/sound/soc/omap/omap-pcm.c @@ -327,7 +327,7 @@ int omap_pcm_new(struct snd_card *card, struct snd_soc_dai *dai, if (!card->dev->dma_mask) card->dev->dma_mask = &omap_pcm_dmamask; if (!card->dev->coherent_dma_mask) - card->dev->coherent_dma_mask = DMA_32BIT_MASK; + card->dev->coherent_dma_mask = DMA_BIT_MASK(32); if (dai->playback.channels_min) { ret = 
omap_pcm_preallocate_dma_buffer(pcm, diff --git a/sound/soc/pxa/pxa2xx-pcm.c b/sound/soc/pxa/pxa2xx-pcm.c index 53b9fb127a6d..d38e39575f51 100644 --- a/sound/soc/pxa/pxa2xx-pcm.c +++ b/sound/soc/pxa/pxa2xx-pcm.c @@ -81,7 +81,7 @@ static struct snd_pcm_ops pxa2xx_pcm_ops = { .mmap = pxa2xx_pcm_mmap, }; -static u64 pxa2xx_pcm_dmamask = DMA_32BIT_MASK; +static u64 pxa2xx_pcm_dmamask = DMA_BIT_MASK(32); static int pxa2xx_soc_pcm_new(struct snd_card *card, struct snd_soc_dai *dai, struct snd_pcm *pcm) @@ -91,7 +91,7 @@ static int pxa2xx_soc_pcm_new(struct snd_card *card, struct snd_soc_dai *dai, if (!card->dev->dma_mask) card->dev->dma_mask = &pxa2xx_pcm_dmamask; if (!card->dev->coherent_dma_mask) - card->dev->coherent_dma_mask = DMA_32BIT_MASK; + card->dev->coherent_dma_mask = DMA_BIT_MASK(32); if (dai->playback.channels_min) { ret = pxa2xx_pcm_preallocate_dma_buffer(pcm, diff --git a/sound/soc/s3c24xx/s3c24xx-pcm.c b/sound/soc/s3c24xx/s3c24xx-pcm.c index a9d68fa2b34a..169ddad31575 100644 --- a/sound/soc/s3c24xx/s3c24xx-pcm.c +++ b/sound/soc/s3c24xx/s3c24xx-pcm.c @@ -419,7 +419,7 @@ static void s3c24xx_pcm_free_dma_buffers(struct snd_pcm *pcm) } } -static u64 s3c24xx_pcm_dmamask = DMA_32BIT_MASK; +static u64 s3c24xx_pcm_dmamask = DMA_BIT_MASK(32); static int s3c24xx_pcm_new(struct snd_card *card, struct snd_soc_dai *dai, struct snd_pcm *pcm) -- cgit v1.2.3 From eed10e39b142eb4d284c8e4c751c0eaaa0bcd707 Mon Sep 17 00:00:00 2001 From: Koji Sato Date: Mon, 6 Apr 2009 19:01:21 -0700 Subject: nilfs2: disk format and userland interface This adds a header file which specifies the on-disk format and ioctl interface of the nilfs2 file system. Signed-off-by: Koji Sato Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/nilfs2_fs.h | 854 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 854 insertions(+) create mode 100644 include/linux/nilfs2_fs.h (limited to 'include/linux') diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h new file mode 100644 index 000000000000..e38fad2f7c0b --- /dev/null +++ b/include/linux/nilfs2_fs.h @@ -0,0 +1,854 @@ +/* + * nilfs2_fs.h - NILFS2 on-disk structures and common declarations. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato + * Ryusuke Konishi + */ +/* + * linux/include/linux/ext2_fs.h + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/include/linux/minix_fs.h + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#ifndef _LINUX_NILFS_FS_H +#define _LINUX_NILFS_FS_H + +#include +#include + +/* + * Inode flags stored in nilfs_inode and on-memory nilfs inode + * + * We define these flags based on ext2-fs because of the + * compatibility reason; to avoid problems in chattr(1) + */ +#define NILFS_SECRM_FL 0x00000001 /* Secure deletion */ +#define NILFS_UNRM_FL 0x00000002 /* Undelete */ +#define NILFS_SYNC_FL 0x00000008 /* Synchronous updates */ +#define NILFS_IMMUTABLE_FL 0x00000010 /* Immutable file */ +#define NILFS_APPEND_FL 0x00000020 /* writes to file may only append */ +#define NILFS_NODUMP_FL 0x00000040 /* do not dump file */ +#define NILFS_NOATIME_FL 0x00000080 /* do not update atime */ +/* Reserved for compression usage... */ +#define NILFS_NOTAIL_FL 0x00008000 /* file tail should not be merged */ +#define NILFS_DIRSYNC_FL 0x00010000 /* dirsync behaviour */ + +#define NILFS_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ +#define NILFS_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ + + +#define NILFS_INODE_BMAP_SIZE 7 +/** + * struct nilfs_inode - structure of an inode on disk + * @i_blocks: blocks count + * @i_size: size in bytes + * @i_ctime: creation time + * @i_mtime: modification time + * @i_dtime: deletion time + * @i_uid: user id + * @i_gid: group id + * @i_mode: file mode + * @i_links_count: links count + * @i_flags: file flags + * @i_bmap: block mapping + * @i_xattr: extended attributes + * @i_generation: file generation (for NFS) + * @i_pad: padding + */ +struct nilfs_inode { + __le64 i_blocks; + __le64 i_size; + __le64 i_ctime; + __le64 i_mtime; + __le64 i_dtime; + __le32 i_uid; + __le32 i_gid; + __le16 i_mode; + __le16 i_links_count; + __le32 i_flags; + __le64 i_bmap[NILFS_INODE_BMAP_SIZE]; +#define i_device_code i_bmap[0] + __le64 i_xattr; + __le32 i_generation; + __le32 i_pad; +}; + +/** + * struct nilfs_super_root - structure of super root + * @sr_sum: check sum + * @sr_bytes: byte count of the structure + * @sr_flags: flags (reserved) + * @sr_nongc_ctime: write time of the last segment not for cleaner operation + * @sr_dat: DAT file inode + * @sr_cpfile: checkpoint file inode + * @sr_sufile: segment usage file inode + */ +struct nilfs_super_root { + __le32 sr_sum; + __le16 sr_bytes; + __le16 sr_flags; + __le64 sr_nongc_ctime; + struct nilfs_inode sr_dat; + struct nilfs_inode sr_cpfile; + struct nilfs_inode sr_sufile; +}; + +#define NILFS_SR_MDT_OFFSET(inode_size, i) \ + ((unsigned long)&((struct nilfs_super_root *)0)->sr_dat + \ + (inode_size) * (i)) +#define NILFS_SR_DAT_OFFSET(inode_size) NILFS_SR_MDT_OFFSET(inode_size, 0) +#define NILFS_SR_CPFILE_OFFSET(inode_size) NILFS_SR_MDT_OFFSET(inode_size, 1) +#define NILFS_SR_SUFILE_OFFSET(inode_size) NILFS_SR_MDT_OFFSET(inode_size, 2) +#define NILFS_SR_BYTES (sizeof(struct nilfs_super_root)) + +/* + * Maximal mount counts + */ +#define NILFS_DFL_MAX_MNT_COUNT 50 /* 50 mounts */ + +/* + * File system states (sbp->s_state, nilfs->ns_mount_state) 
+ */ +#define NILFS_VALID_FS 0x0001 /* Unmounted cleanly */ +#define NILFS_ERROR_FS 0x0002 /* Errors detected */ +#define NILFS_RESIZE_FS 0x0004 /* Resize required */ + +/* + * Mount flags (sbi->s_mount_opt) + */ +#define NILFS_MOUNT_ERROR_MODE 0x0070 /* Error mode mask */ +#define NILFS_MOUNT_ERRORS_CONT 0x0010 /* Continue on errors */ +#define NILFS_MOUNT_ERRORS_RO 0x0020 /* Remount fs ro on errors */ +#define NILFS_MOUNT_ERRORS_PANIC 0x0040 /* Panic on errors */ +#define NILFS_MOUNT_SNAPSHOT 0x0080 /* Snapshot flag */ +#define NILFS_MOUNT_BARRIER 0x1000 /* Use block barriers */ +#define NILFS_MOUNT_STRICT_ORDER 0x2000 /* Apply strict in-order + semantics also for data */ + + +/** + * struct nilfs_super_block - structure of super block on disk + */ +struct nilfs_super_block { + __le32 s_rev_level; /* Revision level */ + __le16 s_minor_rev_level; /* minor revision level */ + __le16 s_magic; /* Magic signature */ + + __le16 s_bytes; /* Bytes count of CRC calculation + for this structure. s_reserved + is excluded. */ + __le16 s_flags; /* flags */ + __le32 s_crc_seed; /* Seed value of CRC calculation */ + __le32 s_sum; /* Check sum of super block */ + + __le32 s_log_block_size; /* Block size represented as follows + blocksize = + 1 << (s_log_block_size + 10) */ + __le64 s_nsegments; /* Number of segments in filesystem */ + __le64 s_dev_size; /* block device size in bytes */ + __le64 s_first_data_block; /* 1st seg disk block number */ + __le32 s_blocks_per_segment; /* number of blocks per full segment */ + __le32 s_r_segments_percentage; /* Reserved segments percentage */ + + __le64 s_last_cno; /* Last checkpoint number */ + __le64 s_last_pseg; /* disk block addr pseg written last */ + __le64 s_last_seq; /* seq. number of seg written last */ + __le64 s_free_blocks_count; /* Free blocks count */ + + __le64 s_ctime; /* Creation time (execution time of + newfs) */ + __le64 s_mtime; /* Mount time */ + __le64 s_wtime; /* Write time */ + __le16 s_mnt_count; /* Mount count */ + __le16 s_max_mnt_count; /* Maximal mount count */ + __le16 s_state; /* File system state */ + __le16 s_errors; /* Behaviour when detecting errors */ + __le64 s_lastcheck; /* time of last check */ + + __le32 s_checkinterval; /* max. 
time between checks */ + __le32 s_creator_os; /* OS */ + __le16 s_def_resuid; /* Default uid for reserved blocks */ + __le16 s_def_resgid; /* Default gid for reserved blocks */ + __le32 s_first_ino; /* First non-reserved inode */ + + __le16 s_inode_size; /* Size of an inode */ + __le16 s_dat_entry_size; /* Size of a dat entry */ + __le16 s_checkpoint_size; /* Size of a checkpoint */ + __le16 s_segment_usage_size; /* Size of a segment usage */ + + __u8 s_uuid[16]; /* 128-bit uuid for volume */ + char s_volume_name[16]; /* volume name */ + char s_last_mounted[64]; /* directory where last mounted */ + + __le32 s_c_interval; /* Commit interval of segment */ + __le32 s_c_block_max; /* Threshold of data amount for + the segment construction */ + __u32 s_reserved[192]; /* padding to the end of the block */ +}; + +/* + * Codes for operating systems + */ +#define NILFS_OS_LINUX 0 +/* Codes from 1 to 4 are reserved to keep compatibility with ext2 creator-OS */ + +/* + * Revision levels + */ +#define NILFS_CURRENT_REV 2 /* current major revision */ +#define NILFS_MINOR_REV 0 /* minor revision */ + +/* + * Bytes count of super_block for CRC-calculation + */ +#define NILFS_SB_BYTES \ + ((long)&((struct nilfs_super_block *)0)->s_reserved) + +/* + * Special inode number + */ +#define NILFS_ROOT_INO 2 /* Root file inode */ +#define NILFS_DAT_INO 3 /* DAT file */ +#define NILFS_CPFILE_INO 4 /* checkpoint file */ +#define NILFS_SUFILE_INO 5 /* segment usage file */ +#define NILFS_IFILE_INO 6 /* ifile */ +#define NILFS_ATIME_INO 7 /* Atime file (reserved) */ +#define NILFS_XATTR_INO 8 /* Xattribute file (reserved) */ +#define NILFS_SKETCH_INO 10 /* Sketch file */ +#define NILFS_USER_INO 11 /* Fisrt user's file inode number */ + +#define NILFS_SB_OFFSET_BYTES 1024 /* byte offset of nilfs superblock */ +#define NILFS_SUPER_MAGIC 0x3434 /* NILFS filesystem magic number */ + +#define NILFS_SEG_MIN_BLOCKS 16 /* Minimum number of blocks in + a full segment */ +#define NILFS_PSEG_MIN_BLOCKS 2 /* Minimum number of blocks in + a partial segment */ +#define NILFS_MIN_NRSVSEGS 8 /* Minimum number of reserved + segments */ + + +/* + * Maximal count of links to a file + */ +#define NILFS_LINK_MAX 32000 + +/* + * Structure of a directory entry + * (Same as ext2) + */ + +#define NILFS_NAME_LEN 255 + +/* + * The new version of the directory entry. Since V0 structures are + * stored in intel byte order, and the name_len field could never be + * bigger than 255 chars, it's safe to reclaim the extra byte for the + * file_type field. + */ +struct nilfs_dir_entry { + __le64 inode; /* Inode number */ + __le16 rec_len; /* Directory entry length */ + __u8 name_len; /* Name length */ + __u8 file_type; + char name[NILFS_NAME_LEN]; /* File name */ + char pad; +}; + +/* + * NILFS directory file types. Only the low 3 bits are used. The + * other bits are reserved for now. 
+ */ +enum { + NILFS_FT_UNKNOWN, + NILFS_FT_REG_FILE, + NILFS_FT_DIR, + NILFS_FT_CHRDEV, + NILFS_FT_BLKDEV, + NILFS_FT_FIFO, + NILFS_FT_SOCK, + NILFS_FT_SYMLINK, + NILFS_FT_MAX +}; + +/* + * NILFS_DIR_PAD defines the directory entries boundaries + * + * NOTE: It must be a multiple of 8 + */ +#define NILFS_DIR_PAD 8 +#define NILFS_DIR_ROUND (NILFS_DIR_PAD - 1) +#define NILFS_DIR_REC_LEN(name_len) (((name_len) + 12 + NILFS_DIR_ROUND) & \ + ~NILFS_DIR_ROUND) + + +/** + * struct nilfs_finfo - file information + * @fi_ino: inode number + * @fi_cno: checkpoint number + * @fi_nblocks: number of blocks (including intermediate blocks) + * @fi_ndatablk: number of file data blocks + */ +struct nilfs_finfo { + __le64 fi_ino; + __le64 fi_cno; + __le32 fi_nblocks; + __le32 fi_ndatablk; + /* array of virtual block numbers */ +}; + +/** + * struct nilfs_binfo_v - information for the block to which a virtual block number is assigned + * @bi_vblocknr: virtual block number + * @bi_blkoff: block offset + */ +struct nilfs_binfo_v { + __le64 bi_vblocknr; + __le64 bi_blkoff; +}; + +/** + * struct nilfs_binfo_dat - information for the block which belongs to the DAT file + * @bi_blkoff: block offset + * @bi_level: level + * @bi_pad: padding + */ +struct nilfs_binfo_dat { + __le64 bi_blkoff; + __u8 bi_level; + __u8 bi_pad[7]; +}; + +/** + * union nilfs_binfo: block information + * @bi_v: nilfs_binfo_v structure + * @bi_dat: nilfs_binfo_dat structure + */ +union nilfs_binfo { + struct nilfs_binfo_v bi_v; + struct nilfs_binfo_dat bi_dat; +}; + +/** + * struct nilfs_segment_summary - segment summary + * @ss_datasum: checksum of data + * @ss_sumsum: checksum of segment summary + * @ss_magic: magic number + * @ss_bytes: size of this structure in bytes + * @ss_flags: flags + * @ss_seq: sequence number + * @ss_create: creation timestamp + * @ss_next: next segment + * @ss_nblocks: number of blocks + * @ss_nfinfo: number of finfo structures + * @ss_sumbytes: total size of segment summary in bytes + * @ss_pad: padding + */ +struct nilfs_segment_summary { + __le32 ss_datasum; + __le32 ss_sumsum; + __le32 ss_magic; + __le16 ss_bytes; + __le16 ss_flags; + __le64 ss_seq; + __le64 ss_create; + __le64 ss_next; + __le32 ss_nblocks; + __le32 ss_nfinfo; + __le32 ss_sumbytes; + __le32 ss_pad; + /* array of finfo structures */ +}; + +#define NILFS_SEGSUM_MAGIC 0x1eaffa11 /* segment summary magic number */ + +/* + * Segment summary flags + */ +#define NILFS_SS_LOGBGN 0x0001 /* begins a logical segment */ +#define NILFS_SS_LOGEND 0x0002 /* ends a logical segment */ +#define NILFS_SS_SR 0x0004 /* has super root */ +#define NILFS_SS_SYNDT 0x0008 /* includes data only updates */ +#define NILFS_SS_GC 0x0010 /* segment written for cleaner operation */ + +/** + * struct nilfs_palloc_group_desc - block group descriptor + * @pg_nfrees: number of free entries in block group + */ +struct nilfs_palloc_group_desc { + __le32 pg_nfrees; +}; + +/** + * struct nilfs_dat_entry - disk address translation entry + * @dt_blocknr: block number + * @dt_start: start checkpoint number + * @dt_end: end checkpoint number + * @dt_rsv: reserved for future use + */ +struct nilfs_dat_entry { + __le64 de_blocknr; + __le64 de_start; + __le64 de_end; + __le64 de_rsv; +}; + +/** + * struct nilfs_dat_group_desc - block group descriptor + * @dg_nfrees: number of free virtual block numbers in block group + */ +struct nilfs_dat_group_desc { + __le32 dg_nfrees; +}; + + +/** + * struct nilfs_snapshot_list - snapshot list + * @ssl_next: next checkpoint number on snapshot list + 
* @ssl_prev: previous checkpoint number on snapshot list + */ +struct nilfs_snapshot_list { + __le64 ssl_next; + __le64 ssl_prev; +}; + +/** + * struct nilfs_checkpoint - checkpoint structure + * @cp_flags: flags + * @cp_checkpoints_count: checkpoints count in a block + * @cp_snapshot_list: snapshot list + * @cp_cno: checkpoint number + * @cp_create: creation timestamp + * @cp_nblk_inc: number of blocks incremented by this checkpoint + * @cp_inodes_count: inodes count + * @cp_blocks_count: blocks count + * @cp_ifile_inode: inode of ifile + */ +struct nilfs_checkpoint { + __le32 cp_flags; + __le32 cp_checkpoints_count; + struct nilfs_snapshot_list cp_snapshot_list; + __le64 cp_cno; + __le64 cp_create; + __le64 cp_nblk_inc; + __le64 cp_inodes_count; + __le64 cp_blocks_count; /* Reserved (might be deleted) */ + + /* Do not change the byte offset of ifile inode. + To keep the compatibility of the disk format, + additional fields should be added behind cp_ifile_inode. */ + struct nilfs_inode cp_ifile_inode; +}; + +/* checkpoint flags */ +enum { + NILFS_CHECKPOINT_SNAPSHOT, + NILFS_CHECKPOINT_INVALID, + NILFS_CHECKPOINT_SKETCH, +}; + +#define NILFS_CHECKPOINT_FNS(flag, name) \ +static inline void \ +nilfs_checkpoint_set_##name(struct nilfs_checkpoint *cp) \ +{ \ + cp->cp_flags = cpu_to_le32(le32_to_cpu(cp->cp_flags) | \ + (1UL << NILFS_CHECKPOINT_##flag)); \ +} \ +static inline void \ +nilfs_checkpoint_clear_##name(struct nilfs_checkpoint *cp) \ +{ \ + cp->cp_flags = cpu_to_le32(le32_to_cpu(cp->cp_flags) & \ + ~(1UL << NILFS_CHECKPOINT_##flag)); \ +} \ +static inline int \ +nilfs_checkpoint_##name(const struct nilfs_checkpoint *cp) \ +{ \ + return !!(le32_to_cpu(cp->cp_flags) & \ + (1UL << NILFS_CHECKPOINT_##flag)); \ +} + +NILFS_CHECKPOINT_FNS(SNAPSHOT, snapshot) +NILFS_CHECKPOINT_FNS(INVALID, invalid) +NILFS_CHECKPOINT_FNS(SKETCH, sketch) + +/** + * struct nilfs_cpinfo - checkpoint information + * @ci_flags: flags + * @ci_cno: checkpoint number + * @ci_create: creation timestamp + * @ci_nblk_inc: number of blocks incremented by this checkpoint + * @ci_inodes_count: inodes count + * @ci_blocks_count: blocks count + * @ci_next: next checkpoint number in snapshot list + */ +struct nilfs_cpinfo { + __u32 ci_flags; + __u64 ci_cno; + __u64 ci_create; + __u64 ci_nblk_inc; + __u64 ci_inodes_count; + __u64 ci_blocks_count; + __u64 ci_next; +}; + +#define NILFS_CPINFO_FNS(flag, name) \ +static inline int \ +nilfs_cpinfo_##name(const struct nilfs_cpinfo *cpinfo) \ +{ \ + return !!(cpinfo->ci_flags & (1UL << NILFS_CHECKPOINT_##flag)); \ +} + +NILFS_CPINFO_FNS(SNAPSHOT, snapshot) +NILFS_CPINFO_FNS(INVALID, invalid) +NILFS_CPINFO_FNS(SKETCH, sketch) + + +/** + * struct nilfs_cpfile_header - checkpoint file header + * @ch_ncheckpoints: number of checkpoints + * @ch_nsnapshots: number of snapshots + * @ch_snapshot_list: snapshot list + */ +struct nilfs_cpfile_header { + __le64 ch_ncheckpoints; + __le64 ch_nsnapshots; + struct nilfs_snapshot_list ch_snapshot_list; +}; + +#define NILFS_CPFILE_FIRST_CHECKPOINT_OFFSET \ + ((sizeof(struct nilfs_cpfile_header) + \ + sizeof(struct nilfs_checkpoint) - 1) / \ + sizeof(struct nilfs_checkpoint)) + +/** + * struct nilfs_segment_usage - segment usage + * @su_lastmod: last modified timestamp + * @su_nblocks: number of blocks in segment + * @su_flags: flags + */ +struct nilfs_segment_usage { + __le64 su_lastmod; + __le32 su_nblocks; + __le32 su_flags; +}; + +/* segment usage flag */ +enum { + NILFS_SEGMENT_USAGE_ACTIVE, + NILFS_SEGMENT_USAGE_DIRTY, + 
NILFS_SEGMENT_USAGE_ERROR, + + /* on-memory only */ + NILFS_SEGMENT_USAGE_VOLATILE_ACTIVE, + /* ... */ +}; + +#define NILFS_SEGMENT_USAGE_FNS(flag, name) \ +static inline void \ +nilfs_segment_usage_set_##name(struct nilfs_segment_usage *su) \ +{ \ + su->su_flags = cpu_to_le32(le32_to_cpu(su->su_flags) | \ + (1UL << NILFS_SEGMENT_USAGE_##flag));\ +} \ +static inline void \ +nilfs_segment_usage_clear_##name(struct nilfs_segment_usage *su) \ +{ \ + su->su_flags = \ + cpu_to_le32(le32_to_cpu(su->su_flags) & \ + ~(1UL << NILFS_SEGMENT_USAGE_##flag)); \ +} \ +static inline int \ +nilfs_segment_usage_##name(const struct nilfs_segment_usage *su) \ +{ \ + return !!(le32_to_cpu(su->su_flags) & \ + (1UL << NILFS_SEGMENT_USAGE_##flag)); \ +} + +NILFS_SEGMENT_USAGE_FNS(ACTIVE, active) +NILFS_SEGMENT_USAGE_FNS(DIRTY, dirty) +NILFS_SEGMENT_USAGE_FNS(ERROR, error) +NILFS_SEGMENT_USAGE_FNS(VOLATILE_ACTIVE, volatile_active) + +static inline void +nilfs_segment_usage_set_clean(struct nilfs_segment_usage *su) +{ + su->su_lastmod = cpu_to_le64(0); + su->su_nblocks = cpu_to_le32(0); + su->su_flags = cpu_to_le32(0); +} + +static inline int +nilfs_segment_usage_clean(const struct nilfs_segment_usage *su) +{ + return !le32_to_cpu(su->su_flags); +} + +/** + * struct nilfs_sufile_header - segment usage file header + * @sh_ncleansegs: number of clean segments + * @sh_ndirtysegs: number of dirty segments + * @sh_last_alloc: last allocated segment number + */ +struct nilfs_sufile_header { + __le64 sh_ncleansegs; + __le64 sh_ndirtysegs; + __le64 sh_last_alloc; + /* ... */ +}; + +#define NILFS_SUFILE_FIRST_SEGMENT_USAGE_OFFSET \ + ((sizeof(struct nilfs_sufile_header) + \ + sizeof(struct nilfs_segment_usage) - 1) / \ + sizeof(struct nilfs_segment_usage)) + +/** + * nilfs_suinfo - segment usage information + * @sui_lastmod: + * @sui_nblocks: + * @sui_flags: + */ +struct nilfs_suinfo { + __u64 sui_lastmod; + __u32 sui_nblocks; + __u32 sui_flags; +}; + +#define NILFS_SUINFO_FNS(flag, name) \ +static inline int \ +nilfs_suinfo_##name(const struct nilfs_suinfo *si) \ +{ \ + return si->sui_flags & (1UL << NILFS_SEGMENT_USAGE_##flag); \ +} + +NILFS_SUINFO_FNS(ACTIVE, active) +NILFS_SUINFO_FNS(DIRTY, dirty) +NILFS_SUINFO_FNS(ERROR, error) +NILFS_SUINFO_FNS(VOLATILE_ACTIVE, volatile_active) + +static inline int nilfs_suinfo_clean(const struct nilfs_suinfo *si) +{ + return !si->sui_flags; +} + +/* ioctl */ +enum { + NILFS_CHECKPOINT, + NILFS_SNAPSHOT, +}; + +/** + * struct nilfs_cpmode - + * @cc_cno: + * @cc_mode: + */ +struct nilfs_cpmode { + __u64 cm_cno; + int cm_mode; +}; + +/** + * struct nilfs_argv - argument vector + * @v_base: + * @v_nmembs: + * @v_size: + * @v_index: + * @v_flags: + */ +struct nilfs_argv { + void *v_base; + size_t v_nmembs; /* number of members */ + size_t v_size; /* size of members */ + int v_index; + int v_flags; +}; + +/** + * struct nilfs_period - + * @p_start: + * @p_end: + */ +struct nilfs_period { + __u64 p_start; + __u64 p_end; +}; + +/** + * struct nilfs_cpstat - + * @cs_cno: checkpoint number + * @cs_ncps: number of checkpoints + * @cs_nsss: number of snapshots + */ +struct nilfs_cpstat { + __u64 cs_cno; + __u64 cs_ncps; + __u64 cs_nsss; +}; + +/** + * struct nilfs_sustat - + * @ss_nsegs: number of segments + * @ss_ncleansegs: number of clean segments + * @ss_ndirtysegs: number of dirty segments + * @ss_ctime: + * @ss_nongc_ctime: + */ +struct nilfs_sustat { + __u64 ss_nsegs; + __u64 ss_ncleansegs; + __u64 ss_ndirtysegs; + time_t ss_ctime; + time_t ss_nongc_ctime; +}; + +/** + * struct 
nilfs_vinfo - virtual block number information + * @vi_vblocknr: + * @vi_start: + * @vi_end: + * @vi_blocknr: + */ +struct nilfs_vinfo { + __u64 vi_vblocknr; + __u64 vi_start; + __u64 vi_end; + __u64 vi_blocknr; +}; + +/** + * struct nilfs_vdesc - + */ +struct nilfs_vdesc { + __u64 vd_ino; + __u64 vd_cno; + __u64 vd_vblocknr; + struct nilfs_period vd_period; + __u64 vd_blocknr; + __u64 vd_offset; + __u32 vd_flags; +}; + +/** + * struct nilfs_bdesc - + */ +struct nilfs_bdesc { + __u64 bd_ino; + __u64 bd_oblocknr; + __u64 bd_blocknr; + __u64 bd_offset; + __u32 bd_level; +}; + +#define NILFS_TIMEDWAIT_WRITE_LOCKED 0x1 +#define NILFS_TIMEDWAIT_SEG_WRITE 0x2 + +/** + * struct nilfs_wait_cond - + */ +struct nilfs_wait_cond { + int wc_cond; + int wc_flags; + struct timespec wc_timeout; +}; + +#define NILFS_IOCTL_IDENT 'n' + +#define NILFS_IOCTL_CHANGE_CPMODE \ + _IOW(NILFS_IOCTL_IDENT, 0x80, struct nilfs_cpmode) +#define NILFS_IOCTL_DELETE_CHECKPOINT \ + _IOW(NILFS_IOCTL_IDENT, 0x81, __u64) +#define NILFS_IOCTL_GET_CPINFO \ + _IOR(NILFS_IOCTL_IDENT, 0x82, struct nilfs_argv) +#define NILFS_IOCTL_GET_CPSTAT \ + _IOR(NILFS_IOCTL_IDENT, 0x83, struct nilfs_cpstat) +#define NILFS_IOCTL_GET_SUINFO \ + _IOR(NILFS_IOCTL_IDENT, 0x84, struct nilfs_argv) +#define NILFS_IOCTL_GET_SUSTAT \ + _IOR(NILFS_IOCTL_IDENT, 0x85, struct nilfs_sustat) +#define NILFS_IOCTL_GET_VINFO \ + _IOWR(NILFS_IOCTL_IDENT, 0x86, struct nilfs_argv) +#define NILFS_IOCTL_GET_BDESCS \ + _IOWR(NILFS_IOCTL_IDENT, 0x87, struct nilfs_argv) +#define NILFS_IOCTL_CLEAN_SEGMENTS \ + _IOW(NILFS_IOCTL_IDENT, 0x88, struct nilfs_argv[5]) +#define NILFS_IOCTL_TIMEDWAIT \ + _IOWR(NILFS_IOCTL_IDENT, 0x89, struct nilfs_wait_cond) +#define NILFS_IOCTL_SYNC \ + _IOR(NILFS_IOCTL_IDENT, 0x8A, __u64) +#define NILFS_IOCTL_RESIZE \ + _IOW(NILFS_IOCTL_IDENT, 0x8B, __u64) + +/* compat_ioctl */ +#ifdef CONFIG_COMPAT +#include + +struct nilfs_cpmode32 { + __u64 cm_cno; + compat_int_t cm_mode; +}; + +struct nilfs_argv32 { + compat_caddr_t v_base; + compat_size_t v_nmembs; + compat_size_t v_size; + compat_int_t v_index; + compat_int_t v_flags; +}; + +struct nilfs_sustat32 { + __u64 ss_nsegs; + __u64 ss_ncleansegs; + __u64 ss_ndirtysegs; + compat_time_t ss_ctime; + compat_time_t ss_nongc_ctime; +}; + +struct nilfs_wait_cond32 { + compat_int_t wc_cond; + compat_int_t wc_flags; + struct compat_timespec wc_timeout; +}; + +#define NILFS_IOCTL32_CHANGE_CPMODE \ + _IOW(NILFS_IOCTL_IDENT, 0x80, struct nilfs_cpmode32) +#define NILFS_IOCTL32_GET_CPINFO \ + _IOR(NILFS_IOCTL_IDENT, 0x82, struct nilfs_argv32) +#define NILFS_IOCTL32_GET_SUINFO \ + _IOR(NILFS_IOCTL_IDENT, 0x84, struct nilfs_argv32) +#define NILFS_IOCTL32_GET_SUSTAT \ + _IOR(NILFS_IOCTL_IDENT, 0x85, struct nilfs_sustat32) +#define NILFS_IOCTL32_GET_VINFO \ + _IOWR(NILFS_IOCTL_IDENT, 0x86, struct nilfs_argv32) +#define NILFS_IOCTL32_GET_BDESCS \ + _IOWR(NILFS_IOCTL_IDENT, 0x87, struct nilfs_argv32) +#define NILFS_IOCTL32_CLEAN_SEGMENTS \ + _IOW(NILFS_IOCTL_IDENT, 0x88, struct nilfs_argv32[5]) +#define NILFS_IOCTL32_TIMEDWAIT \ + _IOWR(NILFS_IOCTL_IDENT, 0x89, struct nilfs_wait_cond32) +#endif /* CONFIG_COMPAT */ + +#endif /* _LINUX_NILFS_FS_H */ -- cgit v1.2.3 From 1088dcf4c3a0a27fdad5214781d5084b11405238 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 6 Apr 2009 19:01:51 -0700 Subject: nilfs2: remove timedwait ioctl command This removes NILFS_IOCTL_TIMEDWAIT command from ioctl interface along with the related flags and wait queue. The command is terrible because it just sleeps in the ioctl. 
I prefer to avoid this by devising means of event polling in userland program. By reconsidering the userland GC daemon, I found this is possible without changing behaviour of the daemon and sacrificing efficiency. Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/ioctl.c | 95 +---------------------------------------------- fs/nilfs2/segment.c | 5 +-- fs/nilfs2/the_nilfs.c | 1 - fs/nilfs2/the_nilfs.h | 6 --- include/linux/nilfs2_fs.h | 22 ----------- 5 files changed, 2 insertions(+), 127 deletions(-) (limited to 'include/linux') diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 9e4d9e64c8ff..85a291ccc1be 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -578,62 +578,9 @@ int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs, static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp) { - int ret; - if (!capable(CAP_SYS_ADMIN)) return -EPERM; - - ret = nilfs_clean_segments(inode->i_sb, argp); - clear_nilfs_cond_nongc_write(NILFS_SB(inode->i_sb)->s_nilfs); - return ret; -} - -static int nilfs_ioctl_test_cond(struct the_nilfs *nilfs, int cond) -{ - return (cond & NILFS_TIMEDWAIT_SEG_WRITE) && - nilfs_cond_nongc_write(nilfs); -} - -static void nilfs_ioctl_clear_cond(struct the_nilfs *nilfs, int cond) -{ - if (cond & NILFS_TIMEDWAIT_SEG_WRITE) - clear_nilfs_cond_nongc_write(nilfs); -} - -static int nilfs_ioctl_timedwait(struct inode *inode, struct file *filp, - unsigned int cmd, void __user *argp) -{ - struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs; - struct nilfs_wait_cond wc; - long ret; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - if (copy_from_user(&wc, argp, sizeof(wc))) - return -EFAULT; - - unlock_kernel(); - ret = wc.wc_flags ? - wait_event_interruptible_timeout( - nilfs->ns_cleanerd_wq, - nilfs_ioctl_test_cond(nilfs, wc.wc_cond), - timespec_to_jiffies(&wc.wc_timeout)) : - wait_event_interruptible( - nilfs->ns_cleanerd_wq, - nilfs_ioctl_test_cond(nilfs, wc.wc_cond)); - lock_kernel(); - nilfs_ioctl_clear_cond(nilfs, wc.wc_cond); - - if (ret > 0) { - jiffies_to_timespec(ret, &wc.wc_timeout); - if (copy_to_user(argp, &wc, sizeof(wc))) - return -EFAULT; - return 0; - } - if (ret != 0) - return -EINTR; - - return wc.wc_flags ? 
-ETIME : 0; + return nilfs_clean_segments(inode->i_sb, argp); } static int nilfs_ioctl_sync(struct inode *inode, struct file *filp, @@ -679,8 +626,6 @@ int nilfs_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, return nilfs_ioctl_get_bdescs(inode, filp, cmd, argp); case NILFS_IOCTL_CLEAN_SEGMENTS: return nilfs_ioctl_clean_segments(inode, filp, cmd, argp); - case NILFS_IOCTL_TIMEDWAIT: - return nilfs_ioctl_timedwait(inode, filp, cmd, argp); case NILFS_IOCTL_SYNC: return nilfs_ioctl_sync(inode, filp, cmd, argp); default: @@ -871,41 +816,6 @@ nilfs_compat_ioctl_clean_segments(struct inode *inode, struct file *filp, inode, filp, cmd, (unsigned long)uargv); } -static int -nilfs_compat_ioctl_timedwait(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg) -{ - struct nilfs_wait_cond __user *uwcond; - struct nilfs_wait_cond32 __user *uwcond32; - struct timespec ts; - int cond, flags, ret; - - uwcond = compat_alloc_user_space(sizeof(struct nilfs_wait_cond)); - uwcond32 = compat_ptr(arg); - if (get_user(cond, &uwcond32->wc_cond) || - put_user(cond, &uwcond->wc_cond) || - get_user(flags, &uwcond32->wc_flags) || - put_user(flags, &uwcond->wc_flags) || - get_user(ts.tv_sec, &uwcond32->wc_timeout.tv_sec) || - get_user(ts.tv_nsec, &uwcond32->wc_timeout.tv_nsec) || - put_user(ts.tv_sec, &uwcond->wc_timeout.tv_sec) || - put_user(ts.tv_nsec, &uwcond->wc_timeout.tv_nsec)) - return -EFAULT; - - ret = nilfs_compat_locked_ioctl(inode, filp, cmd, - (unsigned long)uwcond); - if (ret < 0) - return ret; - - if (get_user(ts.tv_sec, &uwcond->wc_timeout.tv_sec) || - get_user(ts.tv_nsec, &uwcond->wc_timeout.tv_nsec) || - put_user(ts.tv_sec, &uwcond32->wc_timeout.tv_sec) || - put_user(ts.tv_nsec, &uwcond32->wc_timeout.tv_nsec)) - return -EFAULT; - - return 0; -} - static int nilfs_compat_ioctl_sync(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg) { @@ -943,9 +853,6 @@ long nilfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case NILFS_IOCTL32_CLEAN_SEGMENTS: return nilfs_compat_ioctl_clean_segments( inode, filp, NILFS_IOCTL_CLEAN_SEGMENTS, arg); - case NILFS_IOCTL32_TIMEDWAIT: - return nilfs_compat_ioctl_timedwait( - inode, filp, NILFS_IOCTL_TIMEDWAIT, arg); case NILFS_IOCTL_SYNC: return nilfs_compat_ioctl_sync(inode, filp, cmd, arg); default: diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 6d66c5cb7b51..5db12d774a03 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2114,11 +2114,8 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci) nilfs_drop_collected_inodes(&sci->sc_gc_inodes); if (update_sr) nilfs_commit_gcdat_inode(nilfs); - } else { + } else nilfs->ns_nongc_ctime = sci->sc_seg_ctime; - set_nilfs_cond_nongc_write(nilfs); - wake_up(&nilfs->ns_cleanerd_wq); - } sci->sc_nblk_inc += sci->sc_nblk_this_inc; diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 852e0bf3a3c5..69b625586226 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -73,7 +73,6 @@ struct the_nilfs *alloc_nilfs(struct block_device *bdev) nilfs->ns_gc_inodes_h = NULL; INIT_LIST_HEAD(&nilfs->ns_used_segments); init_rwsem(&nilfs->ns_segctor_sem); - init_waitqueue_head(&nilfs->ns_cleanerd_wq); return nilfs; } diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index 9cd3c113f052..75da37306964 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -37,7 +37,6 @@ enum { THE_NILFS_LOADED, /* Roll-back/roll-forward has done and the latest checkpoint was loaded */ 
THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */ - THE_NILFS_COND_NONGC_WRITE, /* Condition to wake up cleanerd */ }; /** @@ -74,7 +73,6 @@ enum { * @ns_gc_dat: shadow inode of the DAT file inode for GC * @ns_gc_inodes: dummy inodes to keep live blocks * @ns_gc_inodes_h: hash list to keep dummy inode holding live blocks - * @ns_cleanerd_wq: wait queue for cleanerd * @ns_blocksize_bits: bit length of block size * @ns_nsegments: number of segments in filesystem * @ns_blocks_per_segment: number of blocks per segment @@ -151,9 +149,6 @@ struct the_nilfs { struct list_head ns_gc_inodes; struct hlist_head *ns_gc_inodes_h; - /* cleanerd */ - wait_queue_head_t ns_cleanerd_wq; - /* Disk layout information (static) */ unsigned int ns_blocksize_bits; unsigned long ns_nsegments; @@ -186,7 +181,6 @@ static inline int nilfs_##name(struct the_nilfs *nilfs) \ THE_NILFS_FNS(INIT, init) THE_NILFS_FNS(LOADED, loaded) THE_NILFS_FNS(DISCONTINUED, discontinued) -THE_NILFS_FNS(COND_NONGC_WRITE, cond_nongc_write) void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64); struct the_nilfs *alloc_nilfs(struct block_device *); diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h index e38fad2f7c0b..b0a6b39eedbc 100644 --- a/include/linux/nilfs2_fs.h +++ b/include/linux/nilfs2_fs.h @@ -763,18 +763,6 @@ struct nilfs_bdesc { __u32 bd_level; }; -#define NILFS_TIMEDWAIT_WRITE_LOCKED 0x1 -#define NILFS_TIMEDWAIT_SEG_WRITE 0x2 - -/** - * struct nilfs_wait_cond - - */ -struct nilfs_wait_cond { - int wc_cond; - int wc_flags; - struct timespec wc_timeout; -}; - #define NILFS_IOCTL_IDENT 'n' #define NILFS_IOCTL_CHANGE_CPMODE \ @@ -795,8 +783,6 @@ struct nilfs_wait_cond { _IOWR(NILFS_IOCTL_IDENT, 0x87, struct nilfs_argv) #define NILFS_IOCTL_CLEAN_SEGMENTS \ _IOW(NILFS_IOCTL_IDENT, 0x88, struct nilfs_argv[5]) -#define NILFS_IOCTL_TIMEDWAIT \ - _IOWR(NILFS_IOCTL_IDENT, 0x89, struct nilfs_wait_cond) #define NILFS_IOCTL_SYNC \ _IOR(NILFS_IOCTL_IDENT, 0x8A, __u64) #define NILFS_IOCTL_RESIZE \ @@ -827,12 +813,6 @@ struct nilfs_sustat32 { compat_time_t ss_nongc_ctime; }; -struct nilfs_wait_cond32 { - compat_int_t wc_cond; - compat_int_t wc_flags; - struct compat_timespec wc_timeout; -}; - #define NILFS_IOCTL32_CHANGE_CPMODE \ _IOW(NILFS_IOCTL_IDENT, 0x80, struct nilfs_cpmode32) #define NILFS_IOCTL32_GET_CPINFO \ @@ -847,8 +827,6 @@ struct nilfs_wait_cond32 { _IOWR(NILFS_IOCTL_IDENT, 0x87, struct nilfs_argv32) #define NILFS_IOCTL32_CLEAN_SEGMENTS \ _IOW(NILFS_IOCTL_IDENT, 0x88, struct nilfs_argv32[5]) -#define NILFS_IOCTL32_TIMEDWAIT \ - _IOWR(NILFS_IOCTL_IDENT, 0x89, struct nilfs_wait_cond32) #endif /* CONFIG_COMPAT */ #endif /* _LINUX_NILFS_FS_H */ -- cgit v1.2.3 From dc498d09be28172846cacded35ca2378222a8c7b Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 6 Apr 2009 19:01:52 -0700 Subject: nilfs2: use fixed sized types for ioctl structures Nilfs ioctl had structures not having fixed sized types such as: struct nilfs_argv { void *v_base; size_t v_nmembs; size_t v_size; int v_index; int v_flags; }; Further, some of them are wrongly aligned: e.g. struct nilfs_cpmode { __u64 cm_cno; int cm_mode; }; The size of wrongly aligned structures varies depending on architectures, and it breaks the identity of ioctl commands, which leads to arch dependent errors. Previously, these are compensated by using compat_ioctl. This fixes these problems and allows removal of compat ioctl. 
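To make the breakage concrete, here is a minimal userspace sketch (illustration only, not taken from the patch; the 12/16-byte figures assume the usual i386 and x86_64 GCC ABIs, where a 64-bit member is 4-byte aligned inside a struct on i386 but 8-byte aligned on x86_64):

/*
 * Why an arch-dependent struct size breaks ioctl command identity:
 * _IOW() encodes sizeof(arg) into the command number.
 */
#include <stdio.h>
#include <stdint.h>
#include <linux/ioctl.h>

struct old_cpmode {		/* old layout: __u64 followed by a bare int */
	uint64_t cm_cno;
	int	 cm_mode;
};				/* sizeof: 12 on i386, 16 on x86_64 */

struct new_cpmode {		/* new layout: fixed-width, explicitly padded */
	uint64_t cm_cno;
	uint32_t cm_mode;
	uint32_t cm_pad;
};				/* sizeof: 16 on both */

int main(void)
{
	printf("old: size %zu cmd %#x\n", sizeof(struct old_cpmode),
	       (unsigned int)_IOW('n', 0x80, struct old_cpmode));
	printf("new: size %zu cmd %#x\n", sizeof(struct new_cpmode),
	       (unsigned int)_IOW('n', 0x80, struct new_cpmode));
	return 0;
}

Built with -m32 and -m64, the old layout yields two different command numbers for what is meant to be the same ioctl, while the new layout yields identical ones.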
Since this will change sizes of those structures, binary compatibility for the past utilities will once break; new utilities have to be used instead. However, it would be helpful to avoid platform dependent problems in the long term. Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/ioctl.c | 11 +++++------ include/linux/nilfs2_fs.h | 23 ++++++++++++++--------- 2 files changed, 19 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 85a291ccc1be..7fbd9fe1d035 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -41,6 +41,7 @@ static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs, void *, size_t, size_t)) { void *buf; + void __user *base = (void __user *)(unsigned long)argv->v_base; size_t maxmembs, total, n; ssize_t nr; int ret, i; @@ -64,9 +65,8 @@ static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs, n = (argv->v_nmembs - i < maxmembs) ? argv->v_nmembs - i : maxmembs; if ((dir & _IOC_WRITE) && - copy_from_user(buf, - (void __user *)argv->v_base + argv->v_size * i, - argv->v_size * n)) { + copy_from_user(buf, base + argv->v_size * i, + argv->v_size * n)) { ret = -EFAULT; break; } @@ -78,9 +78,8 @@ static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs, break; } if ((dir & _IOC_READ) && - copy_to_user( - (void __user *)argv->v_base + argv->v_size * i, - buf, argv->v_size * nr)) { + copy_to_user(base + argv->v_size * i, buf, + argv->v_size * nr)) { ret = -EFAULT; break; } diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h index b0a6b39eedbc..8fb64ce285fd 100644 --- a/include/linux/nilfs2_fs.h +++ b/include/linux/nilfs2_fs.h @@ -499,6 +499,7 @@ NILFS_CHECKPOINT_FNS(SKETCH, sketch) /** * struct nilfs_cpinfo - checkpoint information * @ci_flags: flags + * @ci_pad: padding * @ci_cno: checkpoint number * @ci_create: creation timestamp * @ci_nblk_inc: number of blocks incremented by this checkpoint @@ -508,6 +509,7 @@ NILFS_CHECKPOINT_FNS(SKETCH, sketch) */ struct nilfs_cpinfo { __u32 ci_flags; + __u32 ci_pad; __u64 ci_cno; __u64 ci_create; __u64 ci_nblk_inc; @@ -668,7 +670,8 @@ enum { */ struct nilfs_cpmode { __u64 cm_cno; - int cm_mode; + __u32 cm_mode; + __u32 cm_pad; }; /** @@ -676,15 +679,15 @@ struct nilfs_cpmode { * @v_base: * @v_nmembs: * @v_size: - * @v_index: * @v_flags: + * @v_index: */ struct nilfs_argv { - void *v_base; - size_t v_nmembs; /* number of members */ - size_t v_size; /* size of members */ - int v_index; - int v_flags; + __u64 v_base; + __u32 v_nmembs; /* number of members */ + __u16 v_size; /* size of members */ + __u16 v_flags; + __u64 v_index; }; /** @@ -721,8 +724,8 @@ struct nilfs_sustat { __u64 ss_nsegs; __u64 ss_ncleansegs; __u64 ss_ndirtysegs; - time_t ss_ctime; - time_t ss_nongc_ctime; + __u64 ss_ctime; + __u64 ss_nongc_ctime; }; /** @@ -750,6 +753,7 @@ struct nilfs_vdesc { __u64 vd_blocknr; __u64 vd_offset; __u32 vd_flags; + __u32 vd_pad; }; /** @@ -761,6 +765,7 @@ struct nilfs_bdesc { __u64 bd_blocknr; __u64 bd_offset; __u32 bd_level; + __u32 bd_pad; }; #define NILFS_IOCTL_IDENT 'n' -- cgit v1.2.3 From 8082d36aed26c4fb6ed43e4008303682eabf839e Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 6 Apr 2009 19:01:53 -0700 Subject: nilfs2: remove compat ioctl code This removes compat code from the nilfs ioctls and applies the same function for both .ioctl and .compat_ioctl file operations. 
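This is safe only because, after the previous patch, every ioctl argument structure consists of fixed-width, explicitly padded members, so 32-bit and 64-bit callers hand the kernel byte-identical structures. A minimal sketch of that invariant (not taken from the patch; the helper name is made up, and the expected sizes and offsets follow from the structure definitions above):

#include <linux/kernel.h>
#include <linux/nilfs2_fs.h>

/* Compile-time layout checks; BUILD_BUG_ON() fails the build if a later
 * change reintroduces an arch-dependent layout. */
static void __attribute__((unused)) nilfs_ioctl_layout_check(void)
{
	BUILD_BUG_ON(sizeof(struct nilfs_cpmode) != 16);
	BUILD_BUG_ON(sizeof(struct nilfs_argv) != 24);
	BUILD_BUG_ON(offsetof(struct nilfs_argv, v_index) != 16);
}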
Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/dir.c | 2 +- fs/nilfs2/file.c | 2 +- fs/nilfs2/ioctl.c | 228 ---------------------------------------------- fs/nilfs2/nilfs.h | 1 - include/linux/nilfs2_fs.h | 41 --------- 5 files changed, 2 insertions(+), 272 deletions(-) (limited to 'include/linux') diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index 1b7e6ddabbeb..393316cd3cad 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -704,7 +704,7 @@ struct file_operations nilfs_dir_operations = { .readdir = nilfs_readdir, .ioctl = nilfs_ioctl, #ifdef CONFIG_COMPAT - .compat_ioctl = nilfs_compat_ioctl, + .compat_ioctl = nilfs_ioctl, #endif /* CONFIG_COMPAT */ .fsync = nilfs_sync_file, diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index cd38124372f3..a2bd962ebd85 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -142,7 +142,7 @@ struct file_operations nilfs_file_operations = { .aio_write = generic_file_aio_write, .ioctl = nilfs_ioctl, #ifdef CONFIG_COMPAT - .compat_ioctl = nilfs_compat_ioctl, + .compat_ioctl = nilfs_ioctl, #endif /* CONFIG_COMPAT */ .mmap = nilfs_file_mmap, .open = generic_file_open, diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 7fbd9fe1d035..33aff8842ce9 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -631,231 +631,3 @@ int nilfs_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, return -ENOTTY; } } - -/* compat_ioctl */ -#ifdef CONFIG_COMPAT -#include - -static int nilfs_compat_locked_ioctl(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg) -{ - int ret; - - lock_kernel(); - ret = nilfs_ioctl(inode, filp, cmd, arg); - unlock_kernel(); - return ret; -} - -static int -nilfs_compat_ioctl_uargv32_to_uargv(struct nilfs_argv32 __user *uargv32, - struct nilfs_argv __user *uargv) -{ - compat_uptr_t base; - compat_size_t nmembs, size; - compat_int_t index, flags; - - if (get_user(base, &uargv32->v_base) || - put_user(compat_ptr(base), &uargv->v_base) || - get_user(nmembs, &uargv32->v_nmembs) || - put_user(nmembs, &uargv->v_nmembs) || - get_user(size, &uargv32->v_size) || - put_user(size, &uargv->v_size) || - get_user(index, &uargv32->v_index) || - put_user(index, &uargv->v_index) || - get_user(flags, &uargv32->v_flags) || - put_user(flags, &uargv->v_flags)) - return -EFAULT; - return 0; -} - -static int -nilfs_compat_ioctl_uargv_to_uargv32(struct nilfs_argv __user *uargv, - struct nilfs_argv32 __user *uargv32) -{ - size_t nmembs; - - if (get_user(nmembs, &uargv->v_nmembs) || - put_user(nmembs, &uargv32->v_nmembs)) - return -EFAULT; - return 0; -} - -static int -nilfs_compat_ioctl_get_by_argv(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg) -{ - struct nilfs_argv __user *uargv; - struct nilfs_argv32 __user *uargv32; - int ret; - - uargv = compat_alloc_user_space(sizeof(struct nilfs_argv)); - uargv32 = compat_ptr(arg); - ret = nilfs_compat_ioctl_uargv32_to_uargv(uargv32, uargv); - if (ret < 0) - return ret; - - ret = nilfs_compat_locked_ioctl(inode, filp, cmd, (unsigned long)uargv); - if (ret < 0) - return ret; - - return nilfs_compat_ioctl_uargv_to_uargv32(uargv, uargv32); -} - -static int -nilfs_compat_ioctl_change_cpmode(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg) -{ - struct nilfs_cpmode __user *ucpmode; - struct nilfs_cpmode32 __user *ucpmode32; - int mode; - - ucpmode = compat_alloc_user_space(sizeof(struct nilfs_cpmode)); - ucpmode32 = compat_ptr(arg); - if (copy_in_user(&ucpmode->cm_cno, 
&ucpmode32->cm_cno, - sizeof(__u64)) || - get_user(mode, &ucpmode32->cm_mode) || - put_user(mode, &ucpmode->cm_mode)) - return -EFAULT; - - return nilfs_compat_locked_ioctl( - inode, filp, cmd, (unsigned long)ucpmode); -} - - -static inline int -nilfs_compat_ioctl_delete_checkpoint(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg) -{ - return nilfs_compat_locked_ioctl(inode, filp, cmd, arg); -} - -static inline int -nilfs_compat_ioctl_get_cpinfo(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg) -{ - return nilfs_compat_ioctl_get_by_argv(inode, filp, cmd, arg); -} - -static inline int -nilfs_compat_ioctl_get_cpstat(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg) -{ - return nilfs_compat_locked_ioctl(inode, filp, cmd, arg); -} - -static inline int -nilfs_compat_ioctl_get_suinfo(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg) -{ - return nilfs_compat_ioctl_get_by_argv(inode, filp, cmd, arg); -} - -static int -nilfs_compat_ioctl_get_sustat(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg) -{ - struct nilfs_sustat __user *usustat; - struct nilfs_sustat32 __user *usustat32; - time_t ctime, nongc_ctime; - int ret; - - usustat = compat_alloc_user_space(sizeof(struct nilfs_sustat)); - ret = nilfs_compat_locked_ioctl(inode, filp, cmd, - (unsigned long)usustat); - if (ret < 0) - return ret; - - usustat32 = compat_ptr(arg); - if (copy_in_user(&usustat32->ss_nsegs, &usustat->ss_nsegs, - sizeof(__u64)) || - copy_in_user(&usustat32->ss_ncleansegs, &usustat->ss_ncleansegs, - sizeof(__u64)) || - copy_in_user(&usustat32->ss_ndirtysegs, &usustat->ss_ndirtysegs, - sizeof(__u64)) || - get_user(ctime, &usustat->ss_ctime) || - put_user(ctime, &usustat32->ss_ctime) || - get_user(nongc_ctime, &usustat->ss_nongc_ctime) || - put_user(nongc_ctime, &usustat32->ss_nongc_ctime)) - return -EFAULT; - return 0; -} - -static inline int -nilfs_compat_ioctl_get_vinfo(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg) -{ - return nilfs_compat_ioctl_get_by_argv(inode, filp, cmd, arg); -} - -static inline int -nilfs_compat_ioctl_get_bdescs(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg) -{ - return nilfs_compat_ioctl_get_by_argv(inode, filp, cmd, arg); -} - -static int -nilfs_compat_ioctl_clean_segments(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg) -{ - struct nilfs_argv __user *uargv; - struct nilfs_argv32 __user *uargv32; - int i, ret; - - uargv = compat_alloc_user_space(sizeof(struct nilfs_argv) * 5); - uargv32 = compat_ptr(arg); - for (i = 0; i < 5; i++) { - ret = nilfs_compat_ioctl_uargv32_to_uargv(&uargv32[i], - &uargv[i]); - if (ret < 0) - return ret; - } - return nilfs_compat_locked_ioctl( - inode, filp, cmd, (unsigned long)uargv); -} - -static int nilfs_compat_ioctl_sync(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg) -{ - return nilfs_compat_locked_ioctl(inode, filp, cmd, arg); -} - -long nilfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) -{ - struct inode *inode = filp->f_dentry->d_inode; - - switch (cmd) { - case NILFS_IOCTL32_CHANGE_CPMODE: - return nilfs_compat_ioctl_change_cpmode( - inode, filp, NILFS_IOCTL_CHANGE_CPMODE, arg); - case NILFS_IOCTL_DELETE_CHECKPOINT: - return nilfs_compat_ioctl_delete_checkpoint( - inode, filp, cmd, arg); - case NILFS_IOCTL32_GET_CPINFO: - return nilfs_compat_ioctl_get_cpinfo( - inode, filp, 
NILFS_IOCTL_GET_CPINFO, arg); - case NILFS_IOCTL_GET_CPSTAT: - return nilfs_compat_ioctl_get_cpstat(inode, filp, cmd, arg); - case NILFS_IOCTL32_GET_SUINFO: - return nilfs_compat_ioctl_get_suinfo( - inode, filp, NILFS_IOCTL_GET_SUINFO, arg); - case NILFS_IOCTL32_GET_SUSTAT: - return nilfs_compat_ioctl_get_sustat( - inode, filp, NILFS_IOCTL_GET_SUSTAT, arg); - case NILFS_IOCTL32_GET_VINFO: - return nilfs_compat_ioctl_get_vinfo( - inode, filp, NILFS_IOCTL_GET_VINFO, arg); - case NILFS_IOCTL32_GET_BDESCS: - return nilfs_compat_ioctl_get_bdescs( - inode, filp, NILFS_IOCTL_GET_BDESCS, arg); - case NILFS_IOCTL32_CLEAN_SEGMENTS: - return nilfs_compat_ioctl_clean_segments( - inode, filp, NILFS_IOCTL_CLEAN_SEGMENTS, arg); - case NILFS_IOCTL_SYNC: - return nilfs_compat_ioctl_sync(inode, filp, cmd, arg); - default: - return -ENOIOCTLCMD; - } -} -#endif diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index 48c070676cc5..f767644a7242 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -243,7 +243,6 @@ extern int nilfs_sync_file(struct file *, struct dentry *, int); /* ioctl.c */ int nilfs_ioctl(struct inode *, struct file *, unsigned int, unsigned long); -long nilfs_compat_ioctl(struct file *, unsigned int, unsigned long); int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *, void __user *); /* inode.c */ diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h index 8fb64ce285fd..306c446e694e 100644 --- a/include/linux/nilfs2_fs.h +++ b/include/linux/nilfs2_fs.h @@ -793,45 +793,4 @@ struct nilfs_bdesc { #define NILFS_IOCTL_RESIZE \ _IOW(NILFS_IOCTL_IDENT, 0x8B, __u64) -/* compat_ioctl */ -#ifdef CONFIG_COMPAT -#include - -struct nilfs_cpmode32 { - __u64 cm_cno; - compat_int_t cm_mode; -}; - -struct nilfs_argv32 { - compat_caddr_t v_base; - compat_size_t v_nmembs; - compat_size_t v_size; - compat_int_t v_index; - compat_int_t v_flags; -}; - -struct nilfs_sustat32 { - __u64 ss_nsegs; - __u64 ss_ncleansegs; - __u64 ss_ndirtysegs; - compat_time_t ss_ctime; - compat_time_t ss_nongc_ctime; -}; - -#define NILFS_IOCTL32_CHANGE_CPMODE \ - _IOW(NILFS_IOCTL_IDENT, 0x80, struct nilfs_cpmode32) -#define NILFS_IOCTL32_GET_CPINFO \ - _IOR(NILFS_IOCTL_IDENT, 0x82, struct nilfs_argv32) -#define NILFS_IOCTL32_GET_SUINFO \ - _IOR(NILFS_IOCTL_IDENT, 0x84, struct nilfs_argv32) -#define NILFS_IOCTL32_GET_SUSTAT \ - _IOR(NILFS_IOCTL_IDENT, 0x85, struct nilfs_sustat32) -#define NILFS_IOCTL32_GET_VINFO \ - _IOWR(NILFS_IOCTL_IDENT, 0x86, struct nilfs_argv32) -#define NILFS_IOCTL32_GET_BDESCS \ - _IOWR(NILFS_IOCTL_IDENT, 0x87, struct nilfs_argv32) -#define NILFS_IOCTL32_CLEAN_SEGMENTS \ - _IOW(NILFS_IOCTL_IDENT, 0x88, struct nilfs_argv32[5]) -#endif /* CONFIG_COMPAT */ - #endif /* _LINUX_NILFS_FS_H */ -- cgit v1.2.3 From 2c2e52fc4fca251e68f90821c9ff5cb18be4df58 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 6 Apr 2009 19:01:54 -0700 Subject: nilfs2: extend nilfs_sustat ioctl struct This adds a new argument to the nilfs_sustat structure. The extended field allows to delete volatile active state of segments, which was needed to protect freshly-created segments from garbage collection but has confused code dealing with segments. This extension alleviates the mess and gives room for further simplifications. The volatile active flag is not persistent, so it's eliminable on this occasion without affecting compatibility other than the ioctl change. 
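For illustration only (this sketch is not part of the patch), a userspace consumer of the extended structure could read the new field through the existing NILFS_IOCTL_GET_SUSTAT ioctl. It assumes the struct nilfs_sustat layout added by the hunks below; the mount-point path is a placeholder.

        #include <stdio.h>
        #include <fcntl.h>
        #include <unistd.h>
        #include <sys/ioctl.h>
        #include <linux/nilfs2_fs.h>

        int main(void)
        {
                struct nilfs_sustat sustat;
                int fd = open("/mnt/nilfs", O_RDONLY); /* placeholder mount point */

                if (fd < 0)
                        return 1;
                if (ioctl(fd, NILFS_IOCTL_GET_SUSTAT, &sustat) < 0) {
                        perror("NILFS_IOCTL_GET_SUSTAT");
                        close(fd);
                        return 1;
                }
                /* ss_prot_seq is the added field: the least sequence number of
                 * the segments that must not be reclaimed. */
                printf("segments=%llu clean=%llu dirty=%llu prot_seq=%llu\n",
                       (unsigned long long)sustat.ss_nsegs,
                       (unsigned long long)sustat.ss_ncleansegs,
                       (unsigned long long)sustat.ss_ndirtysegs,
                       (unsigned long long)sustat.ss_prot_seq);
                close(fd);
                return 0;
        }

A cleaner-style tool would compare segment sequence numbers against ss_prot_seq before selecting reclaim victims.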
Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/recovery.c | 32 +++++++++++++++++--------------- fs/nilfs2/segment.c | 39 +++++++++------------------------------ fs/nilfs2/sufile.c | 8 ++++++-- fs/nilfs2/super.c | 4 +++- fs/nilfs2/the_nilfs.c | 18 ------------------ fs/nilfs2/the_nilfs.h | 5 ++--- include/linux/nilfs2_fs.h | 10 ++++------ 7 files changed, 41 insertions(+), 75 deletions(-) (limited to 'include/linux') diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index 877dc1ba23f3..a4253f34e138 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -416,6 +416,7 @@ static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs, struct nilfs_segment_entry *ent, *n; struct inode *sufile = nilfs->ns_sufile; __u64 segnum[4]; + time_t mtime; int err; int i; @@ -442,9 +443,9 @@ static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs, /* * Collecting segments written after the latest super root. - * These are marked volatile active, and won't be reallocated in - * the next construction. + * These are marked dirty to avoid being reallocated in the next write. */ + mtime = get_seconds(); list_for_each_entry_safe(ent, n, head, list) { if (ent->segnum == segnum[0]) { list_del(&ent->list); @@ -454,17 +455,16 @@ static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs, err = nilfs_open_segment_entry(ent, sufile); if (unlikely(err)) goto failed; - if (nilfs_segment_usage_clean(ent->raw_su)) { - nilfs_segment_usage_set_volatile_active(ent->raw_su); - /* Keep it open */ - } else { - /* Removing duplicated entries */ - list_del(&ent->list); - nilfs_close_segment_entry(ent, sufile); - nilfs_free_segment_entry(ent); + if (!nilfs_segment_usage_dirty(ent->raw_su)) { + /* make the segment garbage */ + ent->raw_su->su_nblocks = cpu_to_le32(0); + ent->raw_su->su_lastmod = cpu_to_le32(mtime); + nilfs_segment_usage_set_dirty(ent->raw_su); } + list_del(&ent->list); + nilfs_close_segment_entry(ent, sufile); + nilfs_free_segment_entry(ent); } - list_splice_init(head, nilfs->ns_used_segments.prev); /* * The segment having the latest super root is active, and @@ -882,10 +882,12 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, if (scan_newer) ri->ri_need_recovery = NILFS_RECOVERY_SR_UPDATED; - else if (nilfs->ns_mount_state & NILFS_VALID_FS) - goto super_root_found; - - scan_newer = 1; + else { + nilfs->ns_prot_seq = ssi.seg_seq; + if (nilfs->ns_mount_state & NILFS_VALID_FS) + goto super_root_found; + scan_newer = 1; + } /* reset region for roll-forward */ pseg_start += ssi.nblocks; diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 5db12d774a03..24d0fbd4271c 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2229,13 +2229,6 @@ static void nilfs_segctor_reactivate_segments(struct nilfs_sc_info *sci, nilfs_segment_usage_set_active(ent->raw_su); nilfs_close_segment_entry(ent, sufile); } - - down_write(&nilfs->ns_sem); - head = &nilfs->ns_used_segments; - list_for_each_entry(ent, head, list) { - nilfs_segment_usage_set_volatile_active(ent->raw_su); - } - up_write(&nilfs->ns_sem); } static int nilfs_segctor_deactivate_segments(struct nilfs_sc_info *sci, @@ -2244,7 +2237,6 @@ static int nilfs_segctor_deactivate_segments(struct nilfs_sc_info *sci, struct nilfs_segment_buffer *segbuf, *last; struct nilfs_segment_entry *ent; struct inode *sufile = nilfs->ns_sufile; - struct list_head *head; int err; last = NILFS_LAST_SEGBUF(&sci->sc_segbufs); @@ -2265,22 +2257,13 @@ 
static int nilfs_segctor_deactivate_segments(struct nilfs_sc_info *sci, BUG_ON(!buffer_dirty(ent->bh_su)); } - head = &sci->sc_active_segments; - list_for_each_entry(ent, head, list) { + list_for_each_entry(ent, &sci->sc_active_segments, list) { err = nilfs_open_segment_entry(ent, sufile); if (unlikely(err)) goto failed; nilfs_segment_usage_clear_active(ent->raw_su); BUG_ON(!buffer_dirty(ent->bh_su)); } - - down_write(&nilfs->ns_sem); - head = &nilfs->ns_used_segments; - list_for_each_entry(ent, head, list) { - /* clear volatile active for segments of older generations */ - nilfs_segment_usage_clear_volatile_active(ent->raw_su); - } - up_write(&nilfs->ns_sem); return 0; failed: @@ -2304,19 +2287,15 @@ static void nilfs_segctor_bead_completed_segments(struct nilfs_sc_info *sci) } } -static void -__nilfs_segctor_commit_deactivate_segments(struct nilfs_sc_info *sci, - struct the_nilfs *nilfs) - +static void nilfs_segctor_commit_deactivate_segments(struct nilfs_sc_info *sci, + struct the_nilfs *nilfs) { - struct nilfs_segment_entry *ent; - - list_splice_init(&sci->sc_active_segments, - nilfs->ns_used_segments.prev); + struct nilfs_segment_entry *ent, *n; - list_for_each_entry(ent, &nilfs->ns_used_segments, list) { - nilfs_segment_usage_set_volatile_active(ent->raw_su); - /* These segments are kept open */ + list_for_each_entry_safe(ent, n, &sci->sc_active_segments, list) { + list_del(&ent->list); + nilfs_close_segment_entry(ent, nilfs->ns_sufile); + nilfs_free_segment_entry(ent); } } @@ -2405,8 +2384,8 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) if (has_sr) { down_write(&nilfs->ns_sem); nilfs_update_last_segment(sbi, 1); - __nilfs_segctor_commit_deactivate_segments(sci, nilfs); up_write(&nilfs->ns_sem); + nilfs_segctor_commit_deactivate_segments(sci, nilfs); nilfs_segctor_commit_free_segments(sci); nilfs_segctor_clear_metadata_dirty(sci); } diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index b3674a8162a1..cc714c72b138 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -446,6 +446,7 @@ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat) { struct buffer_head *header_bh; struct nilfs_sufile_header *header; + struct the_nilfs *nilfs = NILFS_MDT(sufile)->mi_nilfs; void *kaddr; int ret; @@ -460,8 +461,11 @@ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat) sustat->ss_nsegs = nilfs_sufile_get_nsegments(sufile); sustat->ss_ncleansegs = le64_to_cpu(header->sh_ncleansegs); sustat->ss_ndirtysegs = le64_to_cpu(header->sh_ndirtysegs); - sustat->ss_ctime = NILFS_MDT(sufile)->mi_nilfs->ns_ctime; - sustat->ss_nongc_ctime = NILFS_MDT(sufile)->mi_nilfs->ns_nongc_ctime; + sustat->ss_ctime = nilfs->ns_ctime; + sustat->ss_nongc_ctime = nilfs->ns_nongc_ctime; + spin_lock(&nilfs->ns_last_segment_lock); + sustat->ss_prot_seq = nilfs->ns_prot_seq; + spin_unlock(&nilfs->ns_last_segment_lock); kunmap_atomic(kaddr, KM_USER0); brelse(header_bh); diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 268b563d215a..2f0e9f7bf152 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -262,8 +262,10 @@ static int nilfs_sync_super(struct nilfs_sb_info *sbi) printk(KERN_ERR "NILFS: unable to write superblock (err=%d)\n", err); else { - nilfs_dispose_used_segments(nilfs); clear_nilfs_discontinued(nilfs); + spin_lock(&nilfs->ns_last_segment_lock); + nilfs->ns_prot_seq = le64_to_cpu(nilfs->ns_sbp->s_last_seq); + spin_unlock(&nilfs->ns_last_segment_lock); } return err; diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c 
index 69b625586226..661ab762d765 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -71,7 +71,6 @@ struct the_nilfs *alloc_nilfs(struct block_device *bdev) INIT_LIST_HEAD(&nilfs->ns_supers); spin_lock_init(&nilfs->ns_last_segment_lock); nilfs->ns_gc_inodes_h = NULL; - INIT_LIST_HEAD(&nilfs->ns_used_segments); init_rwsem(&nilfs->ns_segctor_sem); return nilfs; @@ -95,7 +94,6 @@ void put_nilfs(struct the_nilfs *nilfs) */ might_sleep(); if (nilfs_loaded(nilfs)) { - nilfs_dispose_used_segments(nilfs); nilfs_mdt_clear(nilfs->ns_sufile); nilfs_mdt_destroy(nilfs->ns_sufile); nilfs_mdt_clear(nilfs->ns_cpfile); @@ -463,22 +461,6 @@ int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks) return err; } -void nilfs_dispose_used_segments(struct the_nilfs *nilfs) -{ - struct nilfs_segment_entry *ent, *n; - - /* nilfs->sem must be locked by the caller. */ - if (!nilfs_loaded(nilfs)) - return; - - list_for_each_entry_safe(ent, n, &nilfs->ns_used_segments, list) { - list_del_init(&ent->list); - nilfs_segment_usage_clear_volatile_active(ent->raw_su); - nilfs_close_segment_entry(ent, nilfs->ns_sufile); - nilfs_free_segment_entry(ent); - } -} - int nilfs_near_disk_full(struct the_nilfs *nilfs) { struct inode *sufile = nilfs->ns_sufile; diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index 75da37306964..af566e78f7af 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -51,7 +51,6 @@ enum { * @ns_writer_refcount: number of referrers on ns_writer * @ns_sbh: buffer head of the on-disk super block * @ns_sbp: pointer to the super block data - * @ns_used_segments: list of full segments in volatile active state * @ns_supers: list of nilfs super block structs * @ns_seg_seq: segment sequence counter * @ns_segnum: index number of the latest full segment. @@ -65,6 +64,7 @@ enum { * @ns_last_pseg: start block number of the latest segment * @ns_last_seq: sequence value of the latest segment * @ns_last_cno: checkpoint number of the latest segment + * @ns_prot_seq: least sequence number of segments which must not be reclaimed * @ns_free_segments_count: counter of free segments * @ns_segctor_sem: segment constructor semaphore * @ns_dat: DAT file inode @@ -103,7 +103,6 @@ struct the_nilfs { */ struct buffer_head *ns_sbh; struct nilfs_super_block *ns_sbp; - struct list_head ns_used_segments; unsigned ns_mount_state; struct list_head ns_supers; @@ -132,6 +131,7 @@ struct the_nilfs { sector_t ns_last_pseg; u64 ns_last_seq; __u64 ns_last_cno; + u64 ns_prot_seq; unsigned long ns_free_segments_count; struct rw_semaphore ns_segctor_sem; @@ -188,7 +188,6 @@ void put_nilfs(struct the_nilfs *); int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *); int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *); int nilfs_count_free_blocks(struct the_nilfs *, sector_t *); -void nilfs_dispose_used_segments(struct the_nilfs *); int nilfs_checkpoint_is_mounted(struct the_nilfs *, __u64, int); int nilfs_near_disk_full(struct the_nilfs *); diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h index 306c446e694e..aa93f0ee29d4 100644 --- a/include/linux/nilfs2_fs.h +++ b/include/linux/nilfs2_fs.h @@ -565,8 +565,6 @@ enum { NILFS_SEGMENT_USAGE_DIRTY, NILFS_SEGMENT_USAGE_ERROR, - /* on-memory only */ - NILFS_SEGMENT_USAGE_VOLATILE_ACTIVE, /* ... 
*/ }; @@ -594,7 +592,6 @@ nilfs_segment_usage_##name(const struct nilfs_segment_usage *su) \ NILFS_SEGMENT_USAGE_FNS(ACTIVE, active) NILFS_SEGMENT_USAGE_FNS(DIRTY, dirty) NILFS_SEGMENT_USAGE_FNS(ERROR, error) -NILFS_SEGMENT_USAGE_FNS(VOLATILE_ACTIVE, volatile_active) static inline void nilfs_segment_usage_set_clean(struct nilfs_segment_usage *su) @@ -650,7 +647,6 @@ nilfs_suinfo_##name(const struct nilfs_suinfo *si) \ NILFS_SUINFO_FNS(ACTIVE, active) NILFS_SUINFO_FNS(DIRTY, dirty) NILFS_SUINFO_FNS(ERROR, error) -NILFS_SUINFO_FNS(VOLATILE_ACTIVE, volatile_active) static inline int nilfs_suinfo_clean(const struct nilfs_suinfo *si) { @@ -717,8 +713,9 @@ struct nilfs_cpstat { * @ss_nsegs: number of segments * @ss_ncleansegs: number of clean segments * @ss_ndirtysegs: number of dirty segments - * @ss_ctime: - * @ss_nongc_ctime: + * @ss_ctime: creation time of the last segment + * @ss_nongc_ctime: creation time of the last segment not for GC + * @ss_prot_seq: least sequence number of segments which must not be reclaimed */ struct nilfs_sustat { __u64 ss_nsegs; @@ -726,6 +723,7 @@ struct nilfs_sustat { __u64 ss_ndirtysegs; __u64 ss_ctime; __u64 ss_nongc_ctime; + __u64 ss_prot_seq; }; /** -- cgit v1.2.3 From 458c5b0822a669d170fdb7bb16c9145f652ebe06 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 6 Apr 2009 19:01:56 -0700 Subject: nilfs2: clean up sketch file The sketch file is a file to mark checkpoints with user data. It was experimentally introduced in the original implementation, and now obsolete. The file was handled differently with regular files; the file size got truncated when a checkpoint was created. This stops the special treatment and will treat it as a regular file. Most users are not affected because mkfs.nilfs2 no longer makes this file. Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/nilfs2.txt | 2 -- fs/nilfs2/inode.c | 35 ++------------------------ fs/nilfs2/segment.c | 49 +----------------------------------- fs/nilfs2/segment.h | 8 ------ include/linux/nilfs2_fs.h | 2 -- 5 files changed, 3 insertions(+), 93 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/nilfs2.txt b/Documentation/filesystems/nilfs2.txt index 3367fc44388d..55c4300abfcb 100644 --- a/Documentation/filesystems/nilfs2.txt +++ b/Documentation/filesystems/nilfs2.txt @@ -161,8 +161,6 @@ the following meta data files: 4) Data address translation file -- Maps virtual block numbers to usual (DAT) block numbers. This file serves to make on-disk blocks relocatable. 
- 5) Sketch file (sketch) -- Keeps read-only data which can be - associated with checkpoints (optional) The following figure shows a typical organization of the logs: diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index b6536bb2a324..a1922b17662c 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -418,30 +418,6 @@ int nilfs_read_inode_common(struct inode *inode, return 0; } -static int nilfs_read_sketch_inode(struct inode *inode) -{ - struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb); - int err = 0; - - if (sbi->s_snapshot_cno) { - struct the_nilfs *nilfs = sbi->s_nilfs; - struct buffer_head *bh_cp; - struct nilfs_checkpoint *raw_cp; - - err = nilfs_cpfile_get_checkpoint( - nilfs->ns_cpfile, sbi->s_snapshot_cno, 0, &raw_cp, - &bh_cp); - if (likely(!err)) { - if (!nilfs_checkpoint_sketch(raw_cp)) - inode->i_size = 0; - nilfs_cpfile_put_checkpoint( - nilfs->ns_cpfile, sbi->s_snapshot_cno, bh_cp); - } - inode->i_flags |= S_NOCMTIME; - } - return err; -} - static int __nilfs_read_inode(struct super_block *sb, unsigned long ino, struct inode *inode) { @@ -469,11 +445,6 @@ static int __nilfs_read_inode(struct super_block *sb, unsigned long ino, inode->i_op = &nilfs_file_inode_operations; inode->i_fop = &nilfs_file_operations; inode->i_mapping->a_ops = &nilfs_aops; - if (unlikely(inode->i_ino == NILFS_SKETCH_INO)) { - err = nilfs_read_sketch_inode(inode); - if (unlikely(err)) - goto failed_unmap; - } } else if (S_ISDIR(inode->i_mode)) { inode->i_op = &nilfs_dir_inode_operations; inode->i_fop = &nilfs_dir_operations; @@ -742,8 +713,7 @@ int nilfs_set_file_dirty(struct nilfs_sb_info *sbi, struct inode *inode, atomic_add(nr_dirty, &sbi->s_nilfs->ns_ndirtyblks); - if (test_and_set_bit(NILFS_I_DIRTY, &ii->i_state) || - unlikely(inode->i_ino == NILFS_SKETCH_INO)) + if (test_and_set_bit(NILFS_I_DIRTY, &ii->i_state)) return 0; spin_lock(&sbi->s_inode_lock); @@ -811,7 +781,6 @@ void nilfs_dirty_inode(struct inode *inode) return; } nilfs_transaction_begin(inode->i_sb, &ti, 0); - if (likely(inode->i_ino != NILFS_SKETCH_INO)) - nilfs_mark_inode_dirty(inode); + nilfs_mark_inode_dirty(inode); nilfs_transaction_commit(inode->i_sb); /* never fails */ } diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 9a87410985b9..981c34a0cd69 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -67,7 +67,6 @@ enum { NILFS_ST_INIT = 0, NILFS_ST_GC, /* Collecting dirty blocks for GC */ NILFS_ST_FILE, - NILFS_ST_SKETCH, NILFS_ST_IFILE, NILFS_ST_CPFILE, NILFS_ST_SUFILE, @@ -887,8 +886,7 @@ static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci) cpu_to_le64(sci->sc_nblk_inc + sci->sc_nblk_this_inc); raw_cp->cp_create = cpu_to_le64(sci->sc_seg_ctime); raw_cp->cp_cno = cpu_to_le64(nilfs->ns_cno); - if (sci->sc_sketch_inode && i_size_read(sci->sc_sketch_inode) > 0) - nilfs_checkpoint_set_sketch(raw_cp); + nilfs_write_inode_common(sbi->s_ifile, &raw_cp->cp_ifile_inode, 1); nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, bh_cp); return 0; @@ -923,11 +921,6 @@ static void nilfs_segctor_fill_in_file_bmap(struct nilfs_sc_info *sci, nilfs_fill_in_file_bmap(ifile, ii); set_bit(NILFS_I_COLLECTED, &ii->i_state); } - if (sci->sc_sketch_inode) { - ii = NILFS_I(sci->sc_sketch_inode); - if (test_bit(NILFS_I_DIRTY, &ii->i_state)) - nilfs_fill_in_file_bmap(ifile, ii); - } } /* @@ -1228,26 +1221,6 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) sci->sc_stage.scnt = NILFS_ST_DONE; return 0; } - sci->sc_stage.scnt++; /* Fall through */ - case NILFS_ST_SKETCH: - 
if (mode == SC_LSEG_SR && sci->sc_sketch_inode) { - ii = NILFS_I(sci->sc_sketch_inode); - if (test_bit(NILFS_I_DIRTY, &ii->i_state)) { - sci->sc_sketch_inode->i_ctime.tv_sec - = sci->sc_seg_ctime; - sci->sc_sketch_inode->i_mtime.tv_sec - = sci->sc_seg_ctime; - err = nilfs_mark_inode_dirty( - sci->sc_sketch_inode); - if (unlikely(err)) - goto break_or_fail; - } - err = nilfs_segctor_scan_file(sci, - sci->sc_sketch_inode, - &nilfs_sc_file_ops); - if (unlikely(err)) - goto break_or_fail; - } sci->sc_stage.scnt++; sci->sc_stage.flags |= NILFS_CF_IFILE_STARTED; /* Fall through */ @@ -2385,13 +2358,6 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) } while (sci->sc_stage.scnt != NILFS_ST_DONE); - /* Clearing sketch data */ - if (has_sr && sci->sc_sketch_inode) { - if (i_size_read(sci->sc_sketch_inode) == 0) - clear_bit(NILFS_I_DIRTY, - &NILFS_I(sci->sc_sketch_inode)->i_state); - i_size_write(sci->sc_sketch_inode, 0); - } out: nilfs_segctor_destroy_segment_buffers(sci); nilfs_segctor_check_out_files(sci, sbi); @@ -2971,11 +2937,6 @@ static int nilfs_segctor_init(struct nilfs_sc_info *sci, struct nilfs_recovery_info *ri) { int err; - struct inode *inode = nilfs_iget(sci->sc_super, NILFS_SKETCH_INO); - - sci->sc_sketch_inode = IS_ERR(inode) ? NULL : inode; - if (sci->sc_sketch_inode) - i_size_write(sci->sc_sketch_inode, 0); sci->sc_seq_done = sci->sc_seq_request; if (ri) @@ -2987,10 +2948,6 @@ static int nilfs_segctor_init(struct nilfs_sc_info *sci, if (ri) list_splice_init(&sci->sc_active_segments, ri->ri_used_segments.prev); - if (sci->sc_sketch_inode) { - iput(sci->sc_sketch_inode); - sci->sc_sketch_inode = NULL; - } } return err; } @@ -3090,10 +3047,6 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci) WARN_ON(!list_empty(&sci->sc_segbufs)); - if (sci->sc_sketch_inode) { - iput(sci->sc_sketch_inode); - sci->sc_sketch_inode = NULL; - } down_write(&sbi->s_nilfs->ns_segctor_sem); kfree(sci); diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h index 2dd39da9f386..fbd162d71707 100644 --- a/fs/nilfs2/segment.h +++ b/fs/nilfs2/segment.h @@ -108,7 +108,6 @@ struct nilfs_segsum_pointer { * @sc_nblk_this_inc: Number of blocks included in the current logical segment * @sc_seg_ctime: Creation time * @sc_flags: Internal flags - * @sc_sketch_inode: Inode of the sketch file * @sc_state_lock: spinlock for sc_state and so on * @sc_state: Segctord state flags * @sc_flush_request: inode bitmap of metadata files to be flushed @@ -158,13 +157,6 @@ struct nilfs_sc_info { unsigned long sc_flags; - /* - * Pointer to an inode of the sketch. - * This pointer is kept only while it contains data. - * We protect it with a semaphore of the segment constructor. 
- */ - struct inode *sc_sketch_inode; - spinlock_t sc_state_lock; unsigned long sc_state; unsigned long sc_flush_request; diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h index aa93f0ee29d4..e9c84aa4a8e1 100644 --- a/include/linux/nilfs2_fs.h +++ b/include/linux/nilfs2_fs.h @@ -494,7 +494,6 @@ nilfs_checkpoint_##name(const struct nilfs_checkpoint *cp) \ NILFS_CHECKPOINT_FNS(SNAPSHOT, snapshot) NILFS_CHECKPOINT_FNS(INVALID, invalid) -NILFS_CHECKPOINT_FNS(SKETCH, sketch) /** * struct nilfs_cpinfo - checkpoint information @@ -527,7 +526,6 @@ nilfs_cpinfo_##name(const struct nilfs_cpinfo *cpinfo) \ NILFS_CPINFO_FNS(SNAPSHOT, snapshot) NILFS_CPINFO_FNS(INVALID, invalid) -NILFS_CPINFO_FNS(SKETCH, sketch) /** -- cgit v1.2.3 From c96fa464a567a2a8796009af0e79bc68af73f485 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 6 Apr 2009 19:01:57 -0700 Subject: nilfs2: mark minor flag for checkpoint created by internal operation Nilfs creates checkpoints even for garbage collection or metadata updates such as checkpoint mode change. So, user often sees checkpoints created only by such internal operations. This is inconvenient in some situations. For example, application that monitors checkpoints and changes them to snapshots, will fall into an infinite loop because it cannot distinguish internally created checkpoints. This patch solves this sort of problem by adding a flag to checkpoint for identification. Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/segment.c | 9 +++++++++ fs/nilfs2/segment.h | 3 +++ include/linux/nilfs2_fs.h | 3 +++ 3 files changed, 15 insertions(+) (limited to 'include/linux') diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 981c34a0cd69..2879704509fd 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -462,6 +462,9 @@ static void nilfs_segctor_begin_finfo(struct nilfs_sc_info *sci, sci->sc_binfo_ptr = sci->sc_finfo_ptr; nilfs_segctor_map_segsum_entry( sci, &sci->sc_binfo_ptr, sizeof(struct nilfs_finfo)); + + if (inode->i_sb && !test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags)) + set_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags); /* skip finfo */ } @@ -887,6 +890,11 @@ static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci) raw_cp->cp_create = cpu_to_le64(sci->sc_seg_ctime); raw_cp->cp_cno = cpu_to_le64(nilfs->ns_cno); + if (test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags)) + nilfs_checkpoint_clear_minor(raw_cp); + else + nilfs_checkpoint_set_minor(raw_cp); + nilfs_write_inode_common(sbi->s_ifile, &raw_cp->cp_ifile_inode, 1); nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, bh_cp); return 0; @@ -2091,6 +2099,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci) nilfs_set_last_segment(nilfs, segbuf->sb_pseg_start, segbuf->sb_sum.seg_seq, nilfs->ns_cno); + clear_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags); clear_bit(NILFS_SC_DIRTY, &sci->sc_flags); set_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags); } else diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h index fbd162d71707..bb7d417fec62 100644 --- a/fs/nilfs2/segment.h +++ b/fs/nilfs2/segment.h @@ -185,6 +185,9 @@ enum { NILFS_SC_SUPER_ROOT, /* The latest segment has a super root */ NILFS_SC_PRIOR_FLUSH, /* Requesting immediate flush without making a checkpoint */ + NILFS_SC_HAVE_DELTA, /* Next checkpoint will have update of files + other than DAT, cpfile, sufile, or files + moved by GC */ }; /* sc_state */ diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h index 
e9c84aa4a8e1..cbce6647f7fd 100644 --- a/include/linux/nilfs2_fs.h +++ b/include/linux/nilfs2_fs.h @@ -470,6 +470,7 @@ enum { NILFS_CHECKPOINT_SNAPSHOT, NILFS_CHECKPOINT_INVALID, NILFS_CHECKPOINT_SKETCH, + NILFS_CHECKPOINT_MINOR, }; #define NILFS_CHECKPOINT_FNS(flag, name) \ @@ -494,6 +495,7 @@ nilfs_checkpoint_##name(const struct nilfs_checkpoint *cp) \ NILFS_CHECKPOINT_FNS(SNAPSHOT, snapshot) NILFS_CHECKPOINT_FNS(INVALID, invalid) +NILFS_CHECKPOINT_FNS(MINOR, minor) /** * struct nilfs_cpinfo - checkpoint information @@ -526,6 +528,7 @@ nilfs_cpinfo_##name(const struct nilfs_cpinfo *cpinfo) \ NILFS_CPINFO_FNS(SNAPSHOT, snapshot) NILFS_CPINFO_FNS(INVALID, invalid) +NILFS_CPINFO_FNS(MINOR, minor) /** -- cgit v1.2.3 From e339ad31f59925b48a92ee3947692fdf9758b8c7 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 6 Apr 2009 19:01:59 -0700 Subject: nilfs2: introduce secondary super block The former versions didn't have extra super blocks. This improves the weak point by introducing another super block at unused region in tail of the partition. This doesn't break disk format compatibility; older versions just ingore the secondary super block, and new versions just recover it if it doesn't exist. The partition created by an old mkfs may not have unused region, but in that case, the secondary super block will not be added. This doesn't make more redundant copies of the super block; it is a future work. Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/nilfs.h | 7 +- fs/nilfs2/recovery.c | 1 - fs/nilfs2/segment.c | 8 +- fs/nilfs2/segment.h | 2 - fs/nilfs2/super.c | 229 +++++++++++++++++++--------------------------- fs/nilfs2/the_nilfs.c | 180 +++++++++++++++++++++++++++++++----- fs/nilfs2/the_nilfs.h | 18 +++- include/linux/nilfs2_fs.h | 4 + 8 files changed, 274 insertions(+), 175 deletions(-) (limited to 'include/linux') diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index a7f5bc724e33..19af5ab86275 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -275,13 +275,10 @@ extern void nilfs_error(struct super_block *, const char *, const char *, ...) extern void nilfs_warning(struct super_block *, const char *, const char *, ...) 
__attribute__ ((format (printf, 3, 4))); extern struct nilfs_super_block * -nilfs_load_super_block(struct super_block *, struct buffer_head **); -extern struct nilfs_super_block * -nilfs_reload_super_block(struct super_block *, struct buffer_head **, int); +nilfs_read_super_block(struct super_block *, u64, int, struct buffer_head **); extern int nilfs_store_magic_and_option(struct super_block *, struct nilfs_super_block *, char *); -extern void nilfs_update_last_segment(struct nilfs_sb_info *, int); -extern int nilfs_commit_super(struct nilfs_sb_info *); +extern int nilfs_commit_super(struct nilfs_sb_info *, int); extern int nilfs_attach_checkpoint(struct nilfs_sb_info *, __u64); extern void nilfs_detach_checkpoint(struct nilfs_sb_info *); diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index 6ab4c8fc5e9f..6ade0963fc1d 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -870,7 +870,6 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, if (scan_newer) ri->ri_need_recovery = NILFS_RECOVERY_SR_UPDATED; else { - nilfs->ns_prot_seq = ssi.seg_seq; if (nilfs->ns_mount_state & NILFS_VALID_FS) goto super_root_found; scan_newer = 1; diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index e43558d50e78..fb70ec3be20e 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2068,7 +2068,8 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci) if (update_sr) { nilfs_set_last_segment(nilfs, segbuf->sb_pseg_start, - segbuf->sb_sum.seg_seq, nilfs->ns_cno); + segbuf->sb_sum.seg_seq, nilfs->ns_cno++); + sbi->s_super->s_dirt = 1; clear_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags); clear_bit(NILFS_SC_DIRTY, &sci->sc_flags); @@ -2224,9 +2225,6 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) /* Commit segments */ if (has_sr) { - down_write(&nilfs->ns_sem); - nilfs_update_last_segment(sbi, 1); - up_write(&nilfs->ns_sem); nilfs_segctor_commit_free_segments(sci); nilfs_segctor_clear_metadata_dirty(sci); } @@ -2564,7 +2562,7 @@ static int nilfs_segctor_construct(struct nilfs_sc_info *sci, if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) && nilfs_discontinued(nilfs)) { down_write(&nilfs->ns_sem); - req->sb_err = nilfs_commit_super(sbi); + req->sb_err = nilfs_commit_super(sbi, 0); up_write(&nilfs->ns_sem); } } diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h index 4a64eb82f1f5..a98fc1ed0bbb 100644 --- a/fs/nilfs2/segment.h +++ b/fs/nilfs2/segment.h @@ -206,8 +206,6 @@ enum { logical segment with a super root */ #define NILFS_SC_DEFAULT_SR_FREQ 30 /* Maximum frequency of super root creation */ -#define NILFS_SC_DEFAULT_SB_FREQ 30 /* Minimum interval of periodical - update of superblock (reserved) */ /* * The default threshold amount of data, in block counts. 
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index ef31e9a51c84..e2ced824c624 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -103,8 +103,9 @@ void nilfs_error(struct super_block *sb, const char *function, down_write(&nilfs->ns_sem); if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) { nilfs->ns_mount_state |= NILFS_ERROR_FS; - nilfs->ns_sbp->s_state |= cpu_to_le16(NILFS_ERROR_FS); - nilfs_commit_super(sbi); + nilfs->ns_sbp[0]->s_state |= + cpu_to_le16(NILFS_ERROR_FS); + nilfs_commit_super(sbi, 1); } up_write(&nilfs->ns_sem); @@ -208,90 +209,106 @@ static void nilfs_clear_inode(struct inode *inode) nilfs_btnode_cache_clear(&ii->i_btnode_cache); } -/** - * nilfs_update_last_segment - change pointer to the latest segment - * @sbi: nilfs_sb_info - * @update_cno: flag whether to update checkpoint number. - * - * nilfs_update_last_segment() changes information in the super block - * after a partial segment is written out successfully. The super - * block is marked dirty. It will be written out at the next VFS sync - * operations such as sync_supers() and generic_shutdown_super(). - */ -void nilfs_update_last_segment(struct nilfs_sb_info *sbi, int update_cno) -{ - struct the_nilfs *nilfs = sbi->s_nilfs; - struct nilfs_super_block *sbp = nilfs->ns_sbp; - - /* nilfs->sem must be locked by the caller. */ - spin_lock(&nilfs->ns_last_segment_lock); - if (update_cno) - nilfs->ns_last_cno = nilfs->ns_cno++; - sbp->s_last_seq = cpu_to_le64(nilfs->ns_last_seq); - sbp->s_last_pseg = cpu_to_le64(nilfs->ns_last_pseg); - sbp->s_last_cno = cpu_to_le64(nilfs->ns_last_cno); - spin_unlock(&nilfs->ns_last_segment_lock); - - sbi->s_super->s_dirt = 1; /* must be set if delaying the call of - nilfs_commit_super() */ -} - -static int nilfs_sync_super(struct nilfs_sb_info *sbi) +static int nilfs_sync_super(struct nilfs_sb_info *sbi, int dupsb) { struct the_nilfs *nilfs = sbi->s_nilfs; int err; int barrier_done = 0; if (nilfs_test_opt(sbi, BARRIER)) { - set_buffer_ordered(nilfs->ns_sbh); + set_buffer_ordered(nilfs->ns_sbh[0]); barrier_done = 1; } retry: - set_buffer_dirty(nilfs->ns_sbh); - err = sync_dirty_buffer(nilfs->ns_sbh); + set_buffer_dirty(nilfs->ns_sbh[0]); + err = sync_dirty_buffer(nilfs->ns_sbh[0]); if (err == -EOPNOTSUPP && barrier_done) { nilfs_warning(sbi->s_super, __func__, "barrier-based sync failed. " "disabling barriers\n"); nilfs_clear_opt(sbi, BARRIER); barrier_done = 0; - clear_buffer_ordered(nilfs->ns_sbh); + clear_buffer_ordered(nilfs->ns_sbh[0]); goto retry; } - if (unlikely(err)) + if (unlikely(err)) { printk(KERN_ERR "NILFS: unable to write superblock (err=%d)\n", err); - else { + if (err == -EIO && nilfs->ns_sbh[1]) { + nilfs_fall_back_super_block(nilfs); + goto retry; + } + } else { + struct nilfs_super_block *sbp = nilfs->ns_sbp[0]; + + /* + * The latest segment becomes trailable from the position + * written in superblock. 
+ */ clear_nilfs_discontinued(nilfs); - spin_lock(&nilfs->ns_last_segment_lock); - nilfs->ns_prot_seq = le64_to_cpu(nilfs->ns_sbp->s_last_seq); - spin_unlock(&nilfs->ns_last_segment_lock); + + /* update GC protection for recent segments */ + if (nilfs->ns_sbh[1]) { + sbp = NULL; + if (dupsb) { + set_buffer_dirty(nilfs->ns_sbh[1]); + if (!sync_dirty_buffer(nilfs->ns_sbh[1])) + sbp = nilfs->ns_sbp[1]; + } + } + if (sbp) { + spin_lock(&nilfs->ns_last_segment_lock); + nilfs->ns_prot_seq = le64_to_cpu(sbp->s_last_seq); + spin_unlock(&nilfs->ns_last_segment_lock); + } } return err; } -int nilfs_commit_super(struct nilfs_sb_info *sbi) +int nilfs_commit_super(struct nilfs_sb_info *sbi, int dupsb) { struct the_nilfs *nilfs = sbi->s_nilfs; - struct nilfs_super_block *sbp = nilfs->ns_sbp; + struct nilfs_super_block **sbp = nilfs->ns_sbp; sector_t nfreeblocks; + time_t t; int err; /* nilfs->sem must be locked by the caller. */ + if (sbp[0]->s_magic != NILFS_SUPER_MAGIC) { + if (sbp[1] && sbp[1]->s_magic == NILFS_SUPER_MAGIC) + nilfs_swap_super_block(nilfs); + else { + printk(KERN_CRIT "NILFS: superblock broke on dev %s\n", + sbi->s_super->s_id); + return -EIO; + } + } err = nilfs_count_free_blocks(nilfs, &nfreeblocks); if (unlikely(err)) { printk(KERN_ERR "NILFS: failed to count free blocks\n"); return err; } - sbp->s_free_blocks_count = cpu_to_le64(nfreeblocks); - sbp->s_wtime = cpu_to_le64(get_seconds()); - sbp->s_sum = 0; - sbp->s_sum = cpu_to_le32(crc32_le(nilfs->ns_crc_seed, - (unsigned char *)sbp, - le16_to_cpu(sbp->s_bytes))); + spin_lock(&nilfs->ns_last_segment_lock); + sbp[0]->s_last_seq = cpu_to_le64(nilfs->ns_last_seq); + sbp[0]->s_last_pseg = cpu_to_le64(nilfs->ns_last_pseg); + sbp[0]->s_last_cno = cpu_to_le64(nilfs->ns_last_cno); + spin_unlock(&nilfs->ns_last_segment_lock); + + t = get_seconds(); + nilfs->ns_sbwtime[0] = t; + sbp[0]->s_free_blocks_count = cpu_to_le64(nfreeblocks); + sbp[0]->s_wtime = cpu_to_le64(t); + sbp[0]->s_sum = 0; + sbp[0]->s_sum = cpu_to_le32(crc32_le(nilfs->ns_crc_seed, + (unsigned char *)sbp[0], + nilfs->ns_sbsize)); + if (dupsb && sbp[1]) { + memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); + nilfs->ns_sbwtime[1] = t; + } sbi->s_super->s_dirt = 0; - return nilfs_sync_super(sbi); + return nilfs_sync_super(sbi, dupsb); } static void nilfs_put_super(struct super_block *sb) @@ -303,8 +320,8 @@ static void nilfs_put_super(struct super_block *sb) if (!(sb->s_flags & MS_RDONLY)) { down_write(&nilfs->ns_sem); - nilfs->ns_sbp->s_state = cpu_to_le16(nilfs->ns_mount_state); - nilfs_commit_super(sbi); + nilfs->ns_sbp[0]->s_state = cpu_to_le16(nilfs->ns_mount_state); + nilfs_commit_super(sbi, 1); up_write(&nilfs->ns_sem); } @@ -330,7 +347,7 @@ static void nilfs_put_super(struct super_block *sb) * 2. down_write(&nilfs->ns_sem) * * Inside NILFS, locking ns_sem is enough to protect s_dirt and the buffer - * of the super block (nilfs->ns_sbp). + * of the super block (nilfs->ns_sbp[]). * * In most cases, VFS functions call lock_super() before calling these * methods. 
So we must be careful not to bring on deadlocks when using @@ -346,8 +363,19 @@ static void nilfs_write_super(struct super_block *sb) struct the_nilfs *nilfs = sbi->s_nilfs; down_write(&nilfs->ns_sem); - if (!(sb->s_flags & MS_RDONLY)) - nilfs_commit_super(sbi); + if (!(sb->s_flags & MS_RDONLY)) { + struct nilfs_super_block **sbp = nilfs->ns_sbp; + u64 t = get_seconds(); + int dupsb; + + if (!nilfs_discontinued(nilfs) && t >= nilfs->ns_sbwtime[0] && + t < nilfs->ns_sbwtime[0] + NILFS_SB_FREQ) { + up_write(&nilfs->ns_sem); + return; + } + dupsb = sbp[1] && t > nilfs->ns_sbwtime[1] + NILFS_ALTSB_FREQ; + nilfs_commit_super(sbi, dupsb); + } sb->s_dirt = 0; up_write(&nilfs->ns_sem); } @@ -436,7 +464,7 @@ static int nilfs_mark_recovery_complete(struct nilfs_sb_info *sbi) down_write(&nilfs->ns_sem); if (!(nilfs->ns_mount_state & NILFS_VALID_FS)) { nilfs->ns_mount_state |= NILFS_VALID_FS; - err = nilfs_commit_super(sbi); + err = nilfs_commit_super(sbi, 1); if (likely(!err)) printk(KERN_INFO "NILFS: recovery complete.\n"); } @@ -652,7 +680,7 @@ nilfs_set_default_options(struct nilfs_sb_info *sbi, static int nilfs_setup_super(struct nilfs_sb_info *sbi) { struct the_nilfs *nilfs = sbi->s_nilfs; - struct nilfs_super_block *sbp = nilfs->ns_sbp; + struct nilfs_super_block *sbp = nilfs->ns_sbp[0]; int max_mnt_count = le16_to_cpu(sbp->s_max_mnt_count); int mnt_count = le16_to_cpu(sbp->s_mnt_count); @@ -674,88 +702,29 @@ static int nilfs_setup_super(struct nilfs_sb_info *sbi) sbp->s_mnt_count = cpu_to_le16(mnt_count + 1); sbp->s_state = cpu_to_le16(le16_to_cpu(sbp->s_state) & ~NILFS_VALID_FS); sbp->s_mtime = cpu_to_le64(get_seconds()); - return nilfs_commit_super(sbi); + return nilfs_commit_super(sbi, 1); } -struct nilfs_super_block * -nilfs_load_super_block(struct super_block *sb, struct buffer_head **pbh) +struct nilfs_super_block *nilfs_read_super_block(struct super_block *sb, + u64 pos, int blocksize, + struct buffer_head **pbh) { - int blocksize; - unsigned long offset, sb_index; - - /* - * Adjusting block size - * Blocksize will be enlarged when it is smaller than hardware - * sector size. - * Disk format of superblock does not change. 
- */ - blocksize = sb_min_blocksize(sb, BLOCK_SIZE); - if (!blocksize) { - printk(KERN_ERR - "NILFS: unable to set blocksize of superblock\n"); - return NULL; - } - sb_index = NILFS_SB_OFFSET_BYTES / blocksize; - offset = NILFS_SB_OFFSET_BYTES % blocksize; + unsigned long long sb_index = pos; + unsigned long offset; + offset = do_div(sb_index, blocksize); *pbh = sb_bread(sb, sb_index); - if (!*pbh) { - printk(KERN_ERR "NILFS: unable to read superblock\n"); + if (!*pbh) return NULL; - } return (struct nilfs_super_block *)((char *)(*pbh)->b_data + offset); } -struct nilfs_super_block * -nilfs_reload_super_block(struct super_block *sb, struct buffer_head **pbh, - int blocksize) -{ - struct nilfs_super_block *sbp; - unsigned long offset, sb_index; - int hw_blocksize = bdev_hardsect_size(sb->s_bdev); - - if (blocksize < hw_blocksize) { - printk(KERN_ERR - "NILFS: blocksize %d too small for device " - "(sector-size = %d).\n", - blocksize, hw_blocksize); - goto failed_sbh; - } - brelse(*pbh); - sb_set_blocksize(sb, blocksize); - - sb_index = NILFS_SB_OFFSET_BYTES / blocksize; - offset = NILFS_SB_OFFSET_BYTES % blocksize; - - *pbh = sb_bread(sb, sb_index); - if (!*pbh) { - printk(KERN_ERR - "NILFS: cannot read superblock on 2nd try.\n"); - goto failed; - } - - sbp = (struct nilfs_super_block *)((char *)(*pbh)->b_data + offset); - if (sbp->s_magic != cpu_to_le16(NILFS_SUPER_MAGIC)) { - printk(KERN_ERR - "NILFS: !? Magic mismatch on 2nd try.\n"); - goto failed_sbh; - } - return sbp; - - failed_sbh: - brelse(*pbh); - - failed: - return NULL; -} - int nilfs_store_magic_and_option(struct super_block *sb, struct nilfs_super_block *sbp, char *data) { struct nilfs_sb_info *sbi = NILFS_SB(sb); - /* trying to fill super (1st stage) */ sb->s_magic = le16_to_cpu(sbp->s_magic); /* FS independent flags */ @@ -763,11 +732,6 @@ int nilfs_store_magic_and_option(struct super_block *sb, sb->s_flags |= MS_NOATIME; #endif - if (sb->s_magic != NILFS_SUPER_MAGIC) { - printk("NILFS: Can't find nilfs on dev %s.\n", sb->s_id); - return -EINVAL; - } - nilfs_set_default_options(sbi, sbp); sbi->s_resuid = le16_to_cpu(sbp->s_def_resuid); @@ -775,10 +739,7 @@ int nilfs_store_magic_and_option(struct super_block *sb, sbi->s_interval = le32_to_cpu(sbp->s_c_interval); sbi->s_watermark = le32_to_cpu(sbp->s_c_block_max); - if (!parse_options(data, sb)) - return -EINVAL; - - return 0; + return !parse_options(data, sb) ? -EINVAL : 0 ; } /** @@ -967,12 +928,12 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) * the RDONLY flag and then mark the partition as valid again. 
*/ down_write(&nilfs->ns_sem); - sbp = nilfs->ns_sbp; + sbp = nilfs->ns_sbp[0]; if (!(sbp->s_state & le16_to_cpu(NILFS_VALID_FS)) && (nilfs->ns_mount_state & NILFS_VALID_FS)) sbp->s_state = cpu_to_le16(nilfs->ns_mount_state); sbp->s_mtime = cpu_to_le64(get_seconds()); - nilfs_commit_super(sbi); + nilfs_commit_super(sbi, 1); up_write(&nilfs->ns_sem); } else { /* diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 661ab762d765..33400cf0bbe2 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -25,6 +25,7 @@ #include #include #include +#include #include "nilfs.h" #include "segment.h" #include "alloc.h" @@ -105,7 +106,8 @@ void put_nilfs(struct the_nilfs *nilfs) } if (nilfs_init(nilfs)) { nilfs_destroy_gccache(nilfs); - brelse(nilfs->ns_sbh); + brelse(nilfs->ns_sbh[0]); + brelse(nilfs->ns_sbh[1]); } kfree(nilfs); } @@ -115,6 +117,7 @@ static int nilfs_load_super_root(struct the_nilfs *nilfs, { struct buffer_head *bh_sr; struct nilfs_super_root *raw_sr; + struct nilfs_super_block **sbp = nilfs->ns_sbp; unsigned dat_entry_size, segment_usage_size, checkpoint_size; unsigned inode_size; int err; @@ -124,9 +127,9 @@ static int nilfs_load_super_root(struct the_nilfs *nilfs, return err; down_read(&nilfs->ns_sem); - dat_entry_size = le16_to_cpu(nilfs->ns_sbp->s_dat_entry_size); - checkpoint_size = le16_to_cpu(nilfs->ns_sbp->s_checkpoint_size); - segment_usage_size = le16_to_cpu(nilfs->ns_sbp->s_segment_usage_size); + dat_entry_size = le16_to_cpu(sbp[0]->s_dat_entry_size); + checkpoint_size = le16_to_cpu(sbp[0]->s_checkpoint_size); + segment_usage_size = le16_to_cpu(sbp[0]->s_segment_usage_size); up_read(&nilfs->ns_sem); inode_size = nilfs->ns_inode_size; @@ -270,11 +273,8 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) nilfs_mdt_destroy(nilfs->ns_dat); goto failed; } - if (ri.ri_need_recovery == NILFS_RECOVERY_SR_UPDATED) { - down_write(&nilfs->ns_sem); - nilfs_update_last_segment(sbi, 0); - up_write(&nilfs->ns_sem); - } + if (ri.ri_need_recovery == NILFS_RECOVERY_SR_UPDATED) + sbi->s_super->s_dirt = 1; } set_nilfs_loaded(nilfs); @@ -296,9 +296,8 @@ static unsigned long long nilfs_max_size(unsigned int blkbits) return res; } -static int -nilfs_store_disk_layout(struct the_nilfs *nilfs, struct super_block *sb, - struct nilfs_super_block *sbp) +static int nilfs_store_disk_layout(struct the_nilfs *nilfs, + struct nilfs_super_block *sbp) { if (le32_to_cpu(sbp->s_rev_level) != NILFS_CURRENT_REV) { printk(KERN_ERR "NILFS: revision mismatch " @@ -309,6 +308,10 @@ nilfs_store_disk_layout(struct the_nilfs *nilfs, struct super_block *sb, NILFS_CURRENT_REV, NILFS_MINOR_REV); return -EINVAL; } + nilfs->ns_sbsize = le16_to_cpu(sbp->s_bytes); + if (nilfs->ns_sbsize > BLOCK_SIZE) + return -EINVAL; + nilfs->ns_inode_size = le16_to_cpu(sbp->s_inode_size); nilfs->ns_first_ino = le32_to_cpu(sbp->s_first_ino); @@ -330,6 +333,122 @@ nilfs_store_disk_layout(struct the_nilfs *nilfs, struct super_block *sb, return 0; } +static int nilfs_valid_sb(struct nilfs_super_block *sbp) +{ + static unsigned char sum[4]; + const int sumoff = offsetof(struct nilfs_super_block, s_sum); + size_t bytes; + u32 crc; + + if (!sbp || le16_to_cpu(sbp->s_magic) != NILFS_SUPER_MAGIC) + return 0; + bytes = le16_to_cpu(sbp->s_bytes); + if (bytes > BLOCK_SIZE) + return 0; + crc = crc32_le(le32_to_cpu(sbp->s_crc_seed), (unsigned char *)sbp, + sumoff); + crc = crc32_le(crc, sum, 4); + crc = crc32_le(crc, (unsigned char *)sbp + sumoff + 4, + bytes - sumoff - 4); + return crc == le32_to_cpu(sbp->s_sum); +} 
+ +static int nilfs_sb2_bad_offset(struct nilfs_super_block *sbp, u64 offset) +{ + return offset < ((le64_to_cpu(sbp->s_nsegments) * + le32_to_cpu(sbp->s_blocks_per_segment)) << + (le32_to_cpu(sbp->s_log_block_size) + 10)); +} + +static void nilfs_release_super_block(struct the_nilfs *nilfs) +{ + int i; + + for (i = 0; i < 2; i++) { + if (nilfs->ns_sbp[i]) { + brelse(nilfs->ns_sbh[i]); + nilfs->ns_sbh[i] = NULL; + nilfs->ns_sbp[i] = NULL; + } + } +} + +void nilfs_fall_back_super_block(struct the_nilfs *nilfs) +{ + brelse(nilfs->ns_sbh[0]); + nilfs->ns_sbh[0] = nilfs->ns_sbh[1]; + nilfs->ns_sbp[0] = nilfs->ns_sbp[1]; + nilfs->ns_sbh[1] = NULL; + nilfs->ns_sbp[1] = NULL; +} + +void nilfs_swap_super_block(struct the_nilfs *nilfs) +{ + struct buffer_head *tsbh = nilfs->ns_sbh[0]; + struct nilfs_super_block *tsbp = nilfs->ns_sbp[0]; + + nilfs->ns_sbh[0] = nilfs->ns_sbh[1]; + nilfs->ns_sbp[0] = nilfs->ns_sbp[1]; + nilfs->ns_sbh[1] = tsbh; + nilfs->ns_sbp[1] = tsbp; +} + +static int nilfs_load_super_block(struct the_nilfs *nilfs, + struct super_block *sb, int blocksize, + struct nilfs_super_block **sbpp) +{ + struct nilfs_super_block **sbp = nilfs->ns_sbp; + struct buffer_head **sbh = nilfs->ns_sbh; + u64 sb2off = NILFS_SB2_OFFSET_BYTES(nilfs->ns_bdev->bd_inode->i_size); + int valid[2], swp = 0; + + sbp[0] = nilfs_read_super_block(sb, NILFS_SB_OFFSET_BYTES, blocksize, + &sbh[0]); + sbp[1] = nilfs_read_super_block(sb, sb2off, blocksize, &sbh[1]); + + if (!sbp[0]) { + if (!sbp[1]) { + printk(KERN_ERR "NILFS: unable to read superblock\n"); + return -EIO; + } + printk(KERN_WARNING + "NILFS warning: unable to read primary superblock\n"); + } else if (!sbp[1]) + printk(KERN_WARNING + "NILFS warning: unable to read secondary superblock\n"); + + valid[0] = nilfs_valid_sb(sbp[0]); + valid[1] = nilfs_valid_sb(sbp[1]); + swp = valid[1] && + (!valid[0] || + le64_to_cpu(sbp[1]->s_wtime) > le64_to_cpu(sbp[0]->s_wtime)); + + if (valid[swp] && nilfs_sb2_bad_offset(sbp[swp], sb2off)) { + brelse(sbh[1]); + sbh[1] = NULL; + sbp[1] = NULL; + swp = 0; + } + if (!valid[swp]) { + nilfs_release_super_block(nilfs); + printk(KERN_ERR "NILFS: Can't find nilfs on dev %s.\n", + sb->s_id); + return -EINVAL; + } + + if (swp) { + printk(KERN_WARNING "NILFS warning: broken superblock. " + "using spare superblock.\n"); + nilfs_swap_super_block(nilfs); + } + + nilfs->ns_sbwtime[0] = le64_to_cpu(sbp[0]->s_wtime); + nilfs->ns_sbwtime[1] = valid[!swp] ? le64_to_cpu(sbp[1]->s_wtime) : 0; + nilfs->ns_prot_seq = le64_to_cpu(sbp[valid[1] & !swp]->s_last_seq); + *sbpp = sbp[0]; + return 0; +} + /** * init_nilfs - initialize a NILFS instance. 
* @nilfs: the_nilfs structure @@ -352,16 +471,15 @@ nilfs_store_disk_layout(struct the_nilfs *nilfs, struct super_block *sb, int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data) { struct super_block *sb = sbi->s_super; - struct buffer_head *sbh; struct nilfs_super_block *sbp; struct backing_dev_info *bdi; int blocksize; - int err = 0; + int err; down_write(&nilfs->ns_sem); if (nilfs_init(nilfs)) { /* Load values from existing the_nilfs */ - sbp = nilfs->ns_sbp; + sbp = nilfs->ns_sbp[0]; err = nilfs_store_magic_and_option(sb, sbp, data); if (err) goto out; @@ -377,36 +495,50 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data) goto out; } - sbp = nilfs_load_super_block(sb, &sbh); - if (!sbp) { + blocksize = sb_min_blocksize(sb, BLOCK_SIZE); + if (!blocksize) { + printk(KERN_ERR "NILFS: unable to set blocksize\n"); err = -EINVAL; goto out; } + err = nilfs_load_super_block(nilfs, sb, blocksize, &sbp); + if (err) + goto out; + err = nilfs_store_magic_and_option(sb, sbp, data); if (err) goto failed_sbh; blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size); if (sb->s_blocksize != blocksize) { - sbp = nilfs_reload_super_block(sb, &sbh, blocksize); - if (!sbp) { + int hw_blocksize = bdev_hardsect_size(sb->s_bdev); + + if (blocksize < hw_blocksize) { + printk(KERN_ERR + "NILFS: blocksize %d too small for device " + "(sector-size = %d).\n", + blocksize, hw_blocksize); err = -EINVAL; + goto failed_sbh; + } + nilfs_release_super_block(nilfs); + sb_set_blocksize(sb, blocksize); + + err = nilfs_load_super_block(nilfs, sb, blocksize, &sbp); + if (err) goto out; /* not failed_sbh; sbh is released automatically when reloading fails. */ - } } nilfs->ns_blocksize_bits = sb->s_blocksize_bits; - err = nilfs_store_disk_layout(nilfs, sb, sbp); + err = nilfs_store_disk_layout(nilfs, sbp); if (err) goto failed_sbh; sb->s_maxbytes = nilfs_max_size(sb->s_blocksize_bits); nilfs->ns_mount_state = le16_to_cpu(sbp->s_state); - nilfs->ns_sbh = sbh; - nilfs->ns_sbp = sbp; bdi = nilfs->ns_bdev->bd_inode_backing_dev_info; if (!bdi) @@ -443,7 +575,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data) return err; failed_sbh: - brelse(sbh); + nilfs_release_super_block(nilfs); goto out; } diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index d750e48257c9..30fe58778d05 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -49,8 +49,10 @@ enum { * @ns_sem: semaphore for shared states * @ns_writer_mutex: mutex protecting ns_writer attach/detach * @ns_writer_refcount: number of referrers on ns_writer - * @ns_sbh: buffer head of the on-disk super block - * @ns_sbp: pointer to the super block data + * @ns_sbh: buffer heads of on-disk super blocks + * @ns_sbp: pointers to super block data + * @ns_sbwtime: previous write time of super blocks + * @ns_sbsize: size of valid data in super block * @ns_supers: list of nilfs super block structs * @ns_seg_seq: segment sequence counter * @ns_segnum: index number of the latest full segment. @@ -101,8 +103,10 @@ struct the_nilfs { * - protecting s_dirt in the super_block struct * (see nilfs_write_super) and the following fields. 
*/ - struct buffer_head *ns_sbh; - struct nilfs_super_block *ns_sbp; + struct buffer_head *ns_sbh[2]; + struct nilfs_super_block *ns_sbp[2]; + time_t ns_sbwtime[2]; + unsigned ns_sbsize; unsigned ns_mount_state; struct list_head ns_supers; @@ -182,6 +186,10 @@ THE_NILFS_FNS(INIT, init) THE_NILFS_FNS(LOADED, loaded) THE_NILFS_FNS(DISCONTINUED, discontinued) +/* Minimum interval of periodical update of superblocks (in seconds) */ +#define NILFS_SB_FREQ 10 +#define NILFS_ALTSB_FREQ 60 /* spare superblock */ + void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64); struct the_nilfs *alloc_nilfs(struct block_device *); void put_nilfs(struct the_nilfs *); @@ -190,6 +198,8 @@ int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *); int nilfs_count_free_blocks(struct the_nilfs *, sector_t *); int nilfs_checkpoint_is_mounted(struct the_nilfs *, __u64, int); int nilfs_near_disk_full(struct the_nilfs *); +void nilfs_fall_back_super_block(struct the_nilfs *); +void nilfs_swap_super_block(struct the_nilfs *); static inline void get_nilfs(struct the_nilfs *nilfs) diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h index cbce6647f7fd..1275b3099535 100644 --- a/include/linux/nilfs2_fs.h +++ b/include/linux/nilfs2_fs.h @@ -252,6 +252,10 @@ struct nilfs_super_block { #define NILFS_MIN_NRSVSEGS 8 /* Minimum number of reserved segments */ +/* + * bytes offset of secondary super block + */ +#define NILFS_SB2_OFFSET_BYTES(devsize) ((((devsize) >> 12) - 1) << 12) /* * Maximal count of links to a file -- cgit v1.2.3 From 612392307cb09e49051225092cbbd7049bd8db93 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 6 Apr 2009 19:02:00 -0700 Subject: nilfs2: support nanosecond timestamp After a review of user's feedback for finding out other compatibility issues, I found nilfs improperly initializes timestamps in inode; CURRENT_TIME was used there instead of CURRENT_TIME_SEC even though nilfs didn't have nanosecond timestamps on disk. A few users gave us the report that the tar program sometimes failed to expand symbolic links on nilfs, and it turned out to be the cause. Instead of applying the above displacement, I've decided to support nanosecond timestamps on this occation. Fortunetaly, a needless 64-bit field was in the nilfs_inode struct, and I found it's available for this purpose without impact for the users. So, this will do the enhancement and resolve the tar problem. 
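An illustrative aside rather than part of the patch: the reuse is binary-compatible because the two new 32-bit nanosecond fields occupy exactly the slot of the removed 64-bit i_dtime, leaving every later field of struct nilfs_inode at its old on-disk offset. Assuming the field names from the nilfs2_fs.h hunk below, a compile-time check of that invariant could look like this:

        #include <stddef.h>
        #include <linux/types.h>
        #include <linux/nilfs2_fs.h>

        /* Fails to compile if i_ctime_nsec/i_mtime_nsec do not fill the old
         * i_dtime slot exactly, i.e. if any later on-disk field has moved. */
        typedef char nilfs_nsec_fields_fill_dtime_slot[
                (offsetof(struct nilfs_inode, i_uid) -
                 offsetof(struct nilfs_inode, i_ctime_nsec) == sizeof(__le64))
                ? 1 : -1];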
Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/gcinode.c | 1 - fs/nilfs2/inode.c | 13 ++++++------- fs/nilfs2/nilfs.h | 1 - fs/nilfs2/super.c | 1 + include/linux/nilfs2_fs.h | 10 ++++++---- 5 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c index 77615aabc7e2..19d2102b6a69 100644 --- a/fs/nilfs2/gcinode.c +++ b/fs/nilfs2/gcinode.c @@ -226,7 +226,6 @@ static struct inode *alloc_gcinode(struct the_nilfs *nilfs, ino_t ino, ii->i_flags = 0; ii->i_state = 1 << NILFS_I_GCINODE; ii->i_bh = NULL; - ii->i_dtime = 0; nilfs_bmap_init_gc(ii->i_bmap); return inode; diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index a1922b17662c..49ab4a49bb4f 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -306,7 +306,6 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode) /* ii->i_file_acl = 0; */ /* ii->i_dir_acl = 0; */ - ii->i_dtime = 0; ii->i_dir_start_lookup = 0; #ifdef CONFIG_NILFS_FS_POSIX_ACL ii->i_acl = NULL; @@ -390,11 +389,10 @@ int nilfs_read_inode_common(struct inode *inode, inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime); inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime); inode->i_mtime.tv_sec = le64_to_cpu(raw_inode->i_mtime); - inode->i_atime.tv_nsec = 0; - inode->i_ctime.tv_nsec = 0; - inode->i_mtime.tv_nsec = 0; - ii->i_dtime = le64_to_cpu(raw_inode->i_dtime); - if (inode->i_nlink == 0 && (inode->i_mode == 0 || ii->i_dtime)) + inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); + inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec); + inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); + if (inode->i_nlink == 0 && inode->i_mode == 0) return -EINVAL; /* this inode is deleted */ inode->i_blocks = le64_to_cpu(raw_inode->i_blocks); @@ -505,9 +503,10 @@ void nilfs_write_inode_common(struct inode *inode, raw_inode->i_size = cpu_to_le64(inode->i_size); raw_inode->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); raw_inode->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec); + raw_inode->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + raw_inode->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); raw_inode->i_blocks = cpu_to_le64(inode->i_blocks); - raw_inode->i_dtime = cpu_to_le64(ii->i_dtime); raw_inode->i_flags = cpu_to_le32(ii->i_flags); raw_inode->i_generation = cpu_to_le32(inode->i_generation); diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index 19af5ab86275..7558c977db02 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -48,7 +48,6 @@ struct nilfs_inode_info { struct nilfs_bmap *i_bmap; union nilfs_bmap_union i_bmap_union; __u64 i_xattr; /* sector_t ??? 
*/ - __u32 i_dtime; __u32 i_dir_start_lookup; __u64 i_cno; /* check point number for GC inode */ struct address_space i_btnode_cache; diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index e2ced824c624..e117e1ea9bff 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -792,6 +792,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent, sb->s_op = &nilfs_sops; sb->s_export_op = &nilfs_export_ops; sb->s_root = NULL; + sb->s_time_gran = 1; if (!nilfs_loaded(nilfs)) { err = load_nilfs(nilfs, sbi); diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h index 1275b3099535..79fec6af3f9f 100644 --- a/include/linux/nilfs2_fs.h +++ b/include/linux/nilfs2_fs.h @@ -67,9 +67,10 @@ * struct nilfs_inode - structure of an inode on disk * @i_blocks: blocks count * @i_size: size in bytes - * @i_ctime: creation time - * @i_mtime: modification time - * @i_dtime: deletion time + * @i_ctime: creation time (seconds) + * @i_mtime: modification time (seconds) + * @i_ctime_nsec: creation time (nano seconds) + * @i_mtime_nsec: modification time (nano seconds) * @i_uid: user id * @i_gid: group id * @i_mode: file mode @@ -85,7 +86,8 @@ struct nilfs_inode { __le64 i_size; __le64 i_ctime; __le64 i_mtime; - __le64 i_dtime; + __le32 i_ctime_nsec; + __le32 i_mtime_nsec; __le32 i_uid; __le32 i_gid; __le16 i_mode; -- cgit v1.2.3 From f786ddd285b4100909a013041d3eee1be9fac4db Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Tue, 7 Apr 2009 16:48:07 +0100 Subject: tty: Correct inline types for tty_driver_kref_get() tty_driver_kref_get() should be static inline and not extern inline (the latter even changed it's semantics in gcc >= 4.3). Signed-off-by: Adrian Bunk Signed-off-by: Alan Cox Signed-off-by: Linus Torvalds --- include/linux/tty_driver.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index 8615d661ab60..bcba84ea2d86 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -309,7 +309,8 @@ extern void tty_set_operations(struct tty_driver *driver, extern struct tty_driver *tty_find_polling_driver(char *name, int *line); extern void tty_driver_kref_put(struct tty_driver *driver); -extern inline struct tty_driver *tty_driver_kref_get(struct tty_driver *d) + +static inline struct tty_driver *tty_driver_kref_get(struct tty_driver *d) { kref_get(&d->kref); return d; -- cgit v1.2.3 From 1dcb884ca8048efb4ce2999d367c26369ab2227c Mon Sep 17 00:00:00 2001 From: Christian Pellegrin Date: Tue, 7 Apr 2009 16:48:51 +0100 Subject: Add support for the MAX3100 SPI UART. 
(akpm: queued pending confirmation of the new major number) [randy.dunlap@oracle.com: select SERIAL_CORE] Signed-off-by: Christian Pellegrin Signed-off-by: Andrew Morton Signed-off-by: Alan Cox Signed-off-by: Linus Torvalds --- drivers/serial/Kconfig | 7 +++++++ drivers/serial/Makefile | 1 + include/linux/serial_core.h | 3 +++ 3 files changed, 11 insertions(+) (limited to 'include/linux') diff --git a/drivers/serial/Kconfig b/drivers/serial/Kconfig index 07c03b9eb126..d89972beb12c 100644 --- a/drivers/serial/Kconfig +++ b/drivers/serial/Kconfig @@ -533,6 +533,13 @@ config SERIAL_S3C6400 Serial port support for the Samsung S3C6400 and S3C6410 SoCs +config SERIAL_MAX3100 + tristate "MAX3100 support" + depends on SPI + select SERIAL_CORE + help + MAX3100 chip support + config SERIAL_DZ bool "DECstation DZ serial driver" depends on MACH_DECSTATION && 32BIT diff --git a/drivers/serial/Makefile b/drivers/serial/Makefile index 8844c0a03929..d438eb2a73de 100644 --- a/drivers/serial/Makefile +++ b/drivers/serial/Makefile @@ -43,6 +43,7 @@ obj-$(CONFIG_SERIAL_S3C2412) += s3c2412.o obj-$(CONFIG_SERIAL_S3C2440) += s3c2440.o obj-$(CONFIG_SERIAL_S3C24A0) += s3c24a0.o obj-$(CONFIG_SERIAL_S3C6400) += s3c6400.o +obj-$(CONFIG_SERIAL_MAX3100) += max3100.o obj-$(CONFIG_SERIAL_IP22_ZILOG) += ip22zilog.o obj-$(CONFIG_SERIAL_MUX) += mux.o obj-$(CONFIG_SERIAL_68328) += 68328serial.o diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 83e4b3ff9cda..57a97e52e58d 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -164,6 +164,9 @@ /* NWPSERIAL */ #define PORT_NWPSERIAL 85 +/* MAX3100 */ +#define PORT_MAX3100 86 + #ifdef __KERNEL__ #include -- cgit v1.2.3 From 51dcdfec6a274afc1c6fce180d582add9ff512c0 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Tue, 7 Apr 2009 15:30:57 +0100 Subject: parport: Use the PCI IRQ if offered PCI parallel port devices can IRQ share so we should stop them hogging the line and making a mess on modern PC systems. We know the sharing side works as the PCMCIA driver has shared the parallel port IRQ for some time. 
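To illustrate the convention this change relies on (a minimal, self-contained sketch, not code from this patch; the demo_* names and the register layout are hypothetical): a driver that requests its line with IRQF_SHARED must be able to tell whether its own device raised the interrupt and return IRQ_NONE otherwise, which is why the PCI paths in this patch pass IRQF_SHARED while the legacy ISA probes keep passing 0.

#include <linux/types.h>
#include <linux/interrupt.h>
#include <linux/io.h>

/* Hypothetical per-port state; the real code hands a struct parport around. */
struct demo_port {
	void __iomem *regs;		/* hypothetical status register */
	int irq;
};

/* A shared handler must check that its own device asserted the line. */
static irqreturn_t demo_port_irq(int irq, void *dev_id)
{
	struct demo_port *port = dev_id;

	if (!(readl(port->regs) & 0x1))	/* hypothetical "IRQ pending" bit */
		return IRQ_NONE;	/* not ours, let the other sharers run */

	writel(0x1, port->regs);	/* hypothetical acknowledge */
	return IRQ_HANDLED;
}

/* PCI-discovered ports may share the line; legacy probes stay exclusive. */
static int demo_port_request_irq(struct demo_port *port, bool from_pci)
{
	unsigned long flags = from_pci ? IRQF_SHARED : 0;

	return request_irq(port->irq, demo_port_irq, flags, "demo_port", port);
}
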
Signed-off-by: Alan Cox Signed-off-by: Linus Torvalds --- drivers/parport/parport_cs.c | 3 ++- drivers/parport/parport_pc.c | 57 +++++++++++++++++++++++++--------------- drivers/parport/parport_serial.c | 20 +++++++++++--- include/linux/parport_pc.h | 11 ++++---- 4 files changed, 60 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/drivers/parport/parport_cs.c b/drivers/parport/parport_cs.c index 0cd5fbc7f2c2..8fdfa4f537a6 100644 --- a/drivers/parport/parport_cs.c +++ b/drivers/parport/parport_cs.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include @@ -192,7 +193,7 @@ static int parport_config(struct pcmcia_device *link) p = parport_pc_probe_port(link->io.BasePort1, link->io.BasePort2, link->irq.AssignedIRQ, PARPORT_DMA_NONE, - &link->dev); + &link->dev, IRQF_SHARED); if (p == NULL) { printk(KERN_NOTICE "parport_cs: parport_pc_probe_port() at " "0x%3x, irq %u failed\n", link->io.BasePort1, diff --git a/drivers/parport/parport_pc.c b/drivers/parport/parport_pc.c index 96f3bdf0ec4b..4e63cc9e2778 100644 --- a/drivers/parport/parport_pc.c +++ b/drivers/parport/parport_pc.c @@ -2170,10 +2170,11 @@ static int parport_dma_probe (struct parport *p) static LIST_HEAD(ports_list); static DEFINE_SPINLOCK(ports_lock); -struct parport *parport_pc_probe_port (unsigned long int base, - unsigned long int base_hi, - int irq, int dma, - struct device *dev) +struct parport *parport_pc_probe_port(unsigned long int base, + unsigned long int base_hi, + int irq, int dma, + struct device *dev, + int irqflags) { struct parport_pc_private *priv; struct parport_operations *ops; @@ -2194,11 +2195,11 @@ struct parport *parport_pc_probe_port (unsigned long int base, dev = &pdev->dev; } - ops = kmalloc(sizeof (struct parport_operations), GFP_KERNEL); + ops = kmalloc(sizeof(struct parport_operations), GFP_KERNEL); if (!ops) goto out1; - priv = kmalloc (sizeof (struct parport_pc_private), GFP_KERNEL); + priv = kmalloc(sizeof(struct parport_pc_private), GFP_KERNEL); if (!priv) goto out2; @@ -2325,8 +2326,8 @@ struct parport *parport_pc_probe_port (unsigned long int base, EPP_res = NULL; } if (p->irq != PARPORT_IRQ_NONE) { - if (request_irq (p->irq, parport_irq_handler, - 0, p->name, p)) { + if (request_irq(p->irq, parport_irq_handler, + irqflags, p->name, p)) { printk (KERN_WARNING "%s: irq %d in use, " "resorting to polled operation\n", p->name, p->irq); @@ -2530,7 +2531,7 @@ static int __devinit sio_ite_8872_probe (struct pci_dev *pdev, int autoirq, */ release_resource(base_res); if (parport_pc_probe_port (ite8872_lpt, ite8872_lpthi, - irq, PARPORT_DMA_NONE, &pdev->dev)) { + irq, PARPORT_DMA_NONE, &pdev->dev, 0)) { printk (KERN_INFO "parport_pc: ITE 8872 parallel port: io=0x%X", ite8872_lpt); @@ -2713,7 +2714,7 @@ static int __devinit sio_via_probe (struct pci_dev *pdev, int autoirq, } /* finally, do the probe with values obtained */ - if (parport_pc_probe_port (port1, port2, irq, dma, &pdev->dev)) { + if (parport_pc_probe_port (port1, port2, irq, dma, &pdev->dev, 0)) { printk (KERN_INFO "parport_pc: VIA parallel port: io=0x%X", port1); if (irq != PARPORT_IRQ_NONE) @@ -3018,6 +3019,7 @@ static int parport_pc_pci_probe (struct pci_dev *dev, for (n = 0; n < cards[i].numports; n++) { int lo = cards[i].addr[n].lo; int hi = cards[i].addr[n].hi; + int irq; unsigned long io_lo, io_hi; io_lo = pci_resource_start (dev, lo); io_hi = 0; @@ -3028,13 +3030,25 @@ static int parport_pc_pci_probe (struct pci_dev *dev, "hi" as an offset (see SYBA def.) 
*/ /* TODO: test if sharing interrupts works */ - printk (KERN_DEBUG "PCI parallel port detected: %04x:%04x, " - "I/O at %#lx(%#lx)\n", - parport_pc_pci_tbl[i + last_sio].vendor, - parport_pc_pci_tbl[i + last_sio].device, io_lo, io_hi); + irq = dev->irq; + if (irq == IRQ_NONE) { + printk (KERN_DEBUG + "PCI parallel port detected: %04x:%04x, I/O at %#lx(%#lx)\n", + parport_pc_pci_tbl[i + last_sio].vendor, + parport_pc_pci_tbl[i + last_sio].device, + io_lo, io_hi); + irq = PARPORT_IRQ_NONE; + } else { + printk (KERN_DEBUG + "PCI parallel port detected: %04x:%04x, I/O at %#lx(%#lx), IRQ %d\n", + parport_pc_pci_tbl[i + last_sio].vendor, + parport_pc_pci_tbl[i + last_sio].device, + io_lo, io_hi, irq); + } data->ports[count] = - parport_pc_probe_port (io_lo, io_hi, PARPORT_IRQ_NONE, - PARPORT_DMA_NONE, &dev->dev); + parport_pc_probe_port(io_lo, io_hi, irq, + PARPORT_DMA_NONE, &dev->dev, + IRQF_SHARED); if (data->ports[count]) count++; } @@ -3143,7 +3157,8 @@ static int parport_pc_pnp_probe(struct pnp_dev *dev, const struct pnp_device_id dma = PARPORT_DMA_NONE; dev_info(&dev->dev, "reported by %s\n", dev->protocol->name); - if (!(pdata = parport_pc_probe_port (io_lo, io_hi, irq, dma, &dev->dev))) + if (!(pdata = parport_pc_probe_port(io_lo, io_hi, + irq, dma, &dev->dev, 0))) return -ENODEV; pnp_set_drvdata(dev,pdata); @@ -3192,11 +3207,11 @@ parport_pc_find_isa_ports (int autoirq, int autodma) { int count = 0; - if (parport_pc_probe_port(0x3bc, 0x7bc, autoirq, autodma, NULL)) + if (parport_pc_probe_port(0x3bc, 0x7bc, autoirq, autodma, NULL, 0)) count++; - if (parport_pc_probe_port(0x378, 0x778, autoirq, autodma, NULL)) + if (parport_pc_probe_port(0x378, 0x778, autoirq, autodma, NULL, 0)) count++; - if (parport_pc_probe_port(0x278, 0x678, autoirq, autodma, NULL)) + if (parport_pc_probe_port(0x278, 0x678, autoirq, autodma, NULL, 0)) count++; return count; @@ -3481,7 +3496,7 @@ static int __init parport_pc_init(void) if ((io_hi[i]) == PARPORT_IOHI_AUTO) io_hi[i] = 0x400 + io[i]; parport_pc_probe_port(io[i], io_hi[i], - irqval[i], dmaval[i], NULL); + irqval[i], dmaval[i], NULL, 0); } } else parport_pc_find_ports (irqval[0], dmaval[0]); diff --git a/drivers/parport/parport_serial.c b/drivers/parport/parport_serial.c index f3492110b1ad..c3bb84ac931e 100644 --- a/drivers/parport/parport_serial.c +++ b/drivers/parport/parport_serial.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -311,6 +312,7 @@ static int __devinit parport_register (struct pci_dev *dev, int lo = card->addr[n].lo; int hi = card->addr[n].hi; unsigned long io_lo, io_hi; + int irq; if (priv->num_par == ARRAY_SIZE (priv->port)) { printk (KERN_WARNING @@ -329,10 +331,20 @@ static int __devinit parport_register (struct pci_dev *dev, "hi" as an offset (see SYBA def.) 
*/ /* TODO: test if sharing interrupts works */ - dev_dbg(&dev->dev, "PCI parallel port detected: I/O at " - "%#lx(%#lx)\n", io_lo, io_hi); - port = parport_pc_probe_port (io_lo, io_hi, PARPORT_IRQ_NONE, - PARPORT_DMA_NONE, &dev->dev); + irq = dev->irq; + if (irq == IRQ_NONE) { + dev_dbg(&dev->dev, + "PCI parallel port detected: I/O at %#lx(%#lx)\n", + io_lo, io_hi); + irq = PARPORT_IRQ_NONE; + } else { + dev_dbg(&dev->dev, + "PCI parallel port detected: I/O at %#lx(%#lx), IRQ %d\n", + io_lo, io_hi, irq); + irq = PARPORT_IRQ_NONE; + } + port = parport_pc_probe_port (io_lo, io_hi, irq, + PARPORT_DMA_NONE, &dev->dev, IRQF_SHARED); if (port) { priv->port[priv->num_par++] = port; success = 1; diff --git a/include/linux/parport_pc.h b/include/linux/parport_pc.h index ea8c6d84996d..cc1767f5cca8 100644 --- a/include/linux/parport_pc.h +++ b/include/linux/parport_pc.h @@ -228,10 +228,11 @@ extern void parport_pc_release_resources(struct parport *p); extern int parport_pc_claim_resources(struct parport *p); /* PCMCIA code will want to get us to look at a port. Provide a mechanism. */ -extern struct parport *parport_pc_probe_port (unsigned long base, - unsigned long base_hi, - int irq, int dma, - struct device *dev); -extern void parport_pc_unregister_port (struct parport *p); +extern struct parport *parport_pc_probe_port(unsigned long base, + unsigned long base_hi, + int irq, int dma, + struct device *dev, + int irqflags); +extern void parport_pc_unregister_port(struct parport *p); #endif -- cgit v1.2.3 From aeeae86859f4319de0a4946b44771d9926eeed54 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 7 Apr 2009 07:59:41 -0700 Subject: Fix build errors due to CONFIG_BRANCH_TRACER=y The code that enables branch tracing for all (non-constant) branches plays games with the preprocessor and #define's the C 'if ()' construct to do tracing. That's all fine, but it fails for some unusual but valid C code that is sometimes used in macros, notably by the intel-iommu code: if (i=drhd->iommu, drhd->ignored) .. because now the preprocessor complains about multiple arguments to the 'if' macro. So make the macro expansion of this particularly horrid trick use varargs, and handle the case of comma-expressions in if-statements. Use another macro to do it cleanly in just one place. This replaces a patch by David (and acked by Steven) that did this all inside that one already-too-horrid macro. Tested-by: Ingo Molnar Cc: David Woodhouse Cc: Steven Rostedt Signed-off-by: Linus Torvalds --- include/linux/compiler.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 6faa7e549de4..cebfdcd3dbdd 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -114,7 +114,9 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect); * "Define 'is'", Bill Clinton * "Define 'if'", Steven Rostedt */ -#define if(cond) if (__builtin_constant_p((cond)) ? !!(cond) : \ +#define if(cond, ...) __trace_if( (cond , ## __VA_ARGS__) ) +#define __trace_if(cond) \ + if (__builtin_constant_p((cond)) ? !!(cond) : \ ({ \ int ______r; \ static struct ftrace_branch_data \ -- cgit v1.2.3 From 8d82ffd15e59febf2c597067a777526958b7f769 Mon Sep 17 00:00:00 2001 From: Wolfgang Grandegger Date: Tue, 7 Apr 2009 10:20:56 +0200 Subject: powerpc: Document new FSL I2C bindings and cleanup This patch documents the new bindings for the MPC I2C bus driver. 
Furthermore, it removes obsolete FSL device related definitions for I2C. Signed-off-by: Wolfgang Grandegger Signed-off-by: Kumar Gala --- Documentation/powerpc/dts-bindings/fsl/i2c.txt | 46 +++++++++++++++++--------- include/linux/fsl_devices.h | 4 --- 2 files changed, 31 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/Documentation/powerpc/dts-bindings/fsl/i2c.txt b/Documentation/powerpc/dts-bindings/fsl/i2c.txt index d0ab33e21fe6..b6d2e21474f9 100644 --- a/Documentation/powerpc/dts-bindings/fsl/i2c.txt +++ b/Documentation/powerpc/dts-bindings/fsl/i2c.txt @@ -7,8 +7,10 @@ Required properties : Recommended properties : - - compatible : Should be "fsl-i2c" for parts compatible with - Freescale I2C specifications. + - compatible : compatibility list with 2 entries, the first should + be "fsl,CHIP-i2c" where CHIP is the name of a compatible processor, + e.g. mpc8313, mpc8543, mpc8544, mpc5200 or mpc5200b. The second one + should be "fsl-i2c". - interrupts : where a is the interrupt number and b is a field that represents an encoding of the sense and level information for the interrupt. This should be encoded based on @@ -16,17 +18,31 @@ Recommended properties : controller you have. - interrupt-parent : the phandle for the interrupt controller that services interrupts for this device. - - dfsrr : boolean; if defined, indicates that this I2C device has - a digital filter sampling rate register - - fsl5200-clocking : boolean; if defined, indicated that this device - uses the FSL 5200 clocking mechanism. - -Example : - i2c@3000 { - interrupt-parent = <40000>; - interrupts = <1b 3>; - reg = <3000 18>; - device_type = "i2c"; - compatible = "fsl-i2c"; - dfsrr; + - fsl,preserve-clocking : boolean; if defined, the clock settings + from the bootloader are preserved (not touched). + - clock-frequency : desired I2C bus clock frequency in Hz. 
+ +Examples : + + i2c@3d00 { + #address-cells = <1>; + #size-cells = <0>; + compatible = "fsl,mpc5200b-i2c","fsl,mpc5200-i2c","fsl-i2c"; + cell-index = <0>; + reg = <0x3d00 0x40>; + interrupts = <2 15 0>; + interrupt-parent = <&mpc5200_pic>; + fsl,preserve-clocking; }; + + i2c@3100 { + #address-cells = <1>; + #size-cells = <0>; + cell-index = <1>; + compatible = "fsl,mpc8544-i2c", "fsl-i2c"; + reg = <0x3100 0x100>; + interrupts = <43 2>; + interrupt-parent = <&mpic>; + clock-frequency = <400000>; + }; + diff --git a/include/linux/fsl_devices.h b/include/linux/fsl_devices.h index f2a78b5e8b55..43fc95d822d5 100644 --- a/include/linux/fsl_devices.h +++ b/include/linux/fsl_devices.h @@ -43,10 +43,6 @@ * */ -/* Flags related to I2C device features */ -#define FSL_I2C_DEV_SEPARATE_DFSRR 0x00000001 -#define FSL_I2C_DEV_CLOCK_5200 0x00000002 - enum fsl_usb2_operating_modes { FSL_USB2_MPH_HOST, FSL_USB2_DR_HOST, -- cgit v1.2.3 From f876d346e3807647b1de411de6a86c44821896ca Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 8 Apr 2009 14:05:43 +0900 Subject: tracing: append a comma to INIT_FTRACE_GRAPH Impact: dont break future extensions of INIT_TASK While not a problem right now, due to lack of a comma, build fails if elements are appended to INIT_TASK() macro in development code: arch/x86/kernel/init_task.c:33: error: request for member `XXXXXXXXXX' in something not a structure or union arch/x86/kernel/init_task.c:33: error: initializer element is not constant arch/x86/kernel/init_task.c:33: error: (near initialization for `init_task.ret_stack') make[1]: *** [arch/x86/kernel/init_task.o] Error 1 make: *** [arch/x86/kernel] Error 2 Signed-off-by: Tetsuo Handa Cc: srostedt@redhat.com LKML-Reference: <200904080505.n3855hcn017109@www262.sakura.ne.jp> Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index da5405dce347..ff112a872d75 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -357,7 +357,7 @@ struct ftrace_graph_ret { #ifdef CONFIG_FUNCTION_GRAPH_TRACER /* for init task */ -#define INIT_FTRACE_GRAPH .ret_stack = NULL +#define INIT_FTRACE_GRAPH .ret_stack = NULL, /* * Stack of return addresses for functions -- cgit v1.2.3 From 60f85019c6c8c1aebf3485a313e0da094bc95d07 Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Wed, 8 Apr 2009 14:13:01 +0200 Subject: ide: replace IDE_TFLAG_* flags by IDE_VALID_* Replace IDE_TFLAG_{IN|OUT}_* flags meaning to the taskfile register validity on input/output by the IDE_VALID_* flags and introduce 4 symmetric 8-bit register validity indicator subfields, 'valid.{input/output}.{tf|hob}', into the 'struct ide_cmd' instead of using the 'tf_flags' field for that purpose (this field can then be turned from 32-bit into 8-bit one). 
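The shape of the change, shown in isolation (a stand-alone mock under hypothetical mock_* names, not the kernel structures themselves): callers now set one 8-bit validity mask per direction and per register bank instead of OR-ing IDE_TFLAG_{IN,OUT}_* bits into a single 32-bit field, and the register-access path tests only the mask for the direction and bank it is handling.

#include <stdint.h>
#include <stdio.h>

/* Per-register validity bits, following the IDE_VALID_* layout in the patch. */
enum {
	VALID_FEATURE = 1 << 1,
	VALID_NSECT   = 1 << 2,
	VALID_LBAL    = 1 << 3,
	VALID_LBAM    = 1 << 4,
	VALID_LBAH    = 1 << 5,
	VALID_DEVICE  = 1 << 6,
};

struct mock_taskfile {
	uint8_t feature, nsect, lbal, lbam, lbah, device, command;
};

/* Mirrors cmd->valid.{out,in}.{tf,hob}: one mask per direction and bank. */
struct mock_cmd {
	struct mock_taskfile tf;
	struct {
		struct { uint8_t tf, hob; } out, in;
	} valid;
};

/* Output path: touch only the registers the caller marked valid. */
static void mock_tf_load(const struct mock_cmd *cmd)
{
	uint8_t valid = cmd->valid.out.tf;

	if (valid & VALID_FEATURE)
		printf("feature <- 0x%02x\n", (unsigned int)cmd->tf.feature);
	if (valid & VALID_NSECT)
		printf("nsect   <- 0x%02x\n", (unsigned int)cmd->tf.nsect);
	if (valid & VALID_DEVICE)
		printf("device  <- 0x%02x\n", (unsigned int)cmd->tf.device);
}

int main(void)
{
	struct mock_cmd cmd = { 0 };

	/* Caller side, as in the converted drivers: a SET FEATURES-style
	 * command that writes feature/nsect and reads nsect back. */
	cmd.tf.feature = 0x03;			/* hypothetical subcommand */
	cmd.tf.nsect   = 0x22;
	cmd.valid.out.tf = VALID_FEATURE | VALID_NSECT;
	cmd.valid.in.tf  = VALID_NSECT;

	mock_tf_load(&cmd);
	return 0;
}
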
Signed-off-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-acpi.c | 3 +- drivers/ide/ide-atapi.c | 21 +++++----- drivers/ide/ide-disk.c | 40 ++++++++++++------- drivers/ide/ide-disk_proc.c | 6 ++- drivers/ide/ide-io-std.c | 50 +++++++++++++----------- drivers/ide/ide-io.c | 5 ++- drivers/ide/ide-ioctls.c | 10 +++-- drivers/ide/ide-iops.c | 4 +- drivers/ide/ide-lib.c | 11 +++--- drivers/ide/ide-park.c | 3 +- drivers/ide/ide-pm.c | 3 +- drivers/ide/ide-probe.c | 4 +- drivers/ide/ide-proc.c | 4 +- drivers/ide/ide-taskfile.c | 36 +++++++++-------- drivers/ide/ns87415.c | 25 ++++++------ drivers/ide/scc_pata.c | 50 +++++++++++++----------- drivers/ide/tx4939ide.c | 2 +- include/linux/ide.h | 95 ++++++++++++++++++--------------------------- 18 files changed, 197 insertions(+), 175 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-acpi.c b/drivers/ide/ide-acpi.c index 12f436951bff..f0db4d349c60 100644 --- a/drivers/ide/ide-acpi.c +++ b/drivers/ide/ide-acpi.c @@ -319,7 +319,8 @@ static int do_drive_set_taskfiles(ide_drive_t *drive, /* convert GTF to taskfile */ memset(&cmd, 0, sizeof(cmd)); memcpy(&cmd.tf_array[7], gtf, REGS_PER_GTF); - cmd.tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE; + cmd.valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE; + cmd.valid.in.tf = IDE_VALID_IN_TF | IDE_VALID_DEVICE; err = ide_no_data_taskfile(drive, &cmd); if (err) { diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 3e43b889dd64..a359323d8ffe 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -257,8 +257,7 @@ void ide_read_bcount_and_ireason(ide_drive_t *drive, u16 *bcount, u8 *ireason) struct ide_cmd cmd; memset(&cmd, 0, sizeof(cmd)); - cmd.tf_flags = IDE_TFLAG_IN_LBAH | IDE_TFLAG_IN_LBAM | - IDE_TFLAG_IN_NSECT; + cmd.valid.in.tf = IDE_VALID_LBAH | IDE_VALID_LBAM | IDE_VALID_NSECT; drive->hwif->tp_ops->tf_read(drive, &cmd); @@ -439,12 +438,12 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) return ide_started; } -static void ide_init_packet_cmd(struct ide_cmd *cmd, u32 tf_flags, +static void ide_init_packet_cmd(struct ide_cmd *cmd, u8 valid_tf, u16 bcount, u8 dma) { - cmd->protocol = dma ? ATAPI_PROT_DMA : ATAPI_PROT_PIO; - cmd->tf_flags |= IDE_TFLAG_OUT_LBAH | IDE_TFLAG_OUT_LBAM | - IDE_TFLAG_OUT_FEATURE | tf_flags; + cmd->protocol = dma ? ATAPI_PROT_DMA : ATAPI_PROT_PIO; + cmd->valid.out.tf = IDE_VALID_LBAH | IDE_VALID_LBAM | + IDE_VALID_FEATURE | valid_tf; cmd->tf.command = ATA_CMD_PACKET; cmd->tf.feature = dma; /* Use PIO/DMA */ cmd->tf.lbam = bcount & 0xff; @@ -456,7 +455,7 @@ static u8 ide_read_ireason(ide_drive_t *drive) struct ide_cmd cmd; memset(&cmd, 0, sizeof(cmd)); - cmd.tf_flags = IDE_TFLAG_IN_NSECT; + cmd.valid.in.tf = IDE_VALID_NSECT; drive->hwif->tp_ops->tf_read(drive, &cmd); @@ -588,12 +587,12 @@ ide_startstop_t ide_issue_pc(ide_drive_t *drive, struct ide_cmd *cmd) ide_expiry_t *expiry = NULL; struct request *rq = hwif->rq; unsigned int timeout; - u32 tf_flags; u16 bcount; + u8 valid_tf; u8 drq_int = !!(drive->atapi_flags & IDE_AFLAG_DRQ_INTERRUPT); if (dev_is_idecd(drive)) { - tf_flags = IDE_TFLAG_OUT_NSECT | IDE_TFLAG_OUT_LBAL; + valid_tf = IDE_VALID_NSECT | IDE_VALID_LBAL; bcount = ide_cd_get_xferlen(rq); expiry = ide_cd_expiry; timeout = ATAPI_WAIT_PC; @@ -607,7 +606,7 @@ ide_startstop_t ide_issue_pc(ide_drive_t *drive, struct ide_cmd *cmd) pc->xferred = 0; pc->cur_pos = pc->buf; - tf_flags = IDE_TFLAG_OUT_DEVICE; + valid_tf = IDE_VALID_DEVICE; bcount = ((drive->media == ide_tape) ? 
pc->req_xfer : min(pc->req_xfer, 63 * 1024)); @@ -627,7 +626,7 @@ ide_startstop_t ide_issue_pc(ide_drive_t *drive, struct ide_cmd *cmd) : WAIT_TAPE_CMD; } - ide_init_packet_cmd(cmd, tf_flags, bcount, drive->dma); + ide_init_packet_cmd(cmd, valid_tf, bcount, drive->dma); (void)do_rw_taskfile(drive, cmd); diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c index c998cf8e971a..235263e51dd9 100644 --- a/drivers/ide/ide-disk.c +++ b/drivers/ide/ide-disk.c @@ -97,7 +97,8 @@ static ide_startstop_t __ide_do_rw_disk(ide_drive_t *drive, struct request *rq, } memset(&cmd, 0, sizeof(cmd)); - cmd.tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE; + cmd.valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE; + cmd.valid.in.tf = IDE_VALID_IN_TF | IDE_VALID_DEVICE; if (drive->dev_flags & IDE_DFLAG_LBA) { if (lba48) { @@ -116,7 +117,9 @@ static ide_startstop_t __ide_do_rw_disk(ide_drive_t *drive, struct request *rq, tf->lbam = (u8)(block >> 8); tf->lbah = (u8)(block >> 16); - cmd.tf_flags |= (IDE_TFLAG_LBA48 | IDE_TFLAG_HOB); + cmd.valid.out.hob = IDE_VALID_OUT_HOB; + cmd.valid.in.hob = IDE_VALID_IN_HOB; + cmd.tf_flags |= IDE_TFLAG_LBA48; } else { tf->nsect = nsectors & 0xff; tf->lbal = block; @@ -220,9 +223,13 @@ static u64 idedisk_read_native_max_address(ide_drive_t *drive, int lba48) tf->command = ATA_CMD_READ_NATIVE_MAX; tf->device = ATA_LBA; - cmd.tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE; - if (lba48) - cmd.tf_flags |= (IDE_TFLAG_LBA48 | IDE_TFLAG_HOB); + cmd.valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE; + cmd.valid.in.tf = IDE_VALID_IN_TF | IDE_VALID_DEVICE; + if (lba48) { + cmd.valid.out.hob = IDE_VALID_OUT_HOB; + cmd.valid.in.hob = IDE_VALID_IN_HOB; + cmd.tf_flags = IDE_TFLAG_LBA48; + } ide_no_data_taskfile(drive, &cmd); @@ -260,9 +267,13 @@ static u64 idedisk_set_max_address(ide_drive_t *drive, u64 addr_req, int lba48) } tf->device |= ATA_LBA; - cmd.tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE; - if (lba48) - cmd.tf_flags |= (IDE_TFLAG_LBA48 | IDE_TFLAG_HOB); + cmd.valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE; + cmd.valid.in.tf = IDE_VALID_IN_TF | IDE_VALID_DEVICE; + if (lba48) { + cmd.valid.out.hob = IDE_VALID_OUT_HOB; + cmd.valid.in.hob = IDE_VALID_IN_HOB; + cmd.tf_flags = IDE_TFLAG_LBA48; + } ide_no_data_taskfile(drive, &cmd); @@ -395,8 +406,8 @@ static void idedisk_prepare_flush(struct request_queue *q, struct request *rq) cmd->tf.command = ATA_CMD_FLUSH_EXT; else cmd->tf.command = ATA_CMD_FLUSH; - cmd->tf_flags = IDE_TFLAG_OUT_TF | IDE_TFLAG_OUT_DEVICE | - IDE_TFLAG_DYN; + cmd->valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE; + cmd->tf_flags = IDE_TFLAG_DYN; cmd->protocol = ATA_PROT_NODATA; rq->cmd_type = REQ_TYPE_ATA_TASKFILE; @@ -457,7 +468,8 @@ static int ide_do_setfeature(ide_drive_t *drive, u8 feature, u8 nsect) cmd.tf.feature = feature; cmd.tf.nsect = nsect; cmd.tf.command = ATA_CMD_SET_FEATURES; - cmd.tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE; + cmd.valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE; + cmd.valid.in.tf = IDE_VALID_IN_TF | IDE_VALID_DEVICE; return ide_no_data_taskfile(drive, &cmd); } @@ -533,7 +545,8 @@ static int do_idedisk_flushcache(ide_drive_t *drive) cmd.tf.command = ATA_CMD_FLUSH_EXT; else cmd.tf.command = ATA_CMD_FLUSH; - cmd.tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE; + cmd.valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE; + cmd.valid.in.tf = IDE_VALID_IN_TF | IDE_VALID_DEVICE; return ide_no_data_taskfile(drive, &cmd); } @@ -715,7 +728,8 @@ static int ide_disk_set_doorlock(ide_drive_t *drive, struct gendisk *disk, memset(&cmd, 0, sizeof(cmd)); 
cmd.tf.command = on ? ATA_CMD_MEDIA_LOCK : ATA_CMD_MEDIA_UNLOCK; - cmd.tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE; + cmd.valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE; + cmd.valid.in.tf = IDE_VALID_IN_TF | IDE_VALID_DEVICE; ret = ide_no_data_taskfile(drive, &cmd); diff --git a/drivers/ide/ide-disk_proc.c b/drivers/ide/ide-disk_proc.c index eaea3bef2073..19f263bf0a9e 100644 --- a/drivers/ide/ide-disk_proc.c +++ b/drivers/ide/ide-disk_proc.c @@ -13,7 +13,8 @@ static int smart_enable(ide_drive_t *drive) tf->lbam = ATA_SMART_LBAM_PASS; tf->lbah = ATA_SMART_LBAH_PASS; tf->command = ATA_CMD_SMART; - cmd.tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE; + cmd.valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE; + cmd.valid.in.tf = IDE_VALID_IN_TF | IDE_VALID_DEVICE; return ide_no_data_taskfile(drive, &cmd); } @@ -29,7 +30,8 @@ static int get_smart_data(ide_drive_t *drive, u8 *buf, u8 sub_cmd) tf->lbam = ATA_SMART_LBAM_PASS; tf->lbah = ATA_SMART_LBAH_PASS; tf->command = ATA_CMD_SMART; - cmd.tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE; + cmd.valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE; + cmd.valid.in.tf = IDE_VALID_IN_TF | IDE_VALID_DEVICE; cmd.protocol = ATA_PROT_PIO; return ide_raw_taskfile(drive, &cmd, buf, 1); diff --git a/drivers/ide/ide-io-std.c b/drivers/ide/ide-io-std.c index 9cac281d82c4..8b0b2e9ccf5b 100644 --- a/drivers/ide/ide-io-std.c +++ b/drivers/ide/ide-io-std.c @@ -91,6 +91,7 @@ void ide_tf_load(ide_drive_t *drive, struct ide_cmd *cmd) struct ide_io_ports *io_ports = &hwif->io_ports; struct ide_taskfile *tf = &cmd->tf; void (*tf_outb)(u8 addr, unsigned long port); + u8 valid = cmd->valid.out.hob; u8 mmio = (hwif->host_flags & IDE_HFLAG_MMIO) ? 1 : 0; u8 HIHI = (cmd->tf_flags & IDE_TFLAG_LBA48) ? 0xE0 : 0xEF; @@ -102,29 +103,31 @@ void ide_tf_load(ide_drive_t *drive, struct ide_cmd *cmd) if (cmd->ftf_flags & IDE_FTFLAG_FLAGGED) HIHI = 0xFF; - if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_FEATURE) + if (valid & IDE_VALID_FEATURE) tf_outb(tf->hob_feature, io_ports->feature_addr); - if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_NSECT) + if (valid & IDE_VALID_NSECT) tf_outb(tf->hob_nsect, io_ports->nsect_addr); - if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_LBAL) + if (valid & IDE_VALID_LBAL) tf_outb(tf->hob_lbal, io_ports->lbal_addr); - if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_LBAM) + if (valid & IDE_VALID_LBAM) tf_outb(tf->hob_lbam, io_ports->lbam_addr); - if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_LBAH) + if (valid & IDE_VALID_LBAH) tf_outb(tf->hob_lbah, io_ports->lbah_addr); - if (cmd->tf_flags & IDE_TFLAG_OUT_FEATURE) + valid = cmd->valid.out.tf; + + if (valid & IDE_VALID_FEATURE) tf_outb(tf->feature, io_ports->feature_addr); - if (cmd->tf_flags & IDE_TFLAG_OUT_NSECT) + if (valid & IDE_VALID_NSECT) tf_outb(tf->nsect, io_ports->nsect_addr); - if (cmd->tf_flags & IDE_TFLAG_OUT_LBAL) + if (valid & IDE_VALID_LBAL) tf_outb(tf->lbal, io_ports->lbal_addr); - if (cmd->tf_flags & IDE_TFLAG_OUT_LBAM) + if (valid & IDE_VALID_LBAM) tf_outb(tf->lbam, io_ports->lbam_addr); - if (cmd->tf_flags & IDE_TFLAG_OUT_LBAH) + if (valid & IDE_VALID_LBAH) tf_outb(tf->lbah, io_ports->lbah_addr); - if (cmd->tf_flags & IDE_TFLAG_OUT_DEVICE) + if (valid & IDE_VALID_DEVICE) tf_outb((tf->device & HIHI) | drive->select, io_ports->device_addr); } @@ -137,6 +140,7 @@ void ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) struct ide_taskfile *tf = &cmd->tf; void (*tf_outb)(u8 addr, unsigned long port); u8 (*tf_inb)(unsigned long port); + u8 valid = cmd->valid.in.tf; u8 mmio = (hwif->host_flags & IDE_HFLAG_MMIO) ? 
1 : 0; if (mmio) { @@ -150,31 +154,33 @@ void ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) /* be sure we're looking at the low order bits */ tf_outb(ATA_DEVCTL_OBS, io_ports->ctl_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_ERROR) + if (valid & IDE_VALID_ERROR) tf->error = tf_inb(io_ports->feature_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_NSECT) + if (valid & IDE_VALID_NSECT) tf->nsect = tf_inb(io_ports->nsect_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_LBAL) + if (valid & IDE_VALID_LBAL) tf->lbal = tf_inb(io_ports->lbal_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_LBAM) + if (valid & IDE_VALID_LBAM) tf->lbam = tf_inb(io_ports->lbam_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_LBAH) + if (valid & IDE_VALID_LBAH) tf->lbah = tf_inb(io_ports->lbah_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_DEVICE) + if (valid & IDE_VALID_DEVICE) tf->device = tf_inb(io_ports->device_addr); if (cmd->tf_flags & IDE_TFLAG_LBA48) { tf_outb(ATA_HOB | ATA_DEVCTL_OBS, io_ports->ctl_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_HOB_ERROR) + valid = cmd->valid.in.hob; + + if (valid & IDE_VALID_ERROR) tf->hob_error = tf_inb(io_ports->feature_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_HOB_NSECT) + if (valid & IDE_VALID_NSECT) tf->hob_nsect = tf_inb(io_ports->nsect_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAL) + if (valid & IDE_VALID_LBAL) tf->hob_lbal = tf_inb(io_ports->lbal_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAM) + if (valid & IDE_VALID_LBAM) tf->hob_lbam = tf_inb(io_ports->lbam_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAH) + if (valid & IDE_VALID_LBAH) tf->hob_lbah = tf_inb(io_ports->lbah_addr); } } diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 1deb6d29b186..99bb0a9a67e8 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -205,8 +205,9 @@ static ide_startstop_t ide_disk_special(ide_drive_t *drive) return ide_stopped; } - cmd.tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE | - IDE_TFLAG_CUSTOM_HANDLER; + cmd.valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE; + cmd.valid.in.tf = IDE_VALID_IN_TF | IDE_VALID_DEVICE; + cmd.tf_flags = IDE_TFLAG_CUSTOM_HANDLER; do_rw_taskfile(drive, &cmd); diff --git a/drivers/ide/ide-ioctls.c b/drivers/ide/ide-ioctls.c index 770142767437..b11df4b7998e 100644 --- a/drivers/ide/ide-ioctls.c +++ b/drivers/ide/ide-ioctls.c @@ -141,11 +141,12 @@ static int ide_cmd_ioctl(ide_drive_t *drive, unsigned long arg) tf->lbal = args[1]; tf->lbam = 0x4f; tf->lbah = 0xc2; - cmd.tf_flags = IDE_TFLAG_OUT_TF | IDE_TFLAG_IN_NSECT; + cmd.valid.out.tf = IDE_VALID_OUT_TF; + cmd.valid.in.tf = IDE_VALID_NSECT; } else { tf->nsect = args[1]; - cmd.tf_flags = IDE_TFLAG_OUT_FEATURE | IDE_TFLAG_OUT_NSECT | - IDE_TFLAG_IN_NSECT; + cmd.valid.out.tf = IDE_VALID_FEATURE | IDE_VALID_NSECT; + cmd.valid.in.tf = IDE_VALID_NSECT; } tf->command = args[0]; cmd.protocol = args[3] ? 
ATA_PROT_PIO : ATA_PROT_NODATA; @@ -207,7 +208,8 @@ static int ide_task_ioctl(ide_drive_t *drive, unsigned long arg) memset(&cmd, 0, sizeof(cmd)); memcpy(&cmd.tf_array[7], &args[1], 6); cmd.tf.command = args[0]; - cmd.tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE; + cmd.valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE; + cmd.valid.in.tf = IDE_VALID_IN_TF | IDE_VALID_DEVICE; err = ide_no_data_taskfile(drive, &cmd); diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c index 27bb70ddd459..0fdf0dfb5743 100644 --- a/drivers/ide/ide-iops.c +++ b/drivers/ide/ide-iops.c @@ -40,7 +40,7 @@ u8 ide_read_error(ide_drive_t *drive) struct ide_cmd cmd; memset(&cmd, 0, sizeof(cmd)); - cmd.tf_flags = IDE_TFLAG_IN_ERROR; + cmd.valid.in.tf = IDE_VALID_ERROR; drive->hwif->tp_ops->tf_read(drive, &cmd); @@ -348,7 +348,7 @@ int ide_config_drive_speed(ide_drive_t *drive, u8 speed) tp_ops->write_devctl(hwif, ATA_NIEN | ATA_DEVCTL_OBS); memset(&cmd, 0, sizeof(cmd)); - cmd.tf_flags = IDE_TFLAG_OUT_FEATURE | IDE_TFLAG_OUT_NSECT; + cmd.valid.out.tf = IDE_VALID_FEATURE | IDE_VALID_NSECT; cmd.tf.feature = SETFEATURES_XFER; cmd.tf.nsect = speed; diff --git a/drivers/ide/ide-lib.c b/drivers/ide/ide-lib.c index 217b7fdf2b17..c9ef77c5d62e 100644 --- a/drivers/ide/ide-lib.c +++ b/drivers/ide/ide-lib.c @@ -71,11 +71,12 @@ static void ide_dump_sector(ide_drive_t *drive) u8 lba48 = !!(drive->dev_flags & IDE_DFLAG_LBA48); memset(&cmd, 0, sizeof(cmd)); - if (lba48) - cmd.tf_flags = IDE_TFLAG_IN_LBA | IDE_TFLAG_IN_HOB_LBA | - IDE_TFLAG_LBA48; - else - cmd.tf_flags = IDE_TFLAG_IN_LBA | IDE_TFLAG_IN_DEVICE; + if (lba48) { + cmd.valid.in.tf = IDE_VALID_LBA; + cmd.valid.in.hob = IDE_VALID_LBA; + cmd.tf_flags = IDE_TFLAG_LBA48; + } else + cmd.valid.in.tf = IDE_VALID_LBA | IDE_VALID_DEVICE; drive->hwif->tp_ops->tf_read(drive, &cmd); diff --git a/drivers/ide/ide-park.c b/drivers/ide/ide-park.c index 9490b446519f..310d03f2b5b7 100644 --- a/drivers/ide/ide-park.c +++ b/drivers/ide/ide-park.c @@ -74,7 +74,8 @@ ide_startstop_t ide_do_park_unpark(ide_drive_t *drive, struct request *rq) tf->lbal = 0x4c; tf->lbam = 0x4e; tf->lbah = 0x55; - cmd.tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE; + cmd.valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE; + cmd.valid.in.tf = IDE_VALID_IN_TF | IDE_VALID_DEVICE; } else /* cmd == REQ_UNPARK_HEADS */ tf->command = ATA_CMD_CHK_POWER; diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c index bb7858ebb7d1..0d8a151c0a01 100644 --- a/drivers/ide/ide-pm.c +++ b/drivers/ide/ide-pm.c @@ -163,7 +163,8 @@ ide_startstop_t ide_start_power_step(ide_drive_t *drive, struct request *rq) return ide_stopped; out_do_tf: - cmd->tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE; + cmd->valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE; + cmd->valid.in.tf = IDE_VALID_IN_TF | IDE_VALID_DEVICE; cmd->protocol = ATA_PROT_NODATA; return do_rw_taskfile(drive, cmd); diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index c1ef8c8c785e..6a98d7c1681a 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -287,7 +287,7 @@ int ide_dev_read_id(ide_drive_t *drive, u8 cmd, u16 *id) memset(&cmd, 0, sizeof(cmd)); /* disable DMA & overlap */ - cmd.tf_flags = IDE_TFLAG_OUT_FEATURE; + cmd.valid.out.tf = IDE_VALID_FEATURE; tp_ops->tf_load(drive, &cmd); } @@ -340,7 +340,7 @@ static u8 ide_read_device(ide_drive_t *drive) struct ide_cmd cmd; memset(&cmd, 0, sizeof(cmd)); - cmd.tf_flags = IDE_TFLAG_IN_DEVICE; + cmd.valid.in.tf = IDE_VALID_DEVICE; drive->hwif->tp_ops->tf_read(drive, &cmd); diff --git a/drivers/ide/ide-proc.c 
b/drivers/ide/ide-proc.c index 10a88bf3eefa..3242698832a4 100644 --- a/drivers/ide/ide-proc.c +++ b/drivers/ide/ide-proc.c @@ -204,8 +204,8 @@ static int set_xfer_rate (ide_drive_t *drive, int arg) cmd.tf.command = ATA_CMD_SET_FEATURES; cmd.tf.feature = SETFEATURES_XFER; cmd.tf.nsect = (u8)arg; - cmd.tf_flags = IDE_TFLAG_OUT_FEATURE | IDE_TFLAG_OUT_NSECT | - IDE_TFLAG_IN_NSECT; + cmd.valid.out.tf = IDE_VALID_FEATURE | IDE_VALID_NSECT; + cmd.valid.in.tf = IDE_VALID_NSECT; err = ide_no_data_taskfile(drive, &cmd); diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index 243421ce40d0..dc84f8bde52a 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -47,7 +47,8 @@ int taskfile_lib_get_identify (ide_drive_t *drive, u8 *buf) cmd.tf.command = ATA_CMD_ID_ATA; else cmd.tf.command = ATA_CMD_ID_ATAPI; - cmd.tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE; + cmd.valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE; + cmd.valid.in.tf = IDE_VALID_IN_TF | IDE_VALID_DEVICE; cmd.protocol = ATA_PROT_PIO; return ide_raw_taskfile(drive, &cmd, buf, 1); @@ -494,11 +495,14 @@ int ide_taskfile_ioctl(ide_drive_t *drive, unsigned long arg) memcpy(&cmd.tf_array[6], req_task->io_ports, HDIO_DRIVE_TASK_HDR_SIZE); - cmd.tf_flags = IDE_TFLAG_IO_16BIT | IDE_TFLAG_DEVICE | - IDE_TFLAG_IN_TF; + cmd.valid.out.tf = IDE_VALID_DEVICE; + cmd.valid.in.tf = IDE_VALID_DEVICE | IDE_VALID_IN_TF; + cmd.tf_flags = IDE_TFLAG_IO_16BIT; - if (drive->dev_flags & IDE_DFLAG_LBA48) - cmd.tf_flags |= (IDE_TFLAG_LBA48 | IDE_TFLAG_IN_HOB); + if (drive->dev_flags & IDE_DFLAG_LBA48) { + cmd.tf_flags |= IDE_TFLAG_LBA48; + cmd.valid.in.hob = IDE_VALID_IN_HOB; + } if (req_task->out_flags.all) { cmd.ftf_flags |= IDE_FTFLAG_FLAGGED; @@ -507,28 +511,28 @@ int ide_taskfile_ioctl(ide_drive_t *drive, unsigned long arg) cmd.ftf_flags |= IDE_FTFLAG_OUT_DATA; if (req_task->out_flags.b.nsector_hob) - cmd.tf_flags |= IDE_TFLAG_OUT_HOB_NSECT; + cmd.valid.out.hob |= IDE_VALID_NSECT; if (req_task->out_flags.b.sector_hob) - cmd.tf_flags |= IDE_TFLAG_OUT_HOB_LBAL; + cmd.valid.out.hob |= IDE_VALID_LBAL; if (req_task->out_flags.b.lcyl_hob) - cmd.tf_flags |= IDE_TFLAG_OUT_HOB_LBAM; + cmd.valid.out.hob |= IDE_VALID_LBAM; if (req_task->out_flags.b.hcyl_hob) - cmd.tf_flags |= IDE_TFLAG_OUT_HOB_LBAH; + cmd.valid.out.hob |= IDE_VALID_LBAH; if (req_task->out_flags.b.error_feature) - cmd.tf_flags |= IDE_TFLAG_OUT_FEATURE; + cmd.valid.out.tf |= IDE_VALID_FEATURE; if (req_task->out_flags.b.nsector) - cmd.tf_flags |= IDE_TFLAG_OUT_NSECT; + cmd.valid.out.tf |= IDE_VALID_NSECT; if (req_task->out_flags.b.sector) - cmd.tf_flags |= IDE_TFLAG_OUT_LBAL; + cmd.valid.out.tf |= IDE_VALID_LBAL; if (req_task->out_flags.b.lcyl) - cmd.tf_flags |= IDE_TFLAG_OUT_LBAM; + cmd.valid.out.tf |= IDE_VALID_LBAM; if (req_task->out_flags.b.hcyl) - cmd.tf_flags |= IDE_TFLAG_OUT_LBAH; + cmd.valid.out.tf |= IDE_VALID_LBAH; } else { - cmd.tf_flags |= IDE_TFLAG_OUT_TF; + cmd.valid.out.tf |= IDE_VALID_OUT_TF; if (cmd.tf_flags & IDE_TFLAG_LBA48) - cmd.tf_flags |= IDE_TFLAG_OUT_HOB; + cmd.valid.out.hob |= IDE_VALID_OUT_HOB; } if (req_task->in_flags.b.data) diff --git a/drivers/ide/ns87415.c b/drivers/ide/ns87415.c index 71a39fb3856f..0208dd35c1a3 100644 --- a/drivers/ide/ns87415.c +++ b/drivers/ide/ns87415.c @@ -65,35 +65,38 @@ static void superio_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) { struct ide_io_ports *io_ports = &drive->hwif->io_ports; struct ide_taskfile *tf = &cmd->tf; + u8 valid = cmd->valid.in.tf; /* be sure we're looking at the low order bits */ 
outb(ATA_DEVCTL_OBS, io_ports->ctl_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_ERROR) + if (valid & IDE_VALID_ERROR) tf->error = inb(io_ports->feature_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_NSECT) + if (valid & IDE_VALID_NSECT) tf->nsect = inb(io_ports->nsect_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_LBAL) + if (valid & IDE_VALID_LBAL) tf->lbal = inb(io_ports->lbal_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_LBAM) + if (valid & IDE_VALID_LBAM) tf->lbam = inb(io_ports->lbam_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_LBAH) + if (valid & IDE_VALID_LBAH) tf->lbah = inb(io_ports->lbah_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_DEVICE) + if (valid & IDE_VALID_DEVICE) tf->device = superio_ide_inb(io_ports->device_addr); if (cmd->tf_flags & IDE_TFLAG_LBA48) { outb(ATA_HOB | ATA_DEVCTL_OBS, io_ports->ctl_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_HOB_ERROR) + valid = cmd->valid.in.hob; + + if (valid & IDE_VALID_ERROR) tf->hob_error = inb(io_ports->feature_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_HOB_NSECT) + if (valid & IDE_VALID_NSECT) tf->hob_nsect = inb(io_ports->nsect_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAL) + if (valid & IDE_VALID_LBAL) tf->hob_lbal = inb(io_ports->lbal_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAM) + if (valid & IDE_VALID_LBAM) tf->hob_lbam = inb(io_ports->lbam_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAH) + if (valid & IDE_VALID_LBAH) tf->hob_lbah = inb(io_ports->lbah_addr); } } diff --git a/drivers/ide/scc_pata.c b/drivers/ide/scc_pata.c index 55e48db7d1be..38a715e293d4 100644 --- a/drivers/ide/scc_pata.c +++ b/drivers/ide/scc_pata.c @@ -649,34 +649,37 @@ static void scc_tf_load(ide_drive_t *drive, struct ide_cmd *cmd) { struct ide_io_ports *io_ports = &drive->hwif->io_ports; struct ide_taskfile *tf = &cmd->tf; + u8 valid = cmd->valid.out.hob; u8 HIHI = (cmd->tf_flags & IDE_TFLAG_LBA48) ? 
0xE0 : 0xEF; if (cmd->ftf_flags & IDE_FTFLAG_FLAGGED) HIHI = 0xFF; - if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_FEATURE) + if (valid & IDE_VALID_FEATURE) scc_ide_outb(tf->hob_feature, io_ports->feature_addr); - if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_NSECT) + if (valid & IDE_VALID_NSECT) scc_ide_outb(tf->hob_nsect, io_ports->nsect_addr); - if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_LBAL) + if (valid & IDE_VALID_LBAL) scc_ide_outb(tf->hob_lbal, io_ports->lbal_addr); - if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_LBAM) + if (valid & IDE_VALID_LBAM) scc_ide_outb(tf->hob_lbam, io_ports->lbam_addr); - if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_LBAH) + if (valid & IDE_VALID_LBAH) scc_ide_outb(tf->hob_lbah, io_ports->lbah_addr); - if (cmd->tf_flags & IDE_TFLAG_OUT_FEATURE) + valid = cmd->valid.out.tf; + + if (valid & IDE_VALID_FEATURE) scc_ide_outb(tf->feature, io_ports->feature_addr); - if (cmd->tf_flags & IDE_TFLAG_OUT_NSECT) + if (valid & IDE_VALID_NSECT) scc_ide_outb(tf->nsect, io_ports->nsect_addr); - if (cmd->tf_flags & IDE_TFLAG_OUT_LBAL) + if (valid & IDE_VALID_LBAL) scc_ide_outb(tf->lbal, io_ports->lbal_addr); - if (cmd->tf_flags & IDE_TFLAG_OUT_LBAM) + if (valid & IDE_VALID_LBAM) scc_ide_outb(tf->lbam, io_ports->lbam_addr); - if (cmd->tf_flags & IDE_TFLAG_OUT_LBAH) + if (valid & IDE_VALID_LBAH) scc_ide_outb(tf->lbah, io_ports->lbah_addr); - if (cmd->tf_flags & IDE_TFLAG_OUT_DEVICE) + if (valid & IDE_VALID_DEVICE) scc_ide_outb((tf->device & HIHI) | drive->select, io_ports->device_addr); } @@ -685,35 +688,38 @@ static void scc_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) { struct ide_io_ports *io_ports = &drive->hwif->io_ports; struct ide_taskfile *tf = &cmd->tf; + u8 valid = cmd->valid.in.tf; /* be sure we're looking at the low order bits */ scc_ide_outb(ATA_DEVCTL_OBS, io_ports->ctl_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_ERROR) + if (valid & IDE_VALID_ERROR) tf->error = scc_ide_inb(io_ports->feature_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_NSECT) + if (valid & IDE_VALID_NSECT) tf->nsect = scc_ide_inb(io_ports->nsect_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_LBAL) + if (valid & IDE_VALID_LBAL) tf->lbal = scc_ide_inb(io_ports->lbal_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_LBAM) + if (valid & IDE_VALID_LBAM) tf->lbam = scc_ide_inb(io_ports->lbam_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_LBAH) + if (valid & IDE_VALID_LBAH) tf->lbah = scc_ide_inb(io_ports->lbah_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_DEVICE) + if (valid & IDE_VALID_DEVICE) tf->device = scc_ide_inb(io_ports->device_addr); if (cmd->tf_flags & IDE_TFLAG_LBA48) { scc_ide_outb(ATA_HOB | ATA_DEVCTL_OBS, io_ports->ctl_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_HOB_ERROR) + valid = cmd->valid.in.hob; + + if (valid & IDE_VALID_ERROR) tf->hob_error = scc_ide_inb(io_ports->feature_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_HOB_NSECT) + if (valid & IDE_VALID_NSECT) tf->hob_nsect = scc_ide_inb(io_ports->nsect_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAL) + if (valid & IDE_VALID_LBAL) tf->hob_lbal = scc_ide_inb(io_ports->lbal_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAM) + if (valid & IDE_VALID_LBAM) tf->hob_lbam = scc_ide_inb(io_ports->lbam_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAH) + if (valid & IDE_VALID_LBAH) tf->hob_lbah = scc_ide_inb(io_ports->lbah_addr); } } diff --git a/drivers/ide/tx4939ide.c b/drivers/ide/tx4939ide.c index bee9461f13b3..af8b0f68d5cf 100644 --- a/drivers/ide/tx4939ide.c +++ b/drivers/ide/tx4939ide.c @@ -438,7 +438,7 @@ static void tx4939ide_tf_load(ide_drive_t *drive, struct ide_cmd *cmd) { 
ide_tf_load(drive, cmd); - if (cmd->tf_flags & IDE_TFLAG_OUT_DEVICE) + if (cmd->valid.out.tf & IDE_VALID_DEVICE) tx4939ide_tf_load_fixup(drive); } diff --git a/include/linux/ide.h b/include/linux/ide.h index a5d26f66ef78..58951f5540bf 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -239,66 +239,39 @@ typedef enum { ide_started, /* a drive operation was started, handler was set */ } ide_startstop_t; +enum { + IDE_VALID_ERROR = (1 << 1), + IDE_VALID_FEATURE = IDE_VALID_ERROR, + IDE_VALID_NSECT = (1 << 2), + IDE_VALID_LBAL = (1 << 3), + IDE_VALID_LBAM = (1 << 4), + IDE_VALID_LBAH = (1 << 5), + IDE_VALID_DEVICE = (1 << 6), + IDE_VALID_LBA = IDE_VALID_LBAL | + IDE_VALID_LBAM | + IDE_VALID_LBAH, + IDE_VALID_OUT_TF = IDE_VALID_FEATURE | + IDE_VALID_NSECT | + IDE_VALID_LBA, + IDE_VALID_IN_TF = IDE_VALID_NSECT | + IDE_VALID_LBA, + IDE_VALID_OUT_HOB = IDE_VALID_OUT_TF, + IDE_VALID_IN_HOB = IDE_VALID_ERROR | + IDE_VALID_NSECT | + IDE_VALID_LBA, +}; + enum { IDE_TFLAG_LBA48 = (1 << 0), - IDE_TFLAG_OUT_HOB_FEATURE = (1 << 1), - IDE_TFLAG_OUT_HOB_NSECT = (1 << 2), - IDE_TFLAG_OUT_HOB_LBAL = (1 << 3), - IDE_TFLAG_OUT_HOB_LBAM = (1 << 4), - IDE_TFLAG_OUT_HOB_LBAH = (1 << 5), - IDE_TFLAG_OUT_HOB = IDE_TFLAG_OUT_HOB_FEATURE | - IDE_TFLAG_OUT_HOB_NSECT | - IDE_TFLAG_OUT_HOB_LBAL | - IDE_TFLAG_OUT_HOB_LBAM | - IDE_TFLAG_OUT_HOB_LBAH, - IDE_TFLAG_OUT_FEATURE = (1 << 6), - IDE_TFLAG_OUT_NSECT = (1 << 7), - IDE_TFLAG_OUT_LBAL = (1 << 8), - IDE_TFLAG_OUT_LBAM = (1 << 9), - IDE_TFLAG_OUT_LBAH = (1 << 10), - IDE_TFLAG_OUT_TF = IDE_TFLAG_OUT_FEATURE | - IDE_TFLAG_OUT_NSECT | - IDE_TFLAG_OUT_LBAL | - IDE_TFLAG_OUT_LBAM | - IDE_TFLAG_OUT_LBAH, - IDE_TFLAG_OUT_DEVICE = (1 << 11), - IDE_TFLAG_WRITE = (1 << 12), - IDE_TFLAG_CUSTOM_HANDLER = (1 << 13), - IDE_TFLAG_DMA_PIO_FALLBACK = (1 << 14), - IDE_TFLAG_IN_HOB_ERROR = (1 << 15), - IDE_TFLAG_IN_HOB_NSECT = (1 << 16), - IDE_TFLAG_IN_HOB_LBAL = (1 << 17), - IDE_TFLAG_IN_HOB_LBAM = (1 << 18), - IDE_TFLAG_IN_HOB_LBAH = (1 << 19), - IDE_TFLAG_IN_HOB_LBA = IDE_TFLAG_IN_HOB_LBAL | - IDE_TFLAG_IN_HOB_LBAM | - IDE_TFLAG_IN_HOB_LBAH, - IDE_TFLAG_IN_HOB = IDE_TFLAG_IN_HOB_ERROR | - IDE_TFLAG_IN_HOB_NSECT | - IDE_TFLAG_IN_HOB_LBA, - IDE_TFLAG_IN_ERROR = (1 << 20), - IDE_TFLAG_IN_NSECT = (1 << 21), - IDE_TFLAG_IN_LBAL = (1 << 22), - IDE_TFLAG_IN_LBAM = (1 << 23), - IDE_TFLAG_IN_LBAH = (1 << 24), - IDE_TFLAG_IN_LBA = IDE_TFLAG_IN_LBAL | - IDE_TFLAG_IN_LBAM | - IDE_TFLAG_IN_LBAH, - IDE_TFLAG_IN_TF = IDE_TFLAG_IN_NSECT | - IDE_TFLAG_IN_LBA, - IDE_TFLAG_IN_DEVICE = (1 << 25), - IDE_TFLAG_HOB = IDE_TFLAG_OUT_HOB | - IDE_TFLAG_IN_HOB, - IDE_TFLAG_TF = IDE_TFLAG_OUT_TF | - IDE_TFLAG_IN_TF, - IDE_TFLAG_DEVICE = IDE_TFLAG_OUT_DEVICE | - IDE_TFLAG_IN_DEVICE, + IDE_TFLAG_WRITE = (1 << 1), + IDE_TFLAG_CUSTOM_HANDLER = (1 << 2), + IDE_TFLAG_DMA_PIO_FALLBACK = (1 << 3), /* force 16-bit I/O operations */ - IDE_TFLAG_IO_16BIT = (1 << 26), + IDE_TFLAG_IO_16BIT = (1 << 4), /* struct ide_cmd was allocated using kmalloc() */ - IDE_TFLAG_DYN = (1 << 27), - IDE_TFLAG_FS = (1 << 28), - IDE_TFLAG_MULTI_PIO = (1 << 29), + IDE_TFLAG_DYN = (1 << 5), + IDE_TFLAG_FS = (1 << 6), + IDE_TFLAG_MULTI_PIO = (1 << 7), }; enum { @@ -346,8 +319,16 @@ struct ide_cmd { struct ide_taskfile tf; u8 tf_array[14]; }; + + struct { + struct { + u8 tf; + u8 hob; + } out, in; + } valid; + + u8 tf_flags; u8 ftf_flags; /* for TASKFILE ioctl */ - u32 tf_flags; int protocol; int sg_nents; /* number of sg entries */ -- cgit v1.2.3 From 745483f10c6cefb303007c6873e2bfce54efa8ed Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov 
Date: Wed, 8 Apr 2009 14:13:02 +0200 Subject: ide: simplify 'struct ide_taskfile' Make 'struct ide_taskfile' cover only 8 register values and thus put two such fields ('tf' and 'hob') into 'struct ide_cmd', dropping unnecessary 'tf_array' field from it. This required changing the prototype of ide_get_lba_addr() and ide_tf_dump(). Signed-off-by: Sergei Shtylyov [bart: fix setting of ATA_LBA bit for LBA48 commands in __ide_do_rw_disk()] Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-acpi.c | 2 +- drivers/ide/ide-disk.c | 30 ++++++++++++++-------------- drivers/ide/ide-io-std.c | 24 ++++++++++++----------- drivers/ide/ide-io.c | 6 +++--- drivers/ide/ide-ioctls.c | 4 ++-- drivers/ide/ide-lib.c | 15 +++++++------- drivers/ide/ide-taskfile.c | 31 ++++++++++++----------------- drivers/ide/ns87415.c | 11 ++++++----- drivers/ide/scc_pata.c | 24 ++++++++++++----------- include/linux/ide.h | 49 ++++++++++++++-------------------------------- 10 files changed, 89 insertions(+), 107 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-acpi.c b/drivers/ide/ide-acpi.c index f0db4d349c60..77f79d26b264 100644 --- a/drivers/ide/ide-acpi.c +++ b/drivers/ide/ide-acpi.c @@ -318,7 +318,7 @@ static int do_drive_set_taskfiles(ide_drive_t *drive, /* convert GTF to taskfile */ memset(&cmd, 0, sizeof(cmd)); - memcpy(&cmd.tf_array[7], gtf, REGS_PER_GTF); + memcpy(&cmd.tf.feature, gtf, REGS_PER_GTF); cmd.valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE; cmd.valid.in.tf = IDE_VALID_IN_TF | IDE_VALID_DEVICE; diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c index 235263e51dd9..a9fbe2c31210 100644 --- a/drivers/ide/ide-disk.c +++ b/drivers/ide/ide-disk.c @@ -105,17 +105,19 @@ static ide_startstop_t __ide_do_rw_disk(ide_drive_t *drive, struct request *rq, pr_debug("%s: LBA=0x%012llx\n", drive->name, (unsigned long long)block); - tf->hob_nsect = (nsectors >> 8) & 0xff; - tf->hob_lbal = (u8)(block >> 24); - if (sizeof(block) != 4) { - tf->hob_lbam = (u8)((u64)block >> 32); - tf->hob_lbah = (u8)((u64)block >> 40); - } - tf->nsect = nsectors & 0xff; tf->lbal = (u8) block; tf->lbam = (u8)(block >> 8); tf->lbah = (u8)(block >> 16); + tf->device = ATA_LBA; + + tf = &cmd.hob; + tf->nsect = (nsectors >> 8) & 0xff; + tf->lbal = (u8)(block >> 24); + if (sizeof(block) != 4) { + tf->lbam = (u8)((u64)block >> 32); + tf->lbah = (u8)((u64)block >> 40); + } cmd.valid.out.hob = IDE_VALID_OUT_HOB; cmd.valid.in.hob = IDE_VALID_IN_HOB; @@ -125,10 +127,8 @@ static ide_startstop_t __ide_do_rw_disk(ide_drive_t *drive, struct request *rq, tf->lbal = block; tf->lbam = block >>= 8; tf->lbah = block >>= 8; - tf->device = (block >> 8) & 0xf; + tf->device = ((block >> 8) & 0xf) | ATA_LBA; } - - tf->device |= ATA_LBA; } else { unsigned int sect, head, cyl, track; @@ -235,7 +235,7 @@ static u64 idedisk_read_native_max_address(ide_drive_t *drive, int lba48) /* if OK, compute maximum address value */ if (!(tf->status & ATA_ERR)) - addr = ide_get_lba_addr(tf, lba48) + 1; + addr = ide_get_lba_addr(&cmd, lba48) + 1; return addr; } @@ -257,9 +257,9 @@ static u64 idedisk_set_max_address(ide_drive_t *drive, u64 addr_req, int lba48) tf->lbam = (addr_req >>= 8) & 0xff; tf->lbah = (addr_req >>= 8) & 0xff; if (lba48) { - tf->hob_lbal = (addr_req >>= 8) & 0xff; - tf->hob_lbam = (addr_req >>= 8) & 0xff; - tf->hob_lbah = (addr_req >>= 8) & 0xff; + cmd.hob.lbal = (addr_req >>= 8) & 0xff; + cmd.hob.lbam = (addr_req >>= 8) & 0xff; + cmd.hob.lbah = (addr_req >>= 8) & 0xff; tf->command = ATA_CMD_SET_MAX_EXT; } else { tf->device = 
(addr_req >>= 8) & 0x0f; @@ -279,7 +279,7 @@ static u64 idedisk_set_max_address(ide_drive_t *drive, u64 addr_req, int lba48) /* if OK, compute maximum address value */ if (!(tf->status & ATA_ERR)) - addr_set = ide_get_lba_addr(tf, lba48) + 1; + addr_set = ide_get_lba_addr(&cmd, lba48) + 1; return addr_set; } diff --git a/drivers/ide/ide-io-std.c b/drivers/ide/ide-io-std.c index 8b0b2e9ccf5b..45a424b60c85 100644 --- a/drivers/ide/ide-io-std.c +++ b/drivers/ide/ide-io-std.c @@ -89,7 +89,7 @@ void ide_tf_load(ide_drive_t *drive, struct ide_cmd *cmd) { ide_hwif_t *hwif = drive->hwif; struct ide_io_ports *io_ports = &hwif->io_ports; - struct ide_taskfile *tf = &cmd->tf; + struct ide_taskfile *tf = &cmd->hob; void (*tf_outb)(u8 addr, unsigned long port); u8 valid = cmd->valid.out.hob; u8 mmio = (hwif->host_flags & IDE_HFLAG_MMIO) ? 1 : 0; @@ -104,16 +104,17 @@ void ide_tf_load(ide_drive_t *drive, struct ide_cmd *cmd) HIHI = 0xFF; if (valid & IDE_VALID_FEATURE) - tf_outb(tf->hob_feature, io_ports->feature_addr); + tf_outb(tf->feature, io_ports->feature_addr); if (valid & IDE_VALID_NSECT) - tf_outb(tf->hob_nsect, io_ports->nsect_addr); + tf_outb(tf->nsect, io_ports->nsect_addr); if (valid & IDE_VALID_LBAL) - tf_outb(tf->hob_lbal, io_ports->lbal_addr); + tf_outb(tf->lbal, io_ports->lbal_addr); if (valid & IDE_VALID_LBAM) - tf_outb(tf->hob_lbam, io_ports->lbam_addr); + tf_outb(tf->lbam, io_ports->lbam_addr); if (valid & IDE_VALID_LBAH) - tf_outb(tf->hob_lbah, io_ports->lbah_addr); + tf_outb(tf->lbah, io_ports->lbah_addr); + tf = &cmd->tf; valid = cmd->valid.out.tf; if (valid & IDE_VALID_FEATURE) @@ -170,18 +171,19 @@ void ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) if (cmd->tf_flags & IDE_TFLAG_LBA48) { tf_outb(ATA_HOB | ATA_DEVCTL_OBS, io_ports->ctl_addr); + tf = &cmd->hob; valid = cmd->valid.in.hob; if (valid & IDE_VALID_ERROR) - tf->hob_error = tf_inb(io_ports->feature_addr); + tf->error = tf_inb(io_ports->feature_addr); if (valid & IDE_VALID_NSECT) - tf->hob_nsect = tf_inb(io_ports->nsect_addr); + tf->nsect = tf_inb(io_ports->nsect_addr); if (valid & IDE_VALID_LBAL) - tf->hob_lbal = tf_inb(io_ports->lbal_addr); + tf->lbal = tf_inb(io_ports->lbal_addr); if (valid & IDE_VALID_LBAM) - tf->hob_lbam = tf_inb(io_ports->lbam_addr); + tf->lbam = tf_inb(io_ports->lbam_addr); if (valid & IDE_VALID_LBAH) - tf->hob_lbah = tf_inb(io_ports->lbah_addr); + tf->lbah = tf_inb(io_ports->lbah_addr); } } EXPORT_SYMBOL_GPL(ide_tf_read); diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 99bb0a9a67e8..e71c72be7622 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -86,8 +86,8 @@ void ide_complete_cmd(ide_drive_t *drive, struct ide_cmd *cmd, u8 stat, u8 err) tp_ops->input_data(drive, cmd, data, 2); - tf->data = data[0]; - tf->hob_data = data[1]; + cmd->tf.data = data[0]; + cmd->hob.data = data[1]; } tp_ops->tf_read(drive, cmd); @@ -97,7 +97,7 @@ void ide_complete_cmd(ide_drive_t *drive, struct ide_cmd *cmd, u8 stat, u8 err) if (tf->lbal != 0xc4) { printk(KERN_ERR "%s: head unload failed!\n", drive->name); - ide_tf_dump(drive->name, tf); + ide_tf_dump(drive->name, cmd); } else drive->dev_flags |= IDE_DFLAG_PARKED; } diff --git a/drivers/ide/ide-ioctls.c b/drivers/ide/ide-ioctls.c index b11df4b7998e..c1c25ebbaa1f 100644 --- a/drivers/ide/ide-ioctls.c +++ b/drivers/ide/ide-ioctls.c @@ -206,7 +206,7 @@ static int ide_task_ioctl(ide_drive_t *drive, unsigned long arg) return -EFAULT; memset(&cmd, 0, sizeof(cmd)); - memcpy(&cmd.tf_array[7], &args[1], 6); + memcpy(&cmd.tf.feature, &args[1], 
6); cmd.tf.command = args[0]; cmd.valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE; cmd.valid.in.tf = IDE_VALID_IN_TF | IDE_VALID_DEVICE; @@ -214,7 +214,7 @@ static int ide_task_ioctl(ide_drive_t *drive, unsigned long arg) err = ide_no_data_taskfile(drive, &cmd); args[0] = cmd.tf.command; - memcpy(&args[1], &cmd.tf_array[7], 6); + memcpy(&args[1], &cmd.tf.feature, 6); if (copy_to_user(p, args, 7)) err = -EFAULT; diff --git a/drivers/ide/ide-lib.c b/drivers/ide/ide-lib.c index c9ef77c5d62e..6857e6aaf20d 100644 --- a/drivers/ide/ide-lib.c +++ b/drivers/ide/ide-lib.c @@ -49,16 +49,17 @@ static void ide_dump_opcode(ide_drive_t *drive) printk(KERN_CONT "0x%02x\n", cmd->tf.command); } -u64 ide_get_lba_addr(struct ide_taskfile *tf, int lba48) +u64 ide_get_lba_addr(struct ide_cmd *cmd, int lba48) { + struct ide_taskfile *tf = &cmd->tf; u32 high, low; - if (lba48) - high = (tf->hob_lbah << 16) | (tf->hob_lbam << 8) | - tf->hob_lbal; - else - high = tf->device & 0xf; low = (tf->lbah << 16) | (tf->lbam << 8) | tf->lbal; + if (lba48) { + tf = &cmd->hob; + high = (tf->lbah << 16) | (tf->lbam << 8) | tf->lbal; + } else + high = tf->device & 0xf; return ((u64)high << 24) | low; } @@ -82,7 +83,7 @@ static void ide_dump_sector(ide_drive_t *drive) if (lba48 || (tf->device & ATA_LBA)) printk(KERN_CONT ", LBAsect=%llu", - (unsigned long long)ide_get_lba_addr(tf, lba48)); + (unsigned long long)ide_get_lba_addr(&cmd, lba48)); else printk(KERN_CONT ", CHS=%d/%d/%d", (tf->lbah << 8) + tf->lbam, tf->device & 0xf, tf->lbal); diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index dc84f8bde52a..3160be494aa0 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -23,17 +23,16 @@ #include #include -void ide_tf_dump(const char *s, struct ide_taskfile *tf) +void ide_tf_dump(const char *s, struct ide_cmd *cmd) { #ifdef DEBUG printk("%s: tf: feat 0x%02x nsect 0x%02x lbal 0x%02x " "lbam 0x%02x lbah 0x%02x dev 0x%02x cmd 0x%02x\n", - s, tf->feature, tf->nsect, tf->lbal, - tf->lbam, tf->lbah, tf->device, tf->command); - printk("%s: hob: nsect 0x%02x lbal 0x%02x " - "lbam 0x%02x lbah 0x%02x\n", - s, tf->hob_nsect, tf->hob_lbal, - tf->hob_lbam, tf->hob_lbah); + s, cmd->tf.feature, cmd->tf.nsect, + cmd->tf.lbal, cmd->tf.lbam, cmd->tf.lbah, + cmd->tf.device, cmd->tf.command); + printk("%s: hob: nsect 0x%02x lbal 0x%02x lbam 0x%02x lbah 0x%02x\n", + s, cmd->hob.nsect, cmd->hob.lbal, cmd->hob.lbam, cmd->hob.lbah); #endif } @@ -80,12 +79,12 @@ ide_startstop_t do_rw_taskfile(ide_drive_t *drive, struct ide_cmd *orig_cmd) memcpy(cmd, orig_cmd, sizeof(*cmd)); if ((cmd->tf_flags & IDE_TFLAG_DMA_PIO_FALLBACK) == 0) { - ide_tf_dump(drive->name, tf); + ide_tf_dump(drive->name, cmd); tp_ops->write_devctl(hwif, ATA_DEVCTL_OBS); SELECT_MASK(drive, 0); if (cmd->ftf_flags & IDE_FTFLAG_OUT_DATA) { - u8 data[2] = { tf->data, tf->hob_data }; + u8 data[2] = { cmd->tf.data, cmd->hob.data }; tp_ops->output_data(drive, cmd, data, 2); } @@ -490,10 +489,8 @@ int ide_taskfile_ioctl(ide_drive_t *drive, unsigned long arg) memset(&cmd, 0, sizeof(cmd)); - memcpy(&cmd.tf_array[0], req_task->hob_ports, - HDIO_DRIVE_HOB_HDR_SIZE - 2); - memcpy(&cmd.tf_array[6], req_task->io_ports, - HDIO_DRIVE_TASK_HDR_SIZE); + memcpy(&cmd.hob, req_task->hob_ports, HDIO_DRIVE_HOB_HDR_SIZE - 2); + memcpy(&cmd.tf, req_task->io_ports, HDIO_DRIVE_TASK_HDR_SIZE); cmd.valid.out.tf = IDE_VALID_DEVICE; cmd.valid.in.tf = IDE_VALID_DEVICE | IDE_VALID_IN_TF; @@ -598,7 +595,7 @@ int ide_taskfile_ioctl(ide_drive_t *drive, unsigned long arg) if 
(req_task->req_cmd == IDE_DRIVE_TASK_NO_DATA) nsect = 0; else if (!nsect) { - nsect = (cmd.tf.hob_nsect << 8) | cmd.tf.nsect; + nsect = (cmd.hob.nsect << 8) | cmd.tf.nsect; if (!nsect) { printk(KERN_ERR "%s: in/out command without data\n", @@ -610,10 +607,8 @@ int ide_taskfile_ioctl(ide_drive_t *drive, unsigned long arg) err = ide_raw_taskfile(drive, &cmd, data_buf, nsect); - memcpy(req_task->hob_ports, &cmd.tf_array[0], - HDIO_DRIVE_HOB_HDR_SIZE - 2); - memcpy(req_task->io_ports, &cmd.tf_array[6], - HDIO_DRIVE_TASK_HDR_SIZE); + memcpy(req_task->hob_ports, &cmd.hob, HDIO_DRIVE_HOB_HDR_SIZE - 2); + memcpy(req_task->io_ports, &cmd.tf, HDIO_DRIVE_TASK_HDR_SIZE); if ((cmd.ftf_flags & IDE_FTFLAG_SET_IN_FLAGS) && req_task->in_flags.all == 0) { diff --git a/drivers/ide/ns87415.c b/drivers/ide/ns87415.c index 0208dd35c1a3..3ab5bb196d2f 100644 --- a/drivers/ide/ns87415.c +++ b/drivers/ide/ns87415.c @@ -86,18 +86,19 @@ static void superio_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) if (cmd->tf_flags & IDE_TFLAG_LBA48) { outb(ATA_HOB | ATA_DEVCTL_OBS, io_ports->ctl_addr); + tf = &cmd->hob; valid = cmd->valid.in.hob; if (valid & IDE_VALID_ERROR) - tf->hob_error = inb(io_ports->feature_addr); + tf->error = inb(io_ports->feature_addr); if (valid & IDE_VALID_NSECT) - tf->hob_nsect = inb(io_ports->nsect_addr); + tf->nsect = inb(io_ports->nsect_addr); if (valid & IDE_VALID_LBAL) - tf->hob_lbal = inb(io_ports->lbal_addr); + tf->lbal = inb(io_ports->lbal_addr); if (valid & IDE_VALID_LBAM) - tf->hob_lbam = inb(io_ports->lbam_addr); + tf->lbam = inb(io_ports->lbam_addr); if (valid & IDE_VALID_LBAH) - tf->hob_lbah = inb(io_ports->lbah_addr); + tf->lbah = inb(io_ports->lbah_addr); } } diff --git a/drivers/ide/scc_pata.c b/drivers/ide/scc_pata.c index 38a715e293d4..1238d5561976 100644 --- a/drivers/ide/scc_pata.c +++ b/drivers/ide/scc_pata.c @@ -648,7 +648,7 @@ static int __devinit init_setup_scc(struct pci_dev *dev, static void scc_tf_load(ide_drive_t *drive, struct ide_cmd *cmd) { struct ide_io_ports *io_ports = &drive->hwif->io_ports; - struct ide_taskfile *tf = &cmd->tf; + struct ide_taskfile *tf = &cmd->hob; u8 valid = cmd->valid.out.hob; u8 HIHI = (cmd->tf_flags & IDE_TFLAG_LBA48) ? 
0xE0 : 0xEF; @@ -656,16 +656,17 @@ static void scc_tf_load(ide_drive_t *drive, struct ide_cmd *cmd) HIHI = 0xFF; if (valid & IDE_VALID_FEATURE) - scc_ide_outb(tf->hob_feature, io_ports->feature_addr); + scc_ide_outb(tf->feature, io_ports->feature_addr); if (valid & IDE_VALID_NSECT) - scc_ide_outb(tf->hob_nsect, io_ports->nsect_addr); + scc_ide_outb(tf->nsect, io_ports->nsect_addr); if (valid & IDE_VALID_LBAL) - scc_ide_outb(tf->hob_lbal, io_ports->lbal_addr); + scc_ide_outb(tf->lbal, io_ports->lbal_addr); if (valid & IDE_VALID_LBAM) - scc_ide_outb(tf->hob_lbam, io_ports->lbam_addr); + scc_ide_outb(tf->lbam, io_ports->lbam_addr); if (valid & IDE_VALID_LBAH) - scc_ide_outb(tf->hob_lbah, io_ports->lbah_addr); + scc_ide_outb(tf->lbah, io_ports->lbah_addr); + tf = &cmd->tf; valid = cmd->valid.out.tf; if (valid & IDE_VALID_FEATURE) @@ -709,18 +710,19 @@ static void scc_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) if (cmd->tf_flags & IDE_TFLAG_LBA48) { scc_ide_outb(ATA_HOB | ATA_DEVCTL_OBS, io_ports->ctl_addr); + tf = &cmd->hob; valid = cmd->valid.in.hob; if (valid & IDE_VALID_ERROR) - tf->hob_error = scc_ide_inb(io_ports->feature_addr); + tf->error = scc_ide_inb(io_ports->feature_addr); if (valid & IDE_VALID_NSECT) - tf->hob_nsect = scc_ide_inb(io_ports->nsect_addr); + tf->nsect = scc_ide_inb(io_ports->nsect_addr); if (valid & IDE_VALID_LBAL) - tf->hob_lbal = scc_ide_inb(io_ports->lbal_addr); + tf->lbal = scc_ide_inb(io_ports->lbal_addr); if (valid & IDE_VALID_LBAM) - tf->hob_lbam = scc_ide_inb(io_ports->lbam_addr); + tf->lbam = scc_ide_inb(io_ports->lbam_addr); if (valid & IDE_VALID_LBAH) - tf->hob_lbah = scc_ide_inb(io_ports->lbah_addr); + tf->lbah = scc_ide_inb(io_ports->lbah_addr); } } diff --git a/include/linux/ide.h b/include/linux/ide.h index 58951f5540bf..e2ea38df26bc 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -282,44 +282,25 @@ enum { }; struct ide_taskfile { - u8 hob_data; /* 0: high data byte (for TASKFILE IOCTL) */ - /* 1-5: additional data to support LBA48 */ - union { - u8 hob_error; /* read: error */ - u8 hob_feature; /* write: feature */ - }; - - u8 hob_nsect; - u8 hob_lbal; - u8 hob_lbam; - u8 hob_lbah; - - u8 data; /* 6: low data byte (for TASKFILE IOCTL) */ - - union { /*  7: */ - u8 error; /* read: error */ - u8 feature; /* write: feature */ + u8 data; /* 0: data byte (for TASKFILE ioctl) */ + union { /* 1: */ + u8 error; /* read: error */ + u8 feature; /* write: feature */ }; - - u8 nsect; /* 8: number of sectors */ - u8 lbal; /* 9: LBA low */ - u8 lbam; /* 10: LBA mid */ - u8 lbah; /* 11: LBA high */ - - u8 device; /* 12: device select */ - - union { /* 13: */ - u8 status; /*  read: status  */ + u8 nsect; /* 2: number of sectors */ + u8 lbal; /* 3: LBA low */ + u8 lbam; /* 4: LBA mid */ + u8 lbah; /* 5: LBA high */ + u8 device; /* 6: device select */ + union { /* 7: */ + u8 status; /* read: status */ u8 command; /* write: command */ }; }; struct ide_cmd { - union { - struct ide_taskfile tf; - u8 tf_array[14]; - }; - + struct ide_taskfile tf; + struct ide_taskfile hob; struct { struct { u8 tf; @@ -1143,7 +1124,7 @@ extern int ide_devset_execute(ide_drive_t *drive, void ide_complete_cmd(ide_drive_t *, struct ide_cmd *, u8, u8); int ide_complete_rq(ide_drive_t *, int, unsigned int); -void ide_tf_dump(const char *, struct ide_taskfile *); +void ide_tf_dump(const char *, struct ide_cmd *); void ide_exec_command(ide_hwif_t *, u8); u8 ide_read_status(ide_hwif_t *); @@ -1510,7 +1491,7 @@ static inline void ide_set_hwifdata (ide_hwif_t * hwif, void *data) 
extern void ide_toggle_bounce(ide_drive_t *drive, int on); -u64 ide_get_lba_addr(struct ide_taskfile *, int); +u64 ide_get_lba_addr(struct ide_cmd *, int); u8 ide_dump_status(ide_drive_t *, const char *, u8); struct ide_timing { -- cgit v1.2.3 From c9ff9e7b64138d87023b733e618f29a1d58543f7 Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Wed, 8 Apr 2009 14:13:03 +0200 Subject: ide: refactor tf_load() method Simplify tf_load() method, making it deal only with 'struct ide_taskfile' and the validity flags that the upper layer passes, and moving the code that deals with the high order bytes into the only function interested, do_rw_taskfile(). This should stop the needless code duplication in this method and so make it about twice smaller than it was; along with simplifying the setup for the method call, this should save both time and space... Signed-off-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-io-std.c | 18 +----------------- drivers/ide/ide-iops.c | 11 +++++------ drivers/ide/ide-probe.c | 8 +++----- drivers/ide/ide-taskfile.c | 3 ++- drivers/ide/scc_pata.c | 18 +----------------- drivers/ide/tx4939ide.c | 7 ++++--- include/linux/ide.h | 4 ++-- 7 files changed, 18 insertions(+), 51 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-io-std.c b/drivers/ide/ide-io-std.c index 66c27768e85b..481e221b233d 100644 --- a/drivers/ide/ide-io-std.c +++ b/drivers/ide/ide-io-std.c @@ -85,13 +85,11 @@ void ide_dev_select(ide_drive_t *drive) } EXPORT_SYMBOL_GPL(ide_dev_select); -void ide_tf_load(ide_drive_t *drive, struct ide_cmd *cmd) +void ide_tf_load(ide_drive_t *drive, struct ide_taskfile *tf, u8 valid) { ide_hwif_t *hwif = drive->hwif; struct ide_io_ports *io_ports = &hwif->io_ports; - struct ide_taskfile *tf = &cmd->hob; void (*tf_outb)(u8 addr, unsigned long port); - u8 valid = cmd->valid.out.hob; u8 mmio = (hwif->host_flags & IDE_HFLAG_MMIO) ? 
1 : 0; if (mmio) @@ -99,20 +97,6 @@ void ide_tf_load(ide_drive_t *drive, struct ide_cmd *cmd) else tf_outb = ide_outb; - if (valid & IDE_VALID_FEATURE) - tf_outb(tf->feature, io_ports->feature_addr); - if (valid & IDE_VALID_NSECT) - tf_outb(tf->nsect, io_ports->nsect_addr); - if (valid & IDE_VALID_LBAL) - tf_outb(tf->lbal, io_ports->lbal_addr); - if (valid & IDE_VALID_LBAM) - tf_outb(tf->lbam, io_ports->lbam_addr); - if (valid & IDE_VALID_LBAH) - tf_outb(tf->lbah, io_ports->lbah_addr); - - tf = &cmd->tf; - valid = cmd->valid.out.tf; - if (valid & IDE_VALID_FEATURE) tf_outb(tf->feature, io_ports->feature_addr); if (valid & IDE_VALID_NSECT) diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c index 0fdf0dfb5743..6f1ed427a484 100644 --- a/drivers/ide/ide-iops.c +++ b/drivers/ide/ide-iops.c @@ -312,10 +312,10 @@ int ide_config_drive_speed(ide_drive_t *drive, u8 speed) { ide_hwif_t *hwif = drive->hwif; const struct ide_tp_ops *tp_ops = hwif->tp_ops; + struct ide_taskfile tf; u16 *id = drive->id, i; int error = 0; u8 stat; - struct ide_cmd cmd; #ifdef CONFIG_BLK_DEV_IDEDMA if (hwif->dma_ops) /* check if host supports DMA */ @@ -347,12 +347,11 @@ int ide_config_drive_speed(ide_drive_t *drive, u8 speed) udelay(1); tp_ops->write_devctl(hwif, ATA_NIEN | ATA_DEVCTL_OBS); - memset(&cmd, 0, sizeof(cmd)); - cmd.valid.out.tf = IDE_VALID_FEATURE | IDE_VALID_NSECT; - cmd.tf.feature = SETFEATURES_XFER; - cmd.tf.nsect = speed; + memset(&tf, 0, sizeof(tf)); + tf.feature = SETFEATURES_XFER; + tf.nsect = speed; - tp_ops->tf_load(drive, &cmd); + tp_ops->tf_load(drive, &tf, IDE_VALID_FEATURE | IDE_VALID_NSECT); tp_ops->exec_command(hwif, ATA_CMD_SET_FEATURES); diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index 6a98d7c1681a..44d7816c1fe9 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -283,13 +283,11 @@ int ide_dev_read_id(ide_drive_t *drive, u8 cmd, u16 *id) * identify command to be sure of reply */ if (cmd == ATA_CMD_ID_ATAPI) { - struct ide_cmd cmd; + struct ide_taskfile tf; - memset(&cmd, 0, sizeof(cmd)); + memset(&tf, 0, sizeof(tf)); /* disable DMA & overlap */ - cmd.valid.out.tf = IDE_VALID_FEATURE; - - tp_ops->tf_load(drive, &cmd); + tp_ops->tf_load(drive, &tf, IDE_VALID_FEATURE); } /* ask drive for ID */ diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index 0318a4cb09de..b1806ed46175 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -98,7 +98,8 @@ ide_startstop_t do_rw_taskfile(ide_drive_t *drive, struct ide_cmd *orig_cmd) cmd->tf.device |= drive->select; } - tp_ops->tf_load(drive, cmd); + tp_ops->tf_load(drive, &cmd->hob, cmd->valid.out.hob); + tp_ops->tf_load(drive, &cmd->tf, cmd->valid.out.tf); } switch (cmd->protocol) { diff --git a/drivers/ide/scc_pata.c b/drivers/ide/scc_pata.c index feabf5487049..5ecb70cf29dc 100644 --- a/drivers/ide/scc_pata.c +++ b/drivers/ide/scc_pata.c @@ -645,25 +645,9 @@ static int __devinit init_setup_scc(struct pci_dev *dev, return rc; } -static void scc_tf_load(ide_drive_t *drive, struct ide_cmd *cmd) +static void scc_tf_load(ide_drive_t *drive, struct ide_taskfile *tf, u8 valid) { struct ide_io_ports *io_ports = &drive->hwif->io_ports; - struct ide_taskfile *tf = &cmd->hob; - u8 valid = cmd->valid.out.hob; - - if (valid & IDE_VALID_FEATURE) - scc_ide_outb(tf->feature, io_ports->feature_addr); - if (valid & IDE_VALID_NSECT) - scc_ide_outb(tf->nsect, io_ports->nsect_addr); - if (valid & IDE_VALID_LBAL) - scc_ide_outb(tf->lbal, io_ports->lbal_addr); - if (valid & IDE_VALID_LBAM) - 
scc_ide_outb(tf->lbam, io_ports->lbam_addr); - if (valid & IDE_VALID_LBAH) - scc_ide_outb(tf->lbah, io_ports->lbah_addr); - - tf = &cmd->tf; - valid = cmd->valid.out.tf; if (valid & IDE_VALID_FEATURE) scc_ide_outb(tf->feature, io_ports->feature_addr); diff --git a/drivers/ide/tx4939ide.c b/drivers/ide/tx4939ide.c index af8b0f68d5cf..564422d23976 100644 --- a/drivers/ide/tx4939ide.c +++ b/drivers/ide/tx4939ide.c @@ -434,11 +434,12 @@ static void tx4939ide_tf_load_fixup(ide_drive_t *drive) tx4939ide_writew(sysctl, base, TX4939IDE_Sys_Ctl); } -static void tx4939ide_tf_load(ide_drive_t *drive, struct ide_cmd *cmd) +static void tx4939ide_tf_load(ide_drive_t *drive, struct ide_taskfile *tf, + u8 valid) { - ide_tf_load(drive, cmd); + ide_tf_load(drive, tf, valid); - if (cmd->valid.out.tf & IDE_VALID_DEVICE) + if (valid & IDE_VALID_DEVICE) tx4939ide_tf_load_fixup(drive); } diff --git a/include/linux/ide.h b/include/linux/ide.h index e2ea38df26bc..0ba1c6ab97f8 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -624,7 +624,7 @@ struct ide_tp_ops { void (*write_devctl)(struct hwif_s *, u8); void (*dev_select)(ide_drive_t *); - void (*tf_load)(ide_drive_t *, struct ide_cmd *); + void (*tf_load)(ide_drive_t *, struct ide_taskfile *, u8); void (*tf_read)(ide_drive_t *, struct ide_cmd *); void (*input_data)(ide_drive_t *, struct ide_cmd *, @@ -1132,7 +1132,7 @@ u8 ide_read_altstatus(ide_hwif_t *); void ide_write_devctl(ide_hwif_t *, u8); void ide_dev_select(ide_drive_t *); -void ide_tf_load(ide_drive_t *, struct ide_cmd *); +void ide_tf_load(ide_drive_t *, struct ide_taskfile *, u8); void ide_tf_read(ide_drive_t *, struct ide_cmd *); void ide_input_data(ide_drive_t *, struct ide_cmd *, void *, unsigned int); -- cgit v1.2.3 From 3153c26b54230d025c6d536e8d3015def4524906 Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Wed, 8 Apr 2009 14:13:03 +0200 Subject: ide: refactor tf_read() method Simplify tf_read() method, making it deal only with 'struct ide_taskfile' and the validity flags that the upper layer passes, and factoring out the code that deals with the high order bytes into ide_tf_readback() to be called from the only two functions interested, ide_complete_cmd() and ide_dump_sector(). This should stop the needless code duplication in this method and so make it about twice smaller than it was; along with simplifying the setup for the method call, this should save both time and space... 
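To make the resulting contract concrete, a minimal sketch of what a host driver's tf_read() method is left to do after this conversion is shown below; it is an illustration only, not code from the patch, and the function name example_tf_read() is made up. The method reads just the one register bank it is handed, guarded by the IDE_VALID_* bits; switching between the low-order and HOB bytes is the caller's job, handled by ide_tf_readback() in the ide-taskfile.c hunk further down.

#include <linux/ide.h>
#include <asm/io.h>

/* Hypothetical host driver method using the refactored prototype: it only
 * reads back the taskfile bank it is given, honouring the validity mask. */
static void example_tf_read(ide_drive_t *drive, struct ide_taskfile *tf,
			    u8 valid)
{
	struct ide_io_ports *io_ports = &drive->hwif->io_ports;

	if (valid & IDE_VALID_ERROR)
		tf->error = inb(io_ports->feature_addr);
	if (valid & IDE_VALID_NSECT)
		tf->nsect = inb(io_ports->nsect_addr);
	if (valid & IDE_VALID_LBAL)
		tf->lbal = inb(io_ports->lbal_addr);
	if (valid & IDE_VALID_LBAM)
		tf->lbam = inb(io_ports->lbam_addr);
	if (valid & IDE_VALID_LBAH)
		tf->lbah = inb(io_ports->lbah_addr);
	if (valid & IDE_VALID_DEVICE)
		tf->device = inb(io_ports->device_addr);
}

For an LBA48 command the core calls the method twice, once for cmd->tf and once for cmd->hob, toggling the HOB bit in the device control register in between, exactly as the new ide_tf_readback() helper below does.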
Signed-off-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-atapi.c | 21 ++++++++------------- drivers/ide/ide-io-std.c | 25 +------------------------ drivers/ide/ide-io.c | 2 +- drivers/ide/ide-iops.c | 9 +++------ drivers/ide/ide-lib.c | 2 +- drivers/ide/ide-probe.c | 9 +++------ drivers/ide/ide-taskfile.c | 17 +++++++++++++++++ drivers/ide/ns87415.c | 26 ++------------------------ drivers/ide/scc_pata.c | 25 +------------------------ include/linux/ide.h | 5 +++-- 10 files changed, 40 insertions(+), 101 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index a359323d8ffe..7201b176d75b 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -254,15 +254,13 @@ EXPORT_SYMBOL_GPL(ide_cd_get_xferlen); void ide_read_bcount_and_ireason(ide_drive_t *drive, u16 *bcount, u8 *ireason) { - struct ide_cmd cmd; + struct ide_taskfile tf; - memset(&cmd, 0, sizeof(cmd)); - cmd.valid.in.tf = IDE_VALID_LBAH | IDE_VALID_LBAM | IDE_VALID_NSECT; + drive->hwif->tp_ops->tf_read(drive, &tf, IDE_VALID_NSECT | + IDE_VALID_LBAM | IDE_VALID_LBAH); - drive->hwif->tp_ops->tf_read(drive, &cmd); - - *bcount = (cmd.tf.lbah << 8) | cmd.tf.lbam; - *ireason = cmd.tf.nsect & 3; + *bcount = (tf.lbah << 8) | tf.lbam; + *ireason = tf.nsect & 3; } EXPORT_SYMBOL_GPL(ide_read_bcount_and_ireason); @@ -452,14 +450,11 @@ static void ide_init_packet_cmd(struct ide_cmd *cmd, u8 valid_tf, static u8 ide_read_ireason(ide_drive_t *drive) { - struct ide_cmd cmd; - - memset(&cmd, 0, sizeof(cmd)); - cmd.valid.in.tf = IDE_VALID_NSECT; + struct ide_taskfile tf; - drive->hwif->tp_ops->tf_read(drive, &cmd); + drive->hwif->tp_ops->tf_read(drive, &tf, IDE_VALID_NSECT); - return cmd.tf.nsect & 3; + return tf.nsect & 3; } static u8 ide_wait_ireason(ide_drive_t *drive, u8 ireason) diff --git a/drivers/ide/ide-io-std.c b/drivers/ide/ide-io-std.c index 481e221b233d..46721c454518 100644 --- a/drivers/ide/ide-io-std.c +++ b/drivers/ide/ide-io-std.c @@ -112,13 +112,11 @@ void ide_tf_load(ide_drive_t *drive, struct ide_taskfile *tf, u8 valid) } EXPORT_SYMBOL_GPL(ide_tf_load); -void ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) +void ide_tf_read(ide_drive_t *drive, struct ide_taskfile *tf, u8 valid) { ide_hwif_t *hwif = drive->hwif; struct ide_io_ports *io_ports = &hwif->io_ports; - struct ide_taskfile *tf = &cmd->tf; u8 (*tf_inb)(unsigned long port); - u8 valid = cmd->valid.in.tf; u8 mmio = (hwif->host_flags & IDE_HFLAG_MMIO) ? 
1 : 0; if (mmio) @@ -126,9 +124,6 @@ void ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) else tf_inb = ide_inb; - /* be sure we're looking at the low order bits */ - hwif->tp_ops->write_devctl(hwif, ATA_DEVCTL_OBS); - if (valid & IDE_VALID_ERROR) tf->error = tf_inb(io_ports->feature_addr); if (valid & IDE_VALID_NSECT) @@ -141,24 +136,6 @@ void ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) tf->lbah = tf_inb(io_ports->lbah_addr); if (valid & IDE_VALID_DEVICE) tf->device = tf_inb(io_ports->device_addr); - - if (cmd->tf_flags & IDE_TFLAG_LBA48) { - hwif->tp_ops->write_devctl(hwif, ATA_HOB | ATA_DEVCTL_OBS); - - tf = &cmd->hob; - valid = cmd->valid.in.hob; - - if (valid & IDE_VALID_ERROR) - tf->error = tf_inb(io_ports->feature_addr); - if (valid & IDE_VALID_NSECT) - tf->nsect = tf_inb(io_ports->nsect_addr); - if (valid & IDE_VALID_LBAL) - tf->lbal = tf_inb(io_ports->lbal_addr); - if (valid & IDE_VALID_LBAM) - tf->lbam = tf_inb(io_ports->lbam_addr); - if (valid & IDE_VALID_LBAH) - tf->lbah = tf_inb(io_ports->lbah_addr); - } } EXPORT_SYMBOL_GPL(ide_tf_read); diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index e71c72be7622..2ae02b8d7f8e 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -90,7 +90,7 @@ void ide_complete_cmd(ide_drive_t *drive, struct ide_cmd *cmd, u8 stat, u8 err) cmd->hob.data = data[1]; } - tp_ops->tf_read(drive, cmd); + ide_tf_readback(drive, cmd); if ((cmd->tf_flags & IDE_TFLAG_CUSTOM_HANDLER) && tf_cmd == ATA_CMD_IDLEIMMEDIATE) { diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c index 6f1ed427a484..c19a221b1e18 100644 --- a/drivers/ide/ide-iops.c +++ b/drivers/ide/ide-iops.c @@ -37,14 +37,11 @@ void SELECT_MASK(ide_drive_t *drive, int mask) u8 ide_read_error(ide_drive_t *drive) { - struct ide_cmd cmd; - - memset(&cmd, 0, sizeof(cmd)); - cmd.valid.in.tf = IDE_VALID_ERROR; + struct ide_taskfile tf; - drive->hwif->tp_ops->tf_read(drive, &cmd); + drive->hwif->tp_ops->tf_read(drive, &tf, IDE_VALID_ERROR); - return cmd.tf.error; + return tf.error; } EXPORT_SYMBOL_GPL(ide_read_error); diff --git a/drivers/ide/ide-lib.c b/drivers/ide/ide-lib.c index 6857e6aaf20d..56ff8c46c7d1 100644 --- a/drivers/ide/ide-lib.c +++ b/drivers/ide/ide-lib.c @@ -79,7 +79,7 @@ static void ide_dump_sector(ide_drive_t *drive) } else cmd.valid.in.tf = IDE_VALID_LBA | IDE_VALID_DEVICE; - drive->hwif->tp_ops->tf_read(drive, &cmd); + ide_tf_readback(drive, &cmd); if (lba48 || (tf->device & ATA_LBA)) printk(KERN_CONT ", LBAsect=%llu", diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index 44d7816c1fe9..7f264ed1141b 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -335,14 +335,11 @@ int ide_busy_sleep(ide_hwif_t *hwif, unsigned long timeout, int altstatus) static u8 ide_read_device(ide_drive_t *drive) { - struct ide_cmd cmd; + struct ide_taskfile tf; - memset(&cmd, 0, sizeof(cmd)); - cmd.valid.in.tf = IDE_VALID_DEVICE; + drive->hwif->tp_ops->tf_read(drive, &tf, IDE_VALID_DEVICE); - drive->hwif->tp_ops->tf_read(drive, &cmd); - - return cmd.tf.device; + return tf.device; } /** diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index b1806ed46175..4aa6223c11be 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -23,6 +23,23 @@ #include #include +void ide_tf_readback(ide_drive_t *drive, struct ide_cmd *cmd) +{ + ide_hwif_t *hwif = drive->hwif; + const struct ide_tp_ops *tp_ops = hwif->tp_ops; + + /* Be sure we're looking at the low order bytes */ + tp_ops->write_devctl(hwif, ATA_DEVCTL_OBS); + + 
tp_ops->tf_read(drive, &cmd->tf, cmd->valid.in.tf); + + if (cmd->tf_flags & IDE_TFLAG_LBA48) { + tp_ops->write_devctl(hwif, ATA_HOB | ATA_DEVCTL_OBS); + + tp_ops->tf_read(drive, &cmd->hob, cmd->valid.in.hob); + } +} + void ide_tf_dump(const char *s, struct ide_cmd *cmd) { #ifdef DEBUG diff --git a/drivers/ide/ns87415.c b/drivers/ide/ns87415.c index f1305f4d2be7..95327a2c2422 100644 --- a/drivers/ide/ns87415.c +++ b/drivers/ide/ns87415.c @@ -61,14 +61,10 @@ static u8 superio_dma_sff_read_status(ide_hwif_t *hwif) return superio_ide_inb(hwif->dma_base + ATA_DMA_STATUS); } -static void superio_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) +static void superio_tf_read(ide_drive_t *drive, struct ide_taskfile *tf, + u8 valid) { struct ide_io_ports *io_ports = &drive->hwif->io_ports; - struct ide_taskfile *tf = &cmd->tf; - u8 valid = cmd->valid.in.tf; - - /* be sure we're looking at the low order bits */ - ide_write_devctl(hwif, ATA_DEVCTL_OBS); if (valid & IDE_VALID_ERROR) tf->error = inb(io_ports->feature_addr); @@ -82,24 +78,6 @@ static void superio_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) tf->lbah = inb(io_ports->lbah_addr); if (valid & IDE_VALID_DEVICE) tf->device = superio_ide_inb(io_ports->device_addr); - - if (cmd->tf_flags & IDE_TFLAG_LBA48) { - ide_write_devctl(hwif, ATA_HOB | ATA_DEVCTL_OBS); - - tf = &cmd->hob; - valid = cmd->valid.in.hob; - - if (valid & IDE_VALID_ERROR) - tf->error = inb(io_ports->feature_addr); - if (valid & IDE_VALID_NSECT) - tf->nsect = inb(io_ports->nsect_addr); - if (valid & IDE_VALID_LBAL) - tf->lbal = inb(io_ports->lbal_addr); - if (valid & IDE_VALID_LBAM) - tf->lbam = inb(io_ports->lbam_addr); - if (valid & IDE_VALID_LBAH) - tf->lbah = inb(io_ports->lbah_addr); - } } static void ns87415_dev_select(ide_drive_t *drive); diff --git a/drivers/ide/scc_pata.c b/drivers/ide/scc_pata.c index 5ecb70cf29dc..5be41f25204f 100644 --- a/drivers/ide/scc_pata.c +++ b/drivers/ide/scc_pata.c @@ -663,14 +663,9 @@ static void scc_tf_load(ide_drive_t *drive, struct ide_taskfile *tf, u8 valid) scc_ide_outb(tf->device, io_ports->device_addr); } -static void scc_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) +static void scc_tf_read(ide_drive_t *drive, struct ide_taskfile *tf, u8 valid) { struct ide_io_ports *io_ports = &drive->hwif->io_ports; - struct ide_taskfile *tf = &cmd->tf; - u8 valid = cmd->valid.in.tf; - - /* be sure we're looking at the low order bits */ - scc_write_devctl(hwif, ATA_DEVCTL_OBS); if (valid & IDE_VALID_ERROR) tf->error = scc_ide_inb(io_ports->feature_addr); @@ -684,24 +679,6 @@ static void scc_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) tf->lbah = scc_ide_inb(io_ports->lbah_addr); if (valid & IDE_VALID_DEVICE) tf->device = scc_ide_inb(io_ports->device_addr); - - if (cmd->tf_flags & IDE_TFLAG_LBA48) { - scc_write_devctl(hwif, ATA_HOB | ATA_DEVCTL_OBS); - - tf = &cmd->hob; - valid = cmd->valid.in.hob; - - if (valid & IDE_VALID_ERROR) - tf->error = scc_ide_inb(io_ports->feature_addr); - if (valid & IDE_VALID_NSECT) - tf->nsect = scc_ide_inb(io_ports->nsect_addr); - if (valid & IDE_VALID_LBAL) - tf->lbal = scc_ide_inb(io_ports->lbal_addr); - if (valid & IDE_VALID_LBAM) - tf->lbam = scc_ide_inb(io_ports->lbam_addr); - if (valid & IDE_VALID_LBAH) - tf->lbah = scc_ide_inb(io_ports->lbah_addr); - } } static void scc_input_data(ide_drive_t *drive, struct ide_cmd *cmd, diff --git a/include/linux/ide.h b/include/linux/ide.h index 0ba1c6ab97f8..ff65fffb078f 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -625,7 +625,7 @@ struct ide_tp_ops 
{ void (*dev_select)(ide_drive_t *); void (*tf_load)(ide_drive_t *, struct ide_taskfile *, u8); - void (*tf_read)(ide_drive_t *, struct ide_cmd *); + void (*tf_read)(ide_drive_t *, struct ide_taskfile *, u8); void (*input_data)(ide_drive_t *, struct ide_cmd *, void *, unsigned int); @@ -1124,6 +1124,7 @@ extern int ide_devset_execute(ide_drive_t *drive, void ide_complete_cmd(ide_drive_t *, struct ide_cmd *, u8, u8); int ide_complete_rq(ide_drive_t *, int, unsigned int); +void ide_tf_readback(ide_drive_t *drive, struct ide_cmd *cmd); void ide_tf_dump(const char *, struct ide_cmd *); void ide_exec_command(ide_hwif_t *, u8); @@ -1133,7 +1134,7 @@ void ide_write_devctl(ide_hwif_t *, u8); void ide_dev_select(ide_drive_t *); void ide_tf_load(ide_drive_t *, struct ide_taskfile *, u8); -void ide_tf_read(ide_drive_t *, struct ide_cmd *); +void ide_tf_read(ide_drive_t *, struct ide_taskfile *, u8); void ide_input_data(ide_drive_t *, struct ide_cmd *, void *, unsigned int); void ide_output_data(ide_drive_t *, struct ide_cmd *, void *, unsigned int); -- cgit v1.2.3 From 7831d56b0a3544cbb6f82f76c34ca95e24d5b676 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Wed, 8 Apr 2009 20:13:16 +0100 Subject: tty: MAX3100 Thou shalt remember to use 'git add' or errors shall be visited on your downloads and there shall be wrath from on list and much gnashing of teeth. Thou shalt remember to use git status or there shall be catcalls and much embarrasment shall come to pass. Signed-off-by: Alan "I'm hiding" Cox Signed-off-by: Linus Torvalds --- drivers/serial/max3100.c | 927 +++++++++++++++++++++++++++++++++++++++++ include/linux/serial_max3100.h | 52 +++ 2 files changed, 979 insertions(+) create mode 100644 drivers/serial/max3100.c create mode 100644 include/linux/serial_max3100.h (limited to 'include/linux') diff --git a/drivers/serial/max3100.c b/drivers/serial/max3100.c new file mode 100644 index 000000000000..9fd33e5622bd --- /dev/null +++ b/drivers/serial/max3100.c @@ -0,0 +1,927 @@ +/* + * + * Copyright (C) 2008 Christian Pellegrin + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * + * Notes: the MAX3100 doesn't provide an interrupt on CTS so we have + * to use polling for flow control. TX empty IRQ is unusable, since + * writing conf clears FIFO buffer and we cannot have this interrupt + * always asking us for attention. 
+ * + * Example platform data: + + static struct plat_max3100 max3100_plat_data = { + .loopback = 0, + .crystal = 0, + .poll_time = 100, + }; + + static struct spi_board_info spi_board_info[] = { + { + .modalias = "max3100", + .platform_data = &max3100_plat_data, + .irq = IRQ_EINT12, + .max_speed_hz = 5*1000*1000, + .chip_select = 0, + }, + }; + + * The initial minor number is 209 in the low-density serial port: + * mknod /dev/ttyMAX0 c 204 209 + */ + +#define MAX3100_MAJOR 204 +#define MAX3100_MINOR 209 +/* 4 MAX3100s should be enough for everyone */ +#define MAX_MAX3100 4 + +#include +#include +#include +#include +#include +#include + +#include + +#define MAX3100_C (1<<14) +#define MAX3100_D (0<<14) +#define MAX3100_W (1<<15) +#define MAX3100_RX (0<<15) + +#define MAX3100_WC (MAX3100_W | MAX3100_C) +#define MAX3100_RC (MAX3100_RX | MAX3100_C) +#define MAX3100_WD (MAX3100_W | MAX3100_D) +#define MAX3100_RD (MAX3100_RX | MAX3100_D) +#define MAX3100_CMD (3 << 14) + +#define MAX3100_T (1<<14) +#define MAX3100_R (1<<15) + +#define MAX3100_FEN (1<<13) +#define MAX3100_SHDN (1<<12) +#define MAX3100_TM (1<<11) +#define MAX3100_RM (1<<10) +#define MAX3100_PM (1<<9) +#define MAX3100_RAM (1<<8) +#define MAX3100_IR (1<<7) +#define MAX3100_ST (1<<6) +#define MAX3100_PE (1<<5) +#define MAX3100_L (1<<4) +#define MAX3100_BAUD (0xf) + +#define MAX3100_TE (1<<10) +#define MAX3100_RAFE (1<<10) +#define MAX3100_RTS (1<<9) +#define MAX3100_CTS (1<<9) +#define MAX3100_PT (1<<8) +#define MAX3100_DATA (0xff) + +#define MAX3100_RT (MAX3100_R | MAX3100_T) +#define MAX3100_RTC (MAX3100_RT | MAX3100_CTS | MAX3100_RAFE) + +/* the following simulate a status reg for ignore_status_mask */ +#define MAX3100_STATUS_PE 1 +#define MAX3100_STATUS_FE 2 +#define MAX3100_STATUS_OE 4 + +struct max3100_port { + struct uart_port port; + struct spi_device *spi; + + int cts; /* last CTS received for flow ctrl */ + int tx_empty; /* last TX empty bit */ + + spinlock_t conf_lock; /* shared data */ + int conf_commit; /* need to make changes */ + int conf; /* configuration for the MAX31000 + * (bits 0-7, bits 8-11 are irqs) */ + int rts_commit; /* need to change rts */ + int rts; /* rts status */ + int baud; /* current baud rate */ + + int parity; /* keeps track if we should send parity */ +#define MAX3100_PARITY_ON 1 +#define MAX3100_PARITY_ODD 2 +#define MAX3100_7BIT 4 + int rx_enabled; /* if we should rx chars */ + + int irq; /* irq assigned to the max3100 */ + + int minor; /* minor number */ + int crystal; /* 1 if 3.6864Mhz crystal 0 for 1.8432 */ + int loopback; /* 1 if we are in loopback mode */ + + /* for handling irqs: need workqueue since we do spi_sync */ + struct workqueue_struct *workqueue; + struct work_struct work; + /* set to 1 to make the workhandler exit as soon as possible */ + int force_end_work; + /* need to know we are suspending to avoid deadlock on workqueue */ + int suspending; + + /* hook for suspending MAX3100 via dedicated pin */ + void (*max3100_hw_suspend) (int suspend); + + /* poll time (in ms) for ctrl lines */ + int poll_time; + /* and its timer */ + struct timer_list timer; +}; + +static struct max3100_port *max3100s[MAX_MAX3100]; /* the chips */ +static DEFINE_MUTEX(max3100s_lock); /* race on probe */ + +static int max3100_do_parity(struct max3100_port *s, u16 c) +{ + int parity; + + if (s->parity & MAX3100_PARITY_ODD) + parity = 1; + else + parity = 0; + + if (s->parity & MAX3100_7BIT) + c &= 0x7f; + else + c &= 0xff; + + parity = parity ^ (hweight8(c) & 1); + return parity; +} + +static int 
max3100_check_parity(struct max3100_port *s, u16 c) +{ + return max3100_do_parity(s, c) == ((c >> 8) & 1); +} + +static void max3100_calc_parity(struct max3100_port *s, u16 *c) +{ + if (s->parity & MAX3100_7BIT) + *c &= 0x7f; + else + *c &= 0xff; + + if (s->parity & MAX3100_PARITY_ON) + *c |= max3100_do_parity(s, *c) << 8; +} + +static void max3100_work(struct work_struct *w); + +static void max3100_dowork(struct max3100_port *s) +{ + if (!s->force_end_work && !work_pending(&s->work) && + !freezing(current) && !s->suspending) + queue_work(s->workqueue, &s->work); +} + +static void max3100_timeout(unsigned long data) +{ + struct max3100_port *s = (struct max3100_port *)data; + + if (s->port.info) { + max3100_dowork(s); + mod_timer(&s->timer, jiffies + s->poll_time); + } +} + +static int max3100_sr(struct max3100_port *s, u16 tx, u16 *rx) +{ + struct spi_message message; + u16 etx, erx; + int status; + struct spi_transfer tran = { + .tx_buf = &etx, + .rx_buf = &erx, + .len = 2, + }; + + etx = cpu_to_be16(tx); + spi_message_init(&message); + spi_message_add_tail(&tran, &message); + status = spi_sync(s->spi, &message); + if (status) { + dev_warn(&s->spi->dev, "error while calling spi_sync\n"); + return -EIO; + } + *rx = be16_to_cpu(erx); + s->tx_empty = (*rx & MAX3100_T) > 0; + dev_dbg(&s->spi->dev, "%04x - %04x\n", tx, *rx); + return 0; +} + +static int max3100_handlerx(struct max3100_port *s, u16 rx) +{ + unsigned int ch, flg, status = 0; + int ret = 0, cts; + + if (rx & MAX3100_R && s->rx_enabled) { + dev_dbg(&s->spi->dev, "%s\n", __func__); + ch = rx & (s->parity & MAX3100_7BIT ? 0x7f : 0xff); + if (rx & MAX3100_RAFE) { + s->port.icount.frame++; + flg = TTY_FRAME; + status |= MAX3100_STATUS_FE; + } else { + if (s->parity & MAX3100_PARITY_ON) { + if (max3100_check_parity(s, rx)) { + s->port.icount.rx++; + flg = TTY_NORMAL; + } else { + s->port.icount.parity++; + flg = TTY_PARITY; + status |= MAX3100_STATUS_PE; + } + } else { + s->port.icount.rx++; + flg = TTY_NORMAL; + } + } + uart_insert_char(&s->port, status, MAX3100_STATUS_OE, ch, flg); + ret = 1; + } + + cts = (rx & MAX3100_CTS) > 0; + if (s->cts != cts) { + s->cts = cts; + uart_handle_cts_change(&s->port, cts ? TIOCM_CTS : 0); + } + + return ret; +} + +static void max3100_work(struct work_struct *w) +{ + struct max3100_port *s = container_of(w, struct max3100_port, work); + int rxchars; + u16 tx, rx; + int conf, cconf, rts, crts; + struct circ_buf *xmit = &s->port.info->xmit; + + dev_dbg(&s->spi->dev, "%s\n", __func__); + + rxchars = 0; + do { + spin_lock(&s->conf_lock); + conf = s->conf; + cconf = s->conf_commit; + s->conf_commit = 0; + rts = s->rts; + crts = s->rts_commit; + s->rts_commit = 0; + spin_unlock(&s->conf_lock); + if (cconf) + max3100_sr(s, MAX3100_WC | conf, &rx); + if (crts) { + max3100_sr(s, MAX3100_WD | MAX3100_TE | + (s->rts ? MAX3100_RTS : 0), &rx); + rxchars += max3100_handlerx(s, rx); + } + + max3100_sr(s, MAX3100_RD, &rx); + rxchars += max3100_handlerx(s, rx); + + if (rx & MAX3100_T) { + tx = 0xffff; + if (s->port.x_char) { + tx = s->port.x_char; + s->port.icount.tx++; + s->port.x_char = 0; + } else if (!uart_circ_empty(xmit) && + !uart_tx_stopped(&s->port)) { + tx = xmit->buf[xmit->tail]; + xmit->tail = (xmit->tail + 1) & + (UART_XMIT_SIZE - 1); + s->port.icount.tx++; + } + if (tx != 0xffff) { + max3100_calc_parity(s, &tx); + tx |= MAX3100_WD | (s->rts ? 
MAX3100_RTS : 0); + max3100_sr(s, tx, &rx); + rxchars += max3100_handlerx(s, rx); + } + } + + if (rxchars > 16 && s->port.info->port.tty != NULL) { + tty_flip_buffer_push(s->port.info->port.tty); + rxchars = 0; + } + if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS) + uart_write_wakeup(&s->port); + + } while (!s->force_end_work && + !freezing(current) && + ((rx & MAX3100_R) || + (!uart_circ_empty(xmit) && + !uart_tx_stopped(&s->port)))); + + if (rxchars > 0 && s->port.info->port.tty != NULL) + tty_flip_buffer_push(s->port.info->port.tty); +} + +static irqreturn_t max3100_irq(int irqno, void *dev_id) +{ + struct max3100_port *s = dev_id; + + dev_dbg(&s->spi->dev, "%s\n", __func__); + + max3100_dowork(s); + return IRQ_HANDLED; +} + +static void max3100_enable_ms(struct uart_port *port) +{ + struct max3100_port *s = container_of(port, + struct max3100_port, + port); + + if (s->poll_time > 0) + mod_timer(&s->timer, jiffies); + dev_dbg(&s->spi->dev, "%s\n", __func__); +} + +static void max3100_start_tx(struct uart_port *port) +{ + struct max3100_port *s = container_of(port, + struct max3100_port, + port); + + dev_dbg(&s->spi->dev, "%s\n", __func__); + + max3100_dowork(s); +} + +static void max3100_stop_rx(struct uart_port *port) +{ + struct max3100_port *s = container_of(port, + struct max3100_port, + port); + + dev_dbg(&s->spi->dev, "%s\n", __func__); + + s->rx_enabled = 0; + spin_lock(&s->conf_lock); + s->conf &= ~MAX3100_RM; + s->conf_commit = 1; + spin_unlock(&s->conf_lock); + max3100_dowork(s); +} + +static unsigned int max3100_tx_empty(struct uart_port *port) +{ + struct max3100_port *s = container_of(port, + struct max3100_port, + port); + + dev_dbg(&s->spi->dev, "%s\n", __func__); + + /* may not be truly up-to-date */ + max3100_dowork(s); + return s->tx_empty; +} + +static unsigned int max3100_get_mctrl(struct uart_port *port) +{ + struct max3100_port *s = container_of(port, + struct max3100_port, + port); + + dev_dbg(&s->spi->dev, "%s\n", __func__); + + /* may not be truly up-to-date */ + max3100_dowork(s); + /* always assert DCD and DSR since these lines are not wired */ + return (s->cts ? 
TIOCM_CTS : 0) | TIOCM_DSR | TIOCM_CAR; +} + +static void max3100_set_mctrl(struct uart_port *port, unsigned int mctrl) +{ + struct max3100_port *s = container_of(port, + struct max3100_port, + port); + int rts; + + dev_dbg(&s->spi->dev, "%s\n", __func__); + + rts = (mctrl & TIOCM_RTS) > 0; + + spin_lock(&s->conf_lock); + if (s->rts != rts) { + s->rts = rts; + s->rts_commit = 1; + max3100_dowork(s); + } + spin_unlock(&s->conf_lock); +} + +static void +max3100_set_termios(struct uart_port *port, struct ktermios *termios, + struct ktermios *old) +{ + struct max3100_port *s = container_of(port, + struct max3100_port, + port); + int baud = 0; + unsigned cflag; + u32 param_new, param_mask, parity = 0; + struct tty_struct *tty = s->port.info->port.tty; + + dev_dbg(&s->spi->dev, "%s\n", __func__); + if (!tty) + return; + + cflag = termios->c_cflag; + param_new = 0; + param_mask = 0; + + baud = tty_get_baud_rate(tty); + param_new = s->conf & MAX3100_BAUD; + switch (baud) { + case 300: + if (s->crystal) + baud = s->baud; + else + param_new = 15; + break; + case 600: + param_new = 14 + s->crystal; + break; + case 1200: + param_new = 13 + s->crystal; + break; + case 2400: + param_new = 12 + s->crystal; + break; + case 4800: + param_new = 11 + s->crystal; + break; + case 9600: + param_new = 10 + s->crystal; + break; + case 19200: + param_new = 9 + s->crystal; + break; + case 38400: + param_new = 8 + s->crystal; + break; + case 57600: + param_new = 1 + s->crystal; + break; + case 115200: + param_new = 0 + s->crystal; + break; + case 230400: + if (s->crystal) + param_new = 0; + else + baud = s->baud; + break; + default: + baud = s->baud; + } + tty_encode_baud_rate(tty, baud, baud); + s->baud = baud; + param_mask |= MAX3100_BAUD; + + if ((cflag & CSIZE) == CS8) { + param_new &= ~MAX3100_L; + parity &= ~MAX3100_7BIT; + } else { + param_new |= MAX3100_L; + parity |= MAX3100_7BIT; + cflag = (cflag & ~CSIZE) | CS7; + } + param_mask |= MAX3100_L; + + if (cflag & CSTOPB) + param_new |= MAX3100_ST; + else + param_new &= ~MAX3100_ST; + param_mask |= MAX3100_ST; + + if (cflag & PARENB) { + param_new |= MAX3100_PE; + parity |= MAX3100_PARITY_ON; + } else { + param_new &= ~MAX3100_PE; + parity &= ~MAX3100_PARITY_ON; + } + param_mask |= MAX3100_PE; + + if (cflag & PARODD) + parity |= MAX3100_PARITY_ODD; + else + parity &= ~MAX3100_PARITY_ODD; + + /* mask termios capabilities we don't support */ + cflag &= ~CMSPAR; + termios->c_cflag = cflag; + + s->port.ignore_status_mask = 0; + if (termios->c_iflag & IGNPAR) + s->port.ignore_status_mask |= + MAX3100_STATUS_PE | MAX3100_STATUS_FE | + MAX3100_STATUS_OE; + + /* we are sending char from a workqueue so enable */ + s->port.info->port.tty->low_latency = 1; + + if (s->poll_time > 0) + del_timer_sync(&s->timer); + + uart_update_timeout(port, termios->c_cflag, baud); + + spin_lock(&s->conf_lock); + s->conf = (s->conf & ~param_mask) | (param_new & param_mask); + s->conf_commit = 1; + s->parity = parity; + spin_unlock(&s->conf_lock); + max3100_dowork(s); + + if (UART_ENABLE_MS(&s->port, termios->c_cflag)) + max3100_enable_ms(&s->port); +} + +static void max3100_shutdown(struct uart_port *port) +{ + struct max3100_port *s = container_of(port, + struct max3100_port, + port); + + dev_dbg(&s->spi->dev, "%s\n", __func__); + + if (s->suspending) + return; + + s->force_end_work = 1; + + if (s->poll_time > 0) + del_timer_sync(&s->timer); + + if (s->workqueue) { + flush_workqueue(s->workqueue); + destroy_workqueue(s->workqueue); + s->workqueue = NULL; + } + if (s->irq) + 
free_irq(s->irq, s); + + /* set shutdown mode to save power */ + if (s->max3100_hw_suspend) + s->max3100_hw_suspend(1); + else { + u16 tx, rx; + + tx = MAX3100_WC | MAX3100_SHDN; + max3100_sr(s, tx, &rx); + } +} + +static int max3100_startup(struct uart_port *port) +{ + struct max3100_port *s = container_of(port, + struct max3100_port, + port); + char b[12]; + + dev_dbg(&s->spi->dev, "%s\n", __func__); + + s->conf = MAX3100_RM; + s->baud = s->crystal ? 230400 : 115200; + s->rx_enabled = 1; + + if (s->suspending) + return 0; + + s->force_end_work = 0; + s->parity = 0; + s->rts = 0; + + sprintf(b, "max3100-%d", s->minor); + s->workqueue = create_freezeable_workqueue(b); + if (!s->workqueue) { + dev_warn(&s->spi->dev, "cannot create workqueue\n"); + return -EBUSY; + } + INIT_WORK(&s->work, max3100_work); + + if (request_irq(s->irq, max3100_irq, + IRQF_TRIGGER_FALLING, "max3100", s) < 0) { + dev_warn(&s->spi->dev, "cannot allocate irq %d\n", s->irq); + s->irq = 0; + destroy_workqueue(s->workqueue); + s->workqueue = NULL; + return -EBUSY; + } + + if (s->loopback) { + u16 tx, rx; + tx = 0x4001; + max3100_sr(s, tx, &rx); + } + + if (s->max3100_hw_suspend) + s->max3100_hw_suspend(0); + s->conf_commit = 1; + max3100_dowork(s); + /* wait for clock to settle */ + msleep(50); + + max3100_enable_ms(&s->port); + + return 0; +} + +static const char *max3100_type(struct uart_port *port) +{ + struct max3100_port *s = container_of(port, + struct max3100_port, + port); + + dev_dbg(&s->spi->dev, "%s\n", __func__); + + return s->port.type == PORT_MAX3100 ? "MAX3100" : NULL; +} + +static void max3100_release_port(struct uart_port *port) +{ + struct max3100_port *s = container_of(port, + struct max3100_port, + port); + + dev_dbg(&s->spi->dev, "%s\n", __func__); +} + +static void max3100_config_port(struct uart_port *port, int flags) +{ + struct max3100_port *s = container_of(port, + struct max3100_port, + port); + + dev_dbg(&s->spi->dev, "%s\n", __func__); + + if (flags & UART_CONFIG_TYPE) + s->port.type = PORT_MAX3100; +} + +static int max3100_verify_port(struct uart_port *port, + struct serial_struct *ser) +{ + struct max3100_port *s = container_of(port, + struct max3100_port, + port); + int ret = -EINVAL; + + dev_dbg(&s->spi->dev, "%s\n", __func__); + + if (ser->type == PORT_UNKNOWN || ser->type == PORT_MAX3100) + ret = 0; + return ret; +} + +static void max3100_stop_tx(struct uart_port *port) +{ + struct max3100_port *s = container_of(port, + struct max3100_port, + port); + + dev_dbg(&s->spi->dev, "%s\n", __func__); +} + +static int max3100_request_port(struct uart_port *port) +{ + struct max3100_port *s = container_of(port, + struct max3100_port, + port); + + dev_dbg(&s->spi->dev, "%s\n", __func__); + return 0; +} + +static void max3100_break_ctl(struct uart_port *port, int break_state) +{ + struct max3100_port *s = container_of(port, + struct max3100_port, + port); + + dev_dbg(&s->spi->dev, "%s\n", __func__); +} + +static struct uart_ops max3100_ops = { + .tx_empty = max3100_tx_empty, + .set_mctrl = max3100_set_mctrl, + .get_mctrl = max3100_get_mctrl, + .stop_tx = max3100_stop_tx, + .start_tx = max3100_start_tx, + .stop_rx = max3100_stop_rx, + .enable_ms = max3100_enable_ms, + .break_ctl = max3100_break_ctl, + .startup = max3100_startup, + .shutdown = max3100_shutdown, + .set_termios = max3100_set_termios, + .type = max3100_type, + .release_port = max3100_release_port, + .request_port = max3100_request_port, + .config_port = max3100_config_port, + .verify_port = max3100_verify_port, +}; + +static struct 
uart_driver max3100_uart_driver = { + .owner = THIS_MODULE, + .driver_name = "ttyMAX", + .dev_name = "ttyMAX", + .major = MAX3100_MAJOR, + .minor = MAX3100_MINOR, + .nr = MAX_MAX3100, +}; +static int uart_driver_registered; + +static int __devinit max3100_probe(struct spi_device *spi) +{ + int i, retval; + struct plat_max3100 *pdata; + u16 tx, rx; + + mutex_lock(&max3100s_lock); + + if (!uart_driver_registered) { + uart_driver_registered = 1; + retval = uart_register_driver(&max3100_uart_driver); + if (retval) { + printk(KERN_ERR "Couldn't register max3100 uart driver\n"); + mutex_unlock(&max3100s_lock); + return retval; + } + } + + for (i = 0; i < MAX_MAX3100; i++) + if (!max3100s[i]) + break; + if (i == MAX_MAX3100) { + dev_warn(&spi->dev, "too many MAX3100 chips\n"); + mutex_unlock(&max3100s_lock); + return -ENOMEM; + } + + max3100s[i] = kzalloc(sizeof(struct max3100_port), GFP_KERNEL); + if (!max3100s[i]) { + dev_warn(&spi->dev, + "kmalloc for max3100 structure %d failed!\n", i); + mutex_unlock(&max3100s_lock); + return -ENOMEM; + } + max3100s[i]->spi = spi; + max3100s[i]->irq = spi->irq; + spin_lock_init(&max3100s[i]->conf_lock); + dev_set_drvdata(&spi->dev, max3100s[i]); + pdata = spi->dev.platform_data; + max3100s[i]->crystal = pdata->crystal; + max3100s[i]->loopback = pdata->loopback; + max3100s[i]->poll_time = pdata->poll_time * HZ / 1000; + if (pdata->poll_time > 0 && max3100s[i]->poll_time == 0) + max3100s[i]->poll_time = 1; + max3100s[i]->max3100_hw_suspend = pdata->max3100_hw_suspend; + max3100s[i]->minor = i; + init_timer(&max3100s[i]->timer); + max3100s[i]->timer.function = max3100_timeout; + max3100s[i]->timer.data = (unsigned long) max3100s[i]; + + dev_dbg(&spi->dev, "%s: adding port %d\n", __func__, i); + max3100s[i]->port.irq = max3100s[i]->irq; + max3100s[i]->port.uartclk = max3100s[i]->crystal ? 3686400 : 1843200; + max3100s[i]->port.fifosize = 16; + max3100s[i]->port.ops = &max3100_ops; + max3100s[i]->port.flags = UPF_SKIP_TEST | UPF_BOOT_AUTOCONF; + max3100s[i]->port.line = i; + max3100s[i]->port.type = PORT_MAX3100; + max3100s[i]->port.dev = &spi->dev; + retval = uart_add_one_port(&max3100_uart_driver, &max3100s[i]->port); + if (retval < 0) + dev_warn(&spi->dev, + "uart_add_one_port failed for line %d with error %d\n", + i, retval); + + /* set shutdown mode to save power. 
Will be woken-up on open */ + if (max3100s[i]->max3100_hw_suspend) + max3100s[i]->max3100_hw_suspend(1); + else { + tx = MAX3100_WC | MAX3100_SHDN; + max3100_sr(max3100s[i], tx, &rx); + } + mutex_unlock(&max3100s_lock); + return 0; +} + +static int __devexit max3100_remove(struct spi_device *spi) +{ + struct max3100_port *s = dev_get_drvdata(&spi->dev); + int i; + + mutex_lock(&max3100s_lock); + + /* find out the index for the chip we are removing */ + for (i = 0; i < MAX_MAX3100; i++) + if (max3100s[i] == s) + break; + + dev_dbg(&spi->dev, "%s: removing port %d\n", __func__, i); + uart_remove_one_port(&max3100_uart_driver, &max3100s[i]->port); + kfree(max3100s[i]); + max3100s[i] = NULL; + + /* check if this is the last chip we have */ + for (i = 0; i < MAX_MAX3100; i++) + if (max3100s[i]) { + mutex_unlock(&max3100s_lock); + return 0; + } + pr_debug("removing max3100 driver\n"); + uart_unregister_driver(&max3100_uart_driver); + + mutex_unlock(&max3100s_lock); + return 0; +} + +#ifdef CONFIG_PM + +static int max3100_suspend(struct spi_device *spi, pm_message_t state) +{ + struct max3100_port *s = dev_get_drvdata(&spi->dev); + + dev_dbg(&s->spi->dev, "%s\n", __func__); + + disable_irq(s->irq); + + s->suspending = 1; + uart_suspend_port(&max3100_uart_driver, &s->port); + + if (s->max3100_hw_suspend) + s->max3100_hw_suspend(1); + else { + /* no HW suspend, so do SW one */ + u16 tx, rx; + + tx = MAX3100_WC | MAX3100_SHDN; + max3100_sr(s, tx, &rx); + } + return 0; +} + +static int max3100_resume(struct spi_device *spi) +{ + struct max3100_port *s = dev_get_drvdata(&spi->dev); + + dev_dbg(&s->spi->dev, "%s\n", __func__); + + if (s->max3100_hw_suspend) + s->max3100_hw_suspend(0); + uart_resume_port(&max3100_uart_driver, &s->port); + s->suspending = 0; + + enable_irq(s->irq); + + s->conf_commit = 1; + if (s->workqueue) + max3100_dowork(s); + + return 0; +} + +#else +#define max3100_suspend NULL +#define max3100_resume NULL +#endif + +static struct spi_driver max3100_driver = { + .driver = { + .name = "max3100", + .bus = &spi_bus_type, + .owner = THIS_MODULE, + }, + + .probe = max3100_probe, + .remove = __devexit_p(max3100_remove), + .suspend = max3100_suspend, + .resume = max3100_resume, +}; + +static int __init max3100_init(void) +{ + return spi_register_driver(&max3100_driver); +} +module_init(max3100_init); + +static void __exit max3100_exit(void) +{ + spi_unregister_driver(&max3100_driver); +} +module_exit(max3100_exit); + +MODULE_DESCRIPTION("MAX3100 driver"); +MODULE_AUTHOR("Christian Pellegrin "); +MODULE_LICENSE("GPL"); diff --git a/include/linux/serial_max3100.h b/include/linux/serial_max3100.h new file mode 100644 index 000000000000..4976befb6aeb --- /dev/null +++ b/include/linux/serial_max3100.h @@ -0,0 +1,52 @@ +/* + * + * Copyright (C) 2007 Christian Pellegrin + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + + +#ifndef _LINUX_SERIAL_MAX3100_H +#define _LINUX_SERIAL_MAX3100_H 1 + + +/** + * struct plat_max3100 - MAX3100 SPI UART platform data + * @loopback: force MAX3100 in loopback + * @crystal: 1 for 3.6864 Mhz, 0 for 1.8432 + * @max3100_hw_suspend: MAX3100 has a shutdown pin. This is a hook + * called on suspend and resume to activate it. 
+ * @poll_time: poll time for CTS signal in ms, 0 disables (so no hw + * flow ctrl is possible but you have less CPU usage) + * + * You should use this structure in your machine description to specify + * how the MAX3100 is connected. Example: + * + * static struct plat_max3100 max3100_plat_data = { + * .loopback = 0, + * .crystal = 0, + * .poll_time = 100, + * }; + * + * static struct spi_board_info spi_board_info[] = { + * { + * .modalias = "max3100", + * .platform_data = &max3100_plat_data, + * .irq = IRQ_EINT12, + * .max_speed_hz = 5*1000*1000, + * .chip_select = 0, + * }, + * }; + * + **/ +struct plat_max3100 { + int loopback; + int crystal; + void (*max3100_hw_suspend) (int suspend); + int poll_time; +}; + +#endif -- cgit v1.2.3 From 692d0eb9e02cf81fb387ff891f53840db2f3110a Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 9 Apr 2009 00:27:13 +0100 Subject: dm: remove limited barrier support Prepare for full barrier implementation: first remove the restricted support. Signed-off-by: Mikulas Patocka Signed-off-by: Alasdair G Kergon --- drivers/md/dm-linear.c | 1 - drivers/md/dm-table.c | 19 ------------------- drivers/md/dm.c | 15 ++++++++++----- drivers/md/dm.h | 1 - include/linux/device-mapper.h | 1 - 5 files changed, 10 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index bfa107f59d96..79fb53e51c70 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -142,7 +142,6 @@ static struct target_type linear_target = { .status = linear_status, .ioctl = linear_ioctl, .merge = linear_merge, - .features = DM_TARGET_SUPPORTS_BARRIERS, }; int __init dm_linear_init(void) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 02d0b489fad6..429b50b975d5 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -52,8 +52,6 @@ struct dm_table { sector_t *highs; struct dm_target *targets; - unsigned barriers_supported:1; - /* * Indicates the rw permissions for the new logical * device. This should be a combination of FMODE_READ @@ -243,7 +241,6 @@ int dm_table_create(struct dm_table **result, fmode_t mode, INIT_LIST_HEAD(&t->devices); atomic_set(&t->holders, 0); - t->barriers_supported = 1; if (!num_targets) num_targets = KEYS_PER_NODE; @@ -751,10 +748,6 @@ int dm_table_add_target(struct dm_table *t, const char *type, /* FIXME: the plan is to combine high here and then have * the merge fn apply the target level restrictions. */ combine_restrictions_low(&t->limits, &tgt->limits); - - if (!(tgt->type->features & DM_TARGET_SUPPORTS_BARRIERS)) - t->barriers_supported = 0; - return 0; bad: @@ -799,12 +792,6 @@ int dm_table_complete(struct dm_table *t) check_for_valid_limits(&t->limits); - /* - * We only support barriers if there is exactly one underlying device. - */ - if (!list_is_singular(&t->devices)) - t->barriers_supported = 0; - /* how many indexes will the btree have ? 
*/ leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE); t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE); @@ -1059,12 +1046,6 @@ struct mapped_device *dm_table_get_md(struct dm_table *t) return t->md; } -int dm_table_barrier_ok(struct dm_table *t) -{ - return t->barriers_supported; -} -EXPORT_SYMBOL(dm_table_barrier_ok); - EXPORT_SYMBOL(dm_vcalloc); EXPORT_SYMBOL(dm_get_device); EXPORT_SYMBOL(dm_put_device); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 25d86e2c01f2..ab3b5d84df65 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -851,11 +851,7 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) bio_io_error(bio); return; } - if (unlikely(bio_barrier(bio) && !dm_table_barrier_ok(ci.map))) { - dm_table_put(ci.map); - bio_endio(bio, -EOPNOTSUPP); - return; - } + ci.md = md; ci.bio = bio; ci.io = alloc_io(md); @@ -937,6 +933,15 @@ static int dm_request(struct request_queue *q, struct bio *bio) struct mapped_device *md = q->queuedata; int cpu; + /* + * There is no use in forwarding any barrier request since we can't + * guarantee it is (or can be) handled by the targets correctly. + */ + if (unlikely(bio_barrier(bio))) { + bio_endio(bio, -EOPNOTSUPP); + return 0; + } + down_read(&md->io_lock); cpu = part_stat_lock(); diff --git a/drivers/md/dm.h b/drivers/md/dm.h index b48397c0abbd..a31506d93e91 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -52,7 +52,6 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits); * To check the return value from dm_table_find_target(). */ #define dm_target_is_valid(t) ((t)->table) -int dm_table_barrier_ok(struct dm_table *t); /*----------------------------------------------------------------- * A registry of target types. diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 66ec05a57955..ded2d7c42668 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -116,7 +116,6 @@ void dm_put_device(struct dm_target *ti, struct dm_dev *d); /* * Target features */ -#define DM_TARGET_SUPPORTS_BARRIERS 0x00000001 struct target_type { uint64_t features; -- cgit v1.2.3 From 47788c58e66c050982241d9a05eb690daceb05a9 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 8 Apr 2009 20:40:59 +0200 Subject: tracing/syscalls: use a dedicated file header Impact: fix build warnings and possible compat misbehavior on IA64 Building a kernel on ia64 might trigger these ugly build warnings: CC arch/ia64/ia32/sys_ia32.o In file included from arch/ia64/ia32/sys_ia32.c:55: arch/ia64/ia32/ia32priv.h:290:1: warning: "elf_check_arch" redefined In file included from include/linux/elf.h:7, from include/linux/module.h:14, from include/linux/ftrace.h:8, from include/linux/syscalls.h:68, from arch/ia64/ia32/sys_ia32.c:18: arch/ia64/include/asm/elf.h:19:1: warning: this is the location of the previous definition [...] sys_ia32.c includes linux/syscalls.h which in turn includes linux/ftrace.h to import the syscalls tracing prototypes. But including ftrace.h can pull in too many things for a low-level file, especially on ia64 where the ia32 private headers conflict with higher level headers. Now we isolate the syscall tracing headers in their own lightweight file. Reported-by: Tony Luck Tested-by: Tony Luck Signed-off-by: Frederic Weisbecker Acked-by: Tony Luck Signed-off-by: Steven Rostedt Cc: Peter Zijlstra Cc: Jason Baron Cc: "Frank Ch. 
Eigler" Cc: Mathieu Desnoyers Cc: KOSAKI Motohiro Cc: Lai Jiangshan Cc: Jiaying Zhang Cc: Michael Rubin Cc: Martin Bligh Cc: Michael Davidson LKML-Reference: <20090408184058.GB6017@nowhere> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ftrace.c | 2 ++ arch/x86/kernel/ptrace.c | 3 ++- include/linux/ftrace.h | 29 ----------------------------- include/linux/syscalls.h | 2 +- include/trace/syscall.h | 35 +++++++++++++++++++++++++++++++++++ kernel/trace/trace_syscalls.c | 2 +- 6 files changed, 41 insertions(+), 32 deletions(-) create mode 100644 include/trace/syscall.h (limited to 'include/linux') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 70a10ca100f6..18dfa30795c9 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -18,6 +18,8 @@ #include #include +#include + #include #include #include diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index fe9345c967de..23b7c8f017e2 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include @@ -35,6 +34,8 @@ #include #include +#include + #include "tls.h" enum x86_regset { diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index ff112a872d75..8a0c2f221e6b 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -511,33 +511,4 @@ static inline void trace_hw_branch_oops(void) {} #endif /* CONFIG_HW_BRANCH_TRACER */ -/* - * A syscall entry in the ftrace syscalls array. - * - * @name: name of the syscall - * @nb_args: number of parameters it takes - * @types: list of types as strings - * @args: list of args as strings (args[i] matches types[i]) - */ -struct syscall_metadata { - const char *name; - int nb_args; - const char **types; - const char **args; -}; - -#ifdef CONFIG_FTRACE_SYSCALLS -extern void arch_init_ftrace_syscalls(void); -extern struct syscall_metadata *syscall_nr_to_meta(int nr); -extern void start_ftrace_syscalls(void); -extern void stop_ftrace_syscalls(void); -extern void ftrace_syscall_enter(struct pt_regs *regs); -extern void ftrace_syscall_exit(struct pt_regs *regs); -#else -static inline void start_ftrace_syscalls(void) { } -static inline void stop_ftrace_syscalls(void) { } -static inline void ftrace_syscall_enter(struct pt_regs *regs) { } -static inline void ftrace_syscall_exit(struct pt_regs *regs) { } -#endif - #endif /* _LINUX_FTRACE_H */ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 6470f74074af..dabe4ad89141 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -65,7 +65,7 @@ struct old_linux_dirent; #include #include #include -#include +#include #define __SC_DECL1(t1, a1) t1 a1 #define __SC_DECL2(t2, a2, ...) t2 a2, __SC_DECL1(__VA_ARGS__) diff --git a/include/trace/syscall.h b/include/trace/syscall.h new file mode 100644 index 000000000000..8cfe515cbc47 --- /dev/null +++ b/include/trace/syscall.h @@ -0,0 +1,35 @@ +#ifndef _TRACE_SYSCALL_H +#define _TRACE_SYSCALL_H + +#include + +/* + * A syscall entry in the ftrace syscalls array. 
+ * + * @name: name of the syscall + * @nb_args: number of parameters it takes + * @types: list of types as strings + * @args: list of args as strings (args[i] matches types[i]) + */ +struct syscall_metadata { + const char *name; + int nb_args; + const char **types; + const char **args; +}; + +#ifdef CONFIG_FTRACE_SYSCALLS +extern void arch_init_ftrace_syscalls(void); +extern struct syscall_metadata *syscall_nr_to_meta(int nr); +extern void start_ftrace_syscalls(void); +extern void stop_ftrace_syscalls(void); +extern void ftrace_syscall_enter(struct pt_regs *regs); +extern void ftrace_syscall_exit(struct pt_regs *regs); +#else +static inline void start_ftrace_syscalls(void) { } +static inline void stop_ftrace_syscalls(void) { } +static inline void ftrace_syscall_enter(struct pt_regs *regs) { } +static inline void ftrace_syscall_exit(struct pt_regs *regs) { } +#endif + +#endif /* _TRACE_SYSCALL_H */ diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index a2a3af29c943..5e579645ac86 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -1,5 +1,5 @@ +#include #include -#include #include #include "trace_output.h" -- cgit v1.2.3 From e3c8ca8336707062f3f7cb1cd7e6b3c753baccdd Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Wed, 8 Apr 2009 19:45:12 -0500 Subject: sched: do not count frozen tasks toward load Freezing tasks via the cgroup freezer causes the load average to climb because the freezer's current implementation puts frozen tasks in uninterruptible sleep (D state). Some applications which perform job-scheduling functions consult the load average when making decisions. If a cgroup is frozen, the load average does not provide a useful measure of the system's utilization to such applications. This is especially inconvenient if the job scheduler employs the cgroup freezer as a mechanism for preempting low priority jobs. Contrast this with using SIGSTOP for the same purpose: the stopped tasks do not count toward system load. Change task_contributes_to_load() to return false if the task is frozen. This results in /proc/loadavg behavior that better meets users' expectations. Signed-off-by: Nathan Lynch Acked-by: Andrew Morton Acked-by: Nigel Cunningham Tested-by: Nigel Cunningham Cc: Cc: containers@lists.linux-foundation.org Cc: linux-pm@lists.linux-foundation.org Cc: Matt Helsley LKML-Reference: <20090408194512.47a99b95@manatee.lan> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 98e1fe51601d..b4c38bc8049c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -205,7 +205,8 @@ extern unsigned long long time_sync_thresh; #define task_is_stopped_or_traced(task) \ ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0) #define task_contributes_to_load(task) \ - ((task->state & TASK_UNINTERRUPTIBLE) != 0) + ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \ + (task->flags & PF_FROZEN) == 0) #define __set_task_state(tsk, state_value) \ do { (tsk)->state = (state_value); } while (0) -- cgit v1.2.3 From 97c18e2c7a8e36d2d83d50ee070314aadac73a11 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 9 Apr 2009 10:35:47 +0800 Subject: module: try_then_request_module must wait Since the whole point of try_then_request_module is to retry the operation after a module has been loaded, we must wait for the module to fully load. 
Otherwise all sort of things start breaking, e.g., you won't be able to read your encrypted disks on the first attempt. Signed-off-by: Herbert Xu Tested-by: Maciej Rutecki Tested-by: Patrick McHardy Signed-off-by: Linus Torvalds --- include/linux/kmod.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kmod.h b/include/linux/kmod.h index d5fa565086d1..384ca8bbf1ac 100644 --- a/include/linux/kmod.h +++ b/include/linux/kmod.h @@ -34,7 +34,7 @@ extern int __request_module(bool wait, const char *name, ...) \ #define request_module(mod...) __request_module(true, mod) #define request_module_nowait(mod...) __request_module(false, mod) #define try_then_request_module(x, mod...) \ - ((x) ?: (__request_module(false, mod), (x))) + ((x) ?: (__request_module(true, mod), (x))) #else static inline int request_module(const char *name, ...) { return -ENOSYS; } static inline int request_module_nowait(const char *name, ...) { return -ENOSYS; } -- cgit v1.2.3 From 8f7c2c37319a81ef4c2bfdec67b1ccd5744d97e4 Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Wed, 8 Apr 2009 16:58:57 +0800 Subject: Make __stringify support variable argument macros too For example: __stringify(__entry->irq, __entry->ret) will now convert it to: "REC->irq, REC->ret" It also still supports single arguments as the old macro did. Signed-off-by: Zhao Lei Acked-by: Frederic Weisbecker Cc: Steven Rostedt LKML-Reference: <49DC6751.30308@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- include/linux/stringify.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/stringify.h b/include/linux/stringify.h index 0b4388356c87..841cec8ed525 100644 --- a/include/linux/stringify.h +++ b/include/linux/stringify.h @@ -6,7 +6,7 @@ * converts to "bar". */ -#define __stringify_1(x) #x -#define __stringify(x) __stringify_1(x) +#define __stringify_1(x...) #x +#define __stringify(x...) __stringify_1(x) #endif /* !__LINUX_STRINGIFY_H */ -- cgit v1.2.3 From 066123a535927b3f17cac2305258cc71abdb0d92 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 10 Apr 2009 12:02:40 -0700 Subject: percpu: unbreak alpha percpu For the time being, move the generic percpu_*() accessors to linux/percpu.h. asm-generic/percpu.h is meant to carry generic stuff for low level stuff - declarations, definitions and pointer offset calculation and so on but not for generic interface. Signed-off-by: Ingo Molnar --- include/asm-generic/percpu.h | 52 -------------------------------------------- include/linux/percpu.h | 52 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 52 deletions(-) (limited to 'include/linux') diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h index 00f45ff081a6..b0e63c672ebd 100644 --- a/include/asm-generic/percpu.h +++ b/include/asm-generic/percpu.h @@ -80,56 +80,4 @@ extern void setup_per_cpu_areas(void); #define DECLARE_PER_CPU(type, name) extern PER_CPU_ATTRIBUTES \ __typeof__(type) per_cpu_var(name) -/* - * Optional methods for optimized non-lvalue per-cpu variable access. - * - * @var can be a percpu variable or a field of it and its size should - * equal char, int or long. percpu_read() evaluates to a lvalue and - * all others to void. - * - * These operations are guaranteed to be atomic w.r.t. preemption. - * The generic versions use plain get/put_cpu_var(). Archs are - * encouraged to implement single-instruction alternatives which don't - * require preemption protection. 
- */ -#ifndef percpu_read -# define percpu_read(var) \ - ({ \ - typeof(per_cpu_var(var)) __tmp_var__; \ - __tmp_var__ = get_cpu_var(var); \ - put_cpu_var(var); \ - __tmp_var__; \ - }) -#endif - -#define __percpu_generic_to_op(var, val, op) \ -do { \ - get_cpu_var(var) op val; \ - put_cpu_var(var); \ -} while (0) - -#ifndef percpu_write -# define percpu_write(var, val) __percpu_generic_to_op(var, (val), =) -#endif - -#ifndef percpu_add -# define percpu_add(var, val) __percpu_generic_to_op(var, (val), +=) -#endif - -#ifndef percpu_sub -# define percpu_sub(var, val) __percpu_generic_to_op(var, (val), -=) -#endif - -#ifndef percpu_and -# define percpu_and(var, val) __percpu_generic_to_op(var, (val), &=) -#endif - -#ifndef percpu_or -# define percpu_or(var, val) __percpu_generic_to_op(var, (val), |=) -#endif - -#ifndef percpu_xor -# define percpu_xor(var, val) __percpu_generic_to_op(var, (val), ^=) -#endif - #endif /* _ASM_GENERIC_PERCPU_H_ */ diff --git a/include/linux/percpu.h b/include/linux/percpu.h index ee5615d65211..cfda2d5ad319 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -168,4 +168,56 @@ static inline void free_percpu(void *p) #define alloc_percpu(type) (type *)__alloc_percpu(sizeof(type), \ __alignof__(type)) +/* + * Optional methods for optimized non-lvalue per-cpu variable access. + * + * @var can be a percpu variable or a field of it and its size should + * equal char, int or long. percpu_read() evaluates to a lvalue and + * all others to void. + * + * These operations are guaranteed to be atomic w.r.t. preemption. + * The generic versions use plain get/put_cpu_var(). Archs are + * encouraged to implement single-instruction alternatives which don't + * require preemption protection. + */ +#ifndef percpu_read +# define percpu_read(var) \ + ({ \ + typeof(per_cpu_var(var)) __tmp_var__; \ + __tmp_var__ = get_cpu_var(var); \ + put_cpu_var(var); \ + __tmp_var__; \ + }) +#endif + +#define __percpu_generic_to_op(var, val, op) \ +do { \ + get_cpu_var(var) op val; \ + put_cpu_var(var); \ +} while (0) + +#ifndef percpu_write +# define percpu_write(var, val) __percpu_generic_to_op(var, (val), =) +#endif + +#ifndef percpu_add +# define percpu_add(var, val) __percpu_generic_to_op(var, (val), +=) +#endif + +#ifndef percpu_sub +# define percpu_sub(var, val) __percpu_generic_to_op(var, (val), -=) +#endif + +#ifndef percpu_and +# define percpu_and(var, val) __percpu_generic_to_op(var, (val), &=) +#endif + +#ifndef percpu_or +# define percpu_or(var, val) __percpu_generic_to_op(var, (val), |=) +#endif + +#ifndef percpu_xor +# define percpu_xor(var, val) __percpu_generic_to_op(var, (val), ^=) +#endif + #endif /* __LINUX_PERCPU_H */ -- cgit v1.2.3 From fd746d540abf8c686f5f868ae62112692e684088 Mon Sep 17 00:00:00 2001 From: Eric Miao Date: Sat, 11 Apr 2009 16:54:59 -0700 Subject: Input: ads7846 - introduce platform specific way to synchronize sampling Noises can be introduced when LCD signals are being driven, some platforms provide a signal to assist the synchronization of this sampling procedure. 
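As a rough illustration of how a board file might wire up the new hook (this sketch is not part of the patch: the wait_for_sync() member is real, but the GPIO number, the polling loop and the names LCD_VSYNC_GPIO / mach_ads7846_wait_for_sync / mach_ads7846_info are assumptions made up for demonstration):

#include <linux/gpio.h>
#include <linux/spi/ads7846.h>

/* Assumed GPIO carrying the LCD sync signal on this hypothetical board. */
#define LCD_VSYNC_GPIO		17

static void mach_ads7846_wait_for_sync(void)
{
	/* Busy-wait for the (assumed) LCD sync pulse so the touch sample
	 * is taken while the noisy LCD signals are quiet. */
	while (!gpio_get_value(LCD_VSYNC_GPIO))
		; /* spin until the sync line goes high */
}

static struct ads7846_platform_data mach_ads7846_info = {
	.model		= 7846,
	.wait_for_sync	= mach_ads7846_wait_for_sync,
};

If a platform leaves .wait_for_sync unset, the driver falls back to the empty null_wait_for_sync() stub added below, so existing boards keep their current behaviour.
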
Signed-off-by: Eric Miao Signed-off-by: Andrew Morton Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/ads7846.c | 10 ++++++++++ include/linux/spi/ads7846.h | 1 + 2 files changed, 11 insertions(+) (limited to 'include/linux') diff --git a/drivers/input/touchscreen/ads7846.c b/drivers/input/touchscreen/ads7846.c index 7c27c8b9b6d0..cf7e69766b2b 100644 --- a/drivers/input/touchscreen/ads7846.c +++ b/drivers/input/touchscreen/ads7846.c @@ -127,6 +127,8 @@ struct ads7846 { void (*filter_cleanup)(void *data); int (*get_pendown_state)(void); int gpio_pendown; + + void (*wait_for_sync)(void); }; /* leave chip selected when we're done, for quicker re-select? */ @@ -511,6 +513,10 @@ static int get_pendown_state(struct ads7846 *ts) return !gpio_get_value(ts->gpio_pendown); } +static void null_wait_for_sync(void) +{ +} + /* * PENIRQ only kicks the timer. The timer only reissues the SPI transfer, * to retrieve touchscreen status. @@ -686,6 +692,7 @@ static void ads7846_rx_val(void *ads) default: BUG(); } + ts->wait_for_sync(); status = spi_async(ts->spi, m); if (status) dev_err(&ts->spi->dev, "spi_async --> %d\n", @@ -723,6 +730,7 @@ static enum hrtimer_restart ads7846_timer(struct hrtimer *handle) } else { /* pen is still down, continue with the measurement */ ts->msg_idx = 0; + ts->wait_for_sync(); status = spi_async(ts->spi, &ts->msg[0]); if (status) dev_err(&ts->spi->dev, "spi_async --> %d\n", status); @@ -947,6 +955,8 @@ static int __devinit ads7846_probe(struct spi_device *spi) ts->penirq_recheck_delay_usecs = pdata->penirq_recheck_delay_usecs; + ts->wait_for_sync = pdata->wait_for_sync ? : null_wait_for_sync; + snprintf(ts->phys, sizeof(ts->phys), "%s/input0", dev_name(&spi->dev)); input_dev->name = "ADS784x Touchscreen"; diff --git a/include/linux/spi/ads7846.h b/include/linux/spi/ads7846.h index 05eab2f11e63..2ea20320c093 100644 --- a/include/linux/spi/ads7846.h +++ b/include/linux/spi/ads7846.h @@ -51,5 +51,6 @@ struct ads7846_platform_data { void **filter_data); int (*filter) (void *filter_data, int data_idx, int *val); void (*filter_cleanup)(void *filter_data); + void (*wait_for_sync)(void); }; -- cgit v1.2.3 From 9eeba6138cefc0435695463ddadb0d95e0a6bcd2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 11 Apr 2009 03:17:17 +0200 Subject: lockdep: warn about lockdep disabling after kernel taint Impact: provide useful missing info for developers Kernel taint can occur in several situations such as warnings, load of prorietary or staging modules, bad page, etc... But when such taint happens, a developer might still be working on the kernel, expecting that lockdep is still enabled. But a taint disables lockdep without ever warning about it. Such a kernel behaviour doesn't really help for kernel development. This patch adds this missing warning. Since the taint is done most of the time after the main message that explain the real source issue, it seems safe to warn about it inside add_taint() so that it appears at last, without hurting the main information. v2: Use a generic helper to disable lockdep instead of an open coded xchg(). 
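For readers unfamiliar with the idiom: routing the disable through an xchg()-based helper means the exchange returns the previous value, so only the first taint actually flips debug_locks and prints the warning, while later taints see 0 and stay silent. A stand-alone sketch of that pattern (the message matches the patch; the demo_* names and the wrapper function are illustrative assumptions, not kernel code):

#include <linux/kernel.h>
#include <asm/system.h>		/* xchg() */

static int demo_debug_locks = 1;

static inline int demo_debug_locks_off(void)
{
	/* Returns the old value: non-zero only for the first caller. */
	return xchg(&demo_debug_locks, 0);
}

static void demo_add_taint(void)
{
	if (demo_debug_locks_off())
		printk(KERN_WARNING "Disabling lockdep due to kernel taint\n");
}
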
Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra LKML-Reference: <1239412638-6739-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/debug_locks.h | 7 +++++++ kernel/panic.c | 10 ++++++++-- lib/debug_locks.c | 2 +- 3 files changed, 16 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/debug_locks.h b/include/linux/debug_locks.h index 096476f1fb35..493dedb7a67b 100644 --- a/include/linux/debug_locks.h +++ b/include/linux/debug_locks.h @@ -2,12 +2,19 @@ #define __LINUX_DEBUG_LOCKING_H #include +#include struct task_struct; extern int debug_locks; extern int debug_locks_silent; + +static inline int __debug_locks_off(void) +{ + return xchg(&debug_locks, 0); +} + /* * Generic 'turn off all lock debugging' function: */ diff --git a/kernel/panic.c b/kernel/panic.c index 3fd8c5bf8b39..940ca14f6dbf 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -213,8 +213,14 @@ unsigned long get_taint(void) void add_taint(unsigned flag) { - /* can't trust the integrity of the kernel anymore: */ - debug_locks = 0; + /* + * Can't trust the integrity of the kernel anymore. + * We don't call directly debug_locks_off() because the issue + * is not necessarily serious enough to set oops_in_progress to 1 + */ + if (__debug_locks_off()) + printk(KERN_WARNING "Disabling lockdep due to kernel taint\n"); + set_bit(flag, &tainted_mask); } EXPORT_SYMBOL(add_taint); diff --git a/lib/debug_locks.c b/lib/debug_locks.c index 0218b4693dd8..bc3b11731b9c 100644 --- a/lib/debug_locks.c +++ b/lib/debug_locks.c @@ -36,7 +36,7 @@ int debug_locks_silent; */ int debug_locks_off(void) { - if (xchg(&debug_locks, 0)) { + if (__debug_locks_off()) { if (!debug_locks_silent) { oops_in_progress = 1; console_verbose(); -- cgit v1.2.3 From c758e8cffe3b1bc7970d579371db01b19ff440bf Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Mon, 13 Apr 2009 17:02:14 +0200 Subject: i2c: Fix sparse warnings for I2C_BOARD_INFO() Since the first argument to I2C_BOARD_INFO() must be a string constant, there is no need to parenthesise it, and adding parentheses results in an invalid initialiser for char[]. gcc obviously accepts this syntax as an extension, but sparse complains, e.g.: drivers/net/sfc/boards.c:173:2: warning: array initialized from parenthesized string constant Therefore, remove the parentheses. Signed-off-by: Ben Hutchings Signed-off-by: Jean Delvare --- include/linux/i2c.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 00ee11eb9092..ad2580596033 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -274,7 +274,7 @@ struct i2c_board_info { * are provided using conventional syntax. */ #define I2C_BOARD_INFO(dev_type, dev_addr) \ - .type = (dev_type), .addr = (dev_addr) + .type = dev_type, .addr = (dev_addr) /* Add-on boards should register/unregister their devices; e.g. a board -- cgit v1.2.3 From 0ad30b8fd5fe798aae80df6344b415d8309342cc Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Mon, 13 Apr 2009 09:56:14 -0500 Subject: add some long-missing capabilities to fs_mask When POSIX capabilities were introduced during the 2.1 Linux cycle, the fs mask, which represents the capabilities which having fsuid==0 is supposed to grant, did not include CAP_MKNOD and CAP_LINUX_IMMUTABLE. However, before capabilities the privilege to call these did in fact depend upon fsuid==0. This patch introduces those capabilities into the fsmask, restoring the old behavior. 
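To make the mask layout concrete: kernel_cap_t holds two 32-bit words and CAP_TO_MASK() selects a bit inside one word. A minimal sketch of where the two re-added capabilities land (the capability numbers and the CAP_TO_MASK definition follow include/linux/capability.h; the DEMO_/demo_ names exist only for illustration):

#define DEMO_CAP_TO_MASK(x)	(1 << ((x) & 31))

/* CAP_MKNOD == 27 and CAP_LINUX_IMMUTABLE == 9, so both bits live in the
 * first capability word (bits 0..31), i.e. in CAP_FS_MASK_B0 territory:
 */
unsigned int demo_fs_mask_b0 =
	DEMO_CAP_TO_MASK(27) |	/* CAP_MKNOD           -> bit 27 of word 0 */
	DEMO_CAP_TO_MASK(9);	/* CAP_LINUX_IMMUTABLE -> bit  9 of word 0 */

In the hunk below, CAP_MKNOD is folded into CAP_FS_MASK_B0 itself (so the nfsd mask inherits it as well), while CAP_LINUX_IMMUTABLE is OR-ed in only when building CAP_FS_SET.
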
See the thread starting at http://lkml.org/lkml/2009/3/11/157 for reference. Note that if this fix is deemed valid, then earlier kernel versions (2.4 and 2.2) ought to be fixed too. Changelog: [Mar 23] Actually delete old CAP_FS_SET definition... [Mar 20] Updated against J. Bruce Fields's patch Reported-by: Igor Zhbanov Signed-off-by: Serge E. Hallyn Cc: stable@kernel.org Cc: J. Bruce Fields Signed-off-by: Linus Torvalds --- include/linux/capability.h | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/capability.h b/include/linux/capability.h index 4864a43b2b45..c3021105edc0 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -377,7 +377,21 @@ struct cpu_vfs_cap_data { #define CAP_FOR_EACH_U32(__capi) \ for (__capi = 0; __capi < _KERNEL_CAPABILITY_U32S; ++__capi) +/* + * CAP_FS_MASK and CAP_NFSD_MASKS: + * + * The fs mask is all the privileges that fsuid==0 historically meant. + * At one time in the past, that included CAP_MKNOD and CAP_LINUX_IMMUTABLE. + * + * It has never meant setting security.* and trusted.* xattrs. + * + * We could also define fsmask as follows: + * 1. CAP_FS_MASK is the privilege to bypass all fs-related DAC permissions + * 2. The security.* and trusted.* xattrs are fs-related MAC permissions + */ + # define CAP_FS_MASK_B0 (CAP_TO_MASK(CAP_CHOWN) \ + | CAP_TO_MASK(CAP_MKNOD) \ | CAP_TO_MASK(CAP_DAC_OVERRIDE) \ | CAP_TO_MASK(CAP_DAC_READ_SEARCH) \ | CAP_TO_MASK(CAP_FOWNER) \ @@ -392,11 +406,12 @@ struct cpu_vfs_cap_data { # define CAP_EMPTY_SET ((kernel_cap_t){{ 0, 0 }}) # define CAP_FULL_SET ((kernel_cap_t){{ ~0, ~0 }}) # define CAP_INIT_EFF_SET ((kernel_cap_t){{ ~CAP_TO_MASK(CAP_SETPCAP), ~0 }}) -# define CAP_FS_SET ((kernel_cap_t){{ CAP_FS_MASK_B0, CAP_FS_MASK_B1 } }) +# define CAP_FS_SET ((kernel_cap_t){{ CAP_FS_MASK_B0 \ + | CAP_TO_MASK(CAP_LINUX_IMMUTABLE), \ + CAP_FS_MASK_B1 } }) # define CAP_NFSD_SET ((kernel_cap_t){{ CAP_FS_MASK_B0 \ - | CAP_TO_MASK(CAP_SYS_RESOURCE) \ - | CAP_TO_MASK(CAP_MKNOD), \ - CAP_FS_MASK_B1 } }) + | CAP_TO_MASK(CAP_SYS_RESOURCE), \ + CAP_FS_MASK_B1 } }) #endif /* _KERNEL_CAPABILITY_U32S != 2 */ -- cgit v1.2.3 From a8729eb302a5b5da8b0b4d29582c42648a2e0f12 Mon Sep 17 00:00:00 2001 From: Anatolij Gustschin Date: Tue, 7 Apr 2009 02:01:42 +0000 Subject: phylib: Allow early-out in phy_change Marvell 88E1121R Dual PHY device can be hardware-configured to use shared interrupt pin for both PHY ports. For such PHY configurations using shared PHY interrupt phy_interrupt() handler will also schedule a work for PHY port which didn't cause an interrupt. This patch adds a possibility for PHY drivers to provide did_interrupt() function which reports if the PHY (or a PHY port in a multi-PHY device) generated an interrupt. This function is called in phy_change() as phy_change() shouldn't proceed if it is invoked for a PHY which didn't cause an interrupt. So check for interrupt originator in phy_change() to allow early-out. Signed-off-by: Anatolij Gustschin Signed-off-by: David S. 
Miller --- drivers/net/phy/phy.c | 9 +++++++++ include/linux/phy.h | 6 ++++++ 2 files changed, 15 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 3ff1f425f1bb..e3b8932d7d74 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -655,6 +655,10 @@ static void phy_change(struct work_struct *work) struct phy_device *phydev = container_of(work, struct phy_device, phy_queue); + if (phydev->drv->did_interrupt && + !phydev->drv->did_interrupt(phydev)) + goto ignore; + err = phy_disable_interrupts(phydev); if (err) @@ -681,6 +685,11 @@ static void phy_change(struct work_struct *work) return; +ignore: + atomic_dec(&phydev->irq_disable); + enable_irq(phydev->irq); + return; + irq_enable_err: disable_irq(phydev->irq); atomic_inc(&phydev->irq_disable); diff --git a/include/linux/phy.h b/include/linux/phy.h index 32cf14a4b034..97e40cb6b588 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -388,6 +388,12 @@ struct phy_driver { /* Enables or disables interrupts */ int (*config_intr)(struct phy_device *phydev); + /* + * Checks if the PHY generated an interrupt. + * For multi-PHY devices with shared PHY interrupt pin + */ + int (*did_interrupt)(struct phy_device *phydev); + /* Clears up any memory if needed */ void (*remove)(struct phy_device *phydev); -- cgit v1.2.3 From ebde441177da3bad156701d351509f34295282ab Mon Sep 17 00:00:00 2001 From: Michal Januszewski Date: Mon, 13 Apr 2009 14:39:41 -0700 Subject: fbdev: fix color component field length documentation The documentation about the meaning of the color component bitfield lengths in pseudocolor modes is inconsistent. Fix it, so that it indicates the correct interpretation everywhere, i.e. that 1 << length is the number of palette entries. Signed-off-by: Michal Januszewski Acked-by: Krzysztof Helt Cc: Acked-by: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/skeletonfb.c | 8 +++++--- drivers/video/vfb.c | 11 +++++++---- include/linux/fb.h | 8 ++++++-- 3 files changed, 18 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/video/skeletonfb.c b/drivers/video/skeletonfb.c index a439159204a8..89158bc71da2 100644 --- a/drivers/video/skeletonfb.c +++ b/drivers/video/skeletonfb.c @@ -308,9 +308,11 @@ static int xxxfb_setcolreg(unsigned regno, unsigned red, unsigned green, * color depth = SUM(var->{color}.length) * * Pseudocolor: - * var->{color}.offset is 0 - * var->{color}.length contains width of DAC or the number of unique - * colors available (color depth) + * var->{color}.offset is 0 unless the palette index takes less than + * bits_per_pixel bits and is stored in the upper + * bits of the pixel value + * var->{color}.length is set so that 1 << length is the number of + * available palette entries * pseudo_palette is not used * RAMDAC[X] is programmed to (red, green, blue) * color depth = var->{color}.length diff --git a/drivers/video/vfb.c b/drivers/video/vfb.c index cc919ae46571..050d432c7d95 100644 --- a/drivers/video/vfb.c +++ b/drivers/video/vfb.c @@ -318,13 +318,16 @@ static int vfb_setcolreg(u_int regno, u_int red, u_int green, u_int blue, * {hardwarespecific} contains width of RAMDAC * cmap[X] is programmed to (X << red.offset) | (X << green.offset) | (X << blue.offset) * RAMDAC[X] is programmed to (red, green, blue) - * + * * Pseudocolor: - * uses offset = 0 && length = RAMDAC register width. 
- * var->{color}.offset is 0 - * var->{color}.length contains widht of DAC + * var->{color}.offset is 0 unless the palette index takes less than + * bits_per_pixel bits and is stored in the upper + * bits of the pixel value + * var->{color}.length is set so that 1 << length is the number of available + * palette entries * cmap is not used * RAMDAC[X] is programmed to (red, green, blue) + * * Truecolor: * does not use DAC. Usually 3 are present. * var->{color}.offset contains start of bitfield diff --git a/include/linux/fb.h b/include/linux/fb.h index f563c5013932..330c4b1bfcaa 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -173,8 +173,12 @@ struct fb_fix_screeninfo { /* Interpretation of offset for color fields: All offsets are from the right, * inside a "pixel" value, which is exactly 'bits_per_pixel' wide (means: you * can use the offset as right argument to <<). A pixel afterwards is a bit - * stream and is written to video memory as that unmodified. This implies - * big-endian byte order if bits_per_pixel is greater than 8. + * stream and is written to video memory as that unmodified. + * + * For pseudocolor: offset and length should be the same for all color + * components. Offset specifies the position of the least significant bit + * of the pallette index in a pixel value. Length indicates the number + * of available palette entries (i.e. # of entries = 1 << length). */ struct fb_bitfield { __u32 offset; /* beginning of bitfield */ -- cgit v1.2.3 From 251eb40f5ccd07a905633a816fbf8f2b6b25cced Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Mon, 13 Apr 2009 14:39:45 -0700 Subject: hwmon: sht15 humidity sensor driver Data sheet at: http://www.sensirion.ch/en/pdf/product_information/Datasheet-humidity-sensor-SHT1x.pdf These sensors communicate over a 2 wire bus running a device specific protocol. The complexity of the driver is mainly due to handling the substantial delays between requesting a reading and the device pulling the data line low to indicate that the data is available. This is handled by an interrupt that is disabled under all other conditions. I wasn't terribly clear on the best way to handle this, so comments on that aspect would be particularly welcome! Interpretation of the temperature depends on knowing the supply voltage. If configured in a board config as a regulator consumer this is obtained from the regulator subsystem. If not it should be provided in the platform data. I've placed this driver in the hwmon subsystem as it is definitely a device that may be used for hardware monitoring and with it's relatively slow response times (up to 120 millisecs to get a reading) a caching strategy certainly seems to make sense! Signed-off-by: Jonathan Cameron Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/hwmon/Kconfig | 10 + drivers/hwmon/Makefile | 1 + drivers/hwmon/sht15.c | 692 +++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/sht15.h | 24 ++ 4 files changed, 727 insertions(+) create mode 100644 drivers/hwmon/sht15.c create mode 100644 include/linux/sht15.h (limited to 'include/linux') diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig index 0e8a9185f676..d73f5f473e38 100644 --- a/drivers/hwmon/Kconfig +++ b/drivers/hwmon/Kconfig @@ -692,6 +692,16 @@ config SENSORS_PCF8591 These devices are hard to detect and rarely found on mainstream hardware. If unsure, say N. +config SENSORS_SHT15 + tristate "Sensiron humidity and temperature sensors. SHT15 and compat." 
+ depends on GENERIC_GPIO + help + If you say yes here you get support for the Sensiron SHT10, SHT11, + SHT15, SHT71, SHT75 humidity and temperature sensors. + + This driver can also be built as a module. If so, the module + will be called sht15. + config SENSORS_SIS5595 tristate "Silicon Integrated Systems Corp. SiS5595" depends on PCI diff --git a/drivers/hwmon/Makefile b/drivers/hwmon/Makefile index 1d3757837b4f..0ae26984ba45 100644 --- a/drivers/hwmon/Makefile +++ b/drivers/hwmon/Makefile @@ -76,6 +76,7 @@ obj-$(CONFIG_SENSORS_MAX6650) += max6650.o obj-$(CONFIG_SENSORS_PC87360) += pc87360.o obj-$(CONFIG_SENSORS_PC87427) += pc87427.o obj-$(CONFIG_SENSORS_PCF8591) += pcf8591.o +obj-$(CONFIG_SENSORS_SHT15) += sht15.o obj-$(CONFIG_SENSORS_SIS5595) += sis5595.o obj-$(CONFIG_SENSORS_SMSC47B397)+= smsc47b397.o obj-$(CONFIG_SENSORS_SMSC47M1) += smsc47m1.o diff --git a/drivers/hwmon/sht15.c b/drivers/hwmon/sht15.c new file mode 100644 index 000000000000..6cbdc2fea734 --- /dev/null +++ b/drivers/hwmon/sht15.c @@ -0,0 +1,692 @@ +/* + * sht15.c - support for the SHT15 Temperature and Humidity Sensor + * + * Copyright (c) 2009 Jonathan Cameron + * + * Copyright (c) 2007 Wouter Horre + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Currently ignoring checksum on readings. + * Default resolution only (14bit temp, 12bit humidity) + * Ignoring battery status. + * Heater not enabled. + * Timings are all conservative. + * + * Data sheet available (1/2009) at + * http://www.sensirion.ch/en/pdf/product_information/Datasheet-humidity-sensor-SHT1x.pdf + * + * Regulator supply name = vcc + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define SHT15_MEASURE_TEMP 3 +#define SHT15_MEASURE_RH 5 + +#define SHT15_READING_NOTHING 0 +#define SHT15_READING_TEMP 1 +#define SHT15_READING_HUMID 2 + +/* Min timings in nsecs */ +#define SHT15_TSCKL 100 /* clock low */ +#define SHT15_TSCKH 100 /* clock high */ +#define SHT15_TSU 150 /* data setup time */ + +/** + * struct sht15_temppair - elements of voltage dependant temp calc + * @vdd: supply voltage in microvolts + * @d1: see data sheet + */ +struct sht15_temppair { + int vdd; /* microvolts */ + int d1; +}; + +/* Table 9 from data sheet - relates temperature calculation + * to supply voltage. + */ +static const struct sht15_temppair temppoints[] = { + { 2500000, -39400 }, + { 3000000, -39600 }, + { 3500000, -39700 }, + { 4000000, -39800 }, + { 5000000, -40100 }, +}; + +/** + * struct sht15_data - device instance specific data + * @pdata: platform data (gpio's etc) + * @read_work: bh of interrupt handler + * @wait_queue: wait queue for getting values from device + * @val_temp: last temperature value read from device + * @val_humid: last humidity value read from device + * @flag: status flag used to identify what the last request was + * @valid: are the current stored values valid (start condition) + * @last_updat: time of last update + * @read_lock: mutex to ensure only one read in progress + * at a time. 
+ * @dev: associate device structure + * @hwmon_dev: device associated with hwmon subsystem + * @reg: associated regulator (if specified) + * @nb: notifier block to handle notifications of voltage changes + * @supply_uV: local copy of supply voltage used to allow + * use of regulator consumer if available + * @supply_uV_valid: indicates that an updated value has not yet + * been obtained from the regulator and so any calculations + * based upon it will be invalid. + * @update_supply_work: work struct that is used to update the supply_uV + * @interrupt_handled: flag used to indicate a hander has been scheduled + */ +struct sht15_data { + struct sht15_platform_data *pdata; + struct work_struct read_work; + wait_queue_head_t wait_queue; + uint16_t val_temp; + uint16_t val_humid; + u8 flag; + u8 valid; + unsigned long last_updat; + struct mutex read_lock; + struct device *dev; + struct device *hwmon_dev; + struct regulator *reg; + struct notifier_block nb; + int supply_uV; + int supply_uV_valid; + struct work_struct update_supply_work; + atomic_t interrupt_handled; +}; + +/** + * sht15_connection_reset() - reset the comms interface + * @data: sht15 specific data + * + * This implements section 3.4 of the data sheet + */ +static void sht15_connection_reset(struct sht15_data *data) +{ + int i; + gpio_direction_output(data->pdata->gpio_data, 1); + ndelay(SHT15_TSCKL); + gpio_set_value(data->pdata->gpio_sck, 0); + ndelay(SHT15_TSCKL); + for (i = 0; i < 9; ++i) { + gpio_set_value(data->pdata->gpio_sck, 1); + ndelay(SHT15_TSCKH); + gpio_set_value(data->pdata->gpio_sck, 0); + ndelay(SHT15_TSCKL); + } +} +/** + * sht15_send_bit() - send an individual bit to the device + * @data: device state data + * @val: value of bit to be sent + **/ +static inline void sht15_send_bit(struct sht15_data *data, int val) +{ + + gpio_set_value(data->pdata->gpio_data, val); + ndelay(SHT15_TSU); + gpio_set_value(data->pdata->gpio_sck, 1); + ndelay(SHT15_TSCKH); + gpio_set_value(data->pdata->gpio_sck, 0); + ndelay(SHT15_TSCKL); /* clock low time */ +} + +/** + * sht15_transmission_start() - specific sequence for new transmission + * + * @data: device state data + * Timings for this are not documented on the data sheet, so very + * conservative ones used in implementation. This implements + * figure 12 on the data sheet. 
+ **/ +static void sht15_transmission_start(struct sht15_data *data) +{ + /* ensure data is high and output */ + gpio_direction_output(data->pdata->gpio_data, 1); + ndelay(SHT15_TSU); + gpio_set_value(data->pdata->gpio_sck, 0); + ndelay(SHT15_TSCKL); + gpio_set_value(data->pdata->gpio_sck, 1); + ndelay(SHT15_TSCKH); + gpio_set_value(data->pdata->gpio_data, 0); + ndelay(SHT15_TSU); + gpio_set_value(data->pdata->gpio_sck, 0); + ndelay(SHT15_TSCKL); + gpio_set_value(data->pdata->gpio_sck, 1); + ndelay(SHT15_TSCKH); + gpio_set_value(data->pdata->gpio_data, 1); + ndelay(SHT15_TSU); + gpio_set_value(data->pdata->gpio_sck, 0); + ndelay(SHT15_TSCKL); +} +/** + * sht15_send_byte() - send a single byte to the device + * @data: device state + * @byte: value to be sent + **/ +static void sht15_send_byte(struct sht15_data *data, u8 byte) +{ + int i; + for (i = 0; i < 8; i++) { + sht15_send_bit(data, !!(byte & 0x80)); + byte <<= 1; + } +} +/** + * sht15_wait_for_response() - checks for ack from device + * @data: device state + **/ +static int sht15_wait_for_response(struct sht15_data *data) +{ + gpio_direction_input(data->pdata->gpio_data); + gpio_set_value(data->pdata->gpio_sck, 1); + ndelay(SHT15_TSCKH); + if (gpio_get_value(data->pdata->gpio_data)) { + gpio_set_value(data->pdata->gpio_sck, 0); + dev_err(data->dev, "Command not acknowledged\n"); + sht15_connection_reset(data); + return -EIO; + } + gpio_set_value(data->pdata->gpio_sck, 0); + ndelay(SHT15_TSCKL); + return 0; +} + +/** + * sht15_send_cmd() - Sends a command to the device. + * @data: device state + * @cmd: command byte to be sent + * + * On entry, sck is output low, data is output pull high + * and the interrupt disabled. + **/ +static int sht15_send_cmd(struct sht15_data *data, u8 cmd) +{ + int ret = 0; + sht15_transmission_start(data); + sht15_send_byte(data, cmd); + ret = sht15_wait_for_response(data); + return ret; +} +/** + * sht15_update_single_val() - get a new value from device + * @data: device instance specific data + * @command: command sent to request value + * @timeout_msecs: timeout after which comms are assumed + * to have failed are reset. + **/ +static inline int sht15_update_single_val(struct sht15_data *data, + int command, + int timeout_msecs) +{ + int ret; + ret = sht15_send_cmd(data, command); + if (ret) + return ret; + + gpio_direction_input(data->pdata->gpio_data); + atomic_set(&data->interrupt_handled, 0); + + enable_irq(gpio_to_irq(data->pdata->gpio_data)); + if (gpio_get_value(data->pdata->gpio_data) == 0) { + disable_irq_nosync(gpio_to_irq(data->pdata->gpio_data)); + /* Only relevant if the interrupt hasn't occured. 
*/ + if (!atomic_read(&data->interrupt_handled)) + schedule_work(&data->read_work); + } + ret = wait_event_timeout(data->wait_queue, + (data->flag == SHT15_READING_NOTHING), + msecs_to_jiffies(timeout_msecs)); + if (ret == 0) {/* timeout occurred */ + disable_irq_nosync(gpio_to_irq(data->pdata->gpio_data));; + sht15_connection_reset(data); + return -ETIME; + } + return 0; +} + +/** + * sht15_update_vals() - get updated readings from device if too old + * @data: device state + **/ +static int sht15_update_vals(struct sht15_data *data) +{ + int ret = 0; + int timeout = HZ; + + mutex_lock(&data->read_lock); + if (time_after(jiffies, data->last_updat + timeout) + || !data->valid) { + data->flag = SHT15_READING_HUMID; + ret = sht15_update_single_val(data, SHT15_MEASURE_RH, 160); + if (ret) + goto error_ret; + data->flag = SHT15_READING_TEMP; + ret = sht15_update_single_val(data, SHT15_MEASURE_TEMP, 400); + if (ret) + goto error_ret; + data->valid = 1; + data->last_updat = jiffies; + } +error_ret: + mutex_unlock(&data->read_lock); + + return ret; +} + +/** + * sht15_calc_temp() - convert the raw reading to a temperature + * @data: device state + * + * As per section 4.3 of the data sheet. + **/ +static inline int sht15_calc_temp(struct sht15_data *data) +{ + int d1 = 0; + int i; + + for (i = 1; i < ARRAY_SIZE(temppoints) - 1; i++) + /* Find pointer to interpolate */ + if (data->supply_uV > temppoints[i - 1].vdd) { + d1 = (data->supply_uV/1000 - temppoints[i - 1].vdd) + * (temppoints[i].d1 - temppoints[i - 1].d1) + / (temppoints[i].vdd - temppoints[i - 1].vdd) + + temppoints[i - 1].d1; + break; + } + + return data->val_temp*10 + d1; +} + +/** + * sht15_calc_humid() - using last temperature convert raw to humid + * @data: device state + * + * This is the temperature compensated version as per section 4.2 of + * the data sheet. + **/ +static inline int sht15_calc_humid(struct sht15_data *data) +{ + int RHlinear; /* milli percent */ + int temp = sht15_calc_temp(data); + + const int c1 = -4; + const int c2 = 40500; /* x 10 ^ -6 */ + const int c3 = 2800; /* x10 ^ -9 */ + + RHlinear = c1*1000 + + c2 * data->val_humid/1000 + + (data->val_humid * data->val_humid * c3)/1000000; + return (temp - 25000) * (10000 + 800 * data->val_humid) + / 1000000 + RHlinear; +} + +static ssize_t sht15_show_temp(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + int ret; + struct sht15_data *data = dev_get_drvdata(dev); + + /* Technically no need to read humidity as well */ + ret = sht15_update_vals(data); + + return ret ? ret : sprintf(buf, "%d\n", + sht15_calc_temp(data)); +} + +static ssize_t sht15_show_humidity(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + int ret; + struct sht15_data *data = dev_get_drvdata(dev); + + ret = sht15_update_vals(data); + + return ret ? 
ret : sprintf(buf, "%d\n", sht15_calc_humid(data)); + +}; +static ssize_t show_name(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct platform_device *pdev = to_platform_device(dev); + return sprintf(buf, "%s\n", pdev->name); +} + +static SENSOR_DEVICE_ATTR(temp1_input, + S_IRUGO, sht15_show_temp, + NULL, 0); +static SENSOR_DEVICE_ATTR(humidity1_input, + S_IRUGO, sht15_show_humidity, + NULL, 0); +static DEVICE_ATTR(name, S_IRUGO, show_name, NULL); +static struct attribute *sht15_attrs[] = { + &sensor_dev_attr_temp1_input.dev_attr.attr, + &sensor_dev_attr_humidity1_input.dev_attr.attr, + &dev_attr_name.attr, + NULL, +}; + +static const struct attribute_group sht15_attr_group = { + .attrs = sht15_attrs, +}; + +static irqreturn_t sht15_interrupt_fired(int irq, void *d) +{ + struct sht15_data *data = d; + /* First disable the interrupt */ + disable_irq_nosync(irq); + atomic_inc(&data->interrupt_handled); + /* Then schedule a reading work struct */ + if (data->flag != SHT15_READING_NOTHING) + schedule_work(&data->read_work); + return IRQ_HANDLED; +} + +/* Each byte of data is acknowledged by pulling the data line + * low for one clock pulse. + */ +static void sht15_ack(struct sht15_data *data) +{ + gpio_direction_output(data->pdata->gpio_data, 0); + ndelay(SHT15_TSU); + gpio_set_value(data->pdata->gpio_sck, 1); + ndelay(SHT15_TSU); + gpio_set_value(data->pdata->gpio_sck, 0); + ndelay(SHT15_TSU); + gpio_set_value(data->pdata->gpio_data, 1); + + gpio_direction_input(data->pdata->gpio_data); +} +/** + * sht15_end_transmission() - notify device of end of transmission + * @data: device state + * + * This is basically a NAK. (single clock pulse, data high) + **/ +static void sht15_end_transmission(struct sht15_data *data) +{ + gpio_direction_output(data->pdata->gpio_data, 1); + ndelay(SHT15_TSU); + gpio_set_value(data->pdata->gpio_sck, 1); + ndelay(SHT15_TSCKH); + gpio_set_value(data->pdata->gpio_sck, 0); + ndelay(SHT15_TSCKL); +} + +static void sht15_bh_read_data(struct work_struct *work_s) +{ + int i; + uint16_t val = 0; + struct sht15_data *data + = container_of(work_s, struct sht15_data, + read_work); + /* Firstly, verify the line is low */ + if (gpio_get_value(data->pdata->gpio_data)) { + /* If not, then start the interrupt again - care + here as could have gone low in meantime so verify + it hasn't! 
+ */ + atomic_set(&data->interrupt_handled, 0); + enable_irq(gpio_to_irq(data->pdata->gpio_data)); + /* If still not occured or another handler has been scheduled */ + if (gpio_get_value(data->pdata->gpio_data) + || atomic_read(&data->interrupt_handled)) + return; + } + /* Read the data back from the device */ + for (i = 0; i < 16; ++i) { + val <<= 1; + gpio_set_value(data->pdata->gpio_sck, 1); + ndelay(SHT15_TSCKH); + val |= !!gpio_get_value(data->pdata->gpio_data); + gpio_set_value(data->pdata->gpio_sck, 0); + ndelay(SHT15_TSCKL); + if (i == 7) + sht15_ack(data); + } + /* Tell the device we are done */ + sht15_end_transmission(data); + + switch (data->flag) { + case SHT15_READING_TEMP: + data->val_temp = val; + break; + case SHT15_READING_HUMID: + data->val_humid = val; + break; + } + + data->flag = SHT15_READING_NOTHING; + wake_up(&data->wait_queue); +} + +static void sht15_update_voltage(struct work_struct *work_s) +{ + struct sht15_data *data + = container_of(work_s, struct sht15_data, + update_supply_work); + data->supply_uV = regulator_get_voltage(data->reg); +} + +/** + * sht15_invalidate_voltage() - mark supply voltage invalid when notified by reg + * @nb: associated notification structure + * @event: voltage regulator state change event code + * @ignored: function parameter - ignored here + * + * Note that as the notification code holds the regulator lock, we have + * to schedule an update of the supply voltage rather than getting it directly. + **/ +static int sht15_invalidate_voltage(struct notifier_block *nb, + unsigned long event, + void *ignored) +{ + struct sht15_data *data = container_of(nb, struct sht15_data, nb); + + if (event == REGULATOR_EVENT_VOLTAGE_CHANGE) + data->supply_uV_valid = false; + schedule_work(&data->update_supply_work); + + return NOTIFY_OK; +} + +static int __devinit sht15_probe(struct platform_device *pdev) +{ + int ret = 0; + struct sht15_data *data = kzalloc(sizeof(*data), GFP_KERNEL); + + if (!data) { + ret = -ENOMEM; + dev_err(&pdev->dev, "kzalloc failed"); + goto error_ret; + } + + INIT_WORK(&data->read_work, sht15_bh_read_data); + INIT_WORK(&data->update_supply_work, sht15_update_voltage); + platform_set_drvdata(pdev, data); + mutex_init(&data->read_lock); + data->dev = &pdev->dev; + init_waitqueue_head(&data->wait_queue); + + if (pdev->dev.platform_data == NULL) { + dev_err(&pdev->dev, "no platform data supplied"); + goto err_free_data; + } + data->pdata = pdev->dev.platform_data; + data->supply_uV = data->pdata->supply_mv*1000; + +/* If a regulator is available, query what the supply voltage actually is!*/ + data->reg = regulator_get(data->dev, "vcc"); + if (!IS_ERR(data->reg)) { + data->supply_uV = regulator_get_voltage(data->reg); + regulator_enable(data->reg); + /* setup a notifier block to update this if another device + * causes the voltage to change */ + data->nb.notifier_call = &sht15_invalidate_voltage; + ret = regulator_register_notifier(data->reg, &data->nb); + } +/* Try requesting the GPIOs */ + ret = gpio_request(data->pdata->gpio_sck, "SHT15 sck"); + if (ret) { + dev_err(&pdev->dev, "gpio request failed"); + goto err_free_data; + } + gpio_direction_output(data->pdata->gpio_sck, 0); + ret = gpio_request(data->pdata->gpio_data, "SHT15 data"); + if (ret) { + dev_err(&pdev->dev, "gpio request failed"); + goto err_release_gpio_sck; + } + ret = sysfs_create_group(&pdev->dev.kobj, &sht15_attr_group); + if (ret) { + dev_err(&pdev->dev, "sysfs create failed"); + goto err_free_data; + } + + ret = 
request_irq(gpio_to_irq(data->pdata->gpio_data), + sht15_interrupt_fired, + IRQF_TRIGGER_FALLING, + "sht15 data", + data); + if (ret) { + dev_err(&pdev->dev, "failed to get irq for data line"); + goto err_release_gpio_data; + } + disable_irq_nosync(gpio_to_irq(data->pdata->gpio_data)); + sht15_connection_reset(data); + sht15_send_cmd(data, 0x1E); + + data->hwmon_dev = hwmon_device_register(data->dev); + if (IS_ERR(data->hwmon_dev)) { + ret = PTR_ERR(data->hwmon_dev); + goto err_release_gpio_data; + } + return 0; + +err_release_gpio_data: + gpio_free(data->pdata->gpio_data); +err_release_gpio_sck: + gpio_free(data->pdata->gpio_sck); +err_free_data: + kfree(data); +error_ret: + + return ret; +} + +static int __devexit sht15_remove(struct platform_device *pdev) +{ + struct sht15_data *data = platform_get_drvdata(pdev); + + /* Make sure any reads from the device are done and + * prevent new ones beginnning */ + mutex_lock(&data->read_lock); + hwmon_device_unregister(data->hwmon_dev); + sysfs_remove_group(&pdev->dev.kobj, &sht15_attr_group); + if (!IS_ERR(data->reg)) { + regulator_unregister_notifier(data->reg, &data->nb); + regulator_disable(data->reg); + regulator_put(data->reg); + } + + free_irq(gpio_to_irq(data->pdata->gpio_data), data); + gpio_free(data->pdata->gpio_data); + gpio_free(data->pdata->gpio_sck); + mutex_unlock(&data->read_lock); + kfree(data); + return 0; +} + + +static struct platform_driver sht_drivers[] = { + { + .driver = { + .name = "sht10", + .owner = THIS_MODULE, + }, + .probe = sht15_probe, + .remove = sht15_remove, + }, { + .driver = { + .name = "sht11", + .owner = THIS_MODULE, + }, + .probe = sht15_probe, + .remove = sht15_remove, + }, { + .driver = { + .name = "sht15", + .owner = THIS_MODULE, + }, + .probe = sht15_probe, + .remove = sht15_remove, + }, { + .driver = { + .name = "sht71", + .owner = THIS_MODULE, + }, + .probe = sht15_probe, + .remove = sht15_remove, + }, { + .driver = { + .name = "sht75", + .owner = THIS_MODULE, + }, + .probe = sht15_probe, + .remove = sht15_remove, + }, +}; + + +static int __init sht15_init(void) +{ + int ret; + int i; + + for (i = 0; i < ARRAY_SIZE(sht_drivers); i++) { + ret = platform_driver_register(&sht_drivers[i]); + if (ret) + goto error_unreg; + } + + return 0; + +error_unreg: + while (--i >= 0) + platform_driver_unregister(&sht_drivers[i]); + + return ret; +} +module_init(sht15_init); + +static void __exit sht15_exit(void) +{ + int i; + for (i = ARRAY_SIZE(sht_drivers) - 1; i >= 0; i--) + platform_driver_unregister(&sht_drivers[i]); +} +module_exit(sht15_exit); + +MODULE_LICENSE("GPL"); diff --git a/include/linux/sht15.h b/include/linux/sht15.h new file mode 100644 index 000000000000..046bce05ecab --- /dev/null +++ b/include/linux/sht15.h @@ -0,0 +1,24 @@ +/* + * sht15.h - support for the SHT15 Temperature and Humidity Sensor + * + * Copyright (c) 2009 Jonathan Cameron + * + * Copyright (c) 2007 Wouter Horre + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/** + * struct sht15_platform_data - sht15 connectivity info + * @gpio_data: no. of gpio to which bidirectional data line is connected + * @gpio_sck: no. of gpio to which the data clock is connected. + * @supply_mv: supply voltage in mv. Overridden by regulator if available. 
+ **/ +struct sht15_platform_data { + int gpio_data; + int gpio_sck; + int supply_mv; +}; + -- cgit v1.2.3 From 17a5138d204014b00cb9c1d6e8ff311993041b5c Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 13 Apr 2009 14:39:47 -0700 Subject: aio: remove INIT_KIOCTX Unused after 20dcae32439384b6863c626bb3b2a09bed65b33e aka "[PATCH] aio: remove kioctx from mm_struct". Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/init_task.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index dcfb93337e9a..d87247d2641f 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -15,19 +15,6 @@ extern struct files_struct init_files; extern struct fs_struct init_fs; -#define INIT_KIOCTX(name, which_mm) \ -{ \ - .users = ATOMIC_INIT(1), \ - .dead = 0, \ - .mm = &which_mm, \ - .user_id = 0, \ - .next = NULL, \ - .wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.wait), \ - .ctx_lock = __SPIN_LOCK_UNLOCKED(name.ctx_lock), \ - .reqs_active = 0U, \ - .max_reqs = ~0U, \ -} - #define INIT_MM(name) \ { \ .mm_rb = RB_ROOT, \ -- cgit v1.2.3 From 5dec8bfbdd4921522565a7b0e0c8760ae042ef6d Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 13 Apr 2009 14:39:54 -0700 Subject: include/linux/fiemap.h: include types.h now that it's exported Include in fiemap.h. Sam Ravnborg pointed out that this was missing in this newly-exported header which uses the __u32 and __u64 types. Signed-off-by: Eric Sandeen Cc: Sam Ravnborg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fiemap.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fiemap.h b/include/linux/fiemap.h index 671decbd2aeb..934e22d65801 100644 --- a/include/linux/fiemap.h +++ b/include/linux/fiemap.h @@ -11,6 +11,8 @@ #ifndef _LINUX_FIEMAP_H #define _LINUX_FIEMAP_H +#include + struct fiemap_extent { __u64 fe_logical; /* logical offset in bytes for the start of * the extent from the beginning of the file */ -- cgit v1.2.3 From 347486bb108fa6e0fd2753c1be3519d6be2516ed Mon Sep 17 00:00:00 2001 From: Stefan Husemann Date: Mon, 13 Apr 2009 14:40:10 -0700 Subject: intelfb: support i854 Support the Intel 854 Chipset in fbdev. We test and use the patch on a Thomson IP1101 IPTV-Box. On the VGA-Port we get a normal signal. 
Here is the link to the Mambux-Project: http://www.mambux.de Cc: Keith Packard Cc: Dave Airlie Cc: Krzysztof Helt Signed-off-by: Stefan Husemann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/agp/intel-agp.c | 3 +++ drivers/video/intelfb/intelfb.h | 2 ++ drivers/video/intelfb/intelfb_i2c.c | 1 + drivers/video/intelfb/intelfbdrv.c | 1 + drivers/video/intelfb/intelfbhw.c | 5 +++++ include/drm/drm_pciids.h | 2 ++ include/linux/pci_ids.h | 2 ++ 7 files changed, 16 insertions(+) (limited to 'include/linux') diff --git a/drivers/char/agp/intel-agp.c b/drivers/char/agp/intel-agp.c index 9d9490e22e07..3686912427ba 100644 --- a/drivers/char/agp/intel-agp.c +++ b/drivers/char/agp/intel-agp.c @@ -2131,6 +2131,8 @@ static const struct intel_driver_description { { PCI_DEVICE_ID_INTEL_82845G_HB, PCI_DEVICE_ID_INTEL_82845G_IG, 0, "830M", &intel_845_driver, &intel_830_driver }, { PCI_DEVICE_ID_INTEL_82850_HB, 0, 0, "i850", &intel_850_driver, NULL }, + { PCI_DEVICE_ID_INTEL_82854_HB, PCI_DEVICE_ID_INTEL_82854_IG, 0, "854", + &intel_845_driver, &intel_830_driver }, { PCI_DEVICE_ID_INTEL_82855PM_HB, 0, 0, "855PM", &intel_845_driver, NULL }, { PCI_DEVICE_ID_INTEL_82855GM_HB, PCI_DEVICE_ID_INTEL_82855GM_IG, 0, "855GM", &intel_845_driver, &intel_830_driver }, @@ -2355,6 +2357,7 @@ static struct pci_device_id agp_intel_pci_table[] = { ID(PCI_DEVICE_ID_INTEL_82845_HB), ID(PCI_DEVICE_ID_INTEL_82845G_HB), ID(PCI_DEVICE_ID_INTEL_82850_HB), + ID(PCI_DEVICE_ID_INTEL_82854_HB), ID(PCI_DEVICE_ID_INTEL_82855PM_HB), ID(PCI_DEVICE_ID_INTEL_82855GM_HB), ID(PCI_DEVICE_ID_INTEL_82860_HB), diff --git a/drivers/video/intelfb/intelfb.h b/drivers/video/intelfb/intelfb.h index a50bea614804..40984551c927 100644 --- a/drivers/video/intelfb/intelfb.h +++ b/drivers/video/intelfb/intelfb.h @@ -53,6 +53,7 @@ #define PCI_DEVICE_ID_INTEL_830M 0x3577 #define PCI_DEVICE_ID_INTEL_845G 0x2562 #define PCI_DEVICE_ID_INTEL_85XGM 0x3582 +#define PCI_DEVICE_ID_INTEL_854 0x358E #define PCI_DEVICE_ID_INTEL_865G 0x2572 #define PCI_DEVICE_ID_INTEL_915G 0x2582 #define PCI_DEVICE_ID_INTEL_915GM 0x2592 @@ -154,6 +155,7 @@ enum intel_chips { INTEL_85XGM, INTEL_852GM, INTEL_852GME, + INTEL_854, INTEL_855GM, INTEL_855GME, INTEL_865G, diff --git a/drivers/video/intelfb/intelfb_i2c.c b/drivers/video/intelfb/intelfb_i2c.c index b3065492bb20..487f2be47460 100644 --- a/drivers/video/intelfb/intelfb_i2c.c +++ b/drivers/video/intelfb/intelfb_i2c.c @@ -156,6 +156,7 @@ void intelfb_create_i2c_busses(struct intelfb_info *dinfo) switch(dinfo->chipset) { case INTEL_830M: case INTEL_845G: + case INTEL_854: case INTEL_855GM: case INTEL_865G: dinfo->output[i].type = INTELFB_OUTPUT_DVO; diff --git a/drivers/video/intelfb/intelfbdrv.c b/drivers/video/intelfb/intelfbdrv.c index 6d8e5415c809..ace14fe02fc4 100644 --- a/drivers/video/intelfb/intelfbdrv.c +++ b/drivers/video/intelfb/intelfbdrv.c @@ -182,6 +182,7 @@ static struct pci_device_id intelfb_pci_table[] __devinitdata = { { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_845G, PCI_ANY_ID, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA << 8, INTELFB_CLASS_MASK, INTEL_845G }, { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_85XGM, PCI_ANY_ID, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA << 8, INTELFB_CLASS_MASK, INTEL_85XGM }, { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_865G, PCI_ANY_ID, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA << 8, INTELFB_CLASS_MASK, INTEL_865G }, + { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_854, PCI_ANY_ID, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA << 8, INTELFB_CLASS_MASK, INTEL_854 }, { PCI_VENDOR_ID_INTEL, 
PCI_DEVICE_ID_INTEL_915G, PCI_ANY_ID, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA << 8, INTELFB_CLASS_MASK, INTEL_915G }, { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_915GM, PCI_ANY_ID, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA << 8, INTELFB_CLASS_MASK, INTEL_915GM }, { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_945G, PCI_ANY_ID, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA << 8, INTELFB_CLASS_MASK, INTEL_945G }, diff --git a/drivers/video/intelfb/intelfbhw.c b/drivers/video/intelfb/intelfbhw.c index 8b26b27c2db6..0689f97c5238 100644 --- a/drivers/video/intelfb/intelfbhw.c +++ b/drivers/video/intelfb/intelfbhw.c @@ -84,6 +84,11 @@ int intelfbhw_get_chipset(struct pci_dev *pdev, struct intelfb_info *dinfo) dinfo->mobile = 0; dinfo->pll_index = PLLS_I8xx; return 0; + case PCI_DEVICE_ID_INTEL_854: + dinfo->mobile = 1; + dinfo->name = "Intel(R) 854"; + dinfo->chipset = INTEL_854; + return 0; case PCI_DEVICE_ID_INTEL_85XGM: tmp = 0; dinfo->mobile = 1; diff --git a/include/drm/drm_pciids.h b/include/drm/drm_pciids.h index 2df74eb09563..9477af01a639 100644 --- a/include/drm/drm_pciids.h +++ b/include/drm/drm_pciids.h @@ -472,6 +472,7 @@ {0x8086, 0x2562, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, \ {0x8086, 0x3582, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, \ {0x8086, 0x2572, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, \ + {0x8086, 0x358e, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, \ {0, 0, 0} #define gamma_PCI_IDS \ @@ -533,4 +534,5 @@ {0x8086, 0x2e22, PCI_ANY_ID, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA << 8, 0xffff00, 0}, \ {0x8086, 0xa001, PCI_ANY_ID, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA << 8, 0xffff00, 0}, \ {0x8086, 0xa011, PCI_ANY_ID, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA << 8, 0xffff00, 0}, \ + {0x8086, 0x35e8, PCI_ANY_ID, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA << 8, 0xffff00, 0}, \ {0, 0, 0} diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index ee98cd570885..06ba90c211a5 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2514,6 +2514,8 @@ #define PCI_DEVICE_ID_INTEL_IOAT_TBG3 0x3433 #define PCI_DEVICE_ID_INTEL_82830_HB 0x3575 #define PCI_DEVICE_ID_INTEL_82830_CGC 0x3577 +#define PCI_DEVICE_ID_INTEL_82854_HB 0x358c +#define PCI_DEVICE_ID_INTEL_82854_IG 0x358e #define PCI_DEVICE_ID_INTEL_82855GM_HB 0x3580 #define PCI_DEVICE_ID_INTEL_82855GM_IG 0x3582 #define PCI_DEVICE_ID_INTEL_E7520_MCH 0x3590 -- cgit v1.2.3 From 27b19565fe4ca5b0e9d2ae98ce4b81ca728bf445 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 14 Apr 2009 11:03:12 +0200 Subject: lockdep: warn about lockdep disabling after kernel taint, fix Impact: build fix for Sparc and s390 Stephen Rothwell reported that the Sparc build broke: In file included from kernel/panic.c:12: include/linux/debug_locks.h: In function '__debug_locks_off': include/linux/debug_locks.h:15: error: implicit declaration of function 'xchg' due to: 9eeba61: lockdep: warn about lockdep disabling after kernel taint There is some inconsistency between architectures about where exactly xchg() is defined. The traditional place is in system.h but the more logical point for it is in atomic.h - where most architectures (especially new ones) have it defined. These architecture also still offer it via system.h. Some, such as Sparc or s390 only have it in asm/system.h and not available via asm/atomic.h at all. Use the widest set of headers in debug_locks.h and also include asm/system.h. Reported-by: Stephen Rothwell Cc: Frederic Weisbecker Cc: "David S. 
Miller" Cc: Linus Torvalds LKML-Reference: <20090414144317.026498df.sfr@canb.auug.org.au> Signed-off-by: Ingo Molnar --- include/linux/debug_locks.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/debug_locks.h b/include/linux/debug_locks.h index 493dedb7a67b..29b3ce3f2a1d 100644 --- a/include/linux/debug_locks.h +++ b/include/linux/debug_locks.h @@ -3,6 +3,7 @@ #include #include +#include struct task_struct; -- cgit v1.2.3 From ef631b0ca01655d24e9ca7e199262c4a46416a26 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 13 Apr 2009 21:31:16 -0700 Subject: rcu: Make hierarchical RCU less IPI-happy This patch fixes a hierarchical-RCU performance bug located by Anton Blanchard. The problem stems from a misguided attempt to provide a work-around for jiffies-counter failure. This work-around uses a per-CPU n_rcu_pending counter, which is incremented on each call to rcu_pending(), which in turn is called from each scheduling-clock interrupt. Each CPU then treats this counter as a surrogate for the jiffies counter, so that if the jiffies counter fails to advance, the per-CPU n_rcu_pending counter will cause RCU to invoke force_quiescent_state(), which in turn will (among other things) send resched IPIs to CPUs that have thus far failed to pass through an RCU quiescent state. Unfortunately, each CPU resets only its own counter after sending a batch of IPIs. This means that the other CPUs will also (needlessly) send -another- round of IPIs, for a full N-squared set of IPIs in the worst case every three scheduler-clock ticks until the grace period finally ends. It is not reasonable for a given CPU to reset each and every n_rcu_pending for all the other CPUs, so this patch instead simply disables the jiffies-counter "training wheels", thus eliminating the excessive IPIs. Note that the jiffies-counter IPIs do not have this problem due to the fact that the jiffies counter is global, so that the CPU sending the IPIs can easily reset things, thus preventing the other CPUs from sending redundant IPIs. Note also that the n_rcu_pending counter remains, as it will continue to be used for tracing. It may also see use to update the jiffies counter, should an appropriate kick-the-jiffies-counter API appear. Located-by: Anton Blanchard Tested-by: Anton Blanchard Signed-off-by: Paul E. McKenney Cc: anton@samba.org Cc: akpm@linux-foundation.org Cc: dipankar@in.ibm.com Cc: manfred@colorfullife.com Cc: cl@linux-foundation.org Cc: josht@linux.vnet.ibm.com Cc: schamp@sgi.com Cc: niv@us.ibm.com Cc: dvhltc@us.ibm.com Cc: ego@in.ibm.com Cc: laijs@cn.fujitsu.com Cc: rostedt@goodmis.org Cc: peterz@infradead.org Cc: penberg@cs.helsinki.fi Cc: andi@firstfloor.org Cc: "Paul E. McKenney" LKML-Reference: <12396834793575-git-send-email-> Signed-off-by: Ingo Molnar --- include/linux/rcutree.h | 3 +-- kernel/rcutree.c | 19 ++++--------------- kernel/rcutree_trace.c | 14 +++++--------- 3 files changed, 10 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 0cdda00f2b2a..58b2aa5312b9 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -161,9 +161,8 @@ struct rcu_data { unsigned long offline_fqs; /* Kicked due to being offline. */ unsigned long resched_ipi; /* Sent a resched IPI. */ - /* 5) state to allow this CPU to force_quiescent_state on others */ + /* 5) For future __rcu_pending statistics. */ long n_rcu_pending; /* rcu_pending() calls since boot. 
*/ - long n_rcu_pending_force_qs; /* when to force quiescent states. */ int cpu; }; diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 7f3266922572..d2a372fb0b9b 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -530,8 +530,6 @@ static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp) rdp->qs_pending = 1; rdp->passed_quiesc = 0; rdp->gpnum = rsp->gpnum; - rdp->n_rcu_pending_force_qs = rdp->n_rcu_pending + - RCU_JIFFIES_TILL_FORCE_QS; } /* @@ -578,8 +576,6 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) rsp->gpnum++; rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */ rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; - rdp->n_rcu_pending_force_qs = rdp->n_rcu_pending + - RCU_JIFFIES_TILL_FORCE_QS; record_gp_stall_check_time(rsp); dyntick_record_completed(rsp, rsp->completed - 1); note_new_gpnum(rsp, rdp); @@ -1055,7 +1051,6 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) { unsigned long flags; long lastcomp; - struct rcu_data *rdp = rsp->rda[smp_processor_id()]; struct rcu_node *rnp = rcu_get_root(rsp); u8 signaled; @@ -1066,16 +1061,13 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) return; /* Someone else is already on the job. */ } if (relaxed && - (long)(rsp->jiffies_force_qs - jiffies) >= 0 && - (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) >= 0) + (long)(rsp->jiffies_force_qs - jiffies) >= 0) goto unlock_ret; /* no emergency and done recently. */ rsp->n_force_qs++; spin_lock(&rnp->lock); lastcomp = rsp->completed; signaled = rsp->signaled; rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; - rdp->n_rcu_pending_force_qs = rdp->n_rcu_pending + - RCU_JIFFIES_TILL_FORCE_QS; if (lastcomp == rsp->gpnum) { rsp->n_force_qs_ngp++; spin_unlock(&rnp->lock); @@ -1144,8 +1136,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) * If an RCU GP has gone long enough, go check for dyntick * idle CPUs and, if needed, send resched IPIs. */ - if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0 || - (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) < 0) + if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0) force_quiescent_state(rsp, 1); /* @@ -1230,8 +1221,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), if (unlikely(++rdp->qlen > qhimark)) { rdp->blimit = LONG_MAX; force_quiescent_state(rsp, 0); - } else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0 || - (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) < 0) + } else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0) force_quiescent_state(rsp, 1); local_irq_restore(flags); } @@ -1290,8 +1280,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) /* Has an RCU GP gone long enough to send resched IPIs &c? */ if (ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum) && - ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0 || - (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) < 0)) + ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) return 1; /* nothing to do */ diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 4ee954f6a8d5..4b1875ba9404 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -49,14 +49,12 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) { if (!rdp->beenonline) return; - seq_printf(m, "%3d%cc=%ld g=%ld pq=%d pqc=%ld qp=%d rpfq=%ld rp=%x", + seq_printf(m, "%3d%cc=%ld g=%ld pq=%d pqc=%ld qp=%d", rdp->cpu, cpu_is_offline(rdp->cpu) ? 
'!' : ' ', rdp->completed, rdp->gpnum, rdp->passed_quiesc, rdp->passed_quiesc_completed, - rdp->qs_pending, - rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending, - (int)(rdp->n_rcu_pending & 0xffff)); + rdp->qs_pending); #ifdef CONFIG_NO_HZ seq_printf(m, " dt=%d/%d dn=%d df=%lu", rdp->dynticks->dynticks, @@ -102,14 +100,12 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) { if (!rdp->beenonline) return; - seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d,%ld,%ld", + seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d", rdp->cpu, cpu_is_offline(rdp->cpu) ? "\"Y\"" : "\"N\"", rdp->completed, rdp->gpnum, rdp->passed_quiesc, rdp->passed_quiesc_completed, - rdp->qs_pending, - rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending, - rdp->n_rcu_pending); + rdp->qs_pending); #ifdef CONFIG_NO_HZ seq_printf(m, ",%d,%d,%d,%lu", rdp->dynticks->dynticks, @@ -123,7 +119,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) static int show_rcudata_csv(struct seq_file *m, void *unused) { - seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\",\"rpfq\",\"rp\","); + seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\","); #ifdef CONFIG_NO_HZ seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\","); #endif /* #ifdef CONFIG_NO_HZ */ -- cgit v1.2.3 From 67c457a8c378a006a34d92f9bd3078a80a92f250 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 14 Apr 2009 07:50:56 -0400 Subject: jbd2: use SWRITE_SYNC_PLUG when writing synchronous revoke records The revoke records must be written using the same way as the rest of the blocks during the commit process; that is, either marked as synchronous writes or as asynchornous writes. Signed-off-by: "Theodore Ts'o" --- fs/jbd2/commit.c | 3 ++- fs/jbd2/revoke.c | 21 ++++++++++++--------- include/linux/jbd2.h | 3 ++- 3 files changed, 16 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 073c8c3df7cd..0b7d3b8226fd 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -506,7 +506,8 @@ void jbd2_journal_commit_transaction(journal_t *journal) if (err) jbd2_journal_abort(journal, err); - jbd2_journal_write_revoke_records(journal, commit_transaction); + jbd2_journal_write_revoke_records(journal, commit_transaction, + write_op); jbd_debug(3, "JBD: commit phase 2\n"); diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index bbe6d592d8b3..a360b06af2e3 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -86,6 +86,7 @@ #include #include #include +#include #endif #include @@ -118,8 +119,8 @@ struct jbd2_revoke_table_s #ifdef __KERNEL__ static void write_one_revoke_record(journal_t *, transaction_t *, struct journal_head **, int *, - struct jbd2_revoke_record_s *); -static void flush_descriptor(journal_t *, struct journal_head *, int); + struct jbd2_revoke_record_s *, int); +static void flush_descriptor(journal_t *, struct journal_head *, int, int); #endif /* Utility functions to maintain the revoke table */ @@ -499,7 +500,8 @@ void jbd2_journal_switch_revoke_table(journal_t *journal) * revoke hash, deleting the entries as we go. 
*/ void jbd2_journal_write_revoke_records(journal_t *journal, - transaction_t *transaction) + transaction_t *transaction, + int write_op) { struct journal_head *descriptor; struct jbd2_revoke_record_s *record; @@ -523,14 +525,14 @@ void jbd2_journal_write_revoke_records(journal_t *journal, hash_list->next; write_one_revoke_record(journal, transaction, &descriptor, &offset, - record); + record, write_op); count++; list_del(&record->hash); kmem_cache_free(jbd2_revoke_record_cache, record); } } if (descriptor) - flush_descriptor(journal, descriptor, offset); + flush_descriptor(journal, descriptor, offset, write_op); jbd_debug(1, "Wrote %d revoke records\n", count); } @@ -543,7 +545,8 @@ static void write_one_revoke_record(journal_t *journal, transaction_t *transaction, struct journal_head **descriptorp, int *offsetp, - struct jbd2_revoke_record_s *record) + struct jbd2_revoke_record_s *record, + int write_op) { struct journal_head *descriptor; int offset; @@ -562,7 +565,7 @@ static void write_one_revoke_record(journal_t *journal, /* Make sure we have a descriptor with space left for the record */ if (descriptor) { if (offset == journal->j_blocksize) { - flush_descriptor(journal, descriptor, offset); + flush_descriptor(journal, descriptor, offset, write_op); descriptor = NULL; } } @@ -607,7 +610,7 @@ static void write_one_revoke_record(journal_t *journal, static void flush_descriptor(journal_t *journal, struct journal_head *descriptor, - int offset) + int offset, int write_op) { jbd2_journal_revoke_header_t *header; struct buffer_head *bh = jh2bh(descriptor); @@ -622,7 +625,7 @@ static void flush_descriptor(journal_t *journal, set_buffer_jwrite(bh); BUFFER_TRACE(bh, "write"); set_buffer_dirty(bh); - ll_rw_block(SWRITE, 1, &bh); + ll_rw_block((write_op == WRITE) ? SWRITE : SWRITE_SYNC_PLUG, 1, &bh); } #endif diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 8815a3456b3b..cc02393bfce8 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1193,7 +1193,8 @@ extern int jbd2_journal_init_revoke_caches(void); extern void jbd2_journal_destroy_revoke(journal_t *); extern int jbd2_journal_revoke (handle_t *, unsigned long long, struct buffer_head *); extern int jbd2_journal_cancel_revoke(handle_t *, struct journal_head *); -extern void jbd2_journal_write_revoke_records(journal_t *, transaction_t *); +extern void jbd2_journal_write_revoke_records(journal_t *, + transaction_t *, int); /* Recovery revoke support */ extern int jbd2_journal_set_revoke(journal_t *, unsigned long long, tid_t); -- cgit v1.2.3 From 38d726d153cfe5efe5fe22d28d36ab382dda3a5c Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 14 Apr 2009 10:10:47 -0400 Subject: jbd: use SWRITE_SYNC_PLUG when writing synchronous revoke records The revoke records must be written using the same way as the rest of the blocks during the commit process; that is, either marked as synchronous writes or as asynchornous writes. 
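In outline, this patch and the jbd2 counterpart above do the same thing: the commit path's write_op is threaded down through journal_write_revoke_records() so the revoke descriptor is submitted with the same synchronicity hint as the rest of the commit blocks. A condensed sketch of the end of that chain (illustrative only; the full change is in the diff below):

    static void flush_descriptor(journal_t *journal, struct journal_head *descriptor,
                                 int offset, int write_op)
    {
            struct buffer_head *bh = jh2bh(descriptor);

            /* ... fill in the revoke header and counts as before ... */
            set_buffer_dirty(bh);
            /* async commit: plain SWRITE; sync commit: plugged synchronous write */
            ll_rw_block((write_op == WRITE) ? SWRITE : SWRITE_SYNC_PLUG, 1, &bh);
    }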
Signed-off-by: "Theodore Ts'o" --- fs/jbd/commit.c | 2 +- fs/jbd/revoke.c | 20 +++++++++++--------- include/linux/jbd.h | 3 ++- 3 files changed, 14 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index a8e8513a78a9..06560c520f49 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -502,7 +502,7 @@ void journal_commit_transaction(journal_t *journal) err = 0; } - journal_write_revoke_records(journal, commit_transaction); + journal_write_revoke_records(journal, commit_transaction, write_op); /* * If we found any dirty or locked buffers, then we should have diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c index c7bd649bbbdc..1b1a06e1c836 100644 --- a/fs/jbd/revoke.c +++ b/fs/jbd/revoke.c @@ -67,6 +67,7 @@ #include #include #include +#include #endif #include @@ -99,8 +100,8 @@ struct jbd_revoke_table_s #ifdef __KERNEL__ static void write_one_revoke_record(journal_t *, transaction_t *, struct journal_head **, int *, - struct jbd_revoke_record_s *); -static void flush_descriptor(journal_t *, struct journal_head *, int); + struct jbd_revoke_record_s *, int); +static void flush_descriptor(journal_t *, struct journal_head *, int, int); #endif /* Utility functions to maintain the revoke table */ @@ -486,7 +487,7 @@ void journal_switch_revoke_table(journal_t *journal) */ void journal_write_revoke_records(journal_t *journal, - transaction_t *transaction) + transaction_t *transaction, int write_op) { struct journal_head *descriptor; struct jbd_revoke_record_s *record; @@ -510,14 +511,14 @@ void journal_write_revoke_records(journal_t *journal, hash_list->next; write_one_revoke_record(journal, transaction, &descriptor, &offset, - record); + record, write_op); count++; list_del(&record->hash); kmem_cache_free(revoke_record_cache, record); } } if (descriptor) - flush_descriptor(journal, descriptor, offset); + flush_descriptor(journal, descriptor, offset, write_op); jbd_debug(1, "Wrote %d revoke records\n", count); } @@ -530,7 +531,8 @@ static void write_one_revoke_record(journal_t *journal, transaction_t *transaction, struct journal_head **descriptorp, int *offsetp, - struct jbd_revoke_record_s *record) + struct jbd_revoke_record_s *record, + int write_op) { struct journal_head *descriptor; int offset; @@ -549,7 +551,7 @@ static void write_one_revoke_record(journal_t *journal, /* Make sure we have a descriptor with space left for the record */ if (descriptor) { if (offset == journal->j_blocksize) { - flush_descriptor(journal, descriptor, offset); + flush_descriptor(journal, descriptor, offset, write_op); descriptor = NULL; } } @@ -586,7 +588,7 @@ static void write_one_revoke_record(journal_t *journal, static void flush_descriptor(journal_t *journal, struct journal_head *descriptor, - int offset) + int offset, int write_op) { journal_revoke_header_t *header; struct buffer_head *bh = jh2bh(descriptor); @@ -601,7 +603,7 @@ static void flush_descriptor(journal_t *journal, set_buffer_jwrite(bh); BUFFER_TRACE(bh, "write"); set_buffer_dirty(bh); - ll_rw_block(SWRITE, 1, &bh); + ll_rw_block((write_op == WRITE) ? 
SWRITE : SWRITE_SYNC_PLUG, 1, &bh); } #endif diff --git a/include/linux/jbd.h b/include/linux/jbd.h index 53ae4399da2d..c2049a04fa0b 100644 --- a/include/linux/jbd.h +++ b/include/linux/jbd.h @@ -978,7 +978,8 @@ extern void journal_destroy_revoke(journal_t *); extern int journal_revoke (handle_t *, unsigned long, struct buffer_head *); extern int journal_cancel_revoke(handle_t *, struct journal_head *); -extern void journal_write_revoke_records(journal_t *, transaction_t *); +extern void journal_write_revoke_records(journal_t *, + transaction_t *, int); /* Recovery revoke support */ extern int journal_set_revoke(journal_t *, unsigned long, tid_t); -- cgit v1.2.3 From 78c5b82ee68207a176ad5ca5eabdb2dbe5cfbfd3 Mon Sep 17 00:00:00 2001 From: Leandro Dorileo Date: Tue, 14 Apr 2009 14:59:51 +0100 Subject: tty: Update some of the USB kernel doc Updates some usb_serial_port members documentation. Signed-off-by: Leandro Dorileo Signed-off-by: Alan Cox Signed-off-by: Linus Torvalds --- include/linux/usb/serial.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb/serial.h b/include/linux/usb/serial.h index b95842542590..625e9e4639c6 100644 --- a/include/linux/usb/serial.h +++ b/include/linux/usb/serial.h @@ -29,7 +29,7 @@ /** * usb_serial_port: structure for the specific ports of a device. * @serial: pointer back to the struct usb_serial owner of this port. - * @tty: pointer to the corresponding tty for this port. + * @port: pointer to the corresponding tty_port for this port. * @lock: spinlock to grab when updating portions of this structure. * @mutex: mutex used to synchronize serial_open() and serial_close() * access for this port. @@ -44,19 +44,22 @@ * @interrupt_out_endpointAddress: endpoint address for the interrupt out pipe * for this port. * @bulk_in_buffer: pointer to the bulk in buffer for this port. + * @bulk_in_size: the size of the bulk_in_buffer, in bytes. * @read_urb: pointer to the bulk in struct urb for this port. * @bulk_in_endpointAddress: endpoint address for the bulk in pipe for this * port. * @bulk_out_buffer: pointer to the bulk out buffer for this port. * @bulk_out_size: the size of the bulk_out_buffer, in bytes. * @write_urb: pointer to the bulk out struct urb for this port. + * @write_urb_busy: port`s writing status * @bulk_out_endpointAddress: endpoint address for the bulk out pipe for this * port. * @write_wait: a wait_queue_head_t used by the port. * @work: work queue entry for the line discipline waking up. - * @open_count: number of times this port has been opened. * @throttled: nonzero if the read urb is inactive to throttle the device * @throttle_req: nonzero if the tty wants to throttle us + * @console: attached usb serial console + * @dev: pointer to the serial device * * This structure is used by the usb-serial core and drivers for the specific * ports of a device. -- cgit v1.2.3 From 8f3d8ba20e67991b531e9c0227dcd1f99271a32c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 7 Apr 2009 19:55:13 +0200 Subject: block: move bio list helpers into bio.h It's used by DM and MD and generally useful, so move the bio list helpers into bio.h. 
Signed-off-by: Christoph Hellwig Acked-by: Alasdair G Kergon Signed-off-by: Jens Axboe --- drivers/md/dm-bio-list.h | 117 -------------------------------------------- drivers/md/dm-delay.c | 2 - drivers/md/dm-mpath.c | 1 - drivers/md/dm-raid1.c | 1 - drivers/md/dm-region-hash.c | 1 - drivers/md/dm-snap.c | 1 - drivers/md/dm.c | 1 - drivers/md/raid1.c | 1 - drivers/md/raid10.c | 1 - include/linux/bio.h | 109 +++++++++++++++++++++++++++++++++++++++++ 10 files changed, 109 insertions(+), 126 deletions(-) delete mode 100644 drivers/md/dm-bio-list.h (limited to 'include/linux') diff --git a/drivers/md/dm-bio-list.h b/drivers/md/dm-bio-list.h deleted file mode 100644 index 345098b4ca77..000000000000 --- a/drivers/md/dm-bio-list.h +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Copyright (C) 2004 Red Hat UK Ltd. - * - * This file is released under the GPL. - */ - -#ifndef DM_BIO_LIST_H -#define DM_BIO_LIST_H - -#include - -#ifdef CONFIG_BLOCK - -struct bio_list { - struct bio *head; - struct bio *tail; -}; - -static inline int bio_list_empty(const struct bio_list *bl) -{ - return bl->head == NULL; -} - -static inline void bio_list_init(struct bio_list *bl) -{ - bl->head = bl->tail = NULL; -} - -#define bio_list_for_each(bio, bl) \ - for (bio = (bl)->head; bio; bio = bio->bi_next) - -static inline unsigned bio_list_size(const struct bio_list *bl) -{ - unsigned sz = 0; - struct bio *bio; - - bio_list_for_each(bio, bl) - sz++; - - return sz; -} - -static inline void bio_list_add(struct bio_list *bl, struct bio *bio) -{ - bio->bi_next = NULL; - - if (bl->tail) - bl->tail->bi_next = bio; - else - bl->head = bio; - - bl->tail = bio; -} - -static inline void bio_list_add_head(struct bio_list *bl, struct bio *bio) -{ - bio->bi_next = bl->head; - - bl->head = bio; - - if (!bl->tail) - bl->tail = bio; -} - -static inline void bio_list_merge(struct bio_list *bl, struct bio_list *bl2) -{ - if (!bl2->head) - return; - - if (bl->tail) - bl->tail->bi_next = bl2->head; - else - bl->head = bl2->head; - - bl->tail = bl2->tail; -} - -static inline void bio_list_merge_head(struct bio_list *bl, - struct bio_list *bl2) -{ - if (!bl2->head) - return; - - if (bl->head) - bl2->tail->bi_next = bl->head; - else - bl->tail = bl2->tail; - - bl->head = bl2->head; -} - -static inline struct bio *bio_list_pop(struct bio_list *bl) -{ - struct bio *bio = bl->head; - - if (bio) { - bl->head = bl->head->bi_next; - if (!bl->head) - bl->tail = NULL; - - bio->bi_next = NULL; - } - - return bio; -} - -static inline struct bio *bio_list_get(struct bio_list *bl) -{ - struct bio *bio = bl->head; - - bl->head = bl->tail = NULL; - - return bio; -} - -#endif /* CONFIG_BLOCK */ -#endif diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index 59ee1b015d2d..559dbb52bc85 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -15,8 +15,6 @@ #include -#include "dm-bio-list.h" - #define DM_MSG_PREFIX "delay" struct delay_c { diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 095f77bf9681..6a386ab4f7eb 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -8,7 +8,6 @@ #include #include "dm-path-selector.h" -#include "dm-bio-list.h" #include "dm-bio-record.h" #include "dm-uevent.h" diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 536ef0bef154..076fbb4e967a 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -5,7 +5,6 @@ * This file is released under the GPL. 
*/ -#include "dm-bio-list.h" #include "dm-bio-record.h" #include diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c index 59f8d9df9e1a..7b899be0b087 100644 --- a/drivers/md/dm-region-hash.c +++ b/drivers/md/dm-region-hash.c @@ -14,7 +14,6 @@ #include #include "dm.h" -#include "dm-bio-list.h" #define DM_MSG_PREFIX "region hash" diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 981a0413068f..d73f17fc7778 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -22,7 +22,6 @@ #include #include "dm-exception-store.h" -#include "dm-bio-list.h" #define DM_MSG_PREFIX "snapshots" diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 8a994be035ba..424f7b048c30 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -6,7 +6,6 @@ */ #include "dm.h" -#include "dm-bio-list.h" #include "dm-uevent.h" #include diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 274b491a11c1..36df9109cde1 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -35,7 +35,6 @@ #include #include #include "md.h" -#include "dm-bio-list.h" #include "raid1.h" #include "bitmap.h" diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index e293d92641ac..81a54f17417e 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -22,7 +22,6 @@ #include #include #include "md.h" -#include "dm-bio-list.h" #include "raid10.h" #include "bitmap.h" diff --git a/include/linux/bio.h b/include/linux/bio.h index b900d2c67d29..b89cf2d82898 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -504,6 +504,115 @@ static inline int bio_has_data(struct bio *bio) return bio && bio->bi_io_vec != NULL; } +/* + * BIO list managment for use by remapping drivers (e.g. DM or MD). + * + * A bio_list anchors a singly-linked list of bios chained through the bi_next + * member of the bio. The bio_list also caches the last list member to allow + * fast access to the tail. 
+ */ +struct bio_list { + struct bio *head; + struct bio *tail; +}; + +static inline int bio_list_empty(const struct bio_list *bl) +{ + return bl->head == NULL; +} + +static inline void bio_list_init(struct bio_list *bl) +{ + bl->head = bl->tail = NULL; +} + +#define bio_list_for_each(bio, bl) \ + for (bio = (bl)->head; bio; bio = bio->bi_next) + +static inline unsigned bio_list_size(const struct bio_list *bl) +{ + unsigned sz = 0; + struct bio *bio; + + bio_list_for_each(bio, bl) + sz++; + + return sz; +} + +static inline void bio_list_add(struct bio_list *bl, struct bio *bio) +{ + bio->bi_next = NULL; + + if (bl->tail) + bl->tail->bi_next = bio; + else + bl->head = bio; + + bl->tail = bio; +} + +static inline void bio_list_add_head(struct bio_list *bl, struct bio *bio) +{ + bio->bi_next = bl->head; + + bl->head = bio; + + if (!bl->tail) + bl->tail = bio; +} + +static inline void bio_list_merge(struct bio_list *bl, struct bio_list *bl2) +{ + if (!bl2->head) + return; + + if (bl->tail) + bl->tail->bi_next = bl2->head; + else + bl->head = bl2->head; + + bl->tail = bl2->tail; +} + +static inline void bio_list_merge_head(struct bio_list *bl, + struct bio_list *bl2) +{ + if (!bl2->head) + return; + + if (bl->head) + bl2->tail->bi_next = bl->head; + else + bl->tail = bl2->tail; + + bl->head = bl2->head; +} + +static inline struct bio *bio_list_pop(struct bio_list *bl) +{ + struct bio *bio = bl->head; + + if (bio) { + bl->head = bl->head->bi_next; + if (!bl->head) + bl->tail = NULL; + + bio->bi_next = NULL; + } + + return bio; +} + +static inline struct bio *bio_list_get(struct bio_list *bl) +{ + struct bio *bio = bl->head; + + bl->head = bl->tail = NULL; + + return bio; +} + #if defined(CONFIG_BLK_DEV_INTEGRITY) #define bip_vec_idx(bip, idx) (&(bip->bip_vec[(idx)])) -- cgit v1.2.3 From 48e70bc18ac81881dedd3aa327c55b924fc41ecf Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 14 Apr 2009 08:19:27 +0200 Subject: Document and move the various READ/WRITE types It's a somewhat twisty maze of hints and behavioural modifiers, try and clear it up a bit with some documentation. Signed-off-by: Jens Axboe --- include/linux/fs.h | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 562d2855cf30..b535aec4406b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -87,6 +87,60 @@ struct inodes_stat_t { */ #define FMODE_NOCMTIME ((__force fmode_t)2048) +/* + * The below are the various read and write types that we support. Some of + * them include behavioral modifiers that send information down to the + * block layer and IO scheduler. Terminology: + * + * The block layer uses device plugging to defer IO a little bit, in + * the hope that we will see more IO very shortly. This increases + * coalescing of adjacent IO and thus reduces the number of IOs we + * have to send to the device. It also allows for better queuing, + * if the IO isn't mergeable. If the caller is going to be waiting + * for the IO, then he must ensure that the device is unplugged so + * that the IO is dispatched to the driver. + * + * All IO is handled async in Linux. This is fine for background + * writes, but for reads or writes that someone waits for completion + * on, we want to notify the block layer and IO scheduler so that they + * know about it. That allows them to make better scheduling + * decisions. So when the below references 'sync' and 'async', it + * is referencing this priority hint. 
+ * + * With that in mind, the available types are: + * + * READ A normal read operation. Device will be plugged. + * READ_SYNC A synchronous read. Device is not plugged, caller can + * immediately wait on this read without caring about + * unplugging. + * READA Used for read-ahead operations. Lower priority, and the + * block layer could (in theory) choose to ignore this + * request if it runs into resource problems. + * WRITE A normal async write. Device will be plugged. + * SWRITE Like WRITE, but a special case for ll_rw_block() that + * tells it to lock the buffer first. Normally a buffer + * must be locked before doing IO. + * WRITE_SYNC_PLUG Synchronous write. Identical to WRITE, but passes down + * the hint that someone will be waiting on this IO + * shortly. The device must still be unplugged explicitly, + * WRITE_SYNC_PLUG does not do this as we could be + * submitting more writes before we actually wait on any + * of them. + * WRITE_SYNC Like WRITE_SYNC_PLUG, but also unplugs the device + * immediately after submission. The write equivalent + * of READ_SYNC. + * WRITE_ODIRECT Special case write for O_DIRECT only. + * SWRITE_SYNC + * SWRITE_SYNC_PLUG Like WRITE_SYNC/WRITE_SYNC_PLUG, but locks the buffer. + * See SWRITE. + * WRITE_BARRIER Like WRITE, but tells the block layer that all + * previously submitted writes must be safely on storage + * before this one is started. Also guarantees that when + * this write is complete, it itself is also safely on + * storage. Prevents reordering of writes on both sides + * of this IO. + * + */ #define RW_MASK 1 #define RWA_MASK 2 #define READ 0 @@ -102,6 +156,11 @@ struct inodes_stat_t { (SWRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE)) #define SWRITE_SYNC (SWRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG)) #define WRITE_BARRIER (WRITE | (1 << BIO_RW_BARRIER)) + +/* + * These aren't really reads or writes, they pass down information about + * parts of device that are now unused by the file system. + */ #define DISCARD_NOBARRIER (1 << BIO_RW_DISCARD) #define DISCARD_BARRIER ((1 << BIO_RW_DISCARD) | (1 << BIO_RW_BARRIER)) -- cgit v1.2.3 From b3c2d2ddd63944ef2a1e4a43077b602288107e01 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 14 Apr 2009 19:48:36 +0200 Subject: splice: split up __splice_from_pipe() Split up __splice_from_pipe() into four helper functions: splice_from_pipe_begin() splice_from_pipe_next() splice_from_pipe_feed() splice_from_pipe_end() splice_from_pipe_next() will wait (if necessary) for more buffers to be added to the pipe. splice_from_pipe_feed() will feed the buffers to the supplied actor and return when there's no more data available (or if all of the requested data has been copied). This is necessary so that implementations can do locking around the non-waiting splice_from_pipe_feed(). This patch should not cause any change in behavior. 
Signed-off-by: Miklos Szeredi Signed-off-by: Jens Axboe --- fs/splice.c | 217 ++++++++++++++++++++++++++++++++----------------- include/linux/splice.h | 10 +++ 2 files changed, 153 insertions(+), 74 deletions(-) (limited to 'include/linux') diff --git a/fs/splice.c b/fs/splice.c index c18aa7e03e2b..fd6b278d447b 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -601,107 +601,176 @@ out: return ret; } +static void wakeup_pipe_writers(struct pipe_inode_info *pipe) +{ + smp_mb(); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); +} + /** - * __splice_from_pipe - splice data from a pipe to given actor + * splice_from_pipe_feed - feed available data from a pipe to a file * @pipe: pipe to splice from * @sd: information to @actor * @actor: handler that splices the data * * Description: - * This function does little more than loop over the pipe and call - * @actor to do the actual moving of a single struct pipe_buffer to - * the desired destination. See pipe_to_file, pipe_to_sendpage, or - * pipe_to_user. + + * This function loops over the pipe and calls @actor to do the + * actual moving of a single struct pipe_buffer to the desired + * destination. It returns when there's no more buffers left in + * the pipe or if the requested number of bytes (@sd->total_len) + * have been copied. It returns a positive number (one) if the + * pipe needs to be filled with more data, zero if the required + * number of bytes have been copied and -errno on error. * + * This, together with splice_from_pipe_{begin,end,next}, may be + * used to implement the functionality of __splice_from_pipe() when + * locking is required around copying the pipe buffers to the + * destination. */ -ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd, - splice_actor *actor) +int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd, + splice_actor *actor) { - int ret, do_wakeup, err; - - ret = 0; - do_wakeup = 0; - - for (;;) { - if (pipe->nrbufs) { - struct pipe_buffer *buf = pipe->bufs + pipe->curbuf; - const struct pipe_buf_operations *ops = buf->ops; + int ret; - sd->len = buf->len; - if (sd->len > sd->total_len) - sd->len = sd->total_len; + while (pipe->nrbufs) { + struct pipe_buffer *buf = pipe->bufs + pipe->curbuf; + const struct pipe_buf_operations *ops = buf->ops; - err = actor(pipe, buf, sd); - if (err <= 0) { - if (!ret && err != -ENODATA) - ret = err; + sd->len = buf->len; + if (sd->len > sd->total_len) + sd->len = sd->total_len; - break; - } + ret = actor(pipe, buf, sd); + if (ret <= 0) { + if (ret == -ENODATA) + ret = 0; + return ret; + } + buf->offset += ret; + buf->len -= ret; - ret += err; - buf->offset += err; - buf->len -= err; + sd->num_spliced += ret; + sd->len -= ret; + sd->pos += ret; + sd->total_len -= ret; - sd->len -= err; - sd->pos += err; - sd->total_len -= err; - if (sd->len) - continue; + if (!buf->len) { + buf->ops = NULL; + ops->release(pipe, buf); + pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1); + pipe->nrbufs--; + if (pipe->inode) + sd->need_wakeup = true; + } - if (!buf->len) { - buf->ops = NULL; - ops->release(pipe, buf); - pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1); - pipe->nrbufs--; - if (pipe->inode) - do_wakeup = 1; - } + if (!sd->total_len) + return 0; + } - if (!sd->total_len) - break; - } + return 1; +} +EXPORT_SYMBOL(splice_from_pipe_feed); - if (pipe->nrbufs) - continue; +/** + * splice_from_pipe_next - wait for some data to splice from + * @pipe: 
pipe to splice from + * @sd: information about the splice operation + * + * Description: + * This function will wait for some data and return a positive + * value (one) if pipe buffers are available. It will return zero + * or -errno if no more data needs to be spliced. + */ +int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd) +{ + while (!pipe->nrbufs) { if (!pipe->writers) - break; - if (!pipe->waiting_writers) { - if (ret) - break; - } + return 0; - if (sd->flags & SPLICE_F_NONBLOCK) { - if (!ret) - ret = -EAGAIN; - break; - } + if (!pipe->waiting_writers && sd->num_spliced) + return 0; - if (signal_pending(current)) { - if (!ret) - ret = -ERESTARTSYS; - break; - } + if (sd->flags & SPLICE_F_NONBLOCK) + return -EAGAIN; - if (do_wakeup) { - smp_mb(); - if (waitqueue_active(&pipe->wait)) - wake_up_interruptible_sync(&pipe->wait); - kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); - do_wakeup = 0; + if (signal_pending(current)) + return -ERESTARTSYS; + + if (sd->need_wakeup) { + wakeup_pipe_writers(pipe); + sd->need_wakeup = false; } pipe_wait(pipe); } - if (do_wakeup) { - smp_mb(); - if (waitqueue_active(&pipe->wait)) - wake_up_interruptible(&pipe->wait); - kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); - } + return 1; +} +EXPORT_SYMBOL(splice_from_pipe_next); - return ret; +/** + * splice_from_pipe_begin - start splicing from pipe + * @pipe: pipe to splice from + * + * Description: + * This function should be called before a loop containing + * splice_from_pipe_next() and splice_from_pipe_feed() to + * initialize the necessary fields of @sd. + */ +void splice_from_pipe_begin(struct splice_desc *sd) +{ + sd->num_spliced = 0; + sd->need_wakeup = false; +} +EXPORT_SYMBOL(splice_from_pipe_begin); + +/** + * splice_from_pipe_end - finish splicing from pipe + * @pipe: pipe to splice from + * @sd: information about the splice operation + * + * Description: + * This function will wake up pipe writers if necessary. It should + * be called after a loop containing splice_from_pipe_next() and + * splice_from_pipe_feed(). + */ +void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd) +{ + if (sd->need_wakeup) + wakeup_pipe_writers(pipe); +} +EXPORT_SYMBOL(splice_from_pipe_end); + +/** + * __splice_from_pipe - splice data from a pipe to given actor + * @pipe: pipe to splice from + * @sd: information to @actor + * @actor: handler that splices the data + * + * Description: + * This function does little more than loop over the pipe and call + * @actor to do the actual moving of a single struct pipe_buffer to + * the desired destination. See pipe_to_file, pipe_to_sendpage, or + * pipe_to_user. + * + */ +ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd, + splice_actor *actor) +{ + int ret; + + splice_from_pipe_begin(sd); + do { + ret = splice_from_pipe_next(pipe, sd); + if (ret > 0) + ret = splice_from_pipe_feed(pipe, sd, actor); + } while (ret > 0); + splice_from_pipe_end(pipe, sd); + + return sd->num_spliced ? 
sd->num_spliced : ret; } EXPORT_SYMBOL(__splice_from_pipe); diff --git a/include/linux/splice.h b/include/linux/splice.h index 528dcb93c2f2..8fc2a635586e 100644 --- a/include/linux/splice.h +++ b/include/linux/splice.h @@ -36,6 +36,8 @@ struct splice_desc { void *data; /* cookie */ } u; loff_t pos; /* file position */ + size_t num_spliced; /* number of bytes already spliced */ + bool need_wakeup; /* need to wake up writer */ }; struct partial_page { @@ -66,6 +68,14 @@ extern ssize_t splice_from_pipe(struct pipe_inode_info *, struct file *, splice_actor *); extern ssize_t __splice_from_pipe(struct pipe_inode_info *, struct splice_desc *, splice_actor *); +extern int splice_from_pipe_feed(struct pipe_inode_info *, struct splice_desc *, + splice_actor *); +extern int splice_from_pipe_next(struct pipe_inode_info *, + struct splice_desc *); +extern void splice_from_pipe_begin(struct splice_desc *); +extern void splice_from_pipe_end(struct pipe_inode_info *, + struct splice_desc *); + extern ssize_t splice_to_pipe(struct pipe_inode_info *, struct splice_pipe_desc *); extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *, -- cgit v1.2.3 From 328eaaba4e41a04c1dc4679d65bea3fee4349d86 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 14 Apr 2009 19:48:39 +0200 Subject: ocfs2: fix i_mutex locking in ocfs2_splice_to_file() Rearrange locking of i_mutex on destination and call to ocfs2_rw_lock() so locks are only held while buffers are copied with the pipe_to_file() actor, and not while waiting for more data on the pipe. Signed-off-by: Miklos Szeredi Signed-off-by: Jens Axboe --- fs/ocfs2/file.c | 94 +++++++++++++++++++++++++++++++++++++++----------- fs/splice.c | 5 +-- include/linux/splice.h | 2 ++ 3 files changed, 79 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 8672b9536039..c2a87c885b73 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1912,6 +1912,22 @@ out_sems: return written ? 
written : ret; } +static int ocfs2_splice_to_file(struct pipe_inode_info *pipe, + struct file *out, + struct splice_desc *sd) +{ + int ret; + + ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, &sd->pos, + sd->total_len, 0, NULL); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + return splice_from_pipe_feed(pipe, sd, pipe_to_file); +} + static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, @@ -1919,38 +1935,76 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, unsigned int flags) { int ret; - struct inode *inode = out->f_path.dentry->d_inode; + struct address_space *mapping = out->f_mapping; + struct inode *inode = mapping->host; + struct splice_desc sd = { + .total_len = len, + .flags = flags, + .pos = *ppos, + .u.file = out, + }; mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", out, pipe, (unsigned int)len, out->f_path.dentry->d_name.len, out->f_path.dentry->d_name.name); - mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); + if (pipe->inode) + mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_PARENT); - ret = ocfs2_rw_lock(inode, 1); - if (ret < 0) { - mlog_errno(ret); - goto out; - } + splice_from_pipe_begin(&sd); + do { + ret = splice_from_pipe_next(pipe, &sd); + if (ret <= 0) + break; - ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0, - NULL); - if (ret < 0) { - mlog_errno(ret); - goto out_unlock; - } + mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); + ret = ocfs2_rw_lock(inode, 1); + if (ret < 0) + mlog_errno(ret); + else { + ret = ocfs2_splice_to_file(pipe, out, &sd); + ocfs2_rw_unlock(inode, 1); + } + mutex_unlock(&inode->i_mutex); + } while (ret > 0); + splice_from_pipe_end(pipe, &sd); - if (pipe->inode) - mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_CHILD); - ret = generic_file_splice_write_nolock(pipe, out, ppos, len, flags); if (pipe->inode) mutex_unlock(&pipe->inode->i_mutex); -out_unlock: - ocfs2_rw_unlock(inode, 1); -out: - mutex_unlock(&inode->i_mutex); + if (sd.num_spliced) + ret = sd.num_spliced; + + if (ret > 0) { + unsigned long nr_pages; + + *ppos += ret; + nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + + /* + * If file or inode is SYNC and we actually wrote some data, + * sync it. + */ + if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { + int err; + + mutex_lock(&inode->i_mutex); + err = ocfs2_rw_lock(inode, 1); + if (err < 0) { + mlog_errno(err); + } else { + err = generic_osync_inode(inode, mapping, + OSYNC_METADATA|OSYNC_DATA); + ocfs2_rw_unlock(inode, 1); + } + mutex_unlock(&inode->i_mutex); + + if (err) + ret = err; + } + balance_dirty_pages_ratelimited_nr(mapping, nr_pages); + } mlog_exit(ret); return ret; diff --git a/fs/splice.c b/fs/splice.c index a1f595b9db40..584b2b7a1dbe 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -555,8 +555,8 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe, * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create * a new page in the output file page cache and fill/dirty that. 
*/ -static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, - struct splice_desc *sd) +int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, + struct splice_desc *sd) { struct file *file = sd->u.file; struct address_space *mapping = file->f_mapping; @@ -600,6 +600,7 @@ static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, out: return ret; } +EXPORT_SYMBOL(pipe_to_file); static void wakeup_pipe_writers(struct pipe_inode_info *pipe) { diff --git a/include/linux/splice.h b/include/linux/splice.h index 8fc2a635586e..5f3faa9d15ae 100644 --- a/include/linux/splice.h +++ b/include/linux/splice.h @@ -75,6 +75,8 @@ extern int splice_from_pipe_next(struct pipe_inode_info *, extern void splice_from_pipe_begin(struct splice_desc *); extern void splice_from_pipe_end(struct pipe_inode_info *, struct splice_desc *); +extern int pipe_to_file(struct pipe_inode_info *, struct pipe_buffer *, + struct splice_desc *); extern ssize_t splice_to_pipe(struct pipe_inode_info *, struct splice_pipe_desc *); -- cgit v1.2.3 From f8cc774ce4844811a55e2352f1443055e3994e28 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 14 Apr 2009 19:48:40 +0200 Subject: splice: remove generic_file_splice_write_nolock() Remove the now unused generic_file_splice_write_nolock() function. It's conceptually broken anyway, because splice may need to wait for pipe events so holding locks across the whole operation is wrong. Signed-off-by: Miklos Szeredi Signed-off-by: Jens Axboe --- fs/splice.c | 59 ------------------------------------------------------ include/linux/fs.h | 2 -- 2 files changed, 61 deletions(-) (limited to 'include/linux') diff --git a/fs/splice.c b/fs/splice.c index 584b2b7a1dbe..128ee36a719b 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -810,65 +810,6 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, return ret; } -/** - * generic_file_splice_write_nolock - generic_file_splice_write without mutexes - * @pipe: pipe info - * @out: file to write to - * @ppos: position in @out - * @len: number of bytes to splice - * @flags: splice modifier flags - * - * Description: - * Will either move or copy pages (determined by @flags options) from - * the given pipe inode to the given file. The caller is responsible - * for acquiring i_mutex on both inodes. - * - */ -ssize_t -generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out, - loff_t *ppos, size_t len, unsigned int flags) -{ - struct address_space *mapping = out->f_mapping; - struct inode *inode = mapping->host; - struct splice_desc sd = { - .total_len = len, - .flags = flags, - .pos = *ppos, - .u.file = out, - }; - ssize_t ret; - int err; - - err = file_remove_suid(out); - if (unlikely(err)) - return err; - - ret = __splice_from_pipe(pipe, &sd, pipe_to_file); - if (ret > 0) { - unsigned long nr_pages; - - *ppos += ret; - nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - - /* - * If file or inode is SYNC and we actually wrote some data, - * sync it. 
- */ - if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { - err = generic_osync_inode(inode, mapping, - OSYNC_METADATA|OSYNC_DATA); - - if (err) - ret = err; - } - balance_dirty_pages_ratelimited_nr(mapping, nr_pages); - } - - return ret; -} - -EXPORT_SYMBOL(generic_file_splice_write_nolock); - /** * generic_file_splice_write - splice data from a pipe to a file * @pipe: pipe info diff --git a/include/linux/fs.h b/include/linux/fs.h index b535aec4406b..907d8f56c6fa 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2209,8 +2209,6 @@ extern ssize_t generic_file_splice_read(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); extern ssize_t generic_file_splice_write(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); -extern ssize_t generic_file_splice_write_nolock(struct pipe_inode_info *, - struct file *, loff_t *, size_t, unsigned int); extern ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out, loff_t *, size_t len, unsigned int flags); extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, -- cgit v1.2.3 From 61e0d47c33cc371f725bcda4a47ae0efe652dba8 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 14 Apr 2009 19:48:41 +0200 Subject: splice: add helpers for locking pipe inode There are lots of sequences like this, especially in splice code: if (pipe->inode) mutex_lock(&pipe->inode->i_mutex); /* do something */ if (pipe->inode) mutex_unlock(&pipe->inode->i_mutex); so introduce helpers which do the conditional locking and unlocking. Also replace the inode_double_lock() call with a pipe_double_lock() helper to avoid spreading the use of this functionality beyond the pipe code. This patch is just a cleanup, and should cause no behavioral changes. Signed-off-by: Miklos Szeredi Signed-off-by: Jens Axboe --- fs/inode.c | 36 ---------------------------------- fs/pipe.c | 42 +++++++++++++++++++++++++++++++++++---- fs/splice.c | 50 ++++++++++++++++++++--------------------------- include/linux/fs.h | 3 --- include/linux/pipe_fs_i.h | 5 +++++ 5 files changed, 64 insertions(+), 72 deletions(-) (limited to 'include/linux') diff --git a/fs/inode.c b/fs/inode.c index d06d6d268de9..6ad14a1cd8c9 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1470,42 +1470,6 @@ static void __wait_on_freeing_inode(struct inode *inode) spin_lock(&inode_lock); } -/* - * We rarely want to lock two inodes that do not have a parent/child - * relationship (such as directory, child inode) simultaneously. The - * vast majority of file systems should be able to get along fine - * without this. Do not use these functions except as a last resort. 
- */ -void inode_double_lock(struct inode *inode1, struct inode *inode2) -{ - if (inode1 == NULL || inode2 == NULL || inode1 == inode2) { - if (inode1) - mutex_lock(&inode1->i_mutex); - else if (inode2) - mutex_lock(&inode2->i_mutex); - return; - } - - if (inode1 < inode2) { - mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); - } else { - mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD); - } -} -EXPORT_SYMBOL(inode_double_lock); - -void inode_double_unlock(struct inode *inode1, struct inode *inode2) -{ - if (inode1) - mutex_unlock(&inode1->i_mutex); - - if (inode2 && inode2 != inode1) - mutex_unlock(&inode2->i_mutex); -} -EXPORT_SYMBOL(inode_double_unlock); - static __initdata unsigned long ihash_entries; static int __init set_ihash_entries(char *str) { diff --git a/fs/pipe.c b/fs/pipe.c index 4af7aa521813..13414ec45b8d 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -37,6 +37,42 @@ * -- Manfred Spraul 2002-05-09 */ +static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass) +{ + if (pipe->inode) + mutex_lock_nested(&pipe->inode->i_mutex, subclass); +} + +void pipe_lock(struct pipe_inode_info *pipe) +{ + /* + * pipe_lock() nests non-pipe inode locks (for writing to a file) + */ + pipe_lock_nested(pipe, I_MUTEX_PARENT); +} +EXPORT_SYMBOL(pipe_lock); + +void pipe_unlock(struct pipe_inode_info *pipe) +{ + if (pipe->inode) + mutex_unlock(&pipe->inode->i_mutex); +} +EXPORT_SYMBOL(pipe_unlock); + +void pipe_double_lock(struct pipe_inode_info *pipe1, + struct pipe_inode_info *pipe2) +{ + BUG_ON(pipe1 == pipe2); + + if (pipe1 < pipe2) { + pipe_lock_nested(pipe1, I_MUTEX_PARENT); + pipe_lock_nested(pipe2, I_MUTEX_CHILD); + } else { + pipe_lock_nested(pipe2, I_MUTEX_CHILD); + pipe_lock_nested(pipe1, I_MUTEX_PARENT); + } +} + /* Drop the inode semaphore and wait for a pipe event, atomically */ void pipe_wait(struct pipe_inode_info *pipe) { @@ -47,12 +83,10 @@ void pipe_wait(struct pipe_inode_info *pipe) * is considered a noninteractive wait: */ prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE); - if (pipe->inode) - mutex_unlock(&pipe->inode->i_mutex); + pipe_unlock(pipe); schedule(); finish_wait(&pipe->wait, &wait); - if (pipe->inode) - mutex_lock(&pipe->inode->i_mutex); + pipe_lock(pipe); } static int diff --git a/fs/splice.c b/fs/splice.c index 128ee36a719b..5384a90665d0 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -182,8 +182,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, do_wakeup = 0; page_nr = 0; - if (pipe->inode) - mutex_lock(&pipe->inode->i_mutex); + pipe_lock(pipe); for (;;) { if (!pipe->readers) { @@ -245,15 +244,13 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, pipe->waiting_writers--; } - if (pipe->inode) { - mutex_unlock(&pipe->inode->i_mutex); + pipe_unlock(pipe); - if (do_wakeup) { - smp_mb(); - if (waitqueue_active(&pipe->wait)) - wake_up_interruptible(&pipe->wait); - kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); - } + if (do_wakeup) { + smp_mb(); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); } while (page_nr < spd_pages) @@ -801,11 +798,9 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, .u.file = out, }; - if (pipe->inode) - mutex_lock(&pipe->inode->i_mutex); + pipe_lock(pipe); ret = __splice_from_pipe(pipe, &sd, actor); - if (pipe->inode) - mutex_unlock(&pipe->inode->i_mutex); + pipe_unlock(pipe); return ret; } @@ -837,8 
+832,7 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, }; ssize_t ret; - if (pipe->inode) - mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_PARENT); + pipe_lock(pipe); splice_from_pipe_begin(&sd); do { @@ -854,8 +848,7 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, } while (ret > 0); splice_from_pipe_end(pipe, &sd); - if (pipe->inode) - mutex_unlock(&pipe->inode->i_mutex); + pipe_unlock(pipe); if (sd.num_spliced) ret = sd.num_spliced; @@ -1348,8 +1341,7 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *iov, if (!pipe) return -EBADF; - if (pipe->inode) - mutex_lock(&pipe->inode->i_mutex); + pipe_lock(pipe); error = ret = 0; while (nr_segs) { @@ -1404,8 +1396,7 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *iov, iov++; } - if (pipe->inode) - mutex_unlock(&pipe->inode->i_mutex); + pipe_unlock(pipe); if (!ret) ret = error; @@ -1533,7 +1524,7 @@ static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) return 0; ret = 0; - mutex_lock(&pipe->inode->i_mutex); + pipe_lock(pipe); while (!pipe->nrbufs) { if (signal_pending(current)) { @@ -1551,7 +1542,7 @@ static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) pipe_wait(pipe); } - mutex_unlock(&pipe->inode->i_mutex); + pipe_unlock(pipe); return ret; } @@ -1571,7 +1562,7 @@ static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) return 0; ret = 0; - mutex_lock(&pipe->inode->i_mutex); + pipe_lock(pipe); while (pipe->nrbufs >= PIPE_BUFFERS) { if (!pipe->readers) { @@ -1592,7 +1583,7 @@ static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) pipe->waiting_writers--; } - mutex_unlock(&pipe->inode->i_mutex); + pipe_unlock(pipe); return ret; } @@ -1608,10 +1599,10 @@ static int link_pipe(struct pipe_inode_info *ipipe, /* * Potential ABBA deadlock, work around it by ordering lock - * grabbing by inode address. Otherwise two different processes + * grabbing by pipe info address. Otherwise two different processes * could deadlock (one doing tee from A -> B, the other from B -> A). */ - inode_double_lock(ipipe->inode, opipe->inode); + pipe_double_lock(ipipe, opipe); do { if (!opipe->readers) { @@ -1662,7 +1653,8 @@ static int link_pipe(struct pipe_inode_info *ipipe, if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK)) ret = -EAGAIN; - inode_double_unlock(ipipe->inode, opipe->inode); + pipe_unlock(ipipe); + pipe_unlock(opipe); /* * If we put data in the output pipe, wakeup any potential readers. diff --git a/include/linux/fs.h b/include/linux/fs.h index 907d8f56c6fa..e766be0d4329 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -797,9 +797,6 @@ enum inode_i_mutex_lock_class I_MUTEX_QUOTA }; -extern void inode_double_lock(struct inode *inode1, struct inode *inode2); -extern void inode_double_unlock(struct inode *inode1, struct inode *inode2); - /* * NOTE: in a 32bit arch with a preemptable kernel and * an UP compile the i_size_read/write must be atomic diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 8e4120285f72..c8f038554e80 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -134,6 +134,11 @@ struct pipe_buf_operations { memory allocation, whereas PIPE_BUF makes atomicity guarantees. 
*/ #define PIPE_SIZE PAGE_SIZE +/* Pipe lock and unlock operations */ +void pipe_lock(struct pipe_inode_info *); +void pipe_unlock(struct pipe_inode_info *); +void pipe_double_lock(struct pipe_inode_info *, struct pipe_inode_info *); + /* Drop the inode semaphore and wait for a pipe event, atomically */ void pipe_wait(struct pipe_inode_info *pipe); -- cgit v1.2.3 From 35c80d5f400f68f2eccf3069d1c068e154bde9c9 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 15 Apr 2009 13:22:38 -0400 Subject: Add block_write_full_page_endio for passing endio handler block_write_full_page doesn't allow the caller to control what happens when the IO is over. This adds a new call named block_write_full_page_endio so the buffer head end_io handler can be provided by the caller. This will be used by the ext3 data=guarded mode to do i_size updates in a workqueue based end_io handler. end_buffer_async_write is also exported so it can be called to do the dirty work of managing page writeback for the higher level end_io handler. Signed-off-by: Chris Mason Acked-by: Theodore Tso Acked-by: Jan Kara Signed-off-by: Linus Torvalds --- fs/buffer.c | 45 ++++++++++++++++++++++++++++++++++----------- include/linux/buffer_head.h | 3 +++ 2 files changed, 37 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/fs/buffer.c b/fs/buffer.c index ff8bb1f2333a..b3e5be7514f5 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -360,7 +360,7 @@ still_busy: * Completion handler for block_write_full_page() - pages which are unlocked * during I/O, and which have PageWriteback cleared upon I/O completion. */ -static void end_buffer_async_write(struct buffer_head *bh, int uptodate) +void end_buffer_async_write(struct buffer_head *bh, int uptodate) { char b[BDEVNAME_SIZE]; unsigned long flags; @@ -438,11 +438,17 @@ static void mark_buffer_async_read(struct buffer_head *bh) set_buffer_async_read(bh); } -void mark_buffer_async_write(struct buffer_head *bh) +void mark_buffer_async_write_endio(struct buffer_head *bh, + bh_end_io_t *handler) { - bh->b_end_io = end_buffer_async_write; + bh->b_end_io = handler; set_buffer_async_write(bh); } + +void mark_buffer_async_write(struct buffer_head *bh) +{ + mark_buffer_async_write_endio(bh, end_buffer_async_write); +} EXPORT_SYMBOL(mark_buffer_async_write); @@ -1615,7 +1621,8 @@ EXPORT_SYMBOL(unmap_underlying_metadata); * unplugging the device queue. 
*/ static int __block_write_full_page(struct inode *inode, struct page *page, - get_block_t *get_block, struct writeback_control *wbc) + get_block_t *get_block, struct writeback_control *wbc, + bh_end_io_t *handler) { int err; sector_t block; @@ -1700,7 +1707,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page, continue; } if (test_clear_buffer_dirty(bh)) { - mark_buffer_async_write(bh); + mark_buffer_async_write_endio(bh, handler); } else { unlock_buffer(bh); } @@ -1753,7 +1760,7 @@ recover: if (buffer_mapped(bh) && buffer_dirty(bh) && !buffer_delay(bh)) { lock_buffer(bh); - mark_buffer_async_write(bh); + mark_buffer_async_write_endio(bh, handler); } else { /* * The buffer may have been set dirty during @@ -2679,7 +2686,8 @@ int nobh_writepage(struct page *page, get_block_t *get_block, out: ret = mpage_writepage(page, get_block, wbc); if (ret == -EAGAIN) - ret = __block_write_full_page(inode, page, get_block, wbc); + ret = __block_write_full_page(inode, page, get_block, wbc, + end_buffer_async_write); return ret; } EXPORT_SYMBOL(nobh_writepage); @@ -2837,9 +2845,10 @@ out: /* * The generic ->writepage function for buffer-backed address_spaces + * this form passes in the end_io handler used to finish the IO. */ -int block_write_full_page(struct page *page, get_block_t *get_block, - struct writeback_control *wbc) +int block_write_full_page_endio(struct page *page, get_block_t *get_block, + struct writeback_control *wbc, bh_end_io_t *handler) { struct inode * const inode = page->mapping->host; loff_t i_size = i_size_read(inode); @@ -2848,7 +2857,8 @@ int block_write_full_page(struct page *page, get_block_t *get_block, /* Is the page fully inside i_size? */ if (page->index < end_index) - return __block_write_full_page(inode, page, get_block, wbc); + return __block_write_full_page(inode, page, get_block, wbc, + handler); /* Is the page fully outside i_size? (truncate in progress) */ offset = i_size & (PAGE_CACHE_SIZE-1); @@ -2871,9 +2881,20 @@ int block_write_full_page(struct page *page, get_block_t *get_block, * writes to that region are not written out to the file." 
*/ zero_user_segment(page, offset, PAGE_CACHE_SIZE); - return __block_write_full_page(inode, page, get_block, wbc); + return __block_write_full_page(inode, page, get_block, wbc, handler); } +/* + * The generic ->writepage function for buffer-backed address_spaces + */ +int block_write_full_page(struct page *page, get_block_t *get_block, + struct writeback_control *wbc) +{ + return block_write_full_page_endio(page, get_block, wbc, + end_buffer_async_write); +} + + sector_t generic_block_bmap(struct address_space *mapping, sector_t block, get_block_t *get_block) { @@ -3342,9 +3363,11 @@ EXPORT_SYMBOL(block_read_full_page); EXPORT_SYMBOL(block_sync_page); EXPORT_SYMBOL(block_truncate_page); EXPORT_SYMBOL(block_write_full_page); +EXPORT_SYMBOL(block_write_full_page_endio); EXPORT_SYMBOL(cont_write_begin); EXPORT_SYMBOL(end_buffer_read_sync); EXPORT_SYMBOL(end_buffer_write_sync); +EXPORT_SYMBOL(end_buffer_async_write); EXPORT_SYMBOL(file_fsync); EXPORT_SYMBOL(generic_block_bmap); EXPORT_SYMBOL(generic_cont_expand_simple); diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 7b73bb8f1970..16ed0284d780 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -155,6 +155,7 @@ void create_empty_buffers(struct page *, unsigned long, unsigned long b_state); void end_buffer_read_sync(struct buffer_head *bh, int uptodate); void end_buffer_write_sync(struct buffer_head *bh, int uptodate); +void end_buffer_async_write(struct buffer_head *bh, int uptodate); /* Things to do with buffers at mapping->private_list */ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode); @@ -197,6 +198,8 @@ extern int buffer_heads_over_limit; void block_invalidatepage(struct page *page, unsigned long offset); int block_write_full_page(struct page *page, get_block_t *get_block, struct writeback_control *wbc); +int block_write_full_page_endio(struct page *page, get_block_t *get_block, + struct writeback_control *wbc, bh_end_io_t *handler); int block_read_full_page(struct page*, get_block_t*); int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc, unsigned long from); -- cgit v1.2.3 From 412401029259b1ad67559cec93bcc7ee4a9551aa Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Thu, 16 Apr 2009 09:58:44 -0600 Subject: powerpc/5200: Bring the legacy fsl_spi_platform_data hooks back In commit 364fdbc00fbdd409ade63500710123fe323aa164 ("spi_mpc83xx: rework chip selects handling"), I merged activate_cs and deactivate_cs hooks into cs_control, but I overlooked that mpc52xx_psc_spi driver is using these hooks too. And that resulted in the following build failure: CC drivers/spi/mpc52xx_psc_spi.o drivers/spi/mpc52xx_psc_spi.c: In function 'mpc52xx_psc_spi_do_probe': drivers/spi/mpc52xx_psc_spi.c:398: error: 'struct fsl_spi_platform_data' has no member named 'activate_cs' drivers/spi/mpc52xx_psc_spi.c:399: error: 'struct fsl_spi_platform_data' has no member named 'deactivate_cs' make[2]: *** [drivers/spi/mpc52xx_psc_spi.o] Error 1 This patch simply adds the legacy hooks back for 2.6.30, and for 2.6.31 we'll convert the driver to ->cs_control. 
Reported-by: Subrata Modak Signed-off-by: Anton Vorontsov Signed-off-by: Grant Likely --- include/linux/fsl_devices.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fsl_devices.h b/include/linux/fsl_devices.h index f2a78b5e8b55..0cde1806cfab 100644 --- a/include/linux/fsl_devices.h +++ b/include/linux/fsl_devices.h @@ -83,6 +83,10 @@ struct fsl_spi_platform_data { u16 max_chipselect; void (*cs_control)(struct spi_device *spi, bool on); u32 sysclk; + + /* Legacy hooks, used by mpc52xx_psc_spi driver. */ + void (*activate_cs)(u8 cs, u8 polarity); + void (*deactivate_cs)(u8 cs, u8 polarity); }; struct mpc8xx_pcmcia_ops { -- cgit v1.2.3 From e3cf95dd6d352954b663d2934110d6e30af2406d Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Thu, 9 Apr 2009 17:31:17 +0100 Subject: ata: Report 16/32bit PIO as best we can The legacy old IDE ioctl API for this is a bit primitive so we try and map stuff sensibly onto it. - Set PIO over DMA devices to report 32bit - Add ability to change the PIO32 settings if the controller permits it - Add that functionality into the sff drivers - Add that functionality into the VLB legacy driver - Turn on the 32bit PIO on the ninja32 and add support there Signed-off-by: Alan Cox Signed-off-by: Jeff Garzik --- drivers/ata/libata-scsi.c | 30 ++++++++++++++++++++++++++---- drivers/ata/libata-sff.c | 27 +++++++++++++++++++++++++++ drivers/ata/pata_legacy.c | 33 ++++++++++++++++++++------------- drivers/ata/pata_ninja32.c | 4 +++- include/linux/libata.h | 8 ++++++++ 5 files changed, 84 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index b9747fa59e54..2733b0c90b75 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -647,23 +647,45 @@ int ata_task_ioctl(struct scsi_device *scsidev, void __user *arg) return rc; } +static int ata_ioc32(struct ata_port *ap) +{ + if (ap->flags & ATA_FLAG_PIO_DMA) + return 1; + if (ap->pflags & ATA_PFLAG_PIO32) + return 1; + return 0; +} + int ata_sas_scsi_ioctl(struct ata_port *ap, struct scsi_device *scsidev, int cmd, void __user *arg) { int val = -EINVAL, rc = -EINVAL; + unsigned long flags; switch (cmd) { case ATA_IOC_GET_IO32: - val = 0; + spin_lock_irqsave(ap->lock, flags); + val = ata_ioc32(ap); + spin_unlock_irqrestore(ap->lock, flags); if (copy_to_user(arg, &val, 1)) return -EFAULT; return 0; case ATA_IOC_SET_IO32: val = (unsigned long) arg; - if (val != 0) - return -EINVAL; - return 0; + rc = 0; + spin_lock_irqsave(ap->lock, flags); + if (ap->pflags & ATA_PFLAG_PIO32CHANGE) { + if (val) + ap->pflags |= ATA_PFLAG_PIO32; + else + ap->pflags &= ~ATA_PFLAG_PIO32; + } else { + if (val != ata_ioc32(ap)) + rc = -EINVAL; + } + spin_unlock_irqrestore(ap->lock, flags); + return rc; case HDIO_GET_IDENTITY: return ata_get_identity(ap, scsidev, arg); diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c index 8332e97a9de3..bb18415d3d63 100644 --- a/drivers/ata/libata-sff.c +++ b/drivers/ata/libata-sff.c @@ -87,6 +87,7 @@ const struct ata_port_operations ata_bmdma32_port_ops = { .inherits = &ata_bmdma_port_ops, .sff_data_xfer = ata_sff_data_xfer32, + .port_start = ata_sff_port_start32, }; EXPORT_SYMBOL_GPL(ata_bmdma32_port_ops); @@ -769,6 +770,9 @@ unsigned int ata_sff_data_xfer32(struct ata_device *dev, unsigned char *buf, void __iomem *data_addr = ap->ioaddr.data_addr; unsigned int words = buflen >> 2; int slop = buflen & 3; + + if (!(ap->pflags & ATA_PFLAG_PIO32)) + return ata_sff_data_xfer(dev, 
buf, buflen, rw); /* Transfer multiple of 4 bytes */ if (rw == READ) @@ -2401,6 +2405,29 @@ int ata_sff_port_start(struct ata_port *ap) } EXPORT_SYMBOL_GPL(ata_sff_port_start); +/** + * ata_sff_port_start32 - Set port up for dma. + * @ap: Port to initialize + * + * Called just after data structures for each port are + * initialized. Allocates space for PRD table if the device + * is DMA capable SFF. + * + * May be used as the port_start() entry in ata_port_operations for + * devices that are capable of 32bit PIO. + * + * LOCKING: + * Inherited from caller. + */ +int ata_sff_port_start32(struct ata_port *ap) +{ + ap->pflags |= ATA_PFLAG_PIO32 | ATA_PFLAG_PIO32CHANGE; + if (ap->ioaddr.bmdma_addr) + return ata_port_start(ap); + return 0; +} +EXPORT_SYMBOL_GPL(ata_sff_port_start32); + /** * ata_sff_std_ports - initialize ioaddr with standard port offsets. * @ioaddr: IO address structure to be initialized diff --git a/drivers/ata/pata_legacy.c b/drivers/ata/pata_legacy.c index 0c6dde80417b..6f985bed8cbb 100644 --- a/drivers/ata/pata_legacy.c +++ b/drivers/ata/pata_legacy.c @@ -108,6 +108,7 @@ struct legacy_controller { struct ata_port_operations *ops; unsigned int pio_mask; unsigned int flags; + unsigned int pflags; int (*setup)(struct platform_device *, struct legacy_probe *probe, struct legacy_data *data); }; @@ -285,7 +286,8 @@ static unsigned int pdc_data_xfer_vlb(struct ata_device *dev, { int slop = buflen & 3; /* 32bit I/O capable *and* we need to write a whole number of dwords */ - if (ata_id_has_dword_io(dev->id) && (slop == 0 || slop == 3)) { + if (ata_id_has_dword_io(dev->id) && (slop == 0 || slop == 3) + && (ap->pflags & ATA_PFLAG_PIO32)) { struct ata_port *ap = dev->link->ap; unsigned long flags; @@ -736,7 +738,8 @@ static unsigned int vlb32_data_xfer(struct ata_device *adev, unsigned char *buf, struct ata_port *ap = adev->link->ap; int slop = buflen & 3; - if (ata_id_has_dword_io(adev->id) && (slop == 0 || slop == 3)) { + if (ata_id_has_dword_io(adev->id) && (slop == 0 || slop == 3) + && (ap->pflags & ATA_PFLAG_PIO32)) { if (rw == WRITE) iowrite32_rep(ap->ioaddr.data_addr, buf, buflen >> 2); else @@ -858,27 +861,30 @@ static struct ata_port_operations winbond_port_ops = { static struct legacy_controller controllers[] = { {"BIOS", &legacy_port_ops, 0x1F, - ATA_FLAG_NO_IORDY, NULL }, + ATA_FLAG_NO_IORDY, 0, NULL }, {"Snooping", &simple_port_ops, 0x1F, - 0 , NULL }, + 0, 0, NULL }, {"PDC20230", &pdc20230_port_ops, 0x7, - ATA_FLAG_NO_IORDY, NULL }, + ATA_FLAG_NO_IORDY, + ATA_PFLAG_PIO32 | ATA_PFLAG_PIO32_CHANGE, NULL }, {"HT6560A", &ht6560a_port_ops, 0x07, - ATA_FLAG_NO_IORDY, NULL }, + ATA_FLAG_NO_IORDY, 0, NULL }, {"HT6560B", &ht6560b_port_ops, 0x1F, - ATA_FLAG_NO_IORDY, NULL }, + ATA_FLAG_NO_IORDY, 0, NULL }, {"OPTI82C611A", &opti82c611a_port_ops, 0x0F, - 0 , NULL }, + 0, 0, NULL }, {"OPTI82C46X", &opti82c46x_port_ops, 0x0F, - 0 , NULL }, + 0, 0, NULL }, {"QDI6500", &qdi6500_port_ops, 0x07, - ATA_FLAG_NO_IORDY, qdi_port }, + ATA_FLAG_NO_IORDY, + ATA_PFLAG_PIO32 | ATA_PFLAG_PIO32_CHANGE, qdi_port }, {"QDI6580", &qdi6580_port_ops, 0x1F, - 0 , qdi_port }, + 0, ATA_PFLAG_PIO32 | ATA_PFLAG_PIO32_CHANGE, qdi_port }, {"QDI6580DP", &qdi6580dp_port_ops, 0x1F, - 0 , qdi_port }, + 0, ATA_PFLAG_PIO32 | ATA_PFLAG_PIO32_CHANGE, qdi_port }, {"W83759A", &winbond_port_ops, 0x1F, - 0 , winbond_port } + 0, ATA_PFLAG_PIO32 | ATA_PFLAG_PIO32_CHANGE, + winbond_port } }; /** @@ -1008,6 +1014,7 @@ static __init int legacy_init_one(struct legacy_probe *probe) ap->ops = ops; ap->pio_mask = pio_modes; 
ap->flags |= ATA_FLAG_SLAVE_POSS | iordy; + ap->pflags |= controller->pflags; ap->ioaddr.cmd_addr = io_addr; ap->ioaddr.altstatus_addr = ctrl_addr; ap->ioaddr.ctl_addr = ctrl_addr; diff --git a/drivers/ata/pata_ninja32.c b/drivers/ata/pata_ninja32.c index 0fb6b1b1e634..dd53a66b19e3 100644 --- a/drivers/ata/pata_ninja32.c +++ b/drivers/ata/pata_ninja32.c @@ -44,7 +44,7 @@ #include #define DRV_NAME "pata_ninja32" -#define DRV_VERSION "0.1.3" +#define DRV_VERSION "0.1.5" /** @@ -86,6 +86,7 @@ static struct ata_port_operations ninja32_port_ops = { .sff_dev_select = ninja32_dev_select, .cable_detect = ata_cable_40wire, .set_piomode = ninja32_set_piomode, + .sff_data_xfer = ata_sff_data_xfer32 }; static void ninja32_program(void __iomem *base) @@ -144,6 +145,7 @@ static int ninja32_init_one(struct pci_dev *dev, const struct pci_device_id *id) ap->ioaddr.altstatus_addr = base + 0x1E; ap->ioaddr.bmdma_addr = base; ata_sff_std_ports(&ap->ioaddr); + ap->pflags = ATA_PFLAG_PIO32 | ATA_PFLAG_PIO32CHANGE; ninja32_program(base); /* FIXME: Should we disable them at remove ? */ diff --git a/include/linux/libata.h b/include/linux/libata.h index b450a2628855..3d501db36a26 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -209,6 +209,7 @@ enum { /* bits 24:31 of ap->flags are reserved for LLD specific flags */ + /* struct ata_port pflags */ ATA_PFLAG_EH_PENDING = (1 << 0), /* EH pending */ ATA_PFLAG_EH_IN_PROGRESS = (1 << 1), /* EH in progress */ @@ -225,6 +226,9 @@ enum { ATA_PFLAG_PM_PENDING = (1 << 18), /* PM operation pending */ ATA_PFLAG_INIT_GTM_VALID = (1 << 19), /* initial gtm data valid */ + ATA_PFLAG_PIO32 = (1 << 20), /* 32bit PIO */ + ATA_PFLAG_PIO32CHANGE = (1 << 21), /* 32bit PIO can be turned on/off */ + /* struct ata_queued_cmd flags */ ATA_QCFLAG_ACTIVE = (1 << 0), /* cmd not yet ack'd to scsi lyer */ ATA_QCFLAG_DMAMAP = (1 << 1), /* SG table is DMA mapped */ @@ -689,7 +693,10 @@ struct ata_port { struct Scsi_Host *scsi_host; /* our co-allocated scsi host */ struct ata_port_operations *ops; spinlock_t *lock; + /* Flags owned by the EH context. Only EH should touch these once the + port is active */ unsigned long flags; /* ATA_FLAG_xxx */ + /* Flags that change dynamically, protected by ap->lock */ unsigned int pflags; /* ATA_PFLAG_xxx */ unsigned int print_id; /* user visible unique port ID */ unsigned int port_no; /* 0 based port no. inside the host */ @@ -1595,6 +1602,7 @@ extern void ata_sff_drain_fifo(struct ata_queued_cmd *qc); extern void ata_sff_error_handler(struct ata_port *ap); extern void ata_sff_post_internal_cmd(struct ata_queued_cmd *qc); extern int ata_sff_port_start(struct ata_port *ap); +extern int ata_sff_port_start32(struct ata_port *ap); extern void ata_sff_std_ports(struct ata_ioports *ioaddr); extern unsigned long ata_bmdma_mode_filter(struct ata_device *dev, unsigned long xfer_mask); -- cgit v1.2.3 From 13977091a988fb0d21821c2221ddc920eba36b79 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Mon, 30 Mar 2009 14:37:25 -0700 Subject: Driver Core: early platform driver V3 of the early platform driver implementation. Platform drivers are great for embedded platforms because we can separate driver configuration from the actual driver. So base addresses, interrupts and other configuration can be kept with the processor or board code, and the platform driver can be reused by many different platforms. For early devices we have nothing today. For instance, to configure early timers and early serial ports we cannot use platform devices. 
This because the setup order during boot. Timers are needed before the platform driver core code is available. The same goes for early printk support. Early in this case means before initcalls. These early drivers today have their configuration either hard coded or they receive it using some special configuration method. This is working quite well, but if we want to support both regular kernel modules and early devices then we need to have two ways of configuring the same driver. A single way would be better. The early platform driver patch is basically a set of functions that allow drivers to register themselves and architecture code to locate them and probe. Registration happens through early_param(). The time for the probe is decided by the architecture code. See Documentation/driver-model/platform.txt for more details. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Magnus Damm Cc: Paul Mundt Cc: Kay Sievers Cc: David Brownell Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- Documentation/driver-model/platform.txt | 59 ++++++++ drivers/base/platform.c | 239 ++++++++++++++++++++++++++++++++ include/linux/init.h | 1 + include/linux/platform_device.h | 42 ++++++ init/main.c | 7 +- 5 files changed, 347 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/Documentation/driver-model/platform.txt b/Documentation/driver-model/platform.txt index 83009fdcbbc8..2e2c2ea90ceb 100644 --- a/Documentation/driver-model/platform.txt +++ b/Documentation/driver-model/platform.txt @@ -169,3 +169,62 @@ three different ways to find such a match: be probed later if another device registers. (Which is OK, since this interface is only for use with non-hotpluggable devices.) + +Early Platform Devices and Drivers +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The early platform interfaces provide platform data to platform device +drivers early on during the system boot. The code is built on top of the +early_param() command line parsing and can be executed very early on. + +Example: "earlyprintk" class early serial console in 6 steps + +1. Registering early platform device data +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The architecture code registers platform device data using the function +early_platform_add_devices(). In the case of early serial console this +should be hardware configuration for the serial port. Devices registered +at this point will later on be matched against early platform drivers. + +2. Parsing kernel command line +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The architecture code calls parse_early_param() to parse the kernel +command line. This will execute all matching early_param() callbacks. +User specified early platform devices will be registered at this point. +For the early serial console case the user can specify port on the +kernel command line as "earlyprintk=serial.0" where "earlyprintk" is +the class string, "serial" is the name of the platfrom driver and +0 is the platform device id. If the id is -1 then the dot and the +id can be omitted. + +3. Installing early platform drivers belonging to a certain class +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The architecture code may optionally force registration of all early +platform drivers belonging to a certain class using the function +early_platform_driver_register_all(). User specified devices from +step 2 have priority over these. 
This step is omitted by the serial +driver example since the early serial driver code should be disabled +unless the user has specified port on the kernel command line. + +4. Early platform driver registration +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Compiled-in platform drivers making use of early_platform_init() are +automatically registered during step 2 or 3. The serial driver example +should use early_platform_init("earlyprintk", &platform_driver). + +5. Probing of early platform drivers belonging to a certain class +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The architecture code calls early_platform_driver_probe() to match +registered early platform devices associated with a certain class with +registered early platform drivers. Matched devices will get probed(). +This step can be executed at any point during the early boot. As soon +as possible may be good for the serial port case. + +6. Inside the early platform driver probe() +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The driver code needs to take special care during early boot, especially +when it comes to memory allocation and interrupt registration. The code +in the probe() function can use is_early_platform_device() to check if +it is called at early platform device or at the regular platform device +time. The early serial driver performs register_console() at this point. + +For further information, see . diff --git a/drivers/base/platform.c b/drivers/base/platform.c index d2198f64ad4e..b5b6c973a2e0 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -990,6 +990,8 @@ int __init platform_bus_init(void) { int error; + early_platform_cleanup(); + error = device_register(&platform_bus); if (error) return error; @@ -1020,3 +1022,240 @@ u64 dma_get_required_mask(struct device *dev) } EXPORT_SYMBOL_GPL(dma_get_required_mask); #endif + +static __initdata LIST_HEAD(early_platform_driver_list); +static __initdata LIST_HEAD(early_platform_device_list); + +/** + * early_platform_driver_register + * @edrv: early_platform driver structure + * @buf: string passed from early_param() + */ +int __init early_platform_driver_register(struct early_platform_driver *epdrv, + char *buf) +{ + unsigned long index; + int n; + + /* Simply add the driver to the end of the global list. + * Drivers will by default be put on the list in compiled-in order. + */ + if (!epdrv->list.next) { + INIT_LIST_HEAD(&epdrv->list); + list_add_tail(&epdrv->list, &early_platform_driver_list); + } + + /* If the user has specified device then make sure the driver + * gets prioritized. The driver of the last device specified on + * command line will be put first on the list. + */ + n = strlen(epdrv->pdrv->driver.name); + if (buf && !strncmp(buf, epdrv->pdrv->driver.name, n)) { + list_move(&epdrv->list, &early_platform_driver_list); + + if (!strcmp(buf, epdrv->pdrv->driver.name)) + epdrv->requested_id = -1; + else if (buf[n] == '.' 
&& strict_strtoul(&buf[n + 1], 10, + &index) == 0) + epdrv->requested_id = index; + else + epdrv->requested_id = EARLY_PLATFORM_ID_ERROR; + } + + return 0; +} + +/** + * early_platform_add_devices - add a numbers of early platform devices + * @devs: array of early platform devices to add + * @num: number of early platform devices in array + */ +void __init early_platform_add_devices(struct platform_device **devs, int num) +{ + struct device *dev; + int i; + + /* simply add the devices to list */ + for (i = 0; i < num; i++) { + dev = &devs[i]->dev; + + if (!dev->devres_head.next) { + INIT_LIST_HEAD(&dev->devres_head); + list_add_tail(&dev->devres_head, + &early_platform_device_list); + } + } +} + +/** + * early_platform_driver_register_all + * @class_str: string to identify early platform driver class + */ +void __init early_platform_driver_register_all(char *class_str) +{ + /* The "class_str" parameter may or may not be present on the kernel + * command line. If it is present then there may be more than one + * matching parameter. + * + * Since we register our early platform drivers using early_param() + * we need to make sure that they also get registered in the case + * when the parameter is missing from the kernel command line. + * + * We use parse_early_options() to make sure the early_param() gets + * called at least once. The early_param() may be called more than + * once since the name of the preferred device may be specified on + * the kernel command line. early_platform_driver_register() handles + * this case for us. + */ + parse_early_options(class_str); +} + +/** + * early_platform_match + * @edrv: early platform driver structure + * @id: id to match against + */ +static __init struct platform_device * +early_platform_match(struct early_platform_driver *epdrv, int id) +{ + struct platform_device *pd; + + list_for_each_entry(pd, &early_platform_device_list, dev.devres_head) + if (platform_match(&pd->dev, &epdrv->pdrv->driver)) + if (pd->id == id) + return pd; + + return NULL; +} + +/** + * early_platform_left + * @edrv: early platform driver structure + * @id: return true if id or above exists + */ +static __init int early_platform_left(struct early_platform_driver *epdrv, + int id) +{ + struct platform_device *pd; + + list_for_each_entry(pd, &early_platform_device_list, dev.devres_head) + if (platform_match(&pd->dev, &epdrv->pdrv->driver)) + if (pd->id >= id) + return 1; + + return 0; +} + +/** + * early_platform_driver_probe_id + * @class_str: string to identify early platform driver class + * @id: id to match against + * @nr_probe: number of platform devices to successfully probe before exiting + */ +static int __init early_platform_driver_probe_id(char *class_str, + int id, + int nr_probe) +{ + struct early_platform_driver *epdrv; + struct platform_device *match; + int match_id; + int n = 0; + int left = 0; + + list_for_each_entry(epdrv, &early_platform_driver_list, list) { + /* only use drivers matching our class_str */ + if (strcmp(class_str, epdrv->class_str)) + continue; + + if (id == -2) { + match_id = epdrv->requested_id; + left = 1; + + } else { + match_id = id; + left += early_platform_left(epdrv, id); + + /* skip requested id */ + switch (epdrv->requested_id) { + case EARLY_PLATFORM_ID_ERROR: + case EARLY_PLATFORM_ID_UNSET: + break; + default: + if (epdrv->requested_id == id) + match_id = EARLY_PLATFORM_ID_UNSET; + } + } + + switch (match_id) { + case EARLY_PLATFORM_ID_ERROR: + pr_warning("%s: unable to parse %s parameter\n", + class_str, epdrv->pdrv->driver.name); + 
/* fall-through */ + case EARLY_PLATFORM_ID_UNSET: + match = NULL; + break; + default: + match = early_platform_match(epdrv, match_id); + } + + if (match) { + if (epdrv->pdrv->probe(match)) + pr_warning("%s: unable to probe %s early.\n", + class_str, match->name); + else + n++; + } + + if (n >= nr_probe) + break; + } + + if (left) + return n; + else + return -ENODEV; +} + +/** + * early_platform_driver_probe + * @class_str: string to identify early platform driver class + * @nr_probe: number of platform devices to successfully probe before exiting + * @user_only: only probe user specified early platform devices + */ +int __init early_platform_driver_probe(char *class_str, + int nr_probe, + int user_only) +{ + int k, n, i; + + n = 0; + for (i = -2; n < nr_probe; i++) { + k = early_platform_driver_probe_id(class_str, i, nr_probe - n); + + if (k < 0) + break; + + n += k; + + if (user_only) + break; + } + + return n; +} + +/** + * early_platform_cleanup - clean up early platform code + */ +void __init early_platform_cleanup(void) +{ + struct platform_device *pd, *pd2; + + /* clean up the devres list used to chain devices */ + list_for_each_entry_safe(pd, pd2, &early_platform_device_list, + dev.devres_head) { + list_del(&pd->dev.devres_head); + memset(&pd->dev.devres_head, 0, sizeof(pd->dev.devres_head)); + } +} + diff --git a/include/linux/init.h b/include/linux/init.h index 68cb0265d009..f121a7a10c3d 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -247,6 +247,7 @@ struct obs_kernel_param { /* Relies on boot_command_line being set */ void __init parse_early_param(void); +void __init parse_early_options(char *cmdline); #endif /* __ASSEMBLY__ */ /** diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h index 76e470a299bf..72736fd8223c 100644 --- a/include/linux/platform_device.h +++ b/include/linux/platform_device.h @@ -77,4 +77,46 @@ extern int platform_driver_probe(struct platform_driver *driver, #define platform_get_drvdata(_dev) dev_get_drvdata(&(_dev)->dev) #define platform_set_drvdata(_dev,data) dev_set_drvdata(&(_dev)->dev, (data)) +/* early platform driver interface */ +struct early_platform_driver { + const char *class_str; + struct platform_driver *pdrv; + struct list_head list; + int requested_id; +}; + +#define EARLY_PLATFORM_ID_UNSET -2 +#define EARLY_PLATFORM_ID_ERROR -3 + +extern int early_platform_driver_register(struct early_platform_driver *epdrv, + char *buf); +extern void early_platform_add_devices(struct platform_device **devs, int num); + +static inline int is_early_platform_device(struct platform_device *pdev) +{ + return !pdev->dev.driver; +} + +extern void early_platform_driver_register_all(char *class_str); +extern int early_platform_driver_probe(char *class_str, + int nr_probe, int user_only); +extern void early_platform_cleanup(void); + + +#ifndef MODULE +#define early_platform_init(class_string, platform_driver) \ +static __initdata struct early_platform_driver early_driver = { \ + .class_str = class_string, \ + .pdrv = platform_driver, \ + .requested_id = EARLY_PLATFORM_ID_UNSET, \ +}; \ +static int __init early_platform_driver_setup_func(char *buf) \ +{ \ + return early_platform_driver_register(&early_driver, buf); \ +} \ +early_param(class_string, early_platform_driver_setup_func) +#else /* MODULE */ +#define early_platform_init(class_string, platform_driver) +#endif /* MODULE */ + #endif /* _PLATFORM_DEVICE_H_ */ diff --git a/init/main.c b/init/main.c index 3585f073d636..3bbf93be744c 100644 --- a/init/main.c +++ 
b/init/main.c @@ -492,6 +492,11 @@ static int __init do_early_param(char *param, char *val) return 0; } +void __init parse_early_options(char *cmdline) +{ + parse_args("early options", cmdline, NULL, 0, do_early_param); +} + /* Arch code calls this early on, or if not, just before other parsing. */ void __init parse_early_param(void) { @@ -503,7 +508,7 @@ void __init parse_early_param(void) /* All fall through to do_early_param. */ strlcpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE); - parse_args("early options", tmp_cmdline, NULL, 0, do_early_param); + parse_early_options(tmp_cmdline); done = 1; } -- cgit v1.2.3 From 4ccb457966391295bd9b3644f6bdc9ddd97b6051 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 9 Apr 2009 14:48:24 -0700 Subject: dynamic debug: resurrect old pr_debug() semantics as pr_devel() pr_debug() used to produce zero code unless DEBUG was #defined. This is now no longer the case in practice[1]. There are places where it's useful to have debugging printks, but we don't want them to generate any code in production kernels. So add a new macro, pr_devel(), for _devel_opment, to provide the old semantics, ie. if the programmer doesn't explicitly enable debugging, no code is produced. [1]: You can turn CONFIG_DYNAMIC_DEBUG off, but it's enabled in at least one distro kernel, so it's not really a solution. Signed-off-by: Michael Ellerman Cc: Jason Baron Cc: Greg Banks Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- include/linux/kernel.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index d9e75ec7def5..883cd44ff765 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -377,6 +377,15 @@ static inline char *pack_hex_byte(char *buf, u8 byte) #define pr_cont(fmt, ...) \ printk(KERN_CONT fmt, ##__VA_ARGS__) +/* pr_devel() should produce zero code unless DEBUG is defined */ +#ifdef DEBUG +#define pr_devel(fmt, ...) \ + printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) +#else +#define pr_devel(fmt, ...) \ + ({ if (0) printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); 0; }) +#endif + /* If you are writing a driver, please use dev_dbg instead */ #if defined(DEBUG) #define pr_debug(fmt, ...) \ -- cgit v1.2.3 From 7607b1d673469d5b5dce4c9b6779d165e03c8ff5 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Wed, 8 Apr 2009 12:12:52 -0400 Subject: Driver core: remove pr_fmt() from dynamic_dev_dbg() printk When pr_fmt() was added to the pr_debug() code, we added it not only to the dynamic_pr_debug() function, but also to the dynamic_dev_dbg() funciton. However, dev_dbg() doesn't make use of pr_fmt(), so neither should dynamic_dev_dbg(). 
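To make the distinction concrete, here is a minimal illustrative sketch (a hypothetical "foo" driver, not code from either patch above): pr_debug() and the new pr_devel() pick up the translation unit's pr_fmt() prefix, while dev_dbg() builds its prefix from the driver and device names, which is why the dynamic dev_dbg() variant should not apply pr_fmt().

#define pr_fmt(fmt) "foo: " fmt	/* by convention, defined before the includes */

#include <linux/kernel.h>
#include <linux/device.h>

static void foo_report_state(struct device *dev, int state)
{
	/* when enabled (DEBUG or dynamic debug), prints "foo: state <n>" */
	pr_debug("state %d\n", state);

	/* same prefix, but generates no code at all unless DEBUG is defined */
	pr_devel("raw state word %#x\n", state);

	/* prefix comes from the driver and device names, not from pr_fmt() */
	dev_dbg(dev, "state %d\n", state);
}
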
Signed-off-by: Jason Baron Signed-off-by: Greg Kroah-Hartman --- include/linux/dynamic_debug.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h index baabf33be244..a0d9422a1569 100644 --- a/include/linux/dynamic_debug.h +++ b/include/linux/dynamic_debug.h @@ -70,7 +70,7 @@ extern int ddebug_remove_module(char *mod_name); DEBUG_HASH2, __LINE__, _DPRINTK_FLAGS_DEFAULT }; \ if (__dynamic_dbg_enabled(descriptor)) \ dev_printk(KERN_DEBUG, dev, \ - KBUILD_MODNAME ": " pr_fmt(fmt),\ + KBUILD_MODNAME ": " fmt, \ ##__VA_ARGS__); \ } while (0) -- cgit v1.2.3 From 3444b26afa145148951112534f298bdc554ec789 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Wed, 8 Apr 2009 17:36:28 +0000 Subject: USB: add reset endpoint operations Wireless USB endpoint state has a sequence number and a current window and not just a single toggle bit. So allow HCDs to provide a endpoint_reset method and call this or clear the software toggles as required (after a clear halt, set configuration etc.). usb_settoggle() and friends are then HCD internal and are moved into core/hcd.h and all device drivers call usb_reset_endpoint() instead. If the device endpoint state has been reset (with a clear halt) but the host endpoint state has not then subsequent data transfers will not complete. The device will only work again after it is reset or disconnected. Signed-off-by: David Vrabel Signed-off-by: Greg Kroah-Hartman --- drivers/block/ub.c | 20 +++++------ drivers/isdn/hisax/st5481_usb.c | 9 +---- drivers/media/video/pvrusb2/pvrusb2-hdw.c | 1 - drivers/usb/core/devio.c | 2 +- drivers/usb/core/hcd.c | 26 ++++++++++++++ drivers/usb/core/hcd.h | 14 ++++++++ drivers/usb/core/message.c | 58 ++++++++++++++++++++----------- drivers/usb/core/usb.c | 2 +- drivers/usb/storage/transport.c | 4 +-- include/linux/usb.h | 9 +---- 10 files changed, 91 insertions(+), 54 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/ub.c b/drivers/block/ub.c index 69b7f8e77596..689cd27ac890 100644 --- a/drivers/block/ub.c +++ b/drivers/block/ub.c @@ -1025,6 +1025,7 @@ static void ub_scsi_urb_compl(struct ub_dev *sc, struct ub_scsi_cmd *cmd) { struct urb *urb = &sc->work_urb; struct bulk_cs_wrap *bcs; + int endp; int len; int rc; @@ -1033,6 +1034,10 @@ static void ub_scsi_urb_compl(struct ub_dev *sc, struct ub_scsi_cmd *cmd) return; } + endp = usb_pipeendpoint(sc->last_pipe); + if (usb_pipein(sc->last_pipe)) + endp |= USB_DIR_IN; + if (cmd->state == UB_CMDST_CLEAR) { if (urb->status == -EPIPE) { /* @@ -1048,9 +1053,7 @@ static void ub_scsi_urb_compl(struct ub_dev *sc, struct ub_scsi_cmd *cmd) * We ignore the result for the halt clear. */ - /* reset the endpoint toggle */ - usb_settoggle(sc->dev, usb_pipeendpoint(sc->last_pipe), - usb_pipeout(sc->last_pipe), 0); + usb_reset_endpoint(sc->dev, endp); ub_state_sense(sc, cmd); @@ -1065,9 +1068,7 @@ static void ub_scsi_urb_compl(struct ub_dev *sc, struct ub_scsi_cmd *cmd) * We ignore the result for the halt clear. */ - /* reset the endpoint toggle */ - usb_settoggle(sc->dev, usb_pipeendpoint(sc->last_pipe), - usb_pipeout(sc->last_pipe), 0); + usb_reset_endpoint(sc->dev, endp); ub_state_stat(sc, cmd); @@ -1082,9 +1083,7 @@ static void ub_scsi_urb_compl(struct ub_dev *sc, struct ub_scsi_cmd *cmd) * We ignore the result for the halt clear. 
*/ - /* reset the endpoint toggle */ - usb_settoggle(sc->dev, usb_pipeendpoint(sc->last_pipe), - usb_pipeout(sc->last_pipe), 0); + usb_reset_endpoint(sc->dev, endp); ub_state_stat_counted(sc, cmd); @@ -2119,8 +2118,7 @@ static int ub_probe_clear_stall(struct ub_dev *sc, int stalled_pipe) del_timer_sync(&timer); usb_kill_urb(&sc->work_urb); - /* reset the endpoint toggle */ - usb_settoggle(sc->dev, endp, usb_pipeout(sc->last_pipe), 0); + usb_reset_endpoint(sc->dev, endp); return 0; } diff --git a/drivers/isdn/hisax/st5481_usb.c b/drivers/isdn/hisax/st5481_usb.c index ec3c0e507669..2b3a055059ea 100644 --- a/drivers/isdn/hisax/st5481_usb.c +++ b/drivers/isdn/hisax/st5481_usb.c @@ -149,14 +149,7 @@ static void usb_ctrl_complete(struct urb *urb) if (ctrl_msg->dr.bRequest == USB_REQ_CLEAR_FEATURE) { /* Special case handling for pipe reset */ le16_to_cpus(&ctrl_msg->dr.wIndex); - - /* toggle is reset on clear */ - usb_settoggle(adapter->usb_dev, - ctrl_msg->dr.wIndex & ~USB_DIR_IN, - (ctrl_msg->dr.wIndex & USB_DIR_IN) == 0, - 0); - - + usb_reset_endpoint(adapter->usb_dev, ctrl_msg->dr.wIndex); } if (ctrl_msg->complete) diff --git a/drivers/media/video/pvrusb2/pvrusb2-hdw.c b/drivers/media/video/pvrusb2/pvrusb2-hdw.c index d9d974a8f52a..add3395d3248 100644 --- a/drivers/media/video/pvrusb2/pvrusb2-hdw.c +++ b/drivers/media/video/pvrusb2/pvrusb2-hdw.c @@ -1461,7 +1461,6 @@ static int pvr2_upload_firmware1(struct pvr2_hdw *hdw) return ret; } - usb_settoggle(hdw->usb_dev, 0 & 0xf, !(0 & USB_DIR_IN), 0); usb_clear_halt(hdw->usb_dev, usb_sndbulkpipe(hdw->usb_dev, 0 & 0x7f)); pipe = usb_sndctrlpipe(hdw->usb_dev, 0); diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c index df3c539f652a..308609039c73 100644 --- a/drivers/usb/core/devio.c +++ b/drivers/usb/core/devio.c @@ -841,7 +841,7 @@ static int proc_resetep(struct dev_state *ps, void __user *arg) ret = checkintf(ps, ret); if (ret) return ret; - usb_settoggle(ps->dev, ep & 0xf, !(ep & USB_DIR_IN), 0); + usb_reset_endpoint(ps->dev, ep); return 0; } diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c index 81fa8506825d..42b93da1085d 100644 --- a/drivers/usb/core/hcd.c +++ b/drivers/usb/core/hcd.c @@ -1539,6 +1539,32 @@ void usb_hcd_disable_endpoint(struct usb_device *udev, hcd->driver->endpoint_disable(hcd, ep); } +/** + * usb_hcd_reset_endpoint - reset host endpoint state + * @udev: USB device. + * @ep: the endpoint to reset. + * + * Resets any host endpoint state such as the toggle bit, sequence + * number and current window. + */ +void usb_hcd_reset_endpoint(struct usb_device *udev, + struct usb_host_endpoint *ep) +{ + struct usb_hcd *hcd = bus_to_hcd(udev->bus); + + if (hcd->driver->endpoint_reset) + hcd->driver->endpoint_reset(hcd, ep); + else { + int epnum = usb_endpoint_num(&ep->desc); + int is_out = usb_endpoint_dir_out(&ep->desc); + int is_control = usb_endpoint_xfer_control(&ep->desc); + + usb_settoggle(udev, epnum, is_out, 0); + if (is_control) + usb_settoggle(udev, epnum, !is_out, 0); + } +} + /* Protect against drivers that try to unlink URBs after the device * is gone, by waiting until all unlinks for @udev are finished. 
* Since we don't currently track URBs by device, simply wait until diff --git a/drivers/usb/core/hcd.h b/drivers/usb/core/hcd.h index f750eb1ab595..e7d4479de41c 100644 --- a/drivers/usb/core/hcd.h +++ b/drivers/usb/core/hcd.h @@ -206,6 +206,11 @@ struct hc_driver { void (*endpoint_disable)(struct usb_hcd *hcd, struct usb_host_endpoint *ep); + /* (optional) reset any endpoint state such as sequence number + and current window */ + void (*endpoint_reset)(struct usb_hcd *hcd, + struct usb_host_endpoint *ep); + /* root hub support */ int (*hub_status_data) (struct usb_hcd *hcd, char *buf); int (*hub_control) (struct usb_hcd *hcd, @@ -234,6 +239,8 @@ extern void usb_hcd_flush_endpoint(struct usb_device *udev, struct usb_host_endpoint *ep); extern void usb_hcd_disable_endpoint(struct usb_device *udev, struct usb_host_endpoint *ep); +extern void usb_hcd_reset_endpoint(struct usb_device *udev, + struct usb_host_endpoint *ep); extern void usb_hcd_synchronize_unlinks(struct usb_device *udev); extern int usb_hcd_get_frame_number(struct usb_device *udev); @@ -279,6 +286,13 @@ extern irqreturn_t usb_hcd_irq(int irq, void *__hcd); extern void usb_hc_died(struct usb_hcd *hcd); extern void usb_hcd_poll_rh_status(struct usb_hcd *hcd); +/* The D0/D1 toggle bits ... USE WITH CAUTION (they're almost hcd-internal) */ +#define usb_gettoggle(dev, ep, out) (((dev)->toggle[out] >> (ep)) & 1) +#define usb_dotoggle(dev, ep, out) ((dev)->toggle[out] ^= (1 << (ep))) +#define usb_settoggle(dev, ep, out, bit) \ + ((dev)->toggle[out] = ((dev)->toggle[out] & ~(1 << (ep))) | \ + ((bit) << (ep))) + /* -------------------------------------------------------------------------- */ /* Enumeration is only for the hub driver, or HCD virtual root hubs */ diff --git a/drivers/usb/core/message.c b/drivers/usb/core/message.c index 30a0690f3683..b62628377654 100644 --- a/drivers/usb/core/message.c +++ b/drivers/usb/core/message.c @@ -1002,8 +1002,7 @@ int usb_clear_halt(struct usb_device *dev, int pipe) * the copy in usb-storage, for as long as we need two copies. */ - /* toggle was reset by the clear */ - usb_settoggle(dev, usb_pipeendpoint(pipe), usb_pipeout(pipe), 0); + usb_reset_endpoint(dev, endp); return 0; } @@ -1075,6 +1074,30 @@ void usb_disable_endpoint(struct usb_device *dev, unsigned int epaddr, } } +/** + * usb_reset_endpoint - Reset an endpoint's state. + * @dev: the device whose endpoint is to be reset + * @epaddr: the endpoint's address. Endpoint number for output, + * endpoint number + USB_DIR_IN for input + * + * Resets any host-side endpoint state such as the toggle bit, + * sequence number or current window. 
+ */ +void usb_reset_endpoint(struct usb_device *dev, unsigned int epaddr) +{ + unsigned int epnum = epaddr & USB_ENDPOINT_NUMBER_MASK; + struct usb_host_endpoint *ep; + + if (usb_endpoint_out(epaddr)) + ep = dev->ep_out[epnum]; + else + ep = dev->ep_in[epnum]; + if (ep) + usb_hcd_reset_endpoint(dev, ep); +} +EXPORT_SYMBOL_GPL(usb_reset_endpoint); + + /** * usb_disable_interface -- Disable all endpoints for an interface * @dev: the device whose interface is being disabled @@ -1117,7 +1140,6 @@ void usb_disable_device(struct usb_device *dev, int skip_ep0) usb_disable_endpoint(dev, i, true); usb_disable_endpoint(dev, i + USB_DIR_IN, true); } - dev->toggle[0] = dev->toggle[1] = 0; /* getting rid of interfaces will disconnect * any drivers bound to them (a key side effect) @@ -1154,28 +1176,24 @@ void usb_disable_device(struct usb_device *dev, int skip_ep0) * usb_enable_endpoint - Enable an endpoint for USB communications * @dev: the device whose interface is being enabled * @ep: the endpoint - * @reset_toggle: flag to set the endpoint's toggle back to 0 + * @reset_ep: flag to reset the endpoint state * - * Resets the endpoint toggle if asked, and sets dev->ep_{in,out} pointers. + * Resets the endpoint state if asked, and sets dev->ep_{in,out} pointers. * For control endpoints, both the input and output sides are handled. */ void usb_enable_endpoint(struct usb_device *dev, struct usb_host_endpoint *ep, - bool reset_toggle) + bool reset_ep) { int epnum = usb_endpoint_num(&ep->desc); int is_out = usb_endpoint_dir_out(&ep->desc); int is_control = usb_endpoint_xfer_control(&ep->desc); - if (is_out || is_control) { - if (reset_toggle) - usb_settoggle(dev, epnum, 1, 0); + if (reset_ep) + usb_hcd_reset_endpoint(dev, ep); + if (is_out || is_control) dev->ep_out[epnum] = ep; - } - if (!is_out || is_control) { - if (reset_toggle) - usb_settoggle(dev, epnum, 0, 0); + if (!is_out || is_control) dev->ep_in[epnum] = ep; - } ep->enabled = 1; } @@ -1183,18 +1201,18 @@ void usb_enable_endpoint(struct usb_device *dev, struct usb_host_endpoint *ep, * usb_enable_interface - Enable all the endpoints for an interface * @dev: the device whose interface is being enabled * @intf: pointer to the interface descriptor - * @reset_toggles: flag to set the endpoints' toggles back to 0 + * @reset_eps: flag to reset the endpoints' state * * Enables all the endpoints for the interface's current altsetting. */ void usb_enable_interface(struct usb_device *dev, - struct usb_interface *intf, bool reset_toggles) + struct usb_interface *intf, bool reset_eps) { struct usb_host_interface *alt = intf->cur_altsetting; int i; for (i = 0; i < alt->desc.bNumEndpoints; ++i) - usb_enable_endpoint(dev, &alt->endpoint[i], reset_toggles); + usb_enable_endpoint(dev, &alt->endpoint[i], reset_eps); } /** @@ -1335,7 +1353,7 @@ EXPORT_SYMBOL_GPL(usb_set_interface); * This issues a standard SET_CONFIGURATION request to the device using * the current configuration. The effect is to reset most USB-related * state in the device, including interface altsettings (reset to zero), - * endpoint halts (cleared), and data toggle (only for bulk and interrupt + * endpoint halts (cleared), and endpoint state (only for bulk and interrupt * endpoints). Other usbcore state is unchanged, including bindings of * usb device drivers to interfaces. * @@ -1343,7 +1361,7 @@ EXPORT_SYMBOL_GPL(usb_set_interface); * (multi-interface) devices. Instead, the driver for each interface may * use usb_set_interface() on the interfaces it claims. 
Be careful though; * some devices don't support the SET_INTERFACE request, and others won't - * reset all the interface state (notably data toggles). Resetting the whole + * reset all the interface state (notably endpoint state). Resetting the whole * configuration would affect other drivers' interfaces. * * The caller must own the device lock. @@ -1376,8 +1394,6 @@ int usb_reset_configuration(struct usb_device *dev) if (retval < 0) return retval; - dev->toggle[0] = dev->toggle[1] = 0; - /* re-init hc/hcd interface/endpoint state */ for (i = 0; i < config->desc.bNumInterfaces; i++) { struct usb_interface *intf = config->interface[i]; diff --git a/drivers/usb/core/usb.c b/drivers/usb/core/usb.c index dcfc072630c1..7eee400d3e32 100644 --- a/drivers/usb/core/usb.c +++ b/drivers/usb/core/usb.c @@ -362,7 +362,7 @@ struct usb_device *usb_alloc_dev(struct usb_device *parent, dev->ep0.desc.bLength = USB_DT_ENDPOINT_SIZE; dev->ep0.desc.bDescriptorType = USB_DT_ENDPOINT; /* ep0 maxpacket comes later, from device descriptor */ - usb_enable_endpoint(dev, &dev->ep0, true); + usb_enable_endpoint(dev, &dev->ep0, false); dev->can_submit = 1; /* Save readable and stable topology id, distinguishing devices diff --git a/drivers/usb/storage/transport.c b/drivers/usb/storage/transport.c index 49aedb36dc19..fcb320217218 100644 --- a/drivers/usb/storage/transport.c +++ b/drivers/usb/storage/transport.c @@ -247,10 +247,8 @@ int usb_stor_clear_halt(struct us_data *us, unsigned int pipe) USB_ENDPOINT_HALT, endp, NULL, 0, 3*HZ); - /* reset the endpoint toggle */ if (result >= 0) - usb_settoggle(us->pusb_dev, usb_pipeendpoint(pipe), - usb_pipeout(pipe), 0); + usb_reset_endpoint(us->pusb_dev, endp); US_DEBUGP("%s: result = %d\n", __func__, result); return result; diff --git a/include/linux/usb.h b/include/linux/usb.h index c6b2ab41b908..3aa2cd1f8d08 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -1387,6 +1387,7 @@ extern int usb_string(struct usb_device *dev, int index, extern int usb_clear_halt(struct usb_device *dev, int pipe); extern int usb_reset_configuration(struct usb_device *dev); extern int usb_set_interface(struct usb_device *dev, int ifnum, int alternate); +extern void usb_reset_endpoint(struct usb_device *dev, unsigned int epaddr); /* this request isn't really synchronous, but it belongs with the others */ extern int usb_driver_set_configuration(struct usb_device *udev, int config); @@ -1491,14 +1492,6 @@ void usb_sg_wait(struct usb_sg_request *io); #define usb_pipecontrol(pipe) (usb_pipetype((pipe)) == PIPE_CONTROL) #define usb_pipebulk(pipe) (usb_pipetype((pipe)) == PIPE_BULK) -/* The D0/D1 toggle bits ... USE WITH CAUTION (they're almost hcd-internal) */ -#define usb_gettoggle(dev, ep, out) (((dev)->toggle[out] >> (ep)) & 1) -#define usb_dotoggle(dev, ep, out) ((dev)->toggle[out] ^= (1 << (ep))) -#define usb_settoggle(dev, ep, out, bit) \ - ((dev)->toggle[out] = ((dev)->toggle[out] & ~(1 << (ep))) | \ - ((bit) << (ep))) - - static inline unsigned int __create_pipe(struct usb_device *dev, unsigned int endpoint) { -- cgit v1.2.3 From 42a17ad2762f465d291c3bc0b6ed2b3738f65481 Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Sat, 18 Apr 2009 11:30:56 +0200 Subject: needs to include . uses EINVAL so should include . This fixes a build error on 64-bit MIPS if CONFIG_SECCOMP is disabled. 
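This adds #include <linux/errno.h> to <linux/seccomp.h>, whose CONFIG_SECCOMP=n stubs return -EINVAL, so the header has to provide the errno definitions itself. A hedged illustration of the failure mode (the calling function is made up):

#include <linux/seccomp.h>	/* now pulls in <linux/errno.h> on its own */

long foo_seccomp_mode(unsigned long arg)
{
	/*
	 * With CONFIG_SECCOMP disabled this stub expands to -EINVAL;
	 * before the fix, an includer that had not already picked up the
	 * errno definitions through another header failed to build.
	 */
	return prctl_set_seccomp(arg);
}
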
Signed-off-by: Ralf Baechle Signed-off-by: Linus Torvalds --- include/linux/seccomp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index 262a8dccfa81..167c33361d9c 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -21,6 +21,8 @@ extern long prctl_set_seccomp(unsigned long); #else /* CONFIG_SECCOMP */ +#include + typedef struct { } seccomp_t; #define secure_computing(x) do { } while (0) -- cgit v1.2.3 From 6a7c7eaf71b636f197d73b381a2ab729ebdcfb2e Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 19 Apr 2009 20:08:42 +0200 Subject: PM/Suspend: Introduce two new platform callbacks to avoid breakage Commit 900af0d973856d6feb6fc088c2d0d3fde57707d3 (PM: Change suspend code ordering) changed the ordering of suspend code in such a way that the platform .prepare() callback is now executed after the device drivers' late suspend callbacks have run. Unfortunately, this turns out to break ARM platforms that need to talk via I2C to power control devices during the .prepare() callback. For this reason introduce two new platform suspend callbacks, .prepare_late() and .wake(), that will be called just prior to disabling non-boot CPUs and right after bringing them back on line, respectively, and use them instead of .prepare() and .finish() for ACPI suspend. Make the PM core execute the .prepare() and .finish() platform suspend callbacks where they were executed previously (that is, right after calling the regular suspend methods provided by device drivers and right before executing their regular resume methods, respectively). It is not necessary to make analogous changes to the hibernation code and data structures at the moment, because they are only used by ACPI platforms. Signed-off-by: Rafael J. Wysocki Reported-by: Russell King Acked-by: Len Brown --- drivers/acpi/sleep.c | 8 ++++---- include/linux/suspend.h | 36 ++++++++++++++++++++++++++---------- kernel/power/main.c | 24 +++++++++++++++++------- 3 files changed, 47 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index 779e4e500df4..d060e6fd7fd5 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -300,9 +300,9 @@ static int acpi_suspend_state_valid(suspend_state_t pm_state) static struct platform_suspend_ops acpi_suspend_ops = { .valid = acpi_suspend_state_valid, .begin = acpi_suspend_begin, - .prepare = acpi_pm_prepare, + .prepare_late = acpi_pm_prepare, .enter = acpi_suspend_enter, - .finish = acpi_pm_finish, + .wake = acpi_pm_finish, .end = acpi_pm_end, }; @@ -328,9 +328,9 @@ static int acpi_suspend_begin_old(suspend_state_t pm_state) static struct platform_suspend_ops acpi_suspend_ops_old = { .valid = acpi_suspend_state_valid, .begin = acpi_suspend_begin_old, - .prepare = acpi_pm_disable_gpes, + .prepare_late = acpi_pm_disable_gpes, .enter = acpi_suspend_enter, - .finish = acpi_pm_finish, + .wake = acpi_pm_finish, .end = acpi_pm_end, .recover = acpi_pm_finish, }; diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 3e3a4364cbff..795032edfc46 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -58,10 +58,17 @@ typedef int __bitwise suspend_state_t; * by @begin(). * @prepare() is called right after devices have been suspended (ie. the * appropriate .suspend() method has been executed for each device) and - * before the nonboot CPUs are disabled (it is executed with IRQs enabled). - * This callback is optional. 
It returns 0 on success or a negative - * error code otherwise, in which case the system cannot enter the desired - * sleep state (@enter() and @finish() will not be called in that case). + * before device drivers' late suspend callbacks are executed. It returns + * 0 on success or a negative error code otherwise, in which case the + * system cannot enter the desired sleep state (@prepare_late(), @enter(), + * @wake(), and @finish() will not be called in that case). + * + * @prepare_late: Finish preparing the platform for entering the system sleep + * state indicated by @begin(). + * @prepare_late is called before disabling nonboot CPUs and after + * device drivers' late suspend callbacks have been executed. It returns + * 0 on success or a negative error code otherwise, in which case the + * system cannot enter the desired sleep state (@enter() and @wake()). * * @enter: Enter the system sleep state indicated by @begin() or represented by * the argument if @begin() is not implemented. @@ -69,19 +76,26 @@ typedef int __bitwise suspend_state_t; * error code otherwise, in which case the system cannot enter the desired * sleep state. * - * @finish: Called when the system has just left a sleep state, right after - * the nonboot CPUs have been enabled and before devices are resumed (it is - * executed with IRQs enabled). + * @wake: Called when the system has just left a sleep state, right after + * the nonboot CPUs have been enabled and before device drivers' early + * resume callbacks are executed. + * This callback is optional, but should be implemented by the platforms + * that implement @prepare_late(). If implemented, it is always called + * after @enter(), even if @enter() fails. + * + * @finish: Finish wake-up of the platform. + * @finish is called right prior to calling device drivers' regular suspend + * callbacks. * This callback is optional, but should be implemented by the platforms * that implement @prepare(). If implemented, it is always called after - * @enter() (even if @enter() fails). + * @enter() and @wake(), if implemented, even if any of them fails. * * @end: Called by the PM core right after resuming devices, to indicate to * the platform that the system has returned to the working state or * the transition to the sleep state has been aborted. * This callback is optional, but should be implemented by the platforms - * that implement @begin(), but platforms implementing @begin() should - * also provide a @end() which cleans up transitions aborted before + * that implement @begin(). Accordingly, platforms implementing @begin() + * should also provide a @end() which cleans up transitions aborted before * @enter(). * * @recover: Recover the platform from a suspend failure. 
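To ground the new ordering, here is a hedged sketch (a hypothetical board, not taken from this patch) of a platform that needs I2C access while suspending: .prepare() still runs with regular device suspend complete and IRQs on, .prepare_late() runs after the drivers' late suspend callbacks just before non-boot CPUs are disabled, and .wake()/.finish() mirror them on the way back up.

#include <linux/suspend.h>
#include <linux/errno.h>

static int board_prepare(void)
{
	/* devices suspended, IRQs enabled: e.g. arm the PMIC over I2C here */
	return 0;
}

static int board_prepare_late(void)
{
	/* late suspend done, non-boot CPUs about to go offline */
	return 0;
}

static int board_enter(suspend_state_t state)
{
	return (state == PM_SUSPEND_MEM) ? 0 : -EINVAL;
}

static void board_wake(void)
{
	/* non-boot CPUs are back online, early resume has not run yet */
}

static void board_finish(void)
{
	/* runs right before the drivers' regular resume callbacks */
}

static struct platform_suspend_ops board_suspend_ops = {
	.valid		= suspend_valid_only_mem,
	.prepare	= board_prepare,
	.prepare_late	= board_prepare_late,
	.enter		= board_enter,
	.wake		= board_wake,
	.finish		= board_finish,
};

/* machine init code would register this with suspend_set_ops(&board_suspend_ops) */
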
@@ -93,7 +107,9 @@ struct platform_suspend_ops { int (*valid)(suspend_state_t state); int (*begin)(suspend_state_t state); int (*prepare)(void); + int (*prepare_late)(void); int (*enter)(suspend_state_t state); + void (*wake)(void); void (*finish)(void); void (*end)(void); void (*recover)(void); diff --git a/kernel/power/main.c b/kernel/power/main.c index f172f41858bb..f99ed6a75eac 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -291,20 +291,26 @@ static int suspend_enter(suspend_state_t state) device_pm_lock(); + if (suspend_ops->prepare) { + error = suspend_ops->prepare(); + if (error) + goto Done; + } + error = device_power_down(PMSG_SUSPEND); if (error) { printk(KERN_ERR "PM: Some devices failed to power down\n"); - goto Done; + goto Platfrom_finish; } - if (suspend_ops->prepare) { - error = suspend_ops->prepare(); + if (suspend_ops->prepare_late) { + error = suspend_ops->prepare_late(); if (error) goto Power_up_devices; } if (suspend_test(TEST_PLATFORM)) - goto Platfrom_finish; + goto Platform_wake; error = disable_nonboot_cpus(); if (error || suspend_test(TEST_CPUS)) @@ -326,13 +332,17 @@ static int suspend_enter(suspend_state_t state) Enable_cpus: enable_nonboot_cpus(); - Platfrom_finish: - if (suspend_ops->finish) - suspend_ops->finish(); + Platform_wake: + if (suspend_ops->wake) + suspend_ops->wake(); Power_up_devices: device_power_up(PMSG_RESUME); + Platfrom_finish: + if (suspend_ops->finish) + suspend_ops->finish(); + Done: device_pm_unlock(); -- cgit v1.2.3 From 0112fc2229847feb6c4eb011e6833d8f1742a375 Mon Sep 17 00:00:00 2001 From: Oleg Drokin Date: Wed, 8 Apr 2009 20:05:42 +0400 Subject: Separate out common fstatat code into vfs_fstatat This is a version incorporating Christoph's suggestion. Separate out common *fstatat functionality into a single function instead of duplicating it all over the code. 
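The per-architecture wrappers then all collapse to the same shape; a hedged sketch of the resulting pattern (the syscall name and the ABI copy-out helper are placeholders and differ per architecture):

#include <linux/linkage.h>
#include <linux/fs.h>
#include <linux/stat.h>

/* placeholder for the arch's ABI-specific copy-out helper */
int cp_foo_stat64(struct kstat *stat, struct stat64 __user *statbuf);

asmlinkage long sys_foo_fstatat64(int dfd, char __user *filename,
				  struct stat64 __user *statbuf, int flag)
{
	struct kstat stat;
	int error;

	/* common flag checking and path lookup live in vfs_fstatat() ... */
	error = vfs_fstatat(dfd, filename, &stat, flag);
	if (error)
		return error;

	/* ... only the ABI-specific copy-out remains per architecture */
	return cp_foo_stat64(&stat, statbuf);
}
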
Signed-off-by: Oleg Drokin Signed-off-by: Al Viro --- arch/arm/kernel/sys_oabi-compat.c | 19 ++++--------- arch/s390/kernel/compat_linux.c | 18 ++++--------- arch/sparc/kernel/sys_sparc32.c | 19 ++++--------- arch/x86/ia32/sys_ia32.c | 19 ++++--------- fs/compat.c | 19 ++++--------- fs/stat.c | 56 +++++++++++++++++++-------------------- include/linux/fs.h | 1 + 7 files changed, 54 insertions(+), 97 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/kernel/sys_oabi-compat.c b/arch/arm/kernel/sys_oabi-compat.c index e04173c7e621..d59a0cd537f0 100644 --- a/arch/arm/kernel/sys_oabi-compat.c +++ b/arch/arm/kernel/sys_oabi-compat.c @@ -177,21 +177,12 @@ asmlinkage long sys_oabi_fstatat64(int dfd, int flag) { struct kstat stat; - int error = -EINVAL; + int error; - if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0) - goto out; - - if (flag & AT_SYMLINK_NOFOLLOW) - error = vfs_lstat_fd(dfd, filename, &stat); - else - error = vfs_stat_fd(dfd, filename, &stat); - - if (!error) - error = cp_oldabi_stat64(&stat, statbuf); - -out: - return error; + error = vfs_fstatat(dfd, filename, &stat, flag); + if (error) + return error; + return cp_oldabi_stat64(&stat, statbuf); } struct oabi_flock64 { diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c index 6cc87d8c8682..002c70d3cb75 100644 --- a/arch/s390/kernel/compat_linux.c +++ b/arch/s390/kernel/compat_linux.c @@ -702,20 +702,12 @@ asmlinkage long sys32_fstatat64(unsigned int dfd, char __user *filename, struct stat64_emu31 __user* statbuf, int flag) { struct kstat stat; - int error = -EINVAL; - - if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0) - goto out; - - if (flag & AT_SYMLINK_NOFOLLOW) - error = vfs_lstat_fd(dfd, filename, &stat); - else - error = vfs_stat_fd(dfd, filename, &stat); + int error; - if (!error) - error = cp_stat64(statbuf, &stat); -out: - return error; + error = vfs_fstatat(dfd, filename, &stat, flag); + if (error) + return error; + return cp_stat64(statbuf, &stat); } /* diff --git a/arch/sparc/kernel/sys_sparc32.c b/arch/sparc/kernel/sys_sparc32.c index e800503879e4..f5000a460c05 100644 --- a/arch/sparc/kernel/sys_sparc32.c +++ b/arch/sparc/kernel/sys_sparc32.c @@ -206,21 +206,12 @@ asmlinkage long compat_sys_fstatat64(unsigned int dfd, char __user *filename, struct compat_stat64 __user * statbuf, int flag) { struct kstat stat; - int error = -EINVAL; - - if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0) - goto out; - - if (flag & AT_SYMLINK_NOFOLLOW) - error = vfs_lstat_fd(dfd, filename, &stat); - else - error = vfs_stat_fd(dfd, filename, &stat); - - if (!error) - error = cp_compat_stat64(&stat, statbuf); + int error; -out: - return error; + error = vfs_fstatat(dfd, filename, &stat, flag); + if (error) + return error; + return cp_compat_stat64(&stat, statbuf); } asmlinkage long compat_sys_sysfs(int option, u32 arg1, u32 arg2) diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index efac92fd1efb..085a8c35f149 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -129,21 +129,12 @@ asmlinkage long sys32_fstatat(unsigned int dfd, char __user *filename, struct stat64 __user *statbuf, int flag) { struct kstat stat; - int error = -EINVAL; + int error; - if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0) - goto out; - - if (flag & AT_SYMLINK_NOFOLLOW) - error = vfs_lstat_fd(dfd, filename, &stat); - else - error = vfs_stat_fd(dfd, filename, &stat); - - if (!error) - error = cp_stat64(statbuf, &stat); - -out: - return error; + error = vfs_fstatat(dfd, filename, &stat, flag); + if (error) + return error; + return 
cp_stat64(statbuf, &stat); } /* diff --git a/fs/compat.c b/fs/compat.c index 3f84d5f15889..dda72e267092 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -204,21 +204,12 @@ asmlinkage long compat_sys_newfstatat(unsigned int dfd, char __user *filename, struct compat_stat __user *statbuf, int flag) { struct kstat stat; - int error = -EINVAL; - - if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0) - goto out; - - if (flag & AT_SYMLINK_NOFOLLOW) - error = vfs_lstat_fd(dfd, filename, &stat); - else - error = vfs_stat_fd(dfd, filename, &stat); - - if (!error) - error = cp_compat_stat(&stat, statbuf); + int error; -out: - return error; + error = vfs_fstatat(dfd, filename, &stat, flag); + if (error) + return error; + return cp_compat_stat(&stat, statbuf); } #endif diff --git a/fs/stat.c b/fs/stat.c index 2db740a0cfb5..54711662b855 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -109,6 +109,24 @@ int vfs_fstat(unsigned int fd, struct kstat *stat) EXPORT_SYMBOL(vfs_fstat); +int vfs_fstatat(int dfd, char __user *filename, struct kstat *stat, int flag) +{ + int error = -EINVAL; + + if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0) + goto out; + + if (flag & AT_SYMLINK_NOFOLLOW) + error = vfs_lstat_fd(dfd, filename, stat); + else + error = vfs_stat_fd(dfd, filename, stat); +out: + return error; +} + +EXPORT_SYMBOL(vfs_fstatat); + + #ifdef __ARCH_WANT_OLD_STAT /* @@ -264,21 +282,12 @@ SYSCALL_DEFINE4(newfstatat, int, dfd, char __user *, filename, struct stat __user *, statbuf, int, flag) { struct kstat stat; - int error = -EINVAL; - - if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0) - goto out; - - if (flag & AT_SYMLINK_NOFOLLOW) - error = vfs_lstat_fd(dfd, filename, &stat); - else - error = vfs_stat_fd(dfd, filename, &stat); - - if (!error) - error = cp_new_stat(&stat, statbuf); + int error; -out: - return error; + error = vfs_fstatat(dfd, filename, &stat, flag); + if (error) + return error; + return cp_new_stat(&stat, statbuf); } #endif @@ -404,21 +413,12 @@ SYSCALL_DEFINE4(fstatat64, int, dfd, char __user *, filename, struct stat64 __user *, statbuf, int, flag) { struct kstat stat; - int error = -EINVAL; - - if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0) - goto out; - - if (flag & AT_SYMLINK_NOFOLLOW) - error = vfs_lstat_fd(dfd, filename, &stat); - else - error = vfs_stat_fd(dfd, filename, &stat); - - if (!error) - error = cp_new_stat64(&stat, statbuf); + int error; -out: - return error; + error = vfs_fstatat(dfd, filename, &stat, flag); + if (error) + return error; + return cp_new_stat64(&stat, statbuf); } #endif /* __ARCH_WANT_STAT64 */ diff --git a/include/linux/fs.h b/include/linux/fs.h index e766be0d4329..257f4d37ad23 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2302,6 +2302,7 @@ extern int vfs_lstat(char __user *, struct kstat *); extern int vfs_stat_fd(int dfd, char __user *, struct kstat *); extern int vfs_lstat_fd(int dfd, char __user *, struct kstat *); extern int vfs_fstat(unsigned int, struct kstat *); +extern int vfs_fstatat(int , char __user *, struct kstat *, int); extern int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, unsigned long arg); -- cgit v1.2.3 From 2eae7a1874ca5be3232765d89e0250a449f1bc90 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 8 Apr 2009 16:34:03 -0400 Subject: kill vfs_stat_fd / vfs_lstat_fd There's really no reason to keep vfs_stat_fd and vfs_lstat_fd with Oleg's vfs_fstatat. Use vfs_fstatat for the few cases having the directory fd, and switch all others to vfs_stat / vfs_lstat. 
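For in-kernel callers the remaining API surface is small; a hedged sketch of when to reach for each helper (the wrapper function itself is made up):

#include <linux/fs.h>
#include <linux/fcntl.h>
#include <linux/stat.h>

static int foo_stat_examples(int dfd, char __user *name, int flag)
{
	struct kstat stat;
	int error;

	/* follow symlinks, relative to the CWD: vfs_fstatat(AT_FDCWD, name, &stat, 0) */
	error = vfs_stat(name, &stat);
	if (error)
		return error;

	/* same, but do not follow a trailing symlink (AT_SYMLINK_NOFOLLOW) */
	error = vfs_lstat(name, &stat);
	if (error)
		return error;

	/* dirfd-relative lookups keep calling vfs_fstatat() directly */
	return vfs_fstatat(dfd, name, &stat, flag);
}
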
Reviewed-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/compat.c | 18 +++++---- fs/stat.c | 105 +++++++++++++++++++++-------------------------------- include/linux/fs.h | 2 - 3 files changed, 52 insertions(+), 73 deletions(-) (limited to 'include/linux') diff --git a/fs/compat.c b/fs/compat.c index dda72e267092..379a399bf5c3 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -181,22 +181,24 @@ asmlinkage long compat_sys_newstat(char __user * filename, struct compat_stat __user *statbuf) { struct kstat stat; - int error = vfs_stat_fd(AT_FDCWD, filename, &stat); + int error; - if (!error) - error = cp_compat_stat(&stat, statbuf); - return error; + error = vfs_stat(filename, &stat); + if (error) + return error; + return cp_compat_stat(&stat, statbuf); } asmlinkage long compat_sys_newlstat(char __user * filename, struct compat_stat __user *statbuf) { struct kstat stat; - int error = vfs_lstat_fd(AT_FDCWD, filename, &stat); + int error; - if (!error) - error = cp_compat_stat(&stat, statbuf); - return error; + error = vfs_lstat(filename, &stat); + if (error) + return error; + return cp_compat_stat(&stat, statbuf); } #ifndef __ARCH_WANT_STAT64 diff --git a/fs/stat.c b/fs/stat.c index 54711662b855..075694e31d8b 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -55,46 +55,6 @@ int vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) EXPORT_SYMBOL(vfs_getattr); -int vfs_stat_fd(int dfd, char __user *name, struct kstat *stat) -{ - struct path path; - int error; - - error = user_path_at(dfd, name, LOOKUP_FOLLOW, &path); - if (!error) { - error = vfs_getattr(path.mnt, path.dentry, stat); - path_put(&path); - } - return error; -} - -int vfs_stat(char __user *name, struct kstat *stat) -{ - return vfs_stat_fd(AT_FDCWD, name, stat); -} - -EXPORT_SYMBOL(vfs_stat); - -int vfs_lstat_fd(int dfd, char __user *name, struct kstat *stat) -{ - struct path path; - int error; - - error = user_path_at(dfd, name, 0, &path); - if (!error) { - error = vfs_getattr(path.mnt, path.dentry, stat); - path_put(&path); - } - return error; -} - -int vfs_lstat(char __user *name, struct kstat *stat) -{ - return vfs_lstat_fd(AT_FDCWD, name, stat); -} - -EXPORT_SYMBOL(vfs_lstat); - int vfs_fstat(unsigned int fd, struct kstat *stat) { struct file *f = fget(fd); @@ -106,26 +66,43 @@ int vfs_fstat(unsigned int fd, struct kstat *stat) } return error; } - EXPORT_SYMBOL(vfs_fstat); int vfs_fstatat(int dfd, char __user *filename, struct kstat *stat, int flag) { + struct path path; int error = -EINVAL; + int lookup_flags = 0; if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0) goto out; - if (flag & AT_SYMLINK_NOFOLLOW) - error = vfs_lstat_fd(dfd, filename, stat); - else - error = vfs_stat_fd(dfd, filename, stat); + if (!(flag & AT_SYMLINK_NOFOLLOW)) + lookup_flags |= LOOKUP_FOLLOW; + + error = user_path_at(dfd, filename, lookup_flags, &path); + if (error) + goto out; + + error = vfs_getattr(path.mnt, path.dentry, stat); + path_put(&path); out: return error; } - EXPORT_SYMBOL(vfs_fstatat); +int vfs_stat(char __user *name, struct kstat *stat) +{ + return vfs_fstatat(AT_FDCWD, name, stat, 0); +} +EXPORT_SYMBOL(vfs_stat); + +int vfs_lstat(char __user *name, struct kstat *stat) +{ + return vfs_fstatat(AT_FDCWD, name, stat, AT_SYMLINK_NOFOLLOW); +} +EXPORT_SYMBOL(vfs_lstat); + #ifdef __ARCH_WANT_OLD_STAT @@ -173,23 +150,25 @@ static int cp_old_stat(struct kstat *stat, struct __old_kernel_stat __user * sta SYSCALL_DEFINE2(stat, char __user *, filename, struct __old_kernel_stat __user *, statbuf) { struct kstat stat; - int error = 
vfs_stat_fd(AT_FDCWD, filename, &stat); + int error; - if (!error) - error = cp_old_stat(&stat, statbuf); + error = vfs_stat(filename, &stat); + if (error) + return error; - return error; + return cp_old_stat(&stat, statbuf); } SYSCALL_DEFINE2(lstat, char __user *, filename, struct __old_kernel_stat __user *, statbuf) { struct kstat stat; - int error = vfs_lstat_fd(AT_FDCWD, filename, &stat); + int error; - if (!error) - error = cp_old_stat(&stat, statbuf); + error = vfs_lstat(filename, &stat); + if (error) + return error; - return error; + return cp_old_stat(&stat, statbuf); } SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, statbuf) @@ -258,23 +237,23 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf) SYSCALL_DEFINE2(newstat, char __user *, filename, struct stat __user *, statbuf) { struct kstat stat; - int error = vfs_stat_fd(AT_FDCWD, filename, &stat); - - if (!error) - error = cp_new_stat(&stat, statbuf); + int error = vfs_stat(filename, &stat); - return error; + if (error) + return error; + return cp_new_stat(&stat, statbuf); } SYSCALL_DEFINE2(newlstat, char __user *, filename, struct stat __user *, statbuf) { struct kstat stat; - int error = vfs_lstat_fd(AT_FDCWD, filename, &stat); + int error; - if (!error) - error = cp_new_stat(&stat, statbuf); + error = vfs_lstat(filename, &stat); + if (error) + return error; - return error; + return cp_new_stat(&stat, statbuf); } #if !defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_SYS_NEWFSTATAT) diff --git a/include/linux/fs.h b/include/linux/fs.h index 257f4d37ad23..8f42b35a7565 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2299,8 +2299,6 @@ extern int vfs_readdir(struct file *, filldir_t, void *); extern int vfs_stat(char __user *, struct kstat *); extern int vfs_lstat(char __user *, struct kstat *); -extern int vfs_stat_fd(int dfd, char __user *, struct kstat *); -extern int vfs_lstat_fd(int dfd, char __user *, struct kstat *); extern int vfs_fstat(unsigned int, struct kstat *); extern int vfs_fstatat(int , char __user *, struct kstat *, int); -- cgit v1.2.3 From 38e23c95f92a84fb8505a9f572b8a209c9c372c1 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Thu, 9 Apr 2009 20:17:52 +0900 Subject: fs: Mark get_filesystem_list() as __init function. "int get_filesystem_list(char * buf)" is called by only "static void __init get_fs_names(char *page)". We can mark get_filesystem_list() as "__init". 
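The annotation is safe because __init code is placed in .init.text and discarded once boot finishes, so it may only be reached from other boot-time code; get_filesystem_list()'s single caller, get_fs_names(), is already __init. A hedged sketch of the general pattern (the foo_* names are invented, not from this patch):

#include <linux/errno.h>
#include <linux/init.h>

/* Boot-time-only helper: its text is freed along with the rest of
 * the init sections once boot completes. */
static int __init foo_count_widgets(void)
{
	return 42;
}

/* The only caller is itself __init, so the reference is legal. */
static int __init foo_boot_setup(void)
{
	return foo_count_widgets() ? 0 : -ENODEV;
}
core_initcall(foo_boot_setup);
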
Signed-off-by: Tetsuo Handa Signed-off-by: Al Viro --- fs/filesystems.c | 2 +- include/linux/fs.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/filesystems.c b/fs/filesystems.c index 1aa70260e6d1..a24c58e181db 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -199,7 +199,7 @@ SYSCALL_DEFINE3(sysfs, int, option, unsigned long, arg1, unsigned long, arg2) return retval; } -int get_filesystem_list(char * buf) +int __init get_filesystem_list(char *buf) { int len = 0; struct file_system_type * tmp; diff --git a/include/linux/fs.h b/include/linux/fs.h index 8f42b35a7565..5bed436f4353 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2448,7 +2448,7 @@ struct ctl_table; int proc_nr_files(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); -int get_filesystem_list(char * buf); +int __init get_filesystem_list(char *buf); #endif /* __KERNEL__ */ #endif /* _LINUX_FS_H */ -- cgit v1.2.3 From be9208dff23af904655807672dd8235abf6ac039 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 20 Apr 2009 23:29:41 -0400 Subject: reiserfs: fix j_last_flush_trans_id type Conversion in commit 600ed41675d8c384519d8f0b3c76afed39ef2f4b had missed that one, but converted format from %lu to %u. As the result, /proc/..../journal got buggered on 64bit boxen. Signed-off-by: Al Viro --- include/linux/reiserfs_fs_sb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/reiserfs_fs_sb.h b/include/linux/reiserfs_fs_sb.h index 5621d87c4479..6b361d23a499 100644 --- a/include/linux/reiserfs_fs_sb.h +++ b/include/linux/reiserfs_fs_sb.h @@ -193,7 +193,7 @@ struct reiserfs_journal { atomic_t j_wcount; /* count of writers for current commit */ unsigned long j_bcount; /* batch count. allows turning X transactions into 1 */ unsigned long j_first_unflushed_offset; /* first unflushed transactions offset */ - unsigned long j_last_flush_trans_id; /* last fully flushed journal timestamp */ + unsigned j_last_flush_trans_id; /* last fully flushed journal timestamp */ struct buffer_head *j_header_bh; time_t j_trans_start_time; /* time this transaction started */ -- cgit v1.2.3 From 88bea188b85f9cefefbbd56b8a48d0f798409177 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Tue, 21 Apr 2009 00:35:47 -0400 Subject: ACPI: add /sys/firmware/acpi/interrupts/sci_not counter This counter may prove useful in debugging some spurious interrupt issues seen in the field. Signed-off-by: Len Brown --- Documentation/ABI/testing/sysfs-firmware-acpi | 8 ++++++-- drivers/acpi/osl.c | 4 +++- drivers/acpi/system.c | 11 +++++++++-- include/linux/acpi.h | 1 + 4 files changed, 19 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-firmware-acpi b/Documentation/ABI/testing/sysfs-firmware-acpi index e8ffc70ffe12..4f9ba3c2fca7 100644 --- a/Documentation/ABI/testing/sysfs-firmware-acpi +++ b/Documentation/ABI/testing/sysfs-firmware-acpi @@ -69,9 +69,13 @@ Description: gpe1F: 0 invalid gpe_all: 1192 sci: 1194 + sci_not: 0 - sci - The total number of times the ACPI SCI - has claimed an interrupt. + sci - The number of times the ACPI SCI + has been called and claimed an interrupt. + + sci_not - The number of times the ACPI SCI + has been called and NOT claimed an interrupt. gpe_all - count of SCI caused by GPEs. 
diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c index d59f08ecaf16..d916bea729f1 100644 --- a/drivers/acpi/osl.c +++ b/drivers/acpi/osl.c @@ -353,8 +353,10 @@ static irqreturn_t acpi_irq(int irq, void *dev_id) if (handled) { acpi_irq_handled++; return IRQ_HANDLED; - } else + } else { + acpi_irq_not_handled++; return IRQ_NONE; + } } acpi_status diff --git a/drivers/acpi/system.c b/drivers/acpi/system.c index da51f05ef8d8..0944daec064f 100644 --- a/drivers/acpi/system.c +++ b/drivers/acpi/system.c @@ -38,6 +38,7 @@ ACPI_MODULE_NAME("system"); #define ACPI_SYSTEM_DEVICE_NAME "System" u32 acpi_irq_handled; +u32 acpi_irq_not_handled; /* * Make ACPICA version work as module param @@ -214,8 +215,9 @@ err: #define COUNT_GPE 0 #define COUNT_SCI 1 /* acpi_irq_handled */ -#define COUNT_ERROR 2 /* other */ -#define NUM_COUNTERS_EXTRA 3 +#define COUNT_SCI_NOT 2 /* acpi_irq_not_handled */ +#define COUNT_ERROR 3 /* other */ +#define NUM_COUNTERS_EXTRA 4 struct event_counter { u32 count; @@ -317,6 +319,8 @@ static ssize_t counter_show(struct kobject *kobj, all_counters[num_gpes + ACPI_NUM_FIXED_EVENTS + COUNT_SCI].count = acpi_irq_handled; + all_counters[num_gpes + ACPI_NUM_FIXED_EVENTS + COUNT_SCI_NOT].count = + acpi_irq_not_handled; all_counters[num_gpes + ACPI_NUM_FIXED_EVENTS + COUNT_GPE].count = acpi_gpe_count; @@ -363,6 +367,7 @@ static ssize_t counter_set(struct kobject *kobj, all_counters[i].count = 0; acpi_gpe_count = 0; acpi_irq_handled = 0; + acpi_irq_not_handled = 0; goto end; } @@ -456,6 +461,8 @@ void acpi_irq_stats_init(void) sprintf(buffer, "gpe_all"); else if (i == num_gpes + ACPI_NUM_FIXED_EVENTS + COUNT_SCI) sprintf(buffer, "sci"); + else if (i == num_gpes + ACPI_NUM_FIXED_EVENTS + COUNT_SCI_NOT) + sprintf(buffer, "sci_not"); else if (i == num_gpes + ACPI_NUM_FIXED_EVENTS + COUNT_ERROR) sprintf(buffer, "error"); else diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 6586cbd0d4af..88be890ee3c7 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -111,6 +111,7 @@ int acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base); int acpi_unregister_ioapic(acpi_handle handle, u32 gsi_base); void acpi_irq_stats_init(void); extern u32 acpi_irq_handled; +extern u32 acpi_irq_not_handled; extern struct acpi_mcfg_allocation *pci_mmcfg_config; extern int pci_mmcfg_config_num; -- cgit v1.2.3 From 8b9cf76d0fa6cd98fe42dd2f86460d6ede55fed8 Mon Sep 17 00:00:00 2001 From: Thomas Bogendoerfer Date: Tue, 21 Apr 2009 13:44:13 +0200 Subject: Fix SYSCALL_ALIAS for older MIPS assembler Older MIPS assembler don't support .set for defining aliases. Using = works for old and new assembers. Signed-off-by: Thomas Bogendoerfer Acked-by: Ralf Baechle Signed-off-by: Linus Torvalds --- include/linux/syscalls.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index dabe4ad89141..40617c1d8976 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -148,7 +148,7 @@ struct old_linux_dirent; asm ("\t.globl " #alias "\n\t.set " #alias ", " #name "\n" \ "\t.globl ." #alias "\n\t.set ." #alias ", ." 
#name) #else -#ifdef CONFIG_ALPHA +#if defined(CONFIG_ALPHA) || defined(CONFIG_MIPS) #define SYSCALL_ALIAS(alias, name) \ asm ( #alias " = " #name "\n\t.globl " #alias) #else -- cgit v1.2.3 From 8e19608e8b5c001e4a66ce482edc474f05fb7355 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 21 Apr 2009 12:24:00 -0700 Subject: clocksource: pass clocksource to read() callback Pass clocksource pointer to the read() callback for clocksources. This allows us to share the callback between multiple instances. [hugh@veritas.com: fix powerpc build of clocksource pass clocksource mods] [akpm@linux-foundation.org: cleanup] Signed-off-by: Magnus Damm Acked-by: John Stultz Cc: Thomas Gleixner Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mach-at91/at91rm9200_time.c | 2 +- arch/arm/mach-at91/at91sam926x_time.c | 2 +- arch/arm/mach-davinci/time.c | 2 +- arch/arm/mach-imx/time.c | 2 +- arch/arm/mach-ixp4xx/common.c | 2 +- arch/arm/mach-msm/timer.c | 4 ++-- arch/arm/mach-netx/time.c | 2 +- arch/arm/mach-ns9xxx/time-ns9360.c | 2 +- arch/arm/mach-omap1/time.c | 2 +- arch/arm/mach-omap2/timer-gp.c | 2 +- arch/arm/mach-pxa/time.c | 2 +- arch/arm/mach-realview/core.c | 2 +- arch/arm/mach-versatile/core.c | 2 +- arch/arm/plat-mxc/time.c | 2 +- arch/arm/plat-omap/common.c | 4 ++-- arch/arm/plat-orion/time.c | 2 +- arch/avr32/kernel/time.c | 2 +- arch/blackfin/kernel/time-ts.c | 12 ++++++------ arch/ia64/kernel/cyclone.c | 2 +- arch/ia64/kernel/time.c | 4 ++-- arch/ia64/sn/kernel/sn2/timer.c | 2 +- arch/m68knommu/platform/68328/timers.c | 2 +- arch/m68knommu/platform/coldfire/dma_timer.c | 2 +- arch/m68knommu/platform/coldfire/pit.c | 2 +- arch/m68knommu/platform/coldfire/timers.c | 2 +- arch/mips/kernel/cevt-txx9.c | 2 +- arch/mips/kernel/csrc-bcm1480.c | 2 +- arch/mips/kernel/csrc-ioasic.c | 6 +++--- arch/mips/kernel/csrc-r4k.c | 2 +- arch/mips/kernel/csrc-sb1250.c | 2 +- arch/mips/kernel/i8253.c | 2 +- arch/mips/nxp/pnx8550/common/time.c | 2 +- arch/mips/sgi-ip27/ip27-timer.c | 2 +- arch/powerpc/kernel/time.c | 8 ++++---- arch/s390/kernel/time.c | 2 +- arch/sh/kernel/time_32.c | 2 +- arch/sh/kernel/timers/timer-tmu.c | 2 +- arch/sparc/kernel/time_64.c | 7 ++++++- arch/um/kernel/time.c | 2 +- arch/x86/kernel/hpet.c | 6 +++--- arch/x86/kernel/i8253.c | 2 +- arch/x86/kernel/kvmclock.c | 7 ++++++- arch/x86/kernel/tsc.c | 2 +- arch/x86/kernel/vmiclock_32.c | 2 +- arch/x86/lguest/boot.c | 2 +- arch/x86/xen/time.c | 7 ++++++- drivers/char/hpet.c | 2 +- drivers/clocksource/acpi_pm.c | 12 ++++++------ drivers/clocksource/cyclone.c | 2 +- drivers/clocksource/scx200_hrt.c | 2 +- drivers/clocksource/tcb_clksrc.c | 2 +- include/linux/clocksource.h | 6 +++--- kernel/time/clocksource.c | 8 ++++---- kernel/time/jiffies.c | 2 +- 54 files changed, 94 insertions(+), 79 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/mach-at91/at91rm9200_time.c b/arch/arm/mach-at91/at91rm9200_time.c index 1ff1bda0a894..309f3511aa20 100644 --- a/arch/arm/mach-at91/at91rm9200_time.c +++ b/arch/arm/mach-at91/at91rm9200_time.c @@ -85,7 +85,7 @@ static struct irqaction at91rm9200_timer_irq = { .handler = at91rm9200_timer_interrupt }; -static cycle_t read_clk32k(void) +static cycle_t read_clk32k(struct clocksource *cs) { return read_CRTR(); } diff --git a/arch/arm/mach-at91/at91sam926x_time.c b/arch/arm/mach-at91/at91sam926x_time.c index b63e1d5f1bad..4bd56aee4370 100644 --- a/arch/arm/mach-at91/at91sam926x_time.c +++ b/arch/arm/mach-at91/at91sam926x_time.c @@ -31,7 +31,7 @@ static u32 
pit_cnt; /* access only w/system irq blocked */ * Clocksource: just a monotonic counter of MCK/16 cycles. * We don't care whether or not PIT irqs are enabled. */ -static cycle_t read_pit_clk(void) +static cycle_t read_pit_clk(struct clocksource *cs) { unsigned long flags; u32 elapsed; diff --git a/arch/arm/mach-davinci/time.c b/arch/arm/mach-davinci/time.c index f8bcd29d17a6..6c227d4ba998 100644 --- a/arch/arm/mach-davinci/time.c +++ b/arch/arm/mach-davinci/time.c @@ -238,7 +238,7 @@ static void __init timer_init(void) /* * clocksource */ -static cycle_t read_cycles(void) +static cycle_t read_cycles(struct clocksource *cs) { struct timer_s *t = &timers[TID_CLOCKSOURCE]; diff --git a/arch/arm/mach-imx/time.c b/arch/arm/mach-imx/time.c index aff0ebcfa847..5aef18b599e5 100644 --- a/arch/arm/mach-imx/time.c +++ b/arch/arm/mach-imx/time.c @@ -73,7 +73,7 @@ static void __init imx_timer_hardware_init(void) IMX_TCTL(TIMER_BASE) = TCTL_FRR | TCTL_CLK_PCLK1 | TCTL_TEN; } -cycle_t imx_get_cycles(void) +cycle_t imx_get_cycles(struct clocksource *cs) { return IMX_TCN(TIMER_BASE); } diff --git a/arch/arm/mach-ixp4xx/common.c b/arch/arm/mach-ixp4xx/common.c index f4656d2ac8a8..1e93dfee7543 100644 --- a/arch/arm/mach-ixp4xx/common.c +++ b/arch/arm/mach-ixp4xx/common.c @@ -401,7 +401,7 @@ void __init ixp4xx_sys_init(void) /* * clocksource */ -cycle_t ixp4xx_get_cycles(void) +cycle_t ixp4xx_get_cycles(struct clocksource *cs) { return *IXP4XX_OSTS; } diff --git a/arch/arm/mach-msm/timer.c b/arch/arm/mach-msm/timer.c index 444d9c0f5ca6..4855b8ca5101 100644 --- a/arch/arm/mach-msm/timer.c +++ b/arch/arm/mach-msm/timer.c @@ -57,12 +57,12 @@ static irqreturn_t msm_timer_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -static cycle_t msm_gpt_read(void) +static cycle_t msm_gpt_read(struct clocksource *cs) { return readl(MSM_GPT_BASE + TIMER_COUNT_VAL); } -static cycle_t msm_dgt_read(void) +static cycle_t msm_dgt_read(struct clocksource *cs) { return readl(MSM_DGT_BASE + TIMER_COUNT_VAL) >> MSM_DGT_SHIFT; } diff --git a/arch/arm/mach-netx/time.c b/arch/arm/mach-netx/time.c index f201fddb594f..82801dbf0579 100644 --- a/arch/arm/mach-netx/time.c +++ b/arch/arm/mach-netx/time.c @@ -104,7 +104,7 @@ static struct irqaction netx_timer_irq = { .handler = netx_timer_interrupt, }; -cycle_t netx_get_cycles(void) +cycle_t netx_get_cycles(struct clocksource *cs) { return readl(NETX_GPIO_COUNTER_CURRENT(TIMER_CLOCKSOURCE)); } diff --git a/arch/arm/mach-ns9xxx/time-ns9360.c b/arch/arm/mach-ns9xxx/time-ns9360.c index 41df69721769..77281260358a 100644 --- a/arch/arm/mach-ns9xxx/time-ns9360.c +++ b/arch/arm/mach-ns9xxx/time-ns9360.c @@ -25,7 +25,7 @@ #define TIMER_CLOCKEVENT 1 static u32 latch; -static cycle_t ns9360_clocksource_read(void) +static cycle_t ns9360_clocksource_read(struct clocksource *cs) { return __raw_readl(SYS_TR(TIMER_CLOCKSOURCE)); } diff --git a/arch/arm/mach-omap1/time.c b/arch/arm/mach-omap1/time.c index 495a32c287b4..4d56408d3cff 100644 --- a/arch/arm/mach-omap1/time.c +++ b/arch/arm/mach-omap1/time.c @@ -198,7 +198,7 @@ static struct irqaction omap_mpu_timer2_irq = { .handler = omap_mpu_timer2_interrupt, }; -static cycle_t mpu_read(void) +static cycle_t mpu_read(struct clocksource *cs) { return ~omap_mpu_timer_read(1); } diff --git a/arch/arm/mach-omap2/timer-gp.c b/arch/arm/mach-omap2/timer-gp.c index 9fc13a2cc3f4..1cb2c0909c2b 100644 --- a/arch/arm/mach-omap2/timer-gp.c +++ b/arch/arm/mach-omap2/timer-gp.c @@ -138,7 +138,7 @@ static inline void __init omap2_gp_clocksource_init(void) {} * clocksource 
*/ static struct omap_dm_timer *gpt_clocksource; -static cycle_t clocksource_read_cycles(void) +static cycle_t clocksource_read_cycles(struct clocksource *cs) { return (cycle_t)omap_dm_timer_read_counter(gpt_clocksource); } diff --git a/arch/arm/mach-pxa/time.c b/arch/arm/mach-pxa/time.c index 8eb3830fbb0b..750c448db672 100644 --- a/arch/arm/mach-pxa/time.c +++ b/arch/arm/mach-pxa/time.c @@ -125,7 +125,7 @@ static struct clock_event_device ckevt_pxa_osmr0 = { .set_mode = pxa_osmr0_set_mode, }; -static cycle_t pxa_read_oscr(void) +static cycle_t pxa_read_oscr(struct clocksource *cs) { return OSCR; } diff --git a/arch/arm/mach-realview/core.c b/arch/arm/mach-realview/core.c index 9ab947c14f26..942e1a7eb9b2 100644 --- a/arch/arm/mach-realview/core.c +++ b/arch/arm/mach-realview/core.c @@ -715,7 +715,7 @@ static struct irqaction realview_timer_irq = { .handler = realview_timer_interrupt, }; -static cycle_t realview_get_cycles(void) +static cycle_t realview_get_cycles(struct clocksource *cs) { return ~readl(timer3_va_base + TIMER_VALUE); } diff --git a/arch/arm/mach-versatile/core.c b/arch/arm/mach-versatile/core.c index 565776680d8c..1f929c391af7 100644 --- a/arch/arm/mach-versatile/core.c +++ b/arch/arm/mach-versatile/core.c @@ -948,7 +948,7 @@ static struct irqaction versatile_timer_irq = { .handler = versatile_timer_interrupt, }; -static cycle_t versatile_get_cycles(void) +static cycle_t versatile_get_cycles(struct clocksource *cs) { return ~readl(TIMER3_VA_BASE + TIMER_VALUE); } diff --git a/arch/arm/plat-mxc/time.c b/arch/arm/plat-mxc/time.c index ef1b3cd85bd3..dab3357196fb 100644 --- a/arch/arm/plat-mxc/time.c +++ b/arch/arm/plat-mxc/time.c @@ -36,7 +36,7 @@ static enum clock_event_mode clockevent_mode = CLOCK_EVT_MODE_UNUSED; /* clock source */ -static cycle_t mxc_get_cycles(void) +static cycle_t mxc_get_cycles(struct clocksource *cs) { return __raw_readl(TIMER_BASE + MXC_TCN); } diff --git a/arch/arm/plat-omap/common.c b/arch/arm/plat-omap/common.c index d1797147732f..433021f3d7cc 100644 --- a/arch/arm/plat-omap/common.c +++ b/arch/arm/plat-omap/common.c @@ -185,7 +185,7 @@ console_initcall(omap_add_serial_console); #include -static cycle_t omap_32k_read(void) +static cycle_t omap_32k_read(struct clocksource *cs) { return omap_readl(TIMER_32K_SYNCHRONIZED); } @@ -207,7 +207,7 @@ unsigned long long sched_clock(void) { unsigned long long ret; - ret = (unsigned long long)omap_32k_read(); + ret = (unsigned long long)omap_32k_read(&clocksource_32k); ret = (ret * clocksource_32k.mult_orig) >> clocksource_32k.shift; return ret; } diff --git a/arch/arm/plat-orion/time.c b/arch/arm/plat-orion/time.c index 6fa2923e6dca..2faf9dba4ef7 100644 --- a/arch/arm/plat-orion/time.c +++ b/arch/arm/plat-orion/time.c @@ -41,7 +41,7 @@ static u32 ticks_per_jiffy; /* * Clocksource handling. 
*/ -static cycle_t orion_clksrc_read(void) +static cycle_t orion_clksrc_read(struct clocksource *cs) { return 0xffffffff - readl(TIMER0_VAL); } diff --git a/arch/avr32/kernel/time.c b/arch/avr32/kernel/time.c index 0ff46bf873b0..f27aa3b259fa 100644 --- a/arch/avr32/kernel/time.c +++ b/arch/avr32/kernel/time.c @@ -18,7 +18,7 @@ #include -static cycle_t read_cycle_count(void) +static cycle_t read_cycle_count(struct clocksource *cs) { return (cycle_t)sysreg_read(COUNT); } diff --git a/arch/blackfin/kernel/time-ts.c b/arch/blackfin/kernel/time-ts.c index 0ed2badfd746..27646121280a 100644 --- a/arch/blackfin/kernel/time-ts.c +++ b/arch/blackfin/kernel/time-ts.c @@ -58,16 +58,11 @@ static inline unsigned long long cycles_2_ns(cycle_t cyc) return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; } -static cycle_t read_cycles(void) +static cycle_t read_cycles(struct clocksource *cs) { return __bfin_cycles_off + (get_cycles() << __bfin_cycles_mod); } -unsigned long long sched_clock(void) -{ - return cycles_2_ns(read_cycles()); -} - static struct clocksource clocksource_bfin = { .name = "bfin_cycles", .rating = 350, @@ -77,6 +72,11 @@ static struct clocksource clocksource_bfin = { .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; +unsigned long long sched_clock(void) +{ + return cycles_2_ns(read_cycles(&clocksource_bfin)); +} + static int __init bfin_clocksource_init(void) { set_cyc2ns_scale(get_cclk() / 1000); diff --git a/arch/ia64/kernel/cyclone.c b/arch/ia64/kernel/cyclone.c index 790ef0d87e12..71e35864d2e2 100644 --- a/arch/ia64/kernel/cyclone.c +++ b/arch/ia64/kernel/cyclone.c @@ -21,7 +21,7 @@ void __init cyclone_setup(void) static void __iomem *cyclone_mc; -static cycle_t read_cyclone(void) +static cycle_t read_cyclone(struct clocksource *cs) { return (cycle_t)readq((void __iomem *)cyclone_mc); } diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index 641c8b61c4f1..604c1a35db33 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -33,7 +33,7 @@ #include "fsyscall_gtod_data.h" -static cycle_t itc_get_cycles(void); +static cycle_t itc_get_cycles(struct clocksource *cs); struct fsyscall_gtod_data_t fsyscall_gtod_data = { .lock = SEQLOCK_UNLOCKED, @@ -383,7 +383,7 @@ ia64_init_itm (void) } } -static cycle_t itc_get_cycles(void) +static cycle_t itc_get_cycles(struct clocksource *cs) { u64 lcycle, now, ret; diff --git a/arch/ia64/sn/kernel/sn2/timer.c b/arch/ia64/sn/kernel/sn2/timer.c index cf67fc562054..21d6f09e3447 100644 --- a/arch/ia64/sn/kernel/sn2/timer.c +++ b/arch/ia64/sn/kernel/sn2/timer.c @@ -23,7 +23,7 @@ extern unsigned long sn_rtc_cycles_per_second; -static cycle_t read_sn2(void) +static cycle_t read_sn2(struct clocksource *cs) { return (cycle_t)readq(RTC_COUNTER_ADDR); } diff --git a/arch/m68knommu/platform/68328/timers.c b/arch/m68knommu/platform/68328/timers.c index 6bafefa546e5..309f725995bf 100644 --- a/arch/m68knommu/platform/68328/timers.c +++ b/arch/m68knommu/platform/68328/timers.c @@ -75,7 +75,7 @@ static struct irqaction m68328_timer_irq = { /***************************************************************************/ -static cycle_t m68328_read_clk(void) +static cycle_t m68328_read_clk(struct clocksource *cs) { unsigned long flags; u32 cycles; diff --git a/arch/m68knommu/platform/coldfire/dma_timer.c b/arch/m68knommu/platform/coldfire/dma_timer.c index 772578b1084f..a5f562823d7a 100644 --- a/arch/m68knommu/platform/coldfire/dma_timer.c +++ b/arch/m68knommu/platform/coldfire/dma_timer.c @@ -34,7 +34,7 @@ #define DMA_DTMR_CLK_DIV_16 (2 << 1) #define 
DMA_DTMR_ENABLE (1 << 0) -static cycle_t cf_dt_get_cycles(void) +static cycle_t cf_dt_get_cycles(struct clocksource *cs) { return __raw_readl(DTCN0); } diff --git a/arch/m68knommu/platform/coldfire/pit.c b/arch/m68knommu/platform/coldfire/pit.c index 2a12e7fa9748..61b96211f8ff 100644 --- a/arch/m68knommu/platform/coldfire/pit.c +++ b/arch/m68knommu/platform/coldfire/pit.c @@ -125,7 +125,7 @@ static struct irqaction pit_irq = { /***************************************************************************/ -static cycle_t pit_read_clk(void) +static cycle_t pit_read_clk(struct clocksource *cs) { unsigned long flags; u32 cycles; diff --git a/arch/m68knommu/platform/coldfire/timers.c b/arch/m68knommu/platform/coldfire/timers.c index 454f25493491..1ba8a3731653 100644 --- a/arch/m68knommu/platform/coldfire/timers.c +++ b/arch/m68knommu/platform/coldfire/timers.c @@ -78,7 +78,7 @@ static struct irqaction mcftmr_timer_irq = { /***************************************************************************/ -static cycle_t mcftmr_read_clk(void) +static cycle_t mcftmr_read_clk(struct clocksource *cs) { unsigned long flags; u32 cycles; diff --git a/arch/mips/kernel/cevt-txx9.c b/arch/mips/kernel/cevt-txx9.c index eccf7d6096bd..2e911e3da8d3 100644 --- a/arch/mips/kernel/cevt-txx9.c +++ b/arch/mips/kernel/cevt-txx9.c @@ -22,7 +22,7 @@ static struct txx9_tmr_reg __iomem *txx9_cs_tmrptr; -static cycle_t txx9_cs_read(void) +static cycle_t txx9_cs_read(struct clocksource *cs) { return __raw_readl(&txx9_cs_tmrptr->trr); } diff --git a/arch/mips/kernel/csrc-bcm1480.c b/arch/mips/kernel/csrc-bcm1480.c index 868745e7184b..51489f8a825e 100644 --- a/arch/mips/kernel/csrc-bcm1480.c +++ b/arch/mips/kernel/csrc-bcm1480.c @@ -28,7 +28,7 @@ #include -static cycle_t bcm1480_hpt_read(void) +static cycle_t bcm1480_hpt_read(struct clocksource *cs) { return (cycle_t) __raw_readq(IOADDR(A_SCD_ZBBUS_CYCLE_COUNT)); } diff --git a/arch/mips/kernel/csrc-ioasic.c b/arch/mips/kernel/csrc-ioasic.c index 1d5f63cf8997..b551f48d3a07 100644 --- a/arch/mips/kernel/csrc-ioasic.c +++ b/arch/mips/kernel/csrc-ioasic.c @@ -25,7 +25,7 @@ #include #include -static cycle_t dec_ioasic_hpt_read(void) +static cycle_t dec_ioasic_hpt_read(struct clocksource *cs) { return ioasic_read(IO_REG_FCTR); } @@ -47,13 +47,13 @@ void __init dec_ioasic_clocksource_init(void) while (!ds1287_timer_state()) ; - start = dec_ioasic_hpt_read(); + start = dec_ioasic_hpt_read(&clocksource_dec); while (i--) while (!ds1287_timer_state()) ; - end = dec_ioasic_hpt_read(); + end = dec_ioasic_hpt_read(&clocksource_dec); freq = (end - start) * 10; printk(KERN_INFO "I/O ASIC clock frequency %dHz\n", freq); diff --git a/arch/mips/kernel/csrc-r4k.c b/arch/mips/kernel/csrc-r4k.c index f1a2893931ed..e95a3cd48eea 100644 --- a/arch/mips/kernel/csrc-r4k.c +++ b/arch/mips/kernel/csrc-r4k.c @@ -10,7 +10,7 @@ #include -static cycle_t c0_hpt_read(void) +static cycle_t c0_hpt_read(struct clocksource *cs) { return read_c0_count(); } diff --git a/arch/mips/kernel/csrc-sb1250.c b/arch/mips/kernel/csrc-sb1250.c index 92212bbb8e45..d14d3d1907fa 100644 --- a/arch/mips/kernel/csrc-sb1250.c +++ b/arch/mips/kernel/csrc-sb1250.c @@ -33,7 +33,7 @@ * The HPT is free running from SB1250_HPT_VALUE down to 0 then starts over * again. 
*/ -static cycle_t sb1250_hpt_read(void) +static cycle_t sb1250_hpt_read(struct clocksource *cs) { unsigned int count; diff --git a/arch/mips/kernel/i8253.c b/arch/mips/kernel/i8253.c index 689719e34f08..ed20e7fe65e3 100644 --- a/arch/mips/kernel/i8253.c +++ b/arch/mips/kernel/i8253.c @@ -128,7 +128,7 @@ void __init setup_pit_timer(void) * to just read by itself. So use jiffies to emulate a free * running counter: */ -static cycle_t pit_read(void) +static cycle_t pit_read(struct clocksource *cs) { unsigned long flags; int count; diff --git a/arch/mips/nxp/pnx8550/common/time.c b/arch/mips/nxp/pnx8550/common/time.c index cf293b279098..8df43e9e4d90 100644 --- a/arch/mips/nxp/pnx8550/common/time.c +++ b/arch/mips/nxp/pnx8550/common/time.c @@ -35,7 +35,7 @@ static unsigned long cpj; -static cycle_t hpt_read(void) +static cycle_t hpt_read(struct clocksource *cs) { return read_c0_count2(); } diff --git a/arch/mips/sgi-ip27/ip27-timer.c b/arch/mips/sgi-ip27/ip27-timer.c index f024057a35f8..f10a7cd64f7e 100644 --- a/arch/mips/sgi-ip27/ip27-timer.c +++ b/arch/mips/sgi-ip27/ip27-timer.c @@ -159,7 +159,7 @@ static void __init hub_rt_clock_event_global_init(void) setup_irq(irq, &hub_rt_irqaction); } -static cycle_t hub_rt_read(void) +static cycle_t hub_rt_read(struct clocksource *cs) { return REMOTE_HUB_L(cputonasid(0), PI_RT_COUNT); } diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 926ea864e34f..48571ac56fb7 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -77,7 +77,7 @@ #include #include -static cycle_t rtc_read(void); +static cycle_t rtc_read(struct clocksource *); static struct clocksource clocksource_rtc = { .name = "rtc", .rating = 400, @@ -88,7 +88,7 @@ static struct clocksource clocksource_rtc = { .read = rtc_read, }; -static cycle_t timebase_read(void); +static cycle_t timebase_read(struct clocksource *); static struct clocksource clocksource_timebase = { .name = "timebase", .rating = 400, @@ -766,12 +766,12 @@ unsigned long read_persistent_clock(void) } /* clocksource code */ -static cycle_t rtc_read(void) +static cycle_t rtc_read(struct clocksource *cs) { return (cycle_t)get_rtc(); } -static cycle_t timebase_read(void) +static cycle_t timebase_read(struct clocksource *cs) { return (cycle_t)get_tb(); } diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index 6ded50dfa75a..ef596d020573 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -201,7 +201,7 @@ unsigned long read_persistent_clock(void) return ts.tv_sec; } -static cycle_t read_tod_clock(void) +static cycle_t read_tod_clock(struct clocksource *cs) { return get_clock(); } diff --git a/arch/sh/kernel/time_32.c b/arch/sh/kernel/time_32.c index c34e1e0f9b02..1700d2465f6c 100644 --- a/arch/sh/kernel/time_32.c +++ b/arch/sh/kernel/time_32.c @@ -208,7 +208,7 @@ unsigned long long sched_clock(void) if (!clocksource_sh.rating) return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); - cycles = clocksource_sh.read(); + cycles = clocksource_sh.read(&clocksource_sh); return cyc2ns(&clocksource_sh, cycles); } #endif diff --git a/arch/sh/kernel/timers/timer-tmu.c b/arch/sh/kernel/timers/timer-tmu.c index c5d3396f5960..fe8d8930ccb6 100644 --- a/arch/sh/kernel/timers/timer-tmu.c +++ b/arch/sh/kernel/timers/timer-tmu.c @@ -81,7 +81,7 @@ static int tmu_timer_stop(void) */ static int tmus_are_scaled; -static cycle_t tmu_timer_read(void) +static cycle_t tmu_timer_read(struct clocksource *cs) { return ((cycle_t)(~_tmu_read(TMU1)))<get_tick(); +} + void __init 
time_init(void) { unsigned long freq = sparc64_init_timers(); @@ -827,7 +832,7 @@ void __init time_init(void) clocksource_tick.mult = clocksource_hz2mult(freq, clocksource_tick.shift); - clocksource_tick.read = tick_ops->get_tick; + clocksource_tick.read = clocksource_tick_read; printk("clocksource: mult[%x] shift[%d]\n", clocksource_tick.mult, clocksource_tick.shift); diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c index b13a87a3ec95..c8b9c469fcd7 100644 --- a/arch/um/kernel/time.c +++ b/arch/um/kernel/time.c @@ -65,7 +65,7 @@ static irqreturn_t um_timer(int irq, void *dev) return IRQ_HANDLED; } -static cycle_t itimer_read(void) +static cycle_t itimer_read(struct clocksource *cs) { return os_nsecs() / 1000; } diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 648b3a2a3a44..3f0019e0a229 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -722,7 +722,7 @@ static int hpet_cpuhp_notify(struct notifier_block *n, /* * Clock source related code */ -static cycle_t read_hpet(void) +static cycle_t read_hpet(struct clocksource *cs) { return (cycle_t)hpet_readl(HPET_COUNTER); } @@ -756,7 +756,7 @@ static int hpet_clocksource_register(void) hpet_restart_counter(); /* Verify whether hpet counter works */ - t1 = read_hpet(); + t1 = hpet_readl(HPET_COUNTER); rdtscll(start); /* @@ -770,7 +770,7 @@ static int hpet_clocksource_register(void) rdtscll(now); } while ((now - start) < 200000UL); - if (t1 == read_hpet()) { + if (t1 == hpet_readl(HPET_COUNTER)) { printk(KERN_WARNING "HPET counter not counting. HPET disabled\n"); return -ENODEV; diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index 3475440baa54..c2e0bb0890d4 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -129,7 +129,7 @@ void __init setup_pit_timer(void) * to just read by itself. So use jiffies to emulate a free * running counter: */ -static cycle_t pit_read(void) +static cycle_t pit_read(struct clocksource *cs) { static int old_count; static u32 old_jifs; diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 137f2e8132df..223af43f1526 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -77,6 +77,11 @@ static cycle_t kvm_clock_read(void) return ret; } +static cycle_t kvm_clock_get_cycles(struct clocksource *cs) +{ + return kvm_clock_read(); +} + /* * If we don't do that, there is the possibility that the guest * will calibrate under heavy load - thus, getting a lower lpj - @@ -107,7 +112,7 @@ static void kvm_get_preset_lpj(void) static struct clocksource kvm_clock = { .name = "kvm-clock", - .read = kvm_clock_read, + .read = kvm_clock_get_cycles, .rating = 400, .mask = CLOCKSOURCE_MASK(64), .mult = 1 << KVM_SCALE, diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 7a567ebe6361..d57de05dc430 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -699,7 +699,7 @@ static struct clocksource clocksource_tsc; * code, which is necessary to support wrapping clocksources like pm * timer. 
*/ -static cycle_t read_tsc(void) +static cycle_t read_tsc(struct clocksource *cs) { cycle_t ret = (cycle_t)get_cycles(); diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c index d303369a7bad..2b3eb82efeeb 100644 --- a/arch/x86/kernel/vmiclock_32.c +++ b/arch/x86/kernel/vmiclock_32.c @@ -283,7 +283,7 @@ void __devinit vmi_time_ap_init(void) /** vmi clocksource */ static struct clocksource clocksource_vmi; -static cycle_t read_real_cycles(void) +static cycle_t read_real_cycles(struct clocksource *cs) { cycle_t ret = (cycle_t)vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL); return max(ret, clocksource_vmi.cycle_last); diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index a2085368a3dc..ca7ec44bafc3 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -663,7 +663,7 @@ static unsigned long lguest_tsc_khz(void) /* If we can't use the TSC, the kernel falls back to our lower-priority * "lguest_clock", where we read the time value given to us by the Host. */ -static cycle_t lguest_clock_read(void) +static cycle_t lguest_clock_read(struct clocksource *cs) { unsigned long sec, nsec; diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 14f240623497..0a5aa44299a5 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -213,6 +213,11 @@ cycle_t xen_clocksource_read(void) return ret; } +static cycle_t xen_clocksource_get_cycles(struct clocksource *cs) +{ + return xen_clocksource_read(); +} + static void xen_read_wallclock(struct timespec *ts) { struct shared_info *s = HYPERVISOR_shared_info; @@ -241,7 +246,7 @@ int xen_set_wallclock(unsigned long now) static struct clocksource xen_clocksource __read_mostly = { .name = "xen", .rating = 400, - .read = xen_clocksource_read, + .read = xen_clocksource_get_cycles, .mask = ~0, .mult = 1< value1) diff --git a/drivers/clocksource/cyclone.c b/drivers/clocksource/cyclone.c index 8615059a8729..64e528e8bfa6 100644 --- a/drivers/clocksource/cyclone.c +++ b/drivers/clocksource/cyclone.c @@ -19,7 +19,7 @@ int use_cyclone = 0; static void __iomem *cyclone_ptr; -static cycle_t read_cyclone(void) +static cycle_t read_cyclone(struct clocksource *cs) { return (cycle_t)readl(cyclone_ptr); } diff --git a/drivers/clocksource/scx200_hrt.c b/drivers/clocksource/scx200_hrt.c index b92da677aa5d..27f4d9637b62 100644 --- a/drivers/clocksource/scx200_hrt.c +++ b/drivers/clocksource/scx200_hrt.c @@ -43,7 +43,7 @@ MODULE_PARM_DESC(ppm, "+-adjust to actual XO freq (ppm)"); /* The base timer frequency, * 27 if selected */ #define HRT_FREQ 1000000 -static cycle_t read_hrt(void) +static cycle_t read_hrt(struct clocksource *cs) { /* Read the timer value */ return (cycle_t) inl(scx200_cb_base + SCx200_TIMER_OFFSET); diff --git a/drivers/clocksource/tcb_clksrc.c b/drivers/clocksource/tcb_clksrc.c index 254f1064d973..01b886e68822 100644 --- a/drivers/clocksource/tcb_clksrc.c +++ b/drivers/clocksource/tcb_clksrc.c @@ -39,7 +39,7 @@ static void __iomem *tcaddr; -static cycle_t tc_get_cycles(void) +static cycle_t tc_get_cycles(struct clocksource *cs) { unsigned long flags; u32 lower, upper; diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 573819ef4cc0..0d96cde9ee5d 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -143,7 +143,7 @@ extern u64 timecounter_cyc2time(struct timecounter *tc, * 400-499: Perfect * The ideal clocksource. A must-use where * available. 
- * @read: returns a cycle value + * @read: returns a cycle value, passes clocksource as argument * @mask: bitmask for two's complement * subtraction of non 64 bit counters * @mult: cycle to nanosecond multiplier (adjusted by NTP) @@ -162,7 +162,7 @@ struct clocksource { char *name; struct list_head list; int rating; - cycle_t (*read)(void); + cycle_t (*read)(struct clocksource *cs); cycle_t mask; u32 mult; u32 mult_orig; @@ -271,7 +271,7 @@ static inline u32 clocksource_hz2mult(u32 hz, u32 shift_constant) */ static inline cycle_t clocksource_read(struct clocksource *cs) { - return cs->read(); + return cs->read(cs); } /** diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index c46c931a7fe7..ecfd7b5187e0 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -181,12 +181,12 @@ static void clocksource_watchdog(unsigned long data) resumed = test_and_clear_bit(0, &watchdog_resumed); - wdnow = watchdog->read(); + wdnow = watchdog->read(watchdog); wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask); watchdog_last = wdnow; list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) { - csnow = cs->read(); + csnow = cs->read(cs); if (unlikely(resumed)) { cs->wd_last = csnow; @@ -247,7 +247,7 @@ static void clocksource_check_watchdog(struct clocksource *cs) list_add(&cs->wd_list, &watchdog_list); if (!started && watchdog) { - watchdog_last = watchdog->read(); + watchdog_last = watchdog->read(watchdog); watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask)); @@ -268,7 +268,7 @@ static void clocksource_check_watchdog(struct clocksource *cs) cse->flags &= ~CLOCK_SOURCE_WATCHDOG; /* Start if list is not empty */ if (!list_empty(&watchdog_list)) { - watchdog_last = watchdog->read(); + watchdog_last = watchdog->read(watchdog); watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; add_timer_on(&watchdog_timer, diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 06f197560f3b..c3f6c30816e3 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -50,7 +50,7 @@ */ #define JIFFIES_SHIFT 8 -static cycle_t jiffies_read(void) +static cycle_t jiffies_read(struct clocksource *cs) { return (cycle_t) jiffies; } -- cgit v1.2.3 From 4614e6adafa2c5e6c3a9c245af2807fa7bc5117a Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 21 Apr 2009 12:24:02 -0700 Subject: clocksource: add enable() and disable() callbacks Add enable() and disable() callbacks for clocksources. This allows us to put unused clocksources in power save mode. The functions clocksource_enable() and clocksource_disable() wrap the callbacks and are inserted in the timekeeping code to enable before use and disable after switching to a new clocksource. Signed-off-by: Magnus Damm Acked-by: John Stultz Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/clocksource.h | 31 +++++++++++++++++++++++++++++++ kernel/time/timekeeping.c | 12 +++++++++--- 2 files changed, 40 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 0d96cde9ee5d..5a40d14daa9f 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -144,6 +144,8 @@ extern u64 timecounter_cyc2time(struct timecounter *tc, * The ideal clocksource. A must-use where * available. 
* @read: returns a cycle value, passes clocksource as argument + * @enable: optional function to enable the clocksource + * @disable: optional function to disable the clocksource * @mask: bitmask for two's complement * subtraction of non 64 bit counters * @mult: cycle to nanosecond multiplier (adjusted by NTP) @@ -163,6 +165,8 @@ struct clocksource { struct list_head list; int rating; cycle_t (*read)(struct clocksource *cs); + int (*enable)(struct clocksource *cs); + void (*disable)(struct clocksource *cs); cycle_t mask; u32 mult; u32 mult_orig; @@ -274,6 +278,33 @@ static inline cycle_t clocksource_read(struct clocksource *cs) return cs->read(cs); } +/** + * clocksource_enable: - enable clocksource + * @cs: pointer to clocksource + * + * Enables the specified clocksource. The clocksource callback + * function should start up the hardware and setup mult and field + * members of struct clocksource to reflect hardware capabilities. + */ +static inline int clocksource_enable(struct clocksource *cs) +{ + return cs->enable ? cs->enable(cs) : 0; +} + +/** + * clocksource_disable: - disable clocksource + * @cs: pointer to clocksource + * + * Disables the specified clocksource. The clocksource callback + * function should power down the now unused hardware block to + * save power. + */ +static inline void clocksource_disable(struct clocksource *cs) +{ + if (cs->disable) + cs->disable(cs); +} + /** * cyc2ns - converts clocksource cycles to nanoseconds * @cs: Pointer to clocksource diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 900f1b6598d1..687dff49f6e7 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -182,7 +182,7 @@ EXPORT_SYMBOL(do_settimeofday); */ static void change_clocksource(void) { - struct clocksource *new; + struct clocksource *new, *old; new = clocksource_get_next(); @@ -191,11 +191,16 @@ static void change_clocksource(void) clocksource_forward_now(); - new->raw_time = clock->raw_time; + if (clocksource_enable(new)) + return; + new->raw_time = clock->raw_time; + old = clock; clock = new; + clocksource_disable(old); + clock->cycle_last = 0; - clock->cycle_last = clocksource_read(new); + clock->cycle_last = clocksource_read(clock); clock->error = 0; clock->xtime_nsec = 0; clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); @@ -292,6 +297,7 @@ void __init timekeeping_init(void) ntp_init(); clock = clocksource_get_next(); + clocksource_enable(clock); clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); clock->cycle_last = clocksource_read(clock); -- cgit v1.2.3 From 40112ae7504745799e75ef418057f0d2cb745050 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Tue, 21 Apr 2009 12:24:03 -0700 Subject: ipmi: test for event buffer before using The IPMI driver would attempt to use the event buffer even if that didn't exist on the BMC. This patch modified the IPMI driver to check for the event buffer's existence before trying to use it. 
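A sketch of the probe logic this patch adds, reduced to its core (struct bmc_if and the two bmc_*_global_enables() helpers are invented for illustration; the real code is try_enable_event_buffer() below): read the BMC global enables, try to set IPMI_BMC_EVT_MSG_BUFF if it is clear, and treat a refused set as "no event buffer", a fact request_events() then checks before polling.

static int probe_event_buffer(struct bmc_if *bmc)
{
	u8 enables;
	int rv;

	rv = bmc_get_global_enables(bmc, &enables);	/* hypothetical helper */
	if (rv)
		return rv;

	if (enables & IPMI_BMC_EVT_MSG_BUFF)
		return 1;	/* already enabled */

	rv = bmc_set_global_enables(bmc,		/* hypothetical helper */
				    enables | IPMI_BMC_EVT_MSG_BUFF);
	return rv ? 0 : 1;	/* a refusal means no event buffer */
}
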
Signed-off-by: Corey Minyard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/ipmi/ipmi_si_intf.c | 148 +++++++++++++++++++++++++++++++-------- include/linux/ipmi_msgdefs.h | 6 ++ 2 files changed, 125 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c index 2438fdf889b4..259644646b82 100644 --- a/drivers/char/ipmi/ipmi_si_intf.c +++ b/drivers/char/ipmi/ipmi_si_intf.c @@ -82,12 +82,6 @@ #define SI_SHORT_TIMEOUT_USEC 250 /* .25ms when the SM request a short timeout */ -/* Bit for BMC global enables. */ -#define IPMI_BMC_RCV_MSG_INTR 0x01 -#define IPMI_BMC_EVT_MSG_INTR 0x02 -#define IPMI_BMC_EVT_MSG_BUFF 0x04 -#define IPMI_BMC_SYS_LOG 0x08 - enum si_intf_state { SI_NORMAL, SI_GETTING_FLAGS, @@ -220,6 +214,9 @@ struct smi_info { OEM2_DATA_AVAIL) unsigned char msg_flags; + /* Does the BMC have an event buffer? */ + char has_event_buffer; + /* * If set to true, this will request events the next time the * state machine is idle. @@ -968,7 +965,8 @@ static void request_events(void *send_info) { struct smi_info *smi_info = send_info; - if (atomic_read(&smi_info->stop_operation)) + if (atomic_read(&smi_info->stop_operation) || + !smi_info->has_event_buffer) return; atomic_set(&smi_info->req_events, 1); @@ -2407,26 +2405,9 @@ static struct of_platform_driver ipmi_of_platform_driver = { }; #endif /* CONFIG_PPC_OF */ - -static int try_get_dev_id(struct smi_info *smi_info) +static int wait_for_msg_done(struct smi_info *smi_info) { - unsigned char msg[2]; - unsigned char *resp; - unsigned long resp_len; enum si_sm_result smi_result; - int rv = 0; - - resp = kmalloc(IPMI_MAX_MSG_LENGTH, GFP_KERNEL); - if (!resp) - return -ENOMEM; - - /* - * Do a Get Device ID command, since it comes back with some - * useful info. - */ - msg[0] = IPMI_NETFN_APP_REQUEST << 2; - msg[1] = IPMI_GET_DEVICE_ID_CMD; - smi_info->handlers->start_transaction(smi_info->si_sm, msg, 2); smi_result = smi_info->handlers->event(smi_info->si_sm, 0); for (;;) { @@ -2441,16 +2422,39 @@ static int try_get_dev_id(struct smi_info *smi_info) } else break; } - if (smi_result == SI_SM_HOSED) { + if (smi_result == SI_SM_HOSED) /* * We couldn't get the state machine to run, so whatever's at * the port is probably not an IPMI SMI interface. */ - rv = -ENODEV; + return -ENODEV; + + return 0; +} + +static int try_get_dev_id(struct smi_info *smi_info) +{ + unsigned char msg[2]; + unsigned char *resp; + unsigned long resp_len; + int rv = 0; + + resp = kmalloc(IPMI_MAX_MSG_LENGTH, GFP_KERNEL); + if (!resp) + return -ENOMEM; + + /* + * Do a Get Device ID command, since it comes back with some + * useful info. + */ + msg[0] = IPMI_NETFN_APP_REQUEST << 2; + msg[1] = IPMI_GET_DEVICE_ID_CMD; + smi_info->handlers->start_transaction(smi_info->si_sm, msg, 2); + + rv = wait_for_msg_done(smi_info); + if (rv) goto out; - } - /* Otherwise, we got some data. 
*/ resp_len = smi_info->handlers->get_result(smi_info->si_sm, resp, IPMI_MAX_MSG_LENGTH); @@ -2462,6 +2466,88 @@ static int try_get_dev_id(struct smi_info *smi_info) return rv; } +static int try_enable_event_buffer(struct smi_info *smi_info) +{ + unsigned char msg[3]; + unsigned char *resp; + unsigned long resp_len; + int rv = 0; + + resp = kmalloc(IPMI_MAX_MSG_LENGTH, GFP_KERNEL); + if (!resp) + return -ENOMEM; + + msg[0] = IPMI_NETFN_APP_REQUEST << 2; + msg[1] = IPMI_GET_BMC_GLOBAL_ENABLES_CMD; + smi_info->handlers->start_transaction(smi_info->si_sm, msg, 2); + + rv = wait_for_msg_done(smi_info); + if (rv) { + printk(KERN_WARNING + "ipmi_si: Error getting response from get global," + " enables command, the event buffer is not" + " enabled.\n"); + goto out; + } + + resp_len = smi_info->handlers->get_result(smi_info->si_sm, + resp, IPMI_MAX_MSG_LENGTH); + + if (resp_len < 4 || + resp[0] != (IPMI_NETFN_APP_REQUEST | 1) << 2 || + resp[1] != IPMI_GET_BMC_GLOBAL_ENABLES_CMD || + resp[2] != 0) { + printk(KERN_WARNING + "ipmi_si: Invalid return from get global" + " enables command, cannot enable the event" + " buffer.\n"); + rv = -EINVAL; + goto out; + } + + if (resp[3] & IPMI_BMC_EVT_MSG_BUFF) + /* buffer is already enabled, nothing to do. */ + goto out; + + msg[0] = IPMI_NETFN_APP_REQUEST << 2; + msg[1] = IPMI_SET_BMC_GLOBAL_ENABLES_CMD; + msg[2] = resp[3] | IPMI_BMC_EVT_MSG_BUFF; + smi_info->handlers->start_transaction(smi_info->si_sm, msg, 3); + + rv = wait_for_msg_done(smi_info); + if (rv) { + printk(KERN_WARNING + "ipmi_si: Error getting response from set global," + " enables command, the event buffer is not" + " enabled.\n"); + goto out; + } + + resp_len = smi_info->handlers->get_result(smi_info->si_sm, + resp, IPMI_MAX_MSG_LENGTH); + + if (resp_len < 3 || + resp[0] != (IPMI_NETFN_APP_REQUEST | 1) << 2 || + resp[1] != IPMI_SET_BMC_GLOBAL_ENABLES_CMD) { + printk(KERN_WARNING + "ipmi_si: Invalid return from get global," + "enables command, not enable the event" + " buffer.\n"); + rv = -EINVAL; + goto out; + } + + if (resp[2] != 0) + /* + * An error when setting the event buffer bit means + * that the event buffer is not supported. + */ + rv = -ENOENT; + out: + kfree(resp); + return rv; +} + static int type_file_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) { @@ -2847,6 +2933,10 @@ static int try_smi_init(struct smi_info *new_smi) new_smi->intf_num = smi_num; smi_num++; + rv = try_enable_event_buffer(new_smi); + if (rv == 0) + new_smi->has_event_buffer = 1; + /* * Start clearing the flags before we enable interrupts or the * timer to avoid racing with the timer. diff --git a/include/linux/ipmi_msgdefs.h b/include/linux/ipmi_msgdefs.h index b56a158d587a..a079f586e907 100644 --- a/include/linux/ipmi_msgdefs.h +++ b/include/linux/ipmi_msgdefs.h @@ -58,6 +58,12 @@ #define IPMI_READ_EVENT_MSG_BUFFER_CMD 0x35 #define IPMI_GET_CHANNEL_INFO_CMD 0x42 +/* Bit for BMC global enables. */ +#define IPMI_BMC_RCV_MSG_INTR 0x01 +#define IPMI_BMC_EVT_MSG_INTR 0x02 +#define IPMI_BMC_EVT_MSG_BUFF 0x04 +#define IPMI_BMC_SYS_LOG 0x08 + #define IPMI_NETFN_STORAGE_REQUEST 0x0a #define IPMI_NETFN_STORAGE_RESPONSE 0x0b #define IPMI_ADD_SEL_ENTRY_CMD 0x44 -- cgit v1.2.3 From 4dec302ff71ebf48f5784a2d2fc5e3745e6d4d52 Mon Sep 17 00:00:00 2001 From: dann frazier Date: Tue, 21 Apr 2009 12:24:05 -0700 Subject: ipmi: add oem message handling Enable userspace to receive messages that a BMC transmits using an OEM medium. This is used by the HP iLO2. 
Based on code originally written by Patrick Schoeller. Signed-off-by: dann frazier Signed-off-by: Corey Minyard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/ipmi/ipmi_msghandler.c | 138 ++++++++++++++++++++++++++++++++++-- include/linux/ipmi.h | 2 + include/linux/ipmi_msgdefs.h | 2 + 3 files changed, 137 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index 83c7477ba801..aa83a0865ec1 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -3284,6 +3284,114 @@ static int handle_lan_get_msg_cmd(ipmi_smi_t intf, return rv; } +/* + * This routine will handle "Get Message" command responses with + * channels that use an OEM Medium. The message format belongs to + * the OEM. See IPMI 2.0 specification, Chapter 6 and + * Chapter 22, sections 22.6 and 22.24 for more details. + */ +static int handle_oem_get_msg_cmd(ipmi_smi_t intf, + struct ipmi_smi_msg *msg) +{ + struct cmd_rcvr *rcvr; + int rv = 0; + unsigned char netfn; + unsigned char cmd; + unsigned char chan; + ipmi_user_t user = NULL; + struct ipmi_system_interface_addr *smi_addr; + struct ipmi_recv_msg *recv_msg; + + /* + * We expect the OEM SW to perform error checking + * so we just do some basic sanity checks + */ + if (msg->rsp_size < 4) { + /* Message not big enough, just ignore it. */ + ipmi_inc_stat(intf, invalid_commands); + return 0; + } + + if (msg->rsp[2] != 0) { + /* An error getting the response, just ignore it. */ + return 0; + } + + /* + * This is an OEM Message so the OEM needs to know how + * handle the message. We do no interpretation. + */ + netfn = msg->rsp[0] >> 2; + cmd = msg->rsp[1]; + chan = msg->rsp[3] & 0xf; + + rcu_read_lock(); + rcvr = find_cmd_rcvr(intf, netfn, cmd, chan); + if (rcvr) { + user = rcvr->user; + kref_get(&user->refcount); + } else + user = NULL; + rcu_read_unlock(); + + if (user == NULL) { + /* We didn't find a user, just give up. */ + ipmi_inc_stat(intf, unhandled_commands); + + /* + * Don't do anything with these messages, just allow + * them to be freed. + */ + + rv = 0; + } else { + /* Deliver the message to the user. */ + ipmi_inc_stat(intf, handled_commands); + + recv_msg = ipmi_alloc_recv_msg(); + if (!recv_msg) { + /* + * We couldn't allocate memory for the + * message, so requeue it for handling + * later. + */ + rv = 1; + kref_put(&user->refcount, free_user); + } else { + /* + * OEM Messages are expected to be delivered via + * the system interface to SMS software. 
We might + * need to visit this again depending on OEM + * requirements + */ + smi_addr = ((struct ipmi_system_interface_addr *) + &(recv_msg->addr)); + smi_addr->addr_type = IPMI_SYSTEM_INTERFACE_ADDR_TYPE; + smi_addr->channel = IPMI_BMC_CHANNEL; + smi_addr->lun = msg->rsp[0] & 3; + + recv_msg->user = user; + recv_msg->user_msg_data = NULL; + recv_msg->recv_type = IPMI_OEM_RECV_TYPE; + recv_msg->msg.netfn = msg->rsp[0] >> 2; + recv_msg->msg.cmd = msg->rsp[1]; + recv_msg->msg.data = recv_msg->msg_data; + + /* + * The message starts at byte 4 which follows the + * the Channel Byte in the "GET MESSAGE" command + */ + recv_msg->msg.data_len = msg->rsp_size - 4; + memcpy(recv_msg->msg_data, + &(msg->rsp[4]), + msg->rsp_size - 4); + deliver_response(recv_msg); + } + } + + return rv; +} + static void copy_event_into_recv_msg(struct ipmi_recv_msg *recv_msg, struct ipmi_smi_msg *msg) { @@ -3539,6 +3647,17 @@ static int handle_new_recv_msg(ipmi_smi_t intf, goto out; } + /* + ** We need to make sure the channels have been initialized. + ** The channel_handler routine will set the "curr_channel" + ** equal to or greater than IPMI_MAX_CHANNELS when all the + ** channels for this interface have been initialized. + */ + if (intf->curr_channel < IPMI_MAX_CHANNELS) { + requeue = 1; /* Just put the message back for now */ + goto out; + } + switch (intf->channels[chan].medium) { case IPMI_CHANNEL_MEDIUM_IPMB: if (msg->rsp[4] & 0x04) { @@ -3574,11 +3693,20 @@ static int handle_new_recv_msg(ipmi_smi_t intf, break; default: - /* - * We don't handle the channel type, so just - * free the message. - */ - requeue = 0; + /* Check for OEM Channels. Clients had better + register for these commands. */ + if ((intf->channels[chan].medium + >= IPMI_CHANNEL_MEDIUM_OEM_MIN) + && (intf->channels[chan].medium + <= IPMI_CHANNEL_MEDIUM_OEM_MAX)) { + requeue = handle_oem_get_msg_cmd(intf, msg); + } else { + /* + * We don't handle the channel type, so just + * free the message. + */ + requeue = 0; + } } } else if ((msg->rsp[0] == ((IPMI_NETFN_APP_REQUEST|1) << 2)) diff --git a/include/linux/ipmi.h b/include/linux/ipmi.h index 7ebdb4fb4e54..65aae34759de 100644 --- a/include/linux/ipmi.h +++ b/include/linux/ipmi.h @@ -198,6 +198,8 @@ struct kernel_ipmi_msg { response. When you send a response message, this will be returned. */ +#define IPMI_OEM_RECV_TYPE 5 /* The response for OEM Channels */ + /* Note that async events and received commands do not have a completion code as the first byte of the incoming data, unlike a response. */ diff --git a/include/linux/ipmi_msgdefs.h b/include/linux/ipmi_msgdefs.h index a079f586e907..df97e6e31e87 100644 --- a/include/linux/ipmi_msgdefs.h +++ b/include/linux/ipmi_msgdefs.h @@ -115,5 +115,7 @@ #define IPMI_CHANNEL_MEDIUM_USB1 10 #define IPMI_CHANNEL_MEDIUM_USB2 11 #define IPMI_CHANNEL_MEDIUM_SYSINTF 12 +#define IPMI_CHANNEL_MEDIUM_OEM_MIN 0x60 +#define IPMI_CHANNEL_MEDIUM_OEM_MAX 0x7f #endif /* __LINUX_IPMI_MSGDEFS_H */ -- cgit v1.2.3 From e638c1394010859a015a3b533ee452d768e62cea Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 21 Apr 2009 12:24:41 -0700 Subject: memcg: use rcu_dereference to access mm->owner mm->owner should be accessed with rcu_dereference(). 
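The rule being applied, as a short sketch (the function itself is illustrative and assumes CONFIG_MM_OWNER, which provides mm->owner): an RCU-protected pointer must be fetched with rcu_dereference() inside an RCU read-side critical section rather than through a plain pointer read, which is what mm_match_cgroup() was doing.

#include <linux/mm_types.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>

static pid_t example_owner_pid(struct mm_struct *mm)
{
	struct task_struct *owner;
	pid_t pid = -1;

	rcu_read_lock();
	owner = rcu_dereference(mm->owner);	/* not a plain mm->owner read */
	if (owner)
		pid = owner->pid;
	rcu_read_unlock();

	return pid;
}
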
Reported-by: KOSAKI Motohiro Signed-off-by: KAMEZAWA Hiroyuki Acked-by: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 18146c980b68..a9e3b76aa884 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -75,7 +75,7 @@ int mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *cgroup) { struct mem_cgroup *mem; rcu_read_lock(); - mem = mem_cgroup_from_task((mm)->owner); + mem = mem_cgroup_from_task(rcu_dereference((mm)->owner)); rcu_read_unlock(); return cgroup == mem; } -- cgit v1.2.3 From 6e538aaf50ae782a890cbc02c27950448d8193e1 Mon Sep 17 00:00:00 2001 From: David Brownell Date: Tue, 21 Apr 2009 12:24:49 -0700 Subject: spi: documentation: emphasise spi_master.setup() semantics This is a doc-only patch which I hope will reduce the number of spi_master controller driver patches starting out with a common implementation bug. (As in: almost every spi_master driver I see starts out with its version of this bug. Sigh.) It just re-emphasizes that the setup() method may be called for one device while a transfer is active on another ... which means that most driver implementations shouldn't touch any registers. Signed-off-by: David Brownell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/spi/spi-summary | 6 ++++++ include/linux/spi/spi.h | 7 ++++++- 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/Documentation/spi/spi-summary b/Documentation/spi/spi-summary index 0f5122eb282b..4a02d2508bc8 100644 --- a/Documentation/spi/spi-summary +++ b/Documentation/spi/spi-summary @@ -511,10 +511,16 @@ SPI MASTER METHODS This sets up the device clock rate, SPI mode, and word sizes. Drivers may change the defaults provided by board_info, and then call spi_setup(spi) to invoke this routine. It may sleep. + Unless each SPI slave has its own configuration registers, don't change them right away ... otherwise drivers could corrupt I/O that's in progress for other SPI devices. + ** BUG ALERT: for some reason the first version of + ** many spi_master drivers seems to get this wrong. + ** When you code setup(), ASSUME that the controller + ** is actively processing transfers for another device. + master->transfer(struct spi_device *spi, struct spi_message *message) This must not sleep. Its responsibility is arrange that the transfer happens and its complete() callback is issued. The two diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 2cc43fa380cb..a0faa18f7b1b 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -245,7 +245,12 @@ struct spi_master { */ u16 dma_alignment; - /* setup mode and clock, etc (spi driver may call many times) */ + /* Setup mode and clock, etc (spi driver may call many times). + * + * IMPORTANT: this may be called when transfers to another + * device are active. DO NOT UPDATE SHARED REGISTERS in ways + * which could break those transfers. 
+ */ int (*setup)(struct spi_device *spi); /* bidirectional bulk transfers -- cgit v1.2.3 From 9b8de7479d0dbab1ed98b5b015d44232c9d3d08e Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 21 Apr 2009 23:00:24 +0100 Subject: FRV: Fix the section attribute on UP DECLARE_PER_CPU() In non-SMP mode, the variable section attribute specified by DECLARE_PER_CPU() does not agree with that specified by DEFINE_PER_CPU(). This means that architectures that have a small data section references relative to a base register may throw up linkage errors due to too great a displacement between where the base register points and the per-CPU variable. On FRV, the .h declaration says that the variable is in the .sdata section, but the .c definition says it's actually in the .data section. The linker throws up the following errors: kernel/built-in.o: In function `release_task': kernel/exit.c:78: relocation truncated to fit: R_FRV_GPREL12 against symbol `per_cpu__process_counts' defined in .data section in kernel/built-in.o kernel/exit.c:78: relocation truncated to fit: R_FRV_GPREL12 against symbol `per_cpu__process_counts' defined in .data section in kernel/built-in.o To fix this, DECLARE_PER_CPU() should simply apply the same section attribute as does DEFINE_PER_CPU(). However, this is made slightly more complex by virtue of the fact that there are several variants on DEFINE, so these need to be matched by variants on DECLARE. Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/percpu.h | 2 +- arch/ia64/include/asm/smp.h | 2 +- arch/x86/include/asm/desc.h | 2 +- arch/x86/include/asm/hardirq.h | 2 +- arch/x86/include/asm/processor.h | 6 +++--- arch/x86/include/asm/tlbflush.h | 2 +- include/asm-generic/percpu.h | 43 ++++++++++++++++++++++++++++++++++++++-- include/linux/percpu.h | 24 ---------------------- net/rds/rds.h | 2 +- 9 files changed, 50 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/arch/alpha/include/asm/percpu.h b/arch/alpha/include/asm/percpu.h index 3495e8e00d70..e9e0bb5a23bf 100644 --- a/arch/alpha/include/asm/percpu.h +++ b/arch/alpha/include/asm/percpu.h @@ -73,6 +73,6 @@ extern unsigned long __per_cpu_offset[NR_CPUS]; #endif /* SMP */ -#define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu_var(name) +#include #endif /* __ALPHA_PERCPU_H */ diff --git a/arch/ia64/include/asm/smp.h b/arch/ia64/include/asm/smp.h index 598408336251..d217d1d4e051 100644 --- a/arch/ia64/include/asm/smp.h +++ b/arch/ia64/include/asm/smp.h @@ -58,7 +58,7 @@ extern struct smp_boot_data { extern char no_int_routing __devinitdata; extern cpumask_t cpu_core_map[NR_CPUS]; -DECLARE_PER_CPU(cpumask_t, cpu_sibling_map); +DECLARE_PER_CPU_SHARED_ALIGNED(cpumask_t, cpu_sibling_map); extern int smp_num_siblings; extern void __iomem *ipi_base_addr; extern unsigned char smp_int_redirect; diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 5623c50d67b2..c45f415ce315 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -37,7 +37,7 @@ extern gate_desc idt_table[]; struct gdt_page { struct desc_struct gdt[GDT_ENTRIES]; } __attribute__((aligned(PAGE_SIZE))); -DECLARE_PER_CPU(struct gdt_page, gdt_page); +DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page); static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu) { diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 039db6aa8e02..37555e52f980 100644 --- a/arch/x86/include/asm/hardirq.h +++ 
b/arch/x86/include/asm/hardirq.h @@ -26,7 +26,7 @@ typedef struct { #endif } ____cacheline_aligned irq_cpustat_t; -DECLARE_PER_CPU(irq_cpustat_t, irq_stat); +DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); /* We can have at most NR_VECTORS irqs routed to a cpu at a time */ #define MAX_HARDIRQS_PER_CPU NR_VECTORS diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index fcf4d92e7e04..c2cceae709c8 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -138,7 +138,7 @@ extern struct tss_struct doublefault_tss; extern __u32 cleared_cpu_caps[NCAPINTS]; #ifdef CONFIG_SMP -DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info); +DECLARE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); #define cpu_data(cpu) per_cpu(cpu_info, cpu) #define current_cpu_data __get_cpu_var(cpu_info) #else @@ -270,7 +270,7 @@ struct tss_struct { } ____cacheline_aligned; -DECLARE_PER_CPU(struct tss_struct, init_tss); +DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss); /* * Save the original ist values for checking stack pointers during debugging @@ -393,7 +393,7 @@ union irq_stack_union { }; }; -DECLARE_PER_CPU(union irq_stack_union, irq_stack_union); +DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union); DECLARE_INIT_PER_CPU(irq_stack_union); DECLARE_PER_CPU(char *, irq_stack_ptr); diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index d3539f998f88..16a5c84b0329 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -152,7 +152,7 @@ struct tlb_state { struct mm_struct *active_mm; int state; }; -DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate); +DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate); static inline void reset_lazy_tlbstate(void) { diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h index b0e63c672ebd..af47b9e10064 100644 --- a/include/asm-generic/percpu.h +++ b/include/asm-generic/percpu.h @@ -73,11 +73,50 @@ extern void setup_per_cpu_areas(void); #endif /* SMP */ +#ifndef PER_CPU_BASE_SECTION +#ifdef CONFIG_SMP +#define PER_CPU_BASE_SECTION ".data.percpu" +#else +#define PER_CPU_BASE_SECTION ".data" +#endif +#endif + +#ifdef CONFIG_SMP + +#ifdef MODULE +#define PER_CPU_SHARED_ALIGNED_SECTION "" +#else +#define PER_CPU_SHARED_ALIGNED_SECTION ".shared_aligned" +#endif +#define PER_CPU_FIRST_SECTION ".first" + +#else + +#define PER_CPU_SHARED_ALIGNED_SECTION "" +#define PER_CPU_FIRST_SECTION "" + +#endif + #ifndef PER_CPU_ATTRIBUTES #define PER_CPU_ATTRIBUTES #endif -#define DECLARE_PER_CPU(type, name) extern PER_CPU_ATTRIBUTES \ - __typeof__(type) per_cpu_var(name) +#define DECLARE_PER_CPU_SECTION(type, name, section) \ + extern \ + __attribute__((__section__(PER_CPU_BASE_SECTION section))) \ + PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name + +#define DECLARE_PER_CPU(type, name) \ + DECLARE_PER_CPU_SECTION(type, name, "") + +#define DECLARE_PER_CPU_SHARED_ALIGNED(type, name) \ + DECLARE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \ + ____cacheline_aligned_in_smp + +#define DECLARE_PER_CPU_PAGE_ALIGNED(type, name) \ + DECLARE_PER_CPU_SECTION(type, name, ".page_aligned") + +#define DECLARE_PER_CPU_FIRST(type, name) \ + DECLARE_PER_CPU_SECTION(type, name, PER_CPU_FIRST_SECTION) #endif /* _ASM_GENERIC_PERCPU_H_ */ diff --git a/include/linux/percpu.h b/include/linux/percpu.h index cfda2d5ad319..f052d8184993 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -9,30 +9,6 @@ #include -#ifndef 
PER_CPU_BASE_SECTION -#ifdef CONFIG_SMP -#define PER_CPU_BASE_SECTION ".data.percpu" -#else -#define PER_CPU_BASE_SECTION ".data" -#endif -#endif - -#ifdef CONFIG_SMP - -#ifdef MODULE -#define PER_CPU_SHARED_ALIGNED_SECTION "" -#else -#define PER_CPU_SHARED_ALIGNED_SECTION ".shared_aligned" -#endif -#define PER_CPU_FIRST_SECTION ".first" - -#else - -#define PER_CPU_SHARED_ALIGNED_SECTION "" -#define PER_CPU_FIRST_SECTION "" - -#endif - #define DEFINE_PER_CPU_SECTION(type, name, section) \ __attribute__((__section__(PER_CPU_BASE_SECTION section))) \ PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name diff --git a/net/rds/rds.h b/net/rds/rds.h index 619f0a30a4e5..71794449ca4e 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -638,7 +638,7 @@ struct rds_message *rds_send_get_message(struct rds_connection *, void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force); /* stats.c */ -DECLARE_PER_CPU(struct rds_statistics, rds_stats); +DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats); #define rds_stats_inc_which(which, member) do { \ per_cpu(which, get_cpu()).member++; \ put_cpu(); \ -- cgit v1.2.3 From 5028eaa97dd1dab9cd7c30c4d38f71c708ca64bc Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 21 Apr 2009 23:00:29 +0100 Subject: PERCPU: Collect the DECLARE/DEFINE declarations together Collect the DECLARE/DEFINE declarations together in linux/percpu-defs.h so that they're in one place, and give them descriptive comments, particularly the SHARED_ALIGNED variant. It would be nice to collect these in linux/percpu.h, but that's not possible without sorting out the severe #include recursion between the x86 arch headers and the general headers (and possibly other arches too). Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- include/asm-generic/percpu.h | 26 ++------------ include/linux/percpu-defs.h | 84 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/percpu.h | 20 ----------- 3 files changed, 86 insertions(+), 44 deletions(-) create mode 100644 include/linux/percpu-defs.h (limited to 'include/linux') diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h index af47b9e10064..d7d50d7ee51e 100644 --- a/include/asm-generic/percpu.h +++ b/include/asm-generic/percpu.h @@ -1,13 +1,9 @@ #ifndef _ASM_GENERIC_PERCPU_H_ #define _ASM_GENERIC_PERCPU_H_ + #include #include - -/* - * Determine the real variable name from the name visible in the - * kernel sources. 
- */ -#define per_cpu_var(var) per_cpu__##var +#include #ifdef CONFIG_SMP @@ -101,22 +97,4 @@ extern void setup_per_cpu_areas(void); #define PER_CPU_ATTRIBUTES #endif -#define DECLARE_PER_CPU_SECTION(type, name, section) \ - extern \ - __attribute__((__section__(PER_CPU_BASE_SECTION section))) \ - PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name - -#define DECLARE_PER_CPU(type, name) \ - DECLARE_PER_CPU_SECTION(type, name, "") - -#define DECLARE_PER_CPU_SHARED_ALIGNED(type, name) \ - DECLARE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \ - ____cacheline_aligned_in_smp - -#define DECLARE_PER_CPU_PAGE_ALIGNED(type, name) \ - DECLARE_PER_CPU_SECTION(type, name, ".page_aligned") - -#define DECLARE_PER_CPU_FIRST(type, name) \ - DECLARE_PER_CPU_SECTION(type, name, PER_CPU_FIRST_SECTION) - #endif /* _ASM_GENERIC_PERCPU_H_ */ diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h new file mode 100644 index 000000000000..8f921d74f49f --- /dev/null +++ b/include/linux/percpu-defs.h @@ -0,0 +1,84 @@ +#ifndef _LINUX_PERCPU_DEFS_H +#define _LINUX_PERCPU_DEFS_H + +/* + * Determine the real variable name from the name visible in the + * kernel sources. + */ +#define per_cpu_var(var) per_cpu__##var + +/* + * Base implementations of per-CPU variable declarations and definitions, where + * the section in which the variable is to be placed is provided by the + * 'section' argument. This may be used to affect the parameters governing the + * variable's storage. + * + * NOTE! The sections for the DECLARE and for the DEFINE must match, lest + * linkage errors occur due the compiler generating the wrong code to access + * that section. + */ +#define DECLARE_PER_CPU_SECTION(type, name, section) \ + extern \ + __attribute__((__section__(PER_CPU_BASE_SECTION section))) \ + PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name + +#define DEFINE_PER_CPU_SECTION(type, name, section) \ + __attribute__((__section__(PER_CPU_BASE_SECTION section))) \ + PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name + +/* + * Variant on the per-CPU variable declaration/definition theme used for + * ordinary per-CPU variables. + */ +#define DECLARE_PER_CPU(type, name) \ + DECLARE_PER_CPU_SECTION(type, name, "") + +#define DEFINE_PER_CPU(type, name) \ + DEFINE_PER_CPU_SECTION(type, name, "") + +/* + * Declaration/definition used for per-CPU variables that must come first in + * the set of variables. + */ +#define DECLARE_PER_CPU_FIRST(type, name) \ + DECLARE_PER_CPU_SECTION(type, name, PER_CPU_FIRST_SECTION) + +#define DEFINE_PER_CPU_FIRST(type, name) \ + DEFINE_PER_CPU_SECTION(type, name, PER_CPU_FIRST_SECTION) + +/* + * Declaration/definition used for per-CPU variables that must be cacheline + * aligned under SMP conditions so that, whilst a particular instance of the + * data corresponds to a particular CPU, inefficiencies due to direct access by + * other CPUs are reduced by preventing the data from unnecessarily spanning + * cachelines. + * + * An example of this would be statistical data, where each CPU's set of data + * is updated by that CPU alone, but the data from across all CPUs is collated + * by a CPU processing a read from a proc file. 
+ */ +#define DECLARE_PER_CPU_SHARED_ALIGNED(type, name) \ + DECLARE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \ + ____cacheline_aligned_in_smp + +#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \ + DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \ + ____cacheline_aligned_in_smp + +/* + * Declaration/definition used for per-CPU variables that must be page aligned. + */ +#define DECLARE_PER_CPU_PAGE_ALIGNED(type, name) \ + DECLARE_PER_CPU_SECTION(type, name, ".page_aligned") + +#define DEFINE_PER_CPU_PAGE_ALIGNED(type, name) \ + DEFINE_PER_CPU_SECTION(type, name, ".page_aligned") + +/* + * Intermodule exports for per-CPU variables. + */ +#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var) +#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var) + + +#endif /* _LINUX_PERCPU_DEFS_H */ diff --git a/include/linux/percpu.h b/include/linux/percpu.h index f052d8184993..1581ff235c7e 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -9,26 +9,6 @@ #include -#define DEFINE_PER_CPU_SECTION(type, name, section) \ - __attribute__((__section__(PER_CPU_BASE_SECTION section))) \ - PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name - -#define DEFINE_PER_CPU(type, name) \ - DEFINE_PER_CPU_SECTION(type, name, "") - -#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \ - DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \ - ____cacheline_aligned_in_smp - -#define DEFINE_PER_CPU_PAGE_ALIGNED(type, name) \ - DEFINE_PER_CPU_SECTION(type, name, ".page_aligned") - -#define DEFINE_PER_CPU_FIRST(type, name) \ - DEFINE_PER_CPU_SECTION(type, name, PER_CPU_FIRST_SECTION) - -#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var) -#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var) - /* enough to cover all DEFINE_PER_CPUs in modules */ #ifdef CONFIG_MODULES #define PERCPU_MODULE_RESERVE (8 << 10) -- cgit v1.2.3 From 5dd559f020c98a2a4b3e063f09c0e4bc771ed838 Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Tue, 21 Apr 2009 16:30:32 -0600 Subject: Trivial: fix a typo in slow-work.h Fix a comment typo in slow-work.h ...a trivial mistake, but it will mess up kerneldoc if nothing else. Signed-off-by: Jonathan Corbet Signed-off-by: Linus Torvalds --- include/linux/slow-work.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/slow-work.h b/include/linux/slow-work.h index 85958277f83d..b65c8881f07a 100644 --- a/include/linux/slow-work.h +++ b/include/linux/slow-work.h @@ -67,7 +67,7 @@ static inline void slow_work_init(struct slow_work *work, } /** - * slow_work_init - Initialise a very slow work item + * vslow_work_init - Initialise a very slow work item * @work: The work item to initialise * @ops: The operations to use to handle the slow work item * -- cgit v1.2.3 From d4d5291c8cd499b1b590336059d5cc3e24c1ced6 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Tue, 21 Apr 2009 13:32:54 -0700 Subject: driver synchronization: make scsi_wait_scan more advanced There is currently only one way for userspace to say "wait for my storage device to get ready for the modules I just loaded": to load the scsi_wait_scan module. Expectations of userspace are that once this module is loaded, all the (storage) devices for which the drivers were loaded before the module load are present. 
Now, there are some issues with the implementation, and the async stuff got caught in the middle of this: The existing code only waits for the scsy async probing to finish, but it did not take into account at all that probing might not have begun yet. (Russell ran into this problem on his computer and the fix works for him) This patch fixes this more thoroughly than the previous "fix", which had some bad side effects (namely, for kernel code that wanted to wait for the scsi scan it would also do an async sync, which would deadlock if you did it from async context already.. there's a report about that on lkml): The patch makes the module first wait for all device driver probes, and then it will wait for the scsi parallel scan to finish. Signed-off-by: Arjan van de Ven Tested-by: Russell King Signed-off-by: Linus Torvalds --- drivers/base/dd.c | 1 + drivers/scsi/scsi_scan.c | 2 -- drivers/scsi/scsi_wait_scan.c | 11 +++++++++++ include/linux/device.h | 1 + 4 files changed, 13 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/dd.c b/drivers/base/dd.c index f17c3266a0e0..742cbe6b042b 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -179,6 +179,7 @@ void wait_for_device_probe(void) wait_event(probe_waitqueue, atomic_read(&probe_count) == 0); async_synchronize_full(); } +EXPORT_SYMBOL_GPL(wait_for_device_probe); /** * driver_probe_device - attempt to bind device & driver together diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c index a14d245a66b8..6f51ca485f35 100644 --- a/drivers/scsi/scsi_scan.c +++ b/drivers/scsi/scsi_scan.c @@ -180,8 +180,6 @@ int scsi_complete_async_scans(void) spin_unlock(&async_scan_lock); kfree(data); - /* Synchronize async operations globally */ - async_synchronize_full(); return 0; } diff --git a/drivers/scsi/scsi_wait_scan.c b/drivers/scsi/scsi_wait_scan.c index 2f21af21269a..74708fcaf82f 100644 --- a/drivers/scsi/scsi_wait_scan.c +++ b/drivers/scsi/scsi_wait_scan.c @@ -11,10 +11,21 @@ */ #include +#include #include static int __init wait_scan_init(void) { + /* + * First we need to wait for device probing to finish; + * the drivers we just loaded might just still be probing + * and might not yet have reached the scsi async scanning + */ + wait_for_device_probe(); + /* + * and then we wait for the actual asynchronous scsi scan + * to finish. + */ scsi_complete_async_scans(); return 0; } diff --git a/include/linux/device.h b/include/linux/device.h index 2918c0e8fdfd..6a69caaac18a 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -551,6 +551,7 @@ extern int (*platform_notify_remove)(struct device *dev); extern struct device *get_device(struct device *dev); extern void put_device(struct device *dev); +extern void wait_for_device_probe(void); /* drivers/base/power/shutdown.c */ extern void device_shutdown(void); -- cgit v1.2.3 From 451a9ebf653d28337ba53ed5b4b70b0b9543cca1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 15 Apr 2009 19:50:51 +0200 Subject: bio: fix bio_kmalloc() Impact: fix bio_kmalloc() and its destruction path bio_kmalloc() was broken in two ways. * bvec_alloc_bs() first allocates bvec using kmalloc() and then ignores it and allocates again like non-kmalloc bvecs. * bio_kmalloc_destructor() didn't check for and free bio integrity data. This patch fixes the above problems. kmalloc patch is separated out from bio_alloc_bioset() and allocates the requested number of bvecs as inline bvecs. * bio_alloc_bioset() no longer takes NULL @bs. 
None other than bio_kmalloc() used it and outside users can't know how it was allocated anyway. * Define and use BIO_POOL_NONE so that pool index check in bvec_free_bs() triggers if inline or kmalloc allocated bvec gets there. * Relocate destructors on top of each allocation function so that how they're used is more clear. Jens Axboe suggested allocating bvecs inline. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- fs/bio.c | 118 ++++++++++++++++++++++++---------------------------- include/linux/bio.h | 1 + 2 files changed, 55 insertions(+), 64 deletions(-) (limited to 'include/linux') diff --git a/fs/bio.c b/fs/bio.c index cd42bb882f30..d35588fd6d57 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -174,14 +174,6 @@ struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, { struct bio_vec *bvl; - /* - * If 'bs' is given, lookup the pool and do the mempool alloc. - * If not, this is a bio_kmalloc() allocation and just do a - * kzalloc() for the exact number of vecs right away. - */ - if (!bs) - bvl = kmalloc(nr * sizeof(struct bio_vec), gfp_mask); - /* * see comment near bvec_array define! */ @@ -260,21 +252,6 @@ void bio_free(struct bio *bio, struct bio_set *bs) mempool_free(p, bs->bio_pool); } -/* - * default destructor for a bio allocated with bio_alloc_bioset() - */ -static void bio_fs_destructor(struct bio *bio) -{ - bio_free(bio, fs_bio_set); -} - -static void bio_kmalloc_destructor(struct bio *bio) -{ - if (bio_has_allocated_vec(bio)) - kfree(bio->bi_io_vec); - kfree(bio); -} - void bio_init(struct bio *bio) { memset(bio, 0, sizeof(*bio)); @@ -301,21 +278,15 @@ void bio_init(struct bio *bio) **/ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) { + unsigned long idx = BIO_POOL_NONE; struct bio_vec *bvl = NULL; - struct bio *bio = NULL; - unsigned long idx = 0; - void *p = NULL; - - if (bs) { - p = mempool_alloc(bs->bio_pool, gfp_mask); - if (!p) - goto err; - bio = p + bs->front_pad; - } else { - bio = kmalloc(sizeof(*bio), gfp_mask); - if (!bio) - goto err; - } + struct bio *bio; + void *p; + + p = mempool_alloc(bs->bio_pool, gfp_mask); + if (unlikely(!p)) + return NULL; + bio = p + bs->front_pad; bio_init(bio); @@ -332,22 +303,50 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) nr_iovecs = bvec_nr_vecs(idx); } +out_set: bio->bi_flags |= idx << BIO_POOL_OFFSET; bio->bi_max_vecs = nr_iovecs; -out_set: bio->bi_io_vec = bvl; - return bio; err_free: - if (bs) - mempool_free(p, bs->bio_pool); - else - kfree(bio); -err: + mempool_free(p, bs->bio_pool); return NULL; } +static void bio_fs_destructor(struct bio *bio) +{ + bio_free(bio, fs_bio_set); +} + +/** + * bio_alloc - allocate a new bio, memory pool backed + * @gfp_mask: allocation mask to use + * @nr_iovecs: number of iovecs + * + * Allocate a new bio with @nr_iovecs bvecs. If @gfp_mask + * contains __GFP_WAIT, the allocation is guaranteed to succeed. + * + * RETURNS: + * Pointer to new bio on success, NULL on failure. + */ +struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs) +{ + struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set); + + if (bio) + bio->bi_destructor = bio_fs_destructor; + + return bio; +} + +static void bio_kmalloc_destructor(struct bio *bio) +{ + if (bio_integrity(bio)) + bio_integrity_free(bio); + kfree(bio); +} + /** * bio_alloc - allocate a bio for I/O * @gfp_mask: the GFP_ mask given to the slab allocator @@ -366,29 +365,20 @@ err: * do so can cause livelocks under memory pressure. 
* **/ -struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs) -{ - struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set); - - if (bio) - bio->bi_destructor = bio_fs_destructor; - - return bio; -} - -/* - * Like bio_alloc(), but doesn't use a mempool backing. This means that - * it CAN fail, but while bio_alloc() can only be used for allocations - * that have a short (finite) life span, bio_kmalloc() should be used - * for more permanent bio allocations (like allocating some bio's for - * initalization or setup purposes). - */ struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs) { - struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, NULL); + struct bio *bio; - if (bio) - bio->bi_destructor = bio_kmalloc_destructor; + bio = kmalloc(sizeof(struct bio) + nr_iovecs * sizeof(struct bio_vec), + gfp_mask); + if (unlikely(!bio)) + return NULL; + + bio_init(bio); + bio->bi_flags |= BIO_POOL_NONE << BIO_POOL_OFFSET; + bio->bi_max_vecs = nr_iovecs; + bio->bi_io_vec = bio->bi_inline_vecs; + bio->bi_destructor = bio_kmalloc_destructor; return bio; } diff --git a/include/linux/bio.h b/include/linux/bio.h index b89cf2d82898..7b214fd672a2 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -132,6 +132,7 @@ struct bio { * top 4 bits of bio flags indicate the pool this bio came from */ #define BIO_POOL_BITS (4) +#define BIO_POOL_NONE ((1UL << BIO_POOL_BITS) - 1) #define BIO_POOL_OFFSET (BITS_PER_LONG - BIO_POOL_BITS) #define BIO_POOL_MASK (1UL << BIO_POOL_OFFSET) #define BIO_POOL_IDX(bio) ((bio)->bi_flags >> BIO_POOL_OFFSET) -- cgit v1.2.3 From 71982a409f12c50d011325a4471aa20666bb908d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 17 Apr 2009 08:34:48 +0200 Subject: block: include empty disks in /proc/diskstats /proc/diskstats used to show stats for all disks whether they're zero-sized or not and their non-zero partitions. Commit 074a7aca7afa6f230104e8e65eba3420263714a5 accidentally changed the behavior such that it doesn't print out zero sized disks. This patch implements DISK_PITER_INCL_EMPTY_PART0 flag to partition iterator and uses it in diskstats_show() such that empty part0 is shown in /proc/diskstats. Reported and bisectd by Dianel Collins. 
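For illustration, a minimal sketch of how the new flag is meant to be used when walking a disk's partitions, mirroring the diskstats_show() hunk below (gp is an assumed, already-referenced struct gendisk; the printk is only a stand-in for the real stat output):

        struct disk_part_iter piter;
        struct hd_struct *hd;

        /* visit part0 even when the whole disk reports zero sectors */
        disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0);
        while ((hd = disk_part_iter_next(&piter))) {
                /* the iterator holds a reference on hd while we use it */
                pr_info("part%d: %llu sectors\n", hd->partno,
                        (unsigned long long)hd->nr_sects);
        }
        disk_part_iter_exit(&piter);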
Signed-off-by: Tejun Heo Reported-by: Daniel Collins Signed-off-by: Jens Axboe --- block/genhd.c | 12 ++++++++---- include/linux/genhd.h | 1 + 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/block/genhd.c b/block/genhd.c index a9ec910974c1..1a4916e01732 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -98,7 +98,7 @@ void disk_part_iter_init(struct disk_part_iter *piter, struct gendisk *disk, if (flags & DISK_PITER_REVERSE) piter->idx = ptbl->len - 1; - else if (flags & DISK_PITER_INCL_PART0) + else if (flags & (DISK_PITER_INCL_PART0 | DISK_PITER_INCL_EMPTY_PART0)) piter->idx = 0; else piter->idx = 1; @@ -134,7 +134,8 @@ struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter) /* determine iteration parameters */ if (piter->flags & DISK_PITER_REVERSE) { inc = -1; - if (piter->flags & DISK_PITER_INCL_PART0) + if (piter->flags & (DISK_PITER_INCL_PART0 | + DISK_PITER_INCL_EMPTY_PART0)) end = -1; else end = 0; @@ -150,7 +151,10 @@ struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter) part = rcu_dereference(ptbl->part[piter->idx]); if (!part) continue; - if (!(piter->flags & DISK_PITER_INCL_EMPTY) && !part->nr_sects) + if (!part->nr_sects && + !(piter->flags & DISK_PITER_INCL_EMPTY) && + !(piter->flags & DISK_PITER_INCL_EMPTY_PART0 && + piter->idx == 0)) continue; get_device(part_to_dev(part)); @@ -1011,7 +1015,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) "\n\n"); */ - disk_part_iter_init(&piter, gp, DISK_PITER_INCL_PART0); + disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0); while ((hd = disk_part_iter_next(&piter))) { cpu = part_stat_lock(); part_round_stats(cpu, hd); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 634c53028fb8..a1a28caed23d 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -214,6 +214,7 @@ static inline void disk_put_part(struct hd_struct *part) #define DISK_PITER_REVERSE (1 << 0) /* iterate in the reverse direction */ #define DISK_PITER_INCL_EMPTY (1 << 1) /* include 0-sized parts */ #define DISK_PITER_INCL_PART0 (1 << 2) /* include partition 0 */ +#define DISK_PITER_INCL_EMPTY_PART0 (1 << 3) /* include empty partition 0 */ struct disk_part_iter { struct gendisk *disk; -- cgit v1.2.3 From 4cd481f68dde99ac416003b825c835f71e364393 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 13 Apr 2009 11:59:32 +0200 Subject: KVM: Fix overlapping check for memory slots When checking for overlapping slots on registration of a new one, kvm currently also considers zero-length (ie. deleted) slots and rejects requests incorrectly. This finally denies user space from joining slots. Fix the check by skipping deleted slots and advertise this via a KVM_CAP_JOIN_MEMORY_REGIONS_WORKS. 
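On the user-space side, a hedged sketch of how a VMM could test for the fixed behaviour before relying on deleting and re-joining adjoining slots (kvm_fd is an assumed, already-open /dev/kvm descriptor):

        #include <sys/ioctl.h>
        #include <linux/kvm.h>

        static int join_memory_regions_works(int kvm_fd)
        {
                int ret = ioctl(kvm_fd, KVM_CHECK_EXTENSION,
                                KVM_CAP_JOIN_MEMORY_REGIONS_WORKS);

                return ret > 0;         /* 0 or an error means "not fixed" */
        }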
Cc: stable@kernel.org Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- include/linux/kvm.h | 2 ++ virt/kvm/kvm_main.c | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 311a073afe8a..8cc137911b34 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -409,6 +409,8 @@ struct kvm_trace_rec { #ifdef __KVM_HAVE_DEVICE_ASSIGNMENT #define KVM_CAP_DEVICE_DEASSIGNMENT 27 #endif +/* Another bug in KVM_SET_USER_MEMORY_REGION fixed: */ +#define KVM_CAP_JOIN_MEMORY_REGIONS_WORKS 30 #ifdef KVM_CAP_IRQ_ROUTING diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 28d693a1ee8f..1ecbe2391c8b 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -961,7 +961,7 @@ int __kvm_set_memory_region(struct kvm *kvm, for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { struct kvm_memory_slot *s = &kvm->memslots[i]; - if (s == memslot) + if (s == memslot || !s->npages) continue; if (!((base_gfn + npages <= s->base_gfn) || (base_gfn >= s->base_gfn + s->npages))) @@ -1983,6 +1983,7 @@ static long kvm_dev_ioctl_check_extension_generic(long arg) switch (arg) { case KVM_CAP_USER_MEMORY: case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: + case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS: return 1; #ifdef CONFIG_HAVE_KVM_IRQCHIP case KVM_CAP_IRQ_ROUTING: -- cgit v1.2.3 From 1b6b8ce2ac372ea1f2065b89228ede105eb68dc5 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Thu, 9 Apr 2009 14:57:39 +0800 Subject: PCI: only save/restore existent registers in the PCIe capability PCIe 1.1 base neither requires the endpoint to implement the entire PCIe capability structure nor specifies default values of registers that are not implemented by the device. So we only save and restore registers that must be implemented by different device types if the device PCIe capability version is 1. PCIe 1.1 Capability Structure Expansion ECN and PCIe 2.0 requires all registers in the PCIe capability to be either implemented or hardwired to 0. Their PCIe capability version is 2. 
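The distinction boils down to the version field in the PCIe capability flags; a minimal sketch of the check (dev is an assumed struct pci_dev known to expose the capability):

        int pos;
        u16 flags;

        pos = pci_find_capability(dev, PCI_CAP_ID_EXP);
        if (pos) {
                pci_read_config_word(dev, pos + PCI_EXP_FLAGS, &flags);
                if ((flags & PCI_EXP_FLAGS_VERS) > 1) {
                        /* v2+: all registers implemented or hardwired to 0 */
                } else {
                        /* v1: touch only what the device type mandates */
                }
        }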
Signed-off-by: Yu Zhao Signed-off-by: Jesse Barnes --- drivers/pci/pci.c | 70 ++++++++++++++++++++++++++++++++++++++---------- include/linux/pci_regs.h | 1 + 2 files changed, 57 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 16fd0d4c3166..34bf0fdf5047 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -681,11 +681,34 @@ EXPORT_SYMBOL(pci_choose_state); #define PCI_EXP_SAVE_REGS 7 +#define pcie_cap_has_devctl(type, flags) 1 +#define pcie_cap_has_lnkctl(type, flags) \ + ((flags & PCI_EXP_FLAGS_VERS) > 1 || \ + (type == PCI_EXP_TYPE_ROOT_PORT || \ + type == PCI_EXP_TYPE_ENDPOINT || \ + type == PCI_EXP_TYPE_LEG_END)) +#define pcie_cap_has_sltctl(type, flags) \ + ((flags & PCI_EXP_FLAGS_VERS) > 1 || \ + ((type == PCI_EXP_TYPE_ROOT_PORT) || \ + (type == PCI_EXP_TYPE_DOWNSTREAM && \ + (flags & PCI_EXP_FLAGS_SLOT)))) +#define pcie_cap_has_rtctl(type, flags) \ + ((flags & PCI_EXP_FLAGS_VERS) > 1 || \ + (type == PCI_EXP_TYPE_ROOT_PORT || \ + type == PCI_EXP_TYPE_RC_EC)) +#define pcie_cap_has_devctl2(type, flags) \ + ((flags & PCI_EXP_FLAGS_VERS) > 1) +#define pcie_cap_has_lnkctl2(type, flags) \ + ((flags & PCI_EXP_FLAGS_VERS) > 1) +#define pcie_cap_has_sltctl2(type, flags) \ + ((flags & PCI_EXP_FLAGS_VERS) > 1) + static int pci_save_pcie_state(struct pci_dev *dev) { int pos, i = 0; struct pci_cap_saved_state *save_state; u16 *cap; + u16 flags; pos = pci_find_capability(dev, PCI_CAP_ID_EXP); if (pos <= 0) @@ -698,13 +721,22 @@ static int pci_save_pcie_state(struct pci_dev *dev) } cap = (u16 *)&save_state->data[0]; - pci_read_config_word(dev, pos + PCI_EXP_DEVCTL, &cap[i++]); - pci_read_config_word(dev, pos + PCI_EXP_LNKCTL, &cap[i++]); - pci_read_config_word(dev, pos + PCI_EXP_SLTCTL, &cap[i++]); - pci_read_config_word(dev, pos + PCI_EXP_RTCTL, &cap[i++]); - pci_read_config_word(dev, pos + PCI_EXP_DEVCTL2, &cap[i++]); - pci_read_config_word(dev, pos + PCI_EXP_LNKCTL2, &cap[i++]); - pci_read_config_word(dev, pos + PCI_EXP_SLTCTL2, &cap[i++]); + pci_read_config_word(dev, pos + PCI_EXP_FLAGS, &flags); + + if (pcie_cap_has_devctl(dev->pcie_type, flags)) + pci_read_config_word(dev, pos + PCI_EXP_DEVCTL, &cap[i++]); + if (pcie_cap_has_lnkctl(dev->pcie_type, flags)) + pci_read_config_word(dev, pos + PCI_EXP_LNKCTL, &cap[i++]); + if (pcie_cap_has_sltctl(dev->pcie_type, flags)) + pci_read_config_word(dev, pos + PCI_EXP_SLTCTL, &cap[i++]); + if (pcie_cap_has_rtctl(dev->pcie_type, flags)) + pci_read_config_word(dev, pos + PCI_EXP_RTCTL, &cap[i++]); + if (pcie_cap_has_devctl2(dev->pcie_type, flags)) + pci_read_config_word(dev, pos + PCI_EXP_DEVCTL2, &cap[i++]); + if (pcie_cap_has_lnkctl2(dev->pcie_type, flags)) + pci_read_config_word(dev, pos + PCI_EXP_LNKCTL2, &cap[i++]); + if (pcie_cap_has_sltctl2(dev->pcie_type, flags)) + pci_read_config_word(dev, pos + PCI_EXP_SLTCTL2, &cap[i++]); return 0; } @@ -714,6 +746,7 @@ static void pci_restore_pcie_state(struct pci_dev *dev) int i = 0, pos; struct pci_cap_saved_state *save_state; u16 *cap; + u16 flags; save_state = pci_find_saved_cap(dev, PCI_CAP_ID_EXP); pos = pci_find_capability(dev, PCI_CAP_ID_EXP); @@ -721,13 +754,22 @@ static void pci_restore_pcie_state(struct pci_dev *dev) return; cap = (u16 *)&save_state->data[0]; - pci_write_config_word(dev, pos + PCI_EXP_DEVCTL, cap[i++]); - pci_write_config_word(dev, pos + PCI_EXP_LNKCTL, cap[i++]); - pci_write_config_word(dev, pos + PCI_EXP_SLTCTL, cap[i++]); - pci_write_config_word(dev, pos + PCI_EXP_RTCTL, cap[i++]); - 
pci_write_config_word(dev, pos + PCI_EXP_DEVCTL2, cap[i++]); - pci_write_config_word(dev, pos + PCI_EXP_LNKCTL2, cap[i++]); - pci_write_config_word(dev, pos + PCI_EXP_SLTCTL2, cap[i++]); + pci_read_config_word(dev, pos + PCI_EXP_FLAGS, &flags); + + if (pcie_cap_has_devctl(dev->pcie_type, flags)) + pci_write_config_word(dev, pos + PCI_EXP_DEVCTL, cap[i++]); + if (pcie_cap_has_lnkctl(dev->pcie_type, flags)) + pci_write_config_word(dev, pos + PCI_EXP_LNKCTL, cap[i++]); + if (pcie_cap_has_sltctl(dev->pcie_type, flags)) + pci_write_config_word(dev, pos + PCI_EXP_SLTCTL, cap[i++]); + if (pcie_cap_has_rtctl(dev->pcie_type, flags)) + pci_write_config_word(dev, pos + PCI_EXP_RTCTL, cap[i++]); + if (pcie_cap_has_devctl2(dev->pcie_type, flags)) + pci_write_config_word(dev, pos + PCI_EXP_DEVCTL2, cap[i++]); + if (pcie_cap_has_lnkctl2(dev->pcie_type, flags)) + pci_write_config_word(dev, pos + PCI_EXP_LNKCTL2, cap[i++]); + if (pcie_cap_has_sltctl2(dev->pcie_type, flags)) + pci_write_config_word(dev, pos + PCI_EXP_SLTCTL2, cap[i++]); } diff --git a/include/linux/pci_regs.h b/include/linux/pci_regs.h index e4d08c1b2e0b..616bf8b3c8b5 100644 --- a/include/linux/pci_regs.h +++ b/include/linux/pci_regs.h @@ -376,6 +376,7 @@ #define PCI_EXP_TYPE_DOWNSTREAM 0x6 /* Downstream Port */ #define PCI_EXP_TYPE_PCI_BRIDGE 0x7 /* PCI/PCI-X Bridge */ #define PCI_EXP_TYPE_RC_END 0x9 /* Root Complex Integrated Endpoint */ +#define PCI_EXP_TYPE_RC_EC 0x10 /* Root Complex Event Collector */ #define PCI_EXP_FLAGS_SLOT 0x0100 /* Slot implemented */ #define PCI_EXP_FLAGS_IRQ 0x3e00 /* Interrupt message number */ #define PCI_EXP_DEVCAP 4 /* Device capabilities */ -- cgit v1.2.3 From 952043ac12a117d8e94bddd9088338d7ad20ca7d Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Thu, 23 Apr 2009 08:48:15 +0100 Subject: bitops: Add __ffs64 bitop Finds the first set bit in a 64 bit word. This is required in order to fix a bug in GFS2, but I think it should be a generic function in case of future users. Signed-off-by: Steven Whitehouse Reviewed-by: Christoph Lameter Reviewed-by: Willy Tarreau --- include/linux/bitops.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 61829139795a..c05a29cb9bb2 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -112,6 +112,25 @@ static inline unsigned fls_long(unsigned long l) return fls64(l); } +/** + * __ffs64 - find first set bit in a 64 bit word + * @word: The 64 bit word + * + * On 64 bit arches this is a synomyn for __ffs + * The result is not defined if no bits are set, so check that @word + * is non-zero before calling this. + */ +static inline unsigned long __ffs64(u64 word) +{ +#if BITS_PER_LONG == 32 + if (((u32)word) == 0UL) + return __ffs((u32)(word >> 32)) + 32; +#elif BITS_PER_LONG != 64 +#error BITS_PER_LONG not 32 or 64 +#endif + return __ffs((unsigned long)word); +} + #ifdef __KERNEL__ #ifdef CONFIG_GENERIC_FIND_FIRST_BIT -- cgit v1.2.3 From fbfc396efbc11d784b4325adfc02e82a0df01a8d Mon Sep 17 00:00:00 2001 From: "Mark A. Greer" Date: Tue, 21 Apr 2009 20:52:54 -0700 Subject: USB: musb: Prevent multiple includes of musb.h Add #ifndef to musb header file to prevent multiple inclusions. Signed-off-by: Mark A. 
Greer Signed-off-by: David Brownell Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/musb.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/musb.h b/include/linux/usb/musb.h index d6aad0ea6033..d43755669261 100644 --- a/include/linux/usb/musb.h +++ b/include/linux/usb/musb.h @@ -7,6 +7,9 @@ * key configuration differences between boards. */ +#ifndef __LINUX_USB_MUSB_H +#define __LINUX_USB_MUSB_H + /* The USB role is defined by the connector used on the board, so long as * standards are being followed. (Developer boards sometimes won't.) */ @@ -101,3 +104,5 @@ extern int __init tusb6010_setup_interface( extern int tusb6010_platform_retime(unsigned is_refclk); #endif /* OMAP2 */ + +#endif /* __LINUX_USB_MUSB_H */ -- cgit v1.2.3 From 097102c2d04974bdfcfa16a5f3062d499842139c Mon Sep 17 00:00:00 2001 From: Alexander Beregalov Date: Tue, 21 Apr 2009 09:33:14 +0200 Subject: pktcdvd.h should include mempool.h Fix this build error: In file included from fs/compat_ioctl.c:104: include/linux/pktcdvd.h:285: error: expected specifier-qualifier-list before 'mempool_t' Signed-off-by: Alexander Beregalov Signed-off-by: Jens Axboe --- include/linux/pktcdvd.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pktcdvd.h b/include/linux/pktcdvd.h index 04b4d7330e6d..d745f5b6c7b0 100644 --- a/include/linux/pktcdvd.h +++ b/include/linux/pktcdvd.h @@ -113,6 +113,7 @@ struct pkt_ctrl_command { #include #include #include +#include /* default bio write queue congestion marks */ #define PKT_WRITE_CONGESTION_ON 10000 -- cgit v1.2.3 From 42dad7647aec49b3ad20dd0cb832b232a6ae514f Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Wed, 22 Apr 2009 14:01:49 +0200 Subject: block: simplify I/O stat accounting This simplifies I/O stat accounting switching code and separates it completely from I/O scheduler switch code. Requests are accounted according to the state of their request queue at the time of the request allocation. There is no need anymore to flush the request queue when switching I/O accounting state. 
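The idea reduces to snapshotting the queue's iostat setting into the request at allocation time and trusting that snapshot later; roughly, as a sketch of the two sides (account_io() is a placeholder for the actual stat update, not a real function):

        /* request allocation: record the queue state in the request flags */
        if (blk_queue_io_stat(q))
                rw_flags |= REQ_IO_STAT;

        /* completion/merge paths: test the request, not the live queue state */
        if (rq->rq_disk && (rq->cmd_flags & REQ_IO_STAT))
                account_io(rq);         /* placeholder for the stat update */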
Signed-off-by: Jerome Marchand Signed-off-by: Jens Axboe --- block/blk-core.c | 6 ++++-- block/blk-merge.c | 5 ++++- block/blk-sysfs.c | 4 ---- block/blk.h | 7 +------ include/linux/blkdev.h | 3 +++ 5 files changed, 12 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index 07ab75403e1a..2998fe3a2377 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -643,7 +643,7 @@ static inline void blk_free_request(struct request_queue *q, struct request *rq) } static struct request * -blk_alloc_request(struct request_queue *q, int rw, int priv, gfp_t gfp_mask) +blk_alloc_request(struct request_queue *q, int flags, int priv, gfp_t gfp_mask) { struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask); @@ -652,7 +652,7 @@ blk_alloc_request(struct request_queue *q, int rw, int priv, gfp_t gfp_mask) blk_rq_init(q, rq); - rq->cmd_flags = rw | REQ_ALLOCED; + rq->cmd_flags = flags | REQ_ALLOCED; if (priv) { if (unlikely(elv_set_request(q, rq, gfp_mask))) { @@ -792,6 +792,8 @@ static struct request *get_request(struct request_queue *q, int rw_flags, if (priv) rl->elvpriv++; + if (blk_queue_io_stat(q)) + rw_flags |= REQ_IO_STAT; spin_unlock_irq(q->queue_lock); rq = blk_alloc_request(q, rw_flags, priv, gfp_mask); diff --git a/block/blk-merge.c b/block/blk-merge.c index 63760ca3da0f..23d2a6fe34a3 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -402,7 +402,10 @@ static int attempt_merge(struct request_queue *q, struct request *req, elv_merge_requests(q, req, next); - blk_account_io_merge(req); + /* + * 'next' is going away, so update stats accordingly + */ + blk_account_io_merge(next); req->ioprio = ioprio_best(req->ioprio, next->ioprio); if (blk_rq_cpu_valid(next)) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index cac4e9febe6a..3ff9bba3379a 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -209,14 +209,10 @@ static ssize_t queue_iostats_store(struct request_queue *q, const char *page, ssize_t ret = queue_var_store(&stats, page, count); spin_lock_irq(q->queue_lock); - elv_quiesce_start(q); - if (stats) queue_flag_set(QUEUE_FLAG_IO_STAT, q); else queue_flag_clear(QUEUE_FLAG_IO_STAT, q); - - elv_quiesce_end(q); spin_unlock_irq(q->queue_lock); return ret; diff --git a/block/blk.h b/block/blk.h index 5dfc41267a08..79c85f7c9ff5 100644 --- a/block/blk.h +++ b/block/blk.h @@ -114,12 +114,7 @@ static inline int blk_cpu_to_group(int cpu) static inline int blk_do_io_stat(struct request *rq) { - struct gendisk *disk = rq->rq_disk; - - if (!disk || !disk->queue) - return 0; - - return blk_queue_io_stat(disk->queue) && (rq->cmd_flags & REQ_ELVPRIV); + return rq->rq_disk && blk_rq_io_stat(rq); } #endif diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ba54c834a590..2755d5c6da22 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -118,6 +118,7 @@ enum rq_flag_bits { __REQ_COPY_USER, /* contains copies of user pages */ __REQ_INTEGRITY, /* integrity metadata has been remapped */ __REQ_NOIDLE, /* Don't anticipate more IO after this one */ + __REQ_IO_STAT, /* account I/O stat */ __REQ_NR_BITS, /* stops here */ }; @@ -145,6 +146,7 @@ enum rq_flag_bits { #define REQ_COPY_USER (1 << __REQ_COPY_USER) #define REQ_INTEGRITY (1 << __REQ_INTEGRITY) #define REQ_NOIDLE (1 << __REQ_NOIDLE) +#define REQ_IO_STAT (1 << __REQ_IO_STAT) #define BLK_MAX_CDB 16 @@ -598,6 +600,7 @@ enum { blk_failfast_transport(rq) || \ blk_failfast_driver(rq)) #define blk_rq_started(rq) ((rq)->cmd_flags & REQ_STARTED) +#define blk_rq_io_stat(rq) 
((rq)->cmd_flags & REQ_IO_STAT) #define blk_account_rq(rq) (blk_rq_started(rq) && (blk_fs_request(rq) || blk_discard_rq(rq))) -- cgit v1.2.3 From 71951b64a5a87c09eb6fde59ce51aaab2fdaeab2 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 24 Apr 2009 16:58:41 +0200 Subject: netfilter: nf_ct_dccp: add missing role attributes for DCCP This patch adds missing role attribute to the DCCP type, otherwise the creation of entries is not of any use. The attribute added is CTA_PROTOINFO_DCCP_ROLE which contains the role of the conntrack original tuple. Signed-off-by: Pablo Neira Ayuso Signed-off-by: Patrick McHardy --- include/linux/netfilter/nfnetlink_conntrack.h | 1 + net/netfilter/nf_conntrack_proto_dccp.c | 15 ++++++++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/nfnetlink_conntrack.h b/include/linux/netfilter/nfnetlink_conntrack.h index 29fe9ea1d346..1a865e48b8eb 100644 --- a/include/linux/netfilter/nfnetlink_conntrack.h +++ b/include/linux/netfilter/nfnetlink_conntrack.h @@ -100,6 +100,7 @@ enum ctattr_protoinfo_tcp { enum ctattr_protoinfo_dccp { CTA_PROTOINFO_DCCP_UNSPEC, CTA_PROTOINFO_DCCP_STATE, + CTA_PROTOINFO_DCCP_ROLE, __CTA_PROTOINFO_DCCP_MAX, }; #define CTA_PROTOINFO_DCCP_MAX (__CTA_PROTOINFO_DCCP_MAX - 1) diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index 5411d63f31a5..8e757dd53396 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -633,6 +633,8 @@ static int dccp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, if (!nest_parms) goto nla_put_failure; NLA_PUT_U8(skb, CTA_PROTOINFO_DCCP_STATE, ct->proto.dccp.state); + NLA_PUT_U8(skb, CTA_PROTOINFO_DCCP_ROLE, + ct->proto.dccp.role[IP_CT_DIR_ORIGINAL]); nla_nest_end(skb, nest_parms); read_unlock_bh(&dccp_lock); return 0; @@ -644,6 +646,7 @@ nla_put_failure: static const struct nla_policy dccp_nla_policy[CTA_PROTOINFO_DCCP_MAX + 1] = { [CTA_PROTOINFO_DCCP_STATE] = { .type = NLA_U8 }, + [CTA_PROTOINFO_DCCP_ROLE] = { .type = NLA_U8 }, }; static int nlattr_to_dccp(struct nlattr *cda[], struct nf_conn *ct) @@ -661,11 +664,21 @@ static int nlattr_to_dccp(struct nlattr *cda[], struct nf_conn *ct) return err; if (!tb[CTA_PROTOINFO_DCCP_STATE] || - nla_get_u8(tb[CTA_PROTOINFO_DCCP_STATE]) >= CT_DCCP_IGNORE) + !tb[CTA_PROTOINFO_DCCP_ROLE] || + nla_get_u8(tb[CTA_PROTOINFO_DCCP_ROLE]) > CT_DCCP_ROLE_MAX || + nla_get_u8(tb[CTA_PROTOINFO_DCCP_STATE]) >= CT_DCCP_IGNORE) { return -EINVAL; + } write_lock_bh(&dccp_lock); ct->proto.dccp.state = nla_get_u8(tb[CTA_PROTOINFO_DCCP_STATE]); + if (nla_get_u8(tb[CTA_PROTOINFO_DCCP_ROLE]) == CT_DCCP_ROLE_CLIENT) { + ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_CLIENT; + ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_SERVER; + } else { + ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_SERVER; + ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_CLIENT; + } write_unlock_bh(&dccp_lock); return 0; } -- cgit v1.2.3 From c80d471a476b6d6fe0bc1fd25293c24c66b7aaaf Mon Sep 17 00:00:00 2001 From: Tim Abbott Date: Sat, 25 Apr 2009 22:10:56 -0400 Subject: Add new HEAD_TEXT_SECTION macro. This patch is preparation for replacing all uses of ".head.text" or ".text.head" in the kernel with macros, so that the section name can later be changed without having to touch a lot of the kernel. 
Since some linker scripts do more complex things than referencing HEAD_TEXT, we add a HEAD_TEXT_SECTION macro that just contains the actual name. I've defined HEAD_TEXT_SECTION in a new header, include/linux/section-names.h, so that this section name only needs to appear in one place. I anticipate creating similar macro structures for a number of other section names. The long-term goal here is to be able to change the kernel's magic section names to those that are compatible with -ffunction-sections -fdata-sections. This requires renaming all magic sections with names of the form ".text.foo". Signed-off-by: Tim Abbott Signed-off-by: Linus Torvalds --- include/asm-generic/vmlinux.lds.h | 4 +++- include/linux/init.h | 4 +++- include/linux/section-names.h | 6 ++++++ 3 files changed, 12 insertions(+), 2 deletions(-) create mode 100644 include/linux/section-names.h (limited to 'include/linux') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 7fa660fd449c..eaa06ef6f7d9 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -1,3 +1,5 @@ +#include + #ifndef LOAD_OFFSET #define LOAD_OFFSET 0 #endif @@ -331,7 +333,7 @@ #endif /* Section used for early init (in .S files) */ -#define HEAD_TEXT *(.head.text) +#define HEAD_TEXT *(HEAD_TEXT_SECTION) /* init and exit section handling */ #define INIT_DATA \ diff --git a/include/linux/init.h b/include/linux/init.h index f121a7a10c3d..20a1334e34e9 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -2,6 +2,8 @@ #define _LINUX_INIT_H #include +#include +#include /* These macros are used to mark some functions or * initialized data (doesn't apply to uninitialized data) @@ -107,7 +109,7 @@ #define __memexitconst __section(.memexit.rodata) /* For assembly routines */ -#define __HEAD .section ".head.text","ax" +#define __HEAD .section __stringify(HEAD_TEXT_SECTION),"ax" #define __INIT .section ".init.text","ax" #define __FINIT .previous diff --git a/include/linux/section-names.h b/include/linux/section-names.h new file mode 100644 index 000000000000..c956f4eb2adf --- /dev/null +++ b/include/linux/section-names.h @@ -0,0 +1,6 @@ +#ifndef __LINUX_SECTION_NAMES_H +#define __LINUX_SECTION_NAMES_H + +#define HEAD_TEXT_SECTION .head.text + +#endif /* !__LINUX_SECTION_NAMES_H */ -- cgit v1.2.3 From c759a6b4e1cae6aff71f58c9c85404ebcd81b6e0 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Mon, 27 Apr 2009 02:36:20 -0700 Subject: net: Fix LL_MAX_HEADER for CONFIG_TR_MODULE Unless I miss anything this should fix a bug. Signed-off-by: Adrian Bunk Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2e7783f4a755..453be9a674c0 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -104,7 +104,7 @@ struct wireless_dev; # else # define LL_MAX_HEADER 96 # endif -#elif defined(CONFIG_TR) +#elif defined(CONFIG_TR) || defined(CONFIG_TR_MODULE) # define LL_MAX_HEADER 48 #else # define LL_MAX_HEADER 32 -- cgit v1.2.3 From 37b607c5ac3b7c92a6a3624bb29f1cdcdcf7044a Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 27 Apr 2009 05:45:54 -0700 Subject: net: Fix typo in net_device_ops description. Signed-off-by: Mike Rapoport Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 453be9a674c0..5a96a1a406e9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -500,7 +500,7 @@ struct netdev_queue { * * int (*ndo_set_mac_address)(struct net_device *dev, void *addr); * This function is called when the Media Access Control address - * needs to be changed. If not this interface is not defined, the + * needs to be changed. If this interface is not defined, the * mac address can not be changed. * * int (*ndo_validate_addr)(struct net_device *dev); -- cgit v1.2.3 From 27b1833279995e7c290a40cac4ef36ccea7e9283 Mon Sep 17 00:00:00 2001 From: Tim Abbott Date: Mon, 27 Apr 2009 14:02:27 -0400 Subject: Remove unused support code for refok sections. The old refok sections .text.init.refok .data.init.refok .exit.text.refok have been deprecated since commit 312b1485fb509c9bc32eda28ad29537896658cb8. After the other patches in this patch series nothing is put in these sections, so clean things up by eliminating all the remaining references to them. Signed-off-by: Tim Abbott Acked-by: Sam Ravnborg Signed-off-by: Linus Torvalds --- include/asm-generic/vmlinux.lds.h | 3 --- include/linux/init.h | 8 -------- scripts/mod/modpost.c | 18 ------------------ 3 files changed, 29 deletions(-) (limited to 'include/linux') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index eaa06ef6f7d9..89853bcd27a6 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -90,7 +90,6 @@ /* .data section */ #define DATA_DATA \ *(.data) \ - *(.data.init.refok) \ *(.ref.data) \ DEV_KEEP(init.data) \ DEV_KEEP(exit.data) \ @@ -289,8 +288,6 @@ *(.text.hot) \ *(.text) \ *(.ref.text) \ - *(.text.init.refok) \ - *(.exit.text.refok) \ DEV_KEEP(init.text) \ DEV_KEEP(exit.text) \ CPU_KEEP(init.text) \ diff --git a/include/linux/init.h b/include/linux/init.h index 20a1334e34e9..0e06c176f185 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -62,14 +62,6 @@ #define __refdata __section(.ref.data) #define __refconst __section(.ref.rodata) -/* backward compatibility note - * A few places hardcode the old section names: - * .text.init.refok - * .data.init.refok - * .exit.text.refok - * They should be converted to use the defines from this file - */ - /* compatibility defines */ #define __init_refok __ref #define __initdata_refok __refdata diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index df6e6286a065..8d46ea7d6715 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -794,15 +794,6 @@ static const char *init_exit_sections[] = /* data section */ static const char *data_sections[] = { DATA_SECTIONS, NULL }; -/* sections that may refer to an init/exit section with no warning */ -static const char *initref_sections[] = -{ - ".text.init.refok*", - ".exit.text.refok*", - ".data.init.refok*", - NULL -}; - /* symbols in .data that may refer to init/exit sections */ static const char *symbol_white_list[] = @@ -915,11 +906,6 @@ static int section_mismatch(const char *fromsec, const char *tosec) /** * Whitelist to allow certain references to pass with no warning. * - * Pattern 0: - * Do not warn if funtion/data are marked with __init_refok/__initdata_refok. 
- * The pattern is identified by: - * fromsec = .text.init.refok* | .data.init.refok* - * * Pattern 1: * If a module parameter is declared __initdata and permissions=0 * then this is legal despite the warning generated. @@ -958,10 +944,6 @@ static int section_mismatch(const char *fromsec, const char *tosec) static int secref_whitelist(const char *fromsec, const char *fromsym, const char *tosec, const char *tosym) { - /* Check for pattern 0 */ - if (match(fromsec, initref_sections)) - return 0; - /* Check for pattern 1 */ if (match(tosec, init_data_sections) && match(fromsec, data_sections) && -- cgit v1.2.3 From bf368e4e70cd4e0f880923c44e95a4273d725ab4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 28 Apr 2009 02:24:21 -0700 Subject: net: Avoid extra wakeups of threads blocked in wait_for_packet() In 2.6.25 we added UDP mem accounting. This unfortunatly added a penalty when a frame is transmitted, since we have at TX completion time to call sock_wfree() to perform necessary memory accounting. This calls sock_def_write_space() and utimately scheduler if any thread is waiting on the socket. Thread(s) waiting for an incoming frame was scheduled, then had to sleep again as event was meaningless. (All threads waiting on a socket are using same sk_sleep anchor) This adds lot of extra wakeups and increases latencies, as noted by Christoph Lameter, and slows down softirq handler. Reference : http://marc.info/?l=linux-netdev&m=124060437012283&w=2 Fortunatly, Davide Libenzi recently added concept of keyed wakeups into kernel, and particularly for sockets (see commit 37e5540b3c9d838eb20f2ca8ea2eb8072271e403 epoll keyed wakeups: make sockets use keyed wakeups) Davide goal was to optimize epoll, but this new wakeup infrastructure can help non epoll users as well, if they care to setup an appropriate handler. This patch introduces new DEFINE_WAIT_FUNC() helper and uses it in wait_for_packet(), so that only relevant event can wakeup a thread blocked in this function. Trace of function calls from bnx2 TX completion bnx2_poll_work() is : __kfree_skb() skb_release_head_state() sock_wfree() sock_def_write_space() __wake_up_sync_key() __wake_up_common() receiver_wake_function() : Stops here since thread is waiting for an INPUT Reported-by: Christoph Lameter Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/linux/wait.h | 6 ++++-- net/core/datagram.c | 14 +++++++++++++- 2 files changed, 17 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/wait.h b/include/linux/wait.h index 5d631c17eaee..bc024632f365 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -440,13 +440,15 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key); int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key); -#define DEFINE_WAIT(name) \ +#define DEFINE_WAIT_FUNC(name, function) \ wait_queue_t name = { \ .private = current, \ - .func = autoremove_wake_function, \ + .func = function, \ .task_list = LIST_HEAD_INIT((name).task_list), \ } +#define DEFINE_WAIT(name) DEFINE_WAIT_FUNC(name, autoremove_wake_function) + #define DEFINE_WAIT_BIT(name, word, bit) \ struct wait_bit_queue name = { \ .key = __WAIT_BIT_KEY_INITIALIZER(word, bit), \ diff --git a/net/core/datagram.c b/net/core/datagram.c index d0de644b378d..b01a76abe1d2 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -64,13 +64,25 @@ static inline int connection_based(struct sock *sk) return sk->sk_type == SOCK_SEQPACKET || sk->sk_type == SOCK_STREAM; } +static int receiver_wake_function(wait_queue_t *wait, unsigned mode, int sync, + void *key) +{ + unsigned long bits = (unsigned long)key; + + /* + * Avoid a wakeup if event not interesting for us + */ + if (bits && !(bits & (POLLIN | POLLERR))) + return 0; + return autoremove_wake_function(wait, mode, sync, key); +} /* * Wait for a packet.. */ static int wait_for_packet(struct sock *sk, int *err, long *timeo_p) { int error; - DEFINE_WAIT(wait); + DEFINE_WAIT_FUNC(wait, receiver_wake_function); prepare_to_wait_exclusive(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); -- cgit v1.2.3 From 6916d97f6e25cc66a32d6e9a16419067d843b14f Mon Sep 17 00:00:00 2001 From: Henrik Rydberg Date: Mon, 27 Apr 2009 11:52:43 -0700 Subject: Input: bcm5974 - add quad-finger tapping The integrated button on the new unibody Macbooks presents a need to report explicit four-finger actions. Evidently, the finger pressing the button is also touching the trackpad, so in order to fully support three-finger actions, the driver must be able to report four-finger actions. This patch adds a new button, BTN_TOOL_QUADTAP, which achieves this. 
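In input-driver terms this is one more BTN_TOOL_* key to advertise and report; a short sketch of the pattern (input_dev and the fingers count are assumed to come from the driver):

        /* capability setup */
        __set_bit(BTN_TOOL_QUADTAP, input_dev->keybit);

        /* per-report finger count, now with four distinct states */
        input_report_key(input_dev, BTN_TOOL_FINGER,    fingers == 1);
        input_report_key(input_dev, BTN_TOOL_DOUBLETAP, fingers == 2);
        input_report_key(input_dev, BTN_TOOL_TRIPLETAP, fingers == 3);
        input_report_key(input_dev, BTN_TOOL_QUADTAP,   fingers > 3);
        input_sync(input_dev);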
Signed-off-by: Henrik Rydberg Signed-off-by: Dmitry Torokhov --- drivers/input/mouse/bcm5974.c | 4 +++- include/linux/input.h | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/input/mouse/bcm5974.c b/drivers/input/mouse/bcm5974.c index bda873393b0d..2ddf05e1d852 100644 --- a/drivers/input/mouse/bcm5974.c +++ b/drivers/input/mouse/bcm5974.c @@ -258,6 +258,7 @@ static void setup_events_to_report(struct input_dev *input_dev, __set_bit(BTN_TOOL_FINGER, input_dev->keybit); __set_bit(BTN_TOOL_DOUBLETAP, input_dev->keybit); __set_bit(BTN_TOOL_TRIPLETAP, input_dev->keybit); + __set_bit(BTN_TOOL_QUADTAP, input_dev->keybit); __set_bit(BTN_LEFT, input_dev->keybit); } @@ -329,7 +330,8 @@ static int report_tp_state(struct bcm5974 *dev, int size) input_report_key(input, BTN_TOUCH, dev->fingers > 0); input_report_key(input, BTN_TOOL_FINGER, dev->fingers == 1); input_report_key(input, BTN_TOOL_DOUBLETAP, dev->fingers == 2); - input_report_key(input, BTN_TOOL_TRIPLETAP, dev->fingers > 2); + input_report_key(input, BTN_TOOL_TRIPLETAP, dev->fingers == 3); + input_report_key(input, BTN_TOOL_QUADTAP, dev->fingers > 3); input_report_abs(input, ABS_PRESSURE, abs_p); input_report_abs(input, ABS_TOOL_WIDTH, abs_w); diff --git a/include/linux/input.h b/include/linux/input.h index 6b28048fc568..32cb825939be 100644 --- a/include/linux/input.h +++ b/include/linux/input.h @@ -445,6 +445,7 @@ struct input_absinfo { #define BTN_STYLUS2 0x14c #define BTN_TOOL_DOUBLETAP 0x14d #define BTN_TOOL_TRIPLETAP 0x14e +#define BTN_TOOL_QUADTAP 0x14f /* Four fingers on trackpad */ #define BTN_WHEEL 0x150 #define BTN_GEAR_DOWN 0x150 -- cgit v1.2.3 From 5e5ee686e3c0f8a3cbe9b75c2690326bf91af10d Mon Sep 17 00:00:00 2001 From: Henrik Rydberg Date: Tue, 28 Apr 2009 07:47:33 -0700 Subject: Input: add detailed multi-touch finger data report protocol In order to utilize the full power of the new multi-touch devices, a way to report detailed finger data to user space is needed. This patch adds a multi-touch (MT) protocol which allows drivers to report details for an arbitrary number of fingers. The driver sends a SYN_MT_REPORT event via the input_mt_sync() function when a complete finger has been reported. In order to stay compatible with existing applications, the data reported in a finger packet must not be recognized as single-touch events. In addition, all finger data must bypass input filtering, since subsequent events of the same type refer to different fingers. A set of ABS_MT events with the desired properties are defined. The events are divided into categories, to allow for partial implementation. The minimum set consists of ABS_MT_TOUCH_MAJOR, ABS_MT_POSITION_X and ABS_MT_POSITION_Y, which allows for multiple fingers to be tracked. If the device supports it, the ABS_MT_WIDTH_MAJOR may be used to provide the size of the approaching finger. Anisotropy and direction may be specified with ABS_MT_TOUCH_MINOR, ABS_MT_WIDTH_MINOR and ABS_MT_ORIENTATION. Devices with more granular information may specify general shapes as blobs, i.e., as a sequence of rectangular shapes grouped together by a ABS_MT_BLOB_ID. Finally, the ABS_MT_TOOL_TYPE may be used to specify whether the touching tool is a finger or a pen. 
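For illustration only (not part of the patch): a sketch of how a driver could report one frame containing several fingers using the minimum ABS_MT event set. The mt_finger structure, the coordinate values and the helper name are made up for the example; input_report_abs(), input_mt_sync() and input_sync() are the calls this patch provides or already exist.

#include <linux/input.h>

struct mt_finger {
	int x, y, touch_major;
};

/* Report one complete frame: one SYN_MT_REPORT per finger, then SYN_REPORT. */
static void example_report_mt_frame(struct input_dev *input,
				    const struct mt_finger *f, int count)
{
	int i;

	for (i = 0; i < count; i++) {
		input_report_abs(input, ABS_MT_TOUCH_MAJOR, f[i].touch_major);
		input_report_abs(input, ABS_MT_POSITION_X, f[i].x);
		input_report_abs(input, ABS_MT_POSITION_Y, f[i].y);
		input_mt_sync(input);	/* end of this finger's packet */
	}
	input_sync(input);		/* end of the multi-touch frame */
}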
Signed-off-by: Henrik Rydberg Signed-off-by: Dmitry Torokhov --- drivers/input/input.c | 13 +++++++++++++ include/linux/input.h | 23 +++++++++++++++++++++++ 2 files changed, 36 insertions(+) (limited to 'include/linux') diff --git a/drivers/input/input.c b/drivers/input/input.c index 8ff92aa13a0a..e54e002665b0 100644 --- a/drivers/input/input.c +++ b/drivers/input/input.c @@ -33,6 +33,15 @@ MODULE_LICENSE("GPL"); * EV_ABS events which should not be cached are listed here. */ static unsigned int input_abs_bypass_init_data[] __initdata = { + ABS_MT_TOUCH_MAJOR, + ABS_MT_TOUCH_MINOR, + ABS_MT_WIDTH_MAJOR, + ABS_MT_WIDTH_MINOR, + ABS_MT_ORIENTATION, + ABS_MT_POSITION_X, + ABS_MT_POSITION_Y, + ABS_MT_TOOL_TYPE, + ABS_MT_BLOB_ID, 0 }; static unsigned long input_abs_bypass[BITS_TO_LONGS(ABS_CNT)]; @@ -169,6 +178,10 @@ static void input_handle_event(struct input_dev *dev, disposition = INPUT_PASS_TO_HANDLERS; } break; + case SYN_MT_REPORT: + dev->sync = 0; + disposition = INPUT_PASS_TO_HANDLERS; + break; } break; diff --git a/include/linux/input.h b/include/linux/input.h index 32cb825939be..0e6ff5de3588 100644 --- a/include/linux/input.h +++ b/include/linux/input.h @@ -106,6 +106,7 @@ struct input_absinfo { #define SYN_REPORT 0 #define SYN_CONFIG 1 +#define SYN_MT_REPORT 2 /* * Keys and buttons @@ -645,6 +646,17 @@ struct input_absinfo { #define ABS_TOOL_WIDTH 0x1c #define ABS_VOLUME 0x20 #define ABS_MISC 0x28 + +#define ABS_MT_TOUCH_MAJOR 0x30 /* Major axis of touching ellipse */ +#define ABS_MT_TOUCH_MINOR 0x31 /* Minor axis (omit if circular) */ +#define ABS_MT_WIDTH_MAJOR 0x32 /* Major axis of approaching ellipse */ +#define ABS_MT_WIDTH_MINOR 0x33 /* Minor axis (omit if circular) */ +#define ABS_MT_ORIENTATION 0x34 /* Ellipse orientation */ +#define ABS_MT_POSITION_X 0x35 /* Center X ellipse position */ +#define ABS_MT_POSITION_Y 0x36 /* Center Y ellipse position */ +#define ABS_MT_TOOL_TYPE 0x37 /* Type of touching device */ +#define ABS_MT_BLOB_ID 0x38 /* Group a set of packets as a blob */ + #define ABS_MAX 0x3f #define ABS_CNT (ABS_MAX+1) @@ -743,6 +755,12 @@ struct input_absinfo { #define BUS_GSC 0x1A #define BUS_ATARI 0x1B +/* + * MT_TOOL types + */ +#define MT_TOOL_FINGER 0 +#define MT_TOOL_PEN 1 + /* * Values describing the status of a force-feedback effect */ @@ -1312,6 +1330,11 @@ static inline void input_sync(struct input_dev *dev) input_event(dev, EV_SYN, SYN_REPORT, 0); } +static inline void input_mt_sync(struct input_dev *dev) +{ + input_event(dev, EV_SYN, SYN_MT_REPORT, 0); +} + void input_set_capability(struct input_dev *dev, unsigned int type, unsigned int code); static inline void input_set_abs_params(struct input_dev *dev, int axis, int min, int max, int fuzz, int flat) -- cgit v1.2.3 From 9f6532519feab921856f41b30a2397ee25f4de49 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 3 Apr 2009 21:31:30 -0700 Subject: regulator: fix header file missing kernel-doc Add regulator header file missing kernel-doc: Warning(include/linux/regulator/driver.h:117): No description found for parameter 'set_mode' Signed-off-by: Randy Dunlap cc: Liam Girdwood cc: Mark Brown Signed-off-by: Liam Girdwood --- include/linux/regulator/driver.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index 4848d8dacd90..225f733e7533 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -50,6 +50,7 @@ enum regulator_status { * @set_current_limit: Configure a limit for a 
current-limited regulator. * @get_current_limit: Get the configured limit for a current-limited regulator. * + * @set_mode: Set the configured operating mode for the regulator. * @get_mode: Get the configured operating mode for the regulator. * @get_status: Return actual (not as-configured) status of regulator, as a * REGULATOR_STATUS value (or negative errno) -- cgit v1.2.3 From 942e4a2bd680c606af0211e64eb216be2e19bf61 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 28 Apr 2009 22:36:33 -0700 Subject: netfilter: revised locking for x_tables The x_tables are organized with a table structure and a per-cpu copies of the counters and rules. On older kernels there was a reader/writer lock per table which was a performance bottleneck. In 2.6.30-rc, this was converted to use RCU and the counters/rules which solved the performance problems for do_table but made replacing rules much slower because of the necessary RCU grace period. This version uses a per-cpu set of spinlocks and counters to allow to table processing to proceed without the cache thrashing of a global reader lock and keeps the same performance for table updates. Signed-off-by: Stephen Hemminger Acked-by: Linus Torvalds Signed-off-by: David S. Miller --- include/linux/netfilter/x_tables.h | 73 +++++++++++++++++++-- net/ipv4/netfilter/arp_tables.c | 125 +++++++++++------------------------- net/ipv4/netfilter/ip_tables.c | 126 +++++++++++-------------------------- net/ipv6/netfilter/ip6_tables.c | 123 +++++++++++------------------------- net/netfilter/x_tables.c | 53 ++++++++-------- 5 files changed, 204 insertions(+), 296 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 7b1a652066c0..1b2e43502ef7 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -354,9 +354,6 @@ struct xt_table /* What hooks you will enter on */ unsigned int valid_hooks; - /* Lock for the curtain */ - struct mutex lock; - /* Man behind the curtain... */ struct xt_table_info *private; @@ -434,8 +431,74 @@ extern void xt_proto_fini(struct net *net, u_int8_t af); extern struct xt_table_info *xt_alloc_table_info(unsigned int size); extern void xt_free_table_info(struct xt_table_info *info); -extern void xt_table_entry_swap_rcu(struct xt_table_info *old, - struct xt_table_info *new); + +/* + * Per-CPU spinlock associated with per-cpu table entries, and + * with a counter for the "reading" side that allows a recursive + * reader to avoid taking the lock and deadlocking. + * + * "reading" is used by ip/arp/ip6 tables rule processing which runs per-cpu. + * It needs to ensure that the rules are not being changed while the packet + * is being processed. In some cases, the read lock will be acquired + * twice on the same CPU; this is okay because of the count. + * + * "writing" is used when reading counters. + * During replace any readers that are using the old tables have to complete + * before freeing the old table. This is handled by the write locking + * necessary for reading the counters. + */ +struct xt_info_lock { + spinlock_t lock; + unsigned char readers; +}; +DECLARE_PER_CPU(struct xt_info_lock, xt_info_locks); + +/* + * Note: we need to ensure that preemption is disabled before acquiring + * the per-cpu-variable, so we do it as a two step process rather than + * using "spin_lock_bh()". 
+ * + * We _also_ need to disable bottom half processing before updating our + * nesting count, to make sure that the only kind of re-entrancy is this + * code being called by itself: since the count+lock is not an atomic + * operation, we can allow no races. + * + * _Only_ that special combination of being per-cpu and never getting + * re-entered asynchronously means that the count is safe. + */ +static inline void xt_info_rdlock_bh(void) +{ + struct xt_info_lock *lock; + + local_bh_disable(); + lock = &__get_cpu_var(xt_info_locks); + if (!lock->readers++) + spin_lock(&lock->lock); +} + +static inline void xt_info_rdunlock_bh(void) +{ + struct xt_info_lock *lock = &__get_cpu_var(xt_info_locks); + + if (!--lock->readers) + spin_unlock(&lock->lock); + local_bh_enable(); +} + +/* + * The "writer" side needs to get exclusive access to the lock, + * regardless of readers. This must be called with bottom half + * processing (and thus also preemption) disabled. + */ +static inline void xt_info_wrlock(unsigned int cpu) +{ + spin_lock(&per_cpu(xt_info_locks, cpu).lock); +} + +static inline void xt_info_wrunlock(unsigned int cpu) +{ + spin_unlock(&per_cpu(xt_info_locks, cpu).lock); +} /* * This helper is performance critical and must be inlined diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 5ba533d234db..831fe1879dc0 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -253,9 +253,9 @@ unsigned int arpt_do_table(struct sk_buff *skb, indev = in ? in->name : nulldevname; outdev = out ? out->name : nulldevname; - rcu_read_lock_bh(); - private = rcu_dereference(table->private); - table_base = rcu_dereference(private->entries[smp_processor_id()]); + xt_info_rdlock_bh(); + private = table->private; + table_base = private->entries[smp_processor_id()]; e = get_entry(table_base, private->hook_entry[hook]); back = get_entry(table_base, private->underflow[hook]); @@ -273,6 +273,7 @@ unsigned int arpt_do_table(struct sk_buff *skb, hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) + (2 * skb->dev->addr_len); + ADD_COUNTER(e->counters, hdr_len, 1); t = arpt_get_target(e); @@ -328,8 +329,7 @@ unsigned int arpt_do_table(struct sk_buff *skb, e = (void *)e + e->next_offset; } } while (!hotdrop); - - rcu_read_unlock_bh(); + xt_info_rdunlock_bh(); if (hotdrop) return NF_DROP; @@ -711,9 +711,12 @@ static void get_counters(const struct xt_table_info *t, /* Instead of clearing (by a previous call to memset()) * the counters and using adds, we set the counters * with data used by 'current' CPU - * We dont care about preemption here. + * + * Bottom half has to be disabled to prevent deadlock + * if new softirq were to run and call ipt_do_table */ - curcpu = raw_smp_processor_id(); + local_bh_disable(); + curcpu = smp_processor_id(); i = 0; ARPT_ENTRY_ITERATE(t->entries[curcpu], @@ -726,73 +729,22 @@ static void get_counters(const struct xt_table_info *t, if (cpu == curcpu) continue; i = 0; + xt_info_wrlock(cpu); ARPT_ENTRY_ITERATE(t->entries[cpu], t->size, add_entry_to_counter, counters, &i); + xt_info_wrunlock(cpu); } -} - - -/* We're lazy, and add to the first CPU; overflow works its fey magic - * and everything is OK. 
*/ -static int -add_counter_to_entry(struct arpt_entry *e, - const struct xt_counters addme[], - unsigned int *i) -{ - ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); - - (*i)++; - return 0; -} - -/* Take values from counters and add them back onto the current cpu */ -static void put_counters(struct xt_table_info *t, - const struct xt_counters counters[]) -{ - unsigned int i, cpu; - - local_bh_disable(); - cpu = smp_processor_id(); - i = 0; - ARPT_ENTRY_ITERATE(t->entries[cpu], - t->size, - add_counter_to_entry, - counters, - &i); local_bh_enable(); } -static inline int -zero_entry_counter(struct arpt_entry *e, void *arg) -{ - e->counters.bcnt = 0; - e->counters.pcnt = 0; - return 0; -} - -static void -clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info) -{ - unsigned int cpu; - const void *loc_cpu_entry = info->entries[raw_smp_processor_id()]; - - memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); - for_each_possible_cpu(cpu) { - memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size); - ARPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size, - zero_entry_counter, NULL); - } -} - static struct xt_counters *alloc_counters(struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; struct xt_table_info *private = table->private; - struct xt_table_info *info; /* We need atomic snapshot of counters: rest doesn't change * (other than comefrom, which userspace doesn't care @@ -802,30 +754,11 @@ static struct xt_counters *alloc_counters(struct xt_table *table) counters = vmalloc_node(countersize, numa_node_id()); if (counters == NULL) - goto nomem; - - info = xt_alloc_table_info(private->size); - if (!info) - goto free_counters; - - clone_counters(info, private); - - mutex_lock(&table->lock); - xt_table_entry_swap_rcu(private, info); - synchronize_net(); /* Wait until smoke has cleared */ + return ERR_PTR(-ENOMEM); - get_counters(info, counters); - put_counters(private, counters); - mutex_unlock(&table->lock); - - xt_free_table_info(info); + get_counters(private, counters); return counters; - - free_counters: - vfree(counters); - nomem: - return ERR_PTR(-ENOMEM); } static int copy_entries_to_user(unsigned int total_size, @@ -1094,8 +1027,9 @@ static int __do_replace(struct net *net, const char *name, (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); - /* Get the old counters. */ + /* Get the old counters, and synchronize with replace */ get_counters(oldinfo, counters); + /* Decrease module usage counts and free resource */ loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; ARPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry, @@ -1165,10 +1099,23 @@ static int do_replace(struct net *net, void __user *user, unsigned int len) return ret; } +/* We're lazy, and add to the first CPU; overflow works its fey magic + * and everything is OK. 
*/ +static int +add_counter_to_entry(struct arpt_entry *e, + const struct xt_counters addme[], + unsigned int *i) +{ + ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); + + (*i)++; + return 0; +} + static int do_add_counters(struct net *net, void __user *user, unsigned int len, int compat) { - unsigned int i; + unsigned int i, curcpu; struct xt_counters_info tmp; struct xt_counters *paddc; unsigned int num_counters; @@ -1224,26 +1171,26 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len, goto free; } - mutex_lock(&t->lock); + local_bh_disable(); private = t->private; if (private->number != num_counters) { ret = -EINVAL; goto unlock_up_free; } - preempt_disable(); i = 0; /* Choose the copy that is on our node */ - loc_cpu_entry = private->entries[smp_processor_id()]; + curcpu = smp_processor_id(); + loc_cpu_entry = private->entries[curcpu]; + xt_info_wrlock(curcpu); ARPT_ENTRY_ITERATE(loc_cpu_entry, private->size, add_counter_to_entry, paddc, &i); - preempt_enable(); + xt_info_wrunlock(curcpu); unlock_up_free: - mutex_unlock(&t->lock); - + local_bh_enable(); xt_table_unlock(t); module_put(t->me); free: diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 810c0b62c7d4..2ec8d7290c40 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -338,10 +338,9 @@ ipt_do_table(struct sk_buff *skb, tgpar.hooknum = hook; IP_NF_ASSERT(table->valid_hooks & (1 << hook)); - - rcu_read_lock_bh(); - private = rcu_dereference(table->private); - table_base = rcu_dereference(private->entries[smp_processor_id()]); + xt_info_rdlock_bh(); + private = table->private; + table_base = private->entries[smp_processor_id()]; e = get_entry(table_base, private->hook_entry[hook]); @@ -436,8 +435,7 @@ ipt_do_table(struct sk_buff *skb, e = (void *)e + e->next_offset; } } while (!hotdrop); - - rcu_read_unlock_bh(); + xt_info_rdunlock_bh(); #ifdef DEBUG_ALLOW_ALL return NF_ACCEPT; @@ -896,10 +894,13 @@ get_counters(const struct xt_table_info *t, /* Instead of clearing (by a previous call to memset()) * the counters and using adds, we set the counters - * with data used by 'current' CPU - * We dont care about preemption here. + * with data used by 'current' CPU. + * + * Bottom half has to be disabled to prevent deadlock + * if new softirq were to run and call ipt_do_table */ - curcpu = raw_smp_processor_id(); + local_bh_disable(); + curcpu = smp_processor_id(); i = 0; IPT_ENTRY_ITERATE(t->entries[curcpu], @@ -912,74 +913,22 @@ get_counters(const struct xt_table_info *t, if (cpu == curcpu) continue; i = 0; + xt_info_wrlock(cpu); IPT_ENTRY_ITERATE(t->entries[cpu], t->size, add_entry_to_counter, counters, &i); + xt_info_wrunlock(cpu); } - -} - -/* We're lazy, and add to the first CPU; overflow works its fey magic - * and everything is OK. 
*/ -static int -add_counter_to_entry(struct ipt_entry *e, - const struct xt_counters addme[], - unsigned int *i) -{ - ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); - - (*i)++; - return 0; -} - -/* Take values from counters and add them back onto the current cpu */ -static void put_counters(struct xt_table_info *t, - const struct xt_counters counters[]) -{ - unsigned int i, cpu; - - local_bh_disable(); - cpu = smp_processor_id(); - i = 0; - IPT_ENTRY_ITERATE(t->entries[cpu], - t->size, - add_counter_to_entry, - counters, - &i); local_bh_enable(); } - -static inline int -zero_entry_counter(struct ipt_entry *e, void *arg) -{ - e->counters.bcnt = 0; - e->counters.pcnt = 0; - return 0; -} - -static void -clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info) -{ - unsigned int cpu; - const void *loc_cpu_entry = info->entries[raw_smp_processor_id()]; - - memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); - for_each_possible_cpu(cpu) { - memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size); - IPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size, - zero_entry_counter, NULL); - } -} - static struct xt_counters * alloc_counters(struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; struct xt_table_info *private = table->private; - struct xt_table_info *info; /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care @@ -988,30 +937,11 @@ static struct xt_counters * alloc_counters(struct xt_table *table) counters = vmalloc_node(countersize, numa_node_id()); if (counters == NULL) - goto nomem; + return ERR_PTR(-ENOMEM); - info = xt_alloc_table_info(private->size); - if (!info) - goto free_counters; - - clone_counters(info, private); - - mutex_lock(&table->lock); - xt_table_entry_swap_rcu(private, info); - synchronize_net(); /* Wait until smoke has cleared */ - - get_counters(info, counters); - put_counters(private, counters); - mutex_unlock(&table->lock); - - xt_free_table_info(info); + get_counters(private, counters); return counters; - - free_counters: - vfree(counters); - nomem: - return ERR_PTR(-ENOMEM); } static int @@ -1306,8 +1236,9 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); - /* Get the old counters. */ + /* Get the old counters, and synchronize with replace */ get_counters(oldinfo, counters); + /* Decrease module usage counts and free resource */ loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry, @@ -1377,11 +1308,23 @@ do_replace(struct net *net, void __user *user, unsigned int len) return ret; } +/* We're lazy, and add to the first CPU; overflow works its fey magic + * and everything is OK. 
*/ +static int +add_counter_to_entry(struct ipt_entry *e, + const struct xt_counters addme[], + unsigned int *i) +{ + ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); + + (*i)++; + return 0; +} static int do_add_counters(struct net *net, void __user *user, unsigned int len, int compat) { - unsigned int i; + unsigned int i, curcpu; struct xt_counters_info tmp; struct xt_counters *paddc; unsigned int num_counters; @@ -1437,25 +1380,26 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat goto free; } - mutex_lock(&t->lock); + local_bh_disable(); private = t->private; if (private->number != num_counters) { ret = -EINVAL; goto unlock_up_free; } - preempt_disable(); i = 0; /* Choose the copy that is on our node */ - loc_cpu_entry = private->entries[raw_smp_processor_id()]; + curcpu = smp_processor_id(); + loc_cpu_entry = private->entries[curcpu]; + xt_info_wrlock(curcpu); IPT_ENTRY_ITERATE(loc_cpu_entry, private->size, add_counter_to_entry, paddc, &i); - preempt_enable(); + xt_info_wrunlock(curcpu); unlock_up_free: - mutex_unlock(&t->lock); + local_bh_enable(); xt_table_unlock(t); module_put(t->me); free: diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 800ae8542471..219e165aea10 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -365,9 +365,9 @@ ip6t_do_table(struct sk_buff *skb, IP_NF_ASSERT(table->valid_hooks & (1 << hook)); - rcu_read_lock_bh(); - private = rcu_dereference(table->private); - table_base = rcu_dereference(private->entries[smp_processor_id()]); + xt_info_rdlock_bh(); + private = table->private; + table_base = private->entries[smp_processor_id()]; e = get_entry(table_base, private->hook_entry[hook]); @@ -466,7 +466,7 @@ ip6t_do_table(struct sk_buff *skb, #ifdef CONFIG_NETFILTER_DEBUG ((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON; #endif - rcu_read_unlock_bh(); + xt_info_rdunlock_bh(); #ifdef DEBUG_ALLOW_ALL return NF_ACCEPT; @@ -926,9 +926,12 @@ get_counters(const struct xt_table_info *t, /* Instead of clearing (by a previous call to memset()) * the counters and using adds, we set the counters * with data used by 'current' CPU - * We dont care about preemption here. + * + * Bottom half has to be disabled to prevent deadlock + * if new softirq were to run and call ipt_do_table */ - curcpu = raw_smp_processor_id(); + local_bh_disable(); + curcpu = smp_processor_id(); i = 0; IP6T_ENTRY_ITERATE(t->entries[curcpu], @@ -941,72 +944,22 @@ get_counters(const struct xt_table_info *t, if (cpu == curcpu) continue; i = 0; + xt_info_wrlock(cpu); IP6T_ENTRY_ITERATE(t->entries[cpu], t->size, add_entry_to_counter, counters, &i); + xt_info_wrunlock(cpu); } -} - -/* We're lazy, and add to the first CPU; overflow works its fey magic - * and everything is OK. 
*/ -static int -add_counter_to_entry(struct ip6t_entry *e, - const struct xt_counters addme[], - unsigned int *i) -{ - ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); - - (*i)++; - return 0; -} - -/* Take values from counters and add them back onto the current cpu */ -static void put_counters(struct xt_table_info *t, - const struct xt_counters counters[]) -{ - unsigned int i, cpu; - - local_bh_disable(); - cpu = smp_processor_id(); - i = 0; - IP6T_ENTRY_ITERATE(t->entries[cpu], - t->size, - add_counter_to_entry, - counters, - &i); local_bh_enable(); } -static inline int -zero_entry_counter(struct ip6t_entry *e, void *arg) -{ - e->counters.bcnt = 0; - e->counters.pcnt = 0; - return 0; -} - -static void -clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info) -{ - unsigned int cpu; - const void *loc_cpu_entry = info->entries[raw_smp_processor_id()]; - - memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); - for_each_possible_cpu(cpu) { - memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size); - IP6T_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size, - zero_entry_counter, NULL); - } -} - static struct xt_counters *alloc_counters(struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; struct xt_table_info *private = table->private; - struct xt_table_info *info; /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care @@ -1015,30 +968,11 @@ static struct xt_counters *alloc_counters(struct xt_table *table) counters = vmalloc_node(countersize, numa_node_id()); if (counters == NULL) - goto nomem; + return ERR_PTR(-ENOMEM); - info = xt_alloc_table_info(private->size); - if (!info) - goto free_counters; - - clone_counters(info, private); - - mutex_lock(&table->lock); - xt_table_entry_swap_rcu(private, info); - synchronize_net(); /* Wait until smoke has cleared */ - - get_counters(info, counters); - put_counters(private, counters); - mutex_unlock(&table->lock); - - xt_free_table_info(info); + get_counters(private, counters); return counters; - - free_counters: - vfree(counters); - nomem: - return ERR_PTR(-ENOMEM); } static int @@ -1334,8 +1268,9 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); - /* Get the old counters. */ + /* Get the old counters, and synchronize with replace */ get_counters(oldinfo, counters); + /* Decrease module usage counts and free resource */ loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; IP6T_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry, @@ -1405,11 +1340,24 @@ do_replace(struct net *net, void __user *user, unsigned int len) return ret; } +/* We're lazy, and add to the first CPU; overflow works its fey magic + * and everything is OK. 
*/ +static int +add_counter_to_entry(struct ip6t_entry *e, + const struct xt_counters addme[], + unsigned int *i) +{ + ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); + + (*i)++; + return 0; +} + static int do_add_counters(struct net *net, void __user *user, unsigned int len, int compat) { - unsigned int i; + unsigned int i, curcpu; struct xt_counters_info tmp; struct xt_counters *paddc; unsigned int num_counters; @@ -1465,25 +1413,28 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, goto free; } - mutex_lock(&t->lock); + + local_bh_disable(); private = t->private; if (private->number != num_counters) { ret = -EINVAL; goto unlock_up_free; } - preempt_disable(); i = 0; /* Choose the copy that is on our node */ - loc_cpu_entry = private->entries[raw_smp_processor_id()]; + curcpu = smp_processor_id(); + xt_info_wrlock(curcpu); + loc_cpu_entry = private->entries[curcpu]; IP6T_ENTRY_ITERATE(loc_cpu_entry, private->size, add_counter_to_entry, paddc, &i); - preempt_enable(); + xt_info_wrunlock(curcpu); + unlock_up_free: - mutex_unlock(&t->lock); + local_bh_enable(); xt_table_unlock(t); module_put(t->me); free: diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 509a95621f9f..150e5cf62f85 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -625,20 +625,6 @@ void xt_free_table_info(struct xt_table_info *info) } EXPORT_SYMBOL(xt_free_table_info); -void xt_table_entry_swap_rcu(struct xt_table_info *oldinfo, - struct xt_table_info *newinfo) -{ - unsigned int cpu; - - for_each_possible_cpu(cpu) { - void *p = oldinfo->entries[cpu]; - rcu_assign_pointer(oldinfo->entries[cpu], newinfo->entries[cpu]); - newinfo->entries[cpu] = p; - } - -} -EXPORT_SYMBOL_GPL(xt_table_entry_swap_rcu); - /* Find table by name, grabs mutex & ref. Returns ERR_PTR() on error. */ struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af, const char *name) @@ -676,32 +662,43 @@ void xt_compat_unlock(u_int8_t af) EXPORT_SYMBOL_GPL(xt_compat_unlock); #endif +DEFINE_PER_CPU(struct xt_info_lock, xt_info_locks); +EXPORT_PER_CPU_SYMBOL_GPL(xt_info_locks); + + struct xt_table_info * xt_replace_table(struct xt_table *table, unsigned int num_counters, struct xt_table_info *newinfo, int *error) { - struct xt_table_info *oldinfo, *private; + struct xt_table_info *private; /* Do the substitution. */ - mutex_lock(&table->lock); + local_bh_disable(); private = table->private; + /* Check inside lock: is the old number correct? */ if (num_counters != private->number) { duprintf("num_counters != table->private->number (%u/%u)\n", num_counters, private->number); - mutex_unlock(&table->lock); + local_bh_enable(); *error = -EAGAIN; return NULL; } - oldinfo = private; - rcu_assign_pointer(table->private, newinfo); - newinfo->initial_entries = oldinfo->initial_entries; - mutex_unlock(&table->lock); - synchronize_net(); - return oldinfo; + table->private = newinfo; + newinfo->initial_entries = private->initial_entries; + + /* + * Even though table entries have now been swapped, other CPU's + * may still be using the old entries. This is okay, because + * resynchronization happens because of the locking done + * during the get_counters() routine. + */ + local_bh_enable(); + + return private; } EXPORT_SYMBOL_GPL(xt_replace_table); @@ -734,7 +731,6 @@ struct xt_table *xt_register_table(struct net *net, struct xt_table *table, /* Simplifies replace_table code. 
*/ table->private = bootstrap; - mutex_init(&table->lock); if (!xt_replace_table(table, 0, newinfo, &ret)) goto unlock; @@ -1147,7 +1143,14 @@ static struct pernet_operations xt_net_ops = { static int __init xt_init(void) { - int i, rv; + unsigned int i; + int rv; + + for_each_possible_cpu(i) { + struct xt_info_lock *lock = &per_cpu(xt_info_locks, i); + spin_lock_init(&lock->lock); + lock->readers = 0; + } xt = kmalloc(sizeof(struct xt_af) * NFPROTO_NUMPROTO, GFP_KERNEL); if (!xt) -- cgit v1.2.3 From d37dc42ab6f040b8f0f2962ab219c5b2accf748d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 30 Apr 2009 06:45:08 -0400 Subject: nls: add a nls_nullsize inline It's possible for character sets to require a multi-byte null string terminator. Add a helper function that determines the size of the null terminator at runtime. Signed-off-by: Jeff Layton Acked-by: Suresh Jayaraman Signed-off-by: Steve French --- include/linux/nls.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nls.h b/include/linux/nls.h index 6a882208301a..52b1a76c1b43 100644 --- a/include/linux/nls.h +++ b/include/linux/nls.h @@ -58,6 +58,25 @@ static inline int nls_strnicmp(struct nls_table *t, const unsigned char *s1, return 0; } +/* + * nls_nullsize - return length of null character for codepage + * @codepage - codepage for which to return length of NULL terminator + * + * Since we can't guarantee that the null terminator will be a particular + * length, we have to check against the codepage. If there's a problem + * determining it, assume a single-byte NULL terminator. + */ +static inline int +nls_nullsize(const struct nls_table *codepage) +{ + int charlen; + char tmp[NLS_MAX_CHARSET_SIZE]; + + charlen = codepage->uni2char(0, tmp, NLS_MAX_CHARSET_SIZE); + + return charlen > 0 ? charlen : 1; +} + #define MODULE_ALIAS_NLS(name) MODULE_ALIAS("nls_" __stringify(name)) #endif /* _LINUX_NLS_H */ -- cgit v1.2.3 From 96c16743973e8c1a7b9c655d10b7973408d6d1dd Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 30 Apr 2009 18:24:34 +0200 Subject: ide-cd: fix REQ_QUIET tests in cdrom_decode_status Original patch (dfa4411cc3a690011cab90e9a536938795366cf9) was buggy. This is a more proper fix which introduces blk_rq_quiet() macro alleviating the need for dumb, too short caching variables. Thanks to Helge Deller and Bart for debugging this. Signed-off-by: Borislav Petkov Cc: Jens Axboe Cc: Sergei Shtylyov Reported-and-tested-by: Helge Deller Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-cd.c | 9 ++++----- include/linux/blkdev.h | 1 + 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 3d4e09969763..925eb9e245d1 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -312,7 +312,6 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat) ide_hwif_t *hwif = drive->hwif; struct request *rq = hwif->rq; int err, sense_key, do_end_request = 0; - u8 quiet = rq->cmd_flags & REQ_QUIET; /* get the IDE error register */ err = ide_read_error(drive); @@ -347,7 +346,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat) } else { cdrom_saw_media_change(drive); - if (blk_fs_request(rq) && !quiet) + if (blk_fs_request(rq) && !blk_rq_quiet(rq)) printk(KERN_ERR PFX "%s: tray open\n", drive->name); } @@ -382,7 +381,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat) * No point in retrying after an illegal request or data * protect error. 
*/ - if (!quiet) + if (!blk_rq_quiet(rq)) ide_dump_status(drive, "command error", stat); do_end_request = 1; break; @@ -391,14 +390,14 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat) * No point in re-trying a zillion times on a bad sector. * If we got here the error is not correctable. */ - if (!quiet) + if (!blk_rq_quiet(rq)) ide_dump_status(drive, "media error " "(bad sector)", stat); do_end_request = 1; break; case BLANK_CHECK: /* disk appears blank? */ - if (!quiet) + if (!blk_rq_quiet(rq)) ide_dump_status(drive, "media error (blank)", stat); do_end_request = 1; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ba54c834a590..6f841fb1be30 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -598,6 +598,7 @@ enum { blk_failfast_transport(rq) || \ blk_failfast_driver(rq)) #define blk_rq_started(rq) ((rq)->cmd_flags & REQ_STARTED) +#define blk_rq_quiet(rq) ((rq)->cmd_flags & REQ_QUIET) #define blk_account_rq(rq) (blk_rq_started(rq) && (blk_fs_request(rq) || blk_discard_rq(rq))) -- cgit v1.2.3 From 0f3d042ed2f934f149ccb78300454beaf0c1134b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 1 May 2009 09:10:46 -0700 Subject: netfilter: use likely() in xt_info_rdlock_bh() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netfilter/x_tables.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 1b2e43502ef7..c9efe039dc57 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -472,7 +472,7 @@ static inline void xt_info_rdlock_bh(void) local_bh_disable(); lock = &__get_cpu_var(xt_info_locks); - if (!lock->readers++) + if (likely(!lock->readers++)) spin_lock(&lock->lock); } @@ -480,7 +480,7 @@ static inline void xt_info_rdunlock_bh(void) { struct xt_info_lock *lock = &__get_cpu_var(xt_info_locks); - if (!--lock->readers) + if (likely(!--lock->readers)) spin_unlock(&lock->lock); local_bh_enable(); } -- cgit v1.2.3 From c047fcd245975f40312ed57bf43e7d4abd188e6b Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Fri, 1 May 2009 15:34:02 -0700 Subject: virtio: add missing include to virtio_net.h virtio_net.h uses the macro ETH_ALEN which is defined in linux/if_ether.h. Discovered when hacking on virtio-over-pci patches. Signed-off-by: Grant Likely Signed-off-by: David S. Miller --- include/linux/virtio_net.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 242348bb3766..cec79adbe3ea 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -4,6 +4,7 @@ * compatible drivers/servers. */ #include #include +#include /* The ID for virtio_net */ #define VIRTIO_ID_NET 1 -- cgit v1.2.3 From ae3abae64f177586be55b04a7fb7047a34b21a3e Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Thu, 30 Apr 2009 15:08:19 -0700 Subject: memcg: fix mem_cgroup_shrink_usage() Current mem_cgroup_shrink_usage() has two problems. 1. It doesn't call mem_cgroup_out_of_memory and doesn't update last_oom_jiffies, so pagefault_out_of_memory invokes global OOM. 2. Considering hierarchy, shrinking has to be done from the mem_over_limit, not from the memcg which the page would be charged to. mem_cgroup_try_charge_swapin() does all of these things properly, so we use it and call cancel_charge_swapin when it succeeded. 
The name of "shrink_usage" is not appropriate for this behavior, so we change it too. Signed-off-by: Daisuke Nishimura Acked-by: KAMEZAWA Hiroyuki Cc: Li Zefan Cc: Paul Menage Cc: Dhaval Giani Cc: Daisuke Nishimura Cc: YAMAMOTO Takashi Cc: KOSAKI Motohiro Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 4 ++-- mm/memcontrol.c | 33 ++++++++++++--------------------- mm/shmem.c | 8 ++++++-- 3 files changed, 20 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index a9e3b76aa884..25b9ca93d232 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -56,7 +56,7 @@ extern void mem_cgroup_move_lists(struct page *page, enum lru_list from, enum lru_list to); extern void mem_cgroup_uncharge_page(struct page *page); extern void mem_cgroup_uncharge_cache_page(struct page *page); -extern int mem_cgroup_shrink_usage(struct page *page, +extern int mem_cgroup_shmem_charge_fallback(struct page *page, struct mm_struct *mm, gfp_t gfp_mask); extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, @@ -155,7 +155,7 @@ static inline void mem_cgroup_uncharge_cache_page(struct page *page) { } -static inline int mem_cgroup_shrink_usage(struct page *page, +static inline int mem_cgroup_shmem_charge_fallback(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { return 0; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 575203ae2109..01c2d8f14685 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1617,37 +1617,28 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem, } /* - * A call to try to shrink memory usage under specified resource controller. - * This is typically used for page reclaiming for shmem for reducing side - * effect of page allocation from shmem, which is used by some mem_cgroup. + * A call to try to shrink memory usage on charge failure at shmem's swapin. + * Calling hierarchical_reclaim is not enough because we should update + * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM. + * Moreover considering hierarchy, we should reclaim from the mem_over_limit, + * not from the memcg which this page would be charged to. + * try_charge_swapin does all of these works properly. 
*/ -int mem_cgroup_shrink_usage(struct page *page, +int mem_cgroup_shmem_charge_fallback(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { struct mem_cgroup *mem = NULL; - int progress = 0; - int retry = MEM_CGROUP_RECLAIM_RETRIES; + int ret; if (mem_cgroup_disabled()) return 0; - if (page) - mem = try_get_mem_cgroup_from_swapcache(page); - if (!mem && mm) - mem = try_get_mem_cgroup_from_mm(mm); - if (unlikely(!mem)) - return 0; - do { - progress = mem_cgroup_hierarchical_reclaim(mem, - gfp_mask, true, false); - progress += mem_cgroup_check_under_limit(mem); - } while (!progress && --retry); + ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); + if (!ret) + mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */ - css_put(&mem->css); - if (!retry) - return -ENOMEM; - return 0; + return ret; } static DEFINE_MUTEX(set_limit_mutex); diff --git a/mm/shmem.c b/mm/shmem.c index f9cb20ebb990..b25f95ce3db7 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1340,8 +1340,12 @@ repeat: shmem_swp_unmap(entry); spin_unlock(&info->lock); if (error == -ENOMEM) { - /* allow reclaim from this memory cgroup */ - error = mem_cgroup_shrink_usage(swappage, + /* + * reclaim from proper memory cgroup and + * call memcg's OOM if needed. + */ + error = mem_cgroup_shmem_charge_fallback( + swappage, current->mm, gfp); if (error) { -- cgit v1.2.3 From 74641f584da8eccf30becfbb5507ab457187db22 Mon Sep 17 00:00:00 2001 From: Ivan Kokshaysky Date: Thu, 30 Apr 2009 15:08:49 -0700 Subject: alpha: binfmt_aout fix This fixes the problem introduced by commit 3bfacef412 (get rid of special-casing the /sbin/loader on alpha): osf/1 ecoff binary segfaults when binfmt_aout built as module. That happens because aout binary handler gets on the top of the binfmt list due to late registration, and kernel attempts to execute the binary without preparatory work that must be done by binfmt_loader. Fixed by changing the registration order of the default binfmt handlers using list_add_tail() and introducing insert_binfmt() function which places new handler on the top of the binfmt list. This might be generally useful for installing arch-specific frontends for default handlers or just for overriding them. 
Signed-off-by: Ivan Kokshaysky Cc: Al Viro Cc: Richard Henderson Signed-off-by: Linus Torvalds --- arch/alpha/kernel/Makefile | 6 +++++- arch/alpha/kernel/binfmt_loader.c | 2 +- fs/exec.c | 7 ++++--- include/linux/binfmts.h | 14 +++++++++++++- 4 files changed, 23 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/arch/alpha/kernel/Makefile b/arch/alpha/kernel/Makefile index a427538252f8..7739a62440a7 100644 --- a/arch/alpha/kernel/Makefile +++ b/arch/alpha/kernel/Makefile @@ -8,7 +8,7 @@ EXTRA_CFLAGS := -Werror -Wno-sign-compare obj-y := entry.o traps.o process.o init_task.o osf_sys.o irq.o \ irq_alpha.o signal.o setup.o ptrace.o time.o \ - alpha_ksyms.o systbls.o err_common.o io.o binfmt_loader.o + alpha_ksyms.o systbls.o err_common.o io.o obj-$(CONFIG_VGA_HOSE) += console.o obj-$(CONFIG_SMP) += smp.o @@ -43,6 +43,10 @@ else # Misc support obj-$(CONFIG_ALPHA_SRM) += srmcons.o +ifdef CONFIG_BINFMT_AOUT +obj-y += binfmt_loader.o +endif + # Core logic support obj-$(CONFIG_ALPHA_APECS) += core_apecs.o obj-$(CONFIG_ALPHA_CIA) += core_cia.o diff --git a/arch/alpha/kernel/binfmt_loader.c b/arch/alpha/kernel/binfmt_loader.c index 4a0af906b00a..3fcfad410130 100644 --- a/arch/alpha/kernel/binfmt_loader.c +++ b/arch/alpha/kernel/binfmt_loader.c @@ -46,6 +46,6 @@ static struct linux_binfmt loader_format = { static int __init init_loader_binfmt(void) { - return register_binfmt(&loader_format); + return insert_binfmt(&loader_format); } arch_initcall(init_loader_binfmt); diff --git a/fs/exec.c b/fs/exec.c index a3a8ce83940f..639177b0eeac 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -69,17 +69,18 @@ int suid_dumpable = 0; static LIST_HEAD(formats); static DEFINE_RWLOCK(binfmt_lock); -int register_binfmt(struct linux_binfmt * fmt) +int __register_binfmt(struct linux_binfmt * fmt, int insert) { if (!fmt) return -EINVAL; write_lock(&binfmt_lock); - list_add(&fmt->lh, &formats); + insert ? list_add(&fmt->lh, &formats) : + list_add_tail(&fmt->lh, &formats); write_unlock(&binfmt_lock); return 0; } -EXPORT_SYMBOL(register_binfmt); +EXPORT_SYMBOL(__register_binfmt); void unregister_binfmt(struct linux_binfmt * fmt) { diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 6638b8148de7..61ee18c1bdb4 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -82,7 +82,19 @@ struct linux_binfmt { int hasvdso; }; -extern int register_binfmt(struct linux_binfmt *); +extern int __register_binfmt(struct linux_binfmt *fmt, int insert); + +/* Registration of default binfmt handlers */ +static inline int register_binfmt(struct linux_binfmt *fmt) +{ + return __register_binfmt(fmt, 0); +} +/* Same as above, but adds a new binfmt at the top of the list */ +static inline int insert_binfmt(struct linux_binfmt *fmt) +{ + return __register_binfmt(fmt, 1); +} + extern void unregister_binfmt(struct linux_binfmt *); extern int prepare_binprm(struct linux_binprm *); -- cgit v1.2.3 From 0763ed2355198cdef2f6a2098e9d52eb1fe4365d Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Thu, 30 Apr 2009 15:08:50 -0700 Subject: of: make of_(un)register_platform_driver common code Some drivers using of_register_platform_driver() wrapper break on sparc because the wrapper isn't in the header file. This patch moves it from Microblaze and PowerPC implementations and makes it common code. 
Fixes this sparc64 allmodconfig build error (at least): drivers/leds/leds-gpio.c: In function `gpio_led_init': drivers/leds/leds-gpio.c:295: error: implicit declaration of function `of_register_platform_driver' drivers/leds/leds-gpio.c: In function `gpio_led_exit': drivers/leds/leds-gpio.c:311: error: implicit declaration of function `of_unregister_platform_driver' Signed-off-by: Grant Likely Acked-by: David S. Miller Cc: Michal Simek Cc: Benjamin Herrenschmidt Cc: Stephen Rothwell Cc: Richard Purdie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/microblaze/include/asm/of_platform.h | 10 ---------- arch/powerpc/include/asm/of_platform.h | 10 ---------- include/linux/of_platform.h | 10 ++++++++++ 3 files changed, 10 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/arch/microblaze/include/asm/of_platform.h b/arch/microblaze/include/asm/of_platform.h index 187c0eedaece..37491276c6ca 100644 --- a/arch/microblaze/include/asm/of_platform.h +++ b/arch/microblaze/include/asm/of_platform.h @@ -36,16 +36,6 @@ static const struct of_device_id of_default_bus_ids[] = { {}, }; -/* Platform drivers register/unregister */ -static inline int of_register_platform_driver(struct of_platform_driver *drv) -{ - return of_register_driver(drv, &of_platform_bus_type); -} -static inline void of_unregister_platform_driver(struct of_platform_driver *drv) -{ - of_unregister_driver(drv); -} - /* Platform devices and busses creation */ extern struct of_device *of_platform_device_create(struct device_node *np, const char *bus_id, diff --git a/arch/powerpc/include/asm/of_platform.h b/arch/powerpc/include/asm/of_platform.h index 53b46507ffde..d4aaa3489440 100644 --- a/arch/powerpc/include/asm/of_platform.h +++ b/arch/powerpc/include/asm/of_platform.h @@ -11,16 +11,6 @@ * */ -/* Platform drivers register/unregister */ -static inline int of_register_platform_driver(struct of_platform_driver *drv) -{ - return of_register_driver(drv, &of_platform_bus_type); -} -static inline void of_unregister_platform_driver(struct of_platform_driver *drv) -{ - of_unregister_driver(drv); -} - /* Platform devices and busses creation */ extern struct of_device *of_platform_device_create(struct device_node *np, const char *bus_id, diff --git a/include/linux/of_platform.h b/include/linux/of_platform.h index 3d327b67d7e2..908406651330 100644 --- a/include/linux/of_platform.h +++ b/include/linux/of_platform.h @@ -51,6 +51,16 @@ extern int of_register_driver(struct of_platform_driver *drv, struct bus_type *bus); extern void of_unregister_driver(struct of_platform_driver *drv); +/* Platform drivers register/unregister */ +static inline int of_register_platform_driver(struct of_platform_driver *drv) +{ + return of_register_driver(drv, &of_platform_bus_type); +} +static inline void of_unregister_platform_driver(struct of_platform_driver *drv) +{ + of_unregister_driver(drv); +} + #include extern struct of_device *of_find_device_by_node(struct device_node *np); -- cgit v1.2.3 From 00a62ce91e554198ef28234c91c36f850f5a3bc9 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Thu, 30 Apr 2009 15:08:51 -0700 Subject: mm: fix Committed_AS underflow on large NR_CPUS environment The Committed_AS field can underflow in certain situations: > # while true; do cat /proc/meminfo | grep _AS; sleep 1; done | uniq -c > 1 Committed_AS: 18446744073709323392 kB > 11 Committed_AS: 18446744073709455488 kB > 6 Committed_AS: 35136 kB > 5 Committed_AS: 18446744073709454400 kB > 7 Committed_AS: 35904 kB > 3 Committed_AS: 
18446744073709453248 kB > 2 Committed_AS: 34752 kB > 9 Committed_AS: 18446744073709453248 kB > 8 Committed_AS: 34752 kB > 3 Committed_AS: 18446744073709320960 kB > 7 Committed_AS: 18446744073709454080 kB > 3 Committed_AS: 18446744073709320960 kB > 5 Committed_AS: 18446744073709454080 kB > 6 Committed_AS: 18446744073709320960 kB Because NR_CPUS can be greater than 1000 and meminfo_proc_show() does not check for underflow. But NR_CPUS proportional isn't good calculation. In general, possibility of lock contention is proportional to the number of online cpus, not theorical maximum cpus (NR_CPUS). The current kernel has generic percpu-counter stuff. using it is right way. it makes code simplify and percpu_counter_read_positive() don't make underflow issue. Reported-by: Dave Hansen Signed-off-by: KOSAKI Motohiro Cc: Eric B Munson Cc: Mel Gorman Cc: Christoph Lameter Cc: [All kernel versions] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/meminfo.c | 2 +- include/linux/mman.h | 9 +++------ mm/mmap.c | 12 ++++++------ mm/nommu.c | 13 +++++++------ mm/swap.c | 46 ---------------------------------------------- 5 files changed, 17 insertions(+), 65 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 74ea974f5ca6..c6b0302af4c4 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -35,7 +35,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) #define K(x) ((x) << (PAGE_SHIFT - 10)) si_meminfo(&i); si_swapinfo(&i); - committed = atomic_long_read(&vm_committed_space); + committed = percpu_counter_read_positive(&vm_committed_as); allowed = ((totalram_pages - hugetlb_total_pages()) * sysctl_overcommit_ratio / 100) + total_swap_pages; diff --git a/include/linux/mman.h b/include/linux/mman.h index 30d1073bac3b..9872d6ca58ae 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -12,21 +12,18 @@ #ifdef __KERNEL__ #include +#include #include extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; -extern atomic_long_t vm_committed_space; +extern struct percpu_counter vm_committed_as; -#ifdef CONFIG_SMP -extern void vm_acct_memory(long pages); -#else static inline void vm_acct_memory(long pages) { - atomic_long_add(pages, &vm_committed_space); + percpu_counter_add(&vm_committed_as, pages); } -#endif static inline void vm_unacct_memory(long pages) { diff --git a/mm/mmap.c b/mm/mmap.c index 3303d1ba8e87..6b7b1a95944b 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -85,7 +85,7 @@ EXPORT_SYMBOL(vm_get_page_prot); int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ int sysctl_overcommit_ratio = 50; /* default is 50% */ int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; -atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0); +struct percpu_counter vm_committed_as; /* * Check that a process has enough memory to allocate a new virtual @@ -179,11 +179,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) if (mm) allowed -= mm->total_vm / 32; - /* - * cast `allowed' as a signed long because vm_committed_space - * sometimes has a negative value - */ - if (atomic_long_read(&vm_committed_space) < (long)allowed) + if (percpu_counter_read_positive(&vm_committed_as) < allowed) return 0; error: vm_unacct_memory(pages); @@ -2481,4 +2477,8 @@ void mm_drop_all_locks(struct mm_struct *mm) */ void __init mmap_init(void) { + int ret; + + ret = percpu_counter_init(&vm_committed_as, 0); + VM_BUG_ON(ret); } diff --git a/mm/nommu.c b/mm/nommu.c index 
72eda4aee2cb..809998aa7b50 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -62,7 +62,7 @@ void *high_memory; struct page *mem_map; unsigned long max_mapnr; unsigned long num_physpages; -atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0); +struct percpu_counter vm_committed_as; int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ int sysctl_overcommit_ratio = 50; /* default is 50% */ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; @@ -463,6 +463,10 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) */ void __init mmap_init(void) { + int ret; + + ret = percpu_counter_init(&vm_committed_as, 0); + VM_BUG_ON(ret); vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC); } @@ -1847,12 +1851,9 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) if (mm) allowed -= mm->total_vm / 32; - /* - * cast `allowed' as a signed long because vm_committed_space - * sometimes has a negative value - */ - if (atomic_long_read(&vm_committed_space) < (long)allowed) + if (percpu_counter_read_positive(&vm_committed_as) < allowed) return 0; + error: vm_unacct_memory(pages); diff --git a/mm/swap.c b/mm/swap.c index bede23ce64ea..cb29ae5d33ab 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -491,49 +491,6 @@ unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, EXPORT_SYMBOL(pagevec_lookup_tag); -#ifdef CONFIG_SMP -/* - * We tolerate a little inaccuracy to avoid ping-ponging the counter between - * CPUs - */ -#define ACCT_THRESHOLD max(16, NR_CPUS * 2) - -static DEFINE_PER_CPU(long, committed_space); - -void vm_acct_memory(long pages) -{ - long *local; - - preempt_disable(); - local = &__get_cpu_var(committed_space); - *local += pages; - if (*local > ACCT_THRESHOLD || *local < -ACCT_THRESHOLD) { - atomic_long_add(*local, &vm_committed_space); - *local = 0; - } - preempt_enable(); -} - -#ifdef CONFIG_HOTPLUG_CPU - -/* Drop the CPU's cached committed space back into the central pool. */ -static int cpu_swap_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) -{ - long *committed; - - committed = &per_cpu(committed_space, (long)hcpu); - if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { - atomic_long_add(*committed, &vm_committed_space); - *committed = 0; - drain_cpu_pagevecs((long)hcpu); - } - return NOTIFY_OK; -} -#endif /* CONFIG_HOTPLUG_CPU */ -#endif /* CONFIG_SMP */ - /* * Perform any setup for the swap system */ @@ -554,7 +511,4 @@ void __init swap_setup(void) * Right now other parts of the system means that we * _really_ don't want to cluster much more */ -#ifdef CONFIG_HOTPLUG_CPU - hotcpu_notifier(cpu_swap_callback, 0); -#endif } -- cgit v1.2.3 From f75e6745aa3084124ae1434fd7629853bdaf6798 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 21 Apr 2009 17:18:20 -0400 Subject: SUNRPC: Fix the problem of EADDRNOTAVAIL syslog floods on reconnect See http://bugzilla.kernel.org/show_bug.cgi?id=13034 If the port gets into a TIME_WAIT state, then we cannot reconnect without binding to a new port. 
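For illustration only (not part of the patch): the recovery pattern in a condensed, self-contained sketch. The example_* names and the structure are invented; only the set_bit()/test_and_clear_bit() flag handling and the interpretation of -EADDRNOTAVAIL mirror what the patch does in xs_tcp_setup_socket() and xs_tcp_close().

#include <linux/bitops.h>
#include <linux/errno.h>

#define EXAMPLE_CONNECTION_CLOSE	0	/* "tear the socket down completely" */

struct example_xprt {
	unsigned long state;
};

static void example_full_close(struct example_xprt *xprt)
{
	/* release the socket; the next connect binds a brand new local port */
}

static void example_shutdown(struct example_xprt *xprt)
{
	/* orderly shutdown that tries to keep the local port for reuse */
}

/* Connect path: a local port stuck in TIME_WAIT surfaces as -EADDRNOTAVAIL */
static void example_connect_status(struct example_xprt *xprt, int status)
{
	if (status == -EADDRNOTAVAIL)
		set_bit(EXAMPLE_CONNECTION_CLOSE, &xprt->state);
}

/* Close op: pick a full close over a mere shutdown when the flag is set */
static void example_close(struct example_xprt *xprt)
{
	if (test_and_clear_bit(EXAMPLE_CONNECTION_CLOSE, &xprt->state))
		example_full_close(xprt);
	else
		example_shutdown(xprt);
}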
Tested-by: Petr Vandrovec Tested-by: Jean Delvare Signed-off-by: Trond Myklebust Signed-off-by: Linus Torvalds --- include/linux/sunrpc/xprt.h | 1 + net/sunrpc/xprt.c | 6 ++---- net/sunrpc/xprtsock.c | 26 +++++++++++++++++++++----- 3 files changed, 24 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 1758d9f5b5c3..08afe43118f4 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -261,6 +261,7 @@ void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie); #define XPRT_BINDING (5) #define XPRT_CLOSING (6) #define XPRT_CONNECTION_ABORT (7) +#define XPRT_CONNECTION_CLOSE (8) static inline void xprt_set_connected(struct rpc_xprt *xprt) { diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index a0bfe53f1621..06ca058572f2 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -672,10 +672,8 @@ xprt_init_autodisconnect(unsigned long data) if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) goto out_abort; spin_unlock(&xprt->transport_lock); - if (xprt_connecting(xprt)) - xprt_release_write(xprt, NULL); - else - queue_work(rpciod_workqueue, &xprt->task_cleanup); + set_bit(XPRT_CONNECTION_CLOSE, &xprt->state); + queue_work(rpciod_workqueue, &xprt->task_cleanup); return; out_abort: spin_unlock(&xprt->transport_lock); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index d40ff50887aa..e18596146013 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -807,6 +807,9 @@ static void xs_reset_transport(struct sock_xprt *transport) * * This is used when all requests are complete; ie, no DRC state remains * on the server we want to save. + * + * The caller _must_ be holding XPRT_LOCKED in order to avoid issues with + * xs_reset_transport() zeroing the socket from underneath a writer. */ static void xs_close(struct rpc_xprt *xprt) { @@ -824,6 +827,14 @@ static void xs_close(struct rpc_xprt *xprt) xprt_disconnect_done(xprt); } +static void xs_tcp_close(struct rpc_xprt *xprt) +{ + if (test_and_clear_bit(XPRT_CONNECTION_CLOSE, &xprt->state)) + xs_close(xprt); + else + xs_tcp_shutdown(xprt); +} + /** * xs_destroy - prepare to shutdown a transport * @xprt: doomed transport @@ -1772,6 +1783,15 @@ static void xs_tcp_setup_socket(struct rpc_xprt *xprt, xprt, -status, xprt_connected(xprt), sock->sk->sk_state); switch (status) { + default: + printk("%s: connect returned unhandled error %d\n", + __func__, status); + case -EADDRNOTAVAIL: + /* We're probably in TIME_WAIT. 
Get rid of existing socket, + * and retry + */ + set_bit(XPRT_CONNECTION_CLOSE, &xprt->state); + xprt_force_disconnect(xprt); case -ECONNREFUSED: case -ECONNRESET: case -ENETUNREACH: @@ -1782,10 +1802,6 @@ static void xs_tcp_setup_socket(struct rpc_xprt *xprt, xprt_clear_connecting(xprt); return; } - /* get rid of existing socket, and retry */ - xs_tcp_shutdown(xprt); - printk("%s: connect returned unhandled error %d\n", - __func__, status); out_eagain: status = -EAGAIN; out: @@ -1994,7 +2010,7 @@ static struct rpc_xprt_ops xs_tcp_ops = { .buf_free = rpc_free, .send_request = xs_tcp_send_request, .set_retrans_timeout = xprt_set_retrans_timeout_def, - .close = xs_tcp_shutdown, + .close = xs_tcp_close, .destroy = xs_destroy, .print_stats = xs_tcp_print_stats, }; -- cgit v1.2.3 From 9f722c0978b04acba209f8ca1896ad05814bc3a3 Mon Sep 17 00:00:00 2001 From: Omar Laazimani Date: Mon, 4 May 2009 12:01:43 -0700 Subject: usbnet: CDC EEM support (v5) This introduces a CDC Ethernet Emulation Model (EEM) host side driver to support USB EEM devices. EEM is different from the Ethernet Control Model (ECM) currently supported by the "CDC Ethernet" driver. One key difference is that it doesn't require of USB interface alternate settings to manage interface state; some maldesigned hardware can't handle that part of USB. It also avoids a separate USB interface for control and status updates. [ dbrownell@users.sourceforge.net: fix skb leaks, add rx packet checks, improve fault handling, EEM conformance updates, cleanup ] Signed-off-by: Omar Laazimani Signed-off-by: David Brownell Signed-off-by: David S. Miller --- drivers/net/usb/Kconfig | 14 ++ drivers/net/usb/Makefile | 1 + drivers/net/usb/cdc_eem.c | 381 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/usb/cdc.h | 3 + 4 files changed, 399 insertions(+) create mode 100644 drivers/net/usb/cdc_eem.c (limited to 'include/linux') diff --git a/drivers/net/usb/Kconfig b/drivers/net/usb/Kconfig index 8ee21030e9ac..dfc6cf765fbd 100644 --- a/drivers/net/usb/Kconfig +++ b/drivers/net/usb/Kconfig @@ -180,6 +180,20 @@ config USB_NET_CDCETHER IEEE 802 "local assignment" bit is set in the address, a "usbX" name is used instead. +config USB_NET_CDC_EEM + tristate "CDC EEM support" + depends on USB_USBNET && EXPERIMENTAL + help + This option supports devices conforming to the Communication Device + Class (CDC) Ethernet Emulation Model, a specification that's easy to + implement in device firmware. The CDC EEM specifications are available + from . + + This driver creates an interface named "ethX", where X depends on + what other networking devices you have in use. However, if the + IEEE 802 "local assignment" bit is set in the address, a "usbX" + name is used instead. 
+ config USB_NET_DM9601 tristate "Davicom DM9601 based USB 1.1 10/100 ethernet devices" depends on USB_USBNET diff --git a/drivers/net/usb/Makefile b/drivers/net/usb/Makefile index 88a87eeb376a..c8aef62cf2b7 100644 --- a/drivers/net/usb/Makefile +++ b/drivers/net/usb/Makefile @@ -9,6 +9,7 @@ obj-$(CONFIG_USB_RTL8150) += rtl8150.o obj-$(CONFIG_USB_HSO) += hso.o obj-$(CONFIG_USB_NET_AX8817X) += asix.o obj-$(CONFIG_USB_NET_CDCETHER) += cdc_ether.o +obj-$(CONFIG_USB_NET_CDC_EEM) += cdc_eem.o obj-$(CONFIG_USB_NET_DM9601) += dm9601.o obj-$(CONFIG_USB_NET_SMSC95XX) += smsc95xx.o obj-$(CONFIG_USB_NET_GL620A) += gl620a.o diff --git a/drivers/net/usb/cdc_eem.c b/drivers/net/usb/cdc_eem.c new file mode 100644 index 000000000000..80e01778dd3b --- /dev/null +++ b/drivers/net/usb/cdc_eem.c @@ -0,0 +1,381 @@ +/* + * USB CDC EEM network interface driver + * Copyright (C) 2009 Oberthur Technologies + * by Omar Laazimani, Olivier Condemine + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* + * This driver is an implementation of the CDC "Ethernet Emulation + * Model" (EEM) specification, which encapsulates Ethernet frames + * for transport over USB using a simpler USB device model than the + * previous CDC "Ethernet Control Model" (ECM, or "CDC Ethernet"). + * + * For details, see www.usb.org/developers/devclass_docs/CDC_EEM10.pdf + * + * This version has been tested with GIGAntIC WuaoW SIM Smart Card on 2.6.24, + * 2.6.27 and 2.6.30rc2 kernel. + * It has also been validated on Openmoko Om 2008.12 (based on 2.6.24 kernel). 
+ * build on 23-April-2009 + */ + +#define EEM_HEAD 2 /* 2 byte header */ + +/*-------------------------------------------------------------------------*/ + +static void eem_linkcmd_complete(struct urb *urb) +{ + dev_kfree_skb(urb->context); + usb_free_urb(urb); +} + +static void eem_linkcmd(struct usbnet *dev, struct sk_buff *skb) +{ + struct urb *urb; + int status; + + urb = usb_alloc_urb(0, GFP_ATOMIC); + if (!urb) + goto fail; + + usb_fill_bulk_urb(urb, dev->udev, dev->out, + skb->data, skb->len, eem_linkcmd_complete, skb); + + status = usb_submit_urb(urb, GFP_ATOMIC); + if (status) { + usb_free_urb(urb); +fail: + dev_kfree_skb(skb); + devwarn(dev, "link cmd failure\n"); + return; + } +} + +static int eem_bind(struct usbnet *dev, struct usb_interface *intf) +{ + int status = 0; + + status = usbnet_get_endpoints(dev, intf); + if (status < 0) { + usb_set_intfdata(intf, NULL); + usb_driver_release_interface(driver_of(intf), intf); + return status; + } + + /* no jumbogram (16K) support for now */ + + dev->net->hard_header_len += EEM_HEAD + ETH_FCS_LEN; + + return 0; +} + +/* + * EEM permits packing multiple Ethernet frames into USB transfers + * (a "bundle"), but for TX we don't try to do that. + */ +static struct sk_buff *eem_tx_fixup(struct usbnet *dev, struct sk_buff *skb, + gfp_t flags) +{ + struct sk_buff *skb2 = NULL; + u16 len = skb->len; + u32 crc = 0; + int padlen = 0; + + /* When ((len + EEM_HEAD + ETH_FCS_LEN) % dev->maxpacket) is + * zero, stick two bytes of zero length EEM packet on the end. + * Else the framework would add invalid single byte padding, + * since it can't know whether ZLPs will be handled right by + * all the relevant hardware and software. + */ + if (!((len + EEM_HEAD + ETH_FCS_LEN) % dev->maxpacket)) + padlen += 2; + + if (!skb_cloned(skb)) { + int headroom = skb_headroom(skb); + int tailroom = skb_tailroom(skb); + + if ((tailroom >= ETH_FCS_LEN + padlen) + && (headroom >= EEM_HEAD)) + goto done; + + if ((headroom + tailroom) + > (EEM_HEAD + ETH_FCS_LEN + padlen)) { + skb->data = memmove(skb->head + + EEM_HEAD, + skb->data, + skb->len); + skb_set_tail_pointer(skb, len); + goto done; + } + } + + skb2 = skb_copy_expand(skb, EEM_HEAD, ETH_FCS_LEN + padlen, flags); + if (!skb2) + return NULL; + + dev_kfree_skb_any(skb); + skb = skb2; + +done: + /* we don't use the "no Ethernet CRC" option */ + crc = crc32_le(~0, skb->data, skb->len); + crc = ~crc; + + put_unaligned_le32(crc, skb_put(skb, 4)); + + /* EEM packet header format: + * b0..13: length of ethernet frame + * b14: bmCRC (1 == valid Ethernet CRC) + * b15: bmType (0 == data) + */ + len = skb->len; + put_unaligned_le16(BIT(14) | len, skb_push(skb, 2)); + + /* Bundle a zero length EEM packet if needed */ + if (padlen) + put_unaligned_le16(0, skb_put(skb, 2)); + + return skb; +} + +static int eem_rx_fixup(struct usbnet *dev, struct sk_buff *skb) +{ + /* + * Our task here is to strip off framing, leaving skb with one + * data frame for the usbnet framework code to process. But we + * may have received multiple EEM payloads, or command payloads. + * So we must process _everything_ as if it's a header, except + * maybe the last data payload + * + * REVISIT the framework needs updating so that when we consume + * all payloads (the last or only message was a command, or a + * zero length EEM packet) that is not accounted as an rx_error. + */ + do { + struct sk_buff *skb2 = NULL; + u16 header; + u16 len = 0; + + /* incomplete EEM header? 
*/ + if (skb->len < EEM_HEAD) + return 0; + + /* + * EEM packet header format: + * b0..14: EEM type dependant (Data or Command) + * b15: bmType + */ + header = get_unaligned_le16(skb->data); + skb_pull(skb, EEM_HEAD); + + /* + * The bmType bit helps to denote when EEM + * packet is data or command : + * bmType = 0 : EEM data payload + * bmType = 1 : EEM (link) command + */ + if (header & BIT(15)) { + u16 bmEEMCmd; + + /* + * EEM (link) command packet: + * b0..10: bmEEMCmdParam + * b11..13: bmEEMCmd + * b14: bmReserved (must be 0) + * b15: 1 (EEM command) + */ + if (header & BIT(14)) { + devdbg(dev, "reserved command %04x\n", header); + continue; + } + + bmEEMCmd = (header >> 11) & 0x7; + switch (bmEEMCmd) { + + /* Responding to echo requests is mandatory. */ + case 0: /* Echo command */ + len = header & 0x7FF; + + /* bogus command? */ + if (skb->len < len) + return 0; + + skb2 = skb_clone(skb, GFP_ATOMIC); + if (unlikely(!skb2)) + goto next; + skb_trim(skb2, len); + put_unaligned_le16(BIT(15) | (1 << 11) | len, + skb_push(skb2, 2)); + eem_linkcmd(dev, skb2); + break; + + /* + * Host may choose to ignore hints. + * - suspend: peripheral ready to suspend + * - response: suggest N millisec polling + * - response complete: suggest N sec polling + */ + case 2: /* Suspend hint */ + case 3: /* Response hint */ + case 4: /* Response complete hint */ + continue; + + /* + * Hosts should never receive host-to-peripheral + * or reserved command codes; or responses to an + * echo command we didn't send. + */ + case 1: /* Echo response */ + case 5: /* Tickle */ + default: /* reserved */ + devwarn(dev, "unexpected link command %d\n", + bmEEMCmd); + continue; + } + + } else { + u32 crc, crc2; + int is_last; + + /* zero length EEM packet? */ + if (header == 0) + continue; + + /* + * EEM data packet header : + * b0..13: length of ethernet frame + * b14: bmCRC + * b15: 0 (EEM data) + */ + len = header & 0x3FFF; + + /* bogus EEM payload? */ + if (skb->len < len) + return 0; + + /* bogus ethernet frame? */ + if (len < (ETH_HLEN + ETH_FCS_LEN)) + goto next; + + /* + * Treat the last payload differently: framework + * code expects our "fixup" to have stripped off + * headers, so "skb" is a data packet (or error). + * Else if it's not the last payload, keep "skb" + * for further processing. 
+ */ + is_last = (len == skb->len); + if (is_last) + skb2 = skb; + else { + skb2 = skb_clone(skb, GFP_ATOMIC); + if (unlikely(!skb2)) + return 0; + } + + crc = get_unaligned_le32(skb2->data + + len - ETH_FCS_LEN); + skb_trim(skb2, len - ETH_FCS_LEN); + + /* + * The bmCRC helps to denote when the CRC field in + * the Ethernet frame contains a calculated CRC: + * bmCRC = 1 : CRC is calculated + * bmCRC = 0 : CRC = 0xDEADBEEF + */ + if (header & BIT(14)) + crc2 = ~crc32_le(~0, skb2->data, len); + else + crc2 = 0xdeadbeef; + + if (is_last) + return crc == crc2; + + if (unlikely(crc != crc2)) { + dev->stats.rx_errors++; + dev_kfree_skb_any(skb2); + } else + usbnet_skb_return(dev, skb2); + } + +next: + skb_pull(skb, len); + } while (skb->len); + + return 1; +} + +static const struct driver_info eem_info = { + .description = "CDC EEM Device", + .flags = FLAG_ETHER, + .bind = eem_bind, + .rx_fixup = eem_rx_fixup, + .tx_fixup = eem_tx_fixup, +}; + +/*-------------------------------------------------------------------------*/ + +static const struct usb_device_id products[] = { +{ + USB_INTERFACE_INFO(USB_CLASS_COMM, USB_CDC_SUBCLASS_EEM, + USB_CDC_PROTO_EEM), + .driver_info = (unsigned long) &eem_info, +}, +{ + /* EMPTY == end of list */ +}, +}; +MODULE_DEVICE_TABLE(usb, products); + +static struct usb_driver eem_driver = { + .name = "cdc_eem", + .id_table = products, + .probe = usbnet_probe, + .disconnect = usbnet_disconnect, + .suspend = usbnet_suspend, + .resume = usbnet_resume, +}; + + +static int __init eem_init(void) +{ + return usb_register(&eem_driver); +} +module_init(eem_init); + +static void __exit eem_exit(void) +{ + usb_deregister(&eem_driver); +} +module_exit(eem_exit); + +MODULE_AUTHOR("Omar Laazimani "); +MODULE_DESCRIPTION("USB CDC EEM"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/usb/cdc.h b/include/linux/usb/cdc.h index 3c86ed25a04c..c24124a42ce5 100644 --- a/include/linux/usb/cdc.h +++ b/include/linux/usb/cdc.h @@ -17,6 +17,7 @@ #define USB_CDC_SUBCLASS_DMM 0x09 #define USB_CDC_SUBCLASS_MDLM 0x0a #define USB_CDC_SUBCLASS_OBEX 0x0b +#define USB_CDC_SUBCLASS_EEM 0x0c #define USB_CDC_PROTO_NONE 0 @@ -28,6 +29,8 @@ #define USB_CDC_ACM_PROTO_AT_CDMA 6 #define USB_CDC_ACM_PROTO_VENDOR 0xff +#define USB_CDC_PROTO_EEM 7 + /*-------------------------------------------------------------------------*/ /* -- cgit v1.2.3 From a7ca7fccacc029958fd09985e7f3529b90ec791d Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 5 May 2009 14:31:12 +0200 Subject: netfilter: add missing linux/types.h include to xt_LED.h Pointed out by Dave Miller: CHECK include/linux/netfilter (57 files) /home/davem/src/GIT/net-2.6/usr/include/linux/netfilter/xt_LED.h:6: found __[us]{8,16,32,64} type without #include Signed-off-by: Patrick McHardy --- include/linux/netfilter/xt_LED.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfilter/xt_LED.h b/include/linux/netfilter/xt_LED.h index 4c91a0d770d0..f5509e7524d3 100644 --- a/include/linux/netfilter/xt_LED.h +++ b/include/linux/netfilter/xt_LED.h @@ -1,6 +1,8 @@ #ifndef _XT_LED_H #define _XT_LED_H +#include + struct xt_led_info { char id[27]; /* Unique ID for this trigger in the LED class */ __u8 always_blink; /* Blink even if the LED is already on */ -- cgit v1.2.3 From 280f37afa2c270ff029cb420b34396aa002909c3 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 5 May 2009 17:46:07 +0200 Subject: netfilter: xt_cluster: fix use of cluster match with 32 nodes This patch fixes a problem when you use 32 
nodes in the cluster match: % iptables -I PREROUTING -t mangle -i eth0 -m cluster \ --cluster-total-nodes 32 --cluster-local-node 32 \ --cluster-hash-seed 0xdeadbeef -j MARK --set-mark 0xffff iptables: Invalid argument. Run `dmesg' for more information. % dmesg | tail -1 xt_cluster: this node mask cannot be higher than the total number of nodes The problem is related to this checking: if (info->node_mask >= (1 << info->total_nodes)) { printk(KERN_ERR "xt_cluster: this node mask cannot be " "higher than the total number of nodes\n"); return false; } (1 << 32) is 1. Thus, the checking fails. BTW, I said this before but I insist: I have only tested the cluster match with 2 nodes getting ~45% extra performance in an active-active setup. The maximum limit of 32 nodes is still completely arbitrary. I'd really appreciate if people that have more nodes in their setups let me know. Signed-off-by: Pablo Neira Ayuso Signed-off-by: Patrick McHardy --- include/linux/netfilter/xt_cluster.h | 2 ++ net/netfilter/xt_cluster.c | 8 +++++++- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/xt_cluster.h b/include/linux/netfilter/xt_cluster.h index 5e0a0d07b526..886682656f09 100644 --- a/include/linux/netfilter/xt_cluster.h +++ b/include/linux/netfilter/xt_cluster.h @@ -12,4 +12,6 @@ struct xt_cluster_match_info { u_int32_t flags; }; +#define XT_CLUSTER_NODES_MAX 32 + #endif /* _XT_CLUSTER_MATCH_H */ diff --git a/net/netfilter/xt_cluster.c b/net/netfilter/xt_cluster.c index 6c4847662b85..69a639f35403 100644 --- a/net/netfilter/xt_cluster.c +++ b/net/netfilter/xt_cluster.c @@ -135,7 +135,13 @@ static bool xt_cluster_mt_checkentry(const struct xt_mtchk_param *par) { struct xt_cluster_match_info *info = par->matchinfo; - if (info->node_mask >= (1 << info->total_nodes)) { + if (info->total_nodes > XT_CLUSTER_NODES_MAX) { + printk(KERN_ERR "xt_cluster: you have exceeded the maximum " + "number of cluster nodes (%u > %u)\n", + info->total_nodes, XT_CLUSTER_NODES_MAX); + return false; + } + if (info->node_mask >= (1ULL << info->total_nodes)) { printk(KERN_ERR "xt_cluster: this node mask cannot be " "higher than the total number of nodes\n"); return false; -- cgit v1.2.3 From e67c85626cd02e306da1b4195bfaf68d61050796 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sun, 8 Mar 2009 23:13:32 +0800 Subject: Revert driver core: move platform_data into platform_device This reverts commit 006f4571a15fae3a0575f2a0f9e9b63b3d1012f8: This patch moves platform_data from struct device into struct platform_device, based on the two ideas: 1. Now all platform_driver is registered by platform_driver_register, which makes probe()/release()/... of platform_driver passed parameter of platform_device *, so platform driver can get platform_data from platform_device; 2. Other kind of devices do not need to use platform_data, we can decrease size of device if moving it to platform_device. Taking into consideration of thousands of files to be fixed and they can't be finished in one night(maybe it will take a long time), so we keep platform_data in device to allow two kind of cases coexist until all platform devices pass its platfrom data from platform_device->platform_data. All patches to do this kind of conversion are welcome. As we don't really want to do it, it was a bad idea. 
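For reference, the convention this revert preserves is what platform drivers already follow: board code attaches its data with platform_device_add_data() and the driver reads it back through pdev->dev.platform_data. The fragment below is a hypothetical driver; the foo_* names and the irq_pin field are invented for illustration and are not part of this patch.

#include <linux/platform_device.h>

struct foo_pdata {			/* made-up board-specific data */
	int irq_pin;
};

static int foo_probe(struct platform_device *pdev)
{
	/* platform data still lives on the embedded struct device */
	struct foo_pdata *pdata = pdev->dev.platform_data;

	if (!pdata)
		return -ENODEV;

	dev_info(&pdev->dev, "irq_pin=%d\n", pdata->irq_pin);
	return 0;
}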
Cc: David Brownell Cc: Ming Lei Signed-off-by: Greg Kroah-Hartman --- drivers/base/platform.c | 3 --- include/linux/device.h | 9 ++------- include/linux/platform_device.h | 1 - 3 files changed, 2 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/platform.c b/drivers/base/platform.c index d1d0ee431926..8b4708e06244 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -217,7 +217,6 @@ int platform_device_add_data(struct platform_device *pdev, const void *data, if (d) { memcpy(d, data, size); pdev->dev.platform_data = d; - pdev->platform_data = d; } return d ? 0 : -ENOMEM; } @@ -247,8 +246,6 @@ int platform_device_add(struct platform_device *pdev) else dev_set_name(&pdev->dev, pdev->name); - pdev->platform_data = pdev->dev.platform_data; - for (i = 0; i < pdev->num_resources; i++) { struct resource *p, *r = &pdev->resource[i]; diff --git a/include/linux/device.h b/include/linux/device.h index 6a69caaac18a..5d5c197bad45 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -384,13 +384,8 @@ struct device { struct device_driver *driver; /* which driver has allocated this device */ void *driver_data; /* data private to the driver */ - - void *platform_data; /* We will remove platform_data - field if all platform devices - pass its platform specific data - from platform_device->platform_data, - other kind of devices should not - use platform_data. */ + void *platform_data; /* Platform specific data, device + core doesn't touch it */ struct dev_pm_info power; #ifdef CONFIG_NUMA diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h index 72736fd8223c..b67bb5d7b221 100644 --- a/include/linux/platform_device.h +++ b/include/linux/platform_device.h @@ -20,7 +20,6 @@ struct platform_device { struct device dev; u32 num_resources; struct resource * resource; - void *platform_data; struct platform_device_id *id_entry; }; -- cgit v1.2.3 From edcc37a0478836b4a51eafb1bcec6a52708f681d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 3 May 2009 06:00:05 -0400 Subject: Always lookup priv_root on reiserfs mount and keep it ... even if it's a negative dentry. That way we can set ->d_op on root before anyone could race with us. Simplify d_compare(), while we are at it. 
Signed-off-by: Al Viro --- fs/reiserfs/super.c | 6 ++- fs/reiserfs/xattr.c | 86 ++++++++++++++++++------------------------ include/linux/reiserfs_xattr.h | 1 + 3 files changed, 41 insertions(+), 52 deletions(-) (limited to 'include/linux') diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 0ae6486d9046..d444fe0013a4 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -1842,7 +1842,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) goto error; } - if ((errval = reiserfs_xattr_init(s, s->s_flags))) { + if ((errval = reiserfs_lookup_privroot(s)) || + (errval = reiserfs_xattr_init(s, s->s_flags))) { dput(s->s_root); s->s_root = NULL; goto error; @@ -1855,7 +1856,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) reiserfs_info(s, "using 3.5.x disk format\n"); } - if ((errval = reiserfs_xattr_init(s, s->s_flags))) { + if ((errval = reiserfs_lookup_privroot(s)) || + (errval = reiserfs_xattr_init(s, s->s_flags))) { dput(s->s_root); s->s_root = NULL; goto error; diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 31a3dbb120e1..2891f789f545 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -903,16 +903,19 @@ static int create_privroot(struct dentry *dentry) WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex)); err = xattr_mkdir(inode, dentry, 0700); - if (err) { - dput(dentry); - dentry = NULL; + if (err || !dentry->d_inode) { + reiserfs_warning(dentry->d_sb, "jdm-20006", + "xattrs/ACLs enabled and couldn't " + "find/create .reiserfs_priv. " + "Failing mount."); + return -EOPNOTSUPP; } - if (dentry && dentry->d_inode) - reiserfs_info(dentry->d_sb, "Created %s - reserved for xattr " - "storage.\n", PRIVROOT_NAME); + dentry->d_inode->i_flags |= S_PRIVATE; + reiserfs_info(dentry->d_sb, "Created %s - reserved for xattr " + "storage.\n", PRIVROOT_NAME); - return err; + return 0; } static int xattr_mount_check(struct super_block *s) @@ -944,11 +947,9 @@ static int xattr_lookup_poison(struct dentry *dentry, struct qstr *q1, struct qstr *name) { struct dentry *priv_root = REISERFS_SB(dentry->d_sb)->priv_root; - if (name->len == priv_root->d_name.len && - name->hash == priv_root->d_name.hash && - !memcmp(name->name, priv_root->d_name.name, name->len)) { + if (container_of(q1, struct dentry, d_name) == priv_root) return -ENOENT; - } else if (q1->len == name->len && + if (q1->len == name->len && !memcmp(q1->name, name->name, name->len)) return 0; return 1; @@ -958,6 +959,27 @@ static const struct dentry_operations xattr_lookup_poison_ops = { .d_compare = xattr_lookup_poison, }; +int reiserfs_lookup_privroot(struct super_block *s) +{ + struct dentry *dentry; + int err = 0; + + /* If we don't have the privroot located yet - go find it */ + mutex_lock(&s->s_root->d_inode->i_mutex); + dentry = lookup_one_len(PRIVROOT_NAME, s->s_root, + strlen(PRIVROOT_NAME)); + if (!IS_ERR(dentry)) { + REISERFS_SB(s)->priv_root = dentry; + s->s_root->d_op = &xattr_lookup_poison_ops; + if (dentry->d_inode) + dentry->d_inode->i_flags |= S_PRIVATE; + } else + err = PTR_ERR(dentry); + mutex_unlock(&s->s_root->d_inode->i_mutex); + + return err; +} + /* We need to take a copy of the mount flags since things like * MS_RDONLY don't get set until *after* we're called. 
* mount_flags != mount_options */ @@ -969,48 +991,12 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags) err = xattr_mount_check(s); if (err) goto error; -#endif - /* If we don't have the privroot located yet - go find it */ - if (!REISERFS_SB(s)->priv_root) { - struct dentry *dentry; - mutex_lock_nested(&s->s_root->d_inode->i_mutex, I_MUTEX_CHILD); - dentry = lookup_one_len(PRIVROOT_NAME, s->s_root, - strlen(PRIVROOT_NAME)); - if (!IS_ERR(dentry)) { -#ifdef CONFIG_REISERFS_FS_XATTR - if (!(mount_flags & MS_RDONLY) && !dentry->d_inode) - err = create_privroot(dentry); -#endif - if (!dentry->d_inode) { - dput(dentry); - dentry = NULL; - } - } else - err = PTR_ERR(dentry); + if (!REISERFS_SB(s)->priv_root->d_inode && !(mount_flags & MS_RDONLY)) { + mutex_lock(&s->s_root->d_inode->i_mutex); + err = create_privroot(REISERFS_SB(s)->priv_root); mutex_unlock(&s->s_root->d_inode->i_mutex); - - if (!err && dentry) { - s->s_root->d_op = &xattr_lookup_poison_ops; - dentry->d_inode->i_flags |= S_PRIVATE; - REISERFS_SB(s)->priv_root = dentry; -#ifdef CONFIG_REISERFS_FS_XATTR - /* xattrs are unavailable */ - } else if (!(mount_flags & MS_RDONLY)) { - /* If we're read-only it just means that the dir - * hasn't been created. Not an error -- just no - * xattrs on the fs. We'll check again if we - * go read-write */ - reiserfs_warning(s, "jdm-20006", - "xattrs/ACLs enabled and couldn't " - "find/create .reiserfs_priv. " - "Failing mount."); - err = -EOPNOTSUPP; -#endif - } } - -#ifdef CONFIG_REISERFS_FS_XATTR if (!err) s->s_xattr = reiserfs_xattr_handlers; diff --git a/include/linux/reiserfs_xattr.h b/include/linux/reiserfs_xattr.h index dcae01e63e40..fea1a8e65bef 100644 --- a/include/linux/reiserfs_xattr.h +++ b/include/linux/reiserfs_xattr.h @@ -38,6 +38,7 @@ struct nameidata; int reiserfs_xattr_register_handlers(void) __init; void reiserfs_xattr_unregister_handlers(void); int reiserfs_xattr_init(struct super_block *sb, int mount_flags); +int reiserfs_lookup_privroot(struct super_block *sb); int reiserfs_delete_xattrs(struct inode *inode); int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs); -- cgit v1.2.3 From ab17c4f02156c4f75d7fa43a5aa2a7f942d47201 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Tue, 5 May 2009 15:30:15 -0400 Subject: reiserfs: fixup xattr_root caching The xattr_root caching was broken from my previous patch set. It wouldn't cause corruption, but could cause decreased performance due to allocating a larger chunk of the journal (~ 27 blocks) than it would actually use. This patch loads the xattr root dentry at xattr initialization and creates it on-demand. Since we're using the cached dentry, there's no point in keeping lookup_or_create_dir around, so that's removed. Signed-off-by: Jeff Mahoney Signed-off-by: Al Viro --- fs/reiserfs/xattr.c | 73 ++++++++++++++++++++++++++---------------- include/linux/reiserfs_fs_sb.h | 2 +- include/linux/reiserfs_xattr.h | 2 +- 3 files changed, 48 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 2891f789f545..c77984473db9 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -113,36 +113,28 @@ static int xattr_rmdir(struct inode *dir, struct dentry *dentry) #define xattr_may_create(flags) (!flags || flags & XATTR_CREATE) -/* Returns and possibly creates the xattr dir. 
*/ -static struct dentry *lookup_or_create_dir(struct dentry *parent, - const char *name, int flags) +static struct dentry *open_xa_root(struct super_block *sb, int flags) { - struct dentry *dentry; - BUG_ON(!parent); + struct dentry *privroot = REISERFS_SB(sb)->priv_root; + struct dentry *xaroot; + if (!privroot->d_inode) + return ERR_PTR(-ENODATA); - mutex_lock_nested(&parent->d_inode->i_mutex, I_MUTEX_XATTR); - dentry = lookup_one_len(name, parent, strlen(name)); - if (!IS_ERR(dentry) && !dentry->d_inode) { - int err = -ENODATA; + mutex_lock_nested(&privroot->d_inode->i_mutex, I_MUTEX_XATTR); + xaroot = dget(REISERFS_SB(sb)->xattr_root); + if (!xaroot->d_inode) { + int err = -ENODATA; if (xattr_may_create(flags)) - err = xattr_mkdir(parent->d_inode, dentry, 0700); - + err = xattr_mkdir(privroot->d_inode, xaroot, 0700); if (err) { - dput(dentry); - dentry = ERR_PTR(err); + dput(xaroot); + xaroot = ERR_PTR(err); } } - mutex_unlock(&parent->d_inode->i_mutex); - return dentry; -} -static struct dentry *open_xa_root(struct super_block *sb, int flags) -{ - struct dentry *privroot = REISERFS_SB(sb)->priv_root; - if (!privroot) - return ERR_PTR(-ENODATA); - return lookup_or_create_dir(privroot, XAROOT_NAME, flags); + mutex_unlock(&privroot->d_inode->i_mutex); + return xaroot; } static struct dentry *open_xa_dir(const struct inode *inode, int flags) @@ -158,10 +150,22 @@ static struct dentry *open_xa_dir(const struct inode *inode, int flags) le32_to_cpu(INODE_PKEY(inode)->k_objectid), inode->i_generation); - xadir = lookup_or_create_dir(xaroot, namebuf, flags); + mutex_lock_nested(&xaroot->d_inode->i_mutex, I_MUTEX_XATTR); + + xadir = lookup_one_len(namebuf, xaroot, strlen(namebuf)); + if (!IS_ERR(xadir) && !xadir->d_inode) { + int err = -ENODATA; + if (xattr_may_create(flags)) + err = xattr_mkdir(xaroot->d_inode, xadir, 0700); + if (err) { + dput(xadir); + xadir = ERR_PTR(err); + } + } + + mutex_unlock(&xaroot->d_inode->i_mutex); dput(xaroot); return xadir; - } /* The following are side effects of other operations that aren't explicitly @@ -986,19 +990,33 @@ int reiserfs_lookup_privroot(struct super_block *s) int reiserfs_xattr_init(struct super_block *s, int mount_flags) { int err = 0; + struct dentry *privroot = REISERFS_SB(s)->priv_root; #ifdef CONFIG_REISERFS_FS_XATTR err = xattr_mount_check(s); if (err) goto error; - if (!REISERFS_SB(s)->priv_root->d_inode && !(mount_flags & MS_RDONLY)) { + if (!privroot->d_inode && !(mount_flags & MS_RDONLY)) { mutex_lock(&s->s_root->d_inode->i_mutex); err = create_privroot(REISERFS_SB(s)->priv_root); mutex_unlock(&s->s_root->d_inode->i_mutex); } - if (!err) + + if (privroot->d_inode) { s->s_xattr = reiserfs_xattr_handlers; + mutex_lock(&privroot->d_inode->i_mutex); + if (!REISERFS_SB(s)->xattr_root) { + struct dentry *dentry; + dentry = lookup_one_len(XAROOT_NAME, privroot, + strlen(XAROOT_NAME)); + if (!IS_ERR(dentry)) + REISERFS_SB(s)->xattr_root = dentry; + else + err = PTR_ERR(dentry); + } + mutex_unlock(&privroot->d_inode->i_mutex); + } error: if (err) { @@ -1008,11 +1026,12 @@ error: #endif /* The super_block MS_POSIXACL must mirror the (no)acl mount option. 
*/ - s->s_flags = s->s_flags & ~MS_POSIXACL; #ifdef CONFIG_REISERFS_FS_POSIX_ACL if (reiserfs_posixacl(s)) s->s_flags |= MS_POSIXACL; + else #endif + s->s_flags &= ~MS_POSIXACL; return err; } diff --git a/include/linux/reiserfs_fs_sb.h b/include/linux/reiserfs_fs_sb.h index 6b361d23a499..8651640868a1 100644 --- a/include/linux/reiserfs_fs_sb.h +++ b/include/linux/reiserfs_fs_sb.h @@ -402,7 +402,7 @@ struct reiserfs_sb_info { int reserved_blocks; /* amount of blocks reserved for further allocations */ spinlock_t bitmap_lock; /* this lock on now only used to protect reserved_blocks variable */ struct dentry *priv_root; /* root of /.reiserfs_priv */ - struct dentry *xattr_root; /* root of /.reiserfs_priv/.xa */ + struct dentry *xattr_root; /* root of /.reiserfs_priv/xattrs */ int j_errno; #ifdef CONFIG_QUOTA char *s_qf_names[MAXQUOTAS]; diff --git a/include/linux/reiserfs_xattr.h b/include/linux/reiserfs_xattr.h index fea1a8e65bef..cdedc01036e4 100644 --- a/include/linux/reiserfs_xattr.h +++ b/include/linux/reiserfs_xattr.h @@ -98,7 +98,7 @@ static inline size_t reiserfs_xattr_jcreate_nblocks(struct inode *inode) if ((REISERFS_I(inode)->i_flags & i_has_xattr_dir) == 0) { nblocks += JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb); - if (REISERFS_SB(inode->i_sb)->xattr_root == NULL) + if (!REISERFS_SB(inode->i_sb)->xattr_root->d_inode) nblocks += JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb); } -- cgit v1.2.3 From 677c9b2e393a0cd203bd54e9c18b012b2c73305a Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Tue, 5 May 2009 15:30:17 -0400 Subject: reiserfs: remove privroot hiding in lookup With Al Viro's patch to move privroot lookup to fs mount, there's no need to have special code to hide the privroot in reiserfs_lookup. I've also cleaned up the privroot hiding in reiserfs_readdir_dentry and removed the last user of reiserfs_xattrs(). 
Signed-off-by: Jeff Mahoney Signed-off-by: Al Viro --- fs/reiserfs/dir.c | 24 +++++++++++++----------- fs/reiserfs/namei.c | 17 ++--------------- fs/reiserfs/xattr.c | 2 +- include/linux/reiserfs_fs_sb.h | 1 - 4 files changed, 16 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c index 67a80d7e59e2..45ee3d357c70 100644 --- a/fs/reiserfs/dir.c +++ b/fs/reiserfs/dir.c @@ -41,6 +41,18 @@ static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry, #define store_ih(where,what) copy_item_head (where, what) +static inline bool is_privroot_deh(struct dentry *dir, + struct reiserfs_de_head *deh) +{ + int ret = 0; +#ifdef CONFIG_REISERFS_FS_XATTR + struct dentry *privroot = REISERFS_SB(dir->d_sb)->priv_root; + ret = (dir == dir->d_parent && privroot->d_inode && + deh->deh_objectid == INODE_PKEY(privroot->d_inode)->k_objectid); +#endif + return ret; +} + int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent, filldir_t filldir, loff_t *pos) { @@ -138,18 +150,8 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent, } /* Ignore the .reiserfs_priv entry */ - if (reiserfs_xattrs(inode->i_sb) && - !old_format_only(inode->i_sb) && - dentry == inode->i_sb->s_root && - REISERFS_SB(inode->i_sb)->priv_root && - REISERFS_SB(inode->i_sb)->priv_root->d_inode - && deh_objectid(deh) == - le32_to_cpu(INODE_PKEY - (REISERFS_SB(inode->i_sb)-> - priv_root->d_inode)-> - k_objectid)) { + if (is_privroot_deh(dentry, deh)) continue; - } d_off = deh_offset(deh); *pos = d_off; diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index efd4d720718e..271579128634 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -338,21 +338,8 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry, &path_to_entry, &de); pathrelse(&path_to_entry); if (retval == NAME_FOUND) { - /* Hide the .reiserfs_priv directory */ - if (reiserfs_xattrs(dir->i_sb) && - !old_format_only(dir->i_sb) && - REISERFS_SB(dir->i_sb)->priv_root && - REISERFS_SB(dir->i_sb)->priv_root->d_inode && - de.de_objectid == - le32_to_cpu(INODE_PKEY - (REISERFS_SB(dir->i_sb)->priv_root->d_inode)-> - k_objectid)) { - reiserfs_write_unlock(dir->i_sb); - return ERR_PTR(-EACCES); - } - - inode = - reiserfs_iget(dir->i_sb, (struct cpu_key *)&(de.de_dir_id)); + inode = reiserfs_iget(dir->i_sb, + (struct cpu_key *)&(de.de_dir_id)); if (!inode || IS_ERR(inode)) { reiserfs_write_unlock(dir->i_sb); return ERR_PTR(-EACCES); diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index c77984473db9..2237e10c7c7c 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -841,7 +841,7 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size) if (!dentry->d_inode) return -EINVAL; - if (!reiserfs_xattrs(dentry->d_sb) || + if (!dentry->d_sb->s_xattr || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) return -EOPNOTSUPP; diff --git a/include/linux/reiserfs_fs_sb.h b/include/linux/reiserfs_fs_sb.h index 8651640868a1..6473650c28f1 100644 --- a/include/linux/reiserfs_fs_sb.h +++ b/include/linux/reiserfs_fs_sb.h @@ -488,7 +488,6 @@ enum reiserfs_mount_options { #define reiserfs_data_log(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_LOG)) #define reiserfs_data_ordered(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_ORDERED)) #define reiserfs_data_writeback(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_WRITEBACK)) -#define reiserfs_xattrs(s) ((s)->s_xattr != NULL) #define reiserfs_xattrs_user(s) 
(REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_XATTRS_USER)) #define reiserfs_posixacl(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_POSIXACL)) #define reiserfs_xattrs_optional(s) (reiserfs_xattrs_user(s) || reiserfs_posixacl(s)) -- cgit v1.2.3 From 74dbbdd7fdc11763f4698d2f3e684cf4446951e6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 6 May 2009 01:07:50 -0400 Subject: New helper: deactivate_locked_super() Does equivalent of up_write(&s->s_umount); deactivate_super(s); However, it does not does not unlock it until it's all over. As the result, it's safe to use to dispose of new superblock on ->get_sb() failure exits - nobody will see the sucker until it's all over. Equivalent using up_write/deactivate_super is safe for that purpose if superblock is either safe to use or has NULL ->s_root when we unlock. Normally filesystems take the required precautions, but a) we do have bugs in that area in some of them. b) up_write/deactivate_super sequence is extremely common, so the helper makes sense anyway. Signed-off-by: Al Viro --- fs/super.c | 46 ++++++++++++++++++++++++++++++++++------------ include/linux/fs.h | 1 + 2 files changed, 35 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/fs/super.c b/fs/super.c index 786fe7d72790..a9dc4c33ef4d 100644 --- a/fs/super.c +++ b/fs/super.c @@ -207,6 +207,34 @@ void deactivate_super(struct super_block *s) EXPORT_SYMBOL(deactivate_super); +/** + * deactivate_locked_super - drop an active reference to superblock + * @s: superblock to deactivate + * + * Equivalent of up_write(&s->s_umount); deactivate_super(s);, except that + * it does not unlock it until it's all over. As the result, it's safe to + * use to dispose of new superblock on ->get_sb() failure exits - nobody + * will see the sucker until it's all over. Equivalent using up_write + + * deactivate_super is safe for that purpose only if superblock is either + * safe to use or has NULL ->s_root when we unlock. + */ +void deactivate_locked_super(struct super_block *s) +{ + struct file_system_type *fs = s->s_type; + if (atomic_dec_and_lock(&s->s_active, &sb_lock)) { + s->s_count -= S_BIAS-1; + spin_unlock(&sb_lock); + vfs_dq_off(s, 0); + fs->kill_sb(s); + put_filesystem(fs); + put_super(s); + } else { + up_write(&s->s_umount); + } +} + +EXPORT_SYMBOL(deactivate_locked_super); + /** * grab_super - acquire an active reference * @s: reference we are trying to make active @@ -797,8 +825,7 @@ int get_sb_ns(struct file_system_type *fs_type, int flags, void *data, sb->s_flags = flags; err = fill_super(sb, data, flags & MS_SILENT ? 1 : 0); if (err) { - up_write(&sb->s_umount); - deactivate_super(sb); + deactivate_locked_super(sb); return err; } @@ -854,8 +881,7 @@ int get_sb_bdev(struct file_system_type *fs_type, if (s->s_root) { if ((flags ^ s->s_flags) & MS_RDONLY) { - up_write(&s->s_umount); - deactivate_super(s); + deactivate_locked_super(s); error = -EBUSY; goto error_bdev; } @@ -870,8 +896,7 @@ int get_sb_bdev(struct file_system_type *fs_type, sb_set_blocksize(s, block_size(bdev)); error = fill_super(s, data, flags & MS_SILENT ? 1 : 0); if (error) { - up_write(&s->s_umount); - deactivate_super(s); + deactivate_locked_super(s); goto error; } @@ -921,8 +946,7 @@ int get_sb_nodev(struct file_system_type *fs_type, error = fill_super(s, data, flags & MS_SILENT ? 
1 : 0); if (error) { - up_write(&s->s_umount); - deactivate_super(s); + deactivate_locked_super(s); return error; } s->s_flags |= MS_ACTIVE; @@ -952,8 +976,7 @@ int get_sb_single(struct file_system_type *fs_type, s->s_flags = flags; error = fill_super(s, data, flags & MS_SILENT ? 1 : 0); if (error) { - up_write(&s->s_umount); - deactivate_super(s); + deactivate_locked_super(s); return error; } s->s_flags |= MS_ACTIVE; @@ -1006,8 +1029,7 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void return mnt; out_sb: dput(mnt->mnt_root); - up_write(&mnt->mnt_sb->s_umount); - deactivate_super(mnt->mnt_sb); + deactivate_locked_super(mnt->mnt_sb); out_free_secdata: free_secdata(secdata); out_mnt: diff --git a/include/linux/fs.h b/include/linux/fs.h index 5bed436f4353..11484d08042c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1775,6 +1775,7 @@ void kill_block_super(struct super_block *sb); void kill_anon_super(struct super_block *sb); void kill_litter_super(struct super_block *sb); void deactivate_super(struct super_block *sb); +void deactivate_locked_super(struct super_block *sb); int set_anon_super(struct super_block *s, void *data); struct super_block *sget(struct file_system_type *type, int (*test)(struct super_block *,void *), -- cgit v1.2.3 From db6c1fbb92eeb4cb52c6133e0c533602f49fc4bd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 7 Apr 2009 18:07:08 +0200 Subject: romfs: cleanup romfs_fs.h There's no kernel-only content in it anymore, so move it to header-y and remove the superflous #ifdef __KERNEL__. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- include/linux/Kbuild | 2 +- include/linux/romfs_fs.h | 5 ----- 2 files changed, 1 insertion(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/Kbuild b/include/linux/Kbuild index ca9b9b9bd331..3f0eaa397ef5 100644 --- a/include/linux/Kbuild +++ b/include/linux/Kbuild @@ -138,6 +138,7 @@ header-y += qnxtypes.h header-y += radeonfb.h header-y += raw.h header-y += resource.h +header-y += romfs_fs.h header-y += rose.h header-y += serial_reg.h header-y += smbno.h @@ -314,7 +315,6 @@ unifdef-y += irqnr.h unifdef-y += reboot.h unifdef-y += reiserfs_fs.h unifdef-y += reiserfs_xattr.h -unifdef-y += romfs_fs.h unifdef-y += route.h unifdef-y += rtc.h unifdef-y += rtnetlink.h diff --git a/include/linux/romfs_fs.h b/include/linux/romfs_fs.h index e20bbf9eb365..c490fbc43fe2 100644 --- a/include/linux/romfs_fs.h +++ b/include/linux/romfs_fs.h @@ -53,9 +53,4 @@ struct romfs_inode { #define ROMFH_PAD (ROMFH_SIZE-1) #define ROMFH_MASK (~ROMFH_PAD) -#ifdef __KERNEL__ - -/* Not much now */ - -#endif /* __KERNEL__ */ #endif -- cgit v1.2.3 From 6e8341a11eb21826b7192d0bb88cb5b44900a9af Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 6 Apr 2009 11:16:22 -0400 Subject: Switch open_exec() and sys_uselib() to do_open_filp() ... 
and make path_lookup_open() static Signed-off-by: Al Viro --- fs/exec.c | 72 ++++++++++++++++++--------------------------------- fs/namei.c | 13 +++++----- fs/open.c | 2 +- include/linux/fs.h | 2 +- include/linux/namei.h | 1 - 5 files changed, 34 insertions(+), 56 deletions(-) (limited to 'include/linux') diff --git a/fs/exec.c b/fs/exec.c index 41ae8e0de72d..895823d0149d 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -105,36 +105,28 @@ static inline void put_binfmt(struct linux_binfmt * fmt) SYSCALL_DEFINE1(uselib, const char __user *, library) { struct file *file; - struct nameidata nd; char *tmp = getname(library); int error = PTR_ERR(tmp); - if (!IS_ERR(tmp)) { - error = path_lookup_open(AT_FDCWD, tmp, - LOOKUP_FOLLOW, &nd, - FMODE_READ|FMODE_EXEC); - putname(tmp); - } - if (error) + if (IS_ERR(tmp)) + goto out; + + file = do_filp_open(AT_FDCWD, tmp, + O_LARGEFILE | O_RDONLY | FMODE_EXEC, 0, + MAY_READ | MAY_EXEC | MAY_OPEN); + putname(tmp); + error = PTR_ERR(file); + if (IS_ERR(file)) goto out; error = -EINVAL; - if (!S_ISREG(nd.path.dentry->d_inode->i_mode)) + if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) goto exit; error = -EACCES; - if (nd.path.mnt->mnt_flags & MNT_NOEXEC) - goto exit; - - error = may_open(&nd.path, MAY_READ | MAY_EXEC | MAY_OPEN, 0); - if (error) + if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) goto exit; - file = nameidata_to_filp(&nd, O_RDONLY|O_LARGEFILE); - error = PTR_ERR(file); - if (IS_ERR(file)) - goto out; - fsnotify_open(file->f_path.dentry); error = -ENOEXEC; @@ -156,13 +148,10 @@ SYSCALL_DEFINE1(uselib, const char __user *, library) } read_unlock(&binfmt_lock); } +exit: fput(file); out: return error; -exit: - release_open_intent(&nd); - path_put(&nd.path); - goto out; } #ifdef CONFIG_MMU @@ -657,44 +646,33 @@ EXPORT_SYMBOL(setup_arg_pages); struct file *open_exec(const char *name) { - struct nameidata nd; struct file *file; int err; - err = path_lookup_open(AT_FDCWD, name, LOOKUP_FOLLOW, &nd, - FMODE_READ|FMODE_EXEC); - if (err) + file = do_filp_open(AT_FDCWD, name, + O_LARGEFILE | O_RDONLY | FMODE_EXEC, 0, + MAY_EXEC | MAY_OPEN); + if (IS_ERR(file)) goto out; err = -EACCES; - if (!S_ISREG(nd.path.dentry->d_inode->i_mode)) - goto out_path_put; - - if (nd.path.mnt->mnt_flags & MNT_NOEXEC) - goto out_path_put; - - err = may_open(&nd.path, MAY_EXEC | MAY_OPEN, 0); - if (err) - goto out_path_put; + if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) + goto exit; - file = nameidata_to_filp(&nd, O_RDONLY|O_LARGEFILE); - if (IS_ERR(file)) - return file; + if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) + goto exit; fsnotify_open(file->f_path.dentry); err = deny_write_access(file); - if (err) { - fput(file); - goto out; - } + if (err) + goto exit; +out: return file; - out_path_put: - release_open_intent(&nd); - path_put(&nd.path); - out: +exit: + fput(file); return ERR_PTR(err); } EXPORT_SYMBOL(open_exec); diff --git a/fs/namei.c b/fs/namei.c index 78f253cd2d4f..967c3db92724 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1130,8 +1130,8 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, * @nd: pointer to nameidata * @open_flags: open intent flags */ -int path_lookup_open(int dfd, const char *name, unsigned int lookup_flags, - struct nameidata *nd, int open_flags) +static int path_lookup_open(int dfd, const char *name, + unsigned int lookup_flags, struct nameidata *nd, int open_flags) { struct file *filp = get_empty_filp(); int err; @@ -1637,18 +1637,19 @@ static int open_will_write_to_fs(int flag, struct inode *inode) * open_to_namei_flags() for 
more details. */ struct file *do_filp_open(int dfd, const char *pathname, - int open_flag, int mode) + int open_flag, int mode, int acc_mode) { struct file *filp; struct nameidata nd; - int acc_mode, error; + int error; struct path path; struct dentry *dir; int count = 0; int will_write; int flag = open_to_namei_flags(open_flag); - acc_mode = MAY_OPEN | ACC_MODE(flag); + if (!acc_mode) + acc_mode = MAY_OPEN | ACC_MODE(flag); /* O_TRUNC implies we need access checks for write permissions */ if (flag & O_TRUNC) @@ -1869,7 +1870,7 @@ do_link: */ struct file *filp_open(const char *filename, int flags, int mode) { - return do_filp_open(AT_FDCWD, filename, flags, mode); + return do_filp_open(AT_FDCWD, filename, flags, mode, 0); } EXPORT_SYMBOL(filp_open); diff --git a/fs/open.c b/fs/open.c index 377eb25b6abf..bdfbf03615a4 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1033,7 +1033,7 @@ long do_sys_open(int dfd, const char __user *filename, int flags, int mode) if (!IS_ERR(tmp)) { fd = get_unused_fd_flags(flags); if (fd >= 0) { - struct file *f = do_filp_open(dfd, tmp, flags, mode); + struct file *f = do_filp_open(dfd, tmp, flags, mode, 0); if (IS_ERR(f)) { put_unused_fd(fd); fd = PTR_ERR(f); diff --git a/include/linux/fs.h b/include/linux/fs.h index 11484d08042c..ed788426f464 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2118,7 +2118,7 @@ extern struct file *create_write_pipe(int flags); extern void free_write_pipe(struct file *); extern struct file *do_filp_open(int dfd, const char *pathname, - int open_flag, int mode); + int open_flag, int mode, int acc_mode); extern int may_open(struct path *, int, int); extern int kernel_read(struct file *, unsigned long, char *, unsigned long); diff --git a/include/linux/namei.h b/include/linux/namei.h index fc2e03579877..518098fe63af 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -69,7 +69,6 @@ extern int path_lookup(const char *, unsigned, struct nameidata *); extern int vfs_path_lookup(struct dentry *, struct vfsmount *, const char *, unsigned int, struct nameidata *); -extern int path_lookup_open(int dfd, const char *name, unsigned lookup_flags, struct nameidata *, int open_flags); extern struct file *lookup_instantiate_filp(struct nameidata *nd, struct dentry *dentry, int (*open)(struct inode *, struct file *)); extern struct file *nameidata_to_filp(struct nameidata *nd, int flags); -- cgit v1.2.3 From 2a32cebd6cbcc43996c3e2d114fa32ba1e71192a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 8 May 2009 16:05:57 -0400 Subject: Fix races around the access to ->s_options Put generic_show_options read access to s_options under rcu_read_lock, split save_mount_options() into "we are setting it the first time" (uses in foo_fill_super()) and "we are relacing and freeing the old one", synchronize_rcu() before kfree() in the latter. 
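The scheme is the usual RCU publish-then-retire pattern for a pointer that readers only dereference briefly; below is a stripped-down, generic sketch of the two paths described above (this is not the actual fs/namespace.c code, just the shape of it).

#include <linux/rcupdate.h>
#include <linux/seq_file.h>
#include <linux/slab.h>

static char *opts;			/* written only by replace_opts() */

/* Reader: runs under rcu_read_lock(), never blocks. */
static void show_opts(struct seq_file *m)
{
	const char *o;

	rcu_read_lock();
	o = rcu_dereference(opts);
	if (o && o[0])
		seq_puts(m, o);
	rcu_read_unlock();
}

/* Writer: publish the new string, then wait out any readers still
 * looking at the old one before freeing it. */
static void replace_opts(char *new)
{
	char *old = opts;

	rcu_assign_pointer(opts, new);
	if (old) {
		synchronize_rcu();
		kfree(old);
	}
}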
Signed-off-by: Al Viro --- drivers/isdn/capi/capifs.c | 3 +-- fs/affs/super.c | 3 +-- fs/afs/super.c | 4 ++-- fs/hpfs/super.c | 3 +-- fs/namespace.c | 21 ++++++++++++++++++--- fs/reiserfs/super.c | 3 +-- include/linux/fs.h | 1 + 7 files changed, 25 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/isdn/capi/capifs.c b/drivers/isdn/capi/capifs.c index b129409925af..8f9f3b5a3e8c 100644 --- a/drivers/isdn/capi/capifs.c +++ b/drivers/isdn/capi/capifs.c @@ -75,8 +75,7 @@ static int capifs_remount(struct super_block *s, int *flags, char *data) } } - kfree(s->s_options); - s->s_options = new_opt; + replace_mount_options(s, new_opt); config.setuid = setuid; config.setgid = setgid; diff --git a/fs/affs/super.c b/fs/affs/super.c index 5ce695e707fe..63f5183f263b 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -507,8 +507,7 @@ affs_remount(struct super_block *sb, int *flags, char *data) kfree(new_opts); return -EINVAL; } - kfree(sb->s_options); - sb->s_options = new_opts; + replace_mount_options(sb, new_opts); sbi->s_flags = mount_flags; sbi->s_mode = mode; diff --git a/fs/afs/super.c b/fs/afs/super.c index 2753f16dd315..76828e5f8a39 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -408,17 +408,17 @@ static int afs_get_sb(struct file_system_type *fs_type, deactivate_locked_super(sb); goto error; } - sb->s_options = new_opts; + save_mount_options(sb, new_opts); sb->s_flags |= MS_ACTIVE; } else { _debug("reuse"); - kfree(new_opts); ASSERTCMP(sb->s_flags, &, MS_ACTIVE); } simple_set_mnt(mnt, sb); afs_put_volume(params.volume); afs_put_cell(params.cell); + kfree(new_opts); _leave(" = 0 [%p]", sb); return 0; diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index fecf402d7b8a..fc77965be841 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -423,8 +423,7 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data) if (!(*flags & MS_RDONLY)) mark_dirty(s); - kfree(s->s_options); - s->s_options = new_opts; + replace_mount_options(s, new_opts); return 0; diff --git a/fs/namespace.c b/fs/namespace.c index 0d2003fb4377..134d494158d9 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -695,12 +695,16 @@ static inline void mangle(struct seq_file *m, const char *s) */ int generic_show_options(struct seq_file *m, struct vfsmount *mnt) { - const char *options = mnt->mnt_sb->s_options; + const char *options; + + rcu_read_lock(); + options = rcu_dereference(mnt->mnt_sb->s_options); if (options != NULL && options[0]) { seq_putc(m, ','); mangle(m, options); } + rcu_read_unlock(); return 0; } @@ -721,11 +725,22 @@ EXPORT_SYMBOL(generic_show_options); */ void save_mount_options(struct super_block *sb, char *options) { - kfree(sb->s_options); - sb->s_options = kstrdup(options, GFP_KERNEL); + BUG_ON(sb->s_options); + rcu_assign_pointer(sb->s_options, kstrdup(options, GFP_KERNEL)); } EXPORT_SYMBOL(save_mount_options); +void replace_mount_options(struct super_block *sb, char *options) +{ + char *old = sb->s_options; + rcu_assign_pointer(sb->s_options, options); + if (old) { + synchronize_rcu(); + kfree(old); + } +} +EXPORT_SYMBOL(replace_mount_options); + #ifdef CONFIG_PROC_FS /* iterator */ static void *m_start(struct seq_file *m, loff_t *pos) diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index d444fe0013a4..1215a4f50cd2 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -1316,8 +1316,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) } out_ok: - kfree(s->s_options); - s->s_options = new_opts; + 
replace_mount_options(s, new_opts); return 0; out_err: diff --git a/include/linux/fs.h b/include/linux/fs.h index ed788426f464..3b534e527e09 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2368,6 +2368,7 @@ extern void file_update_time(struct file *file); extern int generic_show_options(struct seq_file *m, struct vfsmount *mnt); extern void save_mount_options(struct super_block *sb, char *options); +extern void replace_mount_options(struct super_block *sb, char *options); static inline ino_t parent_ino(struct dentry *dentry) { -- cgit v1.2.3 From ecf4667d30dd63fa130e22f8f2da3e6ce003358b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 12 May 2009 13:19:37 -0700 Subject: syscalls.h add the missing sys_pipe2 declaration In order to build the generic syscall table, we need a declaration for every system call. sys_pipe2 was added without a proper declaration, so add this to syscalls.h now. Signed-off-by: Arnd Bergmann Cc: Ulrich Drepper Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/syscalls.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 40617c1d8976..30520844b8da 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -433,6 +433,7 @@ asmlinkage long sys_fcntl(unsigned int fd, unsigned int cmd, unsigned long arg); asmlinkage long sys_fcntl64(unsigned int fd, unsigned int cmd, unsigned long arg); #endif +asmlinkage long sys_pipe2(int __user *fildes, int flags); asmlinkage long sys_dup(unsigned int fildes); asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd); asmlinkage long sys_dup3(unsigned int oldfd, unsigned int newfd, int flags); -- cgit v1.2.3 From 4f005dbe5584fe54c9f6d6d4f0acd3fb29be84da Mon Sep 17 00:00:00 2001 From: Maciej Sosnowski Date: Thu, 23 Apr 2009 12:31:51 +0200 Subject: ioatdma: fix "ioatdma frees DMA memory with wrong function" as reported by Alexander Beregalov ioatdma 0000:00:08.0: DMA-API: device driver frees DMA memory with wrong function [device address=0x000000007f76f800] [size=2000 bytes] [map ped as single] [unmapped as page] The ioatdma driver was unmapping all regions (either allocated as page or single) using unmap_page. This patch lets dma driver recognize if unmap_single or unmap_page should be used. It introduces two new dma control flags: DMA_COMPL_SRC_UNMAP_SINGLE and DMA_COMPL_DEST_UNMAP_SINGLE. They should be set to indicate dma driver to do dma-unmapping as single (first one for the source, tha latter for the destination). If respective flag is not set, the driver assumes dma-unmapping as page. 
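From a dmaengine client's point of view the new flags are simply OR'ed into the control flags passed to the prep routine whenever the buffers were mapped with dma_map_single(). A minimal sketch, assuming chan, src, dst and len are already set up; it mirrors the buf-to-buf case touched by this patch and uses no API beyond the two flags added here.

	struct dma_device *dev = chan->device;
	struct dma_async_tx_descriptor *tx;
	dma_addr_t dma_src, dma_dst;
	unsigned long flags;

	dma_src = dma_map_single(dev->dev, src, len, DMA_TO_DEVICE);
	dma_dst = dma_map_single(dev->dev, dst, len, DMA_FROM_DEVICE);

	/* Both regions were mapped as "single", so ask the driver to
	 * unmap them with dma_unmap_single()/pci_unmap_single() when
	 * the copy completes. */
	flags = DMA_CTRL_ACK |
		DMA_COMPL_SRC_UNMAP_SINGLE |
		DMA_COMPL_DEST_UNMAP_SINGLE;

	tx = dev->device_prep_dma_memcpy(chan, dma_dst, dma_src, len, flags);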
Signed-off-by: Maciej Sosnowski Reported-by: Alexander Beregalov Tested-by: Alexander Beregalov Signed-off-by: Dan Williams --- drivers/dma/dmaengine.c | 17 +++++++++++------ drivers/dma/ioat_dma.c | 45 ++++++++++++++++++++++++++++----------------- include/linux/dmaengine.h | 6 ++++++ 3 files changed, 45 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 92438e9dacc3..5a87384ea4ff 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -804,11 +804,14 @@ dma_async_memcpy_buf_to_buf(struct dma_chan *chan, void *dest, dma_addr_t dma_dest, dma_src; dma_cookie_t cookie; int cpu; + unsigned long flags; dma_src = dma_map_single(dev->dev, src, len, DMA_TO_DEVICE); dma_dest = dma_map_single(dev->dev, dest, len, DMA_FROM_DEVICE); - tx = dev->device_prep_dma_memcpy(chan, dma_dest, dma_src, len, - DMA_CTRL_ACK); + flags = DMA_CTRL_ACK | + DMA_COMPL_SRC_UNMAP_SINGLE | + DMA_COMPL_DEST_UNMAP_SINGLE; + tx = dev->device_prep_dma_memcpy(chan, dma_dest, dma_src, len, flags); if (!tx) { dma_unmap_single(dev->dev, dma_src, len, DMA_TO_DEVICE); @@ -850,11 +853,12 @@ dma_async_memcpy_buf_to_pg(struct dma_chan *chan, struct page *page, dma_addr_t dma_dest, dma_src; dma_cookie_t cookie; int cpu; + unsigned long flags; dma_src = dma_map_single(dev->dev, kdata, len, DMA_TO_DEVICE); dma_dest = dma_map_page(dev->dev, page, offset, len, DMA_FROM_DEVICE); - tx = dev->device_prep_dma_memcpy(chan, dma_dest, dma_src, len, - DMA_CTRL_ACK); + flags = DMA_CTRL_ACK | DMA_COMPL_SRC_UNMAP_SINGLE; + tx = dev->device_prep_dma_memcpy(chan, dma_dest, dma_src, len, flags); if (!tx) { dma_unmap_single(dev->dev, dma_src, len, DMA_TO_DEVICE); @@ -898,12 +902,13 @@ dma_async_memcpy_pg_to_pg(struct dma_chan *chan, struct page *dest_pg, dma_addr_t dma_dest, dma_src; dma_cookie_t cookie; int cpu; + unsigned long flags; dma_src = dma_map_page(dev->dev, src_pg, src_off, len, DMA_TO_DEVICE); dma_dest = dma_map_page(dev->dev, dest_pg, dest_off, len, DMA_FROM_DEVICE); - tx = dev->device_prep_dma_memcpy(chan, dma_dest, dma_src, len, - DMA_CTRL_ACK); + flags = DMA_CTRL_ACK; + tx = dev->device_prep_dma_memcpy(chan, dma_dest, dma_src, len, flags); if (!tx) { dma_unmap_page(dev->dev, dma_src, len, DMA_TO_DEVICE); diff --git a/drivers/dma/ioat_dma.c b/drivers/dma/ioat_dma.c index e4fc33c1c32f..1955ee8d6d20 100644 --- a/drivers/dma/ioat_dma.c +++ b/drivers/dma/ioat_dma.c @@ -1063,22 +1063,31 @@ static void ioat_dma_cleanup_tasklet(unsigned long data) static void ioat_dma_unmap(struct ioat_dma_chan *ioat_chan, struct ioat_desc_sw *desc) { - /* - * yes we are unmapping both _page and _single - * alloc'd regions with unmap_page. Is this - * *really* that bad? 
- */ - if (!(desc->async_tx.flags & DMA_COMPL_SKIP_DEST_UNMAP)) - pci_unmap_page(ioat_chan->device->pdev, - pci_unmap_addr(desc, dst), - pci_unmap_len(desc, len), - PCI_DMA_FROMDEVICE); - - if (!(desc->async_tx.flags & DMA_COMPL_SKIP_SRC_UNMAP)) - pci_unmap_page(ioat_chan->device->pdev, - pci_unmap_addr(desc, src), - pci_unmap_len(desc, len), - PCI_DMA_TODEVICE); + if (!(desc->async_tx.flags & DMA_COMPL_SKIP_DEST_UNMAP)) { + if (desc->async_tx.flags & DMA_COMPL_DEST_UNMAP_SINGLE) + pci_unmap_single(ioat_chan->device->pdev, + pci_unmap_addr(desc, dst), + pci_unmap_len(desc, len), + PCI_DMA_FROMDEVICE); + else + pci_unmap_page(ioat_chan->device->pdev, + pci_unmap_addr(desc, dst), + pci_unmap_len(desc, len), + PCI_DMA_FROMDEVICE); + } + + if (!(desc->async_tx.flags & DMA_COMPL_SKIP_SRC_UNMAP)) { + if (desc->async_tx.flags & DMA_COMPL_SRC_UNMAP_SINGLE) + pci_unmap_single(ioat_chan->device->pdev, + pci_unmap_addr(desc, src), + pci_unmap_len(desc, len), + PCI_DMA_TODEVICE); + else + pci_unmap_page(ioat_chan->device->pdev, + pci_unmap_addr(desc, src), + pci_unmap_len(desc, len), + PCI_DMA_TODEVICE); + } } /** @@ -1363,6 +1372,7 @@ static int ioat_dma_self_test(struct ioatdma_device *device) int err = 0; struct completion cmp; unsigned long tmo; + unsigned long flags; src = kzalloc(sizeof(u8) * IOAT_TEST_SIZE, GFP_KERNEL); if (!src) @@ -1392,8 +1402,9 @@ static int ioat_dma_self_test(struct ioatdma_device *device) DMA_TO_DEVICE); dma_dest = dma_map_single(dma_chan->device->dev, dest, IOAT_TEST_SIZE, DMA_FROM_DEVICE); + flags = DMA_COMPL_SRC_UNMAP_SINGLE | DMA_COMPL_DEST_UNMAP_SINGLE; tx = device->common.device_prep_dma_memcpy(dma_chan, dma_dest, dma_src, - IOAT_TEST_SIZE, 0); + IOAT_TEST_SIZE, flags); if (!tx) { dev_err(&device->pdev->dev, "Self-test prep failed, disabling\n"); diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 2e2aa3df170c..ffefba81c818 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -78,12 +78,18 @@ enum dma_transaction_type { * dependency chains * @DMA_COMPL_SKIP_SRC_UNMAP - set to disable dma-unmapping the source buffer(s) * @DMA_COMPL_SKIP_DEST_UNMAP - set to disable dma-unmapping the destination(s) + * @DMA_COMPL_SRC_UNMAP_SINGLE - set to do the source dma-unmapping as single + * (if not set, do the source dma-unmapping as page) + * @DMA_COMPL_DEST_UNMAP_SINGLE - set to do the destination dma-unmapping as single + * (if not set, do the destination dma-unmapping as page) */ enum dma_ctrl_flags { DMA_PREP_INTERRUPT = (1 << 0), DMA_CTRL_ACK = (1 << 1), DMA_COMPL_SKIP_SRC_UNMAP = (1 << 2), DMA_COMPL_SKIP_DEST_UNMAP = (1 << 3), + DMA_COMPL_SRC_UNMAP_SINGLE = (1 << 4), + DMA_COMPL_DEST_UNMAP_SINGLE = (1 << 5), }; /** -- cgit v1.2.3 From cd17cbfda004fe5f406c01b318c6378d9895896f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 15 May 2009 11:32:24 +0200 Subject: Revert "mm: add /proc controls for pdflush threads" This reverts commit fafd688e4c0c34da0f3de909881117d374e4c7af. Work is progressing to switch away from pdflush as the process backing for flushing out dirty data. So it seems pointless to add more knobs to control pdflush threads. The original author of the patch did not have any specific use cases for adding the knobs, so we can easily revert this before 2.6.30 to avoid having to maintain this API forever. 
Signed-off-by: Jens Axboe --- Documentation/sysctl/vm.txt | 28 ---------------------------- include/linux/writeback.h | 2 -- kernel/sysctl.c | 23 ----------------------- mm/pdflush.c | 31 ++++++++++++------------------- 4 files changed, 12 insertions(+), 72 deletions(-) (limited to 'include/linux') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index b716d33912d8..c302ddf629a0 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -39,8 +39,6 @@ Currently, these files are in /proc/sys/vm: - nr_hugepages - nr_overcommit_hugepages - nr_pdflush_threads -- nr_pdflush_threads_min -- nr_pdflush_threads_max - nr_trim_pages (only if CONFIG_MMU=n) - numa_zonelist_order - oom_dump_tasks @@ -469,32 +467,6 @@ The default value is 0. ============================================================== -nr_pdflush_threads_min - -This value controls the minimum number of pdflush threads. - -At boot time, the kernel will create and maintain 'nr_pdflush_threads_min' -threads for the kernel's lifetime. - -The default value is 2. The minimum value you can specify is 1, and -the maximum value is the current setting of 'nr_pdflush_threads_max'. - -See 'nr_pdflush_threads_max' below for more information. - -============================================================== - -nr_pdflush_threads_max - -This value controls the maximum number of pdflush threads that can be -created. The pdflush algorithm will create a new pdflush thread (up to -this maximum) if no pdflush threads have been available for >= 1 second. - -The default value is 8. The minimum value you can specify is the -current value of 'nr_pdflush_threads_min' and the -maximum is 1000. - -============================================================== - overcommit_memory: This value contains a flag that enables memory overcommitment. diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 9c1ed1fb6ddb..93445477f86a 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -168,8 +168,6 @@ void writeback_set_ratelimit(void); /* pdflush.c */ extern int nr_pdflush_threads; /* Global so it can be exported to sysctl read-only. 
*/ -extern int nr_pdflush_threads_max; /* Global so it can be exported to sysctl */ -extern int nr_pdflush_threads_min; /* Global so it can be exported to sysctl */ #endif /* WRITEBACK_H */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ea78fa101ad6..b2970d56fb76 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -101,7 +101,6 @@ static int __maybe_unused one = 1; static int __maybe_unused two = 2; static unsigned long one_ul = 1; static int one_hundred = 100; -static int one_thousand = 1000; /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; @@ -1033,28 +1032,6 @@ static struct ctl_table vm_table[] = { .mode = 0444 /* read-only*/, .proc_handler = &proc_dointvec, }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "nr_pdflush_threads_min", - .data = &nr_pdflush_threads_min, - .maxlen = sizeof nr_pdflush_threads_min, - .mode = 0644 /* read-write */, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &one, - .extra2 = &nr_pdflush_threads_max, - }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "nr_pdflush_threads_max", - .data = &nr_pdflush_threads_max, - .maxlen = sizeof nr_pdflush_threads_max, - .mode = 0644 /* read-write */, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &nr_pdflush_threads_min, - .extra2 = &one_thousand, - }, { .ctl_name = VM_SWAPPINESS, .procname = "swappiness", diff --git a/mm/pdflush.c b/mm/pdflush.c index f2caf96993f8..235ac440c44e 100644 --- a/mm/pdflush.c +++ b/mm/pdflush.c @@ -57,14 +57,6 @@ static DEFINE_SPINLOCK(pdflush_lock); */ int nr_pdflush_threads = 0; -/* - * The max/min number of pdflush threads. R/W by sysctl at - * /proc/sys/vm/nr_pdflush_threads_max/min - */ -int nr_pdflush_threads_max __read_mostly = MAX_PDFLUSH_THREADS; -int nr_pdflush_threads_min __read_mostly = MIN_PDFLUSH_THREADS; - - /* * The time at which the pdflush thread pool last went empty */ @@ -76,7 +68,7 @@ static unsigned long last_empty_jifs; * Thread pool management algorithm: * * - The minimum and maximum number of pdflush instances are bound - * by nr_pdflush_threads_min and nr_pdflush_threads_max. + * by MIN_PDFLUSH_THREADS and MAX_PDFLUSH_THREADS. * * - If there have been no idle pdflush instances for 1 second, create * a new one. @@ -142,13 +134,14 @@ static int __pdflush(struct pdflush_work *my_work) * To throttle creation, we reset last_empty_jifs. */ if (time_after(jiffies, last_empty_jifs + 1 * HZ)) { - if (list_empty(&pdflush_list) && - nr_pdflush_threads < nr_pdflush_threads_max) { - last_empty_jifs = jiffies; - nr_pdflush_threads++; - spin_unlock_irq(&pdflush_lock); - start_one_pdflush_thread(); - spin_lock_irq(&pdflush_lock); + if (list_empty(&pdflush_list)) { + if (nr_pdflush_threads < MAX_PDFLUSH_THREADS) { + last_empty_jifs = jiffies; + nr_pdflush_threads++; + spin_unlock_irq(&pdflush_lock); + start_one_pdflush_thread(); + spin_lock_irq(&pdflush_lock); + } } } @@ -160,7 +153,7 @@ static int __pdflush(struct pdflush_work *my_work) */ if (list_empty(&pdflush_list)) continue; - if (nr_pdflush_threads <= nr_pdflush_threads_min) + if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS) continue; pdf = list_entry(pdflush_list.prev, struct pdflush_work, list); if (time_after(jiffies, pdf->when_i_went_to_sleep + 1 * HZ)) { @@ -266,9 +259,9 @@ static int __init pdflush_init(void) * Pre-set nr_pdflush_threads... If we fail to create, * the count will be decremented. 
*/ - nr_pdflush_threads = nr_pdflush_threads_min; + nr_pdflush_threads = MIN_PDFLUSH_THREADS; - for (i = 0; i < nr_pdflush_threads_min; i++) + for (i = 0; i < MIN_PDFLUSH_THREADS; i++) start_one_pdflush_thread(); return 0; } -- cgit v1.2.3 From 4bca3286433585b5f1c3e7d8ac37a2f4b3def9ca Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Fri, 15 May 2009 00:40:35 -0400 Subject: libata: Media rotation rate and form factor heuristics This patch provides new heuristics for parsing both the form factor and media rotation rate ATA IDENFITY words. The reported ATA version must be 7 or greater and the device must return values defined as valid in the standard. Only then are the characteristics reported to SCSI via the VPD B1 page. This seems like a reasonable compromise to me considering that we have been shipping several kernel releases that key off the rotation rate bit without any version checking whatsoever. With no complaints so far. Signed-off-by: Martin K. Petersen Signed-off-by: Jeff Garzik --- drivers/ata/libata-scsi.c | 11 ++++++----- include/linux/ata.h | 28 ++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index d1718a1f278a..342316064e9f 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -2142,13 +2142,14 @@ static unsigned int ata_scsiop_inq_89(struct ata_scsi_args *args, u8 *rbuf) static unsigned int ata_scsiop_inq_b1(struct ata_scsi_args *args, u8 *rbuf) { + int form_factor = ata_id_form_factor(args->id); + int media_rotation_rate = ata_id_rotation_rate(args->id); + rbuf[1] = 0xb1; rbuf[3] = 0x3c; - if (ata_id_major_version(args->id) > 7) { - rbuf[4] = args->id[217] >> 8; - rbuf[5] = args->id[217]; - rbuf[7] = args->id[168] & 0xf; - } + rbuf[4] = media_rotation_rate >> 8; + rbuf[5] = media_rotation_rate; + rbuf[7] = form_factor; return 0; } diff --git a/include/linux/ata.h b/include/linux/ata.h index cb79b7a208e1..915da43edee1 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -730,6 +730,34 @@ static inline int ata_id_has_unload(const u16 *id) return 0; } +static inline int ata_id_form_factor(const u16 *id) +{ + u16 val = id[168]; + + if (ata_id_major_version(id) < 7 || val == 0 || val == 0xffff) + return 0; + + val &= 0xf; + + if (val > 5) + return 0; + + return val; +} + +static inline int ata_id_rotation_rate(const u16 *id) +{ + u16 val = id[217]; + + if (ata_id_major_version(id) < 7 || val == 0 || val == 0xffff) + return 0; + + if (val > 1 && val < 0x401) + return 0; + + return val; +} + static inline int ata_id_has_trim(const u16 *id) { if (ata_id_major_version(id) >= 7 && -- cgit v1.2.3 From b83674c0da6558e357c6b482ccf299eeea77d8ef Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Sun, 17 May 2009 01:02:03 -0400 Subject: reiserfs: fixup perms when xattrs are disabled This adds CONFIG_REISERFS_FS_XATTR protection from reiserfs_permission. This is needed to avoid warnings during file deletions and chowns with xattrs disabled. 
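For illustration only, not part of the patch: the reason reiserfs_permission() has to exist even without xattr support is that it is wired into the filesystem's inode_operations tables, where the old "#define reiserfs_permission NULL" stub used to be substituted. A condensed sketch of that caller-side wiring follows; the demo_* name is made up and the real tables in fs/reiserfs/ carry many more methods.

#include <linux/fs.h>
#include <linux/reiserfs_xattr.h>

/* Hypothetical table for illustration: with reiserfs_permission() declared
 * unconditionally, the same initializer builds whether or not
 * CONFIG_REISERFS_FS_XATTR is set. */
static const struct inode_operations demo_reiserfs_iops = {
	.permission	= reiserfs_permission,
	/* .setattr, .setxattr, ... elided */
};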
Signed-off-by: Jeff Mahoney Signed-off-by: Linus Torvalds --- fs/reiserfs/xattr.c | 36 +++++++++++++++++++----------------- include/linux/reiserfs_xattr.h | 4 +--- 2 files changed, 20 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 628075ca82c1..8e7deb0e6964 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -871,23 +871,6 @@ static int reiserfs_check_acl(struct inode *inode, int mask) return error; } -int reiserfs_permission(struct inode *inode, int mask) -{ - /* - * We don't do permission checks on the internal objects. - * Permissions are determined by the "owning" object. - */ - if (IS_PRIVATE(inode)) - return 0; - /* - * Stat data v1 doesn't support ACLs. - */ - if (get_inode_sd_version(inode) == STAT_DATA_V1) - return generic_permission(inode, mask, NULL); - else - return generic_permission(inode, mask, reiserfs_check_acl); -} - static int create_privroot(struct dentry *dentry) { int err; @@ -951,6 +934,25 @@ static int xattr_mount_check(struct super_block *s) return 0; } +int reiserfs_permission(struct inode *inode, int mask) +{ + /* + * We don't do permission checks on the internal objects. + * Permissions are determined by the "owning" object. + */ + if (IS_PRIVATE(inode)) + return 0; + +#ifdef CONFIG_REISERFS_FS_XATTR + /* + * Stat data v1 doesn't support ACLs. + */ + if (get_inode_sd_version(inode) != STAT_DATA_V1) + return generic_permission(inode, mask, reiserfs_check_acl); +#endif + return generic_permission(inode, mask, NULL); +} + /* This will catch lookups from the fs root to .reiserfs_priv */ static int xattr_lookup_poison(struct dentry *dentry, struct qstr *q1, struct qstr *name) diff --git a/include/linux/reiserfs_xattr.h b/include/linux/reiserfs_xattr.h index cdedc01036e4..99928dce37ea 100644 --- a/include/linux/reiserfs_xattr.h +++ b/include/linux/reiserfs_xattr.h @@ -41,6 +41,7 @@ int reiserfs_xattr_init(struct super_block *sb, int mount_flags); int reiserfs_lookup_privroot(struct super_block *sb); int reiserfs_delete_xattrs(struct inode *inode); int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs); +int reiserfs_permission(struct inode *inode, int mask); #ifdef CONFIG_REISERFS_FS_XATTR #define has_xattr_dir(inode) (REISERFS_I(inode)->i_flags & i_has_xattr_dir) @@ -50,7 +51,6 @@ int reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags); ssize_t reiserfs_listxattr(struct dentry *dentry, char *buffer, size_t size); int reiserfs_removexattr(struct dentry *dentry, const char *name); -int reiserfs_permission(struct inode *inode, int mask); int reiserfs_xattr_get(struct inode *, const char *, void *, size_t); int reiserfs_xattr_set(struct inode *, const char *, const void *, size_t, int); @@ -117,8 +117,6 @@ static inline void reiserfs_init_xattr_rwsem(struct inode *inode) #define reiserfs_listxattr NULL #define reiserfs_removexattr NULL -#define reiserfs_permission NULL - static inline void reiserfs_init_xattr_rwsem(struct inode *inode) { } -- cgit v1.2.3 From eb33575cf67d3f35fa2510210ef92631266e2465 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 13 May 2009 17:34:48 +0100 Subject: [ARM] Double check memmap is actually valid with a memmap has unexpected holes V2 pfn_valid() is meant to be able to tell if a given PFN has valid memmap associated with it or not. In FLATMEM, it is expected that holes always have valid memmap as long as there is valid PFNs either side of the hole. 
In SPARSEMEM, it is assumed that a valid section has a memmap for the entire section. However, ARM and maybe other embedded architectures in the future free memmap backing holes to save memory on the assumption the memmap is never used. The page_zone linkages are then broken even though pfn_valid() returns true. A walker of the full memmap must then do this additional check to ensure the memmap they are looking at is sane by making sure the zone and PFN linkages are still valid. This is expensive, but walkers of the full memmap are extremely rare. This was caught before for FLATMEM and hacked around but it hits again for SPARSEMEM because the page_zone linkages can look ok where the PFN linkages are totally screwed. This looks like a hatchet job but the reality is that any clean solution would end up consuming all the memory saved by punching these unexpected holes in the memmap. For example, we tried marking the memmap within the section invalid but the section size exceeds the size of the hole in most cases so pfn_valid() starts returning false where valid memmap exists. Shrinking the size of the section would increase memory consumption offsetting the gains. This patch identifies when an architecture is punching unexpected holes in the memmap that the memory model cannot automatically detect and sets ARCH_HAS_HOLES_MEMORYMODEL. At the moment, this is restricted to EP93xx which is the model sub-architecture this has been reported on but may expand later. When set, walkers of the full memmap must call memmap_valid_within() for each PFN, passing in what it expects the page and zone to be for that PFN. If it finds the linkages to be broken, it assumes the memmap is invalid for that PFN. Signed-off-by: Mel Gorman Signed-off-by: Russell King --- arch/arm/Kconfig | 6 +++--- include/linux/mmzone.h | 26 ++++++++++++++++++++++++++ mm/mmzone.c | 15 +++++++++++++++ mm/vmstat.c | 19 ++++--------------- 4 files changed, 48 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index e60ec54df334..9d02cdb15b23 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -273,6 +273,7 @@ config ARCH_EP93XX select HAVE_CLK select COMMON_CLKDEV select ARCH_REQUIRE_GPIOLIB + select ARCH_HAS_HOLES_MEMORYMODEL help This enables support for the Cirrus EP93xx series of CPUs. @@ -976,10 +977,9 @@ config OABI_COMPAT UNPREDICTABLE (in fact it can be predicted that it won't work at all). If in doubt say Y. -config ARCH_FLATMEM_HAS_HOLES +config ARCH_HAS_HOLES_MEMORYMODEL bool - default y - depends on FLATMEM + default n # Discontigmem is deprecated config ARCH_DISCONTIGMEM_ENABLE diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 186ec6ab334d..a47c879e1304 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1097,6 +1097,32 @@ unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long); #define pfn_valid_within(pfn) (1) #endif +#ifdef CONFIG_ARCH_HAS_HOLES_MEMORYMODEL +/* + * pfn_valid() is meant to be able to tell if a given PFN has valid memmap + * associated with it or not. In FLATMEM, it is expected that holes always + * have valid memmap as long as there is valid PFNs either side of the hole. + * In SPARSEMEM, it is assumed that a valid section has a memmap for the + * entire section. + * + * However, an ARM, and maybe other embedded architectures in the future + * free memmap backing holes to save memory on the assumption the memmap is + * never used.
The page_zone linkages are then broken even though pfn_valid() + * returns true. A walker of the full memmap must then do this additional + * check to ensure the memmap they are looking at is sane by making sure + * the zone and PFN linkages are still valid. This is expensive, but walkers + * of the full memmap are extremely rare. + */ +int memmap_valid_within(unsigned long pfn, + struct page *page, struct zone *zone); +#else +static inline int memmap_valid_within(unsigned long pfn, + struct page *page, struct zone *zone) +{ + return 1; +} +#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ + #endif /* !__GENERATING_BOUNDS.H */ #endif /* !__ASSEMBLY__ */ #endif /* _LINUX_MMZONE_H */ diff --git a/mm/mmzone.c b/mm/mmzone.c index 16ce8b955dcf..f5b7d1760213 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -6,6 +6,7 @@ #include +#include #include #include @@ -72,3 +73,17 @@ struct zoneref *next_zones_zonelist(struct zoneref *z, *zone = zonelist_zone(z); return z; } + +#ifdef CONFIG_ARCH_HAS_HOLES_MEMORYMODEL +int memmap_valid_within(unsigned long pfn, + struct page *page, struct zone *zone) +{ + if (page_to_pfn(page) != pfn) + return 0; + + if (page_zone(page) != zone) + return 0; + + return 1; +} +#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ diff --git a/mm/vmstat.c b/mm/vmstat.c index 66f6130976cb..74d66dba0cbe 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -509,22 +509,11 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m, continue; page = pfn_to_page(pfn); -#ifdef CONFIG_ARCH_FLATMEM_HAS_HOLES - /* - * Ordinarily, memory holes in flatmem still have a valid - * memmap for the PFN range. However, an architecture for - * embedded systems (e.g. ARM) can free up the memmap backing - * holes to save memory on the assumption the memmap is - * never used. The page_zone linkages are then broken even - * though pfn_valid() returns true. Skip the page if the - * linkages are broken. Even if this test passed, the impact - * is that the counters for the movable type are off but - * fragmentation monitoring is likely meaningless on small - * systems. - */ - if (page_zone(page) != zone) + + /* Watch for unexpected holes punched in the memmap */ + if (!memmap_valid_within(pfn, page, zone)) continue; -#endif + mtype = get_pageblock_migratetype(page); if (mtype < MIGRATE_TYPES) -- cgit v1.2.3 From 03fbdb15c14e9746c63168e3ff2c64b9c8336d33 Mon Sep 17 00:00:00 2001 From: Alessandro Rubini Date: Wed, 20 May 2009 22:39:08 +0100 Subject: [ARM] 5519/1: amba probe: pass "struct amba_id *" instead of void * The second argument of the probe method points to the amba_id structure, so it's better passed with the correct type. None of the current in-tree drivers uses the pointer, so they have only been checked for a clean compile. Change suggested by Russell King. 
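For illustration only, not part of the patch: what a driver probe looks like with the typed second argument. The demo_* names and the periphid value are made up; only the amba bus API is assumed.

#include <linux/init.h>
#include <linux/module.h>
#include <linux/amba/bus.h>

static int demo_probe(struct amba_device *dev, struct amba_id *id)
{
	/* The matching table entry now arrives with its real type, so a
	 * driver that cares can look at id->id, id->mask or id->data. */
	dev_info(&dev->dev, "bound, matched id %08x\n", id->id);
	return 0;
}

static int demo_remove(struct amba_device *dev)
{
	return 0;
}

static struct amba_id demo_ids[] = {
	{ .id = 0x00040123, .mask = 0x000fffff },	/* hypothetical periphid */
	{ 0, 0 },
};

static struct amba_driver demo_driver = {
	.drv		= { .name = "demo-amba" },
	.probe		= demo_probe,
	.remove		= demo_remove,
	.id_table	= demo_ids,
};

static int __init demo_init(void)
{
	return amba_driver_register(&demo_driver);
}
module_init(demo_init);

static void __exit demo_exit(void)
{
	amba_driver_unregister(&demo_driver);
}
module_exit(demo_exit);
MODULE_LICENSE("GPL");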
Signed-off-by: Alessandro Rubini Signed-off-by: Russell King --- drivers/input/serio/ambakmi.c | 2 +- drivers/mmc/host/mmci.c | 2 +- drivers/rtc/rtc-pl030.c | 2 +- drivers/rtc/rtc-pl031.c | 2 +- drivers/serial/amba-pl010.c | 2 +- drivers/serial/amba-pl011.c | 2 +- drivers/video/amba-clcd.c | 2 +- include/linux/amba/bus.h | 2 +- sound/arm/aaci.c | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/input/serio/ambakmi.c b/drivers/input/serio/ambakmi.c index e29cdc13a199..a28c06d686e1 100644 --- a/drivers/input/serio/ambakmi.c +++ b/drivers/input/serio/ambakmi.c @@ -107,7 +107,7 @@ static void amba_kmi_close(struct serio *io) clk_disable(kmi->clk); } -static int amba_kmi_probe(struct amba_device *dev, void *id) +static int amba_kmi_probe(struct amba_device *dev, struct amba_id *id) { struct amba_kmi_port *kmi; struct serio *io; diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c index 36875dcfa492..7d4febdab286 100644 --- a/drivers/mmc/host/mmci.c +++ b/drivers/mmc/host/mmci.c @@ -490,7 +490,7 @@ static void mmci_check_status(unsigned long data) mod_timer(&host->timer, jiffies + HZ); } -static int __devinit mmci_probe(struct amba_device *dev, void *id) +static int __devinit mmci_probe(struct amba_device *dev, struct amba_id *id) { struct mmc_platform_data *plat = dev->dev.platform_data; struct mmci_host *host; diff --git a/drivers/rtc/rtc-pl030.c b/drivers/rtc/rtc-pl030.c index 826153552157..aaf1f75fa293 100644 --- a/drivers/rtc/rtc-pl030.c +++ b/drivers/rtc/rtc-pl030.c @@ -102,7 +102,7 @@ static const struct rtc_class_ops pl030_ops = { .set_alarm = pl030_set_alarm, }; -static int pl030_probe(struct amba_device *dev, void *id) +static int pl030_probe(struct amba_device *dev, struct amba_id *id) { struct pl030_rtc *rtc; int ret; diff --git a/drivers/rtc/rtc-pl031.c b/drivers/rtc/rtc-pl031.c index 333eec689d2f..451fc13784d1 100644 --- a/drivers/rtc/rtc-pl031.c +++ b/drivers/rtc/rtc-pl031.c @@ -127,7 +127,7 @@ static int pl031_remove(struct amba_device *adev) return 0; } -static int pl031_probe(struct amba_device *adev, void *id) +static int pl031_probe(struct amba_device *adev, struct amba_id *id) { int ret; struct pl031_local *ldata; diff --git a/drivers/serial/amba-pl010.c b/drivers/serial/amba-pl010.c index e3a5ad5ef1d6..cdc049d4350f 100644 --- a/drivers/serial/amba-pl010.c +++ b/drivers/serial/amba-pl010.c @@ -665,7 +665,7 @@ static struct uart_driver amba_reg = { .cons = AMBA_CONSOLE, }; -static int pl010_probe(struct amba_device *dev, void *id) +static int pl010_probe(struct amba_device *dev, struct amba_id *id) { struct uart_amba_port *uap; void __iomem *base; diff --git a/drivers/serial/amba-pl011.c b/drivers/serial/amba-pl011.c index 8b2b9700f3e4..88fdac51b6c5 100644 --- a/drivers/serial/amba-pl011.c +++ b/drivers/serial/amba-pl011.c @@ -729,7 +729,7 @@ static struct uart_driver amba_reg = { .cons = AMBA_CONSOLE, }; -static int pl011_probe(struct amba_device *dev, void *id) +static int pl011_probe(struct amba_device *dev, struct amba_id *id) { struct uart_amba_port *uap; void __iomem *base; diff --git a/drivers/video/amba-clcd.c b/drivers/video/amba-clcd.c index 61050ab14128..d1f80bac54f0 100644 --- a/drivers/video/amba-clcd.c +++ b/drivers/video/amba-clcd.c @@ -437,7 +437,7 @@ static int clcdfb_register(struct clcd_fb *fb) return ret; } -static int clcdfb_probe(struct amba_device *dev, void *id) +static int clcdfb_probe(struct amba_device *dev, struct amba_id *id) { struct clcd_board *board = dev->dev.platform_data; 
struct clcd_fb *fb; diff --git a/include/linux/amba/bus.h b/include/linux/amba/bus.h index 51e6e54b2aa1..9b93cafa82a0 100644 --- a/include/linux/amba/bus.h +++ b/include/linux/amba/bus.h @@ -28,7 +28,7 @@ struct amba_id { struct amba_driver { struct device_driver drv; - int (*probe)(struct amba_device *, void *); + int (*probe)(struct amba_device *, struct amba_id *); int (*remove)(struct amba_device *); void (*shutdown)(struct amba_device *); int (*suspend)(struct amba_device *, pm_message_t); diff --git a/sound/arm/aaci.c b/sound/arm/aaci.c index 7fbd68fab944..5c48e36038f2 100644 --- a/sound/arm/aaci.c +++ b/sound/arm/aaci.c @@ -1074,7 +1074,7 @@ static unsigned int __devinit aaci_size_fifo(struct aaci *aaci) return i; } -static int __devinit aaci_probe(struct amba_device *dev, void *id) +static int __devinit aaci_probe(struct amba_device *dev, struct amba_id *id) { struct aaci *aaci; int ret, i; -- cgit v1.2.3 From 28ee9bc5cc42776e0364399b401a64906ac1ac8e Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Fri, 22 May 2009 16:23:38 +0200 Subject: ide: report timeouts in ide_busy_sleep() * change 'hwif' argument to 'drive' * report an error on timeout Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-probe.c | 9 ++++++--- include/linux/ide.h | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index 7f264ed1141b..c895ed52b2e8 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -295,7 +295,7 @@ int ide_dev_read_id(ide_drive_t *drive, u8 cmd, u16 *id) timeout = ((cmd == ATA_CMD_ID_ATA) ? WAIT_WORSTCASE : WAIT_PIDENTIFY) / 2; - if (ide_busy_sleep(hwif, timeout, use_altstatus)) + if (ide_busy_sleep(drive, timeout, use_altstatus)) return 1; /* wait for IRQ and ATA_DRQ */ @@ -316,8 +316,9 @@ int ide_dev_read_id(ide_drive_t *drive, u8 cmd, u16 *id) return rc; } -int ide_busy_sleep(ide_hwif_t *hwif, unsigned long timeout, int altstatus) +int ide_busy_sleep(ide_drive_t *drive, unsigned long timeout, int altstatus) { + ide_hwif_t *hwif = drive->hwif; u8 stat; timeout += jiffies; @@ -330,6 +331,8 @@ int ide_busy_sleep(ide_hwif_t *hwif, unsigned long timeout, int altstatus) return 0; } while (time_before(jiffies, timeout)); + printk(KERN_ERR "%s: timeout in %s\n", drive->name, __func__); + return 1; /* drive timed-out */ } @@ -420,7 +423,7 @@ static int do_probe (ide_drive_t *drive, u8 cmd) tp_ops->dev_select(drive); msleep(50); tp_ops->exec_command(hwif, ATA_CMD_DEV_RESET); - (void)ide_busy_sleep(hwif, WAIT_WORSTCASE, 0); + (void)ide_busy_sleep(drive, WAIT_WORSTCASE, 0); rc = ide_dev_read_id(drive, cmd, id); } diff --git a/include/linux/ide.h b/include/linux/ide.h index ff65fffb078f..9fed365a598b 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1109,7 +1109,7 @@ void ide_fix_driveid(u16 *); extern void ide_fixstring(u8 *, const int, const int); -int ide_busy_sleep(ide_hwif_t *, unsigned long, int); +int ide_busy_sleep(ide_drive_t *, unsigned long, int); int ide_wait_stat(ide_startstop_t *, ide_drive_t *, u8, u8, unsigned long); -- cgit v1.2.3 From 5993856e53fbc4b4f28e2d481deaebeb715b1267 Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Fri, 22 May 2009 16:23:39 +0200 Subject: via82cxxx: Add VIA VX855 PCI Device ID This patch adds the PCI Device ID 0xc409 to the PCI ID table of via82cxxx.c, as well as the 0x8409 south bridge ID. This is required to make the IDE driver work on the VX855/VX875 integrated chipset. 
Signed-off-by: Harald Welte Cc: Joseph Chan Cc: Bruce Chang Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/via82cxxx.c | 2 ++ include/linux/pci_ids.h | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/ide/via82cxxx.c b/drivers/ide/via82cxxx.c index 3ff7231e4858..028de26a25fe 100644 --- a/drivers/ide/via82cxxx.c +++ b/drivers/ide/via82cxxx.c @@ -67,6 +67,7 @@ static struct via_isa_bridge { u8 udma_mask; u8 flags; } via_isa_bridges[] = { + { "vx855", PCI_DEVICE_ID_VIA_VX855, 0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST }, { "vx800", PCI_DEVICE_ID_VIA_VX800, 0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST }, { "cx700", PCI_DEVICE_ID_VIA_CX700, 0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST }, { "vt8237s", PCI_DEVICE_ID_VIA_8237S, 0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST }, @@ -474,6 +475,7 @@ static const struct pci_device_id via_pci_tbl[] = { { PCI_VDEVICE(VIA, PCI_DEVICE_ID_VIA_82C576_1), 0 }, { PCI_VDEVICE(VIA, PCI_DEVICE_ID_VIA_82C586_1), 0 }, { PCI_VDEVICE(VIA, PCI_DEVICE_ID_VIA_CX700_IDE), 0 }, + { PCI_VDEVICE(VIA, PCI_DEVICE_ID_VIA_VX855_IDE), 0 }, { PCI_VDEVICE(VIA, PCI_DEVICE_ID_VIA_6410), 1 }, { PCI_VDEVICE(VIA, PCI_DEVICE_ID_VIA_SATA_EIDE), 1 }, { 0, }, diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 06ba90c211a5..0f71812d67d3 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -1406,7 +1406,7 @@ #define PCI_DEVICE_ID_VIA_82C598_1 0x8598 #define PCI_DEVICE_ID_VIA_838X_1 0xB188 #define PCI_DEVICE_ID_VIA_83_87XX_1 0xB198 -#define PCI_DEVICE_ID_VIA_C409_IDE 0XC409 +#define PCI_DEVICE_ID_VIA_VX855_IDE 0xC409 #define PCI_DEVICE_ID_VIA_ANON 0xFFFF #define PCI_VENDOR_ID_SIEMENS 0x110A -- cgit v1.2.3 From df391e0eda1e678add56a8e34226edf05d89af6a Mon Sep 17 00:00:00 2001 From: Henrik Rydberg Date: Sat, 23 May 2009 09:51:20 -0700 Subject: Input: multitouch - add tracking ID to the protocol MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are a few multi-touch devices that support finger tracking well in hardware, Stantum being the prime example. By exposing the tracking ID in the MT protocol, evdev bandwidth and cpu usage in user space can be reduced. This patch adds the ABS_MT_TRACKING_ID to the MT protocol. 
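For illustration only, not part of the patch: how a driver for hardware that already tracks contacts would use the new axis. The demo_* name and the coordinates are made up, and axis setup via input_set_abs_params() at registration time is assumed to have happened elsewhere.

#include <linux/input.h>

/* Report one frame containing two tracked contacts (MT protocol sketch). */
static void demo_report_frame(struct input_dev *dev)
{
	/* contact with hardware-assigned tracking ID 0 */
	input_report_abs(dev, ABS_MT_TRACKING_ID, 0);
	input_report_abs(dev, ABS_MT_POSITION_X, 120);
	input_report_abs(dev, ABS_MT_POSITION_Y, 340);
	input_mt_sync(dev);		/* closes this contact's packet */

	/* contact with hardware-assigned tracking ID 1 */
	input_report_abs(dev, ABS_MT_TRACKING_ID, 1);
	input_report_abs(dev, ABS_MT_POSITION_X, 510);
	input_report_abs(dev, ABS_MT_POSITION_Y, 220);
	input_mt_sync(dev);

	input_sync(dev);		/* closes the frame */
}

Because the IDs persist from frame to frame, user space can follow an individual contact without running its own matching pass, which is where the evdev bandwidth and cpu savings come from.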
Signed-off-by: Henrik Rydberg Tested-by: Stéphane Chatty Signed-off-by: Dmitry Torokhov --- drivers/input/input.c | 1 + include/linux/input.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/drivers/input/input.c b/drivers/input/input.c index e54e002665b0..5d445f48789b 100644 --- a/drivers/input/input.c +++ b/drivers/input/input.c @@ -42,6 +42,7 @@ static unsigned int input_abs_bypass_init_data[] __initdata = { ABS_MT_POSITION_Y, ABS_MT_TOOL_TYPE, ABS_MT_BLOB_ID, + ABS_MT_TRACKING_ID, 0 }; static unsigned long input_abs_bypass[BITS_TO_LONGS(ABS_CNT)]; diff --git a/include/linux/input.h b/include/linux/input.h index 0e6ff5de3588..6fed4f6a9c9e 100644 --- a/include/linux/input.h +++ b/include/linux/input.h @@ -656,6 +656,7 @@ struct input_absinfo { #define ABS_MT_POSITION_Y 0x36 /* Center Y ellipse position */ #define ABS_MT_TOOL_TYPE 0x37 /* Type of touching device */ #define ABS_MT_BLOB_ID 0x38 /* Group a set of packets as a blob */ +#define ABS_MT_TRACKING_ID 0x39 /* Unique ID of initiated contact */ #define ABS_MAX 0x3f #define ABS_CNT (ABS_MAX+1) -- cgit v1.2.3 From bfcaa50270e18f35220a11d46e98fc6232c24606 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Mon, 25 May 2009 17:23:15 +0200 Subject: netfilter: nf_ct_tcp: fix accepting invalid RST segments Robert L Mathews discovered that some clients send evil TCP RST segments, which are accepted by netfilter conntrack but discarded by the destination. Thus the conntrack entry is destroyed but the destination retransmits data until timeout. The same technique, i.e. sending properly crafted RST segments, can easily be used to bypass connlimit/connbytes based restrictions (the sample script written by Robert can be found in the netfilter mailing list archives). The patch below adds a new flag and new field to struct ip_ct_tcp_state so that checking RST segments can be made more strict and thus TCP conntrack can catch the invalid ones: the RST segment is accepted only if its sequence number higher than or equal to the highest ack we seen from the other direction. (The last_ack field cannot be reused because it is used to catch resent packets.) 
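For illustration only, not part of the patch: the acceptance rule pulled out of the conntrack state machine, expressed with the kernel's wrapping-safe sequence-number helpers. The demo_* name is made up; the field and flag semantics follow the patch.

#include <linux/types.h>
#include <net/tcp.h>	/* before()/after() sequence-number helpers */

/*
 * A RST is accepted only if its sequence number is >= the highest ACK
 * seen from the other direction (td_maxack).  Until the first ACK has
 * been observed, the old behaviour is kept.
 */
static bool demo_rst_is_valid(u32 rst_seq, u32 peer_td_maxack,
			      bool peer_maxack_set)
{
	if (!peer_maxack_set)
		return true;
	return !before(rst_seq, peer_td_maxack);
}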
Signed-off-by: Jozsef Kadlecsik Signed-off-by: Patrick McHardy --- include/linux/netfilter/nf_conntrack_tcp.h | 4 ++++ net/netfilter/nf_conntrack_proto_tcp.c | 18 ++++++++++++++++++ 2 files changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfilter/nf_conntrack_tcp.h b/include/linux/netfilter/nf_conntrack_tcp.h index 3066789b972a..b2f384d42611 100644 --- a/include/linux/netfilter/nf_conntrack_tcp.h +++ b/include/linux/netfilter/nf_conntrack_tcp.h @@ -35,6 +35,9 @@ enum tcp_conntrack { /* Has unacknowledged data */ #define IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED 0x10 +/* The field td_maxack has been set */ +#define IP_CT_TCP_FLAG_MAXACK_SET 0x20 + struct nf_ct_tcp_flags { __u8 flags; __u8 mask; @@ -46,6 +49,7 @@ struct ip_ct_tcp_state { u_int32_t td_end; /* max of seq + len */ u_int32_t td_maxend; /* max of ack + max(win, 1) */ u_int32_t td_maxwin; /* max(win) */ + u_int32_t td_maxack; /* max of ack */ u_int8_t td_scale; /* window scale factor */ u_int8_t flags; /* per direction options */ }; diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index b5ccf2b4b2e7..97a6e93d742e 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -634,6 +634,14 @@ static bool tcp_in_window(const struct nf_conn *ct, sender->td_end = end; sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED; } + if (tcph->ack) { + if (!(sender->flags & IP_CT_TCP_FLAG_MAXACK_SET)) { + sender->td_maxack = ack; + sender->flags |= IP_CT_TCP_FLAG_MAXACK_SET; + } else if (after(ack, sender->td_maxack)) + sender->td_maxack = ack; + } + /* * Update receiver data. */ @@ -918,6 +926,16 @@ static int tcp_packet(struct nf_conn *ct, "nf_ct_tcp: invalid state "); return -NF_ACCEPT; case TCP_CONNTRACK_CLOSE: + if (index == TCP_RST_SET + && (ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET) + && before(ntohl(th->seq), ct->proto.tcp.seen[!dir].td_maxack)) { + /* Invalid RST */ + write_unlock_bh(&tcp_lock); + if (LOG_INVALID(net, IPPROTO_TCP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_tcp: invalid RST "); + return -NF_ACCEPT; + } if (index == TCP_RST_SET && ((test_bit(IPS_SEEN_REPLY_BIT, &ct->status) && ct->proto.tcp.last_index == TCP_SYN_SET) -- cgit v1.2.3 From 2f102607ac77354b02a76cf2748598ce9f270f08 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Wed, 27 May 2009 23:59:58 -0400 Subject: i7300_idle: allow testing on i5000-series hardware w/o re-compile Testing the i7300_idle driver on i5000-series hardware required an edit to i7300_idle.h to "#define SUPPORT_I5000 1" and a re-build of both i7300_idle and ioat_dma. Replace that build-time scheme with a load-time module parameter: "7300_idle.forceload=1" to make it easier to test the driver on hardware that while not officially validated, works fine and is much more commonly available. By default (no modparam) the driver will continue to load only on the i7300. Note that ioat_dma runs a copy of i7300_idle's probe routine to know to reserve an IOAT channel for i7300_idle. This change makes ioat_dma do that always on the i5000, just like it does on the i7300. 
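For illustration only, not part of the patch: a reduced sketch of the new knob and of how the value reaches the shared probe helper; the in-tree file differs in detail.

#include <linux/module.h>
#include <linux/moduleparam.h>

static unsigned int forceload;
module_param_named(forceload, forceload, uint, 0644);
MODULE_PARM_DESC(forceload, "Enable driver testing on unvalidated i5000");

/*
 * Loaded as a module:   modprobe i7300_idle forceload=1
 * Built in:             boot with i7300_idle.forceload=1
 *
 * The value is simply forwarded as the new third argument:
 *   i7300_idle_platform_probe(&fbd_dev, &ioat_dev, forceload);
 * while ioat_dma passes a literal 1 so it always reserves a channel on
 * the i5000 as well.
 */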
Signed-off-by: Len Brown Acked-by: Andrew Henroid --- drivers/dma/ioat_dma.c | 2 +- drivers/idle/i7300_idle.c | 6 +++++- include/linux/i7300_idle.h | 20 ++++++++++---------- 3 files changed, 16 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma/ioat_dma.c b/drivers/dma/ioat_dma.c index 1955ee8d6d20..a600fc0f7962 100644 --- a/drivers/dma/ioat_dma.c +++ b/drivers/dma/ioat_dma.c @@ -173,7 +173,7 @@ static int ioat_dma_enumerate_channels(struct ioatdma_device *device) xfercap = (xfercap_scale == 0 ? -1 : (1UL << xfercap_scale)); #ifdef CONFIG_I7300_IDLE_IOAT_CHANNEL - if (i7300_idle_platform_probe(NULL, NULL) == 0) { + if (i7300_idle_platform_probe(NULL, NULL, 1) == 0) { device->common.chancnt--; } #endif diff --git a/drivers/idle/i7300_idle.c b/drivers/idle/i7300_idle.c index bf740394d704..949c97ff57e3 100644 --- a/drivers/idle/i7300_idle.c +++ b/drivers/idle/i7300_idle.c @@ -41,6 +41,10 @@ static int debug; module_param_named(debug, debug, uint, 0644); MODULE_PARM_DESC(debug, "Enable debug printks in this driver"); +static int forceload; +module_param_named(forceload, forceload, uint, 0644); +MODULE_PARM_DESC(debug, "Enable driver testing on unvalidated i5000"); + #define dprintk(fmt, arg...) \ do { if (debug) printk(KERN_INFO I7300_PRINT fmt, ##arg); } while (0) @@ -552,7 +556,7 @@ static int __init i7300_idle_init(void) cpus_clear(idle_cpumask); total_us = 0; - if (i7300_idle_platform_probe(&fbd_dev, &ioat_dev)) + if (i7300_idle_platform_probe(&fbd_dev, &ioat_dev, forceload)) return -ENODEV; if (i7300_idle_thrt_save()) diff --git a/include/linux/i7300_idle.h b/include/linux/i7300_idle.h index 05a80c44513c..1587b7dec505 100644 --- a/include/linux/i7300_idle.h +++ b/include/linux/i7300_idle.h @@ -16,35 +16,33 @@ struct fbd_ioat { unsigned int vendor; unsigned int ioat_dev; + unsigned int enabled; }; /* * The i5000 chip-set has the same hooks as the i7300 - * but support is disabled by default because this driver - * has not been validated on that platform. + * but it is not enabled by default and must be manually + * manually enabled with "forceload=1" because it is + * only lightly validated. 
*/ -#define SUPPORT_I5000 0 static const struct fbd_ioat fbd_ioat_list[] = { - {PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_CNB}, -#if SUPPORT_I5000 - {PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT}, -#endif + {PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_CNB, 1}, + {PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT, 0}, {0, 0} }; /* table of devices that work with this driver */ static const struct pci_device_id pci_tbl[] = { { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_FBD_CNB) }, -#if SUPPORT_I5000 { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_5000_ERR) }, -#endif { } /* Terminating entry */ }; /* Check for known platforms with I/O-AT */ static inline int i7300_idle_platform_probe(struct pci_dev **fbd_dev, - struct pci_dev **ioat_dev) + struct pci_dev **ioat_dev, + int enable_all) { int i; struct pci_dev *memdev, *dmadev; @@ -69,6 +67,8 @@ static inline int i7300_idle_platform_probe(struct pci_dev **fbd_dev, for (i = 0; fbd_ioat_list[i].vendor != 0; i++) { if (dmadev->vendor == fbd_ioat_list[i].vendor && dmadev->device == fbd_ioat_list[i].ioat_dev) { + if (!(fbd_ioat_list[i].enabled || enable_all)) + continue; if (fbd_dev) *fbd_dev = memdev; if (ioat_dev) -- cgit v1.2.3 From b2e1feaf0af6b8a826b86748a19ddc2013ab7dbd Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 28 May 2009 14:34:20 -0700 Subject: cred: #include init.h in cred.h linux/cred.h can't be included as first header (alphabetical order) because it uses __init which is enough to break compilation on some archs. Signed-off-by: Alexey Dobriyan Acked-by: James Morris Acked-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cred.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cred.h b/include/linux/cred.h index 3282ee4318e7..4fa999696310 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -13,6 +13,7 @@ #define _LINUX_CRED_H #include +#include #include #include -- cgit v1.2.3 From e767e0561d7fd2333df1921f1ab4176211f9036b Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Thu, 28 May 2009 14:34:28 -0700 Subject: memcg: fix deadlock between lock_page_cgroup and mapping tree_lock mapping->tree_lock can be acquired from interrupt context. Then, following dead lock can occur. Assume "A" as a page. CPU0: lock_page_cgroup(A) interrupted -> take mapping->tree_lock. CPU1: take mapping->tree_lock -> lock_page_cgroup(A) This patch tries to fix above deadlock by moving memcg's hook to out of mapping->tree_lock. charge/uncharge of pagecache/swapcache is protected by page lock, not tree_lock. After this patch, lock_page_cgroup() is not called under mapping->tree_lock. 
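For illustration only, not part of the patch: the call-site ordering that the touched paths converge on once the hook is moved. The demo_* name is made up and error handling is omitted.

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/memcontrol.h>

static void demo_remove_page(struct address_space *mapping, struct page *page)
{
	spin_lock_irq(&mapping->tree_lock);
	__remove_from_page_cache(page);		/* no memcg work under tree_lock */
	spin_unlock_irq(&mapping->tree_lock);

	/* lock_page_cgroup() is taken only here, with tree_lock already
	 * dropped, so an interrupt grabbing tree_lock cannot deadlock. */
	mem_cgroup_uncharge_cache_page(page);
	page_cache_release(page);
}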
Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Daisuke Nishimura Cc: Balbir Singh Cc: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 5 +++++ mm/filemap.c | 6 +++--- mm/memcontrol.c | 4 +++- mm/swap_state.c | 4 +--- mm/truncate.c | 1 + mm/vmscan.c | 2 ++ 6 files changed, 15 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 62d81435347a..d476aad3ff57 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -437,6 +437,11 @@ static inline int mem_cgroup_cache_charge_swapin(struct page *page, return 0; } +static inline void +mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent) +{ +} + #endif /* CONFIG_SWAP */ #endif /* __KERNEL__*/ #endif /* _LINUX_SWAP_H */ diff --git a/mm/filemap.c b/mm/filemap.c index 379ff0bcbf6e..1b60f30cebfa 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -121,7 +121,6 @@ void __remove_from_page_cache(struct page *page) mapping->nrpages--; __dec_zone_page_state(page, NR_FILE_PAGES); BUG_ON(page_mapped(page)); - mem_cgroup_uncharge_cache_page(page); /* * Some filesystems seem to re-dirty the page even after @@ -145,6 +144,7 @@ void remove_from_page_cache(struct page *page) spin_lock_irq(&mapping->tree_lock); __remove_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); + mem_cgroup_uncharge_cache_page(page); } static int sync_page(void *word) @@ -476,13 +476,13 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, if (likely(!error)) { mapping->nrpages++; __inc_zone_page_state(page, NR_FILE_PAGES); + spin_unlock_irq(&mapping->tree_lock); } else { page->mapping = NULL; + spin_unlock_irq(&mapping->tree_lock); mem_cgroup_uncharge_cache_page(page); page_cache_release(page); } - - spin_unlock_irq(&mapping->tree_lock); radix_tree_preload_end(); } else mem_cgroup_uncharge_cache_page(page); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 01c2d8f14685..4a747a27a22f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1488,8 +1488,9 @@ void mem_cgroup_uncharge_cache_page(struct page *page) __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); } +#ifdef CONFIG_SWAP /* - * called from __delete_from_swap_cache() and drop "page" account. + * called after __delete_from_swap_cache() and drop "page" account. 
* memcg information is recorded to swap_cgroup of "ent" */ void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent) @@ -1506,6 +1507,7 @@ void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent) if (memcg) css_put(&memcg->css); } +#endif #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP /* diff --git a/mm/swap_state.c b/mm/swap_state.c index 3ecea98ecb45..1416e7e9e02d 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -109,8 +109,6 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) */ void __delete_from_swap_cache(struct page *page) { - swp_entry_t ent = {.val = page_private(page)}; - VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(!PageSwapCache(page)); VM_BUG_ON(PageWriteback(page)); @@ -121,7 +119,6 @@ void __delete_from_swap_cache(struct page *page) total_swapcache_pages--; __dec_zone_page_state(page, NR_FILE_PAGES); INC_CACHE_INFO(del_total); - mem_cgroup_uncharge_swapcache(page, ent); } /** @@ -191,6 +188,7 @@ void delete_from_swap_cache(struct page *page) __delete_from_swap_cache(page); spin_unlock_irq(&swapper_space.tree_lock); + mem_cgroup_uncharge_swapcache(page, entry); swap_free(entry); page_cache_release(page); } diff --git a/mm/truncate.c b/mm/truncate.c index 55206fab7b99..12e1579f9165 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -359,6 +359,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) BUG_ON(page_has_private(page)); __remove_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); + mem_cgroup_uncharge_cache_page(page); page_cache_release(page); /* pagecache ref */ return 1; failed: diff --git a/mm/vmscan.c b/mm/vmscan.c index 5fa3eda1f03f..d254306562cd 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -470,10 +470,12 @@ static int __remove_mapping(struct address_space *mapping, struct page *page) swp_entry_t swap = { .val = page_private(page) }; __delete_from_swap_cache(page); spin_unlock_irq(&mapping->tree_lock); + mem_cgroup_uncharge_swapcache(page, swap); swap_free(swap); } else { __remove_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); + mem_cgroup_uncharge_cache_page(page); } return 1; -- cgit v1.2.3 From 52bb25a620e1925bb53d41d0ed28571b3de98a31 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Mon, 1 Jun 2009 06:21:13 +0000 Subject: headers_check fix: linux/auto_fs.h fix the following 'make headers_check' warnings: usr/include/linux/auto_fs.h:17: include of is preferred over Signed-off-by: Jaswinder Singh Rajput --- include/linux/auto_fs.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/auto_fs.h b/include/linux/auto_fs.h index 63265852b7d1..7b09c8348fd3 100644 --- a/include/linux/auto_fs.h +++ b/include/linux/auto_fs.h @@ -14,13 +14,12 @@ #ifndef _LINUX_AUTO_FS_H #define _LINUX_AUTO_FS_H +#include #ifdef __KERNEL__ #include #include -#include #include #else -#include #include #endif /* __KERNEL__ */ -- cgit v1.2.3 From d280cc989ad591607e812cd5c5dfde702b5f191a Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Mon, 1 Jun 2009 06:23:25 +0000 Subject: headers_check fix: linux/net_dropmon.h fix the following 'make headers_check' warnings: usr/include/linux/net_dropmon.h:7: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/net_dropmon.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/net_dropmon.h b/include/linux/net_dropmon.h index 0217fb81a630..0e2e100c44a2 100644 --- a/include/linux/net_dropmon.h 
+++ b/include/linux/net_dropmon.h @@ -1,6 +1,7 @@ #ifndef __NET_DROPMON_H #define __NET_DROPMON_H +#include #include struct net_dm_drop_point { -- cgit v1.2.3 From 05ad709d04799125ed85dd816fdb558258102172 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Tue, 2 Jun 2009 16:58:10 +0100 Subject: parport: quickfix the proc registration bug Ideally we should have a directory of drivers and a link to the 'active' driver. For now just show the first device which is effectively the existing semantics without a warning. This is an update on the original buggy patch that I then forgot to resubmit. Confusingly it was proposed by Red Hat, written by Etched Pixels fixed and submitted by Intel ... Resolves-Bug: http://bugzilla.kernel.org/show_bug.cgi?id=9749 Signed-off-by: Alan Cox Signed-off-by: Linus Torvalds --- drivers/parport/share.c | 13 ++++++++++--- include/linux/parport.h | 4 ++++ 2 files changed, 14 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/parport/share.c b/drivers/parport/share.c index 0ebca450ed29..dffa5d4fb298 100644 --- a/drivers/parport/share.c +++ b/drivers/parport/share.c @@ -614,7 +614,10 @@ parport_register_device(struct parport *port, const char *name, * pardevice fields. -arca */ port->ops->init_state(tmp, tmp->state); - parport_device_proc_register(tmp); + if (!test_and_set_bit(PARPORT_DEVPROC_REGISTERED, &port->devflags)) { + port->proc_device = tmp; + parport_device_proc_register(tmp); + } return tmp; out_free_all: @@ -646,10 +649,14 @@ void parport_unregister_device(struct pardevice *dev) } #endif - parport_device_proc_unregister(dev); - port = dev->port->physport; + if (port->proc_device == dev) { + port->proc_device = NULL; + clear_bit(PARPORT_DEVPROC_REGISTERED, &port->devflags); + parport_device_proc_unregister(dev); + } + if (port->cad == dev) { printk(KERN_DEBUG "%s: %s forgot to release port\n", port->name, dev->name); diff --git a/include/linux/parport.h b/include/linux/parport.h index e1f83c5065c5..38a423ed3c01 100644 --- a/include/linux/parport.h +++ b/include/linux/parport.h @@ -324,6 +324,10 @@ struct parport { int spintime; atomic_t ref_count; + unsigned long devflags; +#define PARPORT_DEVPROC_REGISTERED 0 + struct pardevice *proc_device; /* Currently register proc device */ + struct list_head full_list; struct parport *slaves[3]; }; -- cgit v1.2.3 From 087eb437051b3de817720f9c80c440fc9e7dcce8 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 4 Jun 2009 16:29:07 -0700 Subject: ptrace: tracehook_report_clone: fix false positives The "trace || CLONE_PTRACE" check in tracehook_report_clone() is not right, - If the untraced task does clone(CLONE_PTRACE) the new child is not traced, we must not queue SIGSTOP. - If we forked the traced task, but the tracer exits and untraces both the forking task and the new child (after copy_process() drops tasklist_lock), we should not queue SIGSTOP too. Change the code to check task_ptrace() != 0 instead. This is still racy, but the race is harmless. We can race with another tracer attaching to this child, or the tracer can exit and detach in parallel. But giwen that we didn't do wake_up_new_task() yet, the child must have the pending SIGSTOP anyway. 
Signed-off-by: Oleg Nesterov Acked-by: Roland McGrath Cc: Christoph Hellwig Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/tracehook.h | 11 +++++------ kernel/fork.c | 2 +- 2 files changed, 6 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index c7aa154f4bfc..eb96603d92db 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -259,14 +259,12 @@ static inline void tracehook_finish_clone(struct task_struct *child, /** * tracehook_report_clone - in parent, new child is about to start running - * @trace: return value from tracehook_prepare_clone() * @regs: parent's user register state * @clone_flags: flags from parent's system call * @pid: new child's PID in the parent's namespace * @child: new child task * - * Called after a child is set up, but before it has been started - * running. @trace is the value returned by tracehook_prepare_clone(). + * Called after a child is set up, but before it has been started running. * This is not a good place to block, because the child has not started * yet. Suspend the child here if desired, and then block in * tracehook_report_clone_complete(). This must prevent the child from @@ -276,13 +274,14 @@ static inline void tracehook_finish_clone(struct task_struct *child, * * Called with no locks held, but the child cannot run until this returns. */ -static inline void tracehook_report_clone(int trace, struct pt_regs *regs, +static inline void tracehook_report_clone(struct pt_regs *regs, unsigned long clone_flags, pid_t pid, struct task_struct *child) { - if (unlikely(trace) || unlikely(clone_flags & CLONE_PTRACE)) { + if (unlikely(task_ptrace(child))) { /* - * The child starts up with an immediate SIGSTOP. + * It doesn't matter who attached/attaching to this + * task, the pending SIGSTOP is right in any case. */ sigaddset(&child->pending.signal, SIGSTOP); set_tsk_thread_flag(child, TIF_SIGPENDING); diff --git a/kernel/fork.c b/kernel/fork.c index b9e2edd00726..875ffbdd96d0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1409,7 +1409,7 @@ long do_fork(unsigned long clone_flags, } audit_finish_fork(p); - tracehook_report_clone(trace, regs, clone_flags, nr, p); + tracehook_report_clone(regs, clone_flags, nr, p); /* * We set PF_STARTING at creation in case tracing wants to -- cgit v1.2.3 From aa853f85d9ed593672d0f24a98c72a2518cb63e6 Mon Sep 17 00:00:00 2001 From: Alessandro Rubini Date: Sat, 6 Jun 2009 10:17:57 +0100 Subject: [ARM] 5543/1: arm: serial amba: add missing declaration in serial.h This header is sometimes included in the uncompress stage to get register values, but no can be included there. So declare "struct amba_device" here before using it in a prototype. 
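For illustration only, not part of the patch: the C rule being relied on is that a prototype or function-pointer member only needs the struct tag to be declared, not defined. A standalone sketch with made-up demo_* names:

struct amba_device;		/* forward declaration: incomplete type */

struct demo_platform_data {
	/* taking or passing a pointer to an incomplete type is legal ... */
	void (*set_ctrl)(struct amba_device *dev, unsigned int ctrl);
};

static void demo_call(struct demo_platform_data *pd, struct amba_device *dev)
{
	pd->set_ctrl(dev, 1);	/* ... because dev is never dereferenced here */
}

/* Only a translation unit that dereferences dev (dev->res, dev->irq, ...)
 * needs the full definition from the amba bus header; the uncompress code
 * that pulls in serial.h for register offsets never does. */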
Signed-off-by: Alessandro Rubini Acked-by: Andrea Gallo Signed-off-by: Russell King --- include/linux/amba/serial.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/amba/serial.h b/include/linux/amba/serial.h index 48ee32a18ac5..64a982ea5d5f 100644 --- a/include/linux/amba/serial.h +++ b/include/linux/amba/serial.h @@ -159,6 +159,7 @@ #define UART01x_FR_MODEM_ANY (UART01x_FR_DCD|UART01x_FR_DSR|UART01x_FR_CTS) #ifndef __ASSEMBLY__ +struct amba_device; /* in uncompress this is included but amba/bus.h is not */ struct amba_pl010_data { void (*set_mctrl)(struct amba_device *dev, void __iomem *base, unsigned int mctrl); }; -- cgit v1.2.3 From 0281b5dc0350cbf6dd21ed558a33cccce77abc02 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 6 Jun 2009 14:50:36 -0700 Subject: cpumask: introduce zalloc_cpumask_var So can get cpumask_var with cpumask_clear Signed-off-by: Yinghai Lu Signed-off-by: Rusty Russell --- include/linux/cpumask.h | 15 +++++++++++++++ lib/cpumask.c | 12 ++++++++++++ 2 files changed, 27 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 9f315382610b..c5ac87ca7bc6 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -1022,6 +1022,8 @@ typedef struct cpumask *cpumask_var_t; bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node); bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags); +bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node); +bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags); void alloc_bootmem_cpumask_var(cpumask_var_t *mask); void free_cpumask_var(cpumask_var_t mask); void free_bootmem_cpumask_var(cpumask_var_t mask); @@ -1040,6 +1042,19 @@ static inline bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, return true; } +static inline bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) +{ + cpumask_clear(*mask); + return true; +} + +static inline bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, + int node) +{ + cpumask_clear(*mask); + return true; +} + static inline void alloc_bootmem_cpumask_var(cpumask_var_t *mask) { } diff --git a/lib/cpumask.c b/lib/cpumask.c index 1f71b97de0f9..eb23aaa0c7b8 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -119,6 +119,12 @@ bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node) } EXPORT_SYMBOL(alloc_cpumask_var_node); +bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node) +{ + return alloc_cpumask_var_node(mask, flags | __GFP_ZERO, node); +} +EXPORT_SYMBOL(zalloc_cpumask_var_node); + /** * alloc_cpumask_var - allocate a struct cpumask * @mask: pointer to cpumask_var_t where the cpumask is returned @@ -135,6 +141,12 @@ bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) } EXPORT_SYMBOL(alloc_cpumask_var); +bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) +{ + return alloc_cpumask_var(mask, flags | __GFP_ZERO); +} +EXPORT_SYMBOL(zalloc_cpumask_var); + /** * alloc_bootmem_cpumask_var - allocate a struct cpumask from the bootmem arena. * @mask: pointer to cpumask_var_t where the cpumask is returned -- cgit v1.2.3
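For illustration only, not part of the patch: a typical caller of the new helper shown next to the two-step form it replaces. The demo_* name is made up and error handling is trimmed.

#include <linux/cpumask.h>
#include <linux/gfp.h>
#include <linux/errno.h>

static int demo_snapshot_online(void)
{
	cpumask_var_t snap;

	/* Old style: allocate, then clear by hand; until cleared the
	 * mask contents are undefined. */
	if (!alloc_cpumask_var(&snap, GFP_KERNEL))
		return -ENOMEM;
	cpumask_clear(snap);
	free_cpumask_var(snap);

	/* New style: one call hands back an already-cleared mask. */
	if (!zalloc_cpumask_var(&snap, GFP_KERNEL))
		return -ENOMEM;
	cpumask_copy(snap, cpu_online_mask);
	free_cpumask_var(snap);
	return 0;
}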